1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 * vim: set sw=4 ts=8 et tw=78:
4 * ***** BEGIN LICENSE BLOCK *****
5 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
7 * The contents of this file are subject to the Mozilla Public License Version
8 * 1.1 (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
10 * http://www.mozilla.org/MPL/
12 * Software distributed under the License is distributed on an "AS IS" basis,
13 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
14 * for the specific language governing rights and limitations under the
17 * The Original Code is Mozilla Communicator client code, released
20 * The Initial Developer of the Original Code is
21 * Netscape Communications Corporation.
22 * Portions created by the Initial Developer are Copyright (C) 1998
23 * the Initial Developer. All Rights Reserved.
27 * Alternatively, the contents of this file may be used under the terms of
28 * either of the GNU General Public License Version 2 or later (the "GPL"),
29 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
30 * in which case the provisions of the GPL or the LGPL are applicable instead
31 * of those above. If you wish to allow use of your version of this file only
32 * under the terms of either the GPL or the LGPL, and not to allow others to
33 * use your version of this file under the terms of the MPL, indicate your
34 * decision by deleting the provisions above and replace them with the notice
35 * and other provisions required by the GPL or the LGPL. If you do not delete
36 * the provisions above, a recipient may use your version of this file under
37 * the terms of any one of the MPL, the GPL or the LGPL.
39 * ***** END LICENSE BLOCK ***** */
42 * JS regular expressions, after Perl.
49 #include "jsarena.h" /* Added by JSIFY */
50 #include "jsutil.h" /* Added by JSIFY */
54 #include "jsbuiltins.h"
56 #include "jsversion.h"
67 #include "jsstaticcheck.h"
73 using namespace avmplus
;
74 using namespace nanojit
;
77 #include "jsobjinlines.h"
82 #define REOP_DEF(opcode, name) opcode,
83 #include "jsreops.tbl"
85 REOP_LIMIT
/* META: no operator >= to this */
88 #define REOP_IS_SIMPLE(op) ((op) <= REOP_NCLASS)
91 const char *reop_names
[] = {
92 #define REOP_DEF(opcode, name) name,
93 #include "jsreops.tbl"
101 re_debug(const char *fmt
, ...) __attribute__ ((format(printf
, 1, 2)));
106 re_debug(const char *fmt
, ...)
112 retval
= vprintf(fmt
, ap
);
118 re_debug_chars(const jschar
*chrs
, size_t length
)
123 while (*chrs
&& i
++ < length
) {
124 putchar((char)*chrs
++);
128 #else /* !REGEXP_DEBUG */
129 /* This should be optimized to a no-op by our tier-1 compilers. */
131 re_debug(const char *fmt
, ...)
137 re_debug_chars(const jschar
*chrs
, size_t length
)
140 #endif /* !REGEXP_DEBUG */
143 REOp op
; /* r.e. op bytecode */
144 RENode
*next
; /* next in concatenation order */
145 void *kid
; /* first operand */
147 void *kid2
; /* second operand */
148 jsint num
; /* could be a number */
149 size_t parenIndex
; /* or a parenthesis index */
150 struct { /* or a quantifier range */
155 struct { /* or a character class */
157 size_t kidlen
; /* length of string at kid, in jschars */
158 size_t index
; /* index into class list */
159 uint16 bmsize
; /* bitmap size, based on max char code */
162 struct { /* or a literal sequence */
163 jschar chr
; /* of one character */
164 size_t length
; /* or many (via the kid) */
167 RENode
*kid2
; /* second operand from ALT */
168 jschar ch1
; /* match char for ALTPREREQ */
169 jschar ch2
; /* ditto, or class index for ALTPREREQ2 */
174 #define RE_IS_LETTER(c) (((c >= 'A') && (c <= 'Z')) || \
175 ((c >= 'a') && (c <= 'z')) )
176 #define RE_IS_LINE_TERM(c) ((c == '\n') || (c == '\r') || \
177 (c == LINE_SEPARATOR) || (c == PARA_SEPARATOR))
179 #define CLASS_CACHE_SIZE 4
181 typedef struct CompilerState
{
183 TokenStream
*tokenStream
; /* For reporting errors */
184 const jschar
*cpbegin
;
188 size_t classCount
; /* number of [] encountered */
189 size_t treeDepth
; /* maximum depth of parse tree */
190 size_t progLength
; /* estimated bytecode length */
192 size_t classBitmapsMem
; /* memory to hold all class bitmaps */
194 const jschar
*start
; /* small cache of class strings */
195 size_t length
; /* since they're often the same */
197 } classCache
[CLASS_CACHE_SIZE
];
201 typedef struct EmitStateStackEntry
{
202 jsbytecode
*altHead
; /* start of REOP_ALT* opcode */
203 jsbytecode
*nextAltFixup
; /* fixup pointer to next-alt offset */
204 jsbytecode
*nextTermFixup
; /* fixup ptr. to REOP_JUMP offset */
205 jsbytecode
*endTermFixup
; /* fixup ptr. to REOPT_ALTPREREQ* offset */
206 RENode
*continueNode
; /* original REOP_ALT* node being stacked */
207 jsbytecode continueOp
; /* REOP_JUMP or REOP_ENDALT continuation */
208 JSPackedBool jumpToJumpFlag
; /* true if we've patched jump-to-jump to
209 avoid 16-bit unsigned offset overflow */
210 } EmitStateStackEntry
;
213 * Immediate operand sizes and getter/setters. Unlike the ones in jsopcode.h,
214 * the getters and setters take the pc of the offset, not of the opcode before
218 #define GET_ARG(pc) ((uint16)(((pc)[0] << 8) | (pc)[1]))
219 #define SET_ARG(pc, arg) ((pc)[0] = (jsbytecode) ((arg) >> 8), \
220 (pc)[1] = (jsbytecode) (arg))
222 #define OFFSET_LEN ARG_LEN
223 #define OFFSET_MAX (JS_BIT(ARG_LEN * 8) - 1)
224 #define GET_OFFSET(pc) GET_ARG(pc)
227 * Maximum supported tree depth is maximum size of EmitStateStackEntry stack.
228 * For sanity, we limit it to 2^24 bytes.
230 #define TREE_DEPTH_MAX (JS_BIT(24) / sizeof(EmitStateStackEntry))
233 * The maximum memory that can be allocated for class bitmaps.
234 * For sanity, we limit it to 2^24 bytes.
236 #define CLASS_BITMAPS_MEM_LIMIT JS_BIT(24)
239 * Functions to get size and write/read bytecode that represent small indexes
241 * Each byte in the code represent 7-bit chunk of the index. 8th bit when set
242 * indicates that the following byte brings more bits to the index. Otherwise
243 * this is the last byte in the index bytecode representing highest index bits.
246 GetCompactIndexWidth(size_t index
)
250 for (width
= 1; (index
>>= 7) != 0; ++width
) { }
254 static JS_ALWAYS_INLINE jsbytecode
*
255 WriteCompactIndex(jsbytecode
*pc
, size_t index
)
259 while ((next
= index
>> 7) != 0) {
260 *pc
++ = (jsbytecode
)(index
| 0x80);
263 *pc
++ = (jsbytecode
)index
;
267 static JS_ALWAYS_INLINE jsbytecode
*
268 ReadCompactIndex(jsbytecode
*pc
, size_t *result
)
273 if ((nextByte
& 0x80) == 0) {
275 * Short-circuit the most common case when compact index <= 127.
280 *result
= 0x7F & nextByte
;
283 *result
|= (nextByte
& 0x7F) << shift
;
285 } while ((nextByte
& 0x80) != 0);
290 typedef struct RECapture
{
291 ptrdiff_t index
; /* start of contents, -1 for empty */
292 size_t length
; /* length of capture */
295 typedef struct REMatchState
{
297 RECapture parens
[1]; /* first of 're->parenCount' captures,
298 allocated at end of this struct */
301 struct REBackTrackData
;
303 typedef struct REProgState
{
304 jsbytecode
*continue_pc
; /* current continuation data */
305 jsbytecode continue_op
;
306 ptrdiff_t index
; /* progress in text */
307 size_t parenSoFar
; /* highest indexed paren started */
310 uintN min
; /* current quantifier limits */
314 size_t top
; /* backtrack stack state */
320 typedef struct REBackTrackData
{
321 size_t sz
; /* size of previous stack entry */
322 jsbytecode
*backtrack_pc
; /* where to backtrack to */
323 jsbytecode backtrack_op
;
324 const jschar
*cp
; /* index in text of match at backtrack */
325 size_t parenIndex
; /* start index of saved paren contents */
326 size_t parenCount
; /* # of saved paren contents */
327 size_t saveStateStackTop
; /* number of parent states */
328 /* saved parent states follow */
329 /* saved paren contents follow */
332 #define INITIAL_STATESTACK 100
333 #define INITIAL_BACKTRACK 8000
335 typedef struct REGlobalData
{
337 JSRegExp
*regexp
; /* the RE in execution */
338 JSBool ok
; /* runtime error (out_of_memory only?) */
339 size_t start
; /* offset to start at */
340 ptrdiff_t skipped
; /* chars skipped anchoring this r.e. */
341 const jschar
*cpbegin
; /* text base address */
342 const jschar
*cpend
; /* text limit address */
344 REProgState
*stateStack
; /* stack of state of current parents */
345 size_t stateStackTop
;
346 size_t stateStackLimit
;
348 REBackTrackData
*backTrackStack
;/* stack of matched-so-far positions */
349 REBackTrackData
*backTrackSP
;
350 size_t backTrackStackSize
;
351 size_t cursz
; /* size of current stack entry */
352 size_t backTrackCount
; /* how many times we've backtracked */
353 size_t backTrackLimit
; /* upper limit on backtrack states */
357 JSRegExpStatics::clearRoots()
360 cx
->runtime
->gcPoke
= JS_TRUE
;
364 JSRegExpStatics::copy(const JSRegExpStatics
& other
)
368 multiline
= other
.multiline
;
369 lastMatch
= other
.lastMatch
;
370 lastParen
= other
.lastParen
;
371 leftContext
= other
.leftContext
;
372 rightContext
= other
.rightContext
;
373 if (!parens
.resize(other
.parens
.length()))
375 memcpy(parens
.begin(), other
.parens
.begin(), sizeof(JSSubString
) * parens
.length());
380 JSRegExpStatics::clear()
384 lastMatch
= lastParen
= leftContext
= rightContext
= js_EmptySubString
;
389 * 1. If IgnoreCase is false, return ch.
390 * 2. Let u be ch converted to upper case as if by calling
391 * String.prototype.toUpperCase on the one-character string ch.
392 * 3. If u does not consist of a single character, return ch.
393 * 4. Let cu be u's character.
394 * 5. If ch's code point value is greater than or equal to decimal 128 and cu's
395 * code point value is less than decimal 128, then return ch.
398 static JS_ALWAYS_INLINE uintN
403 JS_ASSERT((uintN
) (jschar
) ch
== ch
);
405 if (ch
- (uintN
) 'a' <= (uintN
) ('z' - 'a'))
406 ch
-= (uintN
) ('a' - 'A');
411 return (cu
< 128) ? ch
: cu
;
415 * Return the 'canonical' inverse upcase of |ch|. That is the character
416 * |lch| such that |upcase(lch) == ch| and (|lch| is the lower-case form
417 * of |ch| or is |ch|).
419 static inline jschar
inverse_upcase(jschar ch
)
421 jschar lch
= JS_TOLOWER(ch
);
422 return (upcase(lch
) == ch
) ? lch
: ch
;
425 /* Construct and initialize an RENode, returning NULL for out-of-memory */
427 NewRENode(CompilerState
*state
, REOp op
)
433 cx
->tempPool
.allocateCast
<RENode
*>(ren
, sizeof *ren
);
435 js_ReportOutOfScriptQuota(cx
);
445 * Validates and converts hex ascii value.
448 isASCIIHexDigit(jschar c
, uintN
*digit
)
459 if (cv
>= 'a' && cv
<= 'f') {
460 *digit
= cv
- 'a' + 10;
469 const jschar
*errPos
;
474 ReportRegExpErrorHelper(CompilerState
*state
, uintN flags
, uintN errorNumber
,
477 if (state
->tokenStream
) {
478 return ReportCompileErrorNumber(state
->context
, state
->tokenStream
,
479 NULL
, JSREPORT_UC
| flags
, errorNumber
, arg
);
481 return JS_ReportErrorFlagsAndNumberUC(state
->context
, flags
,
482 js_GetErrorMessage
, NULL
,
487 ReportRegExpError(CompilerState
*state
, uintN flags
, uintN errorNumber
)
489 return ReportRegExpErrorHelper(state
, flags
, errorNumber
, NULL
);
493 * Process the op against the two top operands, reducing them to a single
494 * operand in the penultimate slot. Update progLength and treeDepth.
497 ProcessOp(CompilerState
*state
, REOpData
*opData
, RENode
**operandStack
,
502 switch (opData
->op
) {
504 result
= NewRENode(state
, REOP_ALT
);
507 result
->kid
= operandStack
[operandSP
- 2];
508 result
->u
.kid2
= operandStack
[operandSP
- 1];
509 operandStack
[operandSP
- 2] = result
;
511 if (state
->treeDepth
== TREE_DEPTH_MAX
) {
512 ReportRegExpError(state
, JSREPORT_ERROR
, JSMSG_REGEXP_TOO_COMPLEX
);
518 * Look at both alternates to see if there's a FLAT or a CLASS at
519 * the start of each. If so, use a prerequisite match.
521 if (((RENode
*) result
->kid
)->op
== REOP_FLAT
&&
522 ((RENode
*) result
->u
.kid2
)->op
== REOP_FLAT
&&
523 (state
->flags
& JSREG_FOLD
) == 0) {
524 result
->op
= REOP_ALTPREREQ
;
525 result
->u
.altprereq
.ch1
= ((RENode
*) result
->kid
)->u
.flat
.chr
;
526 result
->u
.altprereq
.ch2
= ((RENode
*) result
->u
.kid2
)->u
.flat
.chr
;
527 /* ALTPREREQ, <end>, uch1, uch2, <next>, ...,
528 JUMP, <end> ... ENDALT */
529 state
->progLength
+= 13;
532 if (((RENode
*) result
->kid
)->op
== REOP_CLASS
&&
533 ((RENode
*) result
->kid
)->u
.ucclass
.index
< 256 &&
534 ((RENode
*) result
->u
.kid2
)->op
== REOP_FLAT
&&
535 (state
->flags
& JSREG_FOLD
) == 0) {
536 result
->op
= REOP_ALTPREREQ2
;
537 result
->u
.altprereq
.ch1
= ((RENode
*) result
->u
.kid2
)->u
.flat
.chr
;
538 result
->u
.altprereq
.ch2
= jschar(((RENode
*) result
->kid
)->u
.ucclass
.index
);
539 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
540 JUMP, <end> ... ENDALT */
541 state
->progLength
+= 13;
544 if (((RENode
*) result
->kid
)->op
== REOP_FLAT
&&
545 ((RENode
*) result
->u
.kid2
)->op
== REOP_CLASS
&&
546 ((RENode
*) result
->u
.kid2
)->u
.ucclass
.index
< 256 &&
547 (state
->flags
& JSREG_FOLD
) == 0) {
548 result
->op
= REOP_ALTPREREQ2
;
549 result
->u
.altprereq
.ch1
= ((RENode
*) result
->kid
)->u
.flat
.chr
;
550 result
->u
.altprereq
.ch2
=
551 jschar(((RENode
*) result
->u
.kid2
)->u
.ucclass
.index
);
552 /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
553 JUMP, <end> ... ENDALT */
554 state
->progLength
+= 13;
557 /* ALT, <next>, ..., JUMP, <end> ... ENDALT */
558 state
->progLength
+= 7;
563 result
= operandStack
[operandSP
- 2];
565 result
= result
->next
;
566 result
->next
= operandStack
[operandSP
- 1];
570 case REOP_ASSERT_NOT
:
573 /* These should have been processed by a close paren. */
574 ReportRegExpErrorHelper(state
, JSREPORT_ERROR
, JSMSG_MISSING_PAREN
,
584 * Parser forward declarations.
586 static JSBool
ParseTerm(CompilerState
*state
);
587 static JSBool
ParseQuantifier(CompilerState
*state
);
588 static intN
ParseMinMaxQuantifier(CompilerState
*state
, JSBool ignoreValues
);
591 * Top-down regular expression grammar, based closely on Perl4.
593 * regexp: altern A regular expression is one or more
594 * altern '|' regexp alternatives separated by vertical bar.
596 #define INITIAL_STACK_SIZE 128
599 ParseRegExp(CompilerState
*state
)
603 REOpData
*operatorStack
;
604 RENode
**operandStack
;
607 JSBool result
= JS_FALSE
;
609 intN operatorSP
= 0, operatorStackSize
= INITIAL_STACK_SIZE
;
610 intN operandSP
= 0, operandStackSize
= INITIAL_STACK_SIZE
;
612 /* Watch out for empty regexp */
613 if (state
->cp
== state
->cpend
) {
614 state
->result
= NewRENode(state
, REOP_EMPTY
);
615 return (state
->result
!= NULL
);
618 operatorStack
= (REOpData
*)
619 state
->context
->malloc(sizeof(REOpData
) * operatorStackSize
);
623 operandStack
= (RENode
**)
624 state
->context
->malloc(sizeof(RENode
*) * operandStackSize
);
629 parenIndex
= state
->parenCount
;
630 if (state
->cp
== state
->cpend
) {
632 * If we are at the end of the regexp and we're short one or more
633 * operands, the regexp must have the form /x|/ or some such, with
634 * left parentheses making us short more than one operand.
636 if (operatorSP
>= operandSP
) {
637 operand
= NewRENode(state
, REOP_EMPTY
);
643 switch (*state
->cp
) {
646 if (state
->cp
+ 1 < state
->cpend
&&
648 (state
->cp
[1] == '=' ||
649 state
->cp
[1] == '!' ||
650 state
->cp
[1] == ':')) {
651 switch (state
->cp
[1]) {
654 /* ASSERT, <next>, ... ASSERTTEST */
655 state
->progLength
+= 4;
658 op
= REOP_ASSERT_NOT
;
659 /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
660 state
->progLength
+= 4;
669 /* LPAREN, <index>, ... RPAREN, <index> */
671 += 2 * (1 + GetCompactIndexWidth(parenIndex
));
673 if (state
->parenCount
== 65535) {
674 ReportRegExpError(state
, JSREPORT_ERROR
,
675 JSMSG_TOO_MANY_PARENS
);
683 * If there's no stacked open parenthesis, throw syntax error.
685 for (i
= operatorSP
- 1; ; i
--) {
687 ReportRegExpError(state
, JSREPORT_ERROR
,
688 JSMSG_UNMATCHED_RIGHT_PAREN
);
691 if (operatorStack
[i
].op
== REOP_ASSERT
||
692 operatorStack
[i
].op
== REOP_ASSERT_NOT
||
693 operatorStack
[i
].op
== REOP_LPARENNON
||
694 operatorStack
[i
].op
== REOP_LPAREN
) {
701 /* Expected an operand before these, so make an empty one */
702 operand
= NewRENode(state
, REOP_EMPTY
);
708 if (!ParseTerm(state
))
710 operand
= state
->result
;
712 if (operandSP
== operandStackSize
) {
714 operandStackSize
+= operandStackSize
;
716 state
->context
->realloc(operandStack
,
717 sizeof(RENode
*) * operandStackSize
);
722 operandStack
[operandSP
++] = operand
;
727 /* At the end; process remaining operators. */
729 if (state
->cp
== state
->cpend
) {
732 if (!ProcessOp(state
, &operatorStack
[operatorSP
],
733 operandStack
, operandSP
))
737 JS_ASSERT(operandSP
== 1);
738 state
->result
= operandStack
[0];
743 switch (*state
->cp
) {
745 /* Process any stacked 'concat' operators */
748 operatorStack
[operatorSP
- 1].op
== REOP_CONCAT
) {
750 if (!ProcessOp(state
, &operatorStack
[operatorSP
],
751 operandStack
, operandSP
)) {
761 * If there's no stacked open parenthesis, throw syntax error.
763 for (i
= operatorSP
- 1; ; i
--) {
765 ReportRegExpError(state
, JSREPORT_ERROR
,
766 JSMSG_UNMATCHED_RIGHT_PAREN
);
769 if (operatorStack
[i
].op
== REOP_ASSERT
||
770 operatorStack
[i
].op
== REOP_ASSERT_NOT
||
771 operatorStack
[i
].op
== REOP_LPARENNON
||
772 operatorStack
[i
].op
== REOP_LPAREN
) {
778 /* Process everything on the stack until the open parenthesis. */
780 JS_ASSERT(operatorSP
);
782 switch (operatorStack
[operatorSP
].op
) {
784 case REOP_ASSERT_NOT
:
786 operand
= NewRENode(state
, operatorStack
[operatorSP
].op
);
789 operand
->u
.parenIndex
=
790 operatorStack
[operatorSP
].parenIndex
;
791 JS_ASSERT(operandSP
);
792 operand
->kid
= operandStack
[operandSP
- 1];
793 operandStack
[operandSP
- 1] = operand
;
794 if (state
->treeDepth
== TREE_DEPTH_MAX
) {
795 ReportRegExpError(state
, JSREPORT_ERROR
,
796 JSMSG_REGEXP_TOO_COMPLEX
);
803 state
->result
= operandStack
[operandSP
- 1];
804 if (!ParseQuantifier(state
))
806 operandStack
[operandSP
- 1] = state
->result
;
807 goto restartOperator
;
809 if (!ProcessOp(state
, &operatorStack
[operatorSP
],
810 operandStack
, operandSP
))
820 const jschar
*errp
= state
->cp
;
822 if (ParseMinMaxQuantifier(state
, JS_TRUE
) < 0) {
824 * This didn't even scan correctly as a quantifier, so we should
838 ReportRegExpErrorHelper(state
, JSREPORT_ERROR
, JSMSG_BAD_QUANTIFIER
,
844 /* Anything else is the start of the next term. */
847 if (operatorSP
== operatorStackSize
) {
849 operatorStackSize
+= operatorStackSize
;
851 state
->context
->realloc(operatorStack
,
852 sizeof(REOpData
) * operatorStackSize
);
857 operatorStack
[operatorSP
].op
= op
;
858 operatorStack
[operatorSP
].errPos
= state
->cp
;
859 operatorStack
[operatorSP
++].parenIndex
= parenIndex
;
865 state
->context
->free(operatorStack
);
867 state
->context
->free(operandStack
);
872 * Hack two bits in CompilerState.flags, for use within FindParenCount to flag
873 * its being on the stack, and to propagate errors to its callers.
875 #define JSREG_FIND_PAREN_COUNT 0x8000
876 #define JSREG_FIND_PAREN_ERROR 0x4000
879 * Magic return value from FindParenCount and GetDecimalValue, to indicate
880 * overflow beyond GetDecimalValue's max parameter, or a computed maximum if
881 * its findMax parameter is non-null.
883 #define OVERFLOW_VALUE ((uintN)-1)
886 FindParenCount(CompilerState
*state
)
891 if (state
->flags
& JSREG_FIND_PAREN_COUNT
)
892 return OVERFLOW_VALUE
;
895 * Copy state into temp, flag it so we never report an invalid backref,
896 * and reset its members to parse the entire regexp. This is obviously
897 * suboptimal, but GetDecimalValue calls us only if a backref appears to
898 * refer to a forward parenthetical, which is rare.
901 temp
.flags
|= JSREG_FIND_PAREN_COUNT
;
902 temp
.cp
= temp
.cpbegin
;
907 temp
.classBitmapsMem
= 0;
908 for (i
= 0; i
< CLASS_CACHE_SIZE
; i
++)
909 temp
.classCache
[i
].start
= NULL
;
911 if (!ParseRegExp(&temp
)) {
912 state
->flags
|= JSREG_FIND_PAREN_ERROR
;
913 return OVERFLOW_VALUE
;
915 return temp
.parenCount
;
919 * Extract and return a decimal value at state->cp. The initial character c
920 * has already been read. Return OVERFLOW_VALUE if the result exceeds max.
921 * Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in
922 * state->flags to discover whether an error occurred under findMax.
925 GetDecimalValue(jschar c
, uintN max
, uintN (*findMax
)(CompilerState
*state
),
926 CompilerState
*state
)
928 uintN value
= JS7_UNDEC(c
);
929 JSBool overflow
= (value
> max
&& (!findMax
|| value
> findMax(state
)));
931 /* The following restriction allows simpler overflow checks. */
932 JS_ASSERT(max
<= ((uintN
)-1 - 9) / 10);
933 while (state
->cp
< state
->cpend
) {
937 value
= 10 * value
+ JS7_UNDEC(c
);
938 if (!overflow
&& value
> max
&& (!findMax
|| value
> findMax(state
)))
942 return overflow
? OVERFLOW_VALUE
: value
;
946 * Calculate the total size of the bitmap required for a class expression.
949 CalculateBitmapSize(CompilerState
*state
, RENode
*target
, const jschar
*src
,
953 JSBool inRange
= JS_FALSE
;
954 jschar c
, rangeStart
= 0;
955 uintN n
, digit
, nDigits
, i
;
957 target
->u
.ucclass
.bmsize
= 0;
958 target
->u
.ucclass
.sense
= JS_TRUE
;
965 target
->u
.ucclass
.sense
= JS_FALSE
;
969 JSBool canStartRange
= JS_TRUE
;
996 if (src
< end
&& RE_IS_LETTER(*src
)) {
997 localMax
= (uintN
) (*src
++) & 0x1F;
1010 for (i
= 0; (i
< nDigits
) && (src
< end
); i
++) {
1012 if (!isASCIIHexDigit(c
, &digit
)) {
1014 * Back off to accepting the original
1021 n
= (n
<< 4) | digit
;
1023 localMax
= jschar(n
);
1026 canStartRange
= JS_FALSE
;
1028 JS_ReportErrorNumber(state
->context
,
1029 js_GetErrorMessage
, NULL
,
1030 JSMSG_BAD_CLASS_RANGE
);
1040 canStartRange
= JS_FALSE
;
1042 JS_ReportErrorNumber(state
->context
,
1043 js_GetErrorMessage
, NULL
,
1044 JSMSG_BAD_CLASS_RANGE
);
1050 * If this is the start of a range, ensure that it's less than
1064 * This is a non-ECMA extension - decimal escapes (in this
1065 * case, octal!) are supposed to be an error inside class
1066 * ranges, but supported here for backwards compatibility.
1071 if ('0' <= c
&& c
<= '7') {
1073 n
= 8 * n
+ JS7_UNDEC(c
);
1075 if ('0' <= c
&& c
<= '7') {
1077 i
= 8 * n
+ JS7_UNDEC(c
);
1084 localMax
= jschar(n
);
1098 /* Throw a SyntaxError here, per ECMA-262, 15.10.2.15. */
1099 if (rangeStart
> localMax
) {
1100 JS_ReportErrorNumber(state
->context
,
1101 js_GetErrorMessage
, NULL
,
1102 JSMSG_BAD_CLASS_RANGE
);
1107 if (canStartRange
&& src
< end
- 1) {
1111 rangeStart
= (jschar
)localMax
;
1115 if (state
->flags
& JSREG_FOLD
)
1116 rangeStart
= localMax
; /* one run of the uc/dc loop below */
1119 if (state
->flags
& JSREG_FOLD
) {
1120 jschar maxch
= localMax
;
1122 for (i
= rangeStart
; i
<= localMax
; i
++) {
1125 uch
= jschar(upcase(i
));
1126 dch
= inverse_upcase(jschar(i
));
1127 maxch
= JS_MAX(maxch
, uch
);
1128 maxch
= JS_MAX(maxch
, dch
);
1134 max
= uintN(localMax
);
1136 target
->u
.ucclass
.bmsize
= uint16(max
);
1141 * item: assertion An item is either an assertion or
1142 * quantatom a quantified atom.
1144 * assertion: '^' Assertions match beginning of string
1145 * (or line if the class static property
1146 * RegExp.multiline is true).
1147 * '$' End of string (or line if the class
1148 * static property RegExp.multiline is
1150 * '\b' Word boundary (between \w and \W).
1151 * '\B' Word non-boundary.
1153 * quantatom: atom An unquantified atom.
1154 * quantatom '{' n ',' m '}'
1155 * Atom must occur between n and m times.
1156 * quantatom '{' n ',' '}' Atom must occur at least n times.
1157 * quantatom '{' n '}' Atom must occur exactly n times.
1158 * quantatom '*' Zero or more times (same as {0,}).
1159 * quantatom '+' One or more times (same as {1,}).
1160 * quantatom '?' Zero or one time (same as {0,1}).
1162 * any of which can be optionally followed by '?' for ungreedy
1164 * atom: '(' regexp ')' A parenthesized regexp (what matched
1165 * can be addressed using a backreference,
1167 * '.' Matches any char except '\n'.
1168 * '[' classlist ']' A character class.
1169 * '[' '^' classlist ']' A negated character class.
1171 * '\n' Newline (Line Feed).
1172 * '\r' Carriage Return.
1173 * '\t' Horizontal Tab.
1174 * '\v' Vertical Tab.
1175 * '\d' A digit (same as [0-9]).
1177 * '\w' A word character, [0-9a-z_A-Z].
1178 * '\W' A non-word character.
1179 * '\s' A whitespace character, [ \b\f\n\r\t\v].
1180 * '\S' A non-whitespace character.
1181 * '\' n A backreference to the nth (n decimal
1182 * and positive) parenthesized expression.
1183 * '\' octal An octal escape sequence (octal must be
1184 * two or three digits long, unless it is
1185 * 0 for the null character).
1186 * '\x' hex A hex escape (hex must be two digits).
1187 * '\u' unicode A unicode escape (must be four digits).
1188 * '\c' ctrl A control character, ctrl is a letter.
1189 * '\' literalatomchar Any character except one of the above
1190 * that follow '\' in an atom.
1191 * otheratomchar Any character not first among the other
1192 * atom right-hand sides.
1195 ParseTerm(CompilerState
*state
)
1197 jschar c
= *state
->cp
++;
1199 uintN num
, tmp
, n
, i
;
1200 const jschar
*termStart
;
1203 /* assertions and atoms */
1205 state
->result
= NewRENode(state
, REOP_BOL
);
1208 state
->progLength
++;
1211 state
->result
= NewRENode(state
, REOP_EOL
);
1214 state
->progLength
++;
1217 if (state
->cp
>= state
->cpend
) {
1218 /* a trailing '\' is an error */
1219 ReportRegExpError(state
, JSREPORT_ERROR
, JSMSG_TRAILING_SLASH
);
1224 /* assertion escapes */
1226 state
->result
= NewRENode(state
, REOP_WBDRY
);
1229 state
->progLength
++;
1232 state
->result
= NewRENode(state
, REOP_WNONBDRY
);
1235 state
->progLength
++;
1237 /* Decimal escape */
1239 /* Give a strict warning. See also the note below. */
1240 if (!ReportRegExpError(state
, JSREPORT_WARNING
| JSREPORT_STRICT
,
1241 JSMSG_INVALID_BACKREF
)) {
1246 while (state
->cp
< state
->cpend
) {
1248 if (c
< '0' || '7' < c
)
1251 tmp
= 8 * num
+ (uintN
)JS7_UNDEC(c
);
1258 state
->result
= NewRENode(state
, REOP_FLAT
);
1261 state
->result
->u
.flat
.chr
= c
;
1262 state
->result
->u
.flat
.length
= 1;
1263 state
->progLength
+= 3;
1274 termStart
= state
->cp
- 1;
1275 num
= GetDecimalValue(c
, state
->parenCount
, FindParenCount
, state
);
1276 if (state
->flags
& JSREG_FIND_PAREN_ERROR
)
1278 if (num
== OVERFLOW_VALUE
) {
1279 /* Give a strict mode warning. */
1280 if (!ReportRegExpError(state
,
1281 JSREPORT_WARNING
| JSREPORT_STRICT
,
1283 ? JSMSG_INVALID_BACKREF
1284 : JSMSG_BAD_BACKREF
)) {
1289 * Note: ECMA 262, 15.10.2.9 says that we should throw a syntax
1290 * error here. However, for compatibility with IE, we treat the
1291 * whole backref as flat if the first character in it is not a
1292 * valid octal character, and as an octal escape otherwise.
1294 state
->cp
= termStart
;
1296 /* Treat this as flat. termStart - 1 is the \. */
1301 /* Treat this as an octal escape. */
1306 * When FindParenCount calls the regex parser recursively (to find
1307 * the number of backrefs) num can be arbitrary and the maximum
1308 * supported number of backrefs does not bound it.
1310 JS_ASSERT_IF(!(state
->flags
& JSREG_FIND_PAREN_COUNT
),
1311 1 <= num
&& num
<= 0x10000);
1312 state
->result
= NewRENode(state
, REOP_BACKREF
);
1315 state
->result
->u
.parenIndex
= num
- 1;
1317 += 1 + GetCompactIndexWidth(state
->result
->u
.parenIndex
);
1319 /* Control escape */
1335 /* Control letter */
1337 if (state
->cp
< state
->cpend
&& RE_IS_LETTER(*state
->cp
)) {
1338 c
= (jschar
) (*state
->cp
++ & 0x1F);
1340 /* back off to accepting the original '\' as a literal */
1345 /* HexEscapeSequence */
1349 /* UnicodeEscapeSequence */
1354 for (i
= 0; i
< nDigits
&& state
->cp
< state
->cpend
; i
++) {
1357 if (!isASCIIHexDigit(c
, &digit
)) {
1359 * Back off to accepting the original 'u' or 'x' as a
1366 n
= (n
<< 4) | digit
;
1370 /* Character class escapes */
1372 state
->result
= NewRENode(state
, REOP_DIGIT
);
1376 state
->progLength
++;
1379 state
->result
= NewRENode(state
, REOP_NONDIGIT
);
1382 state
->result
= NewRENode(state
, REOP_SPACE
);
1385 state
->result
= NewRENode(state
, REOP_NONSPACE
);
1388 state
->result
= NewRENode(state
, REOP_ALNUM
);
1391 state
->result
= NewRENode(state
, REOP_NONALNUM
);
1393 /* IdentityEscape */
1395 state
->result
= NewRENode(state
, REOP_FLAT
);
1398 state
->result
->u
.flat
.chr
= c
;
1399 state
->result
->u
.flat
.length
= 1;
1400 state
->result
->kid
= (void *) (state
->cp
- 1);
1401 state
->progLength
+= 3;
1406 state
->result
= NewRENode(state
, REOP_CLASS
);
1409 termStart
= state
->cp
;
1410 state
->result
->u
.ucclass
.startIndex
= termStart
- state
->cpbegin
;
1412 if (state
->cp
== state
->cpend
) {
1413 ReportRegExpErrorHelper(state
, JSREPORT_ERROR
,
1414 JSMSG_UNTERM_CLASS
, termStart
);
1418 if (*state
->cp
== '\\') {
1420 if (state
->cp
!= state
->cpend
)
1424 if (*state
->cp
== ']') {
1425 state
->result
->u
.ucclass
.kidlen
= state
->cp
- termStart
;
1430 for (i
= 0; i
< CLASS_CACHE_SIZE
; i
++) {
1431 if (!state
->classCache
[i
].start
) {
1432 state
->classCache
[i
].start
= termStart
;
1433 state
->classCache
[i
].length
= state
->result
->u
.ucclass
.kidlen
;
1434 state
->classCache
[i
].index
= state
->classCount
;
1437 if (state
->classCache
[i
].length
==
1438 state
->result
->u
.ucclass
.kidlen
) {
1439 for (n
= 0; ; n
++) {
1440 if (n
== state
->classCache
[i
].length
) {
1441 state
->result
->u
.ucclass
.index
1442 = state
->classCache
[i
].index
;
1445 if (state
->classCache
[i
].start
[n
] != termStart
[n
])
1450 state
->result
->u
.ucclass
.index
= state
->classCount
++;
1454 * Call CalculateBitmapSize now as we want any errors it finds
1455 * to be reported during the parse phase, not at execution.
1457 if (!CalculateBitmapSize(state
, state
->result
, termStart
, state
->cp
++))
1460 * Update classBitmapsMem with number of bytes to hold bmsize bits,
1461 * which is (bitsCount + 7) / 8 or (highest_bit + 1 + 7) / 8
1462 * or highest_bit / 8 + 1 where highest_bit is u.ucclass.bmsize.
1464 n
= (state
->result
->u
.ucclass
.bmsize
>> 3) + 1;
1465 if (n
> CLASS_BITMAPS_MEM_LIMIT
- state
->classBitmapsMem
) {
1466 ReportRegExpError(state
, JSREPORT_ERROR
, JSMSG_REGEXP_TOO_COMPLEX
);
1469 state
->classBitmapsMem
+= n
;
1470 /* CLASS, <index> */
1472 += 1 + GetCompactIndexWidth(state
->result
->u
.ucclass
.index
);
1476 state
->result
= NewRENode(state
, REOP_DOT
);
1481 const jschar
*errp
= state
->cp
--;
1484 err
= ParseMinMaxQuantifier(state
, JS_TRUE
);
1495 ReportRegExpErrorHelper(state
, JSREPORT_ERROR
,
1496 JSMSG_BAD_QUANTIFIER
, state
->cp
- 1);
1500 state
->result
= NewRENode(state
, REOP_FLAT
);
1503 state
->result
->u
.flat
.chr
= c
;
1504 state
->result
->u
.flat
.length
= 1;
1505 state
->result
->kid
= (void *) (state
->cp
- 1);
1506 state
->progLength
+= 3;
1509 return ParseQuantifier(state
);
1513 ParseQuantifier(CompilerState
*state
)
1516 term
= state
->result
;
1517 if (state
->cp
< state
->cpend
) {
1518 switch (*state
->cp
) {
1520 state
->result
= NewRENode(state
, REOP_QUANT
);
1523 state
->result
->u
.range
.min
= 1;
1524 state
->result
->u
.range
.max
= (uintN
)-1;
1525 /* <PLUS>, <next> ... <ENDCHILD> */
1526 state
->progLength
+= 4;
1529 state
->result
= NewRENode(state
, REOP_QUANT
);
1532 state
->result
->u
.range
.min
= 0;
1533 state
->result
->u
.range
.max
= (uintN
)-1;
1534 /* <STAR>, <next> ... <ENDCHILD> */
1535 state
->progLength
+= 4;
1538 state
->result
= NewRENode(state
, REOP_QUANT
);
1541 state
->result
->u
.range
.min
= 0;
1542 state
->result
->u
.range
.max
= 1;
1543 /* <OPT>, <next> ... <ENDCHILD> */
1544 state
->progLength
+= 4;
1546 case '{': /* balance '}' */
1549 const jschar
*errp
= state
->cp
;
1551 err
= ParseMinMaxQuantifier(state
, JS_FALSE
);
1557 ReportRegExpErrorHelper(state
, JSREPORT_ERROR
, err
, errp
);
1566 if (state
->treeDepth
== TREE_DEPTH_MAX
) {
1567 ReportRegExpError(state
, JSREPORT_ERROR
, JSMSG_REGEXP_TOO_COMPLEX
);
1573 state
->result
->kid
= term
;
1574 if (state
->cp
< state
->cpend
&& *state
->cp
== '?') {
1576 state
->result
->u
.range
.greedy
= JS_FALSE
;
1578 state
->result
->u
.range
.greedy
= JS_TRUE
;
1584 ParseMinMaxQuantifier(CompilerState
*state
, JSBool ignoreValues
)
1588 const jschar
*errp
= state
->cp
++;
1593 min
= GetDecimalValue(c
, 0xFFFF, NULL
, state
);
1596 if (!ignoreValues
&& min
== OVERFLOW_VALUE
)
1597 return JSMSG_MIN_TOO_BIG
;
1603 max
= GetDecimalValue(c
, 0xFFFF, NULL
, state
);
1605 if (!ignoreValues
&& max
== OVERFLOW_VALUE
)
1606 return JSMSG_MAX_TOO_BIG
;
1607 if (!ignoreValues
&& min
> max
)
1608 return JSMSG_OUT_OF_ORDER
;
1616 state
->result
= NewRENode(state
, REOP_QUANT
);
1618 return JSMSG_OUT_OF_MEMORY
;
1619 state
->result
->u
.range
.min
= min
;
1620 state
->result
->u
.range
.max
= max
;
1622 * QUANT, <min>, <max>, <next> ... <ENDCHILD>
1623 * where <max> is written as compact(max+1) to make
1624 * (uintN)-1 sentinel to occupy 1 byte, not width_of(max)+1.
1626 state
->progLength
+= (1 + GetCompactIndexWidth(min
)
1627 + GetCompactIndexWidth(max
+ 1)
1638 SetForwardJumpOffset(jsbytecode
*jump
, jsbytecode
*target
)
1640 ptrdiff_t offset
= target
- jump
;
1642 /* Check that target really points forward. */
1643 JS_ASSERT(offset
>= 2);
1644 if ((size_t)offset
> OFFSET_MAX
)
1647 jump
[0] = JUMP_OFFSET_HI(offset
);
1648 jump
[1] = JUMP_OFFSET_LO(offset
);
1652 /* Copy the charset data from a character class node to the charset list
1653 * in the regexp object. */
1654 static JS_ALWAYS_INLINE RECharSet
*
1655 InitNodeCharSet(JSRegExp
*re
, RENode
*node
)
1657 RECharSet
*charSet
= &re
->classList
[node
->u
.ucclass
.index
];
1658 charSet
->converted
= JS_FALSE
;
1659 charSet
->length
= node
->u
.ucclass
.bmsize
;
1660 charSet
->u
.src
.startIndex
= node
->u
.ucclass
.startIndex
;
1661 charSet
->u
.src
.length
= node
->u
.ucclass
.kidlen
;
1662 charSet
->sense
= node
->u
.ucclass
.sense
;
1667 * Generate bytecode for the tree rooted at t using an explicit stack instead
1671 EmitREBytecode(CompilerState
*state
, JSRegExp
*re
, size_t treeDepth
,
1672 jsbytecode
*pc
, RENode
*t
)
1674 EmitStateStackEntry
*emitStateSP
, *emitStateStack
;
1677 if (treeDepth
== 0) {
1678 emitStateStack
= NULL
;
1681 (EmitStateStackEntry
*)
1682 state
->context
->malloc(sizeof(EmitStateStackEntry
) * treeDepth
);
1683 if (!emitStateStack
)
1686 emitStateSP
= emitStateStack
;
1688 JS_ASSERT(op
< REOP_LIMIT
);
1697 case REOP_ALTPREREQ2
:
1698 case REOP_ALTPREREQ
:
1699 JS_ASSERT(emitStateSP
);
1700 emitStateSP
->altHead
= pc
- 1;
1701 emitStateSP
->endTermFixup
= pc
;
1703 SET_ARG(pc
, t
->u
.altprereq
.ch1
);
1705 SET_ARG(pc
, t
->u
.altprereq
.ch2
);
1708 emitStateSP
->nextAltFixup
= pc
; /* offset to next alternate */
1711 emitStateSP
->continueNode
= t
;
1712 emitStateSP
->continueOp
= REOP_JUMP
;
1713 emitStateSP
->jumpToJumpFlag
= JS_FALSE
;
1715 JS_ASSERT((size_t)(emitStateSP
- emitStateStack
) <= treeDepth
);
1716 t
= (RENode
*) t
->kid
;
1718 JS_ASSERT(op
< REOP_LIMIT
);
1722 emitStateSP
->nextTermFixup
= pc
; /* offset to following term */
1724 if (!SetForwardJumpOffset(emitStateSP
->nextAltFixup
, pc
))
1726 emitStateSP
->continueOp
= REOP_ENDALT
;
1728 JS_ASSERT((size_t)(emitStateSP
- emitStateStack
) <= treeDepth
);
1729 t
= (RENode
*) t
->u
.kid2
;
1731 JS_ASSERT(op
< REOP_LIMIT
);
1736 * If we already patched emitStateSP->nextTermFixup to jump to
1737 * a nearer jump, to avoid 16-bit immediate offset overflow, we
1740 if (emitStateSP
->jumpToJumpFlag
)
1744 * Fix up the REOP_JUMP offset to go to the op after REOP_ENDALT.
1745 * REOP_ENDALT is executed only on successful match of the last
1746 * alternate in a group.
1748 if (!SetForwardJumpOffset(emitStateSP
->nextTermFixup
, pc
))
1750 if (t
->op
!= REOP_ALT
) {
1751 if (!SetForwardJumpOffset(emitStateSP
->endTermFixup
, pc
))
1756 * If the program is bigger than the REOP_JUMP offset range, then
1757 * we must check for alternates before this one that are part of
1758 * the same group, and fix up their jump offsets to target jumps
1759 * close enough to fit in a 16-bit unsigned offset immediate.
1761 if ((size_t)(pc
- re
->program
) > OFFSET_MAX
&&
1762 emitStateSP
> emitStateStack
) {
1763 EmitStateStackEntry
*esp
, *esp2
;
1764 jsbytecode
*alt
, *jump
;
1765 ptrdiff_t span
, header
;
1768 alt
= esp2
->altHead
;
1769 for (esp
= esp2
- 1; esp
>= emitStateStack
; --esp
) {
1770 if (esp
->continueOp
== REOP_ENDALT
&&
1771 !esp
->jumpToJumpFlag
&&
1772 esp
->nextTermFixup
+ OFFSET_LEN
== alt
&&
1773 (size_t)(pc
- ((esp
->continueNode
->op
!= REOP_ALT
)
1775 : esp
->nextTermFixup
)) > OFFSET_MAX
) {
1777 jump
= esp
->nextTermFixup
;
1780 * The span must be 1 less than the distance from
1781 * jump offset to jump offset, so we actually jump
1782 * to a REOP_JUMP bytecode, not to its offset!
1785 JS_ASSERT(jump
< esp2
->nextTermFixup
);
1786 span
= esp2
->nextTermFixup
- jump
- 1;
1787 if ((size_t)span
<= OFFSET_MAX
)
1792 } while (esp2
->continueOp
!= REOP_ENDALT
);
1795 jump
[0] = JUMP_OFFSET_HI(span
);
1796 jump
[1] = JUMP_OFFSET_LO(span
);
1798 if (esp
->continueNode
->op
!= REOP_ALT
) {
1800 * We must patch the offset at esp->endTermFixup
1801 * as well, for the REOP_ALTPREREQ{,2} opcodes.
1802 * If we're unlucky and endTermFixup is more than
1803 * OFFSET_MAX bytes from its target, we cheat by
1804 * jumping 6 bytes to the jump whose offset is at
1805 * esp->nextTermFixup, which has the same target.
1807 jump
= esp
->endTermFixup
;
1808 header
= esp
->nextTermFixup
- jump
;
1810 if ((size_t)span
> OFFSET_MAX
)
1813 jump
[0] = JUMP_OFFSET_HI(span
);
1814 jump
[1] = JUMP_OFFSET_LO(span
);
1817 esp
->jumpToJumpFlag
= JS_TRUE
;
1824 JS_ASSERT(emitStateSP
);
1825 emitStateSP
->altHead
= pc
- 1;
1826 emitStateSP
->nextAltFixup
= pc
; /* offset to next alternate */
1828 emitStateSP
->continueNode
= t
;
1829 emitStateSP
->continueOp
= REOP_JUMP
;
1830 emitStateSP
->jumpToJumpFlag
= JS_FALSE
;
1832 JS_ASSERT((size_t)(emitStateSP
- emitStateStack
) <= treeDepth
);
1833 t
= (RENode
*) t
->kid
;
1835 JS_ASSERT(op
< REOP_LIMIT
);
1840 * Coalesce FLATs if possible and if it would not increase bytecode
1841 * beyond preallocated limit. The latter happens only when bytecode
1842 * size for coalesced string with offset p and length 2 exceeds 6
1843 * bytes preallocated for 2 single char nodes, i.e. when
1844 * 1 + GetCompactIndexWidth(p) + GetCompactIndexWidth(2) > 6 or
1845 * GetCompactIndexWidth(p) > 4.
1846 * Since when GetCompactIndexWidth(p) <= 4 coalescing of 3 or more
1847 * nodes strictly decreases bytecode size, the check has to be
1848 * done only for the first coalescing.
1851 GetCompactIndexWidth((jschar
*)t
->kid
- state
->cpbegin
) <= 4)
1854 t
->next
->op
== REOP_FLAT
&&
1855 (jschar
*)t
->kid
+ t
->u
.flat
.length
==
1856 (jschar
*)t
->next
->kid
) {
1857 t
->u
.flat
.length
+= t
->next
->u
.flat
.length
;
1858 t
->next
= t
->next
->next
;
1861 if (t
->kid
&& t
->u
.flat
.length
> 1) {
1862 pc
[-1] = (state
->flags
& JSREG_FOLD
) ? REOP_FLATi
: REOP_FLAT
;
1863 pc
= WriteCompactIndex(pc
, (jschar
*)t
->kid
- state
->cpbegin
);
1864 pc
= WriteCompactIndex(pc
, t
->u
.flat
.length
);
1865 } else if (t
->u
.flat
.chr
< 256) {
1866 pc
[-1] = (state
->flags
& JSREG_FOLD
) ? REOP_FLAT1i
: REOP_FLAT1
;
1867 *pc
++ = (jsbytecode
) t
->u
.flat
.chr
;
1869 pc
[-1] = (state
->flags
& JSREG_FOLD
)
1872 SET_ARG(pc
, t
->u
.flat
.chr
);
1878 JS_ASSERT(emitStateSP
);
1879 pc
= WriteCompactIndex(pc
, t
->u
.parenIndex
);
1880 emitStateSP
->continueNode
= t
;
1881 emitStateSP
->continueOp
= REOP_RPAREN
;
1883 JS_ASSERT((size_t)(emitStateSP
- emitStateStack
) <= treeDepth
);
1884 t
= (RENode
*) t
->kid
;
1889 pc
= WriteCompactIndex(pc
, t
->u
.parenIndex
);
1893 pc
= WriteCompactIndex(pc
, t
->u
.parenIndex
);
1897 JS_ASSERT(emitStateSP
);
1898 emitStateSP
->nextTermFixup
= pc
;
1900 emitStateSP
->continueNode
= t
;
1901 emitStateSP
->continueOp
= REOP_ASSERTTEST
;
1903 JS_ASSERT((size_t)(emitStateSP
- emitStateStack
) <= treeDepth
);
1904 t
= (RENode
*) t
->kid
;
1908 case REOP_ASSERTTEST
:
1909 case REOP_ASSERTNOTTEST
:
1910 if (!SetForwardJumpOffset(emitStateSP
->nextTermFixup
, pc
))
1914 case REOP_ASSERT_NOT
:
1915 JS_ASSERT(emitStateSP
);
1916 emitStateSP
->nextTermFixup
= pc
;
1918 emitStateSP
->continueNode
= t
;
1919 emitStateSP
->continueOp
= REOP_ASSERTNOTTEST
;
1921 JS_ASSERT((size_t)(emitStateSP
- emitStateStack
) <= treeDepth
);
1922 t
= (RENode
*) t
->kid
;
1927 JS_ASSERT(emitStateSP
);
1928 if (t
->u
.range
.min
== 0 && t
->u
.range
.max
== (uintN
)-1) {
1929 pc
[-1] = (t
->u
.range
.greedy
) ? REOP_STAR
: REOP_MINIMALSTAR
;
1930 } else if (t
->u
.range
.min
== 0 && t
->u
.range
.max
== 1) {
1931 pc
[-1] = (t
->u
.range
.greedy
) ? REOP_OPT
: REOP_MINIMALOPT
;
1932 } else if (t
->u
.range
.min
== 1 && t
->u
.range
.max
== (uintN
) -1) {
1933 pc
[-1] = (t
->u
.range
.greedy
) ? REOP_PLUS
: REOP_MINIMALPLUS
;
1935 if (!t
->u
.range
.greedy
)
1936 pc
[-1] = REOP_MINIMALQUANT
;
1937 pc
= WriteCompactIndex(pc
, t
->u
.range
.min
);
1939 * Write max + 1 to avoid using size_t(max) + 1 bytes
1940 * for (uintN)-1 sentinel.
1942 pc
= WriteCompactIndex(pc
, t
->u
.range
.max
+ 1);
1944 emitStateSP
->nextTermFixup
= pc
;
1946 emitStateSP
->continueNode
= t
;
1947 emitStateSP
->continueOp
= REOP_ENDCHILD
;
1949 JS_ASSERT((size_t)(emitStateSP
- emitStateStack
) <= treeDepth
);
1950 t
= (RENode
*) t
->kid
;
1955 if (!SetForwardJumpOffset(emitStateSP
->nextTermFixup
, pc
))
1960 if (!t
->u
.ucclass
.sense
)
1961 pc
[-1] = REOP_NCLASS
;
1962 pc
= WriteCompactIndex(pc
, t
->u
.ucclass
.index
);
1963 InitNodeCharSet(re
, t
);
1974 if (emitStateSP
== emitStateStack
)
1977 t
= emitStateSP
->continueNode
;
1978 op
= (REOp
) emitStateSP
->continueOp
;
1984 state
->context
->free(emitStateStack
);
1988 ReportRegExpError(state
, JSREPORT_ERROR
, JSMSG_REGEXP_TOO_COMPLEX
);
1994 CompileRegExpToAST(JSContext
* cx
, TokenStream
* ts
,
1995 JSString
* str
, uintN flags
, CompilerState
& state
)
2000 len
= str
->length();
2003 state
.tokenStream
= ts
;
2004 state
.cp
= js_UndependString(cx
, str
);
2007 state
.cpbegin
= state
.cp
;
2008 state
.cpend
= state
.cp
+ len
;
2009 state
.flags
= uint16(flags
);
2010 state
.parenCount
= 0;
2011 state
.classCount
= 0;
2012 state
.progLength
= 0;
2013 state
.treeDepth
= 0;
2014 state
.classBitmapsMem
= 0;
2015 for (i
= 0; i
< CLASS_CACHE_SIZE
; i
++)
2016 state
.classCache
[i
].start
= NULL
;
2018 if (len
!= 0 && (flags
& JSREG_FLAT
)) {
2019 state
.result
= NewRENode(&state
, REOP_FLAT
);
2022 state
.result
->u
.flat
.chr
= *state
.cpbegin
;
2023 state
.result
->u
.flat
.length
= len
;
2024 state
.result
->kid
= (void *) state
.cpbegin
;
2025 /* Flat bytecode: REOP_FLAT compact(string_offset) compact(len). */
2026 state
.progLength
+= 1 + GetCompactIndexWidth(0)
2027 + GetCompactIndexWidth(len
);
2031 return ParseRegExp(&state
);
2035 typedef js::Vector
<LIns
*, 4, js::ContextAllocPolicy
> LInsList
;
2039 struct REFragment
: public nanojit::Fragment
2041 REFragment(const void* _ip
verbose_only(, uint32_t profFragID
))
2042 : nanojit::Fragment(ip
verbose_only(, profFragID
))
2046 } /* namespace js */
2048 /* Return the cached fragment for the given regexp, or create one. */
2050 LookupNativeRegExp(JSContext
* cx
, uint16 re_flags
,
2051 const jschar
* re_chars
, size_t re_length
)
2053 TraceMonitor
*tm
= &JS_TRACE_MONITOR(cx
);
2054 VMAllocator
&alloc
= *tm
->dataAlloc
;
2055 REHashMap
&table
= *tm
->reFragments
;
2057 REHashKey
k(re_length
, re_flags
, re_chars
);
2058 REFragment
*frag
= table
.get(k
);
2062 uint32_t profFragID
= (LogController
.lcbits
& LC_FragProfile
)
2063 ? (++(tm
->lastFragID
)) : 0;
2065 frag
= new (alloc
) REFragment(0 verbose_only(, profFragID
));
2067 * Copy the re_chars portion of the hash key into the Allocator, so
2068 * its lifecycle is disconnected from the lifecycle of the
2069 * underlying regexp.
2071 k
.re_chars
= (const jschar
*) new (alloc
) jschar
[re_length
];
2072 memcpy((void*) k
.re_chars
, re_chars
, re_length
* sizeof(jschar
));
2079 ProcessCharSet(JSContext
*cx
, JSRegExp
*re
, RECharSet
*charSet
);
2081 /* Utilities for the RegExpNativeCompiler */
2085 * An efficient way to simultaneously statically guard that the sizeof(bool) is a
2086 * small power of 2 and take its log2.
2088 template <int> struct StaticLog2
{};
2089 template <> struct StaticLog2
<1> { static const int result
= 0; };
2090 template <> struct StaticLog2
<2> { static const int result
= 1; };
2091 template <> struct StaticLog2
<4> { static const int result
= 2; };
2092 template <> struct StaticLog2
<8> { static const int result
= 3; };
2096 * This table allows efficient testing for the ASCII portion of \s during a
2097 * trace. ECMA-262 15.10.2.12 defines the following characters below 128 to be
2098 * whitespace: 0x9 (0), 0xA (10), 0xB (11), 0xC (12), 0xD (13), 0x20 (32). The
2099 * index must be <= 32.
2101 static const bool js_ws
[] = {
2102 /* 0 1 2 3 4 5 5 7 8 9 */
2103 /* 0 */ false, false, false, false, false, false, false, false, false, true,
2104 /* 1 */ true, true, true, true, false, false, false, false, false, false,
2105 /* 2 */ false, false, false, false, false, false, false, false, false, false,
2106 /* 3 */ false, false, true
2109 /* Sets of characters are described in terms of individuals and classes. */
2112 CharSet() : charEnd(charBuf
), classes(0) {}
2114 static const uintN sBufSize
= 8;
2116 bool full() { return charEnd
== charBuf
+ sBufSize
; }
2118 /* Add a single char to the set. */
2119 bool addChar(jschar c
)
2128 LineTerms
= 1 << 0, /* Line Terminators (E262 7.3) */
2129 OtherSpace
= 1 << 1, /* \s (E262 15.10.2.12) - LineTerms */
2130 Digit
= 1 << 2, /* \d (E262 15.10.2.12) */
2131 OtherAlnum
= 1 << 3, /* \w (E262 15,10.2.12) - Digit */
2132 Other
= 1 << 4, /* all other characters */
2133 All
= LineTerms
| OtherSpace
| Digit
| OtherAlnum
| Other
,
2135 Space
= LineTerms
| OtherSpace
,
2136 AlNum
= Digit
| OtherAlnum
,
2137 Dot
= All
& ~LineTerms
2140 /* Add a set of chars to the set. */
2141 void addClass(Class c
) { classes
|= c
; }
2143 /* Return whether two sets of chars are disjoint. */
2144 bool disjoint(const CharSet
&) const;
2147 static bool disjoint(const jschar
*beg
, const jschar
*end
, uintN classes
);
2149 mutable jschar charBuf
[sBufSize
];
2154 /* Appease the type checker. */
2155 static inline CharSet::Class
2156 operator|(CharSet::Class c1
, CharSet::Class c2
) {
2157 return (CharSet::Class
)(((int)c1
) | ((int)c2
));
2159 static inline CharSet::Class
2160 operator~(CharSet::Class c
) {
2161 return (CharSet::Class
)(~(int)c
);
2165 * Return whether the characters in the range [beg, end) fall within any of the
2166 * classes with a bit set in 'classes'.
2169 CharSet::disjoint(const jschar
*beg
, const jschar
*end
, uintN classes
)
2171 for (const jschar
*p
= beg
; p
!= end
; ++p
) {
2172 if (JS7_ISDEC(*p
)) {
2173 if (classes
& Digit
)
2175 } else if (JS_ISWORD(*p
)) {
2176 if (classes
& OtherAlnum
)
2178 } else if (RE_IS_LINE_TERM(*p
)) {
2179 if (classes
& LineTerms
)
2181 } else if (JS_ISSPACE(*p
)) {
2182 if (classes
& OtherSpace
)
2185 if (classes
& Other
)
2193 * Predicate version of the STL's set_intersection. Assumes both ranges are
2194 * sorted and thus runs in linear time.
2196 * FIXME: This is a reusable algorithm, perhaps it should be put somewhere.
2198 template <class InputIterator1
, class InputIterator2
>
2200 set_disjoint(InputIterator1 p1
, InputIterator1 end1
,
2201 InputIterator2 p2
, InputIterator2 end2
)
2203 if (p1
== end1
|| p2
== end2
)
2205 while (*p1
!= *p2
) {
2210 } else if (*p2
< *p1
) {
2220 CharCmp(void *arg
, const void *a
, const void *b
, int *result
)
2222 jschar ca
= *(jschar
*)a
, cb
= *(jschar
*)b
;
2228 CharSet::disjoint(const CharSet
&other
) const
2230 /* Check overlap between classes. */
2231 if (classes
& other
.classes
)
2235 * Check char-class overlap. Compare this->charBuf with other.classes and
2236 * vice versa with a loop.
2238 if (!disjoint(this->charBuf
, this->charEnd
, other
.classes
) ||
2239 !disjoint(other
.charBuf
, other
.charEnd
, this->classes
))
2242 /* Check char-char overlap. */
2243 jschar tmp
[CharSet::sBufSize
];
2244 js_MergeSort(charBuf
, charEnd
- charBuf
, sizeof(jschar
),
2246 js_MergeSort(other
.charBuf
, other
.charEnd
- other
.charBuf
, sizeof(jschar
),
2248 return set_disjoint(charBuf
, charEnd
, other
.charBuf
, other
.charEnd
);
2252 * Return true if the given subexpression may match the empty string. The
2253 * conservative answer is |true|. If |next| is true, then the subexpression is
2254 * considered to be |node| followed by the rest of |node->next|. Otherwise, the
2255 * subexpression is considered to be |node| by itself.
2258 mayMatchEmpty(RENode
*node
, bool next
= true)
2263 case REOP_EMPTY
: return true;
2264 case REOP_FLAT
: return false;
2265 case REOP_CLASS
: return false;
2266 case REOP_ALNUM
: return false;
2267 case REOP_ALT
: return (mayMatchEmpty((RENode
*)node
->kid
) ||
2268 mayMatchEmpty((RENode
*)node
->u
.kid2
)) &&
2269 (!next
|| mayMatchEmpty(node
->next
));
2270 case REOP_QUANT
: return (node
->u
.range
.min
== 0 ||
2271 mayMatchEmpty((RENode
*)node
->kid
)) &&
2272 (!next
|| mayMatchEmpty(node
->next
));
2273 default: return true;
2278 * Enumerate the set of characters that may be consumed next by the given
2279 * subexpression in isolation. Return whether the enumeration was successful.
2282 enumerateNextChars(JSContext
*cx
, RENode
*node
, CharSet
&set
)
2284 JS_CHECK_RECURSION(cx
, return JS_FALSE
);
2290 /* Record as bitflags. */
2291 case REOP_DOT
: set
.addClass(CharSet::Dot
); return true;
2292 case REOP_DIGIT
: set
.addClass(CharSet::Digit
); return true;
2293 case REOP_NONDIGIT
: set
.addClass(~CharSet::Digit
); return true;
2294 case REOP_ALNUM
: set
.addClass(CharSet::AlNum
); return true;
2295 case REOP_NONALNUM
: set
.addClass(~CharSet::AlNum
); return true;
2296 case REOP_SPACE
: set
.addClass(CharSet::Space
); return true;
2297 case REOP_NONSPACE
: set
.addClass(~CharSet::Space
); return true;
2299 /* Record as individual characters. */
2301 return set
.addChar(node
->u
.flat
.chr
);
2303 /* Control structures. */
2307 case REOP_ALTPREREQ
:
2308 return enumerateNextChars(cx
, (RENode
*)node
->kid
, set
) &&
2309 enumerateNextChars(cx
, (RENode
*)node
->u
.kid2
, set
) &&
2310 (!mayMatchEmpty(node
, false) ||
2311 enumerateNextChars(cx
, (RENode
*)node
->next
, set
));
2313 return enumerateNextChars(cx
, (RENode
*)node
->kid
, set
) &&
2314 (!mayMatchEmpty(node
, false) ||
2315 enumerateNextChars(cx
, (RENode
*)node
->next
, set
));
2317 /* Arbitrary character classes and oddities. */
2323 class RegExpNativeCompiler
{
2325 VMAllocator
& tempAlloc
;
2328 CompilerState
* cs
; /* RegExp to compile */
2332 LirWriter
* validate_writer
;
2335 LirWriter
* verbose_filter
;
2337 LirBufWriter
* lirBufWriter
; /* for skip */
2343 LirBuffer
* const lirbuf
;
2345 bool outOfMemory() {
2346 return tempAlloc
.outOfMemory() || JS_TRACE_MONITOR(cx
).dataAlloc
->outOfMemory();
2349 JSBool
isCaseInsensitive() const { return (cs
->flags
& JSREG_FOLD
) != 0; }
2351 void targetCurrentPoint(LIns
*ins
)
2353 ins
->setTarget(lir
->ins0(LIR_label
));
2356 void targetCurrentPoint(LInsList
&fails
)
2358 LIns
*fail
= lir
->ins0(LIR_label
);
2359 for (size_t i
= 0; i
< fails
.length(); ++i
) {
2360 fails
[i
]->setTarget(fail
);
2366 * These functions return the new position after their match operation,
2367 * or NULL if there was an error.
2369 LIns
* compileEmpty(RENode
* node
, LIns
* pos
, LInsList
& fails
)
2374 #if defined(AVMPLUS_ARM) || defined(AVMPLUS_SPARC)
2375 /* We can't do this on ARM or SPARC, since it relies on doing a 32-bit load from
2376 * a pointer which is only 2-byte aligned.
2378 #undef USE_DOUBLE_CHAR_MATCH
2380 #define USE_DOUBLE_CHAR_MATCH
2383 LIns
* compileFlatSingleChar(jschar ch
, LIns
* pos
, LInsList
& fails
)
2385 LIns
* to_fail
= lir
->insBranch(LIR_jf
, lir
->ins2(LIR_ltp
, pos
, cpend
), 0);
2386 if (!fails
.append(to_fail
))
2388 LIns
* text_ch
= lir
->insLoad(LIR_ldus2ui
, pos
, 0, ACC_READONLY
);
2390 // Extra characters that need to be compared against when doing folding.
2398 if (cs
->flags
& JSREG_FOLD
) {
2399 ch
= JS_TOUPPER(ch
);
2400 jschar lch
= inverse_upcase(ch
);
2403 if (L
'A' <= ch
&& ch
<= L
'Z') {
2404 // Fast conversion of text character to lower case by OR-ing with 32.
2405 text_ch
= lir
->ins2(LIR_ori
, text_ch
, lir
->insImmI(32));
2406 // These ASCII letters have 2 lower-case forms. We put the ASCII one in
2407 // |extras| so it is tested first, because we expect that to be the common
2408 // case. Note that the code points of the non-ASCII forms both have the
2409 // 32 bit set, so it is OK to compare against the OR-32-converted text char.
2412 extras
[nextras
++].ch
= ch
;
2414 } else if (ch
== L
's') {
2415 extras
[nextras
++].ch
= ch
;
2419 } else if (0x01c4 <= ch
&& ch
<= 0x1e60) {
2420 // The following group of conditionals handles characters that have 1 or 2
2421 // lower-case forms in addition to JS_TOLOWER(ch).
2422 if (ch
<= 0x1f1) { // DZ,LJ,NJ
2424 extras
[nextras
++].ch
= 0x01c5;
2425 } else if (ch
== 0x01c7) {
2426 extras
[nextras
++].ch
= 0x01c8;
2427 } else if (ch
== 0x01ca) {
2428 extras
[nextras
++].ch
= 0x01cb;
2429 } else if (ch
== 0x01f1) {
2430 extras
[nextras
++].ch
= 0x01f2;
2432 } else if (ch
< 0x0392) { // no extra lower-case forms in this range
2433 } else if (ch
<= 0x03a6) { // Greek
2435 extras
[nextras
++].ch
= 0x03d0;
2436 } else if (ch
== 0x0395) {
2437 extras
[nextras
++].ch
= 0x03f5;
2438 } else if (ch
== 0x0398) {
2439 extras
[nextras
++].ch
= 0x03d1;
2440 } else if (ch
== 0x0399) {
2441 extras
[nextras
++].ch
= 0x0345;
2442 extras
[nextras
++].ch
= 0x1fbe;
2443 } else if (ch
== 0x039a) {
2444 extras
[nextras
++].ch
= 0x03f0;
2445 } else if (ch
== 0x039c) {
2446 extras
[nextras
++].ch
= 0xb5;
2447 } else if (ch
== 0x03a0) {
2448 extras
[nextras
++].ch
= 0x03d6;
2449 } else if (ch
== 0x03a1) {
2450 extras
[nextras
++].ch
= 0x03f1;
2451 } else if (ch
== 0x03a3) {
2452 extras
[nextras
++].ch
= 0x03c2;
2453 } else if (ch
== 0x03a6) {
2454 extras
[nextras
++].ch
= 0x03d5;
2456 } else if (ch
== 0x1e60) { // S with dot above
2457 extras
[nextras
++].ch
= 0x1e9b;
2461 extras
[nextras
++].ch
= lch
;
2466 for (int i
= 0; i
< nextras
; ++i
) {
2467 LIns
*test
= lir
->ins2(LIR_eqi
, text_ch
, lir
->insImmI(extras
[i
].ch
));
2468 LIns
*branch
= lir
->insBranch(LIR_jt
, test
, 0);
2469 extras
[i
].match
= branch
;
2472 if (!fails
.append(lir
->insBranch(LIR_jf
, lir
->ins2(LIR_eqi
, text_ch
, lir
->insImmI(ch
)), 0)))
2475 for (int i
= 0; i
< nextras
; ++i
)
2476 targetCurrentPoint(extras
[i
].match
);
2477 return lir
->ins2(LIR_addp
, pos
, lir
->insImmWord(2));
2480 JS_INLINE
bool hasCases(jschar ch
)
2482 return JS_TOLOWER(ch
) != JS_TOUPPER(ch
);
2485 LIns
* compileFlatDoubleChar(jschar ch1
, jschar ch2
, LIns
* pos
, LInsList
& fails
)
2487 #ifdef IS_BIG_ENDIAN
2488 uint32 word
= (ch1
<< 16) | ch2
;
2490 uint32 word
= (ch2
<< 16) | ch1
;
2493 * Fast case-insensitive test for ASCII letters: convert text
2494 * char to lower case by bit-or-ing in 32 and compare.
2496 JSBool useFastCI
= JS_FALSE
;
2497 union { jschar c
[2]; uint32 i
; } mask
;
2498 if (cs
->flags
& JSREG_FOLD
) {
2499 jschar uch1
= JS_TOUPPER(ch1
);
2500 jschar uch2
= JS_TOUPPER(ch2
);
2501 JSBool mask1
= (L
'A' <= uch1
&& uch1
<= L
'Z' && uch1
!= L
'I' && uch1
!= L
'S');
2502 JSBool mask2
= (L
'A' <= uch2
&& uch2
<= L
'Z' && uch2
!= L
'I' && uch2
!= L
'S');
2503 if ((!mask1
&& hasCases(ch1
)) || (!mask2
&& hasCases(ch2
))) {
2504 pos
= compileFlatSingleChar(ch1
, pos
, fails
);
2505 if (!pos
) return NULL
;
2506 return compileFlatSingleChar(ch2
, pos
, fails
);
2509 mask
.c
[0] = mask1
? 0x0020 : 0x0;
2510 mask
.c
[1] = mask2
? 0x0020 : 0x0;
2514 useFastCI
= JS_TRUE
;
2518 LIns
* to_fail
= lir
->insBranch(LIR_jf
,
2523 lir
->insImmWord(-2))),
2525 if (!fails
.append(to_fail
))
2527 LIns
* text_word
= lir
->insLoad(LIR_ldi
, pos
, 0, ACC_OTHER
);
2528 LIns
* comp_word
= useFastCI
?
2529 lir
->ins2(LIR_ori
, text_word
, lir
->insImmI(mask
.i
)) :
2531 if (!fails
.append(lir
->insBranch(LIR_jf
, lir
->ins2(LIR_eqi
, comp_word
, lir
->insImmI(word
)), 0)))
2534 return lir
->ins2(LIR_addp
, pos
, lir
->insImmWord(4));
2537 LIns
* compileFlat(RENode
*&node
, LIns
* pos
, LInsList
& fails
)
2539 #ifdef USE_DOUBLE_CHAR_MATCH
2540 if (node
->u
.flat
.length
== 1) {
2541 if (node
->next
&& node
->next
->op
== REOP_FLAT
&&
2542 node
->next
->u
.flat
.length
== 1) {
2543 pos
= compileFlatDoubleChar(node
->u
.flat
.chr
,
2544 node
->next
->u
.flat
.chr
,
2548 pos
= compileFlatSingleChar(node
->u
.flat
.chr
, pos
, fails
);
2553 for (i
= 0; i
< node
->u
.flat
.length
- 1; i
+= 2) {
2556 pos
= compileFlatDoubleChar(((jschar
*) node
->kid
)[i
],
2557 ((jschar
*) node
->kid
)[i
+1],
2562 JS_ASSERT(pos
!= 0);
2563 if (i
== node
->u
.flat
.length
- 1)
2564 pos
= compileFlatSingleChar(((jschar
*) node
->kid
)[i
], pos
, fails
);
2568 if (node
->u
.flat
.length
== 1) {
2569 return compileFlatSingleChar(node
->u
.flat
.chr
, pos
, fails
);
2571 for (size_t i
= 0; i
< node
->u
.flat
.length
; i
++) {
2574 pos
= compileFlatSingleChar(((jschar
*) node
->kid
)[i
], pos
, fails
);
2583 LIns
* compileClass(RENode
* node
, LIns
* pos
, LInsList
& fails
)
2585 if (!node
->u
.ucclass
.sense
)
2588 * If we share generated native code, we need to make a copy
2589 * of the bitmap because the original regexp's copy is destroyed when
2592 RECharSet
*charSet
= &re
->classList
[node
->u
.ucclass
.index
];
2593 size_t bitmapLen
= (charSet
->length
>> 3) + 1;
2594 /* Arbitrary size limit on bitmap. */
2595 if (bitmapLen
> 1024)
2597 Allocator
&alloc
= *JS_TRACE_MONITOR(cx
).dataAlloc
;
2598 /* The following line allocates charSet.u.bits if successful. */
2599 if (!charSet
->converted
&& !ProcessCharSet(cx
, re
, charSet
))
2601 void* bitmapData
= alloc
.alloc(bitmapLen
);
2604 memcpy(bitmapData
, charSet
->u
.bits
, bitmapLen
);
2606 LIns
* to_fail
= lir
->insBranch(LIR_jf
, lir
->ins2(LIR_ltp
, pos
, cpend
), 0);
2607 if (!fails
.append(to_fail
))
2609 LIns
* text_ch
= lir
->insLoad(LIR_ldus2ui
, pos
, 0, ACC_READONLY
);
2610 if (!fails
.append(lir
->insBranch(LIR_jf
,
2611 lir
->ins2(LIR_lei
, text_ch
, lir
->insImmI(charSet
->length
)),
2615 LIns
* byteIndex
= lir
->insI2P(lir
->ins2(LIR_rshi
, text_ch
, lir
->insImmI(3)));
2616 LIns
* bitmap
= lir
->insImmP(bitmapData
);
2617 LIns
* byte
= lir
->insLoad(LIR_lduc2ui
, lir
->ins2(LIR_addp
, bitmap
, byteIndex
), (int) 0,
2619 LIns
* bitMask
= lir
->ins2(LIR_lshi
, lir
->insImmI(1),
2620 lir
->ins2(LIR_andi
, text_ch
, lir
->insImmI(0x7)));
2621 LIns
* test
= lir
->ins2(LIR_eqi
, lir
->ins2(LIR_andi
, byte
, bitMask
), lir
->insImmI(0));
2623 LIns
* to_next
= lir
->insBranch(LIR_jt
, test
, 0);
2624 if (!fails
.append(to_next
))
2626 return lir
->ins2(LIR_addp
, pos
, lir
->insImmWord(2));
2629 /* Factor out common code to index js_alnum. */
2630 LIns
*compileTableRead(LIns
*chr
, const bool *tbl
)
2632 if (sizeof(bool) != 1) {
2633 LIns
*sizeLog2
= lir
->insImmI(StaticLog2
<sizeof(bool)>::result
);
2634 chr
= lir
->ins2(LIR_lshi
, chr
, sizeLog2
);
2636 LIns
*addr
= lir
->ins2(LIR_addp
, lir
->insImmP(tbl
), lir
->insUI2P(chr
));
2637 return lir
->insLoad(LIR_lduc2ui
, addr
, 0, ACC_READONLY
);
2640 /* Compile a builtin character class. */
2641 LIns
*compileBuiltinClass(RENode
*node
, LIns
*pos
, LInsList
&fails
)
2643 /* All the builtins checked below consume one character. */
2644 if (!fails
.append(lir
->insBranch(LIR_jf
, lir
->ins2(LIR_ltp
, pos
, cpend
), 0)))
2646 LIns
*chr
= lir
->insLoad(LIR_ldus2ui
, pos
, 0, ACC_READONLY
);
2651 /* Accept any character except those in ECMA-262 15.10.2.8. */
2652 LIns
*eq1
= lir
->ins2(LIR_eqi
, chr
, lir
->insImmI('\n'));
2653 if (!fails
.append(lir
->insBranch(LIR_jt
, eq1
, NULL
)))
2655 LIns
*eq2
= lir
->ins2(LIR_eqi
, chr
, lir
->insImmI('\r'));
2656 if (!fails
.append(lir
->insBranch(LIR_jt
, eq2
, NULL
)))
2658 LIns
*eq3
= lir
->ins2(LIR_eqi
, chr
, lir
->insImmI(LINE_SEPARATOR
));
2659 if (!fails
.append(lir
->insBranch(LIR_jt
, eq3
, NULL
)))
2661 LIns
*eq4
= lir
->ins2(LIR_eqi
, chr
, lir
->insImmI(PARA_SEPARATOR
));
2662 if (!fails
.append(lir
->insBranch(LIR_jt
, eq4
, NULL
)))
2668 LIns
*ge
= lir
->ins2(LIR_gei
, chr
, lir
->insImmI('0'));
2669 if (!fails
.append(lir
->insBranch(LIR_jf
, ge
, NULL
)))
2671 LIns
*le
= lir
->ins2(LIR_lei
, chr
, lir
->insImmI('9'));
2672 if (!fails
.append(lir
->insBranch(LIR_jf
, le
, NULL
)))
2678 /* Use 'and' to give a predictable branch for success path. */
2679 LIns
*ge
= lir
->ins2(LIR_gei
, chr
, lir
->insImmI('0'));
2680 LIns
*le
= lir
->ins2(LIR_lei
, chr
, lir
->insImmI('9'));
2681 LIns
*both
= lir
->ins2(LIR_andi
, ge
, le
);
2682 if (!fails
.append(lir
->insBranch(LIR_jf
, lir
->insEqI_0(both
), NULL
)))
2689 * Compile the condition:
2690 * ((uint)*cp) < 128 && js_alnum[(uint)*cp]
2692 LIns
*rangeCnd
= lir
->ins2(LIR_ltui
, chr
, lir
->insImmI(128));
2693 if (!fails
.append(lir
->insBranch(LIR_jf
, rangeCnd
, NULL
)))
2695 LIns
*tableVal
= compileTableRead(chr
, js_alnum
);
2696 if (!fails
.append(lir
->insBranch(LIR_jt
, lir
->insEqI_0(tableVal
), NULL
)))
2703 * Compile the condition:
2704 * ((uint)*cp) >= 128 || !js_alnum[(uint)*cp]
2706 LIns
*rangeCnd
= lir
->ins2(LIR_geui
, chr
, lir
->insImmI(128));
2707 LIns
*rangeBr
= lir
->insBranch(LIR_jt
, rangeCnd
, NULL
);
2708 LIns
*tableVal
= compileTableRead(chr
, js_alnum
);
2709 if (!fails
.append(lir
->insBranch(LIR_jf
, lir
->insEqI_0(tableVal
), NULL
)))
2711 LIns
*success
= lir
->ins0(LIR_label
);
2712 rangeBr
->setTarget(success
);
2719 * ECMA-262 7.2, 7.3, and 15.10.2.12 define a bunch of Unicode code
2720 * points for whitespace. We optimize here for the common case of
2721 * ASCII characters using a table lookup for the lower block that
2722 * can actually contain spaces. For the rest, use a (more or less)
2723 * binary search to minimize tests.
2725 * [0000,0020]: 9, A, B, C, D, 20
2727 * [00A0,2000): A0, 1680, 180E
2729 * (200A, max): 2028, 2029, 202F, 205F, 3000
2732 LIns
*tableRangeCnd
= lir
->ins2(LIR_leui
, chr
, lir
->insImmI(0x20));
2733 LIns
*tableRangeBr
= lir
->insBranch(LIR_jt
, tableRangeCnd
, NULL
);
2734 /* Fall through means *chr > 0x20. */
2736 /* Handle (0x20,0xA0). */
2737 LIns
*asciiCnd
= lir
->ins2(LIR_ltui
, chr
, lir
->insImmI(0xA0));
2738 LIns
*asciiMissBr
= lir
->insBranch(LIR_jt
, asciiCnd
, NULL
);
2739 /* Fall through means *chr >= 0xA0. */
2741 /* Partition around [0x2000,0x200A]. */
2742 LIns
*belowCnd
= lir
->ins2(LIR_ltui
, chr
, lir
->insImmI(0x2000));
2743 LIns
*belowBr
= lir
->insBranch(LIR_jt
, belowCnd
, NULL
);
2744 LIns
*aboveCnd
= lir
->ins2(LIR_gtui
, chr
, lir
->insImmI(0x200A));
2745 LIns
*aboveBr
= lir
->insBranch(LIR_jt
, aboveCnd
, NULL
);
2746 LIns
*intervalMatchBr
= lir
->insBranch(LIR_j
, NULL
, NULL
);
2748 /* Handle [0xA0,0x2000). */
2749 LIns
*belowLbl
= lir
->ins0(LIR_label
);
2750 belowBr
->setTarget(belowLbl
);
2751 LIns
*eq1Cnd
= lir
->ins2(LIR_eqi
, chr
, lir
->insImmI(0xA0));
2752 LIns
*eq1Br
= lir
->insBranch(LIR_jt
, eq1Cnd
, NULL
);
2753 LIns
*eq2Cnd
= lir
->ins2(LIR_eqi
, chr
, lir
->insImmI(0x1680));
2754 LIns
*eq2Br
= lir
->insBranch(LIR_jt
, eq2Cnd
, NULL
);
2755 LIns
*eq3Cnd
= lir
->ins2(LIR_eqi
, chr
, lir
->insImmI(0x180E));
2756 LIns
*eq3Br
= lir
->insBranch(LIR_jt
, eq3Cnd
, NULL
);
2757 LIns
*belowMissBr
= lir
->insBranch(LIR_j
, NULL
, NULL
);
2759 /* Handle (0x200A, max). */
2760 LIns
*aboveLbl
= lir
->ins0(LIR_label
);
2761 aboveBr
->setTarget(aboveLbl
);
2762 LIns
*eq4Cnd
= lir
->ins2(LIR_eqi
, chr
, lir
->insImmI(0x2028));
2763 LIns
*eq4Br
= lir
->insBranch(LIR_jt
, eq4Cnd
, NULL
);
2764 LIns
*eq5Cnd
= lir
->ins2(LIR_eqi
, chr
, lir
->insImmI(0x2029));
2765 LIns
*eq5Br
= lir
->insBranch(LIR_jt
, eq5Cnd
, NULL
);
2766 LIns
*eq6Cnd
= lir
->ins2(LIR_eqi
, chr
, lir
->insImmI(0x202F));
2767 LIns
*eq6Br
= lir
->insBranch(LIR_jt
, eq6Cnd
, NULL
);
2768 LIns
*eq7Cnd
= lir
->ins2(LIR_eqi
, chr
, lir
->insImmI(0x205F));
2769 LIns
*eq7Br
= lir
->insBranch(LIR_jt
, eq7Cnd
, NULL
);
2770 LIns
*eq8Cnd
= lir
->ins2(LIR_eqi
, chr
, lir
->insImmI(0x3000));
2771 LIns
*eq8Br
= lir
->insBranch(LIR_jt
, eq8Cnd
, NULL
);
2772 LIns
*aboveMissBr
= lir
->insBranch(LIR_j
, NULL
, NULL
);
2774 /* Handle [0,0x20]. */
2775 LIns
*tableLbl
= lir
->ins0(LIR_label
);
2776 tableRangeBr
->setTarget(tableLbl
);
2777 LIns
*tableVal
= compileTableRead(chr
, js_ws
);
2778 LIns
*tableCnd
= lir
->insEqI_0(tableVal
);
2779 LIns
*tableMatchBr
= lir
->insBranch(LIR_jf
, tableCnd
, NULL
);
2781 /* Collect misses. */
2782 LIns
*missLbl
= lir
->ins0(LIR_label
);
2783 asciiMissBr
->setTarget(missLbl
);
2784 belowMissBr
->setTarget(missLbl
);
2785 aboveMissBr
->setTarget(missLbl
);
2786 LIns
*missBr
= lir
->insBranch(LIR_j
, NULL
, NULL
);
2787 if (node
->op
== REOP_SPACE
) {
2788 if (!fails
.append(missBr
))
2792 /* Collect matches. */
2793 LIns
*matchLbl
= lir
->ins0(LIR_label
);
2794 intervalMatchBr
->setTarget(matchLbl
);
2795 tableMatchBr
->setTarget(matchLbl
);
2796 eq1Br
->setTarget(matchLbl
); eq2Br
->setTarget(matchLbl
);
2797 eq3Br
->setTarget(matchLbl
); eq4Br
->setTarget(matchLbl
);
2798 eq5Br
->setTarget(matchLbl
); eq6Br
->setTarget(matchLbl
);
2799 eq7Br
->setTarget(matchLbl
); eq8Br
->setTarget(matchLbl
);
2800 if (node
->op
== REOP_NONSPACE
) {
2801 LIns
*matchBr
= lir
->insBranch(LIR_j
, NULL
, NULL
);
2802 if (!fails
.append(matchBr
))
2805 /* Fall through means match == success. */
2807 /* Collect successes to fall through. */
2808 LIns
*success
= lir
->ins0(LIR_label
);
2809 if (node
->op
== REOP_NONSPACE
)
2810 missBr
->setTarget(success
);
2817 return lir
->ins2(LIR_addp
, pos
, lir
->insImmWord(2));
2820 LIns
*compileAlt(RENode
*node
, LIns
*pos
, bool atEnd
, LInsList
&fails
)
2822 RENode
*leftRe
= (RENode
*)node
->kid
, *rightRe
= (RENode
*)node
->u
.kid2
;
2825 * If the RE continues after the alternative, we need to ensure that no
2826 * backtracking is required. Recursive calls to compileNode will fail
2827 * on capturing parens, so the only thing we have to check here is that,
2828 * if the left subexpression matches, we can keep going without later
2829 * deciding we need to try the right subexpression.
2833 * If there is no character overlap between left and right, then
2834 * there is only one possible path through the alternative.
2836 CharSet leftSet
, rightSet
;
2837 if (!enumerateNextChars(cx
, leftRe
, leftSet
) ||
2838 !enumerateNextChars(cx
, rightRe
, rightSet
) ||
2839 !leftSet
.disjoint(rightSet
))
2843 * If there is an empty path through either subexpression, the above
2844 * check is incomplete; we need to include |node->next| as well.
2846 bool epsLeft
= mayMatchEmpty(leftRe
),
2847 epsRight
= mayMatchEmpty(rightRe
);
2848 if (epsRight
&& epsLeft
) {
2850 } else if (epsLeft
|| epsRight
) {
2852 if (!enumerateNextChars(cx
, node
->next
, nextSet
) ||
2853 (epsLeft
&& !nextSet
.disjoint(rightSet
)) ||
2854 (epsRight
&& !nextSet
.disjoint(leftSet
))) {
2860 /* Try left branch. */
2861 LInsList
kidFails(cx
);
2862 LIns
*branchEnd
= compileNode(leftRe
, pos
, atEnd
, kidFails
);
2867 * Since there are no phis, simulate by writing to and reading from
2868 * memory (REGlobalData::stateStack, since it is unused).
2870 lir
->insStore(branchEnd
, state
,
2871 offsetof(REGlobalData
, stateStack
), ACC_OTHER
);
2872 LIns
*leftSuccess
= lir
->insBranch(LIR_j
, NULL
, NULL
);
2874 /* Try right branch. */
2875 targetCurrentPoint(kidFails
);
2876 if (!(branchEnd
= compileNode(rightRe
, pos
, atEnd
, fails
)))
2878 lir
->insStore(branchEnd
, state
,
2879 offsetof(REGlobalData
, stateStack
), ACC_OTHER
);
2881 /* Land success on the left branch. */
2882 targetCurrentPoint(leftSuccess
);
2883 return addName(fragment
->lirbuf
,
2884 lir
->insLoad(LIR_ldp
, state
, offsetof(REGlobalData
, stateStack
), ACC_OTHER
),
2888 LIns
*compileOpt(RENode
*node
, LIns
*pos
, bool atEnd
, LInsList
&fails
)
2891 * Since there are no phis, simulate by writing to and reading from
2892 * memory (REGlobalData::stateStack, since it is unused).
2894 lir
->insStore(pos
, state
, offsetof(REGlobalData
, stateStack
), ACC_OTHER
);
2897 LInsList
kidFails(cx
);
2898 if (!(pos
= compileNode(node
, pos
, atEnd
, kidFails
)))
2900 lir
->insStore(pos
, state
, offsetof(REGlobalData
, stateStack
), ACC_OTHER
);
2902 /* Join success and failure and get new position. */
2903 targetCurrentPoint(kidFails
);
2904 pos
= addName(fragment
->lirbuf
,
2905 lir
->insLoad(LIR_ldp
, state
, offsetof(REGlobalData
, stateStack
), ACC_OTHER
),
2911 LIns
*compileQuant(RENode
*node
, LIns
*pos
, bool atEnd
, LInsList
&fails
)
2913 /* Only support greedy *, +, ?. */
2914 if (!node
->u
.range
.greedy
||
2915 node
->u
.range
.min
> 1 ||
2916 (node
->u
.range
.max
> 1 && node
->u
.range
.max
< (uintN
)-1)) {
2920 RENode
*bodyRe
= (RENode
*)node
->kid
;
2923 * If the RE continues after the alternative, we need to ensure that no
2924 * backtracking is required. Recursive calls to compileNode will fail
2925 * on capturing parens, so the only thing we have to check here is that,
2926 * if the quantifier body matches, we can continue matching the body
2927 * without later deciding we need to undo the body matches.
2931 * If there is no character overlap between the body and
2932 * |node->next|, then all possible body matches are used.
2934 CharSet bodySet
, nextSet
;
2935 if (!enumerateNextChars(cx
, bodyRe
, bodySet
) ||
2936 !enumerateNextChars(cx
, node
->next
, nextSet
) ||
2937 !bodySet
.disjoint(nextSet
)) {
2942 /* Fork off ? and {1,1}. */
2943 if (node
->u
.range
.max
== 1) {
2944 if (node
->u
.range
.min
== 1)
2945 return compileNode(bodyRe
, pos
, atEnd
, fails
);
2947 return compileOpt(bodyRe
, pos
, atEnd
, fails
);
2950 /* For +, compile a copy of the body where failure is real failure. */
2951 if (node
->u
.range
.min
== 1) {
2952 if (!(pos
= compileNode(bodyRe
, pos
, atEnd
, fails
)))
2957 * Since there are no phis, simulate by writing to and reading from
2958 * memory (REGlobalData::stateStack, since it is unused).
2960 lir
->insStore(pos
, state
, offsetof(REGlobalData
, stateStack
), ACC_OTHER
);
2962 /* Begin iteration: load loop variables. */
2963 LIns
*loopTop
= lir
->ins0(LIR_label
);
2964 LIns
*iterBegin
= addName(fragment
->lirbuf
,
2965 lir
->insLoad(LIR_ldp
, state
,
2966 offsetof(REGlobalData
, stateStack
), ACC_OTHER
),
2969 /* Match quantifier body. */
2970 LInsList
kidFails(cx
);
2971 LIns
*iterEnd
= compileNode(bodyRe
, iterBegin
, atEnd
, kidFails
);
2976 * If there is an epsilon path through the body then, when it is taken,
2977 * we need to abort the loop or else we will loop forever.
2979 if (mayMatchEmpty(bodyRe
)) {
2980 LIns
*eqCnd
= lir
->ins2(LIR_eqp
, iterBegin
, iterEnd
);
2981 if (!kidFails
.append(lir
->insBranch(LIR_jt
, eqCnd
, NULL
)))
2985 /* End iteration: store loop variables, increment, jump */
2986 lir
->insStore(iterEnd
, state
, offsetof(REGlobalData
, stateStack
), ACC_OTHER
);
2987 lir
->insBranch(LIR_j
, NULL
, loopTop
);
2990 * Using '+' as branch, the intended control flow is:
3006 * We are currently at point X. Since the regalloc makes a single,
3007 * linear, backwards sweep over the IR (going from E to A), point X
3008 * must tell the regalloc what LIR insns are live at the end of D.
3009 * Thus, we need to report *all* insns defined *before* the end of D
3010 * that may be used *after* D. This means insns defined in A, B, C, or
3011 * D and used in B, C, D, or E. Since insns in B, C, and D are
3012 * conditionally executed, and we (currently) don't have real phi
3013 * nodes, we need only consider insns defined in A and used in E.
3015 lir
->ins1(LIR_livep
, state
);
3016 lir
->ins1(LIR_livep
, cpend
);
3017 lir
->ins1(LIR_livep
, start
);
3019 /* After the loop: reload 'pos' from memory and continue. */
3020 targetCurrentPoint(kidFails
);
3025 * Compile the regular expression rooted at 'node'. Return 0 on failed
3026 * compilation. Otherwise, generate code that falls through on success (the
3027 * returned LIns* is the current 'pos') and jumps to the end on failure (by
3028 * adding the guard LIns to 'fails').
3030 LIns
*compileNode(RENode
*node
, LIns
*pos
, bool atEnd
, LInsList
&fails
)
3032 for (; pos
&& node
; node
= node
->next
) {
3036 bool childNextIsEnd
= atEnd
&& !node
->next
;
3040 pos
= compileEmpty(node
, pos
, fails
);
3043 pos
= compileFlat(node
, pos
, fails
);
3046 case REOP_ALTPREREQ
:
3047 pos
= compileAlt(node
, pos
, childNextIsEnd
, fails
);
3050 pos
= compileQuant(node
, pos
, childNextIsEnd
, fails
);
3053 pos
= compileClass(node
, pos
, fails
);
3062 pos
= compileBuiltinClass(node
, pos
, fails
);
3072 * This function kicks off recursive compileNode compilation, finishes the
3073 * success path, and lets the failed-match path fall through.
3075 bool compileRootNode(RENode
*root
, LIns
*pos
, LIns
*anchorFail
)
3077 /* Compile the regular expression body. */
3079 pos
= compileNode(root
, pos
, true, fails
);
3083 /* Fall-through from compileNode means success. */
3084 lir
->insStore(pos
, state
, offsetof(REGlobalData
, stateStack
), ACC_OTHER
);
3085 lir
->ins0(LIR_regfence
);
3086 lir
->ins1(LIR_reti
, lir
->insImmI(1));
3088 /* Stick return here so we don't have to jump over it every time. */
3090 targetCurrentPoint(anchorFail
);
3091 lir
->ins0(LIR_regfence
);
3092 lir
->ins1(LIR_reti
, lir
->insImmI(0));
3095 /* Target failed matches. */
3096 targetCurrentPoint(fails
);
3100 /* Compile a regular expressions that can only match on the first char. */
3101 bool compileSticky(RENode
*root
, LIns
*start
)
3103 if (!compileRootNode(root
, start
, NULL
))
3106 /* Failed to match on first character, so fail whole match. */
3107 lir
->ins0(LIR_regfence
);
3108 lir
->ins1(LIR_reti
, lir
->insImmI(0));
3109 return !outOfMemory();
3112 /* Compile normal regular expressions that can match starting at any char. */
3113 bool compileAnchoring(RENode
*root
, LIns
*start
)
3115 /* Guard outer anchoring loop. Use <= to allow empty regexp match. */
3116 LIns
*anchorFail
= lir
->insBranch(LIR_jf
, lir
->ins2(LIR_lep
, start
, cpend
), 0);
3118 if (!compileRootNode(root
, start
, anchorFail
))
3121 /* Outer loop increment. */
3122 lir
->insStore(lir
->ins2(LIR_addp
, start
, lir
->insImmWord(2)), state
,
3123 offsetof(REGlobalData
, skipped
), ACC_OTHER
);
3125 return !outOfMemory();
3129 addName(LirBuffer
* lirbuf
, LIns
* ins
, const char* name
)
3132 debug_only_stmt(lirbuf
->printer
->lirNameMap
->addName(ins
, name
);)
3138 * Insert the side exit and guard record for a compiled regexp. Most
3139 * of the fields are not used. The important part is the regexp source
3140 * and flags, which we use as the fragment lookup key.
3142 GuardRecord
* insertGuard(LIns
* loopLabel
, const jschar
* re_chars
, size_t re_length
)
3145 lir
->insBranch(LIR_j
, NULL
, loopLabel
);
3146 LirBuffer
* lirbuf
= fragment
->lirbuf
;
3147 lir
->ins1(LIR_livep
, lirbuf
->state
);
3148 lir
->ins1(LIR_livep
, lirbuf
->param1
);
3151 Allocator
&alloc
= *JS_TRACE_MONITOR(cx
).dataAlloc
;
3153 /* Must only create a VMSideExit; see StackFilter::getTops. */
3154 size_t len
= (sizeof(GuardRecord
) +
3155 sizeof(VMSideExit
) +
3156 (re_length
-1) * sizeof(jschar
));
3157 GuardRecord
* guard
= (GuardRecord
*) alloc
.alloc(len
);
3158 VMSideExit
* exit
= (VMSideExit
*)(guard
+1);
3160 guard
->exit
->target
= fragment
;
3161 fragment
->lastIns
= lir
->insGuard(LIR_x
, NULL
, guard
);
3162 // guard->profCount is calloc'd to zero
3164 guard
->profGuardID
= fragment
->guardNumberer
++;
3165 guard
->nextInFrag
= fragment
->guardsForFrag
;
3166 fragment
->guardsForFrag
= guard
;
3172 RegExpNativeCompiler(JSContext
* cx
, JSRegExp
* re
, CompilerState
* cs
, Fragment
* fragment
)
3173 : tempAlloc(*JS_TRACE_MONITOR(cx
).reTempAlloc
), cx(cx
),
3174 re(re
), cs(cs
), fragment(fragment
), lir(NULL
), lirBufWriter(NULL
),
3175 lirbuf(new (tempAlloc
) LirBuffer(tempAlloc
))
3177 fragment
->lirbuf
= lirbuf
;
3179 lirbuf
->printer
= new (tempAlloc
) LInsPrinter(tempAlloc
);
3183 ~RegExpNativeCompiler() {
3184 /* Purge the tempAlloc used during recording. */
3190 GuardRecord
* guard
= NULL
;
3191 const jschar
* re_chars
;
3193 TraceMonitor
* tm
= &JS_TRACE_MONITOR(cx
);
3194 Assembler
*assm
= tm
->assembler
;
3195 LIns
* loopLabel
= NULL
;
3197 if (outOfMemory() || OverfullJITCache(tm
))
3200 re
->source
->getCharsAndLength(re_chars
, re_length
);
3202 * If the regexp is too long nanojit will assert when we
3203 * try to insert the guard record.
3205 if (re_length
> 1024) {
3206 re
->flags
|= JSREG_NOCOMPILE
;
3210 /* At this point we have an empty fragment. */
3211 LirBuffer
* lirbuf
= fragment
->lirbuf
;
3214 /* FIXME Use bug 463260 smart pointer when available. */
3215 lir
= lirBufWriter
= new LirBufWriter(lirbuf
, nanojit::AvmCore::config
);
3217 /* FIXME Use bug 463260 smart pointer when available. */
3220 if (LogController
.lcbits
& LC_TMRegexp
) {
3221 lir
= verbose_filter
= new VerboseWriter(tempAlloc
, lir
, lirbuf
->printer
,
3227 lir
= validate_writer
= new ValidateWriter(lir
, lirbuf
->printer
, "regexp writer pipeline");
3231 * Although we could just load REGlobalData::cpend from 'state', by
3232 * passing it as a parameter, we avoid loading it every iteration.
3234 lir
->ins0(LIR_start
);
3236 for (int i
= 0; i
< NumSavedRegs
; ++i
)
3237 lir
->insParam(i
, 1);
3239 for (int i
= 0; i
< NumSavedRegs
; ++i
)
3240 addName(lirbuf
, lirbuf
->savedRegs
[i
], regNames
[Assembler::savedRegs
[i
]]);
3243 lirbuf
->state
= state
= addName(lirbuf
, lir
->insParam(0, 0), "state");
3244 lirbuf
->param1
= cpend
= addName(lirbuf
, lir
->insParam(1, 0), "cpend");
3246 loopLabel
= lir
->ins0(LIR_label
);
3247 // If profiling, record where the loop label is, so that the
3248 // assembler can insert a frag-entry-counter increment at that
3250 verbose_only( if (LogController
.lcbits
& LC_FragProfile
) {
3251 NanoAssert(!fragment
->loopLabel
);
3252 fragment
->loopLabel
= loopLabel
;
3255 start
= addName(lirbuf
,
3256 lir
->insLoad(LIR_ldp
, state
, offsetof(REGlobalData
, skipped
), ACC_OTHER
),
3259 if (cs
->flags
& JSREG_STICKY
) {
3260 if (!compileSticky(cs
->result
, start
))
3263 if (!compileAnchoring(cs
->result
, start
))
3267 guard
= insertGuard(loopLabel
, re_chars
, re_length
);
3273 * Deep in the nanojit compiler, the StackFilter is trying to throw
3274 * away stores above the VM interpreter/native stacks. We have no such
3275 * stacks, so rely on the fact that lirbuf->sp and lirbuf->rp are null
3276 * to ensure our stores are ignored.
3278 JS_ASSERT(!lirbuf
->sp
&& !lirbuf
->rp
);
3280 assm
->compile(fragment
, tempAlloc
, /*optimize*/true verbose_only(, lirbuf
->printer
));
3281 if (assm
->error() != nanojit::None
)
3284 delete lirBufWriter
;
3286 delete validate_writer
;
3289 debug_only_stmt( if (LogController
.lcbits
& LC_TMRegexp
)
3290 delete verbose_filter
; )
3294 if (outOfMemory() || OverfullJITCache(tm
)) {
3295 delete lirBufWriter
;
3296 // recover profiling data from expiring Fragments
3298 REHashMap::Iter
iter(*(tm
->reFragments
));
3299 while (iter
.next()) {
3300 nanojit::Fragment
* frag
= iter
.value();
3301 FragProfiling_FragFinalizer(frag
, tm
);
3306 if (!guard
) insertGuard(loopLabel
, re_chars
, re_length
);
3307 re
->flags
|= JSREG_NOCOMPILE
;
3308 delete lirBufWriter
;
3311 delete validate_writer
;
3314 debug_only_stmt( if (LogController
.lcbits
& LC_TMRegexp
)
3315 delete verbose_filter
; )
3322 * Compile a regexp to native code in the given fragment.
3324 static inline JSBool
3325 CompileRegExpToNative(JSContext
* cx
, JSRegExp
* re
, Fragment
* fragment
)
3327 JSBool rv
= JS_FALSE
;
3329 CompilerState state
;
3330 RegExpNativeCompiler
rc(cx
, re
, &state
, fragment
);
3332 JS_ASSERT(!fragment
->code());
3333 mark
= cx
->tempPool
.getMark();
3334 if (!CompileRegExpToAST(cx
, NULL
, re
->source
, re
->flags
, state
)) {
3339 cx
->tempPool
.release(mark
);
3343 /* Function type for a compiled native regexp. */
3344 typedef void *(FASTCALL
*NativeRegExp
)(REGlobalData
*, const jschar
*);
3347 * Return a compiled native regexp if one already exists or can be created
3348 * now, or NULL otherwise.
3351 GetNativeRegExp(JSContext
* cx
, JSRegExp
* re
)
3353 const jschar
*re_chars
;
3355 re
->source
->getCharsAndLength(re_chars
, re_length
);
3356 Fragment
*fragment
= LookupNativeRegExp(cx
, re
->flags
, re_chars
, re_length
);
3357 JS_ASSERT(fragment
);
3358 if (!fragment
->code() && fragment
->recordAttempts
== 0) {
3359 fragment
->recordAttempts
++;
3360 if (!CompileRegExpToNative(cx
, re
, fragment
))
3363 union { NIns
*code
; NativeRegExp func
; } u
;
3364 u
.code
= fragment
->code();
3370 js_NewRegExp(JSContext
*cx
, TokenStream
*ts
,
3371 JSString
*str
, uintN flags
, JSBool flat
)
3375 CompilerState state
;
3381 mark
= cx
->tempPool
.getMark();
3384 * Parsing the string as flat is now expressed internally using
3385 * a flag, so that we keep this information in the JSRegExp, but
3386 * we keep the 'flat' parameter for now for compatibility.
3388 if (flat
) flags
|= JSREG_FLAT
;
3389 if (!CompileRegExpToAST(cx
, ts
, str
, flags
, state
))
3392 resize
= offsetof(JSRegExp
, program
) + state
.progLength
+ 1;
3393 re
= (JSRegExp
*) cx
->malloc(resize
);
3398 JS_ASSERT(state
.classBitmapsMem
<= CLASS_BITMAPS_MEM_LIMIT
);
3399 re
->classCount
= state
.classCount
;
3400 if (re
->classCount
) {
3401 re
->classList
= (RECharSet
*)
3402 cx
->malloc(re
->classCount
* sizeof(RECharSet
));
3403 if (!re
->classList
) {
3404 js_DestroyRegExp(cx
, re
);
3408 for (i
= 0; i
< re
->classCount
; i
++)
3409 re
->classList
[i
].converted
= JS_FALSE
;
3411 re
->classList
= NULL
;
3414 /* Compile the bytecode version. */
3415 endPC
= EmitREBytecode(&state
, re
, state
.treeDepth
, re
->program
, state
.result
);
3417 js_DestroyRegExp(cx
, re
);
3421 *endPC
++ = REOP_END
;
3423 * Check whether size was overestimated and shrink using realloc.
3424 * This is safe since no pointers to newly parsed regexp or its parts
3425 * besides re exist here.
3427 if ((size_t)(endPC
- re
->program
) != state
.progLength
+ 1) {
3429 JS_ASSERT((size_t)(endPC
- re
->program
) < state
.progLength
+ 1);
3430 resize
= offsetof(JSRegExp
, program
) + (endPC
- re
->program
);
3431 tmp
= (JSRegExp
*) cx
->realloc(re
, resize
);
3436 re
->flags
= uint16(flags
);
3437 re
->parenCount
= state
.parenCount
;
3441 cx
->tempPool
.release(mark
);
3446 js_NewRegExpOpt(JSContext
*cx
, JSString
*str
, JSString
*opt
, JSBool flat
)
3455 opt
->getCharsAndLength(s
, n
);
3456 for (i
= 0; i
< n
; i
++) {
3457 #define HANDLE_FLAG(name) \
3459 if (flags & (name)) \
3465 HANDLE_FLAG(JSREG_GLOB
);
3468 HANDLE_FLAG(JSREG_FOLD
);
3471 HANDLE_FLAG(JSREG_MULTILINE
);
3474 HANDLE_FLAG(JSREG_STICKY
);
3478 charBuf
[0] = (char)s
[i
];
3480 JS_ReportErrorFlagsAndNumber(cx
, JSREPORT_ERROR
,
3481 js_GetErrorMessage
, NULL
,
3482 JSMSG_BAD_REGEXP_FLAG
, charBuf
);
3488 return js_NewRegExp(cx
, NULL
, str
, flags
, flat
);
3492 * Save the current state of the match - the position in the input
3493 * text as well as the position in the bytecode. The state of any
3494 * parent expressions is also saved (preceding state).
3495 * Contents of parenCount parentheses from parenIndex are also saved.
3497 static REBackTrackData
*
3498 PushBackTrackState(REGlobalData
*gData
, REOp op
,
3499 jsbytecode
*target
, REMatchState
*x
, const jschar
*cp
,
3500 size_t parenIndex
, size_t parenCount
)
3503 REBackTrackData
*result
=
3504 (REBackTrackData
*) ((char *)gData
->backTrackSP
+ gData
->cursz
);
3506 size_t sz
= sizeof(REBackTrackData
) +
3507 gData
->stateStackTop
* sizeof(REProgState
) +
3508 parenCount
* sizeof(RECapture
);
3510 ptrdiff_t btsize
= gData
->backTrackStackSize
;
3511 ptrdiff_t btincr
= ((char *)result
+ sz
) -
3512 ((char *)gData
->backTrackStack
+ btsize
);
3514 re_debug("\tBT_Push: %lu,%lu",
3515 (unsigned long) parenIndex
, (unsigned long) parenCount
);
3518 ptrdiff_t offset
= (char *)result
- (char *)gData
->backTrackStack
;
3520 btincr
= JS_ROUNDUP(btincr
, btsize
);
3521 gData
->cx
->regexpPool
.growCast
<REBackTrackData
*>(gData
->backTrackStack
, btsize
, btincr
);
3522 if (!gData
->backTrackStack
) {
3523 js_ReportOutOfScriptQuota(gData
->cx
);
3524 gData
->ok
= JS_FALSE
;
3527 gData
->backTrackStackSize
= btsize
+ btincr
;
3528 result
= (REBackTrackData
*) ((char *)gData
->backTrackStack
+ offset
);
3530 gData
->backTrackSP
= result
;
3531 result
->sz
= gData
->cursz
;
3534 result
->backtrack_op
= op
;
3535 result
->backtrack_pc
= target
;
3537 result
->parenCount
= parenCount
;
3538 result
->parenIndex
= parenIndex
;
3540 result
->saveStateStackTop
= gData
->stateStackTop
;
3541 JS_ASSERT(gData
->stateStackTop
);
3542 memcpy(result
+ 1, gData
->stateStack
,
3543 sizeof(REProgState
) * result
->saveStateStackTop
);
3545 if (parenCount
!= 0) {
3546 memcpy((char *)(result
+ 1) +
3547 sizeof(REProgState
) * result
->saveStateStackTop
,
3548 &x
->parens
[parenIndex
],
3549 sizeof(RECapture
) * parenCount
);
3550 for (i
= 0; i
!= parenCount
; i
++)
3551 x
->parens
[parenIndex
+ i
].index
= -1;
3559 * Consecutive literal characters.
3562 static REMatchState
*
3563 FlatNMatcher(REGlobalData
*gData
, REMatchState
*x
, jschar
*matchChars
,
3567 if (length
> gData
->cpend
- x
->cp
)
3569 for (i
= 0; i
!= length
; i
++) {
3570 if (matchChars
[i
] != x
->cp
[i
])
3578 static JS_ALWAYS_INLINE REMatchState
*
3579 FlatNIMatcher(REGlobalData
*gData
, REMatchState
*x
, jschar
*matchChars
,
3583 JS_ASSERT(gData
->cpend
>= x
->cp
);
3584 if (length
> (size_t)(gData
->cpend
- x
->cp
))
3586 for (i
= 0; i
!= length
; i
++) {
3587 if (upcase(matchChars
[i
]) != upcase(x
->cp
[i
]))
3595 * 1. Evaluate DecimalEscape to obtain an EscapeValue E.
3596 * 2. If E is not a character then go to step 6.
3597 * 3. Let ch be E's character.
3598 * 4. Let A be a one-element RECharSet containing the character ch.
3599 * 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
3600 * 6. E must be an integer. Let n be that integer.
3601 * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
3602 * 8. Return an internal Matcher closure that takes two arguments, a State x
3603 * and a Continuation c, and performs the following:
3604 * 1. Let cap be x's captures internal array.
3605 * 2. Let s be cap[n].
3606 * 3. If s is undefined, then call c(x) and return its result.
3607 * 4. Let e be x's endIndex.
3608 * 5. Let len be s's length.
3609 * 6. Let f be e+len.
3610 * 7. If f>InputLength, return failure.
3611 * 8. If there exists an integer i between 0 (inclusive) and len (exclusive)
3612 * such that Canonicalize(s[i]) is not the same character as
3613 * Canonicalize(Input [e+i]), then return failure.
3614 * 9. Let y be the State (f, cap).
3615 * 10. Call c(y) and return its result.
3617 static REMatchState
*
3618 BackrefMatcher(REGlobalData
*gData
, REMatchState
*x
, size_t parenIndex
)
3621 const jschar
*parenContent
;
3622 RECapture
*cap
= &x
->parens
[parenIndex
];
3624 if (cap
->index
== -1)
3628 if (x
->cp
+ len
> gData
->cpend
)
3631 parenContent
= &gData
->cpbegin
[cap
->index
];
3632 if (gData
->regexp
->flags
& JSREG_FOLD
) {
3633 for (i
= 0; i
< len
; i
++) {
3634 if (upcase(parenContent
[i
]) != upcase(x
->cp
[i
]))
3638 for (i
= 0; i
< len
; i
++) {
3639 if (parenContent
[i
] != x
->cp
[i
])
3648 /* Add a single character to the RECharSet */
3650 AddCharacterToCharSet(RECharSet
*cs
, jschar c
)
3652 uintN byteIndex
= (uintN
)(c
>> 3);
3653 JS_ASSERT(c
<= cs
->length
);
3654 cs
->u
.bits
[byteIndex
] |= 1 << (c
& 0x7);
3658 /* Add a character range, c1 to c2 (inclusive) to the RECharSet */
3660 AddCharacterRangeToCharSet(RECharSet
*cs
, uintN c1
, uintN c2
)
3664 uintN byteIndex1
= c1
>> 3;
3665 uintN byteIndex2
= c2
>> 3;
3667 JS_ASSERT(c2
<= cs
->length
&& c1
<= c2
);
3672 if (byteIndex1
== byteIndex2
) {
3673 cs
->u
.bits
[byteIndex1
] |= ((uint8
)0xFF >> (7 - (c2
- c1
))) << c1
;
3675 cs
->u
.bits
[byteIndex1
] |= 0xFF << c1
;
3676 for (i
= byteIndex1
+ 1; i
< byteIndex2
; i
++)
3677 cs
->u
.bits
[i
] = 0xFF;
3678 cs
->u
.bits
[byteIndex2
] |= (uint8
)0xFF >> (7 - c2
);
3682 struct CharacterRange
{
3688 * The following characters are taken from the ECMA-262 standard, section 7.2
3689 * and 7.3, and the Unicode 3 standard, Table 6-1.
3691 static const CharacterRange WhiteSpaceRanges
[] = {
3692 /* TAB, LF, VT, FF, CR */
3696 /* NO-BREAK SPACE */
3699 * EN QUAD, EM QUAD, EN SPACE, EM SPACE, THREE-PER-EM SPACE, FOUR-PER-EM
3700 * SPACE, SIX-PER-EM SPACE, FIGURE SPACE, PUNCTUATION SPACE, THIN SPACE,
3701 * HAIR SPACE, ZERO WIDTH SPACE
3706 /* NARROW NO-BREAK SPACE */
3708 /* IDEOGRAPHIC SPACE */
3712 /* ECMA-262 standard, section 15.10.2.6. */
3713 static const CharacterRange WordRanges
[] = {
3714 { jschar('0'), jschar('9') },
3715 { jschar('A'), jschar('Z') },
3716 { jschar('_'), jschar('_') },
3717 { jschar('a'), jschar('z') }
3721 AddCharacterRanges(RECharSet
*charSet
,
3722 const CharacterRange
*range
,
3723 const CharacterRange
*end
)
3725 for (; range
< end
; ++range
)
3726 AddCharacterRangeToCharSet(charSet
, range
->start
, range
->end
);
3730 AddInvertedCharacterRanges(RECharSet
*charSet
,
3731 const CharacterRange
*range
,
3732 const CharacterRange
*end
)
3734 uint16 previous
= 0;
3735 for (; range
< end
; ++range
) {
3736 AddCharacterRangeToCharSet(charSet
, previous
, range
->start
- 1);
3737 previous
= range
->end
+ 1;
3739 AddCharacterRangeToCharSet(charSet
, previous
, charSet
->length
);
3742 /* Compile the source of the class into a RECharSet */
3744 ProcessCharSet(JSContext
*cx
, JSRegExp
*re
, RECharSet
*charSet
)
3746 const jschar
*src
, *end
;
3747 JSBool inRange
= JS_FALSE
;
3748 jschar rangeStart
= 0;
3749 uintN byteLength
, n
;
3753 JS_ASSERT(!charSet
->converted
);
3755 * Assert that startIndex and length points to chars inside [] inside
3758 JS_ASSERT(1 <= charSet
->u
.src
.startIndex
);
3759 JS_ASSERT(charSet
->u
.src
.startIndex
< re
->source
->length());
3760 JS_ASSERT(charSet
->u
.src
.length
<= re
->source
->length()
3761 - 1 - charSet
->u
.src
.startIndex
);
3763 charSet
->converted
= JS_TRUE
;
3764 src
= re
->source
->chars() + charSet
->u
.src
.startIndex
;
3765 end
= src
+ charSet
->u
.src
.length
;
3766 JS_ASSERT(src
[-1] == '[');
3767 JS_ASSERT(end
[0] == ']');
3769 byteLength
= (charSet
->length
>> 3) + 1;
3770 charSet
->u
.bits
= (uint8
*)cx
->malloc(byteLength
);
3771 if (!charSet
->u
.bits
) {
3772 JS_ReportOutOfMemory(cx
);
3775 memset(charSet
->u
.bits
, 0, byteLength
);
3781 JS_ASSERT(charSet
->sense
== JS_FALSE
);
3784 JS_ASSERT(charSet
->sense
== JS_TRUE
);
3787 while (src
!= end
) {
3812 if (src
< end
&& JS_ISWORD(*src
)) {
3813 thisCh
= (jschar
)(*src
++ & 0x1F);
3826 for (i
= 0; (i
< nDigits
) && (src
< end
); i
++) {
3829 if (!isASCIIHexDigit(c
, &digit
)) {
3831 * Back off to accepting the original '\'
3838 n
= (n
<< 4) | digit
;
3851 * This is a non-ECMA extension - decimal escapes (in this
3852 * case, octal!) are supposed to be an error inside class
3853 * ranges, but supported here for backwards compatibility.
3857 if ('0' <= c
&& c
<= '7') {
3859 n
= 8 * n
+ JS7_UNDEC(c
);
3861 if ('0' <= c
&& c
<= '7') {
3863 i
= 8 * n
+ JS7_UNDEC(c
);
3874 AddCharacterRangeToCharSet(charSet
, '0', '9');
3875 continue; /* don't need range processing */
3877 AddCharacterRangeToCharSet(charSet
, 0, '0' - 1);
3878 AddCharacterRangeToCharSet(charSet
,
3880 (jschar
)charSet
->length
);
3883 AddCharacterRanges(charSet
, WhiteSpaceRanges
,
3884 WhiteSpaceRanges
+ JS_ARRAY_LENGTH(WhiteSpaceRanges
));
3887 AddInvertedCharacterRanges(charSet
, WhiteSpaceRanges
,
3888 WhiteSpaceRanges
+ JS_ARRAY_LENGTH(WhiteSpaceRanges
));
3891 AddCharacterRanges(charSet
, WordRanges
,
3892 WordRanges
+ JS_ARRAY_LENGTH(WordRanges
));
3895 AddInvertedCharacterRanges(charSet
, WordRanges
,
3896 WordRanges
+ JS_ARRAY_LENGTH(WordRanges
));
3911 if (re
->flags
& JSREG_FOLD
) {
3914 JS_ASSERT(rangeStart
<= thisCh
);
3915 for (i
= rangeStart
; i
<= thisCh
; i
++) {
3918 AddCharacterToCharSet(charSet
, jschar(i
));
3919 uch
= jschar(upcase(i
));
3920 dch
= inverse_upcase(jschar(i
));
3922 AddCharacterToCharSet(charSet
, uch
);
3924 AddCharacterToCharSet(charSet
, dch
);
3927 AddCharacterRangeToCharSet(charSet
, rangeStart
, thisCh
);
3931 if (re
->flags
& JSREG_FOLD
) {
3932 AddCharacterToCharSet(charSet
, jschar(upcase(thisCh
)));
3933 AddCharacterToCharSet(charSet
, inverse_upcase(thisCh
));
3935 AddCharacterToCharSet(charSet
, thisCh
);
3937 if (src
< end
- 1) {
3941 rangeStart
= thisCh
;
3949 static inline JSBool
3950 MatcherProcessCharSet(REGlobalData
*gData
, RECharSet
*charSet
) {
3951 JSBool rv
= ProcessCharSet(gData
->cx
, gData
->regexp
, charSet
);
3952 if (!rv
) gData
->ok
= JS_FALSE
;
3957 js_DestroyRegExp(JSContext
*cx
, JSRegExp
*re
)
3959 if (JS_ATOMIC_DECREMENT(&re
->nrefs
) == 0) {
3960 if (re
->classList
) {
3962 for (i
= 0; i
< re
->classCount
; i
++) {
3963 if (re
->classList
[i
].converted
)
3964 cx
->free(re
->classList
[i
].u
.bits
);
3965 re
->classList
[i
].u
.bits
= NULL
;
3967 cx
->free(re
->classList
);
3974 ReallocStateStack(REGlobalData
*gData
)
3976 size_t limit
= gData
->stateStackLimit
;
3977 size_t sz
= sizeof(REProgState
) * limit
;
3979 gData
->cx
->regexpPool
.growCast
<REProgState
*>(gData
->stateStack
, sz
, sz
);
3980 if (!gData
->stateStack
) {
3981 js_ReportOutOfScriptQuota(gData
->cx
);
3982 gData
->ok
= JS_FALSE
;
3985 gData
->stateStackLimit
= limit
+ limit
;
3989 #define PUSH_STATE_STACK(data) \
3991 ++(data)->stateStackTop; \
3992 if ((data)->stateStackTop == (data)->stateStackLimit && \
3993 !ReallocStateStack((data))) { \
3999 * Apply the current op against the given input to see if it's going to match
4000 * or fail. Return false if we don't get a match, true if we do. If updatecp is
4001 * true, then update the current state's cp. Always update startpc to the next
4004 static JS_ALWAYS_INLINE REMatchState
*
4005 SimpleMatch(REGlobalData
*gData
, REMatchState
*x
, REOp op
,
4006 jsbytecode
**startpc
, JSBool updatecp
)
4008 REMatchState
*result
= NULL
;
4011 size_t offset
, length
, index
;
4012 jsbytecode
*pc
= *startpc
; /* pc has already been incremented past op */
4014 const jschar
*startcp
= x
->cp
;
4019 const char *opname
= reop_names
[op
];
4020 re_debug("\n%06d: %*s%s", pc
- gData
->regexp
->program
,
4021 gData
->stateStackTop
* 2, "", opname
);
4028 if (x
->cp
!= gData
->cpbegin
) {
4029 if (!gData
->cx
->regExpStatics
.multiline
&&
4030 !(gData
->regexp
->flags
& JSREG_MULTILINE
)) {
4033 if (!RE_IS_LINE_TERM(x
->cp
[-1]))
4039 if (x
->cp
!= gData
->cpend
) {
4040 if (!gData
->cx
->regExpStatics
.multiline
&&
4041 !(gData
->regexp
->flags
& JSREG_MULTILINE
)) {
4044 if (!RE_IS_LINE_TERM(*x
->cp
))
4050 if ((x
->cp
== gData
->cpbegin
|| !JS_ISWORD(x
->cp
[-1])) ^
4051 !(x
->cp
!= gData
->cpend
&& JS_ISWORD(*x
->cp
))) {
4056 if ((x
->cp
== gData
->cpbegin
|| !JS_ISWORD(x
->cp
[-1])) ^
4057 (x
->cp
!= gData
->cpend
&& JS_ISWORD(*x
->cp
))) {
4062 if (x
->cp
!= gData
->cpend
&& !RE_IS_LINE_TERM(*x
->cp
)) {
4068 if (x
->cp
!= gData
->cpend
&& JS7_ISDEC(*x
->cp
)) {
4074 if (x
->cp
!= gData
->cpend
&& !JS7_ISDEC(*x
->cp
)) {
4080 if (x
->cp
!= gData
->cpend
&& JS_ISWORD(*x
->cp
)) {
4086 if (x
->cp
!= gData
->cpend
&& !JS_ISWORD(*x
->cp
)) {
4092 if (x
->cp
!= gData
->cpend
&& JS_ISSPACE(*x
->cp
)) {
4098 if (x
->cp
!= gData
->cpend
&& !JS_ISSPACE(*x
->cp
)) {
4104 pc
= ReadCompactIndex(pc
, &parenIndex
);
4105 JS_ASSERT(parenIndex
< gData
->regexp
->parenCount
);
4106 result
= BackrefMatcher(gData
, x
, parenIndex
);
4109 pc
= ReadCompactIndex(pc
, &offset
);
4110 JS_ASSERT(offset
< gData
->regexp
->source
->length());
4111 pc
= ReadCompactIndex(pc
, &length
);
4112 JS_ASSERT(1 <= length
);
4113 JS_ASSERT(length
<= gData
->regexp
->source
->length() - offset
);
4114 if (length
<= (size_t)(gData
->cpend
- x
->cp
)) {
4115 source
= gData
->regexp
->source
->chars() + offset
;
4116 re_debug_chars(source
, length
);
4117 for (index
= 0; index
!= length
; index
++) {
4118 if (source
[index
] != x
->cp
[index
])
4127 re_debug(" '%c' == '%c'", (char)matchCh
, (char)*x
->cp
);
4128 if (x
->cp
!= gData
->cpend
&& *x
->cp
== matchCh
) {
4134 pc
= ReadCompactIndex(pc
, &offset
);
4135 JS_ASSERT(offset
< gData
->regexp
->source
->length());
4136 pc
= ReadCompactIndex(pc
, &length
);
4137 JS_ASSERT(1 <= length
);
4138 JS_ASSERT(length
<= gData
->regexp
->source
->length() - offset
);
4139 source
= gData
->regexp
->source
->chars();
4140 result
= FlatNIMatcher(gData
, x
, source
+ offset
, length
);
4144 if (x
->cp
!= gData
->cpend
&& upcase(*x
->cp
) == upcase(matchCh
)) {
4150 matchCh
= GET_ARG(pc
);
4151 re_debug(" '%c' == '%c'", (char)matchCh
, (char)*x
->cp
);
4153 if (x
->cp
!= gData
->cpend
&& *x
->cp
== matchCh
) {
4159 matchCh
= GET_ARG(pc
);
4161 if (x
->cp
!= gData
->cpend
&& upcase(*x
->cp
) == upcase(matchCh
)) {
4167 pc
= ReadCompactIndex(pc
, &index
);
4168 JS_ASSERT(index
< gData
->regexp
->classCount
);
4169 if (x
->cp
!= gData
->cpend
) {
4170 charSet
= &gData
->regexp
->classList
[index
];
4171 JS_ASSERT(charSet
->converted
);
4174 if (ch
<= charSet
->length
&&
4175 (charSet
->u
.bits
[index
] & (1 << (ch
& 0x7)))) {
4182 pc
= ReadCompactIndex(pc
, &index
);
4183 JS_ASSERT(index
< gData
->regexp
->classCount
);
4184 if (x
->cp
!= gData
->cpend
) {
4185 charSet
= &gData
->regexp
->classList
[index
];
4186 JS_ASSERT(charSet
->converted
);
4189 if (ch
> charSet
->length
||
4190 !(charSet
->u
.bits
[index
] & (1 << (ch
& 0x7)))) {
4198 JS_ASSERT(JS_FALSE
);
4211 static JS_ALWAYS_INLINE REMatchState
*
4212 ExecuteREBytecode(REGlobalData
*gData
, REMatchState
*x
)
4214 REMatchState
*result
= NULL
;
4215 REBackTrackData
*backTrackData
;
4216 jsbytecode
*nextpc
, *testpc
;
4219 REProgState
*curState
;
4220 const jschar
*startcp
;
4221 size_t parenIndex
, k
;
4222 size_t parenSoFar
= 0;
4224 jschar matchCh1
, matchCh2
;
4228 jsbytecode
*pc
= gData
->regexp
->program
;
4229 REOp op
= (REOp
) *pc
++;
4232 * If the first node is a simple match, step the index into the string
4233 * until that match is made, or fail if it can't be found at all.
4235 if (REOP_IS_SIMPLE(op
) && !(gData
->regexp
->flags
& JSREG_STICKY
)) {
4237 while (x
->cp
<= gData
->cpend
) {
4238 nextpc
= pc
; /* reset back to start each time */
4239 result
= SimpleMatch(gData
, x
, op
, &nextpc
, JS_TRUE
);
4243 pc
= nextpc
; /* accept skip to next opcode */
4245 JS_ASSERT(op
< REOP_LIMIT
);
4257 const char *opname
= reop_names
[op
];
4258 re_debug("\n%06d: %*s%s", pc
- gData
->regexp
->program
,
4259 gData
->stateStackTop
* 2, "", opname
);
4261 if (REOP_IS_SIMPLE(op
)) {
4262 result
= SimpleMatch(gData
, x
, op
, &pc
, JS_TRUE
);
4264 curState
= &gData
->stateStack
[gData
->stateStackTop
];
4268 case REOP_ALTPREREQ2
:
4269 nextpc
= pc
+ GET_OFFSET(pc
); /* start of next op */
4271 matchCh2
= GET_ARG(pc
);
4276 if (x
->cp
!= gData
->cpend
) {
4277 if (*x
->cp
== matchCh2
)
4280 charSet
= &gData
->regexp
->classList
[k
];
4281 if (!charSet
->converted
&& !MatcherProcessCharSet(gData
, charSet
))
4285 if ((matchCh1
> charSet
->length
||
4286 !(charSet
->u
.bits
[k
] & (1 << (matchCh1
& 0x7)))) ^
4294 case REOP_ALTPREREQ
:
4295 nextpc
= pc
+ GET_OFFSET(pc
); /* start of next op */
4297 matchCh1
= GET_ARG(pc
);
4299 matchCh2
= GET_ARG(pc
);
4301 if (x
->cp
== gData
->cpend
||
4302 (*x
->cp
!= matchCh1
&& *x
->cp
!= matchCh2
)) {
4306 /* else false thru... */
4310 nextpc
= pc
+ GET_OFFSET(pc
); /* start of next alternate */
4311 pc
+= ARG_LEN
; /* start of this alternate */
4312 curState
->parenSoFar
= parenSoFar
;
4313 PUSH_STATE_STACK(gData
);
4316 if (REOP_IS_SIMPLE(op
)) {
4317 if (!SimpleMatch(gData
, x
, op
, &pc
, JS_TRUE
)) {
4318 op
= (REOp
) *nextpc
++;
4325 nextop
= (REOp
) *nextpc
++;
4326 if (!PushBackTrackState(gData
, nextop
, nextpc
, x
, startcp
, 0, 0))
4331 * Occurs at (successful) end of REOP_ALT,
4335 * If we have not gotten a result here, it is because of an
4336 * empty match. Do the same thing REOP_EMPTY would do.
4341 --gData
->stateStackTop
;
4342 pc
+= GET_OFFSET(pc
);
4347 * Occurs at last (successful) end of REOP_ALT,
4351 * If we have not gotten a result here, it is because of an
4352 * empty match. Do the same thing REOP_EMPTY would do.
4357 --gData
->stateStackTop
;
4362 pc
= ReadCompactIndex(pc
, &parenIndex
);
4363 re_debug("[ %lu ]", (unsigned long) parenIndex
);
4364 JS_ASSERT(parenIndex
< gData
->regexp
->parenCount
);
4365 if (parenIndex
+ 1 > parenSoFar
)
4366 parenSoFar
= parenIndex
+ 1;
4367 x
->parens
[parenIndex
].index
= x
->cp
- gData
->cpbegin
;
4368 x
->parens
[parenIndex
].length
= 0;
4376 pc
= ReadCompactIndex(pc
, &parenIndex
);
4377 JS_ASSERT(parenIndex
< gData
->regexp
->parenCount
);
4378 cap
= &x
->parens
[parenIndex
];
4379 delta
= x
->cp
- (gData
->cpbegin
+ cap
->index
);
4380 cap
->length
= (delta
< 0) ? 0 : (size_t) delta
;
4388 nextpc
= pc
+ GET_OFFSET(pc
); /* start of term after ASSERT */
4389 pc
+= ARG_LEN
; /* start of ASSERT child */
4392 if (REOP_IS_SIMPLE(op
) &&
4393 !SimpleMatch(gData
, x
, op
, &testpc
, JS_FALSE
)) {
4397 curState
->u
.assertion
.top
=
4398 (char *)gData
->backTrackSP
- (char *)gData
->backTrackStack
;
4399 curState
->u
.assertion
.sz
= gData
->cursz
;
4400 curState
->index
= x
->cp
- gData
->cpbegin
;
4401 curState
->parenSoFar
= parenSoFar
;
4402 PUSH_STATE_STACK(gData
);
4403 if (!PushBackTrackState(gData
, REOP_ASSERTTEST
,
4404 nextpc
, x
, x
->cp
, 0, 0)) {
4409 case REOP_ASSERT_NOT
:
4410 nextpc
= pc
+ GET_OFFSET(pc
);
4414 if (REOP_IS_SIMPLE(op
) /* Note - fail to fail! */ &&
4415 SimpleMatch(gData
, x
, op
, &testpc
, JS_FALSE
) &&
4416 *testpc
== REOP_ASSERTNOTTEST
) {
4420 curState
->u
.assertion
.top
4421 = (char *)gData
->backTrackSP
-
4422 (char *)gData
->backTrackStack
;
4423 curState
->u
.assertion
.sz
= gData
->cursz
;
4424 curState
->index
= x
->cp
- gData
->cpbegin
;
4425 curState
->parenSoFar
= parenSoFar
;
4426 PUSH_STATE_STACK(gData
);
4427 if (!PushBackTrackState(gData
, REOP_ASSERTNOTTEST
,
4428 nextpc
, x
, x
->cp
, 0, 0)) {
4433 case REOP_ASSERTTEST
:
4434 --gData
->stateStackTop
;
4436 x
->cp
= gData
->cpbegin
+ curState
->index
;
4437 gData
->backTrackSP
=
4438 (REBackTrackData
*) ((char *)gData
->backTrackStack
+
4439 curState
->u
.assertion
.top
);
4440 gData
->cursz
= curState
->u
.assertion
.sz
;
4445 case REOP_ASSERTNOTTEST
:
4446 --gData
->stateStackTop
;
4448 x
->cp
= gData
->cpbegin
+ curState
->index
;
4449 gData
->backTrackSP
=
4450 (REBackTrackData
*) ((char *)gData
->backTrackStack
+
4451 curState
->u
.assertion
.top
);
4452 gData
->cursz
= curState
->u
.assertion
.sz
;
4453 result
= (!result
) ? x
: NULL
;
4456 curState
->u
.quantifier
.min
= 0;
4457 curState
->u
.quantifier
.max
= (uintN
)-1;
4460 curState
->u
.quantifier
.min
= 1;
4461 curState
->u
.quantifier
.max
= (uintN
)-1;
4464 curState
->u
.quantifier
.min
= 0;
4465 curState
->u
.quantifier
.max
= 1;
4468 pc
= ReadCompactIndex(pc
, &k
);
4469 curState
->u
.quantifier
.min
= k
;
4470 pc
= ReadCompactIndex(pc
, &k
);
4471 /* max is k - 1 to use one byte for (uintN)-1 sentinel. */
4472 curState
->u
.quantifier
.max
= k
- 1;
4473 JS_ASSERT(curState
->u
.quantifier
.min
4474 <= curState
->u
.quantifier
.max
);
4476 if (curState
->u
.quantifier
.max
== 0) {
4477 pc
= pc
+ GET_OFFSET(pc
);
4482 /* Step over <next> */
4483 nextpc
= pc
+ ARG_LEN
;
4484 op
= (REOp
) *nextpc
++;
4486 if (REOP_IS_SIMPLE(op
)) {
4487 if (!SimpleMatch(gData
, x
, op
, &nextpc
, JS_TRUE
)) {
4488 if (curState
->u
.quantifier
.min
== 0)
4492 pc
= pc
+ GET_OFFSET(pc
);
4495 op
= (REOp
) *nextpc
++;
4498 curState
->index
= startcp
- gData
->cpbegin
;
4499 curState
->continue_op
= REOP_REPEAT
;
4500 curState
->continue_pc
= pc
;
4501 curState
->parenSoFar
= parenSoFar
;
4502 PUSH_STATE_STACK(gData
);
4503 if (curState
->u
.quantifier
.min
== 0 &&
4504 !PushBackTrackState(gData
, REOP_REPEAT
, pc
, x
, startcp
,
4511 case REOP_ENDCHILD
: /* marks the end of a quantifier child */
4512 pc
= curState
[-1].continue_pc
;
4513 op
= (REOp
) curState
[-1].continue_op
;
4522 --gData
->stateStackTop
;
4524 /* Failed, see if we have enough children. */
4525 if (curState
->u
.quantifier
.min
== 0)
4529 if (curState
->u
.quantifier
.min
== 0 &&
4530 x
->cp
== gData
->cpbegin
+ curState
->index
) {
4531 /* matched an empty string, that'll get us nowhere */
4535 if (curState
->u
.quantifier
.min
!= 0)
4536 curState
->u
.quantifier
.min
--;
4537 if (curState
->u
.quantifier
.max
!= (uintN
) -1)
4538 curState
->u
.quantifier
.max
--;
4539 if (curState
->u
.quantifier
.max
== 0)
4541 nextpc
= pc
+ ARG_LEN
;
4542 nextop
= (REOp
) *nextpc
;
4544 if (REOP_IS_SIMPLE(nextop
)) {
4546 if (!SimpleMatch(gData
, x
, nextop
, &nextpc
, JS_TRUE
)) {
4547 if (curState
->u
.quantifier
.min
== 0)
4554 curState
->index
= startcp
- gData
->cpbegin
;
4555 PUSH_STATE_STACK(gData
);
4556 if (curState
->u
.quantifier
.min
== 0 &&
4557 !PushBackTrackState(gData
, REOP_REPEAT
,
4559 curState
->parenSoFar
,
4561 curState
->parenSoFar
)) {
4564 } while (*nextpc
== REOP_ENDCHILD
);
4567 parenSoFar
= curState
->parenSoFar
;
4572 pc
+= GET_OFFSET(pc
);
4575 case REOP_MINIMALSTAR
:
4576 curState
->u
.quantifier
.min
= 0;
4577 curState
->u
.quantifier
.max
= (uintN
)-1;
4578 goto minimalquantcommon
;
4579 case REOP_MINIMALPLUS
:
4580 curState
->u
.quantifier
.min
= 1;
4581 curState
->u
.quantifier
.max
= (uintN
)-1;
4582 goto minimalquantcommon
;
4583 case REOP_MINIMALOPT
:
4584 curState
->u
.quantifier
.min
= 0;
4585 curState
->u
.quantifier
.max
= 1;
4586 goto minimalquantcommon
;
4587 case REOP_MINIMALQUANT
:
4588 pc
= ReadCompactIndex(pc
, &k
);
4589 curState
->u
.quantifier
.min
= k
;
4590 pc
= ReadCompactIndex(pc
, &k
);
4591 /* See REOP_QUANT comments about k - 1. */
4592 curState
->u
.quantifier
.max
= k
- 1;
4593 JS_ASSERT(curState
->u
.quantifier
.min
4594 <= curState
->u
.quantifier
.max
);
4596 curState
->index
= x
->cp
- gData
->cpbegin
;
4597 curState
->parenSoFar
= parenSoFar
;
4598 PUSH_STATE_STACK(gData
);
4599 if (curState
->u
.quantifier
.min
!= 0) {
4600 curState
->continue_op
= REOP_MINIMALREPEAT
;
4601 curState
->continue_pc
= pc
;
4602 /* step over <next> */
4606 if (!PushBackTrackState(gData
, REOP_MINIMALREPEAT
,
4607 pc
, x
, x
->cp
, 0, 0)) {
4610 --gData
->stateStackTop
;
4611 pc
= pc
+ GET_OFFSET(pc
);
4616 case REOP_MINIMALREPEAT
:
4617 --gData
->stateStackTop
;
4620 re_debug("{%d,%d}", curState
->u
.quantifier
.min
,
4621 curState
->u
.quantifier
.max
);
4622 #define PREPARE_REPEAT() \
4624 curState->index = x->cp - gData->cpbegin; \
4625 curState->continue_op = REOP_MINIMALREPEAT; \
4626 curState->continue_pc = pc; \
4628 for (k = curState->parenSoFar; k < parenSoFar; k++) \
4629 x->parens[k].index = -1; \
4630 PUSH_STATE_STACK(gData); \
4631 op = (REOp) *pc++; \
4632 JS_ASSERT(op < REOP_LIMIT); \
4638 * Non-greedy failure - try to consume another child.
4640 if (curState
->u
.quantifier
.max
== (uintN
) -1 ||
4641 curState
->u
.quantifier
.max
> 0) {
4645 /* Don't need to adjust pc since we're going to pop. */
4648 if (curState
->u
.quantifier
.min
== 0 &&
4649 x
->cp
== gData
->cpbegin
+ curState
->index
) {
4650 /* Matched an empty string, that'll get us nowhere. */
4654 if (curState
->u
.quantifier
.min
!= 0)
4655 curState
->u
.quantifier
.min
--;
4656 if (curState
->u
.quantifier
.max
!= (uintN
) -1)
4657 curState
->u
.quantifier
.max
--;
4658 if (curState
->u
.quantifier
.min
!= 0) {
4662 curState
->index
= x
->cp
- gData
->cpbegin
;
4663 curState
->parenSoFar
= parenSoFar
;
4664 PUSH_STATE_STACK(gData
);
4665 if (!PushBackTrackState(gData
, REOP_MINIMALREPEAT
,
4667 curState
->parenSoFar
,
4668 parenSoFar
- curState
->parenSoFar
)) {
4671 --gData
->stateStackTop
;
4672 pc
= pc
+ GET_OFFSET(pc
);
4674 JS_ASSERT(op
< REOP_LIMIT
);
4677 JS_ASSERT(JS_FALSE
);
4684 * If the match failed and there's a backtrack option, take it.
4685 * Otherwise this is a complete and utter failure.
4688 if (gData
->cursz
== 0)
4690 if (!JS_CHECK_OPERATION_LIMIT(gData
->cx
)) {
4691 gData
->ok
= JS_FALSE
;
4695 /* Potentially detect explosive regex here. */
4696 gData
->backTrackCount
++;
4697 if (gData
->backTrackLimit
&&
4698 gData
->backTrackCount
>= gData
->backTrackLimit
) {
4699 JS_ReportErrorNumber(gData
->cx
, js_GetErrorMessage
, NULL
,
4700 JSMSG_REGEXP_TOO_COMPLEX
);
4701 gData
->ok
= JS_FALSE
;
4705 backTrackData
= gData
->backTrackSP
;
4706 gData
->cursz
= backTrackData
->sz
;
4707 gData
->backTrackSP
=
4708 (REBackTrackData
*) ((char *)backTrackData
- backTrackData
->sz
);
4709 x
->cp
= backTrackData
->cp
;
4710 pc
= backTrackData
->backtrack_pc
;
4711 op
= (REOp
) backTrackData
->backtrack_op
;
4712 JS_ASSERT(op
< REOP_LIMIT
);
4713 gData
->stateStackTop
= backTrackData
->saveStateStackTop
;
4714 JS_ASSERT(gData
->stateStackTop
);
4716 memcpy(gData
->stateStack
, backTrackData
+ 1,
4717 sizeof(REProgState
) * backTrackData
->saveStateStackTop
);
4718 curState
= &gData
->stateStack
[gData
->stateStackTop
- 1];
4720 if (backTrackData
->parenCount
) {
4721 memcpy(&x
->parens
[backTrackData
->parenIndex
],
4722 (char *)(backTrackData
+ 1) +
4723 sizeof(REProgState
) * backTrackData
->saveStateStackTop
,
4724 sizeof(RECapture
) * backTrackData
->parenCount
);
4725 parenSoFar
= backTrackData
->parenIndex
+ backTrackData
->parenCount
;
4727 for (k
= curState
->parenSoFar
; k
< parenSoFar
; k
++)
4728 x
->parens
[k
].index
= -1;
4729 parenSoFar
= curState
->parenSoFar
;
4732 re_debug("\tBT_Pop: %ld,%ld",
4733 (unsigned long) backTrackData
->parenIndex
,
4734 (unsigned long) backTrackData
->parenCount
);
4740 * Continue with the expression.
4743 JS_ASSERT(op
< REOP_LIMIT
);
4755 static REMatchState
*
4756 MatchRegExp(REGlobalData
*gData
, REMatchState
*x
)
4758 const jschar
*cpOrig
= x
->cp
;
4761 NativeRegExp native
;
4763 /* Run with native regexp if possible. */
4764 if (TRACING_ENABLED(gData
->cx
) &&
4765 !(gData
->regexp
->flags
& JSREG_NOCOMPILE
) &&
4766 (native
= GetNativeRegExp(gData
->cx
, gData
->regexp
))) {
4769 * For efficient native execution, store offset as a direct pointer into
4770 * the buffer and convert back after execution finishes.
4772 gData
->skipped
= (ptrdiff_t)cpOrig
;
4776 VOUCH_DOES_NOT_REQUIRE_STACK();
4777 JSStackFrame
*caller
= (JS_ON_TRACE(gData
->cx
))
4779 : js_GetScriptedCaller(gData
->cx
, NULL
);
4780 debug_only_printf(LC_TMRegexp
,
4781 "entering REGEXP trace at %s:%u@%u, code: %p\n",
4782 caller
? caller
->script
->filename
: "<unknown>",
4783 caller
? js_FramePCToLineNumber(gData
->cx
, caller
) : 0,
4784 caller
? FramePCOffset(caller
) : 0,
4785 JS_FUNC_TO_DATA_PTR(void *, native
));
4790 #if defined(JS_NO_FASTCALL) && defined(NANOJIT_IA32)
4792 * Although a NativeRegExp takes one argument and SIMULATE_FASTCALL is
4793 * passing two, the second goes into 'edx' and can safely be ignored.
4795 SIMULATE_FASTCALL(result
, gData
, gData
->cpend
, native
);
4797 result
= native(gData
, gData
->cpend
);
4799 debug_only_print0(LC_TMRegexp
, "leaving REGEXP trace\n");
4803 /* Restore REGlobalData::skipped and fill REMatchState. */
4804 x
->cp
= (const jschar
*)gData
->stateStack
;
4805 gData
->skipped
= (const jschar
*)gData
->skipped
- cpOrig
;
4811 * Have to include the position beyond the last character
4812 * in order to detect end-of-input/line condition.
4814 for (const jschar
*p
= cpOrig
; p
<= gData
->cpend
; p
++) {
4815 gData
->skipped
= p
- cpOrig
;
4817 for (uintN j
= 0; j
< gData
->regexp
->parenCount
; j
++)
4818 x
->parens
[j
].index
= -1;
4819 REMatchState
*result
= ExecuteREBytecode(gData
, x
);
4820 if (!gData
->ok
|| result
|| (gData
->regexp
->flags
& JSREG_STICKY
))
4822 gData
->backTrackSP
= gData
->backTrackStack
;
4824 gData
->stateStackTop
= 0;
4825 p
= cpOrig
+ gData
->skipped
;
4830 #define MIN_BACKTRACK_LIMIT 400000
4832 static REMatchState
*
4833 InitMatch(JSContext
*cx
, REGlobalData
*gData
, JSRegExp
*re
, size_t length
)
4835 REMatchState
*result
;
4838 gData
->backTrackStackSize
= INITIAL_BACKTRACK
;
4839 JSArenaPool
®expPool
= cx
->regexpPool
;
4840 regexpPool
.allocateCast
<REBackTrackData
*>(gData
->backTrackStack
, INITIAL_BACKTRACK
);
4841 if (!gData
->backTrackStack
)
4844 gData
->backTrackSP
= gData
->backTrackStack
;
4846 gData
->backTrackCount
= 0;
4847 gData
->backTrackLimit
= 0;
4848 if (JS_GetOptions(cx
) & JSOPTION_RELIMIT
) {
4849 gData
->backTrackLimit
= length
* length
* length
; /* O(n^3) */
4850 if (gData
->backTrackLimit
< MIN_BACKTRACK_LIMIT
)
4851 gData
->backTrackLimit
= MIN_BACKTRACK_LIMIT
;
4854 gData
->stateStackLimit
= INITIAL_STATESTACK
;
4855 regexpPool
.allocateCast
<REProgState
*>(gData
->stateStack
,
4856 sizeof(REProgState
) * INITIAL_STATESTACK
);
4857 if (!gData
->stateStack
)
4860 gData
->stateStackTop
= 0;
4863 gData
->ok
= JS_TRUE
;
4865 regexpPool
.allocateCast
<REMatchState
*>(result
, offsetof(REMatchState
, parens
)
4866 + re
->parenCount
* sizeof(RECapture
));
4870 for (i
= 0; i
< re
->classCount
; i
++) {
4871 if (!re
->classList
[i
].converted
&&
4872 !MatcherProcessCharSet(gData
, &re
->classList
[i
])) {
4880 js_ReportOutOfScriptQuota(cx
);
4881 gData
->ok
= JS_FALSE
;
4886 js_ExecuteRegExp(JSContext
*cx
, JSRegExp
*re
, JSString
*str
, size_t *indexp
,
4887 JSBool test
, jsval
*rval
)
4890 REMatchState
*x
, *result
;
4892 const jschar
*cp
, *ep
;
4893 size_t i
, length
, start
;
4895 JSRegExpStatics
*res
;
4898 JSString
*parstr
, *matchstr
;
4901 RECapture
*parsub
= NULL
;
4906 * It's safe to load from cp because JSStrings have a zero at the end,
4907 * and we never let cp get beyond cpend.
4910 str
->getCharsAndLength(cp
, length
);
4914 gData
.cpend
= cp
+ length
;
4916 gData
.start
= start
;
4919 if (!cx
->regexpPool
.getSecond()) {
4921 * The first arena in the regexpPool must have a timestamp at its base.
4923 cx
->regexpPool
.allocateCast
<int64
*>(timestamp
, sizeof *timestamp
);
4926 *timestamp
= JS_Now();
4928 mark
= cx
->regexpPool
.getMark();
4930 x
= InitMatch(cx
, &gData
, re
, length
);
4939 * Call the recursive matcher to do the real work. Return null on mismatch
4940 * whether testing or not. On match, return an extended Array object.
4942 result
= MatchRegExp(&gData
, x
);
4951 i
= cp
- gData
.cpbegin
;
4953 matchlen
= i
- (start
+ gData
.skipped
);
4954 JS_ASSERT(matchlen
>= 0);
4960 * Testing for a match and updating cx->regExpStatics: don't allocate
4961 * an array object, do return true.
4965 /* Avoid warning. (gcc doesn't detect that obj is needed iff !test); */
4969 * The array returned on match has element 0 bound to the matched
4970 * string, elements 1 through state.parenCount bound to the paren
4971 * matches, an index property telling the length of the left context,
4972 * and an input property referring to the input string.
4974 obj
= js_NewSlowArrayObject(cx
);
4979 *rval
= OBJECT_TO_JSVAL(obj
);
4981 #define DEFVAL(val, id) { \
4982 ok = js_DefineProperty(cx, obj, id, val, \
4983 JS_PropertyStub, JS_PropertyStub, \
4984 JSPROP_ENUMERATE); \
4989 matchstr
= js_NewDependentString(cx
, str
, cp
- str
->chars(),
4995 DEFVAL(STRING_TO_JSVAL(matchstr
), INT_TO_JSID(0));
4998 res
= &cx
->regExpStatics
;
5000 if (!res
->parens
.resize(re
->parenCount
)) {
5004 if (re
->parenCount
== 0) {
5005 res
->lastParen
= js_EmptySubString
;
5007 for (num
= 0; num
< re
->parenCount
; num
++) {
5008 JSSubString
*sub
= &res
->parens
[num
];
5009 parsub
= &result
->parens
[num
];
5010 if (parsub
->index
== -1) {
5014 sub
->chars
= gData
.cpbegin
+ parsub
->index
;
5015 sub
->length
= parsub
->length
;
5019 if (parsub
->index
== -1) {
5020 ok
= js_DefineProperty(cx
, obj
, INT_TO_JSID(num
+ 1), JSVAL_VOID
, NULL
, NULL
,
5023 parstr
= js_NewDependentString(cx
, str
,
5024 gData
.cpbegin
+ parsub
->index
-
5031 ok
= js_DefineProperty(cx
, obj
, INT_TO_JSID(num
+ 1), STRING_TO_JSVAL(parstr
),
5032 NULL
, NULL
, JSPROP_ENUMERATE
);
5037 if (parsub
->index
== -1) {
5038 res
->lastParen
= js_EmptySubString
;
5040 res
->lastParen
.chars
= gData
.cpbegin
+ parsub
->index
;
5041 res
->lastParen
.length
= parsub
->length
;
5047 * Define the index and input properties last for better for/in loop
5048 * order (so they come after the elements).
5050 DEFVAL(INT_TO_JSVAL(start
+ gData
.skipped
),
5051 ATOM_TO_JSID(cx
->runtime
->atomState
.indexAtom
));
5052 DEFVAL(STRING_TO_JSVAL(str
),
5053 ATOM_TO_JSID(cx
->runtime
->atomState
.inputAtom
));
5058 res
->lastMatch
.chars
= cp
;
5059 res
->lastMatch
.length
= matchlen
;
5062 * For JS1.3 and ECMAv2, emulate Perl5 exactly:
5064 * js1.3 "hi", "hi there" "hihitherehi therebye"
5066 res
->leftContext
.chars
= str
->chars();
5067 res
->leftContext
.length
= start
+ gData
.skipped
;
5068 res
->rightContext
.chars
= ep
;
5069 res
->rightContext
.length
= gData
.cpend
- ep
;
5072 cx
->regexpPool
.release(mark
);
5076 /************************************************************************/
5079 SetRegExpLastIndex(JSContext
*cx
, JSObject
*obj
, jsdouble lastIndex
)
5081 JS_ASSERT(obj
->isRegExp());
5082 return JS_NewNumberValue(cx
, lastIndex
, obj
->addressOfRegExpLastIndex());
5086 regexp_getProperty(JSContext
*cx
, JSObject
*obj
, jsval id
, jsval
*vp
)
5091 if (!JSVAL_IS_INT(id
))
5093 while (obj
->getClass() != &js_RegExpClass
) {
5094 obj
= obj
->getProto();
5098 slot
= JSVAL_TO_INT(id
);
5099 if (slot
== REGEXP_LAST_INDEX
) {
5100 *vp
= obj
->getRegExpLastIndex();
5104 JS_LOCK_OBJ(cx
, obj
);
5105 re
= (JSRegExp
*) obj
->getPrivate();
5109 *vp
= STRING_TO_JSVAL(re
->source
);
5112 *vp
= BOOLEAN_TO_JSVAL((re
->flags
& JSREG_GLOB
) != 0);
5114 case REGEXP_IGNORE_CASE
:
5115 *vp
= BOOLEAN_TO_JSVAL((re
->flags
& JSREG_FOLD
) != 0);
5117 case REGEXP_MULTILINE
:
5118 *vp
= BOOLEAN_TO_JSVAL((re
->flags
& JSREG_MULTILINE
) != 0);
5121 *vp
= BOOLEAN_TO_JSVAL((re
->flags
& JSREG_STICKY
) != 0);
5125 JS_UNLOCK_OBJ(cx
, obj
);
5130 regexp_setProperty(JSContext
*cx
, JSObject
*obj
, jsval id
, jsval
*vp
)
5137 if (!JSVAL_IS_INT(id
))
5139 while (obj
->getClass() != &js_RegExpClass
) {
5140 obj
= obj
->getProto();
5144 slot
= JSVAL_TO_INT(id
);
5145 if (slot
== REGEXP_LAST_INDEX
) {
5146 if (!JS_ValueToNumber(cx
, *vp
, &lastIndex
))
5148 lastIndex
= js_DoubleToInteger(lastIndex
);
5149 ok
= SetRegExpLastIndex(cx
, obj
, lastIndex
);
5154 #define REGEXP_PROP_ATTRS (JSPROP_PERMANENT | JSPROP_SHARED)
5155 #define RO_REGEXP_PROP_ATTRS (REGEXP_PROP_ATTRS | JSPROP_READONLY)
5157 #define G regexp_getProperty
5158 #define S regexp_setProperty
5160 static JSPropertySpec regexp_props
[] = {
5161 {"source", REGEXP_SOURCE
, RO_REGEXP_PROP_ATTRS
,G
,S
},
5162 {"global", REGEXP_GLOBAL
, RO_REGEXP_PROP_ATTRS
,G
,S
},
5163 {"ignoreCase", REGEXP_IGNORE_CASE
, RO_REGEXP_PROP_ATTRS
,G
,S
},
5164 {"lastIndex", REGEXP_LAST_INDEX
, REGEXP_PROP_ATTRS
,G
,S
},
5165 {"multiline", REGEXP_MULTILINE
, RO_REGEXP_PROP_ATTRS
,G
,S
},
5166 {"sticky", REGEXP_STICKY
, RO_REGEXP_PROP_ATTRS
,G
,S
},
5174 * RegExp class static properties and their Perl counterparts:
5177 * RegExp.multiline $*
5178 * RegExp.lastMatch $&
5179 * RegExp.lastParen $+
5180 * RegExp.leftContext $`
5181 * RegExp.rightContext $'
5183 enum regexp_static_tinyid
{
5184 REGEXP_STATIC_INPUT
= -1,
5185 REGEXP_STATIC_MULTILINE
= -2,
5186 REGEXP_STATIC_LAST_MATCH
= -3,
5187 REGEXP_STATIC_LAST_PAREN
= -4,
5188 REGEXP_STATIC_LEFT_CONTEXT
= -5,
5189 REGEXP_STATIC_RIGHT_CONTEXT
= -6
5193 js_InitRegExpStatics(JSContext
*cx
)
5196 * To avoid multiple allocations in InitMatch(), the arena size parameter
5197 * should be at least as big as:
5199 * + (sizeof(REProgState) * INITIAL_STATESTACK)
5200 * + (offsetof(REMatchState, parens) + avgParanSize * sizeof(RECapture))
5202 cx
->regexpPool
.init("regexp", 12 * 1024 - 40, /* FIXME: bug 421435 */
5203 sizeof(void *), &cx
->scriptStackQuota
);
5205 JS_ClearRegExpStatics(cx
);
5209 js_SaveAndClearRegExpStatics(JSContext
*cx
, JSRegExpStatics
*statics
,
5210 AutoValueRooter
*tvr
)
5212 statics
->copy(cx
->regExpStatics
);
5214 tvr
->setString(statics
->input
);
5215 JS_ClearRegExpStatics(cx
);
5219 js_RestoreRegExpStatics(JSContext
*cx
, JSRegExpStatics
*statics
,
5220 AutoValueRooter
*tvr
)
5222 /* Clear/free any new JSRegExpStatics data before clobbering. */
5223 cx
->regExpStatics
.copy(*statics
);
5227 js_TraceRegExpStatics(JSTracer
*trc
, JSContext
*acx
)
5229 JSRegExpStatics
*res
= &acx
->regExpStatics
;
5232 JS_CALL_STRING_TRACER(trc
, res
->input
, "res->input");
5236 js_FreeRegExpStatics(JSContext
*cx
)
5238 JS_ClearRegExpStatics(cx
);
5239 cx
->regexpPool
.finish();
5243 regexp_static_getProperty(JSContext
*cx
, JSObject
*obj
, jsval id
, jsval
*vp
)
5246 JSRegExpStatics
*res
;
5250 res
= &cx
->regExpStatics
;
5251 if (!JSVAL_IS_INT(id
))
5253 slot
= JSVAL_TO_INT(id
);
5255 case REGEXP_STATIC_INPUT
:
5256 *vp
= res
->input
? STRING_TO_JSVAL(res
->input
)
5257 : JS_GetEmptyStringValue(cx
);
5259 case REGEXP_STATIC_MULTILINE
:
5260 *vp
= BOOLEAN_TO_JSVAL(res
->multiline
);
5262 case REGEXP_STATIC_LAST_MATCH
:
5263 sub
= &res
->lastMatch
;
5265 case REGEXP_STATIC_LAST_PAREN
:
5266 sub
= &res
->lastParen
;
5268 case REGEXP_STATIC_LEFT_CONTEXT
:
5269 sub
= &res
->leftContext
;
5271 case REGEXP_STATIC_RIGHT_CONTEXT
:
5272 sub
= &res
->rightContext
;
5275 sub
= (size_t(slot
) < res
->parens
.length()) ? &res
->parens
[slot
] : &js_EmptySubString
;
5278 str
= js_NewStringCopyN(cx
, sub
->chars
, sub
->length
);
5281 *vp
= STRING_TO_JSVAL(str
);
5286 regexp_static_setProperty(JSContext
*cx
, JSObject
*obj
, jsval id
, jsval
*vp
)
5288 JSRegExpStatics
*res
;
5290 if (!JSVAL_IS_INT(id
))
5292 res
= &cx
->regExpStatics
;
5293 /* XXX use if-else rather than switch to keep MSVC1.52 from crashing */
5294 if (JSVAL_TO_INT(id
) == REGEXP_STATIC_INPUT
) {
5295 if (!JSVAL_IS_STRING(*vp
) &&
5296 !JS_ConvertValue(cx
, *vp
, JSTYPE_STRING
, vp
)) {
5299 res
->input
= JSVAL_TO_STRING(*vp
);
5300 } else if (JSVAL_TO_INT(id
) == REGEXP_STATIC_MULTILINE
) {
5301 if (!JSVAL_IS_BOOLEAN(*vp
) &&
5302 !JS_ConvertValue(cx
, *vp
, JSTYPE_BOOLEAN
, vp
)) {
5305 res
->multiline
= JSVAL_TO_BOOLEAN(*vp
);
5309 #define REGEXP_STATIC_PROP_ATTRS (REGEXP_PROP_ATTRS | JSPROP_ENUMERATE)
5310 #define RO_REGEXP_STATIC_PROP_ATTRS (REGEXP_STATIC_PROP_ATTRS | JSPROP_READONLY)
5312 static JSPropertySpec regexp_static_props
[] = {
5314 REGEXP_STATIC_INPUT
,
5315 REGEXP_STATIC_PROP_ATTRS
,
5316 regexp_static_getProperty
, regexp_static_setProperty
},
5318 REGEXP_STATIC_MULTILINE
,
5319 REGEXP_STATIC_PROP_ATTRS
,
5320 regexp_static_getProperty
, regexp_static_setProperty
},
5322 REGEXP_STATIC_LAST_MATCH
,
5323 RO_REGEXP_STATIC_PROP_ATTRS
,
5324 regexp_static_getProperty
, regexp_static_getProperty
},
5326 REGEXP_STATIC_LAST_PAREN
,
5327 RO_REGEXP_STATIC_PROP_ATTRS
,
5328 regexp_static_getProperty
, regexp_static_getProperty
},
5330 REGEXP_STATIC_LEFT_CONTEXT
,
5331 RO_REGEXP_STATIC_PROP_ATTRS
,
5332 regexp_static_getProperty
, regexp_static_getProperty
},
5334 REGEXP_STATIC_RIGHT_CONTEXT
,
5335 RO_REGEXP_STATIC_PROP_ATTRS
,
5336 regexp_static_getProperty
, regexp_static_getProperty
},
5338 /* XXX should have block scope and local $1, etc. */
5339 {"$1", 0, RO_REGEXP_STATIC_PROP_ATTRS
,
5340 regexp_static_getProperty
, regexp_static_getProperty
},
5341 {"$2", 1, RO_REGEXP_STATIC_PROP_ATTRS
,
5342 regexp_static_getProperty
, regexp_static_getProperty
},
5343 {"$3", 2, RO_REGEXP_STATIC_PROP_ATTRS
,
5344 regexp_static_getProperty
, regexp_static_getProperty
},
5345 {"$4", 3, RO_REGEXP_STATIC_PROP_ATTRS
,
5346 regexp_static_getProperty
, regexp_static_getProperty
},
5347 {"$5", 4, RO_REGEXP_STATIC_PROP_ATTRS
,
5348 regexp_static_getProperty
, regexp_static_getProperty
},
5349 {"$6", 5, RO_REGEXP_STATIC_PROP_ATTRS
,
5350 regexp_static_getProperty
, regexp_static_getProperty
},
5351 {"$7", 6, RO_REGEXP_STATIC_PROP_ATTRS
,
5352 regexp_static_getProperty
, regexp_static_getProperty
},
5353 {"$8", 7, RO_REGEXP_STATIC_PROP_ATTRS
,
5354 regexp_static_getProperty
, regexp_static_getProperty
},
5355 {"$9", 8, RO_REGEXP_STATIC_PROP_ATTRS
,
5356 regexp_static_getProperty
, regexp_static_getProperty
},
5362 regexp_finalize(JSContext
*cx
, JSObject
*obj
)
5364 JSRegExp
*re
= (JSRegExp
*) obj
->getPrivate();
5367 js_DestroyRegExp(cx
, re
);
5370 /* Forward static prototype. */
5372 regexp_exec_sub(JSContext
*cx
, JSObject
*obj
, uintN argc
, jsval
*argv
,
5373 JSBool test
, jsval
*rval
);
5376 regexp_call(JSContext
*cx
, JSObject
*obj
, uintN argc
, jsval
*argv
, jsval
*rval
)
5378 return regexp_exec_sub(cx
, JSVAL_TO_OBJECT(argv
[-2]), argc
, argv
,
5384 #include "jsxdrapi.h"
5387 js_XDRRegExpObject(JSXDRState
*xdr
, JSObject
**objp
)
5394 if (xdr
->mode
== JSXDR_ENCODE
) {
5395 re
= (JSRegExp
*) (*objp
)->getPrivate();
5398 source
= re
->source
;
5399 flagsword
= (uint32
)re
->flags
;
5401 if (!JS_XDRString(xdr
, &source
) ||
5402 !JS_XDRUint32(xdr
, &flagsword
)) {
5405 if (xdr
->mode
== JSXDR_DECODE
) {
5406 obj
= NewObject(xdr
->cx
, &js_RegExpClass
, NULL
, NULL
);
5411 re
= js_NewRegExp(xdr
->cx
, NULL
, source
, (uint8
)flagsword
, JS_FALSE
);
5414 obj
->setPrivate(re
);
5415 obj
->zeroRegExpLastIndex();
5421 #else /* !JS_HAS_XDR */
5423 #define js_XDRRegExpObject NULL
5425 #endif /* !JS_HAS_XDR */
5428 regexp_trace(JSTracer
*trc
, JSObject
*obj
)
5430 JSRegExp
*re
= (JSRegExp
*) obj
->getPrivate();
5431 if (re
&& re
->source
)
5432 JS_CALL_STRING_TRACER(trc
, re
->source
, "source");
5435 JSClass js_RegExpClass
= {
5437 JSCLASS_HAS_PRIVATE
|
5438 JSCLASS_HAS_RESERVED_SLOTS(JSObject::REGEXP_FIXED_RESERVED_SLOTS
) |
5439 JSCLASS_MARK_IS_TRACE
| JSCLASS_HAS_CACHED_PROTO(JSProto_RegExp
),
5440 JS_PropertyStub
, JS_PropertyStub
,
5441 JS_PropertyStub
, JS_PropertyStub
,
5442 JS_EnumerateStub
, JS_ResolveStub
,
5443 JS_ConvertStub
, regexp_finalize
,
5446 js_XDRRegExpObject
, NULL
,
5447 JS_CLASS_TRACE(regexp_trace
), 0
5450 static const jschar empty_regexp_ucstr
[] = {'(', '?', ':', ')', 0};
5453 js_regexp_toString(JSContext
*cx
, JSObject
*obj
, jsval
*vp
)
5456 const jschar
*source
;
5458 size_t length
, nflags
;
5462 if (!JS_InstanceOf(cx
, obj
, &js_RegExpClass
, vp
+ 2))
5464 JS_LOCK_OBJ(cx
, obj
);
5465 re
= (JSRegExp
*) obj
->getPrivate();
5467 JS_UNLOCK_OBJ(cx
, obj
);
5468 *vp
= STRING_TO_JSVAL(cx
->runtime
->emptyString
);
5472 re
->source
->getCharsAndLength(source
, length
);
5474 source
= empty_regexp_ucstr
;
5475 length
= JS_ARRAY_LENGTH(empty_regexp_ucstr
) - 1;
5479 for (flags
= re
->flags
; flags
!= 0; flags
&= flags
- 1)
5481 chars
= (jschar
*) cx
->malloc((length
+ nflags
+ 1) * sizeof(jschar
));
5483 JS_UNLOCK_OBJ(cx
, obj
);
5488 js_strncpy(&chars
[1], source
, length
- 2);
5489 chars
[length
-1] = '/';
5491 if (re
->flags
& JSREG_GLOB
)
5492 chars
[length
++] = 'g';
5493 if (re
->flags
& JSREG_FOLD
)
5494 chars
[length
++] = 'i';
5495 if (re
->flags
& JSREG_MULTILINE
)
5496 chars
[length
++] = 'm';
5497 if (re
->flags
& JSREG_STICKY
)
5498 chars
[length
++] = 'y';
5500 JS_UNLOCK_OBJ(cx
, obj
);
5503 str
= js_NewString(cx
, chars
, length
);
5508 *vp
= STRING_TO_JSVAL(str
);
5513 regexp_toString(JSContext
*cx
, uintN argc
, jsval
*vp
)
5517 obj
= JS_THIS_OBJECT(cx
, vp
);
5518 return obj
&& js_regexp_toString(cx
, obj
, vp
);
5522 regexp_compile_sub(JSContext
*cx
, JSObject
*obj
, uintN argc
, jsval
*argv
,
5525 JSString
*opt
, *str
;
5526 JSRegExp
*oldre
, *re
;
5528 size_t length
, nbytes
;
5529 const jschar
*cp
, *start
, *end
;
5530 jschar
*nstart
, *ncp
, *tmp
;
5532 if (!JS_InstanceOf(cx
, obj
, &js_RegExpClass
, argv
))
5536 str
= cx
->runtime
->emptyString
;
5538 if (JSVAL_IS_OBJECT(argv
[0])) {
5540 * If we get passed in a RegExp object we construct a new
5541 * RegExp that is a duplicate of it by re-compiling the
5542 * original source code. ECMA requires that it be an error
5543 * here if the flags are specified. (We must use the flags
5544 * from the original RegExp also).
5546 obj2
= JSVAL_TO_OBJECT(argv
[0]);
5547 if (obj2
&& obj2
->getClass() == &js_RegExpClass
) {
5548 if (argc
>= 2 && !JSVAL_IS_VOID(argv
[1])) { /* 'flags' passed */
5549 JS_ReportErrorNumber(cx
, js_GetErrorMessage
, NULL
,
5550 JSMSG_NEWREGEXP_FLAGGED
);
5553 JS_LOCK_OBJ(cx
, obj2
);
5554 re
= (JSRegExp
*) obj2
->getPrivate();
5556 JS_UNLOCK_OBJ(cx
, obj2
);
5559 re
= js_NewRegExp(cx
, NULL
, re
->source
, re
->flags
, JS_FALSE
);
5560 JS_UNLOCK_OBJ(cx
, obj2
);
5564 str
= js_ValueToString(cx
, argv
[0]);
5567 argv
[0] = STRING_TO_JSVAL(str
);
5569 if (JSVAL_IS_VOID(argv
[1])) {
5572 opt
= js_ValueToString(cx
, argv
[1]);
5575 argv
[1] = STRING_TO_JSVAL(opt
);
5579 /* Escape any naked slashes in the regexp source. */
5580 str
->getCharsAndLength(start
, length
);
5581 end
= start
+ length
;
5582 nstart
= ncp
= NULL
;
5583 for (cp
= start
; cp
< end
; cp
++) {
5584 if (*cp
== '/' && (cp
== start
|| cp
[-1] != '\\')) {
5585 nbytes
= (++length
+ 1) * sizeof(jschar
);
5587 nstart
= (jschar
*) cx
->malloc(nbytes
);
5590 ncp
= nstart
+ (cp
- start
);
5591 js_strncpy(nstart
, start
, cp
- start
);
5593 tmp
= (jschar
*) cx
->realloc(nstart
, nbytes
);
5598 ncp
= tmp
+ (ncp
- nstart
);
5608 /* Don't forget to store the backstop after the new string. */
5609 JS_ASSERT((size_t)(ncp
- nstart
) == length
);
5611 str
= js_NewString(cx
, nstart
, length
);
5616 argv
[0] = STRING_TO_JSVAL(str
);
5620 re
= js_NewRegExpOpt(cx
, str
, opt
, JS_FALSE
);
5624 JS_LOCK_OBJ(cx
, obj
);
5625 oldre
= (JSRegExp
*) obj
->getPrivate();
5626 obj
->setPrivate(re
);
5627 obj
->zeroRegExpLastIndex();
5628 JS_UNLOCK_OBJ(cx
, obj
);
5630 js_DestroyRegExp(cx
, oldre
);
5631 *rval
= OBJECT_TO_JSVAL(obj
);
5636 regexp_compile(JSContext
*cx
, uintN argc
, jsval
*vp
)
5640 obj
= JS_THIS_OBJECT(cx
, vp
);
5641 return obj
&& regexp_compile_sub(cx
, obj
, argc
, vp
+ 2, vp
);
5645 regexp_exec_sub(JSContext
*cx
, JSObject
*obj
, uintN argc
, jsval
*argv
,
5646 JSBool test
, jsval
*rval
)
5654 ok
= JS_InstanceOf(cx
, obj
, &js_RegExpClass
, argv
);
5657 JS_LOCK_OBJ(cx
, obj
);
5658 re
= (JSRegExp
*) obj
->getPrivate();
5660 JS_UNLOCK_OBJ(cx
, obj
);
5664 /* NB: we must reach out: after this paragraph, in order to drop re. */
5665 HOLD_REGEXP(cx
, re
);
5666 sticky
= (re
->flags
& JSREG_STICKY
) != 0;
5667 if (re
->flags
& (JSREG_GLOB
| JSREG_STICKY
)) {
5668 jsval v
= obj
->getRegExpLastIndex();
5669 if (JSVAL_IS_INT(v
)) {
5670 lastIndex
= JSVAL_TO_INT(v
);
5672 JS_ASSERT(JSVAL_IS_DOUBLE(v
));
5673 lastIndex
= *JSVAL_TO_DOUBLE(v
);
5678 JS_UNLOCK_OBJ(cx
, obj
);
5680 /* Now that obj is unlocked, it's safe to (potentially) grab the GC lock. */
5682 str
= cx
->regExpStatics
.input
;
5684 const char *bytes
= js_GetStringBytes(cx
, re
->source
);
5687 JS_ReportErrorNumber(cx
, js_GetErrorMessage
, NULL
,
5690 (re
->flags
& JSREG_GLOB
) ? "g" : "",
5691 (re
->flags
& JSREG_FOLD
) ? "i" : "",
5692 (re
->flags
& JSREG_MULTILINE
) ? "m" : "",
5693 (re
->flags
& JSREG_STICKY
) ? "y" : "");
5699 str
= js_ValueToString(cx
, argv
[0]);
5704 argv
[0] = STRING_TO_JSVAL(str
);
5707 if (lastIndex
< 0 || str
->length() < lastIndex
) {
5708 obj
->zeroRegExpLastIndex();
5711 i
= (size_t) lastIndex
;
5712 ok
= js_ExecuteRegExp(cx
, re
, str
, &i
, test
, rval
);
5714 ((re
->flags
& JSREG_GLOB
) || (*rval
!= JSVAL_NULL
&& sticky
))) {
5715 if (*rval
== JSVAL_NULL
)
5716 obj
->zeroRegExpLastIndex();
5718 ok
= SetRegExpLastIndex(cx
, obj
, i
);
5723 DROP_REGEXP(cx
, re
);
5728 regexp_exec(JSContext
*cx
, uintN argc
, jsval
*vp
)
5730 return regexp_exec_sub(cx
, JS_THIS_OBJECT(cx
, vp
), argc
, vp
+ 2, JS_FALSE
,
5735 regexp_test(JSContext
*cx
, uintN argc
, jsval
*vp
)
5737 if (!regexp_exec_sub(cx
, JS_THIS_OBJECT(cx
, vp
), argc
, vp
+ 2, JS_TRUE
, vp
))
5739 if (*vp
!= JSVAL_TRUE
)
5744 static JSFunctionSpec regexp_methods
[] = {
5746 JS_FN(js_toSource_str
, regexp_toString
, 0,0),
5748 JS_FN(js_toString_str
, regexp_toString
, 0,0),
5749 JS_FN("compile", regexp_compile
, 2,0),
5750 JS_FN("exec", regexp_exec
, 1,0),
5751 JS_FN("test", regexp_test
, 1,0),
5756 RegExp(JSContext
*cx
, JSObject
*obj
, uintN argc
, jsval
*argv
, jsval
*rval
)
5758 if (!JS_IsConstructing(cx
)) {
5760 * If first arg is regexp and no flags are given, just return the arg.
5761 * (regexp_compile_sub detects the regexp + flags case and throws a
5762 * TypeError.) See 10.15.3.1.
5764 if ((argc
< 2 || JSVAL_IS_VOID(argv
[1])) &&
5765 !JSVAL_IS_PRIMITIVE(argv
[0]) &&
5766 JSVAL_TO_OBJECT(argv
[0])->getClass() == &js_RegExpClass
) {
5771 /* Otherwise, replace obj with a new RegExp object. */
5772 obj
= NewObject(cx
, &js_RegExpClass
, NULL
, NULL
);
5777 * regexp_compile_sub does not use rval to root its temporaries so we
5778 * can use it to root obj.
5780 *rval
= OBJECT_TO_JSVAL(obj
);
5782 return regexp_compile_sub(cx
, obj
, argc
, argv
, rval
);
5786 js_InitRegExpClass(JSContext
*cx
, JSObject
*obj
)
5788 JSObject
*proto
= js_InitClass(cx
, obj
, NULL
, &js_RegExpClass
, RegExp
, 1,
5789 regexp_props
, regexp_methods
,
5790 regexp_static_props
, NULL
);
5794 JSObject
*ctor
= JS_GetConstructor(cx
, proto
);
5798 /* Give RegExp.prototype private data so it matches the empty string. */
5800 if (!JS_AliasProperty(cx
, ctor
, "input", "$_") ||
5801 !JS_AliasProperty(cx
, ctor
, "multiline", "$*") ||
5802 !JS_AliasProperty(cx
, ctor
, "lastMatch", "$&") ||
5803 !JS_AliasProperty(cx
, ctor
, "lastParen", "$+") ||
5804 !JS_AliasProperty(cx
, ctor
, "leftContext", "$`") ||
5805 !JS_AliasProperty(cx
, ctor
, "rightContext", "$'") ||
5806 !regexp_compile_sub(cx
, proto
, 0, NULL
, &rval
)) {
5814 js_NewRegExpObject(JSContext
*cx
, TokenStream
*ts
,
5815 const jschar
*chars
, size_t length
, uintN flags
)
5821 str
= js_NewStringCopyN(cx
, chars
, length
);
5824 AutoValueRooter
tvr(cx
, str
);
5825 re
= js_NewRegExp(cx
, ts
, str
, flags
, JS_FALSE
);
5828 obj
= NewObject(cx
, &js_RegExpClass
, NULL
, NULL
);
5830 js_DestroyRegExp(cx
, re
);
5833 obj
->setPrivate(re
);
5834 obj
->zeroRegExpLastIndex();
5838 JSObject
* JS_FASTCALL
5839 js_CloneRegExpObject(JSContext
*cx
, JSObject
*obj
, JSObject
*proto
)
5841 JS_ASSERT(obj
->getClass() == &js_RegExpClass
);
5843 JS_ASSERT(proto
->getClass() == &js_RegExpClass
);
5844 JSObject
*clone
= NewObjectWithGivenProto(cx
, &js_RegExpClass
, proto
, NULL
);
5847 JSRegExp
*re
= static_cast<JSRegExp
*>(obj
->getPrivate());
5848 clone
->setPrivate(re
);
5849 clone
->zeroRegExpLastIndex();
5850 HOLD_REGEXP(cx
, re
);
5855 JS_DEFINE_CALLINFO_3(extern, OBJECT
, js_CloneRegExpObject
, CONTEXT
, OBJECT
, OBJECT
, 0,
5860 js_ContainsRegExpMetaChars(const jschar
*chars
, size_t length
)
5862 for (size_t i
= 0; i
< length
; ++i
) {
5863 jschar c
= chars
[i
];
5865 /* Taken from the PatternCharacter production in 15.10.1. */
5866 case '^': case '$': case '\\': case '.': case '*': case '+':
5867 case '?': case '(': case ')': case '[': case ']': case '{':
5877 js_ObjectIsRegExp(JSObject
*obj
)
5879 return obj
->isRegExp();