Rubber-stamped by Brady Eidson.
[webbrowser.git] / JavaScriptCore / pcre / pcre_exec.cpp
blob8ca2eb4b79efddf053cdfc4b56733f96f9e12132
1 /* This is JavaScriptCore's variant of the PCRE library. While this library
2 started out as a copy of PCRE, many of the features of PCRE have been
3 removed. This library now supports only the regular expression features
4 required by the JavaScript language specification, and has only the functions
5 needed by JavaScriptCore and the rest of WebKit.
7 Originally written by Philip Hazel
8 Copyright (c) 1997-2006 University of Cambridge
9 Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
10 Copyright (C) 2007 Eric Seidel <eric@webkit.org>
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
41 /* This module contains jsRegExpExecute(), the externally visible function
42 that does pattern matching using an NFA algorithm, following the rules from
43 the JavaScript specification. There are also some supporting functions. */
45 #include "config.h"
46 #include "pcre_internal.h"
48 #include <limits.h>
49 #include <wtf/ASCIICType.h>
50 #include <wtf/Vector.h>
52 #if REGEXP_HISTOGRAM
53 #include <wtf/DateMath.h>
54 #include <runtime/UString.h>
55 #endif
57 using namespace WTF;
59 #if COMPILER(GCC)
60 #define USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
61 //#define USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
62 #endif
64 /* Avoid warnings on Windows. */
65 #undef min
66 #undef max
68 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
69 typedef int ReturnLocation;
70 #else
71 typedef void* ReturnLocation;
72 #endif
74 #if !REGEXP_HISTOGRAM
76 class HistogramTimeLogger {
77 public:
78 HistogramTimeLogger(const JSRegExp*) { }
81 #else
83 using namespace JSC;
85 class Histogram {
86 public:
87 ~Histogram();
88 void add(const JSRegExp*, double);
90 private:
91 typedef HashMap<RefPtr<UString::Rep>, double> Map;
92 Map times;
95 class HistogramTimeLogger {
96 public:
97 HistogramTimeLogger(const JSRegExp*);
98 ~HistogramTimeLogger();
100 private:
101 const JSRegExp* m_re;
102 double m_startTime;
105 #endif
107 /* Structure for building a chain of data for holding the values of
108 the subject pointer at the start of each bracket, used to detect when
109 an empty string has been matched by a bracket to break infinite loops. */
110 struct BracketChainNode {
111 BracketChainNode* previousBracket;
112 const UChar* bracketStart;
115 struct MatchFrame : FastAllocBase {
116 ReturnLocation returnLocation;
117 struct MatchFrame* previousFrame;
119 /* Function arguments that may change */
120 struct {
121 const UChar* subjectPtr;
122 const unsigned char* instructionPtr;
123 int offsetTop;
124 BracketChainNode* bracketChain;
125 } args;
128 /* PCRE uses "fake" recursion built off of gotos, thus
129 stack-based local variables are not safe to use. Instead we have to
130 store local variables on the current MatchFrame. */
131 struct {
132 const unsigned char* data;
133 const unsigned char* startOfRepeatingBracket;
134 const UChar* subjectPtrAtStartOfInstruction; // Several instrutions stash away a subjectPtr here for later compare
135 const unsigned char* instructionPtrAtStartOfOnce;
137 int repeatOthercase;
139 int ctype;
140 int fc;
141 int fi;
142 int length;
143 int max;
144 int number;
145 int offset;
146 int saveOffset1;
147 int saveOffset2;
148 int saveOffset3;
150 BracketChainNode bracketChainNode;
151 } locals;
154 /* Structure for passing "static" information around between the functions
155 doing traditional NFA matching, so that they are thread-safe. */
157 struct MatchData {
158 int* offsetVector; /* Offset vector */
159 int offsetEnd; /* One past the end */
160 int offsetMax; /* The maximum usable for return data */
161 bool offsetOverflow; /* Set if too many extractions */
162 const UChar* startSubject; /* Start of the subject string */
163 const UChar* endSubject; /* End of the subject string */
164 const UChar* endMatchPtr; /* Subject position at end match */
165 int endOffsetTop; /* Highwater mark at end of match */
166 bool multiline;
167 bool ignoreCase;
170 /* The maximum remaining length of subject we are prepared to search for a
171 reqByte match. */
173 #define REQ_BYTE_MAX 1000
175 /* The below limit restricts the number of "recursive" match calls in order to
176 avoid spending exponential time on complex regular expressions. */
178 static const unsigned matchLimit = 1000000;
180 #ifdef DEBUG
181 /*************************************************
182 * Debugging function to print chars *
183 *************************************************/
185 /* Print a sequence of chars in printable format, stopping at the end of the
186 subject if the requested.
188 Arguments:
189 p points to characters
190 length number to print
191 isSubject true if printing from within md.startSubject
192 md pointer to matching data block, if isSubject is true
195 static void pchars(const UChar* p, int length, bool isSubject, const MatchData& md)
197 if (isSubject && length > md.endSubject - p)
198 length = md.endSubject - p;
199 while (length-- > 0) {
200 int c;
201 if (isprint(c = *(p++)))
202 printf("%c", c);
203 else if (c < 256)
204 printf("\\x%02x", c);
205 else
206 printf("\\x{%x}", c);
209 #endif
211 /*************************************************
212 * Match a back-reference *
213 *************************************************/
215 /* If a back reference hasn't been set, the length that is passed is greater
216 than the number of characters left in the string, so the match fails.
218 Arguments:
219 offset index into the offset vector
220 subjectPtr points into the subject
221 length length to be matched
222 md points to match data block
224 Returns: true if matched
227 static bool matchRef(int offset, const UChar* subjectPtr, int length, const MatchData& md)
229 const UChar* p = md.startSubject + md.offsetVector[offset];
231 #ifdef DEBUG
232 if (subjectPtr >= md.endSubject)
233 printf("matching subject <null>");
234 else {
235 printf("matching subject ");
236 pchars(subjectPtr, length, true, md);
238 printf(" against backref ");
239 pchars(p, length, false, md);
240 printf("\n");
241 #endif
243 /* Always fail if not enough characters left */
245 if (length > md.endSubject - subjectPtr)
246 return false;
248 /* Separate the caselesss case for speed */
250 if (md.ignoreCase) {
251 while (length-- > 0) {
252 UChar c = *p++;
253 int othercase = jsc_pcre_ucp_othercase(c);
254 UChar d = *subjectPtr++;
255 if (c != d && othercase != d)
256 return false;
259 else {
260 while (length-- > 0)
261 if (*p++ != *subjectPtr++)
262 return false;
265 return true;
268 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
270 /* Use numbered labels and switch statement at the bottom of the match function. */
272 #define RMATCH_WHERE(num) num
273 #define RRETURN_LABEL RRETURN_SWITCH
275 #else
277 /* Use GCC's computed goto extension. */
279 /* For one test case this is more than 40% faster than the switch statement.
280 We could avoid the use of the num argument entirely by using local labels,
281 but using it for the GCC case as well as the non-GCC case allows us to share
282 a bit more code and notice if we use conflicting numbers.*/
284 #define RMATCH_WHERE(num) &&RRETURN_##num
285 #define RRETURN_LABEL *stack.currentFrame->returnLocation
287 #endif
289 #define RECURSIVE_MATCH_COMMON(num) \
290 goto RECURSE;\
291 RRETURN_##num: \
292 stack.popCurrentFrame();
294 #define RECURSIVE_MATCH(num, ra, rb) \
295 do { \
296 stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \
297 RECURSIVE_MATCH_COMMON(num) \
298 } while (0)
300 #define RECURSIVE_MATCH_NEW_GROUP(num, ra, rb) \
301 do { \
302 stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \
303 startNewGroup(stack.currentFrame); \
304 RECURSIVE_MATCH_COMMON(num) \
305 } while (0)
307 #define RRETURN goto RRETURN_LABEL
309 #define RRETURN_NO_MATCH do { isMatch = false; RRETURN; } while (0)
311 /*************************************************
312 * Match from current position *
313 *************************************************/
315 /* On entry instructionPtr points to the first opcode, and subjectPtr to the first character
316 in the subject string, while substringStart holds the value of subjectPtr at the start of the
317 last bracketed group - used for breaking infinite loops matching zero-length
318 strings. This function is called recursively in many circumstances. Whenever it
319 returns a negative (error) response, the outer match() call must also return the
320 same response.
322 Arguments:
323 subjectPtr pointer in subject
324 instructionPtr position in code
325 offsetTop current top pointer
326 md pointer to "static" info for the match
328 Returns: 1 if matched ) these values are >= 0
329 0 if failed to match )
330 a negative error value if aborted by an error condition
331 (e.g. stopped by repeated call or recursion limit)
334 static const unsigned numFramesOnStack = 16;
336 struct MatchStack {
337 MatchStack()
338 : framesEnd(frames + numFramesOnStack)
339 , currentFrame(frames)
340 , size(1) // match() creates accesses the first frame w/o calling pushNewFrame
342 ASSERT((sizeof(frames) / sizeof(frames[0])) == numFramesOnStack);
345 MatchFrame frames[numFramesOnStack];
346 MatchFrame* framesEnd;
347 MatchFrame* currentFrame;
348 unsigned size;
350 inline bool canUseStackBufferForNextFrame()
352 return size < numFramesOnStack;
355 inline MatchFrame* allocateNextFrame()
357 if (canUseStackBufferForNextFrame())
358 return currentFrame + 1;
359 return new MatchFrame;
362 inline void pushNewFrame(const unsigned char* instructionPtr, BracketChainNode* bracketChain, ReturnLocation returnLocation)
364 MatchFrame* newframe = allocateNextFrame();
365 newframe->previousFrame = currentFrame;
367 newframe->args.subjectPtr = currentFrame->args.subjectPtr;
368 newframe->args.offsetTop = currentFrame->args.offsetTop;
369 newframe->args.instructionPtr = instructionPtr;
370 newframe->args.bracketChain = bracketChain;
371 newframe->returnLocation = returnLocation;
372 size++;
374 currentFrame = newframe;
377 inline void popCurrentFrame()
379 MatchFrame* oldFrame = currentFrame;
380 currentFrame = currentFrame->previousFrame;
381 if (size > numFramesOnStack)
382 delete oldFrame;
383 size--;
386 void popAllFrames()
388 while (size)
389 popCurrentFrame();
393 static int matchError(int errorCode, MatchStack& stack)
395 stack.popAllFrames();
396 return errorCode;
399 /* Get the next UTF-8 character, not advancing the pointer, incrementing length
400 if there are extra bytes. This is called when we know we are in UTF-8 mode. */
402 static inline void getUTF8CharAndIncrementLength(int& c, const unsigned char* subjectPtr, int& len)
404 c = *subjectPtr;
405 if ((c & 0xc0) == 0xc0) {
406 int gcaa = jsc_pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */
407 int gcss = 6 * gcaa;
408 c = (c & jsc_pcre_utf8_table3[gcaa]) << gcss;
409 for (int gcii = 1; gcii <= gcaa; gcii++) {
410 gcss -= 6;
411 c |= (subjectPtr[gcii] & 0x3f) << gcss;
413 len += gcaa;
417 static inline void startNewGroup(MatchFrame* currentFrame)
419 /* At the start of a bracketed group, add the current subject pointer to the
420 stack of such pointers, to be re-instated at the end of the group when we hit
421 the closing ket. When match() is called in other circumstances, we don't add to
422 this stack. */
424 currentFrame->locals.bracketChainNode.previousBracket = currentFrame->args.bracketChain;
425 currentFrame->locals.bracketChainNode.bracketStart = currentFrame->args.subjectPtr;
426 currentFrame->args.bracketChain = &currentFrame->locals.bracketChainNode;
429 // FIXME: "minimize" means "not greedy", we should invert the callers to ask for "greedy" to be less confusing
430 static inline void repeatInformationFromInstructionOffset(short instructionOffset, bool& minimize, int& minimumRepeats, int& maximumRepeats)
432 // Instruction offsets are based off of OP_CRSTAR, OP_STAR, OP_TYPESTAR, OP_NOTSTAR
433 static const char minimumRepeatsFromInstructionOffset[] = { 0, 0, 1, 1, 0, 0 };
434 static const int maximumRepeatsFromInstructionOffset[] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, 1, 1 };
436 ASSERT(instructionOffset >= 0);
437 ASSERT(instructionOffset <= (OP_CRMINQUERY - OP_CRSTAR));
439 minimize = (instructionOffset & 1); // this assumes ordering: Instruction, MinimizeInstruction, Instruction2, MinimizeInstruction2
440 minimumRepeats = minimumRepeatsFromInstructionOffset[instructionOffset];
441 maximumRepeats = maximumRepeatsFromInstructionOffset[instructionOffset];
444 static int match(const UChar* subjectPtr, const unsigned char* instructionPtr, int offsetTop, MatchData& md)
446 bool isMatch = false;
447 int min;
448 bool minimize = false; /* Initialization not really needed, but some compilers think so. */
449 unsigned remainingMatchCount = matchLimit;
450 int othercase; /* Declare here to avoid errors during jumps */
452 MatchStack stack;
454 /* The opcode jump table. */
455 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
456 #define EMIT_JUMP_TABLE_ENTRY(opcode) &&LABEL_OP_##opcode,
457 static void* opcodeJumpTable[256] = { FOR_EACH_OPCODE(EMIT_JUMP_TABLE_ENTRY) };
458 #undef EMIT_JUMP_TABLE_ENTRY
459 #endif
461 /* One-time setup of the opcode jump table. */
462 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
463 for (int i = 255; !opcodeJumpTable[i]; i--)
464 opcodeJumpTable[i] = &&CAPTURING_BRACKET;
465 #endif
467 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
468 // Shark shows this as a hot line
469 // Using a static const here makes this line disappear, but makes later access hotter (not sure why)
470 stack.currentFrame->returnLocation = &&RETURN;
471 #else
472 stack.currentFrame->returnLocation = 0;
473 #endif
474 stack.currentFrame->args.subjectPtr = subjectPtr;
475 stack.currentFrame->args.instructionPtr = instructionPtr;
476 stack.currentFrame->args.offsetTop = offsetTop;
477 stack.currentFrame->args.bracketChain = 0;
478 startNewGroup(stack.currentFrame);
480 /* This is where control jumps back to to effect "recursion" */
482 RECURSE:
483 if (!--remainingMatchCount)
484 return matchError(JSRegExpErrorHitLimit, stack);
486 /* Now start processing the operations. */
488 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
489 while (true)
490 #endif
493 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
494 #define BEGIN_OPCODE(opcode) LABEL_OP_##opcode
495 #define NEXT_OPCODE goto *opcodeJumpTable[*stack.currentFrame->args.instructionPtr]
496 #else
497 #define BEGIN_OPCODE(opcode) case OP_##opcode
498 #define NEXT_OPCODE continue
499 #endif
501 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
502 NEXT_OPCODE;
503 #else
504 switch (*stack.currentFrame->args.instructionPtr)
505 #endif
507 /* Non-capturing bracket: optimized */
509 BEGIN_OPCODE(BRA):
510 NON_CAPTURING_BRACKET:
511 DPRINTF(("start bracket 0\n"));
512 do {
513 RECURSIVE_MATCH_NEW_GROUP(2, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain);
514 if (isMatch)
515 RRETURN;
516 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1);
517 } while (*stack.currentFrame->args.instructionPtr == OP_ALT);
518 DPRINTF(("bracket 0 failed\n"));
519 RRETURN;
521 /* Skip over large extraction number data if encountered. */
523 BEGIN_OPCODE(BRANUMBER):
524 stack.currentFrame->args.instructionPtr += 3;
525 NEXT_OPCODE;
527 /* End of the pattern. */
529 BEGIN_OPCODE(END):
530 md.endMatchPtr = stack.currentFrame->args.subjectPtr; /* Record where we ended */
531 md.endOffsetTop = stack.currentFrame->args.offsetTop; /* and how many extracts were taken */
532 isMatch = true;
533 RRETURN;
535 /* Assertion brackets. Check the alternative branches in turn - the
536 matching won't pass the KET for an assertion. If any one branch matches,
537 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
538 start of each branch to move the current point backwards, so the code at
539 this level is identical to the lookahead case. */
541 BEGIN_OPCODE(ASSERT):
542 do {
543 RECURSIVE_MATCH_NEW_GROUP(6, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, NULL);
544 if (isMatch)
545 break;
546 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1);
547 } while (*stack.currentFrame->args.instructionPtr == OP_ALT);
548 if (*stack.currentFrame->args.instructionPtr == OP_KET)
549 RRETURN_NO_MATCH;
551 /* Continue from after the assertion, updating the offsets high water
552 mark, since extracts may have been taken during the assertion. */
554 advanceToEndOfBracket(stack.currentFrame->args.instructionPtr);
555 stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE;
556 stack.currentFrame->args.offsetTop = md.endOffsetTop;
557 NEXT_OPCODE;
559 /* Negative assertion: all branches must fail to match */
561 BEGIN_OPCODE(ASSERT_NOT):
562 do {
563 RECURSIVE_MATCH_NEW_GROUP(7, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, NULL);
564 if (isMatch)
565 RRETURN_NO_MATCH;
566 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1);
567 } while (*stack.currentFrame->args.instructionPtr == OP_ALT);
569 stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE;
570 NEXT_OPCODE;
572 /* An alternation is the end of a branch; scan along to find the end of the
573 bracketed group and go to there. */
575 BEGIN_OPCODE(ALT):
576 advanceToEndOfBracket(stack.currentFrame->args.instructionPtr);
577 NEXT_OPCODE;
579 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
580 that it may occur zero times. It may repeat infinitely, or not at all -
581 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
582 repeat limits are compiled as a number of copies, with the optional ones
583 preceded by BRAZERO or BRAMINZERO. */
585 BEGIN_OPCODE(BRAZERO): {
586 stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1;
587 RECURSIVE_MATCH_NEW_GROUP(14, stack.currentFrame->locals.startOfRepeatingBracket, stack.currentFrame->args.bracketChain);
588 if (isMatch)
589 RRETURN;
590 advanceToEndOfBracket(stack.currentFrame->locals.startOfRepeatingBracket);
591 stack.currentFrame->args.instructionPtr = stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE;
592 NEXT_OPCODE;
595 BEGIN_OPCODE(BRAMINZERO): {
596 stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1;
597 advanceToEndOfBracket(stack.currentFrame->locals.startOfRepeatingBracket);
598 RECURSIVE_MATCH_NEW_GROUP(15, stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain);
599 if (isMatch)
600 RRETURN;
601 stack.currentFrame->args.instructionPtr++;
602 NEXT_OPCODE;
605 /* End of a group, repeated or non-repeating. If we are at the end of
606 an assertion "group", stop matching and return 1, but record the
607 current high water mark for use by positive assertions. Do this also
608 for the "once" (not-backup up) groups. */
610 BEGIN_OPCODE(KET):
611 BEGIN_OPCODE(KETRMIN):
612 BEGIN_OPCODE(KETRMAX):
613 stack.currentFrame->locals.instructionPtrAtStartOfOnce = stack.currentFrame->args.instructionPtr - getLinkValue(stack.currentFrame->args.instructionPtr + 1);
614 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.bracketChain->bracketStart;
616 /* Back up the stack of bracket start pointers. */
618 stack.currentFrame->args.bracketChain = stack.currentFrame->args.bracketChain->previousBracket;
620 if (*stack.currentFrame->locals.instructionPtrAtStartOfOnce == OP_ASSERT || *stack.currentFrame->locals.instructionPtrAtStartOfOnce == OP_ASSERT_NOT) {
621 md.endOffsetTop = stack.currentFrame->args.offsetTop;
622 isMatch = true;
623 RRETURN;
626 /* In all other cases except a conditional group we have to check the
627 group number back at the start and if necessary complete handling an
628 extraction by setting the offsets and bumping the high water mark. */
630 stack.currentFrame->locals.number = *stack.currentFrame->locals.instructionPtrAtStartOfOnce - OP_BRA;
632 /* For extended extraction brackets (large number), we have to fish out
633 the number from a dummy opcode at the start. */
635 if (stack.currentFrame->locals.number > EXTRACT_BASIC_MAX)
636 stack.currentFrame->locals.number = get2ByteValue(stack.currentFrame->locals.instructionPtrAtStartOfOnce + 2 + LINK_SIZE);
637 stack.currentFrame->locals.offset = stack.currentFrame->locals.number << 1;
639 #ifdef DEBUG
640 printf("end bracket %d", stack.currentFrame->locals.number);
641 printf("\n");
642 #endif
644 /* Test for a numbered group. This includes groups called as a result
645 of recursion. Note that whole-pattern recursion is coded as a recurse
646 into group 0, so it won't be picked up here. Instead, we catch it when
647 the OP_END is reached. */
649 if (stack.currentFrame->locals.number > 0) {
650 if (stack.currentFrame->locals.offset >= md.offsetMax)
651 md.offsetOverflow = true;
652 else {
653 md.offsetVector[stack.currentFrame->locals.offset] =
654 md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number];
655 md.offsetVector[stack.currentFrame->locals.offset+1] = stack.currentFrame->args.subjectPtr - md.startSubject;
656 if (stack.currentFrame->args.offsetTop <= stack.currentFrame->locals.offset)
657 stack.currentFrame->args.offsetTop = stack.currentFrame->locals.offset + 2;
661 /* For a non-repeating ket, just continue at this level. This also
662 happens for a repeating ket if no characters were matched in the group.
663 This is the forcible breaking of infinite loops as implemented in Perl
664 5.005. If there is an options reset, it will get obeyed in the normal
665 course of events. */
667 if (*stack.currentFrame->args.instructionPtr == OP_KET || stack.currentFrame->args.subjectPtr == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) {
668 stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE;
669 NEXT_OPCODE;
672 /* The repeating kets try the rest of the pattern or restart from the
673 preceding bracket, in the appropriate order. */
675 if (*stack.currentFrame->args.instructionPtr == OP_KETRMIN) {
676 RECURSIVE_MATCH(16, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain);
677 if (isMatch)
678 RRETURN;
679 RECURSIVE_MATCH_NEW_GROUP(17, stack.currentFrame->locals.instructionPtrAtStartOfOnce, stack.currentFrame->args.bracketChain);
680 if (isMatch)
681 RRETURN;
682 } else { /* OP_KETRMAX */
683 RECURSIVE_MATCH_NEW_GROUP(18, stack.currentFrame->locals.instructionPtrAtStartOfOnce, stack.currentFrame->args.bracketChain);
684 if (isMatch)
685 RRETURN;
686 RECURSIVE_MATCH(19, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain);
687 if (isMatch)
688 RRETURN;
690 RRETURN;
692 /* Start of subject. */
694 BEGIN_OPCODE(CIRC):
695 if (stack.currentFrame->args.subjectPtr != md.startSubject)
696 RRETURN_NO_MATCH;
697 stack.currentFrame->args.instructionPtr++;
698 NEXT_OPCODE;
700 /* After internal newline if multiline. */
702 BEGIN_OPCODE(BOL):
703 if (stack.currentFrame->args.subjectPtr != md.startSubject && !isNewline(stack.currentFrame->args.subjectPtr[-1]))
704 RRETURN_NO_MATCH;
705 stack.currentFrame->args.instructionPtr++;
706 NEXT_OPCODE;
708 /* End of subject. */
710 BEGIN_OPCODE(DOLL):
711 if (stack.currentFrame->args.subjectPtr < md.endSubject)
712 RRETURN_NO_MATCH;
713 stack.currentFrame->args.instructionPtr++;
714 NEXT_OPCODE;
716 /* Before internal newline if multiline. */
718 BEGIN_OPCODE(EOL):
719 if (stack.currentFrame->args.subjectPtr < md.endSubject && !isNewline(*stack.currentFrame->args.subjectPtr))
720 RRETURN_NO_MATCH;
721 stack.currentFrame->args.instructionPtr++;
722 NEXT_OPCODE;
724 /* Word boundary assertions */
726 BEGIN_OPCODE(NOT_WORD_BOUNDARY):
727 BEGIN_OPCODE(WORD_BOUNDARY): {
728 bool currentCharIsWordChar = false;
729 bool previousCharIsWordChar = false;
731 if (stack.currentFrame->args.subjectPtr > md.startSubject)
732 previousCharIsWordChar = isWordChar(stack.currentFrame->args.subjectPtr[-1]);
733 if (stack.currentFrame->args.subjectPtr < md.endSubject)
734 currentCharIsWordChar = isWordChar(*stack.currentFrame->args.subjectPtr);
736 /* Now see if the situation is what we want */
737 bool wordBoundaryDesired = (*stack.currentFrame->args.instructionPtr++ == OP_WORD_BOUNDARY);
738 if (wordBoundaryDesired ? currentCharIsWordChar == previousCharIsWordChar : currentCharIsWordChar != previousCharIsWordChar)
739 RRETURN_NO_MATCH;
740 NEXT_OPCODE;
743 /* Match a single character type; inline for speed */
745 BEGIN_OPCODE(NOT_NEWLINE):
746 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
747 RRETURN_NO_MATCH;
748 if (isNewline(*stack.currentFrame->args.subjectPtr++))
749 RRETURN_NO_MATCH;
750 stack.currentFrame->args.instructionPtr++;
751 NEXT_OPCODE;
753 BEGIN_OPCODE(NOT_DIGIT):
754 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
755 RRETURN_NO_MATCH;
756 if (isASCIIDigit(*stack.currentFrame->args.subjectPtr++))
757 RRETURN_NO_MATCH;
758 stack.currentFrame->args.instructionPtr++;
759 NEXT_OPCODE;
761 BEGIN_OPCODE(DIGIT):
762 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
763 RRETURN_NO_MATCH;
764 if (!isASCIIDigit(*stack.currentFrame->args.subjectPtr++))
765 RRETURN_NO_MATCH;
766 stack.currentFrame->args.instructionPtr++;
767 NEXT_OPCODE;
769 BEGIN_OPCODE(NOT_WHITESPACE):
770 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
771 RRETURN_NO_MATCH;
772 if (isSpaceChar(*stack.currentFrame->args.subjectPtr++))
773 RRETURN_NO_MATCH;
774 stack.currentFrame->args.instructionPtr++;
775 NEXT_OPCODE;
777 BEGIN_OPCODE(WHITESPACE):
778 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
779 RRETURN_NO_MATCH;
780 if (!isSpaceChar(*stack.currentFrame->args.subjectPtr++))
781 RRETURN_NO_MATCH;
782 stack.currentFrame->args.instructionPtr++;
783 NEXT_OPCODE;
785 BEGIN_OPCODE(NOT_WORDCHAR):
786 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
787 RRETURN_NO_MATCH;
788 if (isWordChar(*stack.currentFrame->args.subjectPtr++))
789 RRETURN_NO_MATCH;
790 stack.currentFrame->args.instructionPtr++;
791 NEXT_OPCODE;
793 BEGIN_OPCODE(WORDCHAR):
794 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
795 RRETURN_NO_MATCH;
796 if (!isWordChar(*stack.currentFrame->args.subjectPtr++))
797 RRETURN_NO_MATCH;
798 stack.currentFrame->args.instructionPtr++;
799 NEXT_OPCODE;
801 /* Match a back reference, possibly repeatedly. Look past the end of the
802 item to see if there is repeat information following. The code is similar
803 to that for character classes, but repeated for efficiency. Then obey
804 similar code to character type repeats - written out again for speed.
805 However, if the referenced string is the empty string, always treat
806 it as matched, any number of times (otherwise there could be infinite
807 loops). */
809 BEGIN_OPCODE(REF):
810 stack.currentFrame->locals.offset = get2ByteValue(stack.currentFrame->args.instructionPtr + 1) << 1; /* Doubled ref number */
811 stack.currentFrame->args.instructionPtr += 3; /* Advance past item */
813 /* If the reference is unset, set the length to be longer than the amount
814 of subject left; this ensures that every attempt at a match fails. We
815 can't just fail here, because of the possibility of quantifiers with zero
816 minima. */
818 if (stack.currentFrame->locals.offset >= stack.currentFrame->args.offsetTop || md.offsetVector[stack.currentFrame->locals.offset] < 0)
819 stack.currentFrame->locals.length = 0;
820 else
821 stack.currentFrame->locals.length = md.offsetVector[stack.currentFrame->locals.offset+1] - md.offsetVector[stack.currentFrame->locals.offset];
823 /* Set up for repetition, or handle the non-repeated case */
825 switch (*stack.currentFrame->args.instructionPtr) {
826 case OP_CRSTAR:
827 case OP_CRMINSTAR:
828 case OP_CRPLUS:
829 case OP_CRMINPLUS:
830 case OP_CRQUERY:
831 case OP_CRMINQUERY:
832 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max);
833 break;
835 case OP_CRRANGE:
836 case OP_CRMINRANGE:
837 minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE);
838 min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
839 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3);
840 if (stack.currentFrame->locals.max == 0)
841 stack.currentFrame->locals.max = INT_MAX;
842 stack.currentFrame->args.instructionPtr += 5;
843 break;
845 default: /* No repeat follows */
846 if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md))
847 RRETURN_NO_MATCH;
848 stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length;
849 NEXT_OPCODE;
852 /* If the length of the reference is zero, just continue with the
853 main loop. */
855 if (stack.currentFrame->locals.length == 0)
856 NEXT_OPCODE;
858 /* First, ensure the minimum number of matches are present. */
860 for (int i = 1; i <= min; i++) {
861 if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md))
862 RRETURN_NO_MATCH;
863 stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length;
866 /* If min = max, continue at the same level without recursion.
867 They are not both allowed to be zero. */
869 if (min == stack.currentFrame->locals.max)
870 NEXT_OPCODE;
872 /* If minimizing, keep trying and advancing the pointer */
874 if (minimize) {
875 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
876 RECURSIVE_MATCH(20, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
877 if (isMatch)
878 RRETURN;
879 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || !matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md))
880 RRETURN;
881 stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length;
883 /* Control never reaches here */
886 /* If maximizing, find the longest string and work backwards */
888 else {
889 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
890 for (int i = min; i < stack.currentFrame->locals.max; i++) {
891 if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md))
892 break;
893 stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length;
895 while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) {
896 RECURSIVE_MATCH(21, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
897 if (isMatch)
898 RRETURN;
899 stack.currentFrame->args.subjectPtr -= stack.currentFrame->locals.length;
901 RRETURN_NO_MATCH;
903 /* Control never reaches here */
905 /* Match a bit-mapped character class, possibly repeatedly. This op code is
906 used when all the characters in the class have values in the range 0-255,
907 and either the matching is caseful, or the characters are in the range
908 0-127 when UTF-8 processing is enabled. The only difference between
909 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
910 encountered.
912 First, look past the end of the item to see if there is repeat information
913 following. Then obey similar code to character type repeats - written out
914 again for speed. */
916 BEGIN_OPCODE(NCLASS):
917 BEGIN_OPCODE(CLASS):
918 stack.currentFrame->locals.data = stack.currentFrame->args.instructionPtr + 1; /* Save for matching */
919 stack.currentFrame->args.instructionPtr += 33; /* Advance past the item */
921 switch (*stack.currentFrame->args.instructionPtr) {
922 case OP_CRSTAR:
923 case OP_CRMINSTAR:
924 case OP_CRPLUS:
925 case OP_CRMINPLUS:
926 case OP_CRQUERY:
927 case OP_CRMINQUERY:
928 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max);
929 break;
931 case OP_CRRANGE:
932 case OP_CRMINRANGE:
933 minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE);
934 min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
935 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3);
936 if (stack.currentFrame->locals.max == 0)
937 stack.currentFrame->locals.max = INT_MAX;
938 stack.currentFrame->args.instructionPtr += 5;
939 break;
941 default: /* No repeat follows */
942 min = stack.currentFrame->locals.max = 1;
943 break;
946 /* First, ensure the minimum number of matches are present. */
948 for (int i = 1; i <= min; i++) {
949 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
950 RRETURN_NO_MATCH;
951 int c = *stack.currentFrame->args.subjectPtr++;
952 if (c > 255) {
953 if (stack.currentFrame->locals.data[-1] == OP_CLASS)
954 RRETURN_NO_MATCH;
955 } else {
956 if (!(stack.currentFrame->locals.data[c / 8] & (1 << (c & 7))))
957 RRETURN_NO_MATCH;
961 /* If max == min we can continue with the main loop without the
962 need to recurse. */
964 if (min == stack.currentFrame->locals.max)
965 NEXT_OPCODE;
967 /* If minimizing, keep testing the rest of the expression and advancing
968 the pointer while it matches the class. */
969 if (minimize) {
970 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
971 RECURSIVE_MATCH(22, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
972 if (isMatch)
973 RRETURN;
974 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject)
975 RRETURN;
976 int c = *stack.currentFrame->args.subjectPtr++;
977 if (c > 255) {
978 if (stack.currentFrame->locals.data[-1] == OP_CLASS)
979 RRETURN;
980 } else {
981 if ((stack.currentFrame->locals.data[c/8] & (1 << (c&7))) == 0)
982 RRETURN;
985 /* Control never reaches here */
987 /* If maximizing, find the longest possible run, then work backwards. */
988 else {
989 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
991 for (int i = min; i < stack.currentFrame->locals.max; i++) {
992 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
993 break;
994 int c = *stack.currentFrame->args.subjectPtr;
995 if (c > 255) {
996 if (stack.currentFrame->locals.data[-1] == OP_CLASS)
997 break;
998 } else {
999 if (!(stack.currentFrame->locals.data[c / 8] & (1 << (c & 7))))
1000 break;
1002 ++stack.currentFrame->args.subjectPtr;
1004 for (;;) {
1005 RECURSIVE_MATCH(24, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1006 if (isMatch)
1007 RRETURN;
1008 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
1009 break; /* Stop if tried at original pos */
1012 RRETURN;
1014 /* Control never reaches here */
1016 /* Match an extended character class. */
1018 BEGIN_OPCODE(XCLASS):
1019 stack.currentFrame->locals.data = stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE; /* Save for matching */
1020 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1); /* Advance past the item */
1022 switch (*stack.currentFrame->args.instructionPtr) {
1023 case OP_CRSTAR:
1024 case OP_CRMINSTAR:
1025 case OP_CRPLUS:
1026 case OP_CRMINPLUS:
1027 case OP_CRQUERY:
1028 case OP_CRMINQUERY:
1029 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max);
1030 break;
1032 case OP_CRRANGE:
1033 case OP_CRMINRANGE:
1034 minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE);
1035 min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1036 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3);
1037 if (stack.currentFrame->locals.max == 0)
1038 stack.currentFrame->locals.max = INT_MAX;
1039 stack.currentFrame->args.instructionPtr += 5;
1040 break;
1042 default: /* No repeat follows */
1043 min = stack.currentFrame->locals.max = 1;
1046 /* First, ensure the minimum number of matches are present. */
1048 for (int i = 1; i <= min; i++) {
1049 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1050 RRETURN_NO_MATCH;
1051 int c = *stack.currentFrame->args.subjectPtr++;
1052 if (!jsc_pcre_xclass(c, stack.currentFrame->locals.data))
1053 RRETURN_NO_MATCH;
1056 /* If max == min we can continue with the main loop without the
1057 need to recurse. */
1059 if (min == stack.currentFrame->locals.max)
1060 NEXT_OPCODE;
1062 /* If minimizing, keep testing the rest of the expression and advancing
1063 the pointer while it matches the class. */
1065 if (minimize) {
1066 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
1067 RECURSIVE_MATCH(26, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1068 if (isMatch)
1069 RRETURN;
1070 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject)
1071 RRETURN;
1072 int c = *stack.currentFrame->args.subjectPtr++;
1073 if (!jsc_pcre_xclass(c, stack.currentFrame->locals.data))
1074 RRETURN;
1076 /* Control never reaches here */
1079 /* If maximizing, find the longest possible run, then work backwards. */
1081 else {
1082 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
1083 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1084 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1085 break;
1086 int c = *stack.currentFrame->args.subjectPtr;
1087 if (!jsc_pcre_xclass(c, stack.currentFrame->locals.data))
1088 break;
1089 ++stack.currentFrame->args.subjectPtr;
1091 for(;;) {
1092 RECURSIVE_MATCH(27, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1093 if (isMatch)
1094 RRETURN;
1095 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
1096 break; /* Stop if tried at original pos */
1098 RRETURN;
1101 /* Control never reaches here */
1103 /* Match a single character, casefully */
1105 BEGIN_OPCODE(CHAR):
1106 stack.currentFrame->locals.length = 1;
1107 stack.currentFrame->args.instructionPtr++;
1108 getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length);
1109 stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length;
1110 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1111 RRETURN_NO_MATCH;
1112 if (stack.currentFrame->locals.fc != *stack.currentFrame->args.subjectPtr++)
1113 RRETURN_NO_MATCH;
1114 NEXT_OPCODE;
1116 /* Match a single character, caselessly */
1118 BEGIN_OPCODE(CHAR_IGNORING_CASE): {
1119 stack.currentFrame->locals.length = 1;
1120 stack.currentFrame->args.instructionPtr++;
1121 getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length);
1122 stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length;
1123 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1124 RRETURN_NO_MATCH;
1125 int dc = *stack.currentFrame->args.subjectPtr++;
1126 if (stack.currentFrame->locals.fc != dc && jsc_pcre_ucp_othercase(stack.currentFrame->locals.fc) != dc)
1127 RRETURN_NO_MATCH;
1128 NEXT_OPCODE;
1131 /* Match a single ASCII character. */
1133 BEGIN_OPCODE(ASCII_CHAR):
1134 if (md.endSubject == stack.currentFrame->args.subjectPtr)
1135 RRETURN_NO_MATCH;
1136 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->args.instructionPtr[1])
1137 RRETURN_NO_MATCH;
1138 ++stack.currentFrame->args.subjectPtr;
1139 stack.currentFrame->args.instructionPtr += 2;
1140 NEXT_OPCODE;
1142 /* Match one of two cases of an ASCII letter. */
1144 BEGIN_OPCODE(ASCII_LETTER_IGNORING_CASE):
1145 if (md.endSubject == stack.currentFrame->args.subjectPtr)
1146 RRETURN_NO_MATCH;
1147 if ((*stack.currentFrame->args.subjectPtr | 0x20) != stack.currentFrame->args.instructionPtr[1])
1148 RRETURN_NO_MATCH;
1149 ++stack.currentFrame->args.subjectPtr;
1150 stack.currentFrame->args.instructionPtr += 2;
1151 NEXT_OPCODE;
1153 /* Match a single character repeatedly; different opcodes share code. */
1155 BEGIN_OPCODE(EXACT):
1156 min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1157 minimize = false;
1158 stack.currentFrame->args.instructionPtr += 3;
1159 goto REPEATCHAR;
1161 BEGIN_OPCODE(UPTO):
1162 BEGIN_OPCODE(MINUPTO):
1163 min = 0;
1164 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1165 minimize = *stack.currentFrame->args.instructionPtr == OP_MINUPTO;
1166 stack.currentFrame->args.instructionPtr += 3;
1167 goto REPEATCHAR;
1169 BEGIN_OPCODE(STAR):
1170 BEGIN_OPCODE(MINSTAR):
1171 BEGIN_OPCODE(PLUS):
1172 BEGIN_OPCODE(MINPLUS):
1173 BEGIN_OPCODE(QUERY):
1174 BEGIN_OPCODE(MINQUERY):
1175 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_STAR, minimize, min, stack.currentFrame->locals.max);
1177 /* Common code for all repeated single-character matches. We can give
1178 up quickly if there are fewer than the minimum number of characters left in
1179 the subject. */
1181 REPEATCHAR:
1183 stack.currentFrame->locals.length = 1;
1184 getUTF8CharAndIncrementLength(stack.currentFrame->locals.fc, stack.currentFrame->args.instructionPtr, stack.currentFrame->locals.length);
1185 if (min * (stack.currentFrame->locals.fc > 0xFFFF ? 2 : 1) > md.endSubject - stack.currentFrame->args.subjectPtr)
1186 RRETURN_NO_MATCH;
1187 stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length;
1189 if (stack.currentFrame->locals.fc <= 0xFFFF) {
1190 othercase = md.ignoreCase ? jsc_pcre_ucp_othercase(stack.currentFrame->locals.fc) : -1;
1192 for (int i = 1; i <= min; i++) {
1193 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != othercase)
1194 RRETURN_NO_MATCH;
1195 ++stack.currentFrame->args.subjectPtr;
1198 if (min == stack.currentFrame->locals.max)
1199 NEXT_OPCODE;
1201 if (minimize) {
1202 stack.currentFrame->locals.repeatOthercase = othercase;
1203 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
1204 RECURSIVE_MATCH(28, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1205 if (isMatch)
1206 RRETURN;
1207 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject)
1208 RRETURN;
1209 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.repeatOthercase)
1210 RRETURN;
1211 ++stack.currentFrame->args.subjectPtr;
1213 /* Control never reaches here */
1214 } else {
1215 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
1216 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1217 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1218 break;
1219 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != othercase)
1220 break;
1221 ++stack.currentFrame->args.subjectPtr;
1223 while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) {
1224 RECURSIVE_MATCH(29, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1225 if (isMatch)
1226 RRETURN;
1227 --stack.currentFrame->args.subjectPtr;
1229 RRETURN_NO_MATCH;
1231 /* Control never reaches here */
1232 } else {
1233 /* No case on surrogate pairs, so no need to bother with "othercase". */
1235 for (int i = 1; i <= min; i++) {
1236 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc)
1237 RRETURN_NO_MATCH;
1238 stack.currentFrame->args.subjectPtr += 2;
1241 if (min == stack.currentFrame->locals.max)
1242 NEXT_OPCODE;
1244 if (minimize) {
1245 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
1246 RECURSIVE_MATCH(30, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1247 if (isMatch)
1248 RRETURN;
1249 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject)
1250 RRETURN;
1251 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc)
1252 RRETURN;
1253 stack.currentFrame->args.subjectPtr += 2;
1255 /* Control never reaches here */
1256 } else {
1257 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
1258 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1259 if (stack.currentFrame->args.subjectPtr > md.endSubject - 2)
1260 break;
1261 if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc)
1262 break;
1263 stack.currentFrame->args.subjectPtr += 2;
1265 while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) {
1266 RECURSIVE_MATCH(31, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1267 if (isMatch)
1268 RRETURN;
1269 stack.currentFrame->args.subjectPtr -= 2;
1271 RRETURN_NO_MATCH;
1273 /* Control never reaches here */
1275 /* Control never reaches here */
1277 /* Match a negated single one-byte character. */
1279 BEGIN_OPCODE(NOT): {
1280 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1281 RRETURN_NO_MATCH;
1282 int b = stack.currentFrame->args.instructionPtr[1];
1283 int c = *stack.currentFrame->args.subjectPtr++;
1284 stack.currentFrame->args.instructionPtr += 2;
1285 if (md.ignoreCase) {
1286 if (c < 128)
1287 c = toLowerCase(c);
1288 if (toLowerCase(b) == c)
1289 RRETURN_NO_MATCH;
1290 } else {
1291 if (b == c)
1292 RRETURN_NO_MATCH;
1294 NEXT_OPCODE;
1297 /* Match a negated single one-byte character repeatedly. This is almost a
1298 repeat of the code for a repeated single character, but I haven't found a
1299 nice way of commoning these up that doesn't require a test of the
1300 positive/negative option for each character match. Maybe that wouldn't add
1301 very much to the time taken, but character matching *is* what this is all
1302 about... */
1304 BEGIN_OPCODE(NOTEXACT):
1305 min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1306 minimize = false;
1307 stack.currentFrame->args.instructionPtr += 3;
1308 goto REPEATNOTCHAR;
1310 BEGIN_OPCODE(NOTUPTO):
1311 BEGIN_OPCODE(NOTMINUPTO):
1312 min = 0;
1313 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1314 minimize = *stack.currentFrame->args.instructionPtr == OP_NOTMINUPTO;
1315 stack.currentFrame->args.instructionPtr += 3;
1316 goto REPEATNOTCHAR;
1318 BEGIN_OPCODE(NOTSTAR):
1319 BEGIN_OPCODE(NOTMINSTAR):
1320 BEGIN_OPCODE(NOTPLUS):
1321 BEGIN_OPCODE(NOTMINPLUS):
1322 BEGIN_OPCODE(NOTQUERY):
1323 BEGIN_OPCODE(NOTMINQUERY):
1324 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_NOTSTAR, minimize, min, stack.currentFrame->locals.max);
1326 /* Common code for all repeated single-byte matches. We can give up quickly
1327 if there are fewer than the minimum number of bytes left in the
1328 subject. */
1330 REPEATNOTCHAR:
1331 if (min > md.endSubject - stack.currentFrame->args.subjectPtr)
1332 RRETURN_NO_MATCH;
1333 stack.currentFrame->locals.fc = *stack.currentFrame->args.instructionPtr++;
1335 /* The code is duplicated for the caseless and caseful cases, for speed,
1336 since matching characters is likely to be quite common. First, ensure the
1337 minimum number of matches are present. If min = max, continue at the same
1338 level without recursing. Otherwise, if minimizing, keep trying the rest of
1339 the expression and advancing one matching character if failing, up to the
1340 maximum. Alternatively, if maximizing, find the maximum number of
1341 characters and work backwards. */
1343 DPRINTF(("negative matching %c{%d,%d}\n", stack.currentFrame->locals.fc, min, stack.currentFrame->locals.max));
1345 if (md.ignoreCase) {
1346 if (stack.currentFrame->locals.fc < 128)
1347 stack.currentFrame->locals.fc = toLowerCase(stack.currentFrame->locals.fc);
1349 for (int i = 1; i <= min; i++) {
1350 int d = *stack.currentFrame->args.subjectPtr++;
1351 if (d < 128)
1352 d = toLowerCase(d);
1353 if (stack.currentFrame->locals.fc == d)
1354 RRETURN_NO_MATCH;
1357 if (min == stack.currentFrame->locals.max)
1358 NEXT_OPCODE;
1360 if (minimize) {
1361 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
1362 RECURSIVE_MATCH(38, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1363 if (isMatch)
1364 RRETURN;
1365 int d = *stack.currentFrame->args.subjectPtr++;
1366 if (d < 128)
1367 d = toLowerCase(d);
1368 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject || stack.currentFrame->locals.fc == d)
1369 RRETURN;
1371 /* Control never reaches here */
1374 /* Maximize case */
1376 else {
1377 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
1379 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1380 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1381 break;
1382 int d = *stack.currentFrame->args.subjectPtr;
1383 if (d < 128)
1384 d = toLowerCase(d);
1385 if (stack.currentFrame->locals.fc == d)
1386 break;
1387 ++stack.currentFrame->args.subjectPtr;
1389 for (;;) {
1390 RECURSIVE_MATCH(40, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1391 if (isMatch)
1392 RRETURN;
1393 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
1394 break; /* Stop if tried at original pos */
1397 RRETURN;
1399 /* Control never reaches here */
1402 /* Caseful comparisons */
1404 else {
1405 for (int i = 1; i <= min; i++) {
1406 int d = *stack.currentFrame->args.subjectPtr++;
1407 if (stack.currentFrame->locals.fc == d)
1408 RRETURN_NO_MATCH;
1411 if (min == stack.currentFrame->locals.max)
1412 NEXT_OPCODE;
1414 if (minimize) {
1415 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
1416 RECURSIVE_MATCH(42, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1417 if (isMatch)
1418 RRETURN;
1419 int d = *stack.currentFrame->args.subjectPtr++;
1420 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject || stack.currentFrame->locals.fc == d)
1421 RRETURN;
1423 /* Control never reaches here */
1426 /* Maximize case */
1428 else {
1429 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
1431 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1432 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1433 break;
1434 int d = *stack.currentFrame->args.subjectPtr;
1435 if (stack.currentFrame->locals.fc == d)
1436 break;
1437 ++stack.currentFrame->args.subjectPtr;
1439 for (;;) {
1440 RECURSIVE_MATCH(44, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1441 if (isMatch)
1442 RRETURN;
1443 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
1444 break; /* Stop if tried at original pos */
1447 RRETURN;
1450 /* Control never reaches here */
1452 /* Match a single character type repeatedly; several different opcodes
1453 share code. This is very similar to the code for single characters, but we
1454 repeat it in the interests of efficiency. */
1456 BEGIN_OPCODE(TYPEEXACT):
1457 min = stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1458 minimize = true;
1459 stack.currentFrame->args.instructionPtr += 3;
1460 goto REPEATTYPE;
1462 BEGIN_OPCODE(TYPEUPTO):
1463 BEGIN_OPCODE(TYPEMINUPTO):
1464 min = 0;
1465 stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
1466 minimize = *stack.currentFrame->args.instructionPtr == OP_TYPEMINUPTO;
1467 stack.currentFrame->args.instructionPtr += 3;
1468 goto REPEATTYPE;
1470 BEGIN_OPCODE(TYPESTAR):
1471 BEGIN_OPCODE(TYPEMINSTAR):
1472 BEGIN_OPCODE(TYPEPLUS):
1473 BEGIN_OPCODE(TYPEMINPLUS):
1474 BEGIN_OPCODE(TYPEQUERY):
1475 BEGIN_OPCODE(TYPEMINQUERY):
1476 repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_TYPESTAR, minimize, min, stack.currentFrame->locals.max);
1478 /* Common code for all repeated single character type matches. Note that
1479 in UTF-8 mode, '.' matches a character of any length, but for the other
1480 character types, the valid characters are all one-byte long. */
1482 REPEATTYPE:
1483 stack.currentFrame->locals.ctype = *stack.currentFrame->args.instructionPtr++; /* Code for the character type */
1485 /* First, ensure the minimum number of matches are present. Use inline
1486 code for maximizing the speed, and do the type test once at the start
1487 (i.e. keep it out of the loop). Also we can test that there are at least
1488 the minimum number of characters before we start. */
1490 if (min > md.endSubject - stack.currentFrame->args.subjectPtr)
1491 RRETURN_NO_MATCH;
1492 if (min > 0) {
1493 switch (stack.currentFrame->locals.ctype) {
1494 case OP_NOT_NEWLINE:
1495 for (int i = 1; i <= min; i++) {
1496 if (isNewline(*stack.currentFrame->args.subjectPtr))
1497 RRETURN_NO_MATCH;
1498 ++stack.currentFrame->args.subjectPtr;
1500 break;
1502 case OP_NOT_DIGIT:
1503 for (int i = 1; i <= min; i++) {
1504 if (isASCIIDigit(*stack.currentFrame->args.subjectPtr))
1505 RRETURN_NO_MATCH;
1506 ++stack.currentFrame->args.subjectPtr;
1508 break;
1510 case OP_DIGIT:
1511 for (int i = 1; i <= min; i++) {
1512 if (!isASCIIDigit(*stack.currentFrame->args.subjectPtr))
1513 RRETURN_NO_MATCH;
1514 ++stack.currentFrame->args.subjectPtr;
1516 break;
1518 case OP_NOT_WHITESPACE:
1519 for (int i = 1; i <= min; i++) {
1520 if (isSpaceChar(*stack.currentFrame->args.subjectPtr))
1521 RRETURN_NO_MATCH;
1522 ++stack.currentFrame->args.subjectPtr;
1524 break;
1526 case OP_WHITESPACE:
1527 for (int i = 1; i <= min; i++) {
1528 if (!isSpaceChar(*stack.currentFrame->args.subjectPtr))
1529 RRETURN_NO_MATCH;
1530 ++stack.currentFrame->args.subjectPtr;
1532 break;
1534 case OP_NOT_WORDCHAR:
1535 for (int i = 1; i <= min; i++) {
1536 if (isWordChar(*stack.currentFrame->args.subjectPtr))
1537 RRETURN_NO_MATCH;
1538 ++stack.currentFrame->args.subjectPtr;
1540 break;
1542 case OP_WORDCHAR:
1543 for (int i = 1; i <= min; i++) {
1544 if (!isWordChar(*stack.currentFrame->args.subjectPtr))
1545 RRETURN_NO_MATCH;
1546 ++stack.currentFrame->args.subjectPtr;
1548 break;
1550 default:
1551 ASSERT_NOT_REACHED();
1552 return matchError(JSRegExpErrorInternal, stack);
1553 } /* End switch(stack.currentFrame->locals.ctype) */
1556 /* If min = max, continue at the same level without recursing */
1558 if (min == stack.currentFrame->locals.max)
1559 NEXT_OPCODE;
1561 /* If minimizing, we have to test the rest of the pattern before each
1562 subsequent match. */
1564 if (minimize) {
1565 for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
1566 RECURSIVE_MATCH(48, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1567 if (isMatch)
1568 RRETURN;
1569 if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject)
1570 RRETURN;
1572 int c = *stack.currentFrame->args.subjectPtr++;
1573 switch (stack.currentFrame->locals.ctype) {
1574 case OP_NOT_NEWLINE:
1575 if (isNewline(c))
1576 RRETURN;
1577 break;
1579 case OP_NOT_DIGIT:
1580 if (isASCIIDigit(c))
1581 RRETURN;
1582 break;
1584 case OP_DIGIT:
1585 if (!isASCIIDigit(c))
1586 RRETURN;
1587 break;
1589 case OP_NOT_WHITESPACE:
1590 if (isSpaceChar(c))
1591 RRETURN;
1592 break;
1594 case OP_WHITESPACE:
1595 if (!isSpaceChar(c))
1596 RRETURN;
1597 break;
1599 case OP_NOT_WORDCHAR:
1600 if (isWordChar(c))
1601 RRETURN;
1602 break;
1604 case OP_WORDCHAR:
1605 if (!isWordChar(c))
1606 RRETURN;
1607 break;
1609 default:
1610 ASSERT_NOT_REACHED();
1611 return matchError(JSRegExpErrorInternal, stack);
1614 /* Control never reaches here */
1617 /* If maximizing it is worth using inline code for speed, doing the type
1618 test once at the start (i.e. keep it out of the loop). */
1620 else {
1621 stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; /* Remember where we started */
1623 switch (stack.currentFrame->locals.ctype) {
1624 case OP_NOT_NEWLINE:
1625 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1626 if (stack.currentFrame->args.subjectPtr >= md.endSubject || isNewline(*stack.currentFrame->args.subjectPtr))
1627 break;
1628 stack.currentFrame->args.subjectPtr++;
1630 break;
1632 case OP_NOT_DIGIT:
1633 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1634 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1635 break;
1636 int c = *stack.currentFrame->args.subjectPtr;
1637 if (isASCIIDigit(c))
1638 break;
1639 ++stack.currentFrame->args.subjectPtr;
1641 break;
1643 case OP_DIGIT:
1644 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1645 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1646 break;
1647 int c = *stack.currentFrame->args.subjectPtr;
1648 if (!isASCIIDigit(c))
1649 break;
1650 ++stack.currentFrame->args.subjectPtr;
1652 break;
1654 case OP_NOT_WHITESPACE:
1655 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1656 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1657 break;
1658 int c = *stack.currentFrame->args.subjectPtr;
1659 if (isSpaceChar(c))
1660 break;
1661 ++stack.currentFrame->args.subjectPtr;
1663 break;
1665 case OP_WHITESPACE:
1666 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1667 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1668 break;
1669 int c = *stack.currentFrame->args.subjectPtr;
1670 if (!isSpaceChar(c))
1671 break;
1672 ++stack.currentFrame->args.subjectPtr;
1674 break;
1676 case OP_NOT_WORDCHAR:
1677 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1678 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1679 break;
1680 int c = *stack.currentFrame->args.subjectPtr;
1681 if (isWordChar(c))
1682 break;
1683 ++stack.currentFrame->args.subjectPtr;
1685 break;
1687 case OP_WORDCHAR:
1688 for (int i = min; i < stack.currentFrame->locals.max; i++) {
1689 if (stack.currentFrame->args.subjectPtr >= md.endSubject)
1690 break;
1691 int c = *stack.currentFrame->args.subjectPtr;
1692 if (!isWordChar(c))
1693 break;
1694 ++stack.currentFrame->args.subjectPtr;
1696 break;
1698 default:
1699 ASSERT_NOT_REACHED();
1700 return matchError(JSRegExpErrorInternal, stack);
1703 /* stack.currentFrame->args.subjectPtr is now past the end of the maximum run */
1705 for (;;) {
1706 RECURSIVE_MATCH(52, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
1707 if (isMatch)
1708 RRETURN;
1709 if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction)
1710 break; /* Stop if tried at original pos */
1713 /* Get here if we can't make it match with any permitted repetitions */
1715 RRETURN;
1717 /* Control never reaches here */
1719 BEGIN_OPCODE(CRMINPLUS):
1720 BEGIN_OPCODE(CRMINQUERY):
1721 BEGIN_OPCODE(CRMINRANGE):
1722 BEGIN_OPCODE(CRMINSTAR):
1723 BEGIN_OPCODE(CRPLUS):
1724 BEGIN_OPCODE(CRQUERY):
1725 BEGIN_OPCODE(CRRANGE):
1726 BEGIN_OPCODE(CRSTAR):
1727 ASSERT_NOT_REACHED();
1728 return matchError(JSRegExpErrorInternal, stack);
1730 #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
1731 CAPTURING_BRACKET:
1732 #else
1733 default:
1734 #endif
1735 /* Opening capturing bracket. If there is space in the offset vector, save
1736 the current subject position in the working slot at the top of the vector. We
1737 mustn't change the current values of the data slot, because they may be set
1738 from a previous iteration of this group, and be referred to by a reference
1739 inside the group.
1741 If the bracket fails to match, we need to restore this value and also the
1742 values of the final offsets, in case they were set by a previous iteration of
1743 the same bracket.
1745 If there isn't enough space in the offset vector, treat this as if it were a
1746 non-capturing bracket. Don't worry about setting the flag for the error case
1747 here; that is handled in the code for KET. */
1749 ASSERT(*stack.currentFrame->args.instructionPtr > OP_BRA);
1751 stack.currentFrame->locals.number = *stack.currentFrame->args.instructionPtr - OP_BRA;
1753 /* For extended extraction brackets (large number), we have to fish out the
1754 number from a dummy opcode at the start. */
1756 if (stack.currentFrame->locals.number > EXTRACT_BASIC_MAX)
1757 stack.currentFrame->locals.number = get2ByteValue(stack.currentFrame->args.instructionPtr + 2 + LINK_SIZE);
1758 stack.currentFrame->locals.offset = stack.currentFrame->locals.number << 1;
1760 #ifdef DEBUG
1761 printf("start bracket %d subject=", stack.currentFrame->locals.number);
1762 pchars(stack.currentFrame->args.subjectPtr, 16, true, md);
1763 printf("\n");
1764 #endif
1766 if (stack.currentFrame->locals.offset < md.offsetMax) {
1767 stack.currentFrame->locals.saveOffset1 = md.offsetVector[stack.currentFrame->locals.offset];
1768 stack.currentFrame->locals.saveOffset2 = md.offsetVector[stack.currentFrame->locals.offset + 1];
1769 stack.currentFrame->locals.saveOffset3 = md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number];
1771 DPRINTF(("saving %d %d %d\n", stack.currentFrame->locals.saveOffset1, stack.currentFrame->locals.saveOffset2, stack.currentFrame->locals.saveOffset3));
1772 md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number] = stack.currentFrame->args.subjectPtr - md.startSubject;
1774 do {
1775 RECURSIVE_MATCH_NEW_GROUP(1, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain);
1776 if (isMatch)
1777 RRETURN;
1778 stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1);
1779 } while (*stack.currentFrame->args.instructionPtr == OP_ALT);
1781 DPRINTF(("bracket %d failed\n", stack.currentFrame->locals.number));
1783 md.offsetVector[stack.currentFrame->locals.offset] = stack.currentFrame->locals.saveOffset1;
1784 md.offsetVector[stack.currentFrame->locals.offset + 1] = stack.currentFrame->locals.saveOffset2;
1785 md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number] = stack.currentFrame->locals.saveOffset3;
1787 RRETURN;
1790 /* Insufficient room for saving captured contents */
1792 goto NON_CAPTURING_BRACKET;
1795 /* Do not stick any code in here without much thought; it is assumed
1796 that "continue" in the code above comes out to here to repeat the main
1797 loop. */
1799 } /* End of main loop */
1801 ASSERT_NOT_REACHED();
1803 #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
1805 RRETURN_SWITCH:
1806 switch (stack.currentFrame->returnLocation) {
1807 case 0: goto RETURN;
1808 case 1: goto RRETURN_1;
1809 case 2: goto RRETURN_2;
1810 case 6: goto RRETURN_6;
1811 case 7: goto RRETURN_7;
1812 case 14: goto RRETURN_14;
1813 case 15: goto RRETURN_15;
1814 case 16: goto RRETURN_16;
1815 case 17: goto RRETURN_17;
1816 case 18: goto RRETURN_18;
1817 case 19: goto RRETURN_19;
1818 case 20: goto RRETURN_20;
1819 case 21: goto RRETURN_21;
1820 case 22: goto RRETURN_22;
1821 case 24: goto RRETURN_24;
1822 case 26: goto RRETURN_26;
1823 case 27: goto RRETURN_27;
1824 case 28: goto RRETURN_28;
1825 case 29: goto RRETURN_29;
1826 case 30: goto RRETURN_30;
1827 case 31: goto RRETURN_31;
1828 case 38: goto RRETURN_38;
1829 case 40: goto RRETURN_40;
1830 case 42: goto RRETURN_42;
1831 case 44: goto RRETURN_44;
1832 case 48: goto RRETURN_48;
1833 case 52: goto RRETURN_52;
1836 ASSERT_NOT_REACHED();
1837 return matchError(JSRegExpErrorInternal, stack);
1839 #endif
1841 RETURN:
1842 return isMatch;
1846 /*************************************************
1847 * Execute a Regular Expression *
1848 *************************************************/
1850 /* This function applies a compiled re to a subject string and picks out
1851 portions of the string if it matches. Two elements in the vector are set for
1852 each substring: the offsets to the start and end of the substring.
1854 Arguments:
1855 re points to the compiled expression
1856 extra_data points to extra data or is NULL
1857 subject points to the subject string
1858 length length of subject string (may contain binary zeros)
1859 start_offset where to start in the subject string
1860 options option bits
1861 offsets points to a vector of ints to be filled in with offsets
1862 offsetCount the number of elements in the vector
1864 Returns: > 0 => success; value is the number of elements filled in
1865 = 0 => success, but offsets is not big enough
1866 -1 => failed to match
1867 < -1 => some kind of unexpected problem
1870 static void tryFirstByteOptimization(const UChar*& subjectPtr, const UChar* endSubject, int firstByte, bool firstByteIsCaseless, bool useMultiLineFirstCharOptimization, const UChar* originalSubjectStart)
1872 // If firstByte is set, try scanning to the first instance of that byte
1873 // no need to try and match against any earlier part of the subject string.
1874 if (firstByte >= 0) {
1875 UChar firstChar = firstByte;
1876 if (firstByteIsCaseless)
1877 while (subjectPtr < endSubject) {
1878 int c = *subjectPtr;
1879 if (c > 127)
1880 break;
1881 if (toLowerCase(c) == firstChar)
1882 break;
1883 subjectPtr++;
1885 else {
1886 while (subjectPtr < endSubject && *subjectPtr != firstChar)
1887 subjectPtr++;
1889 } else if (useMultiLineFirstCharOptimization) {
1890 /* Or to just after \n for a multiline match if possible */
1891 // I'm not sure why this != originalSubjectStart check is necessary -- ecs 11/18/07
1892 if (subjectPtr > originalSubjectStart) {
1893 while (subjectPtr < endSubject && !isNewline(subjectPtr[-1]))
1894 subjectPtr++;
1899 static bool tryRequiredByteOptimization(const UChar*& subjectPtr, const UChar* endSubject, int reqByte, int reqByte2, bool reqByteIsCaseless, bool hasFirstByte, const UChar*& reqBytePtr)
1901 /* If reqByte is set, we know that that character must appear in the subject
1902 for the match to succeed. If the first character is set, reqByte must be
1903 later in the subject; otherwise the test starts at the match point. This
1904 optimization can save a huge amount of backtracking in patterns with nested
1905 unlimited repeats that aren't going to match. Writing separate code for
1906 cased/caseless versions makes it go faster, as does using an autoincrement
1907 and backing off on a match.
1909 HOWEVER: when the subject string is very, very long, searching to its end can
1910 take a long time, and give bad performance on quite ordinary patterns. This
1911 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
1912 don't do this when the string is sufficiently long.
1915 if (reqByte >= 0 && endSubject - subjectPtr < REQ_BYTE_MAX) {
1916 const UChar* p = subjectPtr + (hasFirstByte ? 1 : 0);
1918 /* We don't need to repeat the search if we haven't yet reached the
1919 place we found it at last time. */
1921 if (p > reqBytePtr) {
1922 if (reqByteIsCaseless) {
1923 while (p < endSubject) {
1924 int pp = *p++;
1925 if (pp == reqByte || pp == reqByte2) {
1926 p--;
1927 break;
1930 } else {
1931 while (p < endSubject) {
1932 if (*p++ == reqByte) {
1933 p--;
1934 break;
1939 /* If we can't find the required character, break the matching loop */
1941 if (p >= endSubject)
1942 return true;
1944 /* If we have found the required character, save the point where we
1945 found it, so that we don't search again next time round the loop if
1946 the start hasn't passed this character yet. */
1948 reqBytePtr = p;
1951 return false;
1954 int jsRegExpExecute(const JSRegExp* re,
1955 const UChar* subject, int length, int start_offset, int* offsets,
1956 int offsetCount)
1958 ASSERT(re);
1959 ASSERT(subject || !length);
1960 ASSERT(offsetCount >= 0);
1961 ASSERT(offsets || offsetCount == 0);
1963 HistogramTimeLogger logger(re);
1965 MatchData matchBlock;
1966 matchBlock.startSubject = subject;
1967 matchBlock.endSubject = matchBlock.startSubject + length;
1968 const UChar* endSubject = matchBlock.endSubject;
1970 matchBlock.multiline = (re->options & MatchAcrossMultipleLinesOption);
1971 matchBlock.ignoreCase = (re->options & IgnoreCaseOption);
1973 /* If the expression has got more back references than the offsets supplied can
1974 hold, we get a temporary chunk of working store to use during the matching.
1975 Otherwise, we can use the vector supplied, rounding down its size to a multiple
1976 of 3. */
1978 int ocount = offsetCount - (offsetCount % 3);
1980 // FIXME: This is lame that we have to second-guess our caller here.
1981 // The API should change to either fail-hard when we don't have enough offset space
1982 // or that we shouldn't ask our callers to pre-allocate in the first place.
1983 bool usingTemporaryOffsets = false;
1984 if (re->topBackref > 0 && re->topBackref >= ocount/3) {
1985 ocount = re->topBackref * 3 + 3;
1986 matchBlock.offsetVector = new int[ocount];
1987 if (!matchBlock.offsetVector)
1988 return JSRegExpErrorNoMemory;
1989 usingTemporaryOffsets = true;
1990 } else
1991 matchBlock.offsetVector = offsets;
1993 matchBlock.offsetEnd = ocount;
1994 matchBlock.offsetMax = (2*ocount)/3;
1995 matchBlock.offsetOverflow = false;
1997 /* Compute the minimum number of offsets that we need to reset each time. Doing
1998 this makes a huge difference to execution time when there aren't many brackets
1999 in the pattern. */
2001 int resetCount = 2 + re->topBracket * 2;
2002 if (resetCount > offsetCount)
2003 resetCount = ocount;
2005 /* Reset the working variable associated with each extraction. These should
2006 never be used unless previously set, but they get saved and restored, and so we
2007 initialize them to avoid reading uninitialized locations. */
2009 if (matchBlock.offsetVector) {
2010 int* iptr = matchBlock.offsetVector + ocount;
2011 int* iend = iptr - resetCount/2 + 1;
2012 while (--iptr >= iend)
2013 *iptr = -1;
2016 /* Set up the first character to match, if available. The firstByte value is
2017 never set for an anchored regular expression, but the anchoring may be forced
2018 at run time, so we have to test for anchoring. The first char may be unset for
2019 an unanchored pattern, of course. If there's no first char and the pattern was
2020 studied, there may be a bitmap of possible first characters. */
2022 bool firstByteIsCaseless = false;
2023 int firstByte = -1;
2024 if (re->options & UseFirstByteOptimizationOption) {
2025 firstByte = re->firstByte & 255;
2026 if ((firstByteIsCaseless = (re->firstByte & REQ_IGNORE_CASE)))
2027 firstByte = toLowerCase(firstByte);
2030 /* For anchored or unanchored matches, there may be a "last known required
2031 character" set. */
2033 bool reqByteIsCaseless = false;
2034 int reqByte = -1;
2035 int reqByte2 = -1;
2036 if (re->options & UseRequiredByteOptimizationOption) {
2037 reqByte = re->reqByte & 255; // FIXME: This optimization could be made to work for UTF16 chars as well...
2038 reqByteIsCaseless = (re->reqByte & REQ_IGNORE_CASE);
2039 reqByte2 = flipCase(reqByte);
2042 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
2043 the loop runs just once. */
2045 const UChar* startMatch = subject + start_offset;
2046 const UChar* reqBytePtr = startMatch - 1;
2047 bool useMultiLineFirstCharOptimization = re->options & UseMultiLineFirstByteOptimizationOption;
2049 do {
2050 /* Reset the maximum number of extractions we might see. */
2051 if (matchBlock.offsetVector) {
2052 int* iptr = matchBlock.offsetVector;
2053 int* iend = iptr + resetCount;
2054 while (iptr < iend)
2055 *iptr++ = -1;
2058 tryFirstByteOptimization(startMatch, endSubject, firstByte, firstByteIsCaseless, useMultiLineFirstCharOptimization, matchBlock.startSubject + start_offset);
2059 if (tryRequiredByteOptimization(startMatch, endSubject, reqByte, reqByte2, reqByteIsCaseless, firstByte >= 0, reqBytePtr))
2060 break;
2062 /* When a match occurs, substrings will be set for all internal extractions;
2063 we just need to set up the whole thing as substring 0 before returning. If
2064 there were too many extractions, set the return code to zero. In the case
2065 where we had to get some local store to hold offsets for backreferences, copy
2066 those back references that we can. In this case there need not be overflow
2067 if certain parts of the pattern were not used. */
2069 /* The code starts after the JSRegExp block and the capture name table. */
2070 const unsigned char* start_code = (const unsigned char*)(re + 1);
2072 int returnCode = match(startMatch, start_code, 2, matchBlock);
2074 /* When the result is no match, advance the pointer to the next character
2075 and continue. */
2076 if (returnCode == 0) {
2077 startMatch++;
2078 continue;
2081 if (returnCode != 1) {
2082 ASSERT(returnCode == JSRegExpErrorHitLimit || returnCode == JSRegExpErrorNoMemory);
2083 DPRINTF((">>>> error: returning %d\n", returnCode));
2084 return returnCode;
2087 /* We have a match! Copy the offset information from temporary store if
2088 necessary */
2090 if (usingTemporaryOffsets) {
2091 if (offsetCount >= 4) {
2092 memcpy(offsets + 2, matchBlock.offsetVector + 2, (offsetCount - 2) * sizeof(int));
2093 DPRINTF(("Copied offsets from temporary memory\n"));
2095 if (matchBlock.endOffsetTop > offsetCount)
2096 matchBlock.offsetOverflow = true;
2098 DPRINTF(("Freeing temporary memory\n"));
2099 delete [] matchBlock.offsetVector;
2102 returnCode = matchBlock.offsetOverflow ? 0 : matchBlock.endOffsetTop / 2;
2104 if (offsetCount < 2)
2105 returnCode = 0;
2106 else {
2107 offsets[0] = startMatch - matchBlock.startSubject;
2108 offsets[1] = matchBlock.endMatchPtr - matchBlock.startSubject;
2111 DPRINTF((">>>> returning %d\n", returnCode));
2112 return returnCode;
2113 } while (!(re->options & IsAnchoredOption) && startMatch <= endSubject);
2115 if (usingTemporaryOffsets) {
2116 DPRINTF(("Freeing temporary memory\n"));
2117 delete [] matchBlock.offsetVector;
2120 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
2121 return JSRegExpErrorNoMatch;
2124 #if REGEXP_HISTOGRAM
2126 class CompareHistogramEntries {
2127 public:
2128 bool operator()(const pair<UString, double>& a, const pair<UString, double>& b)
2130 if (a.second == b.second)
2131 return a.first < b.first;
2132 return a.second < b.second;
2136 Histogram::~Histogram()
2138 Vector<pair<UString, double> > values;
2139 Map::iterator end = times.end();
2140 for (Map::iterator it = times.begin(); it != end; ++it)
2141 values.append(*it);
2142 sort(values.begin(), values.end(), CompareHistogramEntries());
2143 size_t size = values.size();
2144 printf("Regular Expressions, sorted by time spent evaluating them:\n");
2145 for (size_t i = 0; i < size; ++i)
2146 printf(" %f - %s\n", values[size - i - 1].second, values[size - i - 1].first.UTF8String().c_str());
2149 void Histogram::add(const JSRegExp* re, double elapsedTime)
2151 UString string(reinterpret_cast<const UChar*>(reinterpret_cast<const char*>(re) + re->stringOffset), re->stringLength);
2152 if (re->options & IgnoreCaseOption && re->options & MatchAcrossMultipleLinesOption)
2153 string += " (multi-line, ignore case)";
2154 else {
2155 if (re->options & IgnoreCaseOption)
2156 string += " (ignore case)";
2157 if (re->options & MatchAcrossMultipleLinesOption)
2158 string += " (multi-line)";
2160 pair<Map::iterator, bool> result = times.add(string.rep(), elapsedTime);
2161 if (!result.second)
2162 result.first->second += elapsedTime;
2165 HistogramTimeLogger::HistogramTimeLogger(const JSRegExp* re)
2166 : m_re(re)
2167 , m_startTime(currentTimeMS())
2171 HistogramTimeLogger::~HistogramTimeLogger()
2173 static Histogram histogram;
2174 histogram.add(m_re, currentTimeMS() - m_startTime);
2177 #endif