1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
52 #include "pcre_internal.h"
54 /* Undefine some potentially clashing cpp symbols */
59 /* Values for setting in md->match_function_type to indicate two special types
60 of call to match(). We do it this way to save on using another stack variable,
61 as stack usage is to be discouraged. */
63 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
64 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66 /* Non-error returns from the match() function. Error returns are externally
67 defined PCRE_ERROR_xxx codes, which are all negative. */
70 #define MATCH_NOMATCH 0
72 /* Special internal returns from the match() function. Make them sufficiently
73 negative to avoid the external error codes. */
75 #define MATCH_ACCEPT (-999) /* Success-like: grouped with MATCH_MATCH when md->mark is recorded */
76 #define MATCH_COMMIT (-998) /* Failure after COMMIT; overrides PRUNE, SKIP, and THEN */
77 #define MATCH_KETRPOS (-997)
78 #define MATCH_ONCE (-996) /* End of an atomic group was reached but later matching failed; passed back up until md->once_target is reached */
79 #define MATCH_PRUNE (-995) /* Failure after PRUNE; overrides THEN */
80 #define MATCH_SKIP (-994) /* Failure after SKIP; the current subject position is passed back in md->start_match_ptr */
81 #define MATCH_SKIP_ARG (-993) /* Failure at SKIP with an argument; the skip name is passed back in md->start_match_ptr */
82 #define MATCH_THEN (-992) /* Failure after THEN; the address of the opcode is passed back in md->start_match_ptr */
84 /* Maximum number of ints of offset to save on the stack for recursive calls.
85 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
86 because the offset vector is always a multiple of 3 long. */
88 #define REC_STACK_SAVE_MAX 30
90 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
/* NOTE(review): the two tables are presumably indexed in parallel by repeat
kind; the value pairs read as (0,inf) (0,inf) (1,inf) (1,inf) (0,1) (0,1),
i.e. what look like greedy/minimizing forms of three quantifiers - confirm the
index order against the repeat opcode definitions. */
92 static const char rep_min
[] = { 0, 0, 1, 1, 0, 0 };
93 static const char rep_max
[] = { 0, 0, 0, 0, 1, 1 };
96 /*************************************************
97 * Debugging function to print chars *
98 *************************************************/
100 /* Print a sequence of chars in printable format, stopping at the end of the
101 subject if the requested length would run past it.
104 p points to characters
105 length number to print
106 is_subject TRUE if printing from within md->start_subject
107 md pointer to matching data block, if is_subject is TRUE
113 pchars(const pcre_uchar
*p
, int length
, BOOL is_subject
, match_data
*md
)
117 if (is_subject
&& length
> md
->end_subject
- p
) length
= md
->end_subject
- p
;
119 if (isprint(c
= RAWUCHARINCTEST(p
))) printf("%c", (char)c
); else printf("\\x{%02x}", c
);
125 /*************************************************
126 * Match a back-reference *
127 *************************************************/
129 /* Normally, if a back reference hasn't been set, the length that is passed is
130 negative, so the match always fails. However, in JavaScript compatibility mode,
131 the length passed is zero. Note that in caseless UTF-8 mode, the number of
132 subject bytes matched may be different to the number of reference bytes.
135 offset index into the offset vector
136 eptr pointer into the subject
137 length length of reference to be matched (number of bytes)
138 md points to match data block
139 caseless TRUE if caseless
141 Returns: >= 0 the number of subject bytes matched
143 -2 partial match; always given if at end of subject
147 match_ref(int offset
, register PCRE_PUCHAR eptr
, int length
, match_data
*md
,
150 PCRE_PUCHAR eptr_start
= eptr
;
151 register PCRE_PUCHAR p
= md
->start_subject
+ md
->offset_vector
[offset
];
157 if (eptr
>= md
->end_subject
)
158 printf("matching subject <null>");
161 printf("matching subject ");
162 pchars(eptr
, length
, TRUE
, md
);
164 printf(" against backref ");
165 pchars(p
, length
, FALSE
, md
);
169 /* Always fail if reference not set (and not JavaScript compatible - in that
170 case the length is passed as zero). */
172 if (length
< 0) return -1;
174 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
175 properly if Unicode properties are supported. Otherwise, we can check only
184 /* Match characters up to the end of the reference. NOTE: the number of
185 data units matched may differ, because in UTF-8 there are some characters
186 whose upper and lower case versions have different numbers of bytes.
187 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
188 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
189 sequence of two of the latter. It is important, therefore, to check the
190 length along the reference, not along the subject (earlier code did this
193 PCRE_PUCHAR endptr
= p
+ length
;
197 const ucd_record
*ur
;
198 if (eptr
>= md
->end_subject
) return -2; /* Partial match */
202 if (c
!= d
&& c
!= d
+ ur
->other_case
)
204 const pcre_uint32
*pp
= PRIV(ucd_caseless_sets
) + ur
->caseset
;
207 if (c
< *pp
) return -1;
208 if (c
== *pp
++) break;
217 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
218 is no UCP support. */
223 if (eptr
>= md
->end_subject
) return -2; /* Partial match */
224 cc
= RAWUCHARTEST(eptr
);
225 cp
= RAWUCHARTEST(p
);
226 if (TABLE_GET(cp
, md
->lcc
, cp
) != TABLE_GET(cc
, md
->lcc
, cc
)) return -1;
233 /* In the caseful case, we can just compare the bytes, whether or not we
234 are in UTF-8 mode. */
240 if (eptr
>= md
->end_subject
) return -2; /* Partial match */
241 if (RAWUCHARINCTEST(p
) != RAWUCHARINCTEST(eptr
)) return -1;
245 return (int)(eptr
- eptr_start
);
250 /***************************************************************************
251 ****************************************************************************
252 RECURSION IN THE match() FUNCTION
254 The match() function is highly recursive, though not every recursive call
255 increases the recursive depth. Nevertheless, some regular expressions can cause
256 it to recurse to a great depth. I was writing for Unix, so I just let it call
257 itself recursively. This uses the stack for saving everything that has to be
258 saved for a recursive call. On Unix, the stack can be large, and this works
261 It turns out that on some non-Unix-like systems there are problems with
262 programs that use a lot of stack. (This despite the fact that every last chip
263 has oodles of memory these days, and techniques for extending the stack have
264 been known for decades.) So....
266 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
267 calls by keeping local variables that need to be preserved in blocks of memory
268 obtained from malloc() instead of on the stack. Macros are used to
269 achieve this so that the actual code doesn't look very different to what it
272 The original heap-recursive code used longjmp(). However, it seems that this
273 can be very slow on some operating systems. Following a suggestion from Stan
274 Switzer, the use of longjmp() has been abolished, at the cost of having to
275 provide a unique number for each call to RMATCH. There is no way of generating
276 a sequence of numbers at compile time in C. I have given them names, to make
277 them stand out more clearly.
279 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
280 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
281 tests. Furthermore, not using longjmp() means that local dynamic variables
282 don't have indeterminate values; this has meant that the frame size can be
283 reduced because the result can be "passed back" by straight setting of the
284 variable instead of being passed in the frame.
285 ****************************************************************************
286 ***************************************************************************/
288 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
289 below must be updated in sync. */
/* RM1 is pinned to 1 and the remaining enumerators count up from it, so RMn
has the value n and no RMATCH call-site identifier is ever 0. */
291 enum { RM1
=1, RM2
, RM3
, RM4
, RM5
, RM6
, RM7
, RM8
, RM9
, RM10
,
292 RM11
, RM12
, RM13
, RM14
, RM15
, RM16
, RM17
, RM18
, RM19
, RM20
,
293 RM21
, RM22
, RM23
, RM24
, RM25
, RM26
, RM27
, RM28
, RM29
, RM30
,
294 RM31
, RM32
, RM33
, RM34
, RM35
, RM36
, RM37
, RM38
, RM39
, RM40
,
295 RM41
, RM42
, RM43
, RM44
, RM45
, RM46
, RM47
, RM48
, RM49
, RM50
,
296 RM51
, RM52
, RM53
, RM54
, RM55
, RM56
, RM57
, RM58
, RM59
, RM60
,
297 RM61
, RM62
, RM63
, RM64
, RM65
, RM66
, RM67
};
299 /* These versions of the macros use the stack, as normal. There are debugging
300 versions and production versions. Note that the "rw" argument of RMATCH isn't
301 actually used in this definition. */
304 #define REGISTER register
307 #define RMATCH(ra,rb,rc,rd,re,rw) \
309 printf("match() called in line %d\n", __LINE__); \
310 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
311 printf("to line %d\n", __LINE__); \
313 #define RRETURN(ra) \
315 printf("match() returned %d from line %d\n", ra, __LINE__); \
319 #define RMATCH(ra,rb,rc,rd,re,rw) \
320 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
321 #define RRETURN(ra) return ra
327 /* These versions of the macros manage a private stack on the heap. Note that
328 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
329 argument of match(), which never changes. */
333 #define RMATCH(ra,rb,rc,rd,re,rw)\
335 heapframe *newframe = frame->Xnextframe;\
336 if (newframe == NULL)\
338 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
339 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
340 newframe->Xnextframe = NULL;\
341 frame->Xnextframe = newframe;\
344 newframe->Xeptr = ra;\
345 newframe->Xecode = rb;\
346 newframe->Xmstart = mstart;\
347 newframe->Xoffset_top = rc;\
348 newframe->Xeptrb = re;\
349 newframe->Xrdepth = frame->Xrdepth + 1;\
350 newframe->Xprevframe = frame;\
352 DPRINTF(("restarting from line %d\n", __LINE__));\
355 DPRINTF(("jumped back to line %d\n", __LINE__));\
360 heapframe *oldframe = frame;\
361 frame = oldframe->Xprevframe;\
371 /* Structure for remembering the local variables in a private frame */
373 typedef struct heapframe
{
374 struct heapframe
*Xprevframe
;
375 struct heapframe
*Xnextframe
;
377 /* Function arguments that may change */
380 const pcre_uchar
*Xecode
;
384 unsigned int Xrdepth
;
386 /* Function local variables */
388 PCRE_PUCHAR Xcallpat
;
390 PCRE_PUCHAR Xcharptr
;
396 PCRE_PUCHAR Xsaved_eptr
;
398 recursion_info Xnew_recursive
;
406 unsigned int Xprop_value
;
407 int Xprop_fail_result
;
409 pcre_uchar Xocchars
[6];
422 int Xsave_capture_last
;
423 int Xsave_offset1
, Xsave_offset2
, Xsave_offset3
;
424 int Xstacksave
[REC_STACK_SAVE_MAX
];
428 /* Where to jump back to */
437 /***************************************************************************
438 ***************************************************************************/
442 /*************************************************
443 * Match from current position *
444 *************************************************/
446 /* This function is called recursively in many circumstances. Whenever it
447 returns a negative (error) response, the outer incarnation must also return the
450 /* These macros pack up tests that are used for partial matching, and which
451 appear several times in the code. We set the "hit end" flag if the pointer is
452 at the end of the subject and also past the start of the subject (i.e.
453 something has been matched). For hard partial matching, we then return
454 immediately. The second one is used when we already know we are past the end of
457 #define CHECK_PARTIAL()\
458 if (md->partial != 0 && eptr >= md->end_subject && \
459 eptr > md->start_used_ptr) \
462 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
465 #define SCHECK_PARTIAL()\
466 if (md->partial != 0 && eptr > md->start_used_ptr) \
469 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
473 /* Performance note: It might be tempting to extract commonly used fields from
474 the md structure (e.g. utf, end_subject) into individual variables to improve
475 performance. Tests using gcc on a SPARC disproved this; in the first case, it
476 made performance worse.
479 eptr pointer to current character in subject
480 ecode pointer to current position in compiled code
481 mstart pointer to the current match start position (can be modified
483 offset_top current top pointer
484 md pointer to "static" info for the match
485 eptrb pointer to chain of blocks containing eptr at start of
486 brackets - for testing for empty matches
487 rdepth the recursion depth
489 Returns: MATCH_MATCH if matched ) these values are >= 0
490 MATCH_NOMATCH if failed to match )
491 a negative MATCH_xxx value for PRUNE, SKIP, etc
492 a negative PCRE_ERROR_xxx value if aborted by an error condition
493 (e.g. stopped by repeated call or recursion limit)
497 match(REGISTER PCRE_PUCHAR eptr
, REGISTER
const pcre_uchar
*ecode
,
498 PCRE_PUCHAR mstart
, int offset_top
, match_data
*md
, eptrblock
*eptrb
,
501 /* These variables do not need to be preserved over recursion in this function,
502 so they can be ordinary variables in all cases. Mark some of them with
503 "register" because they are used a lot in loops. */
505 register int rrc
; /* Returns from recursive calls */
506 register int i
; /* Used for loops not involving calls to RMATCH() */
507 register pcre_uint32 c
; /* Character values not kept over RMATCH() calls */
508 register BOOL utf
; /* Local copy of UTF flag for speed */
510 BOOL minimize
, possessive
; /* Quantifier options */
514 /* When recursion is not being used, all "local" variables that have to be
515 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
516 frame on the stack here; subsequent instantiations are obtained from the heap
517 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
518 the top-level on the stack rather than malloc-ing them all gives a performance
519 boost in many cases where there is not much "recursion". */
522 heapframe
*frame
= (heapframe
*)md
->match_frames_base
;
524 /* Copy in the original argument variables */
527 frame
->Xecode
= ecode
;
528 frame
->Xmstart
= mstart
;
529 frame
->Xoffset_top
= offset_top
;
530 frame
->Xeptrb
= eptrb
;
531 frame
->Xrdepth
= rdepth
;
533 /* This is the point that control jumps back to in order to effect "recursion" */
537 /* Macros make the argument variables come from the current frame */
539 #define eptr frame->Xeptr
540 #define ecode frame->Xecode
541 #define mstart frame->Xmstart
542 #define offset_top frame->Xoffset_top
543 #define eptrb frame->Xeptrb
544 #define rdepth frame->Xrdepth
546 /* Ditto for the local variables */
549 #define charptr frame->Xcharptr
551 #define callpat frame->Xcallpat
552 #define codelink frame->Xcodelink
553 #define data frame->Xdata
554 #define next frame->Xnext
555 #define pp frame->Xpp
556 #define prev frame->Xprev
557 #define saved_eptr frame->Xsaved_eptr
559 #define new_recursive frame->Xnew_recursive
561 #define cur_is_word frame->Xcur_is_word
562 #define condition frame->Xcondition
563 #define prev_is_word frame->Xprev_is_word
566 #define prop_type frame->Xprop_type
567 #define prop_value frame->Xprop_value
568 #define prop_fail_result frame->Xprop_fail_result
569 #define oclength frame->Xoclength
570 #define occhars frame->Xocchars
573 #define ctype frame->Xctype
574 #define fc frame->Xfc
575 #define fi frame->Xfi
576 #define length frame->Xlength
577 #define max frame->Xmax
578 #define min frame->Xmin
579 #define number frame->Xnumber
580 #define offset frame->Xoffset
581 #define op frame->Xop
582 #define save_capture_last frame->Xsave_capture_last
583 #define save_offset1 frame->Xsave_offset1
584 #define save_offset2 frame->Xsave_offset2
585 #define save_offset3 frame->Xsave_offset3
586 #define stacksave frame->Xstacksave
588 #define newptrb frame->Xnewptrb
590 /* When recursion is being used, local variables are allocated on the stack and
591 get preserved during recursion in the normal way. In this environment, fi and
592 i, and fc and c, can be the same variables. */
594 #else /* NO_RECURSE not defined */
598 /* Many of the following variables are used only in small blocks of the code.
599 My normal style of coding would have declared them within each of those blocks.
600 However, in order to accommodate the version of this code that uses an external
601 "stack" implemented on the heap, it is easier to declare them all here, so the
602 declarations can be cut out in a block. The only declarations within blocks
603 below are for variables that do not have to be preserved over a recursive call
607 const pcre_uchar
*charptr
;
609 const pcre_uchar
*callpat
;
610 const pcre_uchar
*data
;
611 const pcre_uchar
*next
;
613 const pcre_uchar
*prev
;
614 PCRE_PUCHAR saved_eptr
;
616 recursion_info new_recursive
;
624 unsigned int prop_value
;
625 int prop_fail_result
;
627 pcre_uchar occhars
[6];
638 int save_capture_last
;
639 int save_offset1
, save_offset2
, save_offset3
;
640 int stacksave
[REC_STACK_SAVE_MAX
];
644 /* There is a special fudge for calling match() in a way that causes it to
645 measure the size of its basic stack frame when the stack is being used for
646 recursion. The second argument (ecode) being NULL triggers this behaviour. It
647 cannot normally ever be NULL. The return is the negated value of the frame
653 return match((PCRE_PUCHAR
)&rdepth
, NULL
, NULL
, 0, NULL
, NULL
, 1);
656 int len
= (char *)&rdepth
- (char *)eptr
;
657 return (len
> 0)? -len
: len
;
660 #endif /* NO_RECURSE */
662 /* To save space on the stack and in the heap frame, I have doubled up on some
663 of the local variables that are used only in localised parts of the code, but
664 still need to be preserved over recursive calls of match(). These macros define
665 the alternative names that are used. */
667 #define allow_zero cur_is_word
668 #define cbegroup condition
669 #define code_offset codelink
670 #define condassert condition
671 #define matched_once prev_is_word
673 #define save_mark data
675 /* These statements are here to stop the compiler complaining about uninitialized
680 prop_fail_result
= 0;
684 /* This label is used for tail recursion, which is used in a few cases even
685 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
686 used. Thanks to Ian Taylor for noticing this possibility and sending the
691 /* OK, now we can get on with the real code of the function. Recursive calls
692 are specified by the macro RMATCH and RRETURN is used to return. When
693 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
694 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
695 defined). However, RMATCH isn't like a function call because it's quite a
696 complicated macro. It has to be used in one particular way. This shouldn't,
697 however, impact performance when true recursion is being used. */
700 utf
= md
->utf
; /* Local copy of the flag */
705 /* First check that we haven't called match() too many times, or that we
706 haven't exceeded the recursive call limit. */
708 if (md
->match_call_count
++ >= md
->match_limit
) RRETURN(PCRE_ERROR_MATCHLIMIT
);
709 if (rdepth
>= md
->match_limit_recursion
) RRETURN(PCRE_ERROR_RECURSIONLIMIT
);
711 /* At the start of a group with an unlimited repeat that may match an empty
712 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
713 done this way to save having to use another function argument, which would take
714 up space on the stack. See also MATCH_CONDASSERT below.
716 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
717 such remembered pointers, to be checked when we hit the closing ket, in order
718 to break infinite loops that match no characters. When match() is called in
719 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
720 NOT be used with tail recursion, because the memory block that is used is on
721 the stack, so a new one may be required for each match(). */
723 if (md
->match_function_type
== MATCH_CBEGROUP
)
725 newptrb
.epb_saved_eptr
= eptr
;
726 newptrb
.epb_prev
= eptrb
;
728 md
->match_function_type
= 0;
731 /* Now start processing the opcodes. */
735 minimize
= possessive
= FALSE
;
741 md
->nomatch_mark
= ecode
+ 2;
742 md
->mark
= NULL
; /* In case previously set by assertion */
743 RMATCH(eptr
, ecode
+ PRIV(OP_lengths
)[*ecode
] + ecode
[1], offset_top
, md
,
745 if ((rrc
== MATCH_MATCH
|| rrc
== MATCH_ACCEPT
) &&
746 md
->mark
== NULL
) md
->mark
= ecode
+ 2;
748 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
749 argument, and we must check whether that argument matches this MARK's
750 argument. It is passed back in md->start_match_ptr (an overloading of that
751 variable). If it does match, we reset that variable to the current subject
752 position and return MATCH_SKIP. Otherwise, pass back the return code
755 else if (rrc
== MATCH_SKIP_ARG
&&
756 STRCMP_UC_UC_TEST(ecode
+ 2, md
->start_match_ptr
) == 0)
758 md
->start_match_ptr
= eptr
;
764 RRETURN(MATCH_NOMATCH
);
766 /* COMMIT overrides PRUNE, SKIP, and THEN */
769 RMATCH(eptr
, ecode
+ PRIV(OP_lengths
)[*ecode
], offset_top
, md
,
771 if (rrc
!= MATCH_NOMATCH
&& rrc
!= MATCH_PRUNE
&&
772 rrc
!= MATCH_SKIP
&& rrc
!= MATCH_SKIP_ARG
&&
775 RRETURN(MATCH_COMMIT
);
777 /* PRUNE overrides THEN */
780 RMATCH(eptr
, ecode
+ PRIV(OP_lengths
)[*ecode
], offset_top
, md
,
782 if (rrc
!= MATCH_NOMATCH
&& rrc
!= MATCH_THEN
) RRETURN(rrc
);
783 RRETURN(MATCH_PRUNE
);
786 md
->nomatch_mark
= ecode
+ 2;
787 md
->mark
= NULL
; /* In case previously set by assertion */
788 RMATCH(eptr
, ecode
+ PRIV(OP_lengths
)[*ecode
] + ecode
[1], offset_top
, md
,
790 if ((rrc
== MATCH_MATCH
|| rrc
== MATCH_ACCEPT
) &&
791 md
->mark
== NULL
) md
->mark
= ecode
+ 2;
792 if (rrc
!= MATCH_NOMATCH
&& rrc
!= MATCH_THEN
) RRETURN(rrc
);
793 RRETURN(MATCH_PRUNE
);
795 /* SKIP overrides PRUNE and THEN */
798 RMATCH(eptr
, ecode
+ PRIV(OP_lengths
)[*ecode
], offset_top
, md
,
800 if (rrc
!= MATCH_NOMATCH
&& rrc
!= MATCH_PRUNE
&& rrc
!= MATCH_THEN
)
802 md
->start_match_ptr
= eptr
; /* Pass back current position */
805 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
806 nomatch_mark. There is a flag that disables this opcode when re-matching a
807 pattern that ended with a SKIP for which there was not a matching MARK. */
810 if (md
->ignore_skip_arg
)
812 ecode
+= PRIV(OP_lengths
)[*ecode
] + ecode
[1];
815 RMATCH(eptr
, ecode
+ PRIV(OP_lengths
)[*ecode
] + ecode
[1], offset_top
, md
,
817 if (rrc
!= MATCH_NOMATCH
&& rrc
!= MATCH_PRUNE
&& rrc
!= MATCH_THEN
)
820 /* Pass back the current skip name by overloading md->start_match_ptr and
821 returning the special MATCH_SKIP_ARG return code. This will either be
822 caught by a matching MARK, or get to the top, where it causes a rematch
823 with the md->ignore_skip_arg flag set. */
825 md
->start_match_ptr
= ecode
+ 2;
826 RRETURN(MATCH_SKIP_ARG
);
828 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
829 the branch in which it occurs can be determined. Overload the start of
830 match pointer to do this. */
833 RMATCH(eptr
, ecode
+ PRIV(OP_lengths
)[*ecode
], offset_top
, md
,
835 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
836 md
->start_match_ptr
= ecode
;
840 md
->nomatch_mark
= ecode
+ 2;
841 md
->mark
= NULL
; /* In case previously set by assertion */
842 RMATCH(eptr
, ecode
+ PRIV(OP_lengths
)[*ecode
] + ecode
[1], offset_top
,
844 if ((rrc
== MATCH_MATCH
|| rrc
== MATCH_ACCEPT
) &&
845 md
->mark
== NULL
) md
->mark
= ecode
+ 2;
846 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
847 md
->start_match_ptr
= ecode
;
850 /* Handle an atomic group that does not contain any capturing parentheses.
851 This can be handled like an assertion. Prior to 8.13, all atomic groups
852 were handled this way. In 8.13, the code was changed as below for ONCE, so
853 that backups pass through the group and thereby reset captured values.
854 However, this uses a lot more stack, so in 8.20, atomic groups that do not
855 contain any captures generate OP_ONCE_NC, which can be handled in the old,
856 less stack intensive way.
858 Check the alternative branches in turn - the matching won't pass the KET
859 for this kind of subpattern. If any one branch matches, we carry on as at
860 the end of a normal bracket, leaving the subject pointer, but resetting
861 the start-of-match value in case it was changed by \K. */
866 save_mark
= md
->mark
;
869 RMATCH(eptr
, ecode
+ 1 + LINK_SIZE
, offset_top
, md
, eptrb
, RM64
);
870 if (rrc
== MATCH_MATCH
) /* Note: _not_ MATCH_ACCEPT */
872 mstart
= md
->start_match_ptr
;
875 if (rrc
== MATCH_THEN
)
877 next
= ecode
+ GET(ecode
,1);
878 if (md
->start_match_ptr
< next
&&
879 (*ecode
== OP_ALT
|| *next
== OP_ALT
))
883 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
884 ecode
+= GET(ecode
,1);
885 md
->mark
= save_mark
;
887 while (*ecode
== OP_ALT
);
889 /* If hit the end of the group (which could be repeated), fail */
891 if (*ecode
!= OP_ONCE_NC
&& *ecode
!= OP_ALT
) RRETURN(MATCH_NOMATCH
);
893 /* Continue as from after the group, updating the offsets high water
894 mark, since extracts may have been taken. */
896 do ecode
+= GET(ecode
, 1); while (*ecode
== OP_ALT
);
898 offset_top
= md
->end_offset_top
;
899 eptr
= md
->end_match_ptr
;
901 /* For a non-repeating ket, just continue at this level. This also
902 happens for a repeating ket if no characters were matched in the group.
903 This is the forcible breaking of infinite loops as implemented in Perl
906 if (*ecode
== OP_KET
|| eptr
== saved_eptr
)
908 ecode
+= 1+LINK_SIZE
;
912 /* The repeating kets try the rest of the pattern or restart from the
913 preceding bracket, in the appropriate order. The second "call" of match()
914 uses tail recursion, to avoid using another stack frame. */
916 if (*ecode
== OP_KETRMIN
)
918 RMATCH(eptr
, ecode
+ 1 + LINK_SIZE
, offset_top
, md
, eptrb
, RM65
);
919 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
923 else /* OP_KETRMAX */
925 RMATCH(eptr
, prev
, offset_top
, md
, eptrb
, RM66
);
926 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
927 ecode
+= 1 + LINK_SIZE
;
930 /* Control never gets here */
932 /* Handle a capturing bracket, other than those that are possessive with an
933 unlimited repeat. If there is space in the offset vector, save the current
934 subject position in the working slot at the top of the vector. We mustn't
935 change the current values of the data slot, because they may be set from a
936 previous iteration of this group, and be referred to by a reference inside
937 the group. A failure to match might occur after the group has succeeded,
938 if something later on doesn't match. For this reason, we need to restore
939 the working value and also the values of the final offsets, in case they
940 were set by a previous iteration of the same bracket.
942 If there isn't enough space in the offset vector, treat this as if it were
943 a non-capturing bracket. Don't worry about setting the flag for the error
944 case here; that is handled in the code for KET. */
948 number
= GET2(ecode
, 1+LINK_SIZE
);
949 offset
= number
<< 1;
952 printf("start bracket %d\n", number
);
954 pchars(eptr
, 16, TRUE
, md
);
958 if (offset
< md
->offset_max
)
960 save_offset1
= md
->offset_vector
[offset
];
961 save_offset2
= md
->offset_vector
[offset
+1];
962 save_offset3
= md
->offset_vector
[md
->offset_end
- number
];
963 save_capture_last
= md
->capture_last
;
964 save_mark
= md
->mark
;
966 DPRINTF(("saving %d %d %d\n", save_offset1
, save_offset2
, save_offset3
));
967 md
->offset_vector
[md
->offset_end
- number
] =
968 (int)(eptr
- md
->start_subject
);
972 if (op
>= OP_SBRA
) md
->match_function_type
= MATCH_CBEGROUP
;
973 RMATCH(eptr
, ecode
+ PRIV(OP_lengths
)[*ecode
], offset_top
, md
,
975 if (rrc
== MATCH_ONCE
) break; /* Backing up through an atomic group */
977 /* If we backed up to a THEN, check whether it is within the current
978 branch by comparing the address of the THEN that is passed back with
979 the end of the branch. If it is within the current branch, and the
980 branch is one of two or more alternatives (it either starts or ends
981 with OP_ALT), we have reached the limit of THEN's action, so convert
982 the return code to NOMATCH, which will cause normal backtracking to
983 happen from now on. Otherwise, THEN is passed back to an outer
984 alternative. This implements Perl's treatment of parenthesized groups,
985 where a group not containing | does not affect the current alternative,
986 that is, (X) is NOT the same as (X|(*F)). */
988 if (rrc
== MATCH_THEN
)
990 next
= ecode
+ GET(ecode
,1);
991 if (md
->start_match_ptr
< next
&&
992 (*ecode
== OP_ALT
|| *next
== OP_ALT
))
996 /* Anything other than NOMATCH is passed back. */
998 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
999 md
->capture_last
= save_capture_last
;
1000 ecode
+= GET(ecode
, 1);
1001 md
->mark
= save_mark
;
1002 if (*ecode
!= OP_ALT
) break;
1005 DPRINTF(("bracket %d failed\n", number
));
1006 md
->offset_vector
[offset
] = save_offset1
;
1007 md
->offset_vector
[offset
+1] = save_offset2
;
1008 md
->offset_vector
[md
->offset_end
- number
] = save_offset3
;
1010 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1015 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1016 as a non-capturing bracket. */
1018 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1019 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1021 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1023 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1024 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1026 /* Non-capturing or atomic group, except for possessive with unlimited
1027 repeat and ONCE group with no captures. Loop for all the alternatives.
1029 When we get to the final alternative within the brackets, we used to return
1030 the result of a recursive call to match() whatever happened so it was
1031 possible to reduce stack usage by turning this into a tail recursion,
1032 except in the case of a possibly empty group. However, now that there is
1033 the possibility of (*THEN) occurring in the final alternative, this
1034 optimization is no longer always possible.
1036 We can optimize if we know there are no (*THEN)s in the pattern; at present
1037 this is the best that can be done.
1039 MATCH_ONCE is returned when the end of an atomic group is successfully
1040 reached, but subsequent matching fails. It passes back up the tree (causing
1041 captured values to be reset) until the original atomic group level is
1042 reached. This is tested by comparing md->once_target with the start of the
1043 group. At this point, the return is converted into MATCH_NOMATCH so that
1044 previous backup points can be taken. */
1049 DPRINTF(("start non-capturing bracket\n"));
1053 if (op
>= OP_SBRA
|| op
== OP_ONCE
)
1054 md
->match_function_type
= MATCH_CBEGROUP
;
1056 /* If this is not a possibly empty group, and there are no (*THEN)s in
1057 the pattern, and this is the final alternative, optimize as described
1060 else if (!md
->hasthen
&& ecode
[GET(ecode
, 1)] != OP_ALT
)
1062 ecode
+= PRIV(OP_lengths
)[*ecode
];
1066 /* In all other cases, we have to make another call to match(). */
1068 save_mark
= md
->mark
;
1069 RMATCH(eptr
, ecode
+ PRIV(OP_lengths
)[*ecode
], offset_top
, md
, eptrb
,
1072 /* See comment in the code for capturing groups above about handling
1075 if (rrc
== MATCH_THEN
)
1077 next
= ecode
+ GET(ecode
,1);
1078 if (md
->start_match_ptr
< next
&&
1079 (*ecode
== OP_ALT
|| *next
== OP_ALT
))
1080 rrc
= MATCH_NOMATCH
;
1083 if (rrc
!= MATCH_NOMATCH
)
1085 if (rrc
== MATCH_ONCE
)
1087 const pcre_uchar
*scode
= ecode
;
1088 if (*scode
!= OP_ONCE
) /* If not at start, find it */
1090 while (*scode
== OP_ALT
) scode
+= GET(scode
, 1);
1091 scode
-= GET(scode
, 1);
1093 if (md
->once_target
== scode
) rrc
= MATCH_NOMATCH
;
1097 ecode
+= GET(ecode
, 1);
1098 md
->mark
= save_mark
;
1099 if (*ecode
!= OP_ALT
) break;
1102 RRETURN(MATCH_NOMATCH
);
1104 /* Handle possessive capturing brackets with an unlimited repeat. We come
1105 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1106 handled similarly to the normal case above. However, the matching is
1107 different. The end of these brackets will always be OP_KETRPOS, which
1108 returns MATCH_KETRPOS without going further in the pattern. By this means
1109 we can handle the group by iteration rather than recursion, thereby
1110 reducing the amount of stack needed. */
1117 number
= GET2(ecode
, 1+LINK_SIZE
);
1118 offset
= number
<< 1;
1121 printf("start possessive bracket %d\n", number
);
1123 pchars(eptr
, 16, TRUE
, md
);
1127 if (offset
< md
->offset_max
)
1129 matched_once
= FALSE
;
1130 code_offset
= (int)(ecode
- md
->start_code
);
1132 save_offset1
= md
->offset_vector
[offset
];
1133 save_offset2
= md
->offset_vector
[offset
+1];
1134 save_offset3
= md
->offset_vector
[md
->offset_end
- number
];
1135 save_capture_last
= md
->capture_last
;
1137 DPRINTF(("saving %d %d %d\n", save_offset1
, save_offset2
, save_offset3
));
1139 /* Each time round the loop, save the current subject position for use
1140 when the group matches. For MATCH_MATCH, the group has matched, so we
1141 restart it with a new subject starting position, remembering that we had
1142 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1143 usual. If we haven't matched any alternatives in any iteration, check to
1144 see if a previous iteration matched. If so, the group has matched;
1145 continue from afterwards. Otherwise it has failed; restore the previous
1146 capture values before returning NOMATCH. */
1150 md
->offset_vector
[md
->offset_end
- number
] =
1151 (int)(eptr
- md
->start_subject
);
1152 if (op
>= OP_SBRA
) md
->match_function_type
= MATCH_CBEGROUP
;
1153 RMATCH(eptr
, ecode
+ PRIV(OP_lengths
)[*ecode
], offset_top
, md
,
1155 if (rrc
== MATCH_KETRPOS
)
1157 offset_top
= md
->end_offset_top
;
1158 eptr
= md
->end_match_ptr
;
1159 ecode
= md
->start_code
+ code_offset
;
1160 save_capture_last
= md
->capture_last
;
1161 matched_once
= TRUE
;
1165 /* See comment in the code for capturing groups above about handling
1168 if (rrc
== MATCH_THEN
)
1170 next
= ecode
+ GET(ecode
,1);
1171 if (md
->start_match_ptr
< next
&&
1172 (*ecode
== OP_ALT
|| *next
== OP_ALT
))
1173 rrc
= MATCH_NOMATCH
;
1176 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
1177 md
->capture_last
= save_capture_last
;
1178 ecode
+= GET(ecode
, 1);
1179 if (*ecode
!= OP_ALT
) break;
1184 md
->offset_vector
[offset
] = save_offset1
;
1185 md
->offset_vector
[offset
+1] = save_offset2
;
1186 md
->offset_vector
[md
->offset_end
- number
] = save_offset3
;
1189 if (allow_zero
|| matched_once
)
1191 ecode
+= 1 + LINK_SIZE
;
1195 RRETURN(MATCH_NOMATCH
);
1198 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1199 as a non-capturing bracket. */
1201 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1202 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1204 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1206 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1207 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1209 /* Non-capturing possessive bracket with unlimited repeat. We come here
1210 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1211 without the capturing complication. It is written out separately for speed
1218 POSSESSIVE_NON_CAPTURE
:
1219 matched_once
= FALSE
;
1220 code_offset
= (int)(ecode
- md
->start_code
);
1224 if (op
>= OP_SBRA
) md
->match_function_type
= MATCH_CBEGROUP
;
1225 RMATCH(eptr
, ecode
+ PRIV(OP_lengths
)[*ecode
], offset_top
, md
,
1227 if (rrc
== MATCH_KETRPOS
)
1229 offset_top
= md
->end_offset_top
;
1230 eptr
= md
->end_match_ptr
;
1231 ecode
= md
->start_code
+ code_offset
;
1232 matched_once
= TRUE
;
1236 /* See comment in the code for capturing groups above about handling
1239 if (rrc
== MATCH_THEN
)
1241 next
= ecode
+ GET(ecode
,1);
1242 if (md
->start_match_ptr
< next
&&
1243 (*ecode
== OP_ALT
|| *next
== OP_ALT
))
1244 rrc
= MATCH_NOMATCH
;
1247 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
1248 ecode
+= GET(ecode
, 1);
1249 if (*ecode
!= OP_ALT
) break;
1252 if (matched_once
|| allow_zero
)
1254 ecode
+= 1 + LINK_SIZE
;
1257 RRETURN(MATCH_NOMATCH
);
1259 /* Control never reaches here. */
1261 /* Conditional group: compilation checked that there are no more than
1262 two branches. If the condition is false, skipping the first branch takes us
1263 past the end if there is only one branch, but that's OK because that is
1264 exactly what going to the ket would do. */
1268 codelink
= GET(ecode
, 1);
1270 /* Because of the way auto-callout works during compile, a callout item is
1271 inserted between OP_COND and an assertion condition. */
1273 if (ecode
[LINK_SIZE
+1] == OP_CALLOUT
)
1275 if (PUBL(callout
) != NULL
)
1277 PUBL(callout_block
) cb
;
1278 cb
.version
= 2; /* Version 2 of the callout block */
1279 cb
.callout_number
= ecode
[LINK_SIZE
+2];
1280 cb
.offset_vector
= md
->offset_vector
;
1281 #if defined COMPILE_PCRE8
1282 cb
.subject
= (PCRE_SPTR
)md
->start_subject
;
1283 #elif defined COMPILE_PCRE16
1284 cb
.subject
= (PCRE_SPTR16
)md
->start_subject
;
1285 #elif defined COMPILE_PCRE32
1286 cb
.subject
= (PCRE_SPTR32
)md
->start_subject
;
1288 cb
.subject_length
= (int)(md
->end_subject
- md
->start_subject
);
1289 cb
.start_match
= (int)(mstart
- md
->start_subject
);
1290 cb
.current_position
= (int)(eptr
- md
->start_subject
);
1291 cb
.pattern_position
= GET(ecode
, LINK_SIZE
+ 3);
1292 cb
.next_item_length
= GET(ecode
, 3 + 2*LINK_SIZE
);
1293 cb
.capture_top
= offset_top
/2;
1294 cb
.capture_last
= md
->capture_last
;
1295 cb
.callout_data
= md
->callout_data
;
1296 cb
.mark
= md
->nomatch_mark
;
1297 if ((rrc
= (*PUBL(callout
))(&cb
)) > 0) RRETURN(MATCH_NOMATCH
);
1298 if (rrc
< 0) RRETURN(rrc
);
1300 ecode
+= PRIV(OP_lengths
)[OP_CALLOUT
];
1303 condcode
= ecode
[LINK_SIZE
+1];
1305 /* Now see what the actual condition is */
1307 if (condcode
== OP_RREF
|| condcode
== OP_NRREF
) /* Recursion test */
1309 if (md
->recursive
== NULL
) /* Not recursing => FALSE */
1312 ecode
+= GET(ecode
, 1);
1316 unsigned int recno
= GET2(ecode
, LINK_SIZE
+ 2); /* Recursion group number*/
1317 condition
= (recno
== RREF_ANY
|| recno
== md
->recursive
->group_num
);
1319 /* If the test is for recursion into a specific subpattern, and it is
1320 false, but the test was set up by name, scan the table to see if the
1321 name refers to any other numbers, and test them. The condition is true
1322 if any one is set. */
1324 if (!condition
&& condcode
== OP_NRREF
)
1326 pcre_uchar
*slotA
= md
->name_table
;
1327 for (i
= 0; i
< md
->name_count
; i
++)
1329 if (GET2(slotA
, 0) == recno
) break;
1330 slotA
+= md
->name_entry_size
;
1333 /* Found a name for the number - there can be only one; duplicate
1334 names for different numbers are allowed, but not vice versa. First
1335 scan down for duplicates. */
1337 if (i
< md
->name_count
)
1339 pcre_uchar
*slotB
= slotA
;
1340 while (slotB
> md
->name_table
)
1342 slotB
-= md
->name_entry_size
;
1343 if (STRCMP_UC_UC(slotA
+ IMM2_SIZE
, slotB
+ IMM2_SIZE
) == 0)
1345 condition
= GET2(slotB
, 0) == md
->recursive
->group_num
;
1346 if (condition
) break;
1351 /* Scan up for duplicates */
1356 for (i
++; i
< md
->name_count
; i
++)
1358 slotB
+= md
->name_entry_size
;
1359 if (STRCMP_UC_UC(slotA
+ IMM2_SIZE
, slotB
+ IMM2_SIZE
) == 0)
1361 condition
= GET2(slotB
, 0) == md
->recursive
->group_num
;
1362 if (condition
) break;
1370 /* Choose branch according to the condition */
1372 ecode
+= condition
? 1 + IMM2_SIZE
: GET(ecode
, 1);
1376 else if (condcode
== OP_CREF
|| condcode
== OP_NCREF
) /* Group used test */
1378 offset
= GET2(ecode
, LINK_SIZE
+2) << 1; /* Doubled ref number */
1379 condition
= offset
< offset_top
&& md
->offset_vector
[offset
] >= 0;
1381 /* If the numbered capture is unset, but the reference was by name,
1382 scan the table to see if the name refers to any other numbers, and test
1383 them. The condition is true if any one is set. This is tediously similar
1384 to the code above, but not close enough to try to amalgamate. */
1386 if (!condition
&& condcode
== OP_NCREF
)
1388 unsigned int refno
= offset
>> 1;
1389 pcre_uchar
*slotA
= md
->name_table
;
1391 for (i
= 0; i
< md
->name_count
; i
++)
1393 if (GET2(slotA
, 0) == refno
) break;
1394 slotA
+= md
->name_entry_size
;
1397 /* Found a name for the number - there can be only one; duplicate names
1398 for different numbers are allowed, but not vice versa. First scan down
1401 if (i
< md
->name_count
)
1403 pcre_uchar
*slotB
= slotA
;
1404 while (slotB
> md
->name_table
)
1406 slotB
-= md
->name_entry_size
;
1407 if (STRCMP_UC_UC(slotA
+ IMM2_SIZE
, slotB
+ IMM2_SIZE
) == 0)
1409 offset
= GET2(slotB
, 0) << 1;
1410 condition
= offset
< offset_top
&&
1411 md
->offset_vector
[offset
] >= 0;
1412 if (condition
) break;
1417 /* Scan up for duplicates */
1422 for (i
++; i
< md
->name_count
; i
++)
1424 slotB
+= md
->name_entry_size
;
1425 if (STRCMP_UC_UC(slotA
+ IMM2_SIZE
, slotB
+ IMM2_SIZE
) == 0)
1427 offset
= GET2(slotB
, 0) << 1;
1428 condition
= offset
< offset_top
&&
1429 md
->offset_vector
[offset
] >= 0;
1430 if (condition
) break;
1438 /* Choose branch according to the condition */
1440 ecode
+= condition
? 1 + IMM2_SIZE
: GET(ecode
, 1);
1443 else if (condcode
== OP_DEF
) /* DEFINE - always false */
1446 ecode
+= GET(ecode
, 1);
1449 /* The condition is an assertion. Call match() to evaluate it - setting
1450 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1455 md
->match_function_type
= MATCH_CONDASSERT
;
1456 RMATCH(eptr
, ecode
+ 1 + LINK_SIZE
, offset_top
, md
, NULL
, RM3
);
1457 if (rrc
== MATCH_MATCH
)
1459 if (md
->end_offset_top
> offset_top
)
1460 offset_top
= md
->end_offset_top
; /* Captures may have happened */
1462 ecode
+= 1 + LINK_SIZE
+ GET(ecode
, LINK_SIZE
+ 2);
1463 while (*ecode
== OP_ALT
) ecode
+= GET(ecode
, 1);
1466 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1467 assertion; it is therefore treated as NOMATCH. */
1469 else if (rrc
!= MATCH_NOMATCH
&& rrc
!= MATCH_THEN
)
1471 RRETURN(rrc
); /* Need braces because of following else */
1480 /* We are now at the branch that is to be obeyed. As there is only one, can
1481 use tail recursion to avoid using another stack frame, except when there is
1482 unlimited repeat of a possibly empty group. In the latter case, a recursive
1483 call to match() is always required, unless the second alternative doesn't
1484 exist, in which case we can just plough on. Note that, for compatibility
1485 with Perl, the | in a conditional group is NOT treated as creating two
1486 alternatives. If a THEN is encountered in the branch, it propagates out to
1487 the enclosing alternative (unless nested in a deeper set of alternatives,
1490 if (condition
|| *ecode
== OP_ALT
)
1494 ecode
+= 1 + LINK_SIZE
;
1498 md
->match_function_type
= MATCH_CBEGROUP
;
1499 RMATCH(eptr
, ecode
+ 1 + LINK_SIZE
, offset_top
, md
, eptrb
, RM49
);
1503 /* Condition false & no alternative; continue after the group. */
1507 ecode
+= 1 + LINK_SIZE
;
1512 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1513 to close any currently open capturing brackets. */
1516 number
= GET2(ecode
, 1);
1517 offset
= number
<< 1;
1520 printf("end bracket %d at *ACCEPT", number
);
1524 md
->capture_last
= number
;
1525 if (offset
>= md
->offset_max
) md
->offset_overflow
= TRUE
; else
1527 md
->offset_vector
[offset
] =
1528 md
->offset_vector
[md
->offset_end
- number
];
1529 md
->offset_vector
[offset
+1] = (int)(eptr
- md
->start_subject
);
1530 if (offset_top
<= offset
) offset_top
= offset
+ 2;
1532 ecode
+= 1 + IMM2_SIZE
;
1536 /* End of the pattern, either real or forced. */
1540 case OP_ASSERT_ACCEPT
:
1542 /* If we have matched an empty string, fail if not in an assertion and not
1543 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1544 is set and we have matched at the start of the subject. In both cases,
1545 backtracking will then try other alternatives, if any. */
1547 if (eptr
== mstart
&& op
!= OP_ASSERT_ACCEPT
&&
1548 md
->recursive
== NULL
&&
1550 (md
->notempty_atstart
&&
1551 mstart
== md
->start_subject
+ md
->start_offset
)))
1552 RRETURN(MATCH_NOMATCH
);
1554 /* Otherwise, we have a match. */
1556 md
->end_match_ptr
= eptr
; /* Record where we ended */
1557 md
->end_offset_top
= offset_top
; /* and how many extracts were taken */
1558 md
->start_match_ptr
= mstart
; /* and the start (\K can modify) */
1560 /* For some reason, the macros don't work properly if an expression is
1561 given as the argument to RRETURN when the heap is in use. */
1563 rrc
= (op
== OP_END
)? MATCH_MATCH
: MATCH_ACCEPT
;
1566 /* Assertion brackets. Check the alternative branches in turn - the
1567 matching won't pass the KET for an assertion. If any one branch matches,
1568 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1569 start of each branch to move the current point backwards, so the code at
1570 this level is identical to the lookahead case. When the assertion is part
1571 of a condition, we want to return immediately afterwards. The caller of
1572 this incarnation of the match() function will have set MATCH_CONDASSERT in
1573 md->match_function_type, and one of these opcodes will be the first opcode
1574 that is processed. We use a local variable that is preserved over calls to
1575 match() to remember this case. */
1579 save_mark
= md
->mark
;
1580 if (md
->match_function_type
== MATCH_CONDASSERT
)
1583 md
->match_function_type
= 0;
1585 else condassert
= FALSE
;
1589 RMATCH(eptr
, ecode
+ 1 + LINK_SIZE
, offset_top
, md
, NULL
, RM4
);
1590 if (rrc
== MATCH_MATCH
|| rrc
== MATCH_ACCEPT
)
1592 mstart
= md
->start_match_ptr
; /* In case \K reset it */
1595 md
->mark
= save_mark
;
1597 /* A COMMIT failure must fail the entire assertion, without trying any
1598 subsequent branches. */
1600 if (rrc
== MATCH_COMMIT
) RRETURN(MATCH_NOMATCH
);
1602 /* PCRE does not allow THEN to escape beyond an assertion; it
1603 is treated as NOMATCH. */
1605 if (rrc
!= MATCH_NOMATCH
&& rrc
!= MATCH_THEN
) RRETURN(rrc
);
1606 ecode
+= GET(ecode
, 1);
1608 while (*ecode
== OP_ALT
);
1610 if (*ecode
== OP_KET
) RRETURN(MATCH_NOMATCH
);
1612 /* If checking an assertion for a condition, return MATCH_MATCH. */
1614 if (condassert
) RRETURN(MATCH_MATCH
);
1616 /* Continue from after the assertion, updating the offsets high water
1617 mark, since extracts may have been taken during the assertion. */
1619 do ecode
+= GET(ecode
,1); while (*ecode
== OP_ALT
);
1620 ecode
+= 1 + LINK_SIZE
;
1621 offset_top
= md
->end_offset_top
;
1624 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1625 PRUNE, or COMMIT means we must assume failure without checking subsequent
1629 case OP_ASSERTBACK_NOT
:
1630 save_mark
= md
->mark
;
1631 if (md
->match_function_type
== MATCH_CONDASSERT
)
1634 md
->match_function_type
= 0;
1636 else condassert
= FALSE
;
1640 RMATCH(eptr
, ecode
+ 1 + LINK_SIZE
, offset_top
, md
, NULL
, RM5
);
1641 md
->mark
= save_mark
;
1642 if (rrc
== MATCH_MATCH
|| rrc
== MATCH_ACCEPT
) RRETURN(MATCH_NOMATCH
);
1643 if (rrc
== MATCH_SKIP
|| rrc
== MATCH_PRUNE
|| rrc
== MATCH_COMMIT
)
1645 do ecode
+= GET(ecode
,1); while (*ecode
== OP_ALT
);
1649 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1652 if (rrc
!= MATCH_NOMATCH
&& rrc
!= MATCH_THEN
) RRETURN(rrc
);
1653 ecode
+= GET(ecode
,1);
1655 while (*ecode
== OP_ALT
);
1657 if (condassert
) RRETURN(MATCH_MATCH
); /* Condition assertion */
1659 ecode
+= 1 + LINK_SIZE
;
1662 /* Move the subject pointer back. This occurs only at the start of
1663 each branch of a lookbehind assertion. If we are too close to the start to
1664 move back, this match function fails. When working with UTF-8 we move
1665 back a number of characters, not bytes. */
1675 if (eptr
< md
->start_subject
) RRETURN(MATCH_NOMATCH
);
1682 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1685 eptr
-= GET(ecode
, 1);
1686 if (eptr
< md
->start_subject
) RRETURN(MATCH_NOMATCH
);
1689 /* Save the earliest consulted character, then skip to next op code */
1691 if (eptr
< md
->start_used_ptr
) md
->start_used_ptr
= eptr
;
1692 ecode
+= 1 + LINK_SIZE
;
1695 /* The callout item calls an external function, if one is provided, passing
1696 details of the match so far. This is mainly for debugging, though the
1697 function is able to force a failure. */
1700 if (PUBL(callout
) != NULL
)
1702 PUBL(callout_block
) cb
;
1703 cb
.version
= 2; /* Version 2 of the callout block */
1704 cb
.callout_number
= ecode
[1];
1705 cb
.offset_vector
= md
->offset_vector
;
1706 #if defined COMPILE_PCRE8
1707 cb
.subject
= (PCRE_SPTR
)md
->start_subject
;
1708 #elif defined COMPILE_PCRE16
1709 cb
.subject
= (PCRE_SPTR16
)md
->start_subject
;
1710 #elif defined COMPILE_PCRE32
1711 cb
.subject
= (PCRE_SPTR32
)md
->start_subject
;
1713 cb
.subject_length
= (int)(md
->end_subject
- md
->start_subject
);
1714 cb
.start_match
= (int)(mstart
- md
->start_subject
);
1715 cb
.current_position
= (int)(eptr
- md
->start_subject
);
1716 cb
.pattern_position
= GET(ecode
, 2);
1717 cb
.next_item_length
= GET(ecode
, 2 + LINK_SIZE
);
1718 cb
.capture_top
= offset_top
/2;
1719 cb
.capture_last
= md
->capture_last
;
1720 cb
.callout_data
= md
->callout_data
;
1721 cb
.mark
= md
->nomatch_mark
;
1722 if ((rrc
= (*PUBL(callout
))(&cb
)) > 0) RRETURN(MATCH_NOMATCH
);
1723 if (rrc
< 0) RRETURN(rrc
);
1725 ecode
+= 2 + 2*LINK_SIZE
;
1728 /* Recursion either matches the current regex, or some subexpression. The
1729 offset data is the offset to the starting bracket from the start of the
1730 whole pattern. (This is so that it works from duplicated subpatterns.)
1732 The state of the capturing groups is preserved over recursion, and
1733 re-instated afterwards. We don't know how many are started and not yet
1734 finished (offset_top records the completed total) so we just have to save
1735 all the potential data. There may be up to 65535 such values, which is too
1736 large to put on the stack, but using malloc for small numbers seems
1737 expensive. As a compromise, the stack is used when there are no more than
1738 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1740 There are also other values that have to be saved. We use a chained
1741 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1742 for the original version of this logic. It has, however, been hacked around
1743 a lot, so he is not to blame for the current way it works. */
1750 callpat
= md
->start_code
+ GET(ecode
, 1);
1751 recno
= (callpat
== md
->start_code
)? 0 :
1752 GET2(callpat
, 1 + LINK_SIZE
);
1754 /* Check for repeating a recursion without advancing the subject pointer.
1755 This should catch convoluted mutual recursions. (Some simple cases are
1756 caught at compile time.) */
1758 for (ri
= md
->recursive
; ri
!= NULL
; ri
= ri
->prevrec
)
1759 if (recno
== ri
->group_num
&& eptr
== ri
->subject_position
)
1760 RRETURN(PCRE_ERROR_RECURSELOOP
);
1762 /* Add to "recursing stack" */
1764 new_recursive
.group_num
= recno
;
1765 new_recursive
.subject_position
= eptr
;
1766 new_recursive
.prevrec
= md
->recursive
;
1767 md
->recursive
= &new_recursive
;
1769 /* Where to continue from afterwards */
1771 ecode
+= 1 + LINK_SIZE
;
1773 /* Now save the offset data */
1775 new_recursive
.saved_max
= md
->offset_end
;
1776 if (new_recursive
.saved_max
<= REC_STACK_SAVE_MAX
)
1777 new_recursive
.offset_save
= stacksave
;
1780 new_recursive
.offset_save
=
1781 (int *)(PUBL(malloc
))(new_recursive
.saved_max
* sizeof(int));
1782 if (new_recursive
.offset_save
== NULL
) RRETURN(PCRE_ERROR_NOMEMORY
);
1784 memcpy(new_recursive
.offset_save
, md
->offset_vector
,
1785 new_recursive
.saved_max
* sizeof(int));
1787 /* OK, now we can do the recursion. After processing each alternative,
1788 restore the offset data. If there were nested recursions, md->recursive
1789 might be changed, so reset it before looping. */
1791 DPRINTF(("Recursing into group %d\n", new_recursive
.group_num
));
1792 cbegroup
= (*callpat
>= OP_SBRA
);
1795 if (cbegroup
) md
->match_function_type
= MATCH_CBEGROUP
;
1796 RMATCH(eptr
, callpat
+ PRIV(OP_lengths
)[*callpat
], offset_top
,
1798 memcpy(md
->offset_vector
, new_recursive
.offset_save
,
1799 new_recursive
.saved_max
* sizeof(int));
1800 md
->recursive
= new_recursive
.prevrec
;
1801 if (rrc
== MATCH_MATCH
|| rrc
== MATCH_ACCEPT
)
1803 DPRINTF(("Recursion matched\n"));
1804 if (new_recursive
.offset_save
!= stacksave
)
1805 (PUBL(free
))(new_recursive
.offset_save
);
1807 /* Set where we got to in the subject, and reset the start in case
1808 it was changed by \K. This *is* propagated back out of a recursion,
1809 for Perl compatibility. */
1811 eptr
= md
->end_match_ptr
;
1812 mstart
= md
->start_match_ptr
;
1813 goto RECURSION_MATCHED
; /* Exit loop; end processing */
1816 /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; it
1817 is treated as NOMATCH. */
1819 else if (rrc
!= MATCH_NOMATCH
&& rrc
!= MATCH_THEN
&&
1820 rrc
!= MATCH_COMMIT
)
1822 DPRINTF(("Recursion gave error %d\n", rrc
));
1823 if (new_recursive
.offset_save
!= stacksave
)
1824 (PUBL(free
))(new_recursive
.offset_save
);
1828 md
->recursive
= &new_recursive
;
1829 callpat
+= GET(callpat
, 1);
1831 while (*callpat
== OP_ALT
);
1833 DPRINTF(("Recursion didn't match\n"));
1834 md
->recursive
= new_recursive
.prevrec
;
1835 if (new_recursive
.offset_save
!= stacksave
)
1836 (PUBL(free
))(new_recursive
.offset_save
);
1837 RRETURN(MATCH_NOMATCH
);
1843 /* An alternation is the end of a branch; scan along to find the end of the
1844 bracketed group and go to there. */
1847 do ecode
+= GET(ecode
,1); while (*ecode
== OP_ALT
);
1850 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1851 indicating that it may occur zero times. It may repeat infinitely, or not
1852 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1853 with fixed upper repeat limits are compiled as a number of copies, with the
1854 optional ones preceded by BRAZERO or BRAMINZERO. */
1858 RMATCH(eptr
, next
, offset_top
, md
, eptrb
, RM10
);
1859 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
1860 do next
+= GET(next
, 1); while (*next
== OP_ALT
);
1861 ecode
= next
+ 1 + LINK_SIZE
;
1866 do next
+= GET(next
, 1); while (*next
== OP_ALT
);
1867 RMATCH(eptr
, next
+ 1+LINK_SIZE
, offset_top
, md
, eptrb
, RM11
);
1868 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
1874 do next
+= GET(next
,1); while (*next
== OP_ALT
);
1875 ecode
= next
+ 1 + LINK_SIZE
;
1878 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1879 here; just jump to the group, with allow_zero set TRUE. */
1884 if (op
== OP_CBRAPOS
|| op
== OP_SCBRAPOS
) goto POSSESSIVE_CAPTURE
;
1885 goto POSSESSIVE_NON_CAPTURE
;
1887 /* End of a group, repeated or non-repeating. */
1893 prev
= ecode
- GET(ecode
, 1);
1895 /* If this was a group that remembered the subject start, in order to break
1896 infinite repeats of empty string matches, retrieve the subject start from
1897 the chain. Otherwise, set it NULL. */
1899 if (*prev
>= OP_SBRA
|| *prev
== OP_ONCE
)
1901 saved_eptr
= eptrb
->epb_saved_eptr
; /* Value at start of group */
1902 eptrb
= eptrb
->epb_prev
; /* Backup to previous group */
1904 else saved_eptr
= NULL
;
1906 /* If we are at the end of an assertion group or a non-capturing atomic
1907 group, stop matching and return MATCH_MATCH, but record the current high
1908 water mark for use by positive assertions. We also need to record the match
1909 start in case it was changed by \K. */
1911 if ((*prev
>= OP_ASSERT
&& *prev
<= OP_ASSERTBACK_NOT
) ||
1912 *prev
== OP_ONCE_NC
)
1914 md
->end_match_ptr
= eptr
; /* For ONCE_NC */
1915 md
->end_offset_top
= offset_top
;
1916 md
->start_match_ptr
= mstart
;
1917 RRETURN(MATCH_MATCH
); /* Sets md->mark */
1920 /* For capturing groups we have to check the group number back at the start
1921 and if necessary complete handling an extraction by setting the offsets and
1922 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1923 into group 0, so it won't be picked up here. Instead, we catch it when the
1924 OP_END is reached. Other recursion is handled here. We just have to record
1925 the current subject position and start match pointer and give a MATCH
1928 if (*prev
== OP_CBRA
|| *prev
== OP_SCBRA
||
1929 *prev
== OP_CBRAPOS
|| *prev
== OP_SCBRAPOS
)
1931 number
= GET2(prev
, 1+LINK_SIZE
);
1932 offset
= number
<< 1;
1935 printf("end bracket %d", number
);
1939 /* Handle a recursively called group. */
1941 if (md
->recursive
!= NULL
&& md
->recursive
->group_num
== number
)
1943 md
->end_match_ptr
= eptr
;
1944 md
->start_match_ptr
= mstart
;
1945 RRETURN(MATCH_MATCH
);
1948 /* Deal with capturing */
1950 md
->capture_last
= number
;
1951 if (offset
>= md
->offset_max
) md
->offset_overflow
= TRUE
; else
1953 /* If offset is greater than offset_top, it means that we are
1954 "skipping" a capturing group, and that group's offsets must be marked
1955 unset. In earlier versions of PCRE, all the offsets were unset at the
1956 start of matching, but this doesn't work because atomic groups and
1957 assertions can cause a value to be set that should later be unset.
1958 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1959 part of the atomic group, but this is not on the final matching path,
1960 so must be unset when 2 is set. (If there is no group 2, there is no
1961 problem, because offset_top will then be 2, indicating no capture.) */
1963 if (offset
> offset_top
)
1965 register int *iptr
= md
->offset_vector
+ offset_top
;
1966 register int *iend
= md
->offset_vector
+ offset
;
1967 while (iptr
< iend
) *iptr
++ = -1;
1970 /* Now make the extraction */
1972 md
->offset_vector
[offset
] =
1973 md
->offset_vector
[md
->offset_end
- number
];
1974 md
->offset_vector
[offset
+1] = (int)(eptr
- md
->start_subject
);
1975 if (offset_top
<= offset
) offset_top
= offset
+ 2;
1979 /* For an ordinary non-repeating ket, just continue at this level. This
1980 also happens for a repeating ket if no characters were matched in the
1981 group. This is the forcible breaking of infinite loops as implemented in
1982 Perl 5.005. For a non-repeating atomic group that includes captures,
1983 establish a backup point by processing the rest of the pattern at a lower
1984 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1985 original OP_ONCE level, thereby bypassing intermediate backup points, but
1986 resetting any captures that happened along the way. */
1988 if (*ecode
== OP_KET
|| eptr
== saved_eptr
)
1990 if (*prev
== OP_ONCE
)
1992 RMATCH(eptr
, ecode
+ 1 + LINK_SIZE
, offset_top
, md
, eptrb
, RM12
);
1993 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
1994 md
->once_target
= prev
; /* Level at which to change to MATCH_NOMATCH */
1995 RRETURN(MATCH_ONCE
);
1997 ecode
+= 1 + LINK_SIZE
; /* Carry on at this level */
2001 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2002 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2003 at a time from the outer level, thus saving stack. */
2005 if (*ecode
== OP_KETRPOS
)
2007 md
->end_match_ptr
= eptr
;
2008 md
->end_offset_top
= offset_top
;
2009 RRETURN(MATCH_KETRPOS
);
2012 /* The normal repeating kets try the rest of the pattern or restart from
2013 the preceding bracket, in the appropriate order. In the second case, we can
2014 use tail recursion to avoid using another stack frame, unless we have an
2015 atomic group or an unlimited repeat of a group that can match an empty
2018 if (*ecode
== OP_KETRMIN
)
2020 RMATCH(eptr
, ecode
+ 1 + LINK_SIZE
, offset_top
, md
, eptrb
, RM7
);
2021 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
2022 if (*prev
== OP_ONCE
)
2024 RMATCH(eptr
, prev
, offset_top
, md
, eptrb
, RM8
);
2025 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
2026 md
->once_target
= prev
; /* Level at which to change to MATCH_NOMATCH */
2027 RRETURN(MATCH_ONCE
);
2029 if (*prev
>= OP_SBRA
) /* Could match an empty string */
2031 RMATCH(eptr
, prev
, offset_top
, md
, eptrb
, RM50
);
2037 else /* OP_KETRMAX */
2039 RMATCH(eptr
, prev
, offset_top
, md
, eptrb
, RM13
);
2040 if (rrc
== MATCH_ONCE
&& md
->once_target
== prev
) rrc
= MATCH_NOMATCH
;
2041 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
2042 if (*prev
== OP_ONCE
)
2044 RMATCH(eptr
, ecode
+ 1 + LINK_SIZE
, offset_top
, md
, eptrb
, RM9
);
2045 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
2046 md
->once_target
= prev
;
2047 RRETURN(MATCH_ONCE
);
2049 ecode
+= 1 + LINK_SIZE
;
2052 /* Control never gets here */
2054 /* Not multiline mode: start of subject assertion, unless notbol. */
2057 if (md
->notbol
&& eptr
== md
->start_subject
) RRETURN(MATCH_NOMATCH
);
2059 /* Start of subject assertion */
2062 if (eptr
!= md
->start_subject
) RRETURN(MATCH_NOMATCH
);
2066 /* Multiline mode: start of subject unless notbol, or after any newline. */
2069 if (md
->notbol
&& eptr
== md
->start_subject
) RRETURN(MATCH_NOMATCH
);
2070 if (eptr
!= md
->start_subject
&&
2071 (eptr
== md
->end_subject
|| !WAS_NEWLINE(eptr
)))
2072 RRETURN(MATCH_NOMATCH
);
2076 /* Start of match assertion */
2079 if (eptr
!= md
->start_subject
+ md
->start_offset
) RRETURN(MATCH_NOMATCH
);
2083 /* Reset the start of match point */
2090 /* Multiline mode: assert before any newline, or before end of subject
2091 unless noteol is set. */
2094 if (eptr
< md
->end_subject
)
2096 if (!IS_NEWLINE(eptr
))
2098 if (md
->partial
!= 0 &&
2099 eptr
+ 1 >= md
->end_subject
&&
2100 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
2101 NLBLOCK
->nllen
== 2 &&
2102 RAWUCHARTEST(eptr
) == NLBLOCK
->nl
[0])
2105 if (md
->partial
> 1) RRETURN(PCRE_ERROR_PARTIAL
);
2107 RRETURN(MATCH_NOMATCH
);
2112 if (md
->noteol
) RRETURN(MATCH_NOMATCH
);
2118 /* Not multiline mode: assert before a terminating newline or before end of
2119 subject unless noteol is set. */
2122 if (md
->noteol
) RRETURN(MATCH_NOMATCH
);
2123 if (!md
->endonly
) goto ASSERT_NL_OR_EOS
;
2125 /* ... else fall through for endonly */
2127 /* End of subject assertion (\z) */
2130 if (eptr
< md
->end_subject
) RRETURN(MATCH_NOMATCH
);
2135 /* End of subject or ending \n assertion (\Z) */
2139 if (eptr
< md
->end_subject
&&
2140 (!IS_NEWLINE(eptr
) || eptr
!= md
->end_subject
- md
->nllen
))
2142 if (md
->partial
!= 0 &&
2143 eptr
+ 1 >= md
->end_subject
&&
2144 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
2145 NLBLOCK
->nllen
== 2 &&
2146 RAWUCHARTEST(eptr
) == NLBLOCK
->nl
[0])
2149 if (md
->partial
> 1) RRETURN(PCRE_ERROR_PARTIAL
);
2151 RRETURN(MATCH_NOMATCH
);
2154 /* Either at end of string or \n before end. */
2160 /* Word boundary assertions */
2162 case OP_NOT_WORD_BOUNDARY
:
2163 case OP_WORD_BOUNDARY
:
2166 /* Find out if the previous and current characters are "word" characters.
2167 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2168 be "non-word" characters. Remember the earliest consulted character for
2169 partial matching. */
2174 /* Get status of previous character */
2176 if (eptr
== md
->start_subject
) prev_is_word
= FALSE
; else
2178 PCRE_PUCHAR lastptr
= eptr
- 1;
2180 if (lastptr
< md
->start_used_ptr
) md
->start_used_ptr
= lastptr
;
2181 GETCHAR(c
, lastptr
);
2185 if (c
== '_') prev_is_word
= TRUE
; else
2187 int cat
= UCD_CATEGORY(c
);
2188 prev_is_word
= (cat
== ucp_L
|| cat
== ucp_N
);
2193 prev_is_word
= c
< 256 && (md
->ctypes
[c
] & ctype_word
) != 0;
2196 /* Get status of next character */
2198 if (eptr
>= md
->end_subject
)
2201 cur_is_word
= FALSE
;
2209 if (c
== '_') cur_is_word
= TRUE
; else
2211 int cat
= UCD_CATEGORY(c
);
2212 cur_is_word
= (cat
== ucp_L
|| cat
== ucp_N
);
2217 cur_is_word
= c
< 256 && (md
->ctypes
[c
] & ctype_word
) != 0;
2223 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2224 consistency with the behaviour of \w we do use it in this case. */
2227 /* Get status of previous character */
2229 if (eptr
== md
->start_subject
) prev_is_word
= FALSE
; else
2231 if (eptr
<= md
->start_used_ptr
) md
->start_used_ptr
= eptr
- 1;
2236 if (c
== '_') prev_is_word
= TRUE
; else
2238 int cat
= UCD_CATEGORY(c
);
2239 prev_is_word
= (cat
== ucp_L
|| cat
== ucp_N
);
2244 prev_is_word
= MAX_255(eptr
[-1])
2245 && ((md
->ctypes
[eptr
[-1]] & ctype_word
) != 0);
2248 /* Get status of next character */
2250 if (eptr
>= md
->end_subject
)
2253 cur_is_word
= FALSE
;
2260 if (c
== '_') cur_is_word
= TRUE
; else
2262 int cat
= UCD_CATEGORY(c
);
2263 cur_is_word
= (cat
== ucp_L
|| cat
== ucp_N
);
2268 cur_is_word
= MAX_255(*eptr
)
2269 && ((md
->ctypes
[*eptr
] & ctype_word
) != 0);
2272 /* Now see if the situation is what we want */
2274 if ((*ecode
++ == OP_WORD_BOUNDARY
)?
2275 cur_is_word
== prev_is_word
: cur_is_word
!= prev_is_word
)
2276 RRETURN(MATCH_NOMATCH
);
2280 /* Match any single character type except newline; have to take care with
2281 CRLF newlines and partial matching. */
2284 if (IS_NEWLINE(eptr
)) RRETURN(MATCH_NOMATCH
);
2285 if (md
->partial
!= 0 &&
2286 eptr
+ 1 >= md
->end_subject
&&
2287 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
2288 NLBLOCK
->nllen
== 2 &&
2289 RAWUCHARTEST(eptr
) == NLBLOCK
->nl
[0])
2292 if (md
->partial
> 1) RRETURN(PCRE_ERROR_PARTIAL
);
2297 /* Match any single character whatsoever. */
2300 if (eptr
>= md
->end_subject
) /* DO NOT merge the eptr++ here; it must */
2301 { /* not be updated before SCHECK_PARTIAL. */
2303 RRETURN(MATCH_NOMATCH
);
2307 if (utf
) ACROSSCHAR(eptr
< md
->end_subject
, *eptr
, eptr
++);
2312 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2313 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2316 if (eptr
>= md
->end_subject
) /* DO NOT merge the eptr++ here; it must */
2317 { /* not be updated before SCHECK_PARTIAL. */
2319 RRETURN(MATCH_NOMATCH
);
2326 if (eptr
>= md
->end_subject
)
2329 RRETURN(MATCH_NOMATCH
);
2331 GETCHARINCTEST(c
, eptr
);
2333 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2336 (md
->ctypes
[c
] & ctype_digit
) != 0
2338 RRETURN(MATCH_NOMATCH
);
2343 if (eptr
>= md
->end_subject
)
2346 RRETURN(MATCH_NOMATCH
);
2348 GETCHARINCTEST(c
, eptr
);
2350 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2353 (md
->ctypes
[c
] & ctype_digit
) == 0
2355 RRETURN(MATCH_NOMATCH
);
2359 case OP_NOT_WHITESPACE
:
2360 if (eptr
>= md
->end_subject
)
2363 RRETURN(MATCH_NOMATCH
);
2365 GETCHARINCTEST(c
, eptr
);
2367 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2370 (md
->ctypes
[c
] & ctype_space
) != 0
2372 RRETURN(MATCH_NOMATCH
);
2377 if (eptr
>= md
->end_subject
)
2380 RRETURN(MATCH_NOMATCH
);
2382 GETCHARINCTEST(c
, eptr
);
2384 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2387 (md
->ctypes
[c
] & ctype_space
) == 0
2389 RRETURN(MATCH_NOMATCH
);
2393 case OP_NOT_WORDCHAR
:
2394 if (eptr
>= md
->end_subject
)
2397 RRETURN(MATCH_NOMATCH
);
2399 GETCHARINCTEST(c
, eptr
);
2401 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2404 (md
->ctypes
[c
] & ctype_word
) != 0
2406 RRETURN(MATCH_NOMATCH
);
2411 if (eptr
>= md
->end_subject
)
2414 RRETURN(MATCH_NOMATCH
);
2416 GETCHARINCTEST(c
, eptr
);
2418 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2421 (md
->ctypes
[c
] & ctype_word
) == 0
2423 RRETURN(MATCH_NOMATCH
);
2428 if (eptr
>= md
->end_subject
)
2431 RRETURN(MATCH_NOMATCH
);
2433 GETCHARINCTEST(c
, eptr
);
2436 default: RRETURN(MATCH_NOMATCH
);
2439 if (eptr
>= md
->end_subject
)
2443 else if (RAWUCHARTEST(eptr
) == CHAR_LF
) eptr
++;
2455 #endif /* Not EBCDIC */
2456 if (md
->bsr_anycrlf
) RRETURN(MATCH_NOMATCH
);
2463 if (eptr
>= md
->end_subject
)
2466 RRETURN(MATCH_NOMATCH
);
2468 GETCHARINCTEST(c
, eptr
);
2471 HSPACE_CASES
: RRETURN(MATCH_NOMATCH
); /* Byte and multibyte cases */
2478 if (eptr
>= md
->end_subject
)
2481 RRETURN(MATCH_NOMATCH
);
2483 GETCHARINCTEST(c
, eptr
);
2486 HSPACE_CASES
: break; /* Byte and multibyte cases */
2487 default: RRETURN(MATCH_NOMATCH
);
2493 if (eptr
>= md
->end_subject
)
2496 RRETURN(MATCH_NOMATCH
);
2498 GETCHARINCTEST(c
, eptr
);
2501 VSPACE_CASES
: RRETURN(MATCH_NOMATCH
);
2508 if (eptr
>= md
->end_subject
)
2511 RRETURN(MATCH_NOMATCH
);
2513 GETCHARINCTEST(c
, eptr
);
2516 VSPACE_CASES
: break;
2517 default: RRETURN(MATCH_NOMATCH
);
2523 /* Check the next character by Unicode property. We will get here only
2524 if the support is in the binary; otherwise a compile-time error occurs. */
2528 if (eptr
>= md
->end_subject
)
2531 RRETURN(MATCH_NOMATCH
);
2533 GETCHARINCTEST(c
, eptr
);
2535 const pcre_uint32
*cp
;
2536 const ucd_record
*prop
= GET_UCD(c
);
2541 if (op
== OP_NOTPROP
) RRETURN(MATCH_NOMATCH
);
2545 if ((prop
->chartype
== ucp_Lu
||
2546 prop
->chartype
== ucp_Ll
||
2547 prop
->chartype
== ucp_Lt
) == (op
== OP_NOTPROP
))
2548 RRETURN(MATCH_NOMATCH
);
2552 if ((ecode
[2] != PRIV(ucp_gentype
)[prop
->chartype
]) == (op
== OP_PROP
))
2553 RRETURN(MATCH_NOMATCH
);
2557 if ((ecode
[2] != prop
->chartype
) == (op
== OP_PROP
))
2558 RRETURN(MATCH_NOMATCH
);
2562 if ((ecode
[2] != prop
->script
) == (op
== OP_PROP
))
2563 RRETURN(MATCH_NOMATCH
);
2566 /* These are specials */
2569 if ((PRIV(ucp_gentype
)[prop
->chartype
] == ucp_L
||
2570 PRIV(ucp_gentype
)[prop
->chartype
] == ucp_N
) == (op
== OP_NOTPROP
))
2571 RRETURN(MATCH_NOMATCH
);
2574 case PT_SPACE
: /* Perl space */
2575 if ((PRIV(ucp_gentype
)[prop
->chartype
] == ucp_Z
||
2576 c
== CHAR_HT
|| c
== CHAR_NL
|| c
== CHAR_FF
|| c
== CHAR_CR
)
2577 == (op
== OP_NOTPROP
))
2578 RRETURN(MATCH_NOMATCH
);
2581 case PT_PXSPACE
: /* POSIX space */
2582 if ((PRIV(ucp_gentype
)[prop
->chartype
] == ucp_Z
||
2583 c
== CHAR_HT
|| c
== CHAR_NL
|| c
== CHAR_VT
||
2584 c
== CHAR_FF
|| c
== CHAR_CR
)
2585 == (op
== OP_NOTPROP
))
2586 RRETURN(MATCH_NOMATCH
);
2590 if ((PRIV(ucp_gentype
)[prop
->chartype
] == ucp_L
||
2591 PRIV(ucp_gentype
)[prop
->chartype
] == ucp_N
||
2592 c
== CHAR_UNDERSCORE
) == (op
== OP_NOTPROP
))
2593 RRETURN(MATCH_NOMATCH
);
2597 cp
= PRIV(ucd_caseless_sets
) + ecode
[2];
2601 { if (op
== OP_PROP
) { RRETURN(MATCH_NOMATCH
); } else break; }
2603 { if (op
== OP_PROP
) break; else { RRETURN(MATCH_NOMATCH
); } }
2607 /* This should never occur */
2610 RRETURN(PCRE_ERROR_INTERNAL
);
2617 /* Match an extended Unicode sequence. We will get here only if the support
2618 is in the binary; otherwise a compile-time error occurs. */
2621 if (eptr
>= md
->end_subject
)
2624 RRETURN(MATCH_NOMATCH
);
2629 GETCHARINCTEST(c
, eptr
);
2630 lgb
= UCD_GRAPHBREAK(c
);
2631 while (eptr
< md
->end_subject
)
2634 if (!utf
) c
= *eptr
; else { GETCHARLEN(c
, eptr
, len
); }
2635 rgb
= UCD_GRAPHBREAK(c
);
2636 if ((PRIV(ucp_gbtable
)[lgb
] & (1 << rgb
)) == 0) break;
2644 #endif /* SUPPORT_UCP */
2647 /* Match a back reference, possibly repeatedly. Look past the end of the
2648 item to see if there is repeat information following. The code is similar
2649 to that for character classes, but repeated for efficiency. Then obey
2650 similar code to character type repeats - written out again for speed.
2651 However, if the referenced string is the empty string, always treat
2652 it as matched, any number of times (otherwise there could be infinite
2657 caseless
= op
== OP_REFI
;
2658 offset
= GET2(ecode
, 1) << 1; /* Doubled ref number */
2659 ecode
+= 1 + IMM2_SIZE
;
2661 /* If the reference is unset, there are two possibilities:
2663 (a) In the default, Perl-compatible state, set the length negative;
2664 this ensures that every attempt at a match fails. We can't just fail
2665 here, because of the possibility of quantifiers with zero minima.
2667 (b) If the JavaScript compatibility flag is set, set the length to zero
2668 so that the back reference matches an empty string.
2670 Otherwise, set the length to the length of what was matched by the
2671 referenced subpattern. */
2673 if (offset
>= offset_top
|| md
->offset_vector
[offset
] < 0)
2674 length
= (md
->jscript_compat
)? 0 : -1;
2676 length
= md
->offset_vector
[offset
+1] - md
->offset_vector
[offset
];
2678 /* Set up for repetition, or handle the non-repeated case */
2688 c
= *ecode
++ - OP_CRSTAR
;
2689 minimize
= (c
& 1) != 0;
2690 min
= rep_min
[c
]; /* Pick up values from tables; */
2691 max
= rep_max
[c
]; /* zero for max => infinity */
2692 if (max
== 0) max
= INT_MAX
;
2697 minimize
= (*ecode
== OP_CRMINRANGE
);
2698 min
= GET2(ecode
, 1);
2699 max
= GET2(ecode
, 1 + IMM2_SIZE
);
2700 if (max
== 0) max
= INT_MAX
;
2701 ecode
+= 1 + 2 * IMM2_SIZE
;
2704 default: /* No repeat follows */
2705 if ((length
= match_ref(offset
, eptr
, length
, md
, caseless
)) < 0)
2707 if (length
== -2) eptr
= md
->end_subject
; /* Partial match */
2709 RRETURN(MATCH_NOMATCH
);
2712 continue; /* With the main loop */
2715 /* Handle repeated back references. If the length of the reference is
2716 zero, just continue with the main loop. If the length is negative, it
2717 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2718 zero, we can continue at the same level without recursion. For any other
2719 minimum, carrying on will result in NOMATCH. */
2721 if (length
== 0) continue;
2722 if (length
< 0 && min
== 0) continue;
2724 /* First, ensure the minimum number of matches are present. We get back
2725 the length of the reference string explicitly rather than passing the
2726 address of eptr, so that eptr can be a register variable. */
2728 for (i
= 1; i
<= min
; i
++)
2731 if ((slength
= match_ref(offset
, eptr
, length
, md
, caseless
)) < 0)
2733 if (slength
== -2) eptr
= md
->end_subject
; /* Partial match */
2735 RRETURN(MATCH_NOMATCH
);
2740 /* If min = max, continue at the same level without recursion.
2741 They are not both allowed to be zero. */
2743 if (min
== max
) continue;
2745 /* If minimizing, keep trying and advancing the pointer */
2749 for (fi
= min
;; fi
++)
2752 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM14
);
2753 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
2754 if (fi
>= max
) RRETURN(MATCH_NOMATCH
);
2755 if ((slength
= match_ref(offset
, eptr
, length
, md
, caseless
)) < 0)
2757 if (slength
== -2) eptr
= md
->end_subject
; /* Partial match */
2759 RRETURN(MATCH_NOMATCH
);
2763 /* Control never gets here */
2766 /* If maximizing, find the longest string and work backwards */
2771 for (i
= min
; i
< max
; i
++)
2774 if ((slength
= match_ref(offset
, eptr
, length
, md
, caseless
)) < 0)
2776 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2777 the soft partial matching case. */
2779 if (slength
== -2 && md
->partial
!= 0 &&
2780 md
->end_subject
> md
->start_used_ptr
)
2783 if (md
->partial
> 1) RRETURN(PCRE_ERROR_PARTIAL
);
2792 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM15
);
2793 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
2796 RRETURN(MATCH_NOMATCH
);
2798 /* Control never gets here */
2800 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2801 used when all the characters in the class have values in the range 0-255,
2802 and either the matching is caseful, or the characters are in the range
2803 0-127 when UTF-8 processing is enabled. The only difference between
2804 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2807 First, look past the end of the item to see if there is repeat information
2808 following. Then obey similar code to character type repeats - written out
2814 /* The data variable is saved across frames, so the byte map needs to
2816 #define BYTE_MAP ((pcre_uint8 *)data)
2817 data
= ecode
+ 1; /* Save for matching */
2818 ecode
+= 1 + (32 / sizeof(pcre_uchar
)); /* Advance past the item */
2828 c
= *ecode
++ - OP_CRSTAR
;
2829 minimize
= (c
& 1) != 0;
2830 min
= rep_min
[c
]; /* Pick up values from tables; */
2831 max
= rep_max
[c
]; /* zero for max => infinity */
2832 if (max
== 0) max
= INT_MAX
;
2837 minimize
= (*ecode
== OP_CRMINRANGE
);
2838 min
= GET2(ecode
, 1);
2839 max
= GET2(ecode
, 1 + IMM2_SIZE
);
2840 if (max
== 0) max
= INT_MAX
;
2841 ecode
+= 1 + 2 * IMM2_SIZE
;
2844 default: /* No repeat follows */
2849 /* First, ensure the minimum number of matches are present. */
2854 for (i
= 1; i
<= min
; i
++)
2856 if (eptr
>= md
->end_subject
)
2859 RRETURN(MATCH_NOMATCH
);
2861 GETCHARINC(c
, eptr
);
2864 if (op
== OP_CLASS
) RRETURN(MATCH_NOMATCH
);
2867 if ((BYTE_MAP
[c
/8] & (1 << (c
&7))) == 0) RRETURN(MATCH_NOMATCH
);
2874 for (i
= 1; i
<= min
; i
++)
2876 if (eptr
>= md
->end_subject
)
2879 RRETURN(MATCH_NOMATCH
);
2882 #ifndef COMPILE_PCRE8
2885 if (op
== OP_CLASS
) RRETURN(MATCH_NOMATCH
);
2889 if ((BYTE_MAP
[c
/8] & (1 << (c
&7))) == 0) RRETURN(MATCH_NOMATCH
);
2893 /* If max == min we can continue with the main loop without the
2896 if (min
== max
) continue;
2898 /* If minimizing, keep testing the rest of the expression and advancing
2899 the pointer while it matches the class. */
2906 for (fi
= min
;; fi
++)
2908 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM16
);
2909 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
2910 if (fi
>= max
) RRETURN(MATCH_NOMATCH
);
2911 if (eptr
>= md
->end_subject
)
2914 RRETURN(MATCH_NOMATCH
);
2916 GETCHARINC(c
, eptr
);
2919 if (op
== OP_CLASS
) RRETURN(MATCH_NOMATCH
);
2922 if ((BYTE_MAP
[c
/8] & (1 << (c
&7))) == 0) RRETURN(MATCH_NOMATCH
);
2929 for (fi
= min
;; fi
++)
2931 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM17
);
2932 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
2933 if (fi
>= max
) RRETURN(MATCH_NOMATCH
);
2934 if (eptr
>= md
->end_subject
)
2937 RRETURN(MATCH_NOMATCH
);
2940 #ifndef COMPILE_PCRE8
2943 if (op
== OP_CLASS
) RRETURN(MATCH_NOMATCH
);
2947 if ((BYTE_MAP
[c
/8] & (1 << (c
&7))) == 0) RRETURN(MATCH_NOMATCH
);
2950 /* Control never gets here */
2953 /* If maximizing, find the longest possible run, then work backwards. */
2962 for (i
= min
; i
< max
; i
++)
2965 if (eptr
>= md
->end_subject
)
2970 GETCHARLEN(c
, eptr
, len
);
2973 if (op
== OP_CLASS
) break;
2976 if ((BYTE_MAP
[c
/8] & (1 << (c
&7))) == 0) break;
2981 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM18
);
2982 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
2983 if (eptr
-- == pp
) break; /* Stop if tried at original pos */
2991 for (i
= min
; i
< max
; i
++)
2993 if (eptr
>= md
->end_subject
)
2999 #ifndef COMPILE_PCRE8
3002 if (op
== OP_CLASS
) break;
3006 if ((BYTE_MAP
[c
/8] & (1 << (c
&7))) == 0) break;
3011 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM19
);
3012 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
3017 RRETURN(MATCH_NOMATCH
);
3021 /* Control never gets here */
3024 /* Match an extended character class. This opcode is encountered only
3025 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3026 mode, because Unicode properties are supported in non-UTF-8 mode. */
3028 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3031 data
= ecode
+ 1 + LINK_SIZE
; /* Save for matching */
3032 ecode
+= GET(ecode
, 1); /* Advance past the item */
3042 c
= *ecode
++ - OP_CRSTAR
;
3043 minimize
= (c
& 1) != 0;
3044 min
= rep_min
[c
]; /* Pick up values from tables; */
3045 max
= rep_max
[c
]; /* zero for max => infinity */
3046 if (max
== 0) max
= INT_MAX
;
3051 minimize
= (*ecode
== OP_CRMINRANGE
);
3052 min
= GET2(ecode
, 1);
3053 max
= GET2(ecode
, 1 + IMM2_SIZE
);
3054 if (max
== 0) max
= INT_MAX
;
3055 ecode
+= 1 + 2 * IMM2_SIZE
;
3058 default: /* No repeat follows */
3063 /* First, ensure the minimum number of matches are present. */
3065 for (i
= 1; i
<= min
; i
++)
3067 if (eptr
>= md
->end_subject
)
3070 RRETURN(MATCH_NOMATCH
);
3072 GETCHARINCTEST(c
, eptr
);
3073 if (!PRIV(xclass
)(c
, data
, utf
)) RRETURN(MATCH_NOMATCH
);
3076 /* If max == min we can continue with the main loop without the
3079 if (min
== max
) continue;
3081 /* If minimizing, keep testing the rest of the expression and advancing
3082 the pointer while it matches the class. */
3086 for (fi
= min
;; fi
++)
3088 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM20
);
3089 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
3090 if (fi
>= max
) RRETURN(MATCH_NOMATCH
);
3091 if (eptr
>= md
->end_subject
)
3094 RRETURN(MATCH_NOMATCH
);
3096 GETCHARINCTEST(c
, eptr
);
3097 if (!PRIV(xclass
)(c
, data
, utf
)) RRETURN(MATCH_NOMATCH
);
3099 /* Control never gets here */
3102 /* If maximizing, find the longest possible run, then work backwards. */
3107 for (i
= min
; i
< max
; i
++)
3110 if (eptr
>= md
->end_subject
)
3116 GETCHARLENTEST(c
, eptr
, len
);
3120 if (!PRIV(xclass
)(c
, data
, utf
)) break;
3125 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM21
);
3126 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
3127 if (eptr
-- == pp
) break; /* Stop if tried at original pos */
3129 if (utf
) BACKCHAR(eptr
);
3132 RRETURN(MATCH_NOMATCH
);
3135 /* Control never gets here */
3137 #endif /* End of XCLASS */
3139 /* Match a single character, casefully */
3147 GETCHARLEN(fc
, ecode
, length
);
3148 if (length
> md
->end_subject
- eptr
)
3150 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3151 RRETURN(MATCH_NOMATCH
);
3153 while (length
-- > 0) if (*ecode
++ != RAWUCHARINC(eptr
)) RRETURN(MATCH_NOMATCH
);
3159 if (md
->end_subject
- eptr
< 1)
3161 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3162 RRETURN(MATCH_NOMATCH
);
3164 if (ecode
[1] != *eptr
++) RRETURN(MATCH_NOMATCH
);
3169 /* Match a single character, caselessly. If we are at the end of the
3170 subject, give up immediately. */
3173 if (eptr
>= md
->end_subject
)
3176 RRETURN(MATCH_NOMATCH
);
3184 GETCHARLEN(fc
, ecode
, length
);
3186 /* If the pattern character's value is < 128, we have only one byte, and
3187 we know that its other case must also be one byte long, so we can use the
3188 fast lookup table. We know that there is at least one byte left in the
3193 pcre_uchar cc
= RAWUCHAR(eptr
);
3194 if (md
->lcc
[fc
] != TABLE_GET(cc
, md
->lcc
, cc
)) RRETURN(MATCH_NOMATCH
);
3199 /* Otherwise we must pick up the subject character. Note that we cannot
3200 use the value of "length" to check for sufficient bytes left, because the
3201 other case of the character may have more or fewer bytes. */
3206 GETCHARINC(dc
, eptr
);
3209 /* If we have Unicode property support, we can use it to test the other
3210 case of the character, if there is one. */
3215 if (dc
!= UCD_OTHERCASE(fc
))
3217 RRETURN(MATCH_NOMATCH
);
3222 #endif /* SUPPORT_UTF */
3226 if (TABLE_GET(ecode
[1], md
->lcc
, ecode
[1])
3227 != TABLE_GET(*eptr
, md
->lcc
, *eptr
)) RRETURN(MATCH_NOMATCH
);
3233 /* Match a single character repeatedly. */
3237 min
= max
= GET2(ecode
, 1);
3238 ecode
+= 1 + IMM2_SIZE
;
3251 max
= GET2(ecode
, 1);
3252 minimize
= *ecode
== OP_MINUPTO
|| *ecode
== OP_MINUPTOI
;
3253 ecode
+= 1 + IMM2_SIZE
;
3292 c
= *ecode
++ - ((op
< OP_STARI
)? OP_STAR
: OP_STARI
);
3293 minimize
= (c
& 1) != 0;
3294 min
= rep_min
[c
]; /* Pick up values from tables; */
3295 max
= rep_max
[c
]; /* zero for max => infinity */
3296 if (max
== 0) max
= INT_MAX
;
3298 /* Common code for all repeated single-character matches. */
3306 GETCHARLEN(fc
, ecode
, length
);
3309 /* Handle multibyte character matching specially here. There is
3310 support for caseless matching if UCP support is present. */
3315 pcre_uint32 othercase
;
3316 if (op
>= OP_STARI
&& /* Caseless */
3317 (othercase
= UCD_OTHERCASE(fc
)) != fc
)
3318 oclength
= PRIV(ord2utf
)(othercase
, occhars
);
3320 #endif /* SUPPORT_UCP */
3322 for (i
= 1; i
<= min
; i
++)
3324 if (eptr
<= md
->end_subject
- length
&&
3325 memcmp(eptr
, charptr
, IN_UCHARS(length
)) == 0) eptr
+= length
;
3327 else if (oclength
> 0 &&
3328 eptr
<= md
->end_subject
- oclength
&&
3329 memcmp(eptr
, occhars
, IN_UCHARS(oclength
)) == 0) eptr
+= oclength
;
3330 #endif /* SUPPORT_UCP */
3334 RRETURN(MATCH_NOMATCH
);
3338 if (min
== max
) continue;
3342 for (fi
= min
;; fi
++)
3344 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM22
);
3345 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
3346 if (fi
>= max
) RRETURN(MATCH_NOMATCH
);
3347 if (eptr
<= md
->end_subject
- length
&&
3348 memcmp(eptr
, charptr
, IN_UCHARS(length
)) == 0) eptr
+= length
;
3350 else if (oclength
> 0 &&
3351 eptr
<= md
->end_subject
- oclength
&&
3352 memcmp(eptr
, occhars
, IN_UCHARS(oclength
)) == 0) eptr
+= oclength
;
3353 #endif /* SUPPORT_UCP */
3357 RRETURN(MATCH_NOMATCH
);
3360 /* Control never gets here */
3366 for (i
= min
; i
< max
; i
++)
3368 if (eptr
<= md
->end_subject
- length
&&
3369 memcmp(eptr
, charptr
, IN_UCHARS(length
)) == 0) eptr
+= length
;
3371 else if (oclength
> 0 &&
3372 eptr
<= md
->end_subject
- oclength
&&
3373 memcmp(eptr
, occhars
, IN_UCHARS(oclength
)) == 0) eptr
+= oclength
;
3374 #endif /* SUPPORT_UCP */
3382 if (possessive
) continue;
3386 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM23
);
3387 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
3388 if (eptr
== pp
) { RRETURN(MATCH_NOMATCH
); }
3392 #else /* without SUPPORT_UCP */
3394 #endif /* SUPPORT_UCP */
3397 /* Control never gets here */
3400 /* If the length of a UTF-8 character is 1, we fall through here, and
3401 obey the code as for non-UTF-8 characters below, though in this case the
3402 value of fc will always be < 128. */
3405 #endif /* SUPPORT_UTF */
3406 /* When not in UTF-8 mode, load a single-byte character. */
3409 /* The value of fc at this point is always one character, though we may
3410 or may not be in UTF mode. The code is duplicated for the caseless and
3411 caseful cases, for speed, since matching characters is likely to be quite
3412 common. First, ensure the minimum number of matches are present. If min =
3413 max, continue at the same level without recursing. Otherwise, if
3414 minimizing, keep trying the rest of the expression and advancing one
3415 matching character if failing, up to the maximum. Alternatively, if
3416 maximizing, find the maximum number of characters and work backwards. */
3418 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc
, min
, max
,
3419 max
, (char *)eptr
));
3421 if (op
>= OP_STARI
) /* Caseless */
3423 #ifdef COMPILE_PCRE8
3424 /* fc must be < 128 if UTF is enabled. */
3429 if (utf
&& fc
> 127)
3430 foc
= UCD_OTHERCASE(fc
);
3432 if (utf
&& fc
> 127)
3434 #endif /* SUPPORT_UCP */
3436 #endif /* SUPPORT_UTF */
3437 foc
= TABLE_GET(fc
, md
->fcc
, fc
);
3438 #endif /* COMPILE_PCRE8 */
3440 for (i
= 1; i
<= min
; i
++)
3444 if (eptr
>= md
->end_subject
)
3447 RRETURN(MATCH_NOMATCH
);
3449 cc
= RAWUCHARTEST(eptr
);
3450 if (fc
!= cc
&& foc
!= cc
) RRETURN(MATCH_NOMATCH
);
3453 if (min
== max
) continue;
3456 for (fi
= min
;; fi
++)
3460 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM24
);
3461 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
3462 if (fi
>= max
) RRETURN(MATCH_NOMATCH
);
3463 if (eptr
>= md
->end_subject
)
3466 RRETURN(MATCH_NOMATCH
);
3468 cc
= RAWUCHARTEST(eptr
);
3469 if (fc
!= cc
&& foc
!= cc
) RRETURN(MATCH_NOMATCH
);
3472 /* Control never gets here */
3477 for (i
= min
; i
< max
; i
++)
3481 if (eptr
>= md
->end_subject
)
3486 cc
= RAWUCHARTEST(eptr
);
3487 if (fc
!= cc
&& foc
!= cc
) break;
3491 if (possessive
) continue;
3495 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM25
);
3497 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
3499 RRETURN(MATCH_NOMATCH
);
3501 /* Control never gets here */
3504 /* Caseful comparisons (includes all multi-byte characters) */
3508 for (i
= 1; i
<= min
; i
++)
3510 if (eptr
>= md
->end_subject
)
3513 RRETURN(MATCH_NOMATCH
);
3515 if (fc
!= RAWUCHARINCTEST(eptr
)) RRETURN(MATCH_NOMATCH
);
3518 if (min
== max
) continue;
3522 for (fi
= min
;; fi
++)
3524 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM26
);
3525 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
3526 if (fi
>= max
) RRETURN(MATCH_NOMATCH
);
3527 if (eptr
>= md
->end_subject
)
3530 RRETURN(MATCH_NOMATCH
);
3532 if (fc
!= RAWUCHARINCTEST(eptr
)) RRETURN(MATCH_NOMATCH
);
3534 /* Control never gets here */
3539 for (i
= min
; i
< max
; i
++)
3541 if (eptr
>= md
->end_subject
)
3546 if (fc
!= RAWUCHARTEST(eptr
)) break;
3549 if (possessive
) continue;
3553 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM27
);
3555 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
3557 RRETURN(MATCH_NOMATCH
);
3560 /* Control never gets here */
3562 /* Match a negated single one-byte character. The character we are
3563 checking can be multibyte. */
3567 if (eptr
>= md
->end_subject
)
3570 RRETURN(MATCH_NOMATCH
);
3575 register pcre_uint32 ch
, och
;
3578 GETCHARINC(ch
, ecode
);
3579 GETCHARINC(c
, eptr
);
3583 if (ch
== c
) RRETURN(MATCH_NOMATCH
);
3589 och
= UCD_OTHERCASE(ch
);
3593 #endif /* SUPPORT_UCP */
3595 och
= TABLE_GET(ch
, md
->fcc
, ch
);
3596 if (ch
== c
|| och
== c
) RRETURN(MATCH_NOMATCH
);
3602 register pcre_uint32 ch
= ecode
[1];
3604 if (ch
== c
|| (op
== OP_NOTI
&& TABLE_GET(ch
, md
->fcc
, ch
) == c
))
3605 RRETURN(MATCH_NOMATCH
);
3610 /* Match a negated single one-byte character repeatedly. This is almost a
3611 repeat of the code for a repeated single character, but I haven't found a
3612 nice way of commoning these up that doesn't require a test of the
3613 positive/negative option for each character match. Maybe that wouldn't add
3614 very much to the time taken, but character matching *is* what this is all
3619 min
= max
= GET2(ecode
, 1);
3620 ecode
+= 1 + IMM2_SIZE
;
3626 case OP_NOTMINUPTOI
:
3628 max
= GET2(ecode
, 1);
3629 minimize
= *ecode
== OP_NOTMINUPTO
|| *ecode
== OP_NOTMINUPTOI
;
3630 ecode
+= 1 + IMM2_SIZE
;
3634 case OP_NOTPOSSTARI
:
3642 case OP_NOTPOSPLUSI
:
3649 case OP_NOTPOSQUERY
:
3650 case OP_NOTPOSQUERYI
:
3658 case OP_NOTPOSUPTOI
:
3661 max
= GET2(ecode
, 1);
3662 ecode
+= 1 + IMM2_SIZE
;
3668 case OP_NOTMINSTARI
:
3672 case OP_NOTMINPLUSI
:
3675 case OP_NOTMINQUERY
:
3676 case OP_NOTMINQUERYI
:
3677 c
= *ecode
++ - ((op
>= OP_NOTSTARI
)? OP_NOTSTARI
: OP_NOTSTAR
);
3678 minimize
= (c
& 1) != 0;
3679 min
= rep_min
[c
]; /* Pick up values from tables; */
3680 max
= rep_max
[c
]; /* zero for max => infinity */
3681 if (max
== 0) max
= INT_MAX
;
3683 /* Common code for all repeated single-byte matches. */
3686 GETCHARINCTEST(fc
, ecode
);
3688 /* The code is duplicated for the caseless and caseful cases, for speed,
3689 since matching characters is likely to be quite common. First, ensure the
3690 minimum number of matches are present. If min = max, continue at the same
3691 level without recursing. Otherwise, if minimizing, keep trying the rest of
3692 the expression and advancing one matching character if failing, up to the
3693 maximum. Alternatively, if maximizing, find the maximum number of
3694 characters and work backwards. */
3696 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc
, min
, max
,
3697 max
, (char *)eptr
));
3699 if (op
>= OP_NOTSTARI
) /* Caseless */
3703 if (utf
&& fc
> 127)
3704 foc
= UCD_OTHERCASE(fc
);
3706 if (utf
&& fc
> 127)
3708 #endif /* SUPPORT_UCP */
3710 #endif /* SUPPORT_UTF */
3711 foc
= TABLE_GET(fc
, md
->fcc
, fc
);
3716 register pcre_uint32 d
;
3717 for (i
= 1; i
<= min
; i
++)
3719 if (eptr
>= md
->end_subject
)
3722 RRETURN(MATCH_NOMATCH
);
3724 GETCHARINC(d
, eptr
);
3725 if (fc
== d
|| (unsigned int)foc
== d
) RRETURN(MATCH_NOMATCH
);
3732 for (i
= 1; i
<= min
; i
++)
3734 if (eptr
>= md
->end_subject
)
3737 RRETURN(MATCH_NOMATCH
);
3739 if (fc
== *eptr
|| foc
== *eptr
) RRETURN(MATCH_NOMATCH
);
3744 if (min
== max
) continue;
3751 register pcre_uint32 d
;
3752 for (fi
= min
;; fi
++)
3754 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM28
);
3755 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
3756 if (fi
>= max
) RRETURN(MATCH_NOMATCH
);
3757 if (eptr
>= md
->end_subject
)
3760 RRETURN(MATCH_NOMATCH
);
3762 GETCHARINC(d
, eptr
);
3763 if (fc
== d
|| (unsigned int)foc
== d
) RRETURN(MATCH_NOMATCH
);
3770 for (fi
= min
;; fi
++)
3772 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM29
);
3773 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
3774 if (fi
>= max
) RRETURN(MATCH_NOMATCH
);
3775 if (eptr
>= md
->end_subject
)
3778 RRETURN(MATCH_NOMATCH
);
3780 if (fc
== *eptr
|| foc
== *eptr
) RRETURN(MATCH_NOMATCH
);
3784 /* Control never gets here */
3796 register pcre_uint32 d
;
3797 for (i
= min
; i
< max
; i
++)
3800 if (eptr
>= md
->end_subject
)
3805 GETCHARLEN(d
, eptr
, len
);
3806 if (fc
== d
|| (unsigned int)foc
== d
) break;
3809 if (possessive
) continue;
3812 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM30
);
3813 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
3814 if (eptr
-- == pp
) break; /* Stop if tried at original pos */
3822 for (i
= min
; i
< max
; i
++)
3824 if (eptr
>= md
->end_subject
)
3829 if (fc
== *eptr
|| foc
== *eptr
) break;
3832 if (possessive
) continue;
3835 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM31
);
3836 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
3841 RRETURN(MATCH_NOMATCH
);
3843 /* Control never gets here */
3846 /* Caseful comparisons */
3853 register pcre_uint32 d
;
3854 for (i
= 1; i
<= min
; i
++)
3856 if (eptr
>= md
->end_subject
)
3859 RRETURN(MATCH_NOMATCH
);
3861 GETCHARINC(d
, eptr
);
3862 if (fc
== d
) RRETURN(MATCH_NOMATCH
);
3869 for (i
= 1; i
<= min
; i
++)
3871 if (eptr
>= md
->end_subject
)
3874 RRETURN(MATCH_NOMATCH
);
3876 if (fc
== *eptr
++) RRETURN(MATCH_NOMATCH
);
3880 if (min
== max
) continue;
3887 register pcre_uint32 d
;
3888 for (fi
= min
;; fi
++)
3890 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM32
);
3891 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
3892 if (fi
>= max
) RRETURN(MATCH_NOMATCH
);
3893 if (eptr
>= md
->end_subject
)
3896 RRETURN(MATCH_NOMATCH
);
3898 GETCHARINC(d
, eptr
);
3899 if (fc
== d
) RRETURN(MATCH_NOMATCH
);
3906 for (fi
= min
;; fi
++)
3908 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM33
);
3909 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
3910 if (fi
>= max
) RRETURN(MATCH_NOMATCH
);
3911 if (eptr
>= md
->end_subject
)
3914 RRETURN(MATCH_NOMATCH
);
3916 if (fc
== *eptr
++) RRETURN(MATCH_NOMATCH
);
3919 /* Control never gets here */
3931 register pcre_uint32 d
;
3932 for (i
= min
; i
< max
; i
++)
3935 if (eptr
>= md
->end_subject
)
3940 GETCHARLEN(d
, eptr
, len
);
3944 if (possessive
) continue;
3947 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM34
);
3948 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
3949 if (eptr
-- == pp
) break; /* Stop if tried at original pos */
3957 for (i
= min
; i
< max
; i
++)
3959 if (eptr
>= md
->end_subject
)
3964 if (fc
== *eptr
) break;
3967 if (possessive
) continue;
3970 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM35
);
3971 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
3976 RRETURN(MATCH_NOMATCH
);
3979 /* Control never gets here */
3981 /* Match a single character type repeatedly; several different opcodes
3982 share code. This is very similar to the code for single characters, but we
3983 repeat it in the interests of efficiency. */
3986 min
= max
= GET2(ecode
, 1);
3988 ecode
+= 1 + IMM2_SIZE
;
3992 case OP_TYPEMINUPTO
:
3994 max
= GET2(ecode
, 1);
3995 minimize
= *ecode
== OP_TYPEMINUPTO
;
3996 ecode
+= 1 + IMM2_SIZE
;
3999 case OP_TYPEPOSSTAR
:
4006 case OP_TYPEPOSPLUS
:
4013 case OP_TYPEPOSQUERY
:
4020 case OP_TYPEPOSUPTO
:
4023 max
= GET2(ecode
, 1);
4024 ecode
+= 1 + IMM2_SIZE
;
4028 case OP_TYPEMINSTAR
:
4030 case OP_TYPEMINPLUS
:
4032 case OP_TYPEMINQUERY
:
4033 c
= *ecode
++ - OP_TYPESTAR
;
4034 minimize
= (c
& 1) != 0;
4035 min
= rep_min
[c
]; /* Pick up values from tables; */
4036 max
= rep_max
[c
]; /* zero for max => infinity */
4037 if (max
== 0) max
= INT_MAX
;
4039 /* Common code for all repeated single character type matches. Note that
4040 in UTF-8 mode, '.' matches a character of any length, but for the other
4041 character types, the valid characters are all one-byte long. */
4044 ctype
= *ecode
++; /* Code for the character type */
4047 if (ctype
== OP_PROP
|| ctype
== OP_NOTPROP
)
4049 prop_fail_result
= ctype
== OP_NOTPROP
;
4050 prop_type
= *ecode
++;
4051 prop_value
= *ecode
++;
4053 else prop_type
= -1;
4056 /* First, ensure the minimum number of matches is present. Use inline
4057 code for maximizing the speed, and do the type test once at the start
4058 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4059 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4060 and single-bytes. */
4070 if (prop_fail_result
) RRETURN(MATCH_NOMATCH
);
4071 for (i
= 1; i
<= min
; i
++)
4073 if (eptr
>= md
->end_subject
)
4076 RRETURN(MATCH_NOMATCH
);
4078 GETCHARINCTEST(c
, eptr
);
4083 for (i
= 1; i
<= min
; i
++)
4086 if (eptr
>= md
->end_subject
)
4089 RRETURN(MATCH_NOMATCH
);
4091 GETCHARINCTEST(c
, eptr
);
4092 chartype
= UCD_CHARTYPE(c
);
4093 if ((chartype
== ucp_Lu
||
4094 chartype
== ucp_Ll
||
4095 chartype
== ucp_Lt
) == prop_fail_result
)
4096 RRETURN(MATCH_NOMATCH
);
4101 for (i
= 1; i
<= min
; i
++)
4103 if (eptr
>= md
->end_subject
)
4106 RRETURN(MATCH_NOMATCH
);
4108 GETCHARINCTEST(c
, eptr
);
4109 if ((UCD_CATEGORY(c
) == prop_value
) == prop_fail_result
)
4110 RRETURN(MATCH_NOMATCH
);
4115 for (i
= 1; i
<= min
; i
++)
4117 if (eptr
>= md
->end_subject
)
4120 RRETURN(MATCH_NOMATCH
);
4122 GETCHARINCTEST(c
, eptr
);
4123 if ((UCD_CHARTYPE(c
) == prop_value
) == prop_fail_result
)
4124 RRETURN(MATCH_NOMATCH
);
4129 for (i
= 1; i
<= min
; i
++)
4131 if (eptr
>= md
->end_subject
)
4134 RRETURN(MATCH_NOMATCH
);
4136 GETCHARINCTEST(c
, eptr
);
4137 if ((UCD_SCRIPT(c
) == prop_value
) == prop_fail_result
)
4138 RRETURN(MATCH_NOMATCH
);
4143 for (i
= 1; i
<= min
; i
++)
4146 if (eptr
>= md
->end_subject
)
4149 RRETURN(MATCH_NOMATCH
);
4151 GETCHARINCTEST(c
, eptr
);
4152 category
= UCD_CATEGORY(c
);
4153 if ((category
== ucp_L
|| category
== ucp_N
) == prop_fail_result
)
4154 RRETURN(MATCH_NOMATCH
);
4158 case PT_SPACE
: /* Perl space */
4159 for (i
= 1; i
<= min
; i
++)
4161 if (eptr
>= md
->end_subject
)
4164 RRETURN(MATCH_NOMATCH
);
4166 GETCHARINCTEST(c
, eptr
);
4167 if ((UCD_CATEGORY(c
) == ucp_Z
|| c
== CHAR_HT
|| c
== CHAR_NL
||
4168 c
== CHAR_FF
|| c
== CHAR_CR
)
4169 == prop_fail_result
)
4170 RRETURN(MATCH_NOMATCH
);
4174 case PT_PXSPACE
: /* POSIX space */
4175 for (i
= 1; i
<= min
; i
++)
4177 if (eptr
>= md
->end_subject
)
4180 RRETURN(MATCH_NOMATCH
);
4182 GETCHARINCTEST(c
, eptr
);
4183 if ((UCD_CATEGORY(c
) == ucp_Z
|| c
== CHAR_HT
|| c
== CHAR_NL
||
4184 c
== CHAR_VT
|| c
== CHAR_FF
|| c
== CHAR_CR
)
4185 == prop_fail_result
)
4186 RRETURN(MATCH_NOMATCH
);
4191 for (i
= 1; i
<= min
; i
++)
4194 if (eptr
>= md
->end_subject
)
4197 RRETURN(MATCH_NOMATCH
);
4199 GETCHARINCTEST(c
, eptr
);
4200 category
= UCD_CATEGORY(c
);
4201 if ((category
== ucp_L
|| category
== ucp_N
|| c
== CHAR_UNDERSCORE
)
4202 == prop_fail_result
)
4203 RRETURN(MATCH_NOMATCH
);
4208 for (i
= 1; i
<= min
; i
++)
4210 const pcre_uint32
*cp
;
4211 if (eptr
>= md
->end_subject
)
4214 RRETURN(MATCH_NOMATCH
);
4216 GETCHARINCTEST(c
, eptr
);
4217 cp
= PRIV(ucd_caseless_sets
) + prop_value
;
4221 { if (prop_fail_result
) break; else { RRETURN(MATCH_NOMATCH
); } }
4223 { if (prop_fail_result
) { RRETURN(MATCH_NOMATCH
); } else break; }
4228 /* This should not occur */
4231 RRETURN(PCRE_ERROR_INTERNAL
);
4235 /* Match extended Unicode sequences. We will get here only if the
4236 support is in the binary; otherwise a compile-time error occurs. */
4238 else if (ctype
== OP_EXTUNI
)
4240 for (i
= 1; i
<= min
; i
++)
4242 if (eptr
>= md
->end_subject
)
4245 RRETURN(MATCH_NOMATCH
);
4250 GETCHARINCTEST(c
, eptr
);
4251 lgb
= UCD_GRAPHBREAK(c
);
4252 while (eptr
< md
->end_subject
)
4255 if (!utf
) c
= *eptr
; else { GETCHARLEN(c
, eptr
, len
); }
4256 rgb
= UCD_GRAPHBREAK(c
);
4257 if ((PRIV(ucp_gbtable
)[lgb
] & (1 << rgb
)) == 0) break;
4267 #endif /* SUPPORT_UCP */
4269 /* Handle all other cases when the coding is UTF-8 */
4272 if (utf
) switch(ctype
)
4275 for (i
= 1; i
<= min
; i
++)
4277 if (eptr
>= md
->end_subject
)
4280 RRETURN(MATCH_NOMATCH
);
4282 if (IS_NEWLINE(eptr
)) RRETURN(MATCH_NOMATCH
);
4283 if (md
->partial
!= 0 &&
4284 eptr
+ 1 >= md
->end_subject
&&
4285 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
4286 NLBLOCK
->nllen
== 2 &&
4287 RAWUCHAR(eptr
) == NLBLOCK
->nl
[0])
4290 if (md
->partial
> 1) RRETURN(PCRE_ERROR_PARTIAL
);
4293 ACROSSCHAR(eptr
< md
->end_subject
, *eptr
, eptr
++);
4298 for (i
= 1; i
<= min
; i
++)
4300 if (eptr
>= md
->end_subject
)
4303 RRETURN(MATCH_NOMATCH
);
4306 ACROSSCHAR(eptr
< md
->end_subject
, *eptr
, eptr
++);
4311 if (eptr
> md
->end_subject
- min
) RRETURN(MATCH_NOMATCH
);
4316 for (i
= 1; i
<= min
; i
++)
4318 if (eptr
>= md
->end_subject
)
4321 RRETURN(MATCH_NOMATCH
);
4323 GETCHARINC(c
, eptr
);
4326 default: RRETURN(MATCH_NOMATCH
);
4329 if (eptr
< md
->end_subject
&& RAWUCHAR(eptr
) == CHAR_LF
) eptr
++;
4341 #endif /* Not EBCDIC */
4342 if (md
->bsr_anycrlf
) RRETURN(MATCH_NOMATCH
);
4349 for (i
= 1; i
<= min
; i
++)
4351 if (eptr
>= md
->end_subject
)
4354 RRETURN(MATCH_NOMATCH
);
4356 GETCHARINC(c
, eptr
);
4359 HSPACE_CASES
: RRETURN(MATCH_NOMATCH
); /* Byte and multibyte cases */
4366 for (i
= 1; i
<= min
; i
++)
4368 if (eptr
>= md
->end_subject
)
4371 RRETURN(MATCH_NOMATCH
);
4373 GETCHARINC(c
, eptr
);
4376 HSPACE_CASES
: break; /* Byte and multibyte cases */
4377 default: RRETURN(MATCH_NOMATCH
);
4383 for (i
= 1; i
<= min
; i
++)
4385 if (eptr
>= md
->end_subject
)
4388 RRETURN(MATCH_NOMATCH
);
4390 GETCHARINC(c
, eptr
);
4393 VSPACE_CASES
: RRETURN(MATCH_NOMATCH
);
4400 for (i
= 1; i
<= min
; i
++)
4402 if (eptr
>= md
->end_subject
)
4405 RRETURN(MATCH_NOMATCH
);
4407 GETCHARINC(c
, eptr
);
4410 VSPACE_CASES
: break;
4411 default: RRETURN(MATCH_NOMATCH
);
4417 for (i
= 1; i
<= min
; i
++)
4419 if (eptr
>= md
->end_subject
)
4422 RRETURN(MATCH_NOMATCH
);
4424 GETCHARINC(c
, eptr
);
4425 if (c
< 128 && (md
->ctypes
[c
] & ctype_digit
) != 0)
4426 RRETURN(MATCH_NOMATCH
);
4431 for (i
= 1; i
<= min
; i
++)
4435 if (eptr
>= md
->end_subject
)
4438 RRETURN(MATCH_NOMATCH
);
4440 cc
= RAWUCHAR(eptr
);
4441 if (cc
>= 128 || (md
->ctypes
[cc
] & ctype_digit
) == 0)
4442 RRETURN(MATCH_NOMATCH
);
4444 /* No need to skip more bytes - we know it's a 1-byte character */
4448 case OP_NOT_WHITESPACE
:
4449 for (i
= 1; i
<= min
; i
++)
4453 if (eptr
>= md
->end_subject
)
4456 RRETURN(MATCH_NOMATCH
);
4458 cc
= RAWUCHAR(eptr
);
4459 if (cc
< 128 && (md
->ctypes
[cc
] & ctype_space
) != 0)
4460 RRETURN(MATCH_NOMATCH
);
4462 ACROSSCHAR(eptr
< md
->end_subject
, *eptr
, eptr
++);
4467 for (i
= 1; i
<= min
; i
++)
4471 if (eptr
>= md
->end_subject
)
4474 RRETURN(MATCH_NOMATCH
);
4476 cc
= RAWUCHAR(eptr
);
4477 if (cc
>= 128 || (md
->ctypes
[cc
] & ctype_space
) == 0)
4478 RRETURN(MATCH_NOMATCH
);
4480 /* No need to skip more bytes - we know it's a 1-byte character */
4484 case OP_NOT_WORDCHAR
:
4485 for (i
= 1; i
<= min
; i
++)
4489 if (eptr
>= md
->end_subject
)
4492 RRETURN(MATCH_NOMATCH
);
4494 cc
= RAWUCHAR(eptr
);
4495 if (cc
< 128 && (md
->ctypes
[cc
] & ctype_word
) != 0)
4496 RRETURN(MATCH_NOMATCH
);
4498 ACROSSCHAR(eptr
< md
->end_subject
, *eptr
, eptr
++);
4503 for (i
= 1; i
<= min
; i
++)
4507 if (eptr
>= md
->end_subject
)
4510 RRETURN(MATCH_NOMATCH
);
4512 cc
= RAWUCHAR(eptr
);
4513 if (cc
>= 128 || (md
->ctypes
[cc
] & ctype_word
) == 0)
4514 RRETURN(MATCH_NOMATCH
);
4516 /* No need to skip more bytes - we know it's a 1-byte character */
4521 RRETURN(PCRE_ERROR_INTERNAL
);
4522 } /* End switch(ctype) */
4525 #endif /* SUPPORT_UTF */
4527 /* Code for the non-UTF-8 case for minimum matching of operators other
4528 than OP_PROP and OP_NOTPROP. */
4533 for (i
= 1; i
<= min
; i
++)
4535 if (eptr
>= md
->end_subject
)
4538 RRETURN(MATCH_NOMATCH
);
4540 if (IS_NEWLINE(eptr
)) RRETURN(MATCH_NOMATCH
);
4541 if (md
->partial
!= 0 &&
4542 eptr
+ 1 >= md
->end_subject
&&
4543 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
4544 NLBLOCK
->nllen
== 2 &&
4545 *eptr
== NLBLOCK
->nl
[0])
4548 if (md
->partial
> 1) RRETURN(PCRE_ERROR_PARTIAL
);
4555 if (eptr
> md
->end_subject
- min
)
4558 RRETURN(MATCH_NOMATCH
);
4564 if (eptr
> md
->end_subject
- min
)
4567 RRETURN(MATCH_NOMATCH
);
4573 for (i
= 1; i
<= min
; i
++)
4575 if (eptr
>= md
->end_subject
)
4578 RRETURN(MATCH_NOMATCH
);
4582 default: RRETURN(MATCH_NOMATCH
);
4585 if (eptr
< md
->end_subject
&& *eptr
== CHAR_LF
) eptr
++;
4594 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4598 if (md
->bsr_anycrlf
) RRETURN(MATCH_NOMATCH
);
4605 for (i
= 1; i
<= min
; i
++)
4607 if (eptr
>= md
->end_subject
)
4610 RRETURN(MATCH_NOMATCH
);
4616 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4617 HSPACE_MULTIBYTE_CASES
:
4619 RRETURN(MATCH_NOMATCH
);
4625 for (i
= 1; i
<= min
; i
++)
4627 if (eptr
>= md
->end_subject
)
4630 RRETURN(MATCH_NOMATCH
);
4634 default: RRETURN(MATCH_NOMATCH
);
4636 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4637 HSPACE_MULTIBYTE_CASES
:
4645 for (i
= 1; i
<= min
; i
++)
4647 if (eptr
>= md
->end_subject
)
4650 RRETURN(MATCH_NOMATCH
);
4655 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4656 VSPACE_MULTIBYTE_CASES
:
4658 RRETURN(MATCH_NOMATCH
);
4665 for (i
= 1; i
<= min
; i
++)
4667 if (eptr
>= md
->end_subject
)
4670 RRETURN(MATCH_NOMATCH
);
4674 default: RRETURN(MATCH_NOMATCH
);
4676 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4677 VSPACE_MULTIBYTE_CASES
:
4685 for (i
= 1; i
<= min
; i
++)
4687 if (eptr
>= md
->end_subject
)
4690 RRETURN(MATCH_NOMATCH
);
4692 if (MAX_255(*eptr
) && (md
->ctypes
[*eptr
] & ctype_digit
) != 0)
4693 RRETURN(MATCH_NOMATCH
);
4699 for (i
= 1; i
<= min
; i
++)
4701 if (eptr
>= md
->end_subject
)
4704 RRETURN(MATCH_NOMATCH
);
4706 if (!MAX_255(*eptr
) || (md
->ctypes
[*eptr
] & ctype_digit
) == 0)
4707 RRETURN(MATCH_NOMATCH
);
4712 case OP_NOT_WHITESPACE
:
4713 for (i
= 1; i
<= min
; i
++)
4715 if (eptr
>= md
->end_subject
)
4718 RRETURN(MATCH_NOMATCH
);
4720 if (MAX_255(*eptr
) && (md
->ctypes
[*eptr
] & ctype_space
) != 0)
4721 RRETURN(MATCH_NOMATCH
);
4727 for (i
= 1; i
<= min
; i
++)
4729 if (eptr
>= md
->end_subject
)
4732 RRETURN(MATCH_NOMATCH
);
4734 if (!MAX_255(*eptr
) || (md
->ctypes
[*eptr
] & ctype_space
) == 0)
4735 RRETURN(MATCH_NOMATCH
);
4740 case OP_NOT_WORDCHAR
:
4741 for (i
= 1; i
<= min
; i
++)
4743 if (eptr
>= md
->end_subject
)
4746 RRETURN(MATCH_NOMATCH
);
4748 if (MAX_255(*eptr
) && (md
->ctypes
[*eptr
] & ctype_word
) != 0)
4749 RRETURN(MATCH_NOMATCH
);
4755 for (i
= 1; i
<= min
; i
++)
4757 if (eptr
>= md
->end_subject
)
4760 RRETURN(MATCH_NOMATCH
);
4762 if (!MAX_255(*eptr
) || (md
->ctypes
[*eptr
] & ctype_word
) == 0)
4763 RRETURN(MATCH_NOMATCH
);
4769 RRETURN(PCRE_ERROR_INTERNAL
);
4773 /* If min = max, continue at the same level without recursing */
4775 if (min
== max
) continue;
4777 /* If minimizing, we have to test the rest of the pattern before each
4778 subsequent match. Again, separate the UTF-8 case for speed, and also
4779 separate the UCP cases. */
4789 for (fi
= min
;; fi
++)
4791 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM36
);
4792 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
4793 if (fi
>= max
) RRETURN(MATCH_NOMATCH
);
4794 if (eptr
>= md
->end_subject
)
4797 RRETURN(MATCH_NOMATCH
);
4799 GETCHARINCTEST(c
, eptr
);
4800 if (prop_fail_result
) RRETURN(MATCH_NOMATCH
);
4802 /* Control never gets here */
4805 for (fi
= min
;; fi
++)
4808 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM37
);
4809 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
4810 if (fi
>= max
) RRETURN(MATCH_NOMATCH
);
4811 if (eptr
>= md
->end_subject
)
4814 RRETURN(MATCH_NOMATCH
);
4816 GETCHARINCTEST(c
, eptr
);
4817 chartype
= UCD_CHARTYPE(c
);
4818 if ((chartype
== ucp_Lu
||
4819 chartype
== ucp_Ll
||
4820 chartype
== ucp_Lt
) == prop_fail_result
)
4821 RRETURN(MATCH_NOMATCH
);
4823 /* Control never gets here */
4826 for (fi
= min
;; fi
++)
4828 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM38
);
4829 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
4830 if (fi
>= max
) RRETURN(MATCH_NOMATCH
);
4831 if (eptr
>= md
->end_subject
)
4834 RRETURN(MATCH_NOMATCH
);
4836 GETCHARINCTEST(c
, eptr
);
4837 if ((UCD_CATEGORY(c
) == prop_value
) == prop_fail_result
)
4838 RRETURN(MATCH_NOMATCH
);
4840 /* Control never gets here */
4843 for (fi
= min
;; fi
++)
4845 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM39
);
4846 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
4847 if (fi
>= max
) RRETURN(MATCH_NOMATCH
);
4848 if (eptr
>= md
->end_subject
)
4851 RRETURN(MATCH_NOMATCH
);
4853 GETCHARINCTEST(c
, eptr
);
4854 if ((UCD_CHARTYPE(c
) == prop_value
) == prop_fail_result
)
4855 RRETURN(MATCH_NOMATCH
);
4857 /* Control never gets here */
4860 for (fi
= min
;; fi
++)
4862 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM40
);
4863 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
4864 if (fi
>= max
) RRETURN(MATCH_NOMATCH
);
4865 if (eptr
>= md
->end_subject
)
4868 RRETURN(MATCH_NOMATCH
);
4870 GETCHARINCTEST(c
, eptr
);
4871 if ((UCD_SCRIPT(c
) == prop_value
) == prop_fail_result
)
4872 RRETURN(MATCH_NOMATCH
);
4874 /* Control never gets here */
4877 for (fi
= min
;; fi
++)
4880 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM59
);
4881 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
4882 if (fi
>= max
) RRETURN(MATCH_NOMATCH
);
4883 if (eptr
>= md
->end_subject
)
4886 RRETURN(MATCH_NOMATCH
);
4888 GETCHARINCTEST(c
, eptr
);
4889 category
= UCD_CATEGORY(c
);
4890 if ((category
== ucp_L
|| category
== ucp_N
) == prop_fail_result
)
4891 RRETURN(MATCH_NOMATCH
);
4893 /* Control never gets here */
4895 case PT_SPACE
: /* Perl space */
4896 for (fi
= min
;; fi
++)
4898 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM60
);
4899 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
4900 if (fi
>= max
) RRETURN(MATCH_NOMATCH
);
4901 if (eptr
>= md
->end_subject
)
4904 RRETURN(MATCH_NOMATCH
);
4906 GETCHARINCTEST(c
, eptr
);
4907 if ((UCD_CATEGORY(c
) == ucp_Z
|| c
== CHAR_HT
|| c
== CHAR_NL
||
4908 c
== CHAR_FF
|| c
== CHAR_CR
)
4909 == prop_fail_result
)
4910 RRETURN(MATCH_NOMATCH
);
4912 /* Control never gets here */
4914 case PT_PXSPACE
: /* POSIX space */
4915 for (fi
= min
;; fi
++)
4917 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM61
);
4918 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
4919 if (fi
>= max
) RRETURN(MATCH_NOMATCH
);
4920 if (eptr
>= md
->end_subject
)
4923 RRETURN(MATCH_NOMATCH
);
4925 GETCHARINCTEST(c
, eptr
);
4926 if ((UCD_CATEGORY(c
) == ucp_Z
|| c
== CHAR_HT
|| c
== CHAR_NL
||
4927 c
== CHAR_VT
|| c
== CHAR_FF
|| c
== CHAR_CR
)
4928 == prop_fail_result
)
4929 RRETURN(MATCH_NOMATCH
);
4931 /* Control never gets here */
4934 for (fi
= min
;; fi
++)
4937 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM62
);
4938 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
4939 if (fi
>= max
) RRETURN(MATCH_NOMATCH
);
4940 if (eptr
>= md
->end_subject
)
4943 RRETURN(MATCH_NOMATCH
);
4945 GETCHARINCTEST(c
, eptr
);
4946 category
= UCD_CATEGORY(c
);
4947 if ((category
== ucp_L
||
4948 category
== ucp_N
||
4949 c
== CHAR_UNDERSCORE
)
4950 == prop_fail_result
)
4951 RRETURN(MATCH_NOMATCH
);
4953 /* Control never gets here */
4956 for (fi
= min
;; fi
++)
4958 const pcre_uint32
*cp
;
4959 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM67
);
4960 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
4961 if (fi
>= max
) RRETURN(MATCH_NOMATCH
);
4962 if (eptr
>= md
->end_subject
)
4965 RRETURN(MATCH_NOMATCH
);
4967 GETCHARINCTEST(c
, eptr
);
4968 cp
= PRIV(ucd_caseless_sets
) + prop_value
;
4972 { if (prop_fail_result
) break; else { RRETURN(MATCH_NOMATCH
); } }
4974 { if (prop_fail_result
) { RRETURN(MATCH_NOMATCH
); } else break; }
4977 /* Control never gets here */
4979 /* This should never occur */
4981 RRETURN(PCRE_ERROR_INTERNAL
);
4985 /* Match extended Unicode sequences. We will get here only if the
4986 support is in the binary; otherwise a compile-time error occurs. */
4988 else if (ctype
== OP_EXTUNI
)
4990 for (fi
= min
;; fi
++)
4992 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM41
);
4993 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
4994 if (fi
>= max
) RRETURN(MATCH_NOMATCH
);
4995 if (eptr
>= md
->end_subject
)
4998 RRETURN(MATCH_NOMATCH
);
5003 GETCHARINCTEST(c
, eptr
);
5004 lgb
= UCD_GRAPHBREAK(c
);
5005 while (eptr
< md
->end_subject
)
5008 if (!utf
) c
= *eptr
; else { GETCHARLEN(c
, eptr
, len
); }
5009 rgb
= UCD_GRAPHBREAK(c
);
5010 if ((PRIV(ucp_gbtable
)[lgb
] & (1 << rgb
)) == 0) break;
5019 #endif /* SUPPORT_UCP */
5024 for (fi
= min
;; fi
++)
5026 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM42
);
5027 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
5028 if (fi
>= max
) RRETURN(MATCH_NOMATCH
);
5029 if (eptr
>= md
->end_subject
)
5032 RRETURN(MATCH_NOMATCH
);
5034 if (ctype
== OP_ANY
&& IS_NEWLINE(eptr
))
5035 RRETURN(MATCH_NOMATCH
);
5036 GETCHARINC(c
, eptr
);
5039 case OP_ANY
: /* This is the non-NL case */
5040 if (md
->partial
!= 0 && /* Take care with CRLF partial */
5041 eptr
>= md
->end_subject
&&
5042 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
5043 NLBLOCK
->nllen
== 2 &&
5044 c
== NLBLOCK
->nl
[0])
5047 if (md
->partial
> 1) RRETURN(PCRE_ERROR_PARTIAL
);
5058 default: RRETURN(MATCH_NOMATCH
);
5060 if (eptr
< md
->end_subject
&& RAWUCHAR(eptr
) == CHAR_LF
) eptr
++;
5072 #endif /* Not EBCDIC */
5073 if (md
->bsr_anycrlf
) RRETURN(MATCH_NOMATCH
);
5081 HSPACE_CASES
: RRETURN(MATCH_NOMATCH
);
5089 HSPACE_CASES
: break;
5090 default: RRETURN(MATCH_NOMATCH
);
5097 VSPACE_CASES
: RRETURN(MATCH_NOMATCH
);
5105 VSPACE_CASES
: break;
5106 default: RRETURN(MATCH_NOMATCH
);
5111 if (c
< 256 && (md
->ctypes
[c
] & ctype_digit
) != 0)
5112 RRETURN(MATCH_NOMATCH
);
5116 if (c
>= 256 || (md
->ctypes
[c
] & ctype_digit
) == 0)
5117 RRETURN(MATCH_NOMATCH
);
5120 case OP_NOT_WHITESPACE
:
5121 if (c
< 256 && (md
->ctypes
[c
] & ctype_space
) != 0)
5122 RRETURN(MATCH_NOMATCH
);
5126 if (c
>= 256 || (md
->ctypes
[c
] & ctype_space
) == 0)
5127 RRETURN(MATCH_NOMATCH
);
5130 case OP_NOT_WORDCHAR
:
5131 if (c
< 256 && (md
->ctypes
[c
] & ctype_word
) != 0)
5132 RRETURN(MATCH_NOMATCH
);
5136 if (c
>= 256 || (md
->ctypes
[c
] & ctype_word
) == 0)
5137 RRETURN(MATCH_NOMATCH
);
5141 RRETURN(PCRE_ERROR_INTERNAL
);
5149 for (fi
= min
;; fi
++)
5151 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM43
);
5152 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
5153 if (fi
>= max
) RRETURN(MATCH_NOMATCH
);
5154 if (eptr
>= md
->end_subject
)
5157 RRETURN(MATCH_NOMATCH
);
5159 if (ctype
== OP_ANY
&& IS_NEWLINE(eptr
))
5160 RRETURN(MATCH_NOMATCH
);
5164 case OP_ANY
: /* This is the non-NL case */
5165 if (md
->partial
!= 0 && /* Take care with CRLF partial */
5166 eptr
>= md
->end_subject
&&
5167 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
5168 NLBLOCK
->nllen
== 2 &&
5169 c
== NLBLOCK
->nl
[0])
5172 if (md
->partial
> 1) RRETURN(PCRE_ERROR_PARTIAL
);
5183 default: RRETURN(MATCH_NOMATCH
);
5185 if (eptr
< md
->end_subject
&& *eptr
== CHAR_LF
) eptr
++;
5194 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5198 if (md
->bsr_anycrlf
) RRETURN(MATCH_NOMATCH
);
5208 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5209 HSPACE_MULTIBYTE_CASES
:
5211 RRETURN(MATCH_NOMATCH
);
5218 default: RRETURN(MATCH_NOMATCH
);
5220 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5221 HSPACE_MULTIBYTE_CASES
:
5232 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5233 VSPACE_MULTIBYTE_CASES
:
5235 RRETURN(MATCH_NOMATCH
);
5242 default: RRETURN(MATCH_NOMATCH
);
5244 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5245 VSPACE_MULTIBYTE_CASES
:
5252 if (MAX_255(c
) && (md
->ctypes
[c
] & ctype_digit
) != 0) RRETURN(MATCH_NOMATCH
);
5256 if (!MAX_255(c
) || (md
->ctypes
[c
] & ctype_digit
) == 0) RRETURN(MATCH_NOMATCH
);
5259 case OP_NOT_WHITESPACE
:
5260 if (MAX_255(c
) && (md
->ctypes
[c
] & ctype_space
) != 0) RRETURN(MATCH_NOMATCH
);
5264 if (!MAX_255(c
) || (md
->ctypes
[c
] & ctype_space
) == 0) RRETURN(MATCH_NOMATCH
);
5267 case OP_NOT_WORDCHAR
:
5268 if (MAX_255(c
) && (md
->ctypes
[c
] & ctype_word
) != 0) RRETURN(MATCH_NOMATCH
);
5272 if (!MAX_255(c
) || (md
->ctypes
[c
] & ctype_word
) == 0) RRETURN(MATCH_NOMATCH
);
5276 RRETURN(PCRE_ERROR_INTERNAL
);
5280 /* Control never gets here */
5283 /* If maximizing, it is worth using inline code for speed, doing the type
5284 test once at the start (i.e. keep it out of the loop). Again, keep the
5285 UTF-8 and UCP stuff separate. */
5289 pp
= eptr
; /* Remember where we started */
5297 for (i
= min
; i
< max
; i
++)
5300 if (eptr
>= md
->end_subject
)
5305 GETCHARLENTEST(c
, eptr
, len
);
5306 if (prop_fail_result
) break;
5312 for (i
= min
; i
< max
; i
++)
5316 if (eptr
>= md
->end_subject
)
5321 GETCHARLENTEST(c
, eptr
, len
);
5322 chartype
= UCD_CHARTYPE(c
);
5323 if ((chartype
== ucp_Lu
||
5324 chartype
== ucp_Ll
||
5325 chartype
== ucp_Lt
) == prop_fail_result
)
5332 for (i
= min
; i
< max
; i
++)
5335 if (eptr
>= md
->end_subject
)
5340 GETCHARLENTEST(c
, eptr
, len
);
5341 if ((UCD_CATEGORY(c
) == prop_value
) == prop_fail_result
) break;
5347 for (i
= min
; i
< max
; i
++)
5350 if (eptr
>= md
->end_subject
)
5355 GETCHARLENTEST(c
, eptr
, len
);
5356 if ((UCD_CHARTYPE(c
) == prop_value
) == prop_fail_result
) break;
5362 for (i
= min
; i
< max
; i
++)
5365 if (eptr
>= md
->end_subject
)
5370 GETCHARLENTEST(c
, eptr
, len
);
5371 if ((UCD_SCRIPT(c
) == prop_value
) == prop_fail_result
) break;
5377 for (i
= min
; i
< max
; i
++)
5381 if (eptr
>= md
->end_subject
)
5386 GETCHARLENTEST(c
, eptr
, len
);
5387 category
= UCD_CATEGORY(c
);
5388 if ((category
== ucp_L
|| category
== ucp_N
) == prop_fail_result
)
5394 case PT_SPACE
: /* Perl space */
5395 for (i
= min
; i
< max
; i
++)
5398 if (eptr
>= md
->end_subject
)
5403 GETCHARLENTEST(c
, eptr
, len
);
5404 if ((UCD_CATEGORY(c
) == ucp_Z
|| c
== CHAR_HT
|| c
== CHAR_NL
||
5405 c
== CHAR_FF
|| c
== CHAR_CR
)
5406 == prop_fail_result
)
5412 case PT_PXSPACE
: /* POSIX space */
5413 for (i
= min
; i
< max
; i
++)
5416 if (eptr
>= md
->end_subject
)
5421 GETCHARLENTEST(c
, eptr
, len
);
5422 if ((UCD_CATEGORY(c
) == ucp_Z
|| c
== CHAR_HT
|| c
== CHAR_NL
||
5423 c
== CHAR_VT
|| c
== CHAR_FF
|| c
== CHAR_CR
)
5424 == prop_fail_result
)
5431 for (i
= min
; i
< max
; i
++)
5435 if (eptr
>= md
->end_subject
)
5440 GETCHARLENTEST(c
, eptr
, len
);
5441 category
= UCD_CATEGORY(c
);
5442 if ((category
== ucp_L
|| category
== ucp_N
||
5443 c
== CHAR_UNDERSCORE
) == prop_fail_result
)
5450 for (i
= min
; i
< max
; i
++)
5452 const pcre_uint32
*cp
;
5454 if (eptr
>= md
->end_subject
)
5459 GETCHARLENTEST(c
, eptr
, len
);
5460 cp
= PRIV(ucd_caseless_sets
) + prop_value
;
5464 { if (prop_fail_result
) break; else goto GOT_MAX
; }
5466 { if (prop_fail_result
) goto GOT_MAX
; else break; }
5474 RRETURN(PCRE_ERROR_INTERNAL
);
5477 /* eptr is now past the end of the maximum run */
5479 if (possessive
) continue;
5482 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM44
);
5483 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
5484 if (eptr
-- == pp
) break; /* Stop if tried at original pos */
5485 if (utf
) BACKCHAR(eptr
);
5489 /* Match extended Unicode sequences. We will get here only if the
5490 support is in the binary; otherwise a compile-time error occurs. */
5492 else if (ctype
== OP_EXTUNI
)
5494 for (i
= min
; i
< max
; i
++)
5496 if (eptr
>= md
->end_subject
)
5504 GETCHARINCTEST(c
, eptr
);
5505 lgb
= UCD_GRAPHBREAK(c
);
5506 while (eptr
< md
->end_subject
)
5509 if (!utf
) c
= *eptr
; else { GETCHARLEN(c
, eptr
, len
); }
5510 rgb
= UCD_GRAPHBREAK(c
);
5511 if ((PRIV(ucp_gbtable
)[lgb
] & (1 << rgb
)) == 0) break;
5519 /* eptr is now past the end of the maximum run */
5521 if (possessive
) continue;
5525 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM45
);
5526 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
5527 if (eptr
-- == pp
) break; /* Stop if tried at original pos */
5528 for (;;) /* Move back over one extended */
5530 if (!utf
) c
= *eptr
; else
5535 if (UCD_CATEGORY(c
) != ucp_M
) break;
5542 #endif /* SUPPORT_UCP */
5552 for (i
= min
; i
< max
; i
++)
5554 if (eptr
>= md
->end_subject
)
5559 if (IS_NEWLINE(eptr
)) break;
5560 if (md
->partial
!= 0 && /* Take care with CRLF partial */
5561 eptr
+ 1 >= md
->end_subject
&&
5562 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
5563 NLBLOCK
->nllen
== 2 &&
5564 RAWUCHAR(eptr
) == NLBLOCK
->nl
[0])
5567 if (md
->partial
> 1) RRETURN(PCRE_ERROR_PARTIAL
);
5570 ACROSSCHAR(eptr
< md
->end_subject
, *eptr
, eptr
++);
5574 /* Handle unlimited UTF-8 repeat */
5578 for (i
= min
; i
< max
; i
++)
5580 if (eptr
>= md
->end_subject
)
5585 if (IS_NEWLINE(eptr
)) break;
5586 if (md
->partial
!= 0 && /* Take care with CRLF partial */
5587 eptr
+ 1 >= md
->end_subject
&&
5588 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
5589 NLBLOCK
->nllen
== 2 &&
5590 RAWUCHAR(eptr
) == NLBLOCK
->nl
[0])
5593 if (md
->partial
> 1) RRETURN(PCRE_ERROR_PARTIAL
);
5596 ACROSSCHAR(eptr
< md
->end_subject
, *eptr
, eptr
++);
5604 for (i
= min
; i
< max
; i
++)
5606 if (eptr
>= md
->end_subject
)
5612 ACROSSCHAR(eptr
< md
->end_subject
, *eptr
, eptr
++);
5617 eptr
= md
->end_subject
; /* Unlimited UTF-8 repeat */
5622 /* The byte case is the same as non-UTF8 */
5626 if (c
> (unsigned int)(md
->end_subject
- eptr
))
5628 eptr
= md
->end_subject
;
5635 for (i
= min
; i
< max
; i
++)
5638 if (eptr
>= md
->end_subject
)
5643 GETCHARLEN(c
, eptr
, len
);
5646 if (++eptr
>= md
->end_subject
) break;
5647 if (RAWUCHAR(eptr
) == CHAR_LF
) eptr
++;
5653 (c
!= CHAR_VT
&& c
!= CHAR_FF
&& c
!= CHAR_NEL
5655 && c
!= 0x2028 && c
!= 0x2029
5656 #endif /* Not EBCDIC */
5666 for (i
= min
; i
< max
; i
++)
5670 if (eptr
>= md
->end_subject
)
5675 GETCHARLEN(c
, eptr
, len
);
5678 HSPACE_CASES
: gotspace
= TRUE
; break;
5679 default: gotspace
= FALSE
; break;
5681 if (gotspace
== (ctype
== OP_NOT_HSPACE
)) break;
5688 for (i
= min
; i
< max
; i
++)
5692 if (eptr
>= md
->end_subject
)
5697 GETCHARLEN(c
, eptr
, len
);
5700 VSPACE_CASES
: gotspace
= TRUE
; break;
5701 default: gotspace
= FALSE
; break;
5703 if (gotspace
== (ctype
== OP_NOT_VSPACE
)) break;
5709 for (i
= min
; i
< max
; i
++)
5712 if (eptr
>= md
->end_subject
)
5717 GETCHARLEN(c
, eptr
, len
);
5718 if (c
< 256 && (md
->ctypes
[c
] & ctype_digit
) != 0) break;
5724 for (i
= min
; i
< max
; i
++)
5727 if (eptr
>= md
->end_subject
)
5732 GETCHARLEN(c
, eptr
, len
);
5733 if (c
>= 256 ||(md
->ctypes
[c
] & ctype_digit
) == 0) break;
5738 case OP_NOT_WHITESPACE
:
5739 for (i
= min
; i
< max
; i
++)
5742 if (eptr
>= md
->end_subject
)
5747 GETCHARLEN(c
, eptr
, len
);
5748 if (c
< 256 && (md
->ctypes
[c
] & ctype_space
) != 0) break;
5754 for (i
= min
; i
< max
; i
++)
5757 if (eptr
>= md
->end_subject
)
5762 GETCHARLEN(c
, eptr
, len
);
5763 if (c
>= 256 ||(md
->ctypes
[c
] & ctype_space
) == 0) break;
5768 case OP_NOT_WORDCHAR
:
5769 for (i
= min
; i
< max
; i
++)
5772 if (eptr
>= md
->end_subject
)
5777 GETCHARLEN(c
, eptr
, len
);
5778 if (c
< 256 && (md
->ctypes
[c
] & ctype_word
) != 0) break;
5784 for (i
= min
; i
< max
; i
++)
5787 if (eptr
>= md
->end_subject
)
5792 GETCHARLEN(c
, eptr
, len
);
5793 if (c
>= 256 || (md
->ctypes
[c
] & ctype_word
) == 0) break;
5799 RRETURN(PCRE_ERROR_INTERNAL
);
5802 /* eptr is now past the end of the maximum run. If possessive, we are
5803 done (no backing up). Otherwise, match at this position; anything other
5804 than no match is immediately returned. For nomatch, back up one
5805 character, unless we are matching \R and the last thing matched was
5806 \r\n, in which case, back up two bytes. */
5808 if (possessive
) continue;
5811 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM46
);
5812 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
5813 if (eptr
-- == pp
) break; /* Stop if tried at original pos */
5815 if (ctype
== OP_ANYNL
&& eptr
> pp
&& RAWUCHAR(eptr
) == CHAR_NL
&&
5816 RAWUCHAR(eptr
- 1) == CHAR_CR
) eptr
--;
5820 #endif /* SUPPORT_UTF */
5826 for (i
= min
; i
< max
; i
++)
5828 if (eptr
>= md
->end_subject
)
5833 if (IS_NEWLINE(eptr
)) break;
5834 if (md
->partial
!= 0 && /* Take care with CRLF partial */
5835 eptr
+ 1 >= md
->end_subject
&&
5836 NLBLOCK
->nltype
== NLTYPE_FIXED
&&
5837 NLBLOCK
->nllen
== 2 &&
5838 *eptr
== NLBLOCK
->nl
[0])
5841 if (md
->partial
> 1) RRETURN(PCRE_ERROR_PARTIAL
);
5850 if (c
> (unsigned int)(md
->end_subject
- eptr
))
5852 eptr
= md
->end_subject
;
5859 for (i
= min
; i
< max
; i
++)
5861 if (eptr
>= md
->end_subject
)
5869 if (++eptr
>= md
->end_subject
) break;
5870 if (*eptr
== CHAR_LF
) eptr
++;
5874 if (c
!= CHAR_LF
&& (md
->bsr_anycrlf
||
5875 (c
!= CHAR_VT
&& c
!= CHAR_FF
&& c
!= CHAR_NEL
5876 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5877 && c
!= 0x2028 && c
!= 0x2029
5886 for (i
= min
; i
< max
; i
++)
5888 if (eptr
>= md
->end_subject
)
5895 default: eptr
++; break;
5897 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5898 HSPACE_MULTIBYTE_CASES
:
5907 for (i
= min
; i
< max
; i
++)
5909 if (eptr
>= md
->end_subject
)
5916 default: goto ENDLOOP01
;
5918 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5919 HSPACE_MULTIBYTE_CASES
:
5928 for (i
= min
; i
< max
; i
++)
5930 if (eptr
>= md
->end_subject
)
5937 default: eptr
++; break;
5939 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5940 VSPACE_MULTIBYTE_CASES
:
5949 for (i
= min
; i
< max
; i
++)
5951 if (eptr
>= md
->end_subject
)
5958 default: goto ENDLOOP03
;
5960 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5961 VSPACE_MULTIBYTE_CASES
:
5970 for (i
= min
; i
< max
; i
++)
5972 if (eptr
>= md
->end_subject
)
5977 if (MAX_255(*eptr
) && (md
->ctypes
[*eptr
] & ctype_digit
) != 0) break;
5983 for (i
= min
; i
< max
; i
++)
5985 if (eptr
>= md
->end_subject
)
5990 if (!MAX_255(*eptr
) || (md
->ctypes
[*eptr
] & ctype_digit
) == 0) break;
5995 case OP_NOT_WHITESPACE
:
5996 for (i
= min
; i
< max
; i
++)
5998 if (eptr
>= md
->end_subject
)
6003 if (MAX_255(*eptr
) && (md
->ctypes
[*eptr
] & ctype_space
) != 0) break;
6009 for (i
= min
; i
< max
; i
++)
6011 if (eptr
>= md
->end_subject
)
6016 if (!MAX_255(*eptr
) || (md
->ctypes
[*eptr
] & ctype_space
) == 0) break;
6021 case OP_NOT_WORDCHAR
:
6022 for (i
= min
; i
< max
; i
++)
6024 if (eptr
>= md
->end_subject
)
6029 if (MAX_255(*eptr
) && (md
->ctypes
[*eptr
] & ctype_word
) != 0) break;
6035 for (i
= min
; i
< max
; i
++)
6037 if (eptr
>= md
->end_subject
)
6042 if (!MAX_255(*eptr
) || (md
->ctypes
[*eptr
] & ctype_word
) == 0) break;
6048 RRETURN(PCRE_ERROR_INTERNAL
);
6051 /* eptr is now past the end of the maximum run. If possessive, we are
6052 done (no backing up). Otherwise, match at this position; anything other
6053 than no match is immediately returned. For nomatch, back up one
6054 character (byte), unless we are matching \R and the last thing matched
6055 was \r\n, in which case, back up two bytes. */
6057 if (possessive
) continue;
6060 RMATCH(eptr
, ecode
, offset_top
, md
, eptrb
, RM47
);
6061 if (rrc
!= MATCH_NOMATCH
) RRETURN(rrc
);
6063 if (ctype
== OP_ANYNL
&& eptr
> pp
&& *eptr
== CHAR_LF
&&
6064 eptr
[-1] == CHAR_CR
) eptr
--;
6068 /* Get here if we can't make it match with any permitted repetitions */
6070 RRETURN(MATCH_NOMATCH
);
6072 /* Control never gets here */
6074 /* There's been some horrible disaster. Arrival here can only mean there is
6075 something seriously wrong in the code above or the OP_xxx definitions. */
6078 DPRINTF(("Unknown opcode %d\n", *ecode
));
6079 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE
);
6082 /* Do not stick any code in here without much thought; it is assumed
6083 that "continue" in the code above comes out to here to repeat the main
6086 } /* End of main loop */
6087 /* Control never reaches here */
6090 /* When compiling to use the heap rather than the stack for recursive calls to
6091 match(), the RRETURN() macro jumps here. The number that is saved in
6092 frame->Xwhere indicates which label we actually want to return to. */
6095 #define LBL(val) case val: goto L_RM##val;
6097 switch (frame
->Xwhere
)
6099 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6100 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6101 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6102 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6103 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6105 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6109 LBL(16) LBL(18) LBL(20)
6110 LBL(22) LBL(23) LBL(28) LBL(30)
6111 LBL(32) LBL(34) LBL(42) LBL(46)
6113 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6114 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67)
6115 #endif /* SUPPORT_UCP */
6116 #endif /* SUPPORT_UTF */
6118 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame
->Xwhere
));
6119 return PCRE_ERROR_INTERNAL
;
6122 #endif /* NO_RECURSE */
6126 /***************************************************************************
6127 ****************************************************************************
6128 RECURSION IN THE match() FUNCTION
6130 Undefine all the macros that were defined above to handle this. */
6148 #undef new_recursive
6161 #undef save_capture_last
6171 /* These two are defined as macros in both cases */
6176 /***************************************************************************
6177 ***************************************************************************/
6181 /*************************************************
6182 * Release allocated heap frames *
6183 *************************************************/
6185 /* This function releases all the allocated frames. The base frame is on the
6186 machine stack, and so must not be freed.
6188 Argument: the address of the base frame
/* Walk the chain of frames linked through Xnextframe, starting with the frame
that follows frame_base, and hand each one to the stack_free hook. frame_base
itself is never passed to stack_free. */
6193 release_match_heapframes (heapframe
*frame_base
)
/* Begin with the successor of the base frame, not the base frame itself. */
6195 heapframe
*nextframe
= frame_base
->Xnextframe
;
/* A NULL Xnextframe link terminates the chain. */
6196 while (nextframe
!= NULL
)
/* Remember the current frame and read its successor link before the frame is
released, so the link is not read from freed memory. */
6198 heapframe
*oldframe
= nextframe
;
6199 nextframe
= nextframe
->Xnextframe
;
6200 (PUBL(stack_free
))(oldframe
);
6206 /*************************************************
6207 * Execute a Regular Expression *
6208 *************************************************/
6210 /* This function applies a compiled re to a subject string and picks out
6211 portions of the string if it matches. Two elements in the vector are set for
6212 each substring: the offsets to the start and end of the substring.
6215 argument_re points to the compiled expression
6216 extra_data points to extra data or is NULL
6217 subject points to the subject string
6218 length length of subject string (may contain binary zeros)
6219 start_offset where to start in the subject string
6221 offsets points to a vector of ints to be filled in with offsets
6222 offsetcount the number of elements in the vector
6224 Returns: > 0 => success; value is the number of elements filled in
6225 = 0 => success, but offsets is not big enough
6226 -1 => failed to match
6227 < -1 => some kind of unexpected problem
6230 #if defined COMPILE_PCRE8
6231 PCRE_EXP_DEFN
int PCRE_CALL_CONVENTION
6232 pcre_exec(const pcre
*argument_re
, const pcre_extra
*extra_data
,
6233 PCRE_SPTR subject
, int length
, int start_offset
, int options
, int *offsets
,
6235 #elif defined COMPILE_PCRE16
6236 PCRE_EXP_DEFN
int PCRE_CALL_CONVENTION
6237 pcre16_exec(const pcre16
*argument_re
, const pcre16_extra
*extra_data
,
6238 PCRE_SPTR16 subject
, int length
, int start_offset
, int options
, int *offsets
,
6240 #elif defined COMPILE_PCRE32
6241 PCRE_EXP_DEFN
int PCRE_CALL_CONVENTION
6242 pcre32_exec(const pcre32
*argument_re
, const pcre32_extra
*extra_data
,
6243 PCRE_SPTR32 subject
, int length
, int start_offset
, int options
, int *offsets
,
6247 int rc
, ocount
, arg_offset_max
;
6249 BOOL using_temporary_offsets
= FALSE
;
6254 BOOL has_first_char
= FALSE
;
6255 BOOL has_req_char
= FALSE
;
6256 pcre_uchar first_char
= 0;
6257 pcre_uchar first_char2
= 0;
6258 pcre_uchar req_char
= 0;
6259 pcre_uchar req_char2
= 0;
6260 match_data match_block
;
6261 match_data
*md
= &match_block
;
6262 const pcre_uint8
*tables
;
6263 const pcre_uint8
*start_bits
= NULL
;
6264 PCRE_PUCHAR start_match
= (PCRE_PUCHAR
)subject
+ start_offset
;
6265 PCRE_PUCHAR end_subject
;
6266 PCRE_PUCHAR start_partial
= NULL
;
6267 PCRE_PUCHAR req_char_ptr
= start_match
- 1;
6269 const pcre_study_data
*study
;
6270 const REAL_PCRE
*re
= (const REAL_PCRE
*)argument_re
;
6273 heapframe frame_zero
;
6274 frame_zero
.Xprevframe
= NULL
; /* Marks the top level */
6275 frame_zero
.Xnextframe
= NULL
; /* None are allocated yet */
6276 md
->match_frames_base
= &frame_zero
;
6279 /* Check for the special magic call that measures the size of the stack used
6280 per recursive call of match(). Without the funny casting for sizeof, a Windows
6281 compiler gave this error: "unary minus operator applied to unsigned type,
6282 result still unsigned". Hopefully the cast fixes that. */
6284 if (re
== NULL
&& extra_data
== NULL
&& subject
== NULL
&& length
== -999 &&
6285 start_offset
== -999)
6287 return -((int)sizeof(heapframe
));
6289 return match(NULL
, NULL
, NULL
, 0, NULL
, NULL
, 0);
6292 /* Plausibility checks */
6294 if ((options
& ~PUBLIC_EXEC_OPTIONS
) != 0) return PCRE_ERROR_BADOPTION
;
6295 if (re
== NULL
|| subject
== NULL
|| (offsets
== NULL
&& offsetcount
> 0))
6296 return PCRE_ERROR_NULL
;
6297 if (offsetcount
< 0) return PCRE_ERROR_BADCOUNT
;
6298 if (length
< 0) return PCRE_ERROR_BADLENGTH
;
6299 if (start_offset
< 0 || start_offset
> length
) return PCRE_ERROR_BADOFFSET
;
6301 /* Check that the first field in the block is the magic number. If it is not,
6302 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6303 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6304 means that the pattern is likely compiled with different endianness. */
6306 if (re
->magic_number
!= MAGIC_NUMBER
)
6307 return re
->magic_number
== REVERSED_MAGIC_NUMBER
?
6308 PCRE_ERROR_BADENDIANNESS
:PCRE_ERROR_BADMAGIC
;
6309 if ((re
->flags
& PCRE_MODE
) == 0) return PCRE_ERROR_BADMODE
;
6311 /* These two settings are used in the code for checking a UTF-8 string that
6312 follows immediately afterwards. Other values in the md block are used only
6313 during "normal" pcre_exec() processing, not when the JIT support is in use,
6314 so they are set up later. */
6316 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6317 utf
= md
->utf
= (re
->options
& PCRE_UTF8
) != 0;
6318 md
->partial
= ((options
& PCRE_PARTIAL_HARD
) != 0)? 2 :
6319 ((options
& PCRE_PARTIAL_SOFT
) != 0)? 1 : 0;
6321 /* Check a UTF-8 string if required. Pass back the character offset and error
6322 code for an invalid string if a results vector is available. */
6325 if (utf
&& (options
& PCRE_NO_UTF8_CHECK
) == 0)
6328 int errorcode
= PRIV(valid_utf
)((PCRE_PUCHAR
)subject
, length
, &erroroffset
);
6331 if (offsetcount
>= 2)
6333 offsets
[0] = erroroffset
;
6334 offsets
[1] = errorcode
;
6336 #if defined COMPILE_PCRE8
6337 return (errorcode
<= PCRE_UTF8_ERR5
&& md
->partial
> 1)?
6338 PCRE_ERROR_SHORTUTF8
: PCRE_ERROR_BADUTF8
;
6339 #elif defined COMPILE_PCRE16
6340 return (errorcode
<= PCRE_UTF16_ERR1
&& md
->partial
> 1)?
6341 PCRE_ERROR_SHORTUTF16
: PCRE_ERROR_BADUTF16
;
6342 #elif defined COMPILE_PCRE32
6343 return PCRE_ERROR_BADUTF32
;
6346 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6347 /* Check that a start_offset points to the start of a UTF character. */
6348 if (start_offset
> 0 && start_offset
< length
&&
6349 NOT_FIRSTCHAR(((PCRE_PUCHAR
)subject
)[start_offset
]))
6350 return PCRE_ERROR_BADUTF8_OFFSET
;
6355 /* If the pattern was successfully studied with JIT support, run the JIT
6356 executable instead of the rest of this function. Most options must be set at
6357 compile time for the JIT code to be usable. Fall back to the normal code path if
6358 an unsupported flag is set. */
6361 if (extra_data
!= NULL
6362 && (extra_data
->flags
& (PCRE_EXTRA_EXECUTABLE_JIT
|
6363 PCRE_EXTRA_TABLES
)) == PCRE_EXTRA_EXECUTABLE_JIT
6364 && extra_data
->executable_jit
!= NULL
6365 && (options
& ~PUBLIC_JIT_EXEC_OPTIONS
) == 0)
6367 rc
= PRIV(jit_exec
)(extra_data
, (const pcre_uchar
*)subject
, length
,
6368 start_offset
, options
, offsets
, offsetcount
);
6370 /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial matching
6371 mode is not compiled. In this case we simply fall back to the interpreter. */
6373 if (rc
!= PCRE_ERROR_JIT_BADOPTION
) return rc
;
6377 /* Carry on with non-JIT matching. This information is for finding all the
6378 numbers associated with a given name, for condition testing. */
6380 md
->name_table
= (pcre_uchar
*)re
+ re
->name_table_offset
;
6381 md
->name_count
= re
->name_count
;
6382 md
->name_entry_size
= re
->name_entry_size
;
6384 /* Fish out the optional data from the extra_data structure, first setting
6385 the default values. */
6388 md
->match_limit
= MATCH_LIMIT
;
6389 md
->match_limit_recursion
= MATCH_LIMIT_RECURSION
;
6390 md
->callout_data
= NULL
;
6392 /* The table pointer is always in native byte order. */
6394 tables
= re
->tables
;
6396 if (extra_data
!= NULL
)
6398 register unsigned int flags
= extra_data
->flags
;
6399 if ((flags
& PCRE_EXTRA_STUDY_DATA
) != 0)
6400 study
= (const pcre_study_data
*)extra_data
->study_data
;
6401 if ((flags
& PCRE_EXTRA_MATCH_LIMIT
) != 0)
6402 md
->match_limit
= extra_data
->match_limit
;
6403 if ((flags
& PCRE_EXTRA_MATCH_LIMIT_RECURSION
) != 0)
6404 md
->match_limit_recursion
= extra_data
->match_limit_recursion
;
6405 if ((flags
& PCRE_EXTRA_CALLOUT_DATA
) != 0)
6406 md
->callout_data
= extra_data
->callout_data
;
6407 if ((flags
& PCRE_EXTRA_TABLES
) != 0) tables
= extra_data
->tables
;
6410 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6411 is a feature that makes it possible to save compiled regexes and re-use them
6412 in other programs later. */
6414 if (tables
== NULL
) tables
= PRIV(default_tables
);
6416 /* Set up other data */
6418 anchored
= ((re
->options
| options
) & PCRE_ANCHORED
) != 0;
6419 startline
= (re
->flags
& PCRE_STARTLINE
) != 0;
6420 firstline
= (re
->options
& PCRE_FIRSTLINE
) != 0;
6422 /* The code starts after the real_pcre block and the capture name table. */
6424 md
->start_code
= (const pcre_uchar
*)re
+ re
->name_table_offset
+
6425 re
->name_count
* re
->name_entry_size
;
6427 md
->start_subject
= (PCRE_PUCHAR
)subject
;
6428 md
->start_offset
= start_offset
;
6429 md
->end_subject
= md
->start_subject
+ length
;
6430 end_subject
= md
->end_subject
;
6432 md
->endonly
= (re
->options
& PCRE_DOLLAR_ENDONLY
) != 0;
6433 md
->use_ucp
= (re
->options
& PCRE_UCP
) != 0;
6434 md
->jscript_compat
= (re
->options
& PCRE_JAVASCRIPT_COMPAT
) != 0;
6435 md
->ignore_skip_arg
= FALSE
;
6437 /* Some options are unpacked into BOOL variables in the hope that testing
6438 them will be faster than individual option bits. */
6440 md
->notbol
= (options
& PCRE_NOTBOL
) != 0;
6441 md
->noteol
= (options
& PCRE_NOTEOL
) != 0;
6442 md
->notempty
= (options
& PCRE_NOTEMPTY
) != 0;
6443 md
->notempty_atstart
= (options
& PCRE_NOTEMPTY_ATSTART
) != 0;
6446 md
->mark
= md
->nomatch_mark
= NULL
; /* In case never set */
6448 md
->recursive
= NULL
; /* No recursion at top level */
6449 md
->hasthen
= (re
->flags
& PCRE_HASTHEN
) != 0;
6451 md
->lcc
= tables
+ lcc_offset
;
6452 md
->fcc
= tables
+ fcc_offset
;
6453 md
->ctypes
= tables
+ ctypes_offset
;
6455 /* Handle different \R options. */
6457 switch (options
& (PCRE_BSR_ANYCRLF
|PCRE_BSR_UNICODE
))
6460 if ((re
->options
& (PCRE_BSR_ANYCRLF
|PCRE_BSR_UNICODE
)) != 0)
6461 md
->bsr_anycrlf
= (re
->options
& PCRE_BSR_ANYCRLF
) != 0;
6464 md
->bsr_anycrlf
= TRUE
;
6466 md
->bsr_anycrlf
= FALSE
;
6470 case PCRE_BSR_ANYCRLF
:
6471 md
->bsr_anycrlf
= TRUE
;
6474 case PCRE_BSR_UNICODE
:
6475 md
->bsr_anycrlf
= FALSE
;
6478 default: return PCRE_ERROR_BADNEWLINE
;
6481 /* Handle different types of newline. The three bits give eight cases. If
6482 nothing is set at run time, whatever was used at compile time applies. */
6484 switch ((((options
& PCRE_NEWLINE_BITS
) == 0)? re
->options
:
6485 (pcre_uint32
)options
) & PCRE_NEWLINE_BITS
)
6487 case 0: newline
= NEWLINE
; break; /* Compile-time default */
6488 case PCRE_NEWLINE_CR
: newline
= CHAR_CR
; break;
6489 case PCRE_NEWLINE_LF
: newline
= CHAR_NL
; break;
6490 case PCRE_NEWLINE_CR
+
6491 PCRE_NEWLINE_LF
: newline
= (CHAR_CR
<< 8) | CHAR_NL
; break;
6492 case PCRE_NEWLINE_ANY
: newline
= -1; break;
6493 case PCRE_NEWLINE_ANYCRLF
: newline
= -2; break;
6494 default: return PCRE_ERROR_BADNEWLINE
;
6499 md
->nltype
= NLTYPE_ANYCRLF
;
6501 else if (newline
< 0)
6503 md
->nltype
= NLTYPE_ANY
;
6507 md
->nltype
= NLTYPE_FIXED
;
6511 md
->nl
[0] = (newline
>> 8) & 255;
6512 md
->nl
[1] = newline
& 255;
6517 md
->nl
[0] = newline
;
6521 /* Partial matching was originally supported only for a restricted set of
6522 regexes; from release 8.00 there are no restrictions, but the bits are still
6523 defined (though never set). So there's no harm in leaving this code. */
6525 if (md
->partial
&& (re
->flags
& PCRE_NOPARTIAL
) != 0)
6526 return PCRE_ERROR_BADPARTIAL
;
6528 /* If the expression has got more back references than the offsets supplied can
6529 hold, we get a temporary chunk of working store to use during the matching.
6530 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6533 ocount
= offsetcount
- (offsetcount
% 3);
6534 arg_offset_max
= (2*ocount
)/3;
6536 if (re
->top_backref
> 0 && re
->top_backref
>= ocount
/3)
6538 ocount
= re
->top_backref
* 3 + 3;
6539 md
->offset_vector
= (int *)(PUBL(malloc
))(ocount
* sizeof(int));
6540 if (md
->offset_vector
== NULL
) return PCRE_ERROR_NOMEMORY
;
6541 using_temporary_offsets
= TRUE
;
6542 DPRINTF(("Got memory to hold back references\n"));
6544 else md
->offset_vector
= offsets
;
6546 md
->offset_end
= ocount
;
6547 md
->offset_max
= (2*ocount
)/3;
6548 md
->offset_overflow
= FALSE
;
6549 md
->capture_last
= -1;
6551 /* Reset the working variable associated with each extraction. These should
6552 never be used unless previously set, but they get saved and restored, and so we
6553 initialize them to avoid reading uninitialized locations. Also, unset the
6554 offsets for the matched string. This is really just for tidiness with callouts,
6555 in case they inspect these fields. */
6557 if (md
->offset_vector
!= NULL
)
6559 register int *iptr
= md
->offset_vector
+ ocount
;
6560 register int *iend
= iptr
- re
->top_bracket
;
6561 if (iend
< md
->offset_vector
+ 2) iend
= md
->offset_vector
+ 2;
6562 while (--iptr
>= iend
) *iptr
= -1;
6563 md
->offset_vector
[0] = md
->offset_vector
[1] = -1;
6566 /* Set up the first character to match, if available. The first_char value is
6567 never set for an anchored regular expression, but the anchoring may be forced
6568 at run time, so we have to test for anchoring. The first char may be unset for
6569 an unanchored pattern, of course. If there's no first char and the pattern was
6570 studied, there may be a bitmap of possible first characters. */
6574 if ((re
->flags
& PCRE_FIRSTSET
) != 0)
6576 has_first_char
= TRUE
;
6577 first_char
= first_char2
= (pcre_uchar
)(re
->first_char
);
6578 if ((re
->flags
& PCRE_FCH_CASELESS
) != 0)
6580 first_char2
= TABLE_GET(first_char
, md
->fcc
, first_char
);
6581 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6582 if (utf
&& first_char
> 127)
6583 first_char2
= UCD_OTHERCASE(first_char
);
6588 if (!startline
&& study
!= NULL
&&
6589 (study
->flags
& PCRE_STUDY_MAPPED
) != 0)
6590 start_bits
= study
->start_bits
;
6593 /* For anchored or unanchored matches, there may be a "last known required
6596 if ((re
->flags
& PCRE_REQCHSET
) != 0)
6598 has_req_char
= TRUE
;
6599 req_char
= req_char2
= (pcre_uchar
)(re
->req_char
);
6600 if ((re
->flags
& PCRE_RCH_CASELESS
) != 0)
6602 req_char2
= TABLE_GET(req_char
, md
->fcc
, req_char
);
6603 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6604 if (utf
&& req_char
> 127)
6605 req_char2
= UCD_OTHERCASE(req_char
);
6611 /* ==========================================================================*/
6613 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6614 the loop runs just once. */
6618 PCRE_PUCHAR save_end_subject
= end_subject
;
6619 PCRE_PUCHAR new_start_match
;
6621 /* If firstline is TRUE, the start of the match is constrained to the first
6622 line of a multiline string. That is, the match must be before or at the first
6623 newline. Implement this by temporarily adjusting end_subject so that we stop
6624 scanning at a newline. If the match fails at the newline, later code breaks
6629 PCRE_PUCHAR t
= start_match
;
6633 while (t
< md
->end_subject
&& !IS_NEWLINE(t
))
6636 ACROSSCHAR(t
< end_subject
, *t
, t
++);
6641 while (t
< md
->end_subject
&& !IS_NEWLINE(t
)) t
++;
6645 /* There are some optimizations that avoid running the match if a known
6646 starting point is not found, or if a known later character is not present.
6647 However, there is an option that disables these, for testing and for ensuring
6648 that all callouts do actually occur. The option can be set in the regex by
6649 (*NO_START_OPT) or passed in match-time options. */
6651 if (((options
| re
->options
) & PCRE_NO_START_OPTIMIZE
) == 0)
6653 /* Advance to a unique first char if there is one. */
6659 if (first_char
!= first_char2
)
6660 while (start_match
< end_subject
&&
6661 (smc
= RAWUCHARTEST(start_match
)) != first_char
&& smc
!= first_char2
)
6664 while (start_match
< end_subject
&& RAWUCHARTEST(start_match
) != first_char
)
6668 /* Or to just after a linebreak for a multiline match */
6672 if (start_match
> md
->start_subject
+ start_offset
)
6677 while (start_match
< end_subject
&& !WAS_NEWLINE(start_match
))
6680 ACROSSCHAR(start_match
< end_subject
, *start_match
,
6686 while (start_match
< end_subject
&& !WAS_NEWLINE(start_match
))
6689 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6690 and we are now at a LF, advance the match position by one more character.
6693 if (start_match
[-1] == CHAR_CR
&&
6694 (md
->nltype
== NLTYPE_ANY
|| md
->nltype
== NLTYPE_ANYCRLF
) &&
6695 start_match
< end_subject
&&
6696 RAWUCHARTEST(start_match
) == CHAR_NL
)
6701 /* Or to a non-unique first byte after study */
6703 else if (start_bits
!= NULL
)
6705 while (start_match
< end_subject
)
6707 register pcre_uint32 c
= RAWUCHARTEST(start_match
);
6708 #ifndef COMPILE_PCRE8
6709 if (c
> 255) c
= 255;
6711 if ((start_bits
[c
/8] & (1 << (c
&7))) == 0)
6714 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6715 /* In non 8-bit mode, the iteration will stop for
6716 characters > 255 at the beginning or not stop at all. */
6718 ACROSSCHAR(start_match
< end_subject
, *start_match
,
6725 } /* Starting optimizations */
6727 /* Restore fudged end_subject */
6729 end_subject
= save_end_subject
;
6731 /* The following two optimizations are disabled for partial matching or if
6732 disabling is explicitly requested. */
6734 if (((options
| re
->options
) & PCRE_NO_START_OPTIMIZE
) == 0 && !md
->partial
)
6736 /* If the pattern was studied, a minimum subject length may be set. This is
6737 a lower bound; no actual string of that length may actually match the
6738 pattern. Although the value is, strictly, in characters, we treat it as
6739 bytes to avoid spending too much time in this optimization. */
6741 if (study
!= NULL
&& (study
->flags
& PCRE_STUDY_MINLEN
) != 0 &&
6742 (pcre_uint32
)(end_subject
- start_match
) < study
->minlength
)
6748 /* If req_char is set, we know that that character must appear in the
6749 subject for the match to succeed. If the first character is set, req_char
6750 must be later in the subject; otherwise the test starts at the match point.
6751 This optimization can save a huge amount of backtracking in patterns with
6752 nested unlimited repeats that aren't going to match. Writing separate code
6753 for cased/caseless versions makes it go faster, as does using an
6754 autoincrement and backing off on a match.
6756 HOWEVER: when the subject string is very, very long, searching to its end
6757 can take a long time, and give bad performance on quite ordinary patterns.
6758 This showed up when somebody was matching something like /^\d+C/ on a
6759 32-megabyte string... so we don't do this when the string is sufficiently
6762 if (has_req_char
&& end_subject
- start_match
< REQ_BYTE_MAX
)
6764 register PCRE_PUCHAR p
= start_match
+ (has_first_char
? 1:0);
6766 /* We don't need to repeat the search if we haven't yet reached the
6767 place we found it at last time. */
6769 if (p
> req_char_ptr
)
6771 if (req_char
!= req_char2
)
6773 while (p
< end_subject
)
6775 register pcre_uint32 pp
= RAWUCHARINCTEST(p
);
6776 if (pp
== req_char
|| pp
== req_char2
) { p
--; break; }
6781 while (p
< end_subject
)
6783 if (RAWUCHARINCTEST(p
) == req_char
) { p
--; break; }
6787 /* If we can't find the required character, break the matching loop,
6788 forcing a match failure. */
6790 if (p
>= end_subject
)
6796 /* If we have found the required character, save the point where we
6797 found it, so that we don't search again next time round the loop if
6798 the start hasn't passed this character yet. */
6805 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6806 printf(">>>> Match against: ");
6807 pchars(start_match
, end_subject
- start_match
, TRUE
, md
);
6811 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6812 first starting point for which a partial match was found. */
6814 md
->start_match_ptr
= start_match
;
6815 md
->start_used_ptr
= start_match
;
6816 md
->match_call_count
= 0;
6817 md
->match_function_type
= 0;
6818 md
->end_offset_top
= 0;
6819 rc
= match(start_match
, md
->start_code
, start_match
, 2, md
, NULL
, 0);
6820 if (md
->hitend
&& start_partial
== NULL
) start_partial
= md
->start_used_ptr
;
6824 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6825 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6826 entirely. The only way we can do that is to re-do the match at the same
6827 point, with a flag to force SKIP with an argument to be ignored. Just
6828 treating this case as NOMATCH does not work because it does not check other
6829 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6831 case MATCH_SKIP_ARG
:
6832 new_start_match
= start_match
;
6833 md
->ignore_skip_arg
= TRUE
;
6836 /* SKIP passes back the next starting point explicitly, but if it is the
6837 same as the match we have just done, treat it as NOMATCH. */
6840 if (md
->start_match_ptr
!= start_match
)
6842 new_start_match
= md
->start_match_ptr
;
6847 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6848 exactly like PRUNE. Unset the ignore SKIP-with-argument flag. */
6853 md
->ignore_skip_arg
= FALSE
;
6854 new_start_match
= start_match
+ 1;
6857 ACROSSCHAR(new_start_match
< end_subject
, *new_start_match
,
6862 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6868 /* Any other return is either a match, or some kind of error. */
6874 /* Control reaches here for the various types of "no match at this point"
6875 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6879 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6880 newline in the subject (though it may continue over the newline). Therefore,
6881 if we have just failed to match, starting at a newline, do not continue. */
6883 if (firstline
&& IS_NEWLINE(start_match
)) break;
6885 /* Advance to new matching position */
6887 start_match
= new_start_match
;
6889 /* Break the loop if the pattern is anchored or if we have passed the end of
6892 if (anchored
|| start_match
> end_subject
) break;
6894 /* If we have just passed a CR and we are now at a LF, and the pattern does
6895 not contain any explicit matches for \r or \n, and the newline option is CRLF
6896 or ANY or ANYCRLF, advance the match position by one more character. In
6897 normal matching start_match will always be greater than the first position at
6898 this stage, but a failed *SKIP can cause a return at the same point, which is
6899 why the first test exists. */
6901 if (start_match
> (PCRE_PUCHAR
)subject
+ start_offset
&&
6902 start_match
[-1] == CHAR_CR
&&
6903 start_match
< end_subject
&&
6904 *start_match
== CHAR_NL
&&
6905 (re
->flags
& PCRE_HASCRORLF
) == 0 &&
6906 (md
->nltype
== NLTYPE_ANY
||
6907 md
->nltype
== NLTYPE_ANYCRLF
||
6911 md
->mark
= NULL
; /* Reset for start of next match attempt */
6912 } /* End of for(;;) "bumpalong" loop */
6914 /* ==========================================================================*/
6916 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6919 (1) The pattern is anchored or the match was failed by (*COMMIT);
6921 (2) We are past the end of the subject;
6923 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6924 this option requests that a match occur at or before the first newline in
6927 When we have a match and the offset vector is big enough to deal with any
6928 backreferences, captured substring offsets will already be set up. In the case
6929 where we had to get some local store to hold offsets for backreference
6930 processing, copy those that we can. In this case there need not be overflow if
6931 certain parts of the pattern were not used, even though there are more
6932 capturing parentheses than vector slots. */
6936 if (rc
== MATCH_MATCH
|| rc
== MATCH_ACCEPT
)
6938 if (using_temporary_offsets
)
6940 if (arg_offset_max
>= 4)
6942 memcpy(offsets
+ 2, md
->offset_vector
+ 2,
6943 (arg_offset_max
- 2) * sizeof(int));
6944 DPRINTF(("Copied offsets from temporary memory\n"));
6946 if (md
->end_offset_top
> arg_offset_max
) md
->offset_overflow
= TRUE
;
6947 DPRINTF(("Freeing temporary memory\n"));
6948 (PUBL(free
))(md
->offset_vector
);
6951 /* Set the return code to the number of captured strings, or 0 if there were
6952 too many to fit into the vector. */
6954 rc
= (md
->offset_overflow
&& md
->end_offset_top
>= arg_offset_max
)?
6955 0 : md
->end_offset_top
/2;
6957 /* If there is space in the offset vector, set any unused pairs at the end of
6958 the pattern to -1 for backwards compatibility. It is documented that this
6959 happens. In earlier versions, the whole set of potential capturing offsets
6960 was set to -1 each time round the loop, but this is handled differently now.
6961 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6962 those at the end that need unsetting here. We can't just unset them all at
6963 the start of the whole thing because they may get set in one branch that is
6964 not the final matching branch. */
6966 if (md
->end_offset_top
/2 <= re
->top_bracket
&& offsets
!= NULL
)
6968 register int *iptr
, *iend
;
6969 int resetcount
= 2 + re
->top_bracket
* 2;
6970 if (resetcount
> offsetcount
) resetcount
= offsetcount
;
6971 iptr
= offsets
+ md
->end_offset_top
;
6972 iend
= offsets
+ resetcount
;
6973 while (iptr
< iend
) *iptr
++ = -1;
6976 /* If there is space, set up the whole thing as substring 0. The value of
6977 md->start_match_ptr might be modified if \K was encountered on the success
6980 if (offsetcount
< 2) rc
= 0; else
6982 offsets
[0] = (int)(md
->start_match_ptr
- md
->start_subject
);
6983 offsets
[1] = (int)(md
->end_match_ptr
- md
->start_subject
);
6986 /* Return MARK data if requested */
6988 if (extra_data
!= NULL
&& (extra_data
->flags
& PCRE_EXTRA_MARK
) != 0)
6989 *(extra_data
->mark
) = (pcre_uchar
*)md
->mark
;
6990 DPRINTF((">>>> returning %d\n", rc
));
6992 release_match_heapframes(&frame_zero
);
6997 /* Control gets here if there has been an error, or if the overall match
6998 attempt has failed at all permitted starting positions. */
7000 if (using_temporary_offsets
)
7002 DPRINTF(("Freeing temporary memory\n"));
7003 (PUBL(free
))(md
->offset_vector
);
7006 /* For anything other than nomatch or partial match, just return the code. */
7008 if (rc
!= MATCH_NOMATCH
&& rc
!= PCRE_ERROR_PARTIAL
)
7010 DPRINTF((">>>> error: returning %d\n", rc
));
7012 release_match_heapframes(&frame_zero
);
7017 /* Handle partial matches - disable any mark data */
7019 if (start_partial
!= NULL
)
7021 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
7023 if (offsetcount
> 1)
7025 offsets
[0] = (int)(start_partial
- (PCRE_PUCHAR
)subject
);
7026 offsets
[1] = (int)(end_subject
- (PCRE_PUCHAR
)subject
);
7028 rc
= PCRE_ERROR_PARTIAL
;
7031 /* This is the classic nomatch case */
7035 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
7036 rc
= PCRE_ERROR_NOMATCH
;
7039 /* Return the MARK data if it has been requested. */
7041 if (extra_data
!= NULL
&& (extra_data
->flags
& PCRE_EXTRA_MARK
) != 0)
7042 *(extra_data
->mark
) = (pcre_uchar
*)md
->nomatch_mark
;
7044 release_match_heapframes(&frame_zero
);
7049 /* End of pcre_exec.c */