Merge remote-tracking branch 'origin/tomato-shibby-RT-AC' into arrmo-RT-AC
[tomato.git] / release / src / router / pcre / pcre_exec.c
blob5dec99234a9d8eb12585f90d5f415cf0eb2333aa
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Copyright (c) 1997-2014 University of Cambridge
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
52 #include "pcre_internal.h"
/* Get rid of any preprocessor definitions of min and max: both names are used
as ordinary identifiers further down in this file. */

#undef min
#undef max

/* md->capture_last carries two pieces of information. The low 16 bits hold the
number of the last captured substring (never more than 65535); one bit in the
upper half records that the capture vector overflowed. Packing both into a
single field means that saving and restoring the last capture number also
saves and restores the overflow flag, without an extra variable in the stack
frame and without a second set of save/restore instructions. */

#define CAPLMASK    0x0000ffff    /* The bits used for last_capture */
#define OVFLMASK    0xffff0000    /* The bits used for the overflow flag */
#define OVFLBIT     0x00010000    /* The bit that is set for overflow */

/* Values stored in md->match_function_type to request one of two special kinds
of call to match(). A field in md is used instead of another stack variable. */

#define MATCH_CONDASSERT     1  /* Called to check a condition assertion */
#define MATCH_CBEGROUP       2  /* Could-be-empty unlimited repeat group */

/* Non-error results of match(). Error results are the externally defined
PCRE_ERROR_xxx codes, all of which are negative. */

#define MATCH_MATCH          1
#define MATCH_NOMATCH        0

/* Internal-only results of match(). They are far enough below zero to stay
clear of the external error codes. */

#define MATCH_ACCEPT       (-999)
#define MATCH_KETRPOS      (-998)
#define MATCH_ONCE         (-997)

/* The following five values must stay contiguous and in this order, because
"is it any of these?" is tested as a range check using the MIN/MAX names. */

#define MATCH_COMMIT       (-996)
#define MATCH_PRUNE        (-995)
#define MATCH_SKIP         (-994)
#define MATCH_SKIP_ARG     (-993)
#define MATCH_THEN         (-992)
#define MATCH_BACKTRACK_MAX MATCH_THEN
#define MATCH_BACKTRACK_MIN MATCH_COMMIT

/* The largest number of offset-vector ints that is saved on the stack for a
recursive call; a bigger offset vector is saved with malloc instead. Keep this
a multiple of 3, as the offset vector length is always a multiple of 3. */

#define REC_STACK_SAVE_MAX 30

/* Minimum and maximum counts for the common repeats. In rep_max, a value of 0
stands for "no upper limit". */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1 };
#ifdef PCRE_DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Print a sequence of characters in printable form, stopping at the end of the
subject if requested. Characters for which isprint() is true are written as
they are; everything else is written as \x{hh}.

Arguments:
  p           points to characters
  length      number to print; nothing is printed if it is zero or negative
  is_subject  TRUE if printing from within md->start_subject, in which case
                the output is cut off at md->end_subject
  md          pointer to matching data block; md->utf is read unconditionally,
                so this must be a valid pointer even when is_subject is FALSE

Returns:      nothing
*/

static void
pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
{
pcre_uint32 c;
/* Not used directly below; presumably consulted by UCHAR21INCTEST() in some
build configurations - confirm against pcre_internal.h. */
BOOL utf = md->utf;

if (is_subject && length > md->end_subject - p)
  length = (int)(md->end_subject - p);

while (length-- > 0)
  {
  c = UCHAR21INCTEST(p);

  /* isprint() is only defined for values representable as unsigned char (or
  EOF). A code unit can be wider than that when pcre_uchar is not 8 bits, so
  anything outside that range goes straight to the hex form. */

  if (c < 256 && isprint((int)c))
    printf("%c", (char)c);
  else
    printf("\\x{%02x}", c);
  }
}
#endif
143 /*************************************************
144 * Match a back-reference *
145 *************************************************/
147 /* Normally, if a back reference hasn't been set, the length that is passed is
148 negative, so the match always fails. However, in JavaScript compatibility mode,
149 the length passed is zero. Note that in caseless UTF-8 mode, the number of
150 subject bytes matched may be different to the number of reference bytes.
152 Arguments:
153 offset index into the offset vector
154 eptr pointer into the subject
155 length length of reference to be matched (number of bytes)
156 md points to match data block
157 caseless TRUE if caseless
159 Returns: >= 0 the number of subject bytes matched
160 -1 no match
161 -2 partial match; always given if at end subject
164 static int
165 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
166 BOOL caseless)
168 PCRE_PUCHAR eptr_start = eptr;
169 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
170 #if defined SUPPORT_UTF && defined SUPPORT_UCP
171 BOOL utf = md->utf;
172 #endif
174 #ifdef PCRE_DEBUG
175 if (eptr >= md->end_subject)
176 printf("matching subject <null>");
177 else
179 printf("matching subject ");
180 pchars(eptr, length, TRUE, md);
182 printf(" against backref ");
183 pchars(p, length, FALSE, md);
184 printf("\n");
185 #endif
187 /* Always fail if reference not set (and not JavaScript compatible - in that
188 case the length is passed as zero). */
190 if (length < 0) return -1;
192 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
193 properly if Unicode properties are supported. Otherwise, we can check only
194 ASCII characters. */
196 if (caseless)
198 #if defined SUPPORT_UTF && defined SUPPORT_UCP
199 if (utf)
201 /* Match characters up to the end of the reference. NOTE: the number of
202 data units matched may differ, because in UTF-8 there are some characters
203 whose upper and lower case versions code have different numbers of bytes.
204 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
205 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
206 sequence of two of the latter. It is important, therefore, to check the
207 length along the reference, not along the subject (earlier code did this
208 wrong). */
210 PCRE_PUCHAR endptr = p + length;
211 while (p < endptr)
213 pcre_uint32 c, d;
214 const ucd_record *ur;
215 if (eptr >= md->end_subject) return -2; /* Partial match */
216 GETCHARINC(c, eptr);
217 GETCHARINC(d, p);
218 ur = GET_UCD(d);
219 if (c != d && c != d + ur->other_case)
221 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
222 for (;;)
224 if (c < *pp) return -1;
225 if (c == *pp++) break;
230 else
231 #endif
233 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
234 is no UCP support. */
236 while (length-- > 0)
238 pcre_uint32 cc, cp;
239 if (eptr >= md->end_subject) return -2; /* Partial match */
240 cc = UCHAR21TEST(eptr);
241 cp = UCHAR21TEST(p);
242 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
243 p++;
244 eptr++;
249 /* In the caseful case, we can just compare the bytes, whether or not we
250 are in UTF-8 mode. */
252 else
254 while (length-- > 0)
256 if (eptr >= md->end_subject) return -2; /* Partial match */
257 if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1;
261 return (int)(eptr - eptr_start);
266 /***************************************************************************
267 ****************************************************************************
268 RECURSION IN THE match() FUNCTION
270 The match() function is highly recursive, though not every recursive call
271 increases the recursive depth. Nevertheless, some regular expressions can cause
272 it to recurse to a great depth. I was writing for Unix, so I just let it call
273 itself recursively. This uses the stack for saving everything that has to be
274 saved for a recursive call. On Unix, the stack can be large, and this works
275 fine.
277 It turns out that on some non-Unix-like systems there are problems with
278 programs that use a lot of stack. (This despite the fact that every last chip
279 has oodles of memory these days, and techniques for extending the stack have
280 been known for decades.) So....
282 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
283 calls by keeping local variables that need to be preserved in blocks of memory
284 obtained from malloc() instead of on the stack. Macros are used to
285 achieve this so that the actual code doesn't look very different to what it
286 always used to.
288 The original heap-recursive code used longjmp(). However, it seems that this
289 can be very slow on some operating systems. Following a suggestion from Stan
290 Switzer, the use of longjmp() has been abolished, at the cost of having to
291 provide a unique number for each call to RMATCH. There is no way of generating
292 a sequence of numbers at compile time in C. I have given them names, to make
293 them stand out more clearly.
295 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
296 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
297 tests. Furthermore, not using longjmp() means that local dynamic variables
298 don't have indeterminate values; this has meant that the frame size can be
299 reduced because the result can be "passed back" by straight setting of the
300 variable instead of being passed in the frame.
301 ****************************************************************************
302 ***************************************************************************/
/* Identifying numbers for the individual RMATCH calls, counting up from 1. If
this list is altered, the code at HEAP_RETURN below has to be altered to match. */

enum {
  RM1 = 1,
        RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
  RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
  RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
  RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
  RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
  RM61, RM62, RM63, RM64, RM65, RM66, RM67
};
/* Stack-based versions of REGISTER, RMATCH and RRETURN, selected when
NO_RECURSE is not defined. RMATCH makes a real recursive call of match() and
leaves the result in rrc; RRETURN is a plain return. With PCRE_DEBUG defined,
both also print a trace line. The "rw" argument of RMATCH is not used by these
definitions. The macros rely on rrc, mstart and rdepth being in scope at the
point of use. */

#ifndef NO_RECURSE
#define REGISTER register

#ifdef PCRE_DEBUG
#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match((ra),(rb),mstart,(rc),(rd),(re),rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d\n", (ra), __LINE__); \
  return (ra); \
  }
#else
#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match((ra),(rb),mstart,(rc),(rd),(re),rdepth+1)
#define RRETURN(ra) return (ra)
#endif

#else


/* Heap-based versions, selected when NO_RECURSE is defined. No real recursion
takes place: each "call" moves on to the next heapframe in a doubly linked
chain, allocating it with the stack_malloc hook if it does not exist yet.

RMATCH records the call-site number rw in the current frame, fills in the
argument fields of the next frame, makes that frame current and jumps to
HEAP_RECURSE. It also plants the label L_<rw>, where execution resumes
afterwards. If the allocation fails, PCRE_ERROR_NOMEMORY is returned via
RRETURN. The "rd" argument is not used here; it is the md argument of match(),
which never changes.

RRETURN steps back to the previous frame. If there is one, the result is put
in rrc and control goes to HEAP_RETURN; otherwise this was the outermost
frame and the function really returns. */

#define REGISTER

#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  heapframe *newframe = frame->Xnextframe; \
  if (newframe == NULL) \
    { \
    newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe)); \
    if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY); \
    newframe->Xnextframe = NULL; \
    frame->Xnextframe = newframe; \
    } \
  frame->Xwhere = (rw); \
  newframe->Xeptr = (ra); \
  newframe->Xecode = (rb); \
  newframe->Xmstart = mstart; \
  newframe->Xoffset_top = (rc); \
  newframe->Xeptrb = (re); \
  newframe->Xrdepth = frame->Xrdepth + 1; \
  newframe->Xprevframe = frame; \
  frame = newframe; \
  DPRINTF(("restarting from line %d\n", __LINE__)); \
  goto HEAP_RECURSE; \
  L_##rw: \
  DPRINTF(("jumped back to line %d\n", __LINE__)); \
  }

#define RRETURN(ra) \
  { \
  heapframe *oldframe = frame; \
  frame = oldframe->Xprevframe; \
  if (frame != NULL) \
    { \
    rrc = (ra); \
    goto HEAP_RETURN; \
    } \
  return (ra); \
  }


/* One frame of the private heap "stack": the values that would otherwise be
arguments and local variables of a single invocation of match(). */

typedef struct heapframe {
  /* Links to the neighbouring frames in the chain */

  struct heapframe *Xprevframe;
  struct heapframe *Xnextframe;

  /* Function arguments that may change */

  PCRE_PUCHAR Xeptr;
  const pcre_uchar *Xecode;
  PCRE_PUCHAR Xmstart;
  int Xoffset_top;
  eptrblock *Xeptrb;
  unsigned int Xrdepth;

  /* Function local variables */

  PCRE_PUCHAR Xcallpat;
#ifdef SUPPORT_UTF
  PCRE_PUCHAR Xcharptr;
#endif
  PCRE_PUCHAR Xdata;
  PCRE_PUCHAR Xnext;
  PCRE_PUCHAR Xpp;
  PCRE_PUCHAR Xprev;
  PCRE_PUCHAR Xsaved_eptr;

  recursion_info Xnew_recursive;

  BOOL Xcur_is_word;
  BOOL Xcondition;
  BOOL Xprev_is_word;

#ifdef SUPPORT_UCP
  int Xprop_type;
  unsigned int Xprop_value;
  int Xprop_fail_result;
  int Xoclength;
  pcre_uchar Xocchars[6];
#endif

  int Xcodelink;
  int Xctype;
  unsigned int Xfc;
  int Xfi;
  int Xlength;
  int Xmax;
  int Xmin;
  unsigned int Xnumber;
  int Xoffset;
  unsigned int Xop;
  pcre_int32 Xsave_capture_last;
  int Xsave_offset1;
  int Xsave_offset2;
  int Xsave_offset3;
  int Xstacksave[REC_STACK_SAVE_MAX];

  eptrblock Xnewptrb;

  /* Where to jump back to: the rw number stored by RMATCH */

  int Xwhere;

} heapframe;

#endif
453 /***************************************************************************
454 ***************************************************************************/
458 /*************************************************
459 * Match from current position *
460 *************************************************/
462 /* This function is called recursively in many circumstances. Whenever it
463 returns a negative (error) response, the outer incarnation must also return the
464 same response. */
/* Two macros that bundle up the partial-matching tests that occur repeatedly
in match(). Each does nothing unless md->partial is non-zero and eptr is beyond
md->start_used_ptr (so something has been inspected). When the test succeeds,
md->hitend is set, and if md->partial is greater than 1 (hard partial
matching) PCRE_ERROR_PARTIAL is returned at once via RRETURN.

CHECK_PARTIAL additionally requires eptr to have reached md->end_subject.
SCHECK_PARTIAL leaves that test out, for use where the caller already knows
that the end of the subject has been reached.

Both expand to a bare "if" statement with a braced body, so they must not be
placed directly in front of an "else". */

#define CHECK_PARTIAL() \
  if (md->partial != 0 && \
      eptr >= md->end_subject && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }

#define SCHECK_PARTIAL() \
  if (md->partial != 0 && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }
489 /* Performance note: It might be tempting to extract commonly used fields from
490 the md structure (e.g. utf, end_subject) into individual variables to improve
491 performance. Tests using gcc on a SPARC disproved this; in the first case, it
492 made performance worse.
494 Arguments:
495 eptr pointer to current character in subject
496 ecode pointer to current position in compiled code
497 mstart pointer to the current match start position (can be modified
498 by encountering \K)
499 offset_top current top pointer
500 md pointer to "static" info for the match
501 eptrb pointer to chain of blocks containing eptr at start of
502 brackets - for testing for empty matches
503 rdepth the recursion depth
505 Returns: MATCH_MATCH if matched ) these values are >= 0
506 MATCH_NOMATCH if failed to match )
507 a negative MATCH_xxx value for PRUNE, SKIP, etc
508 a negative PCRE_ERROR_xxx value if aborted by an error condition
509 (e.g. stopped by repeated call or recursion limit)
512 static int
513 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
514 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
515 unsigned int rdepth)
517 /* These variables do not need to be preserved over recursion in this function,
518 so they can be ordinary variables in all cases. Mark some of them with
519 "register" because they are used a lot in loops. */
521 register int rrc; /* Returns from recursive calls */
522 register int i; /* Used for loops not involving calls to RMATCH() */
523 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
524 register BOOL utf; /* Local copy of UTF flag for speed */
526 BOOL minimize, possessive; /* Quantifier options */
527 BOOL caseless;
528 int condcode;
530 /* When recursion is not being used, all "local" variables that have to be
531 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
532 frame on the stack here; subsequent instantiations are obtained from the heap
533 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
534 the top-level on the stack rather than malloc-ing them all gives a performance
535 boost in many cases where there is not much "recursion". */
537 #ifdef NO_RECURSE
538 heapframe *frame = (heapframe *)md->match_frames_base;
540 /* Copy in the original argument variables */
542 frame->Xeptr = eptr;
543 frame->Xecode = ecode;
544 frame->Xmstart = mstart;
545 frame->Xoffset_top = offset_top;
546 frame->Xeptrb = eptrb;
547 frame->Xrdepth = rdepth;
549 /* This is where control jumps back to to effect "recursion" */
551 HEAP_RECURSE:
553 /* Macros make the argument variables come from the current frame */
555 #define eptr frame->Xeptr
556 #define ecode frame->Xecode
557 #define mstart frame->Xmstart
558 #define offset_top frame->Xoffset_top
559 #define eptrb frame->Xeptrb
560 #define rdepth frame->Xrdepth
562 /* Ditto for the local variables */
564 #ifdef SUPPORT_UTF
565 #define charptr frame->Xcharptr
566 #endif
567 #define callpat frame->Xcallpat
568 #define codelink frame->Xcodelink
569 #define data frame->Xdata
570 #define next frame->Xnext
571 #define pp frame->Xpp
572 #define prev frame->Xprev
573 #define saved_eptr frame->Xsaved_eptr
575 #define new_recursive frame->Xnew_recursive
577 #define cur_is_word frame->Xcur_is_word
578 #define condition frame->Xcondition
579 #define prev_is_word frame->Xprev_is_word
581 #ifdef SUPPORT_UCP
582 #define prop_type frame->Xprop_type
583 #define prop_value frame->Xprop_value
584 #define prop_fail_result frame->Xprop_fail_result
585 #define oclength frame->Xoclength
586 #define occhars frame->Xocchars
587 #endif
589 #define ctype frame->Xctype
590 #define fc frame->Xfc
591 #define fi frame->Xfi
592 #define length frame->Xlength
593 #define max frame->Xmax
594 #define min frame->Xmin
595 #define number frame->Xnumber
596 #define offset frame->Xoffset
597 #define op frame->Xop
598 #define save_capture_last frame->Xsave_capture_last
599 #define save_offset1 frame->Xsave_offset1
600 #define save_offset2 frame->Xsave_offset2
601 #define save_offset3 frame->Xsave_offset3
602 #define stacksave frame->Xstacksave
604 #define newptrb frame->Xnewptrb
606 /* When recursion is being used, local variables are allocated on the stack and
607 get preserved during recursion in the normal way. In this environment, fi and
608 i, and fc and c, can be the same variables. */
610 #else /* NO_RECURSE not defined */
611 #define fi i
612 #define fc c
614 /* Many of the following variables are used only in small blocks of the code.
615 My normal style of coding would have declared them within each of those blocks.
616 However, in order to accommodate the version of this code that uses an external
617 "stack" implemented on the heap, it is easier to declare them all here, so the
618 declarations can be cut out in a block. The only declarations within blocks
619 below are for variables that do not have to be preserved over a recursive call
620 to RMATCH(). */
622 #ifdef SUPPORT_UTF
623 const pcre_uchar *charptr;
624 #endif
625 const pcre_uchar *callpat;
626 const pcre_uchar *data;
627 const pcre_uchar *next;
628 PCRE_PUCHAR pp;
629 const pcre_uchar *prev;
630 PCRE_PUCHAR saved_eptr;
632 recursion_info new_recursive;
634 BOOL cur_is_word;
635 BOOL condition;
636 BOOL prev_is_word;
638 #ifdef SUPPORT_UCP
639 int prop_type;
640 unsigned int prop_value;
641 int prop_fail_result;
642 int oclength;
643 pcre_uchar occhars[6];
644 #endif
646 int codelink;
647 int ctype;
648 int length;
649 int max;
650 int min;
651 unsigned int number;
652 int offset;
653 unsigned int op;
654 pcre_int32 save_capture_last;
655 int save_offset1, save_offset2, save_offset3;
656 int stacksave[REC_STACK_SAVE_MAX];
658 eptrblock newptrb;
660 /* There is a special fudge for calling match() in a way that causes it to
661 measure the size of its basic stack frame when the stack is being used for
662 recursion. The second argument (ecode) being NULL triggers this behaviour. It
663 cannot normally ever be NULL. The return is the negated value of the frame
664 size. */
666 if (ecode == NULL)
668 if (rdepth == 0)
669 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
670 else
672 int len = (char *)&rdepth - (char *)eptr;
673 return (len > 0)? -len : len;
676 #endif /* NO_RECURSE */
678 /* To save space on the stack and in the heap frame, I have doubled up on some
679 of the local variables that are used only in localised parts of the code, but
680 still need to be preserved over recursive calls of match(). These macros define
681 the alternative names that are used. */
683 #define allow_zero cur_is_word
684 #define cbegroup condition
685 #define code_offset codelink
686 #define condassert condition
687 #define matched_once prev_is_word
688 #define foc number
689 #define save_mark data
691 /* These statements are here to stop the compiler complaining about uninitialized
692 variables. */
694 #ifdef SUPPORT_UCP
695 prop_value = 0;
696 prop_fail_result = 0;
697 #endif
700 /* This label is used for tail recursion, which is used in a few cases even
701 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
702 used. Thanks to Ian Taylor for noticing this possibility and sending the
703 original patch. */
705 TAIL_RECURSE:
707 /* OK, now we can get on with the real code of the function. Recursive calls
708 are specified by the macro RMATCH and RRETURN is used to return. When
709 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
710 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
711 defined). However, RMATCH isn't like a function call because it's quite a
712 complicated macro. It has to be used in one particular way. This shouldn't,
713 however, impact performance when true recursion is being used. */
715 #ifdef SUPPORT_UTF
716 utf = md->utf; /* Local copy of the flag */
717 #else
718 utf = FALSE;
719 #endif
721 /* First check that we haven't called match() too many times, or that we
722 haven't exceeded the recursive call limit. */
724 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
725 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
727 /* At the start of a group with an unlimited repeat that may match an empty
728 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
729 done this way to save having to use another function argument, which would take
730 up space on the stack. See also MATCH_CONDASSERT below.
732 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
733 such remembered pointers, to be checked when we hit the closing ket, in order
734 to break infinite loops that match no characters. When match() is called in
735 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
736 NOT be used with tail recursion, because the memory block that is used is on
737 the stack, so a new one may be required for each match(). */
739 if (md->match_function_type == MATCH_CBEGROUP)
741 newptrb.epb_saved_eptr = eptr;
742 newptrb.epb_prev = eptrb;
743 eptrb = &newptrb;
744 md->match_function_type = 0;
747 /* Now start processing the opcodes. */
749 for (;;)
751 minimize = possessive = FALSE;
752 op = *ecode;
754 switch(op)
756 case OP_MARK:
757 md->nomatch_mark = ecode + 2;
758 md->mark = NULL; /* In case previously set by assertion */
759 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
760 eptrb, RM55);
761 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
762 md->mark == NULL) md->mark = ecode + 2;
764 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
765 argument, and we must check whether that argument matches this MARK's
766 argument. It is passed back in md->start_match_ptr (an overloading of that
767 variable). If it does match, we reset that variable to the current subject
768 position and return MATCH_SKIP. Otherwise, pass back the return code
769 unaltered. */
771 else if (rrc == MATCH_SKIP_ARG &&
772 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
774 md->start_match_ptr = eptr;
775 RRETURN(MATCH_SKIP);
777 RRETURN(rrc);
779 case OP_FAIL:
780 RRETURN(MATCH_NOMATCH);
782 case OP_COMMIT:
783 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
784 eptrb, RM52);
785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
786 RRETURN(MATCH_COMMIT);
788 case OP_PRUNE:
789 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
790 eptrb, RM51);
791 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
792 RRETURN(MATCH_PRUNE);
794 case OP_PRUNE_ARG:
795 md->nomatch_mark = ecode + 2;
796 md->mark = NULL; /* In case previously set by assertion */
797 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
798 eptrb, RM56);
799 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
800 md->mark == NULL) md->mark = ecode + 2;
801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 RRETURN(MATCH_PRUNE);
804 case OP_SKIP:
805 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
806 eptrb, RM53);
807 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
808 md->start_match_ptr = eptr; /* Pass back current position */
809 RRETURN(MATCH_SKIP);
811 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
812 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
813 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
814 that failed and any that precede it (either they also failed, or were not
815 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
816 SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg
817 set to the count of the one that failed. */
819 case OP_SKIP_ARG:
820 md->skip_arg_count++;
821 if (md->skip_arg_count <= md->ignore_skip_arg)
823 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
824 break;
826 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
827 eptrb, RM57);
828 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
830 /* Pass back the current skip name by overloading md->start_match_ptr and
831 returning the special MATCH_SKIP_ARG return code. This will either be
832 caught by a matching MARK, or get to the top, where it causes a rematch
833 with md->ignore_skip_arg set to the value of md->skip_arg_count. */
835 md->start_match_ptr = ecode + 2;
836 RRETURN(MATCH_SKIP_ARG);
838 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
839 the branch in which it occurs can be determined. Overload the start of
840 match pointer to do this. */
842 case OP_THEN:
843 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
844 eptrb, RM54);
845 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
846 md->start_match_ptr = ecode;
847 RRETURN(MATCH_THEN);
849 case OP_THEN_ARG:
850 md->nomatch_mark = ecode + 2;
851 md->mark = NULL; /* In case previously set by assertion */
852 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
853 md, eptrb, RM58);
854 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
855 md->mark == NULL) md->mark = ecode + 2;
856 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
857 md->start_match_ptr = ecode;
858 RRETURN(MATCH_THEN);
860 /* Handle an atomic group that does not contain any capturing parentheses.
861 This can be handled like an assertion. Prior to 8.13, all atomic groups
862 were handled this way. In 8.13, the code was changed as below for ONCE, so
863 that backups pass through the group and thereby reset captured values.
864 However, this uses a lot more stack, so in 8.20, atomic groups that do not
865 contain any captures generate OP_ONCE_NC, which can be handled in the old,
866 less stack intensive way.
868 Check the alternative branches in turn - the matching won't pass the KET
869 for this kind of subpattern. If any one branch matches, we carry on as at
870 the end of a normal bracket, leaving the subject pointer, but resetting
871 the start-of-match value in case it was changed by \K. */
873 case OP_ONCE_NC:
874 prev = ecode;
875 saved_eptr = eptr;
876 save_mark = md->mark;
879 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
880 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
882 mstart = md->start_match_ptr;
883 break;
885 if (rrc == MATCH_THEN)
887 next = ecode + GET(ecode,1);
888 if (md->start_match_ptr < next &&
889 (*ecode == OP_ALT || *next == OP_ALT))
890 rrc = MATCH_NOMATCH;
893 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
894 ecode += GET(ecode,1);
895 md->mark = save_mark;
897 while (*ecode == OP_ALT);
899 /* If hit the end of the group (which could be repeated), fail */
901 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
903 /* Continue as from after the group, updating the offsets high water
904 mark, since extracts may have been taken. */
906 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
908 offset_top = md->end_offset_top;
909 eptr = md->end_match_ptr;
911 /* For a non-repeating ket, just continue at this level. This also
912 happens for a repeating ket if no characters were matched in the group.
913 This is the forcible breaking of infinite loops as implemented in Perl
914 5.005. */
916 if (*ecode == OP_KET || eptr == saved_eptr)
918 ecode += 1+LINK_SIZE;
919 break;
922 /* The repeating kets try the rest of the pattern or restart from the
923 preceding bracket, in the appropriate order. The second "call" of match()
924 uses tail recursion, to avoid using another stack frame. */
926 if (*ecode == OP_KETRMIN)
928 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
929 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
930 ecode = prev;
931 goto TAIL_RECURSE;
933 else /* OP_KETRMAX */
935 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
936 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
937 ecode += 1 + LINK_SIZE;
938 goto TAIL_RECURSE;
940 /* Control never gets here */
942 /* Handle a capturing bracket, other than those that are possessive with an
943 unlimited repeat. If there is space in the offset vector, save the current
944 subject position in the working slot at the top of the vector. We mustn't
945 change the current values of the data slot, because they may be set from a
946 previous iteration of this group, and be referred to by a reference inside
947 the group. A failure to match might occur after the group has succeeded,
948 if something later on doesn't match. For this reason, we need to restore
949 the working value and also the values of the final offsets, in case they
950 were set by a previous iteration of the same bracket.
952 If there isn't enough space in the offset vector, treat this as if it were
953 a non-capturing bracket. Don't worry about setting the flag for the error
954 case here; that is handled in the code for KET. */
956 case OP_CBRA:
957 case OP_SCBRA:
958 number = GET2(ecode, 1+LINK_SIZE);
959 offset = number << 1;
961 #ifdef PCRE_DEBUG
962 printf("start bracket %d\n", number);
963 printf("subject=");
964 pchars(eptr, 16, TRUE, md);
965 printf("\n");
966 #endif
968 if (offset < md->offset_max)
970 save_offset1 = md->offset_vector[offset];
971 save_offset2 = md->offset_vector[offset+1];
972 save_offset3 = md->offset_vector[md->offset_end - number];
973 save_capture_last = md->capture_last;
974 save_mark = md->mark;
976 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
977 md->offset_vector[md->offset_end - number] =
978 (int)(eptr - md->start_subject);
980 for (;;)
982 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
983 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
984 eptrb, RM1);
985 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
987 /* If we backed up to a THEN, check whether it is within the current
988 branch by comparing the address of the THEN that is passed back with
989 the end of the branch. If it is within the current branch, and the
990 branch is one of two or more alternatives (it either starts or ends
991 with OP_ALT), we have reached the limit of THEN's action, so convert
992 the return code to NOMATCH, which will cause normal backtracking to
993 happen from now on. Otherwise, THEN is passed back to an outer
994 alternative. This implements Perl's treatment of parenthesized groups,
995 where a group not containing | does not affect the current alternative,
996 that is, (X) is NOT the same as (X|(*F)). */
998 if (rrc == MATCH_THEN)
1000 next = ecode + GET(ecode,1);
1001 if (md->start_match_ptr < next &&
1002 (*ecode == OP_ALT || *next == OP_ALT))
1003 rrc = MATCH_NOMATCH;
1006 /* Anything other than NOMATCH is passed back. */
1008 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1009 md->capture_last = save_capture_last;
1010 ecode += GET(ecode, 1);
1011 md->mark = save_mark;
1012 if (*ecode != OP_ALT) break;
1015 DPRINTF(("bracket %d failed\n", number));
1016 md->offset_vector[offset] = save_offset1;
1017 md->offset_vector[offset+1] = save_offset2;
1018 md->offset_vector[md->offset_end - number] = save_offset3;
1020 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1022 RRETURN(rrc);
1025 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1026 as a non-capturing bracket. */
1028 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1029 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1031 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1033 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1034 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1036 /* Non-capturing or atomic group, except for possessive with unlimited
1037 repeat and ONCE group with no captures. Loop for all the alternatives.
1039 When we get to the final alternative within the brackets, we used to return
1040 the result of a recursive call to match() whatever happened so it was
1041 possible to reduce stack usage by turning this into a tail recursion,
1042 except in the case of a possibly empty group. However, now that there is
1043 the possibility of (*THEN) occurring in the final alternative, this
1044 optimization is no longer always possible.
1046 We can optimize if we know there are no (*THEN)s in the pattern; at present
1047 this is the best that can be done.
1049 MATCH_ONCE is returned when the end of an atomic group is successfully
1050 reached, but subsequent matching fails. It passes back up the tree (causing
1051 captured values to be reset) until the original atomic group level is
1052 reached. This is tested by comparing md->once_target with the start of the
1053 group. At this point, the return is converted into MATCH_NOMATCH so that
1054 previous backup points can be taken. */
1056 case OP_ONCE:
1057 case OP_BRA:
1058 case OP_SBRA:
1059 DPRINTF(("start non-capturing bracket\n"));
1061 for (;;)
1063 if (op >= OP_SBRA || op == OP_ONCE)
1064 md->match_function_type = MATCH_CBEGROUP;
1066 /* If this is not a possibly empty group, and there are no (*THEN)s in
1067 the pattern, and this is the final alternative, optimize as described
1068 above. */
1070 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1072 ecode += PRIV(OP_lengths)[*ecode];
1073 goto TAIL_RECURSE;
1076 /* In all other cases, we have to make another call to match(). */
1078 save_mark = md->mark;
1079 save_capture_last = md->capture_last;
1080 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1081 RM2);
1083 /* See comment in the code for capturing groups above about handling
1084 THEN. */
1086 if (rrc == MATCH_THEN)
1088 next = ecode + GET(ecode,1);
1089 if (md->start_match_ptr < next &&
1090 (*ecode == OP_ALT || *next == OP_ALT))
1091 rrc = MATCH_NOMATCH;
1094 if (rrc != MATCH_NOMATCH)
1096 if (rrc == MATCH_ONCE)
1098 const pcre_uchar *scode = ecode;
1099 if (*scode != OP_ONCE) /* If not at start, find it */
1101 while (*scode == OP_ALT) scode += GET(scode, 1);
1102 scode -= GET(scode, 1);
1104 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1106 RRETURN(rrc);
1108 ecode += GET(ecode, 1);
1109 md->mark = save_mark;
1110 if (*ecode != OP_ALT) break;
1111 md->capture_last = save_capture_last;
1114 RRETURN(MATCH_NOMATCH);
1116 /* Handle possessive capturing brackets with an unlimited repeat. We come
1117 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1118 handled similarly to the normal case above. However, the matching is
1119 different. The end of these brackets will always be OP_KETRPOS, which
1120 returns MATCH_KETRPOS without going further in the pattern. By this means
1121 we can handle the group by iteration rather than recursion, thereby
1122 reducing the amount of stack needed. */
1124 case OP_CBRAPOS:
1125 case OP_SCBRAPOS:
1126 allow_zero = FALSE;
1128 POSSESSIVE_CAPTURE:
1129 number = GET2(ecode, 1+LINK_SIZE);
1130 offset = number << 1;
1132 #ifdef PCRE_DEBUG
1133 printf("start possessive bracket %d\n", number);
1134 printf("subject=");
1135 pchars(eptr, 16, TRUE, md);
1136 printf("\n");
1137 #endif
1139 if (offset < md->offset_max)
1141 matched_once = FALSE;
1142 code_offset = (int)(ecode - md->start_code);
1144 save_offset1 = md->offset_vector[offset];
1145 save_offset2 = md->offset_vector[offset+1];
1146 save_offset3 = md->offset_vector[md->offset_end - number];
1147 save_capture_last = md->capture_last;
1149 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1151 /* Each time round the loop, save the current subject position for use
1152 when the group matches. For MATCH_MATCH, the group has matched, so we
1153 restart it with a new subject starting position, remembering that we had
1154 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1155 usual. If we haven't matched any alternatives in any iteration, check to
1156 see if a previous iteration matched. If so, the group has matched;
1157 continue from afterwards. Otherwise it has failed; restore the previous
1158 capture values before returning NOMATCH. */
1160 for (;;)
1162 md->offset_vector[md->offset_end - number] =
1163 (int)(eptr - md->start_subject);
1164 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1165 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1166 eptrb, RM63);
1167 if (rrc == MATCH_KETRPOS)
1169 offset_top = md->end_offset_top;
1170 eptr = md->end_match_ptr;
1171 ecode = md->start_code + code_offset;
1172 save_capture_last = md->capture_last;
1173 matched_once = TRUE;
1174 mstart = md->start_match_ptr; /* In case \K changed it */
1175 continue;
1178 /* See comment in the code for capturing groups above about handling
1179 THEN. */
1181 if (rrc == MATCH_THEN)
1183 next = ecode + GET(ecode,1);
1184 if (md->start_match_ptr < next &&
1185 (*ecode == OP_ALT || *next == OP_ALT))
1186 rrc = MATCH_NOMATCH;
1189 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1190 md->capture_last = save_capture_last;
1191 ecode += GET(ecode, 1);
1192 if (*ecode != OP_ALT) break;
1195 if (!matched_once)
1197 md->offset_vector[offset] = save_offset1;
1198 md->offset_vector[offset+1] = save_offset2;
1199 md->offset_vector[md->offset_end - number] = save_offset3;
1202 if (allow_zero || matched_once)
1204 ecode += 1 + LINK_SIZE;
1205 break;
1208 RRETURN(MATCH_NOMATCH);
1211 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1212 as a non-capturing bracket. */
1214 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1215 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1217 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1219 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1220 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1222 /* Non-capturing possessive bracket with unlimited repeat. We come here
1223 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1224 without the capturing complication. It is written out separately for speed
1225 and cleanliness. */
1227 case OP_BRAPOS:
1228 case OP_SBRAPOS:
1229 allow_zero = FALSE;
1231 POSSESSIVE_NON_CAPTURE:
1232 matched_once = FALSE;
1233 code_offset = (int)(ecode - md->start_code);
1234 save_capture_last = md->capture_last;
1236 for (;;)
1238 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1239 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1240 eptrb, RM48);
1241 if (rrc == MATCH_KETRPOS)
1243 offset_top = md->end_offset_top;
1244 eptr = md->end_match_ptr;
1245 ecode = md->start_code + code_offset;
1246 matched_once = TRUE;
1247 mstart = md->start_match_ptr; /* In case \K reset it */
1248 continue;
1251 /* See comment in the code for capturing groups above about handling
1252 THEN. */
1254 if (rrc == MATCH_THEN)
1256 next = ecode + GET(ecode,1);
1257 if (md->start_match_ptr < next &&
1258 (*ecode == OP_ALT || *next == OP_ALT))
1259 rrc = MATCH_NOMATCH;
1262 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1263 ecode += GET(ecode, 1);
1264 if (*ecode != OP_ALT) break;
1265 md->capture_last = save_capture_last;
1268 if (matched_once || allow_zero)
1270 ecode += 1 + LINK_SIZE;
1271 break;
1273 RRETURN(MATCH_NOMATCH);
1275 /* Control never reaches here. */
1277 /* Conditional group: compilation checked that there are no more than two
1278 branches. If the condition is false, skipping the first branch takes us
1279 past the end of the item if there is only one branch, but that's exactly
1280 what we want. */
1282 case OP_COND:
1283 case OP_SCOND:
1285 /* The variable codelink will be added to ecode when the condition is
1286 false, to get to the second branch. Setting it to the offset to the ALT
1287 or KET, then incrementing ecode achieves this effect. We now have ecode
1288 pointing to the condition or callout. */
1290 codelink = GET(ecode, 1); /* Offset to the second branch */
1291 ecode += 1 + LINK_SIZE; /* From this opcode */
1293 /* Because of the way auto-callout works during compile, a callout item is
1294 inserted between OP_COND and an assertion condition. */
1296 if (*ecode == OP_CALLOUT)
1298 if (PUBL(callout) != NULL)
1300 PUBL(callout_block) cb;
1301 cb.version = 2; /* Version 1 of the callout block */
1302 cb.callout_number = ecode[1];
1303 cb.offset_vector = md->offset_vector;
1304 #if defined COMPILE_PCRE8
1305 cb.subject = (PCRE_SPTR)md->start_subject;
1306 #elif defined COMPILE_PCRE16
1307 cb.subject = (PCRE_SPTR16)md->start_subject;
1308 #elif defined COMPILE_PCRE32
1309 cb.subject = (PCRE_SPTR32)md->start_subject;
1310 #endif
1311 cb.subject_length = (int)(md->end_subject - md->start_subject);
1312 cb.start_match = (int)(mstart - md->start_subject);
1313 cb.current_position = (int)(eptr - md->start_subject);
1314 cb.pattern_position = GET(ecode, 2);
1315 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1316 cb.capture_top = offset_top/2;
1317 cb.capture_last = md->capture_last & CAPLMASK;
1318 /* Internal change requires this for API compatibility. */
1319 if (cb.capture_last == 0) cb.capture_last = -1;
1320 cb.callout_data = md->callout_data;
1321 cb.mark = md->nomatch_mark;
1322 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1323 if (rrc < 0) RRETURN(rrc);
1326 /* Advance ecode past the callout, so it now points to the condition. We
1327 must adjust codelink so that the value of ecode+codelink is unchanged. */
1329 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1330 codelink -= PRIV(OP_lengths)[OP_CALLOUT];
1333 /* Test the various possible conditions */
1335 condition = FALSE;
1336 switch(condcode = *ecode)
1338 case OP_RREF: /* Numbered group recursion test */
1339 if (md->recursive != NULL) /* Not recursing => FALSE */
1341 unsigned int recno = GET2(ecode, 1); /* Recursion group number*/
1342 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1344 break;
1346 case OP_DNRREF: /* Duplicate named group recursion test */
1347 if (md->recursive != NULL)
1349 int count = GET2(ecode, 1 + IMM2_SIZE);
1350 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1351 while (count-- > 0)
1353 unsigned int recno = GET2(slot, 0);
1354 condition = recno == md->recursive->group_num;
1355 if (condition) break;
1356 slot += md->name_entry_size;
1359 break;
1361 case OP_CREF: /* Numbered group used test */
1362 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1363 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1364 break;
1366 case OP_DNCREF: /* Duplicate named group used test */
1368 int count = GET2(ecode, 1 + IMM2_SIZE);
1369 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1370 while (count-- > 0)
1372 offset = GET2(slot, 0) << 1;
1373 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1374 if (condition) break;
1375 slot += md->name_entry_size;
1378 break;
1380 case OP_DEF: /* DEFINE - always false */
1381 break;
1383 /* The condition is an assertion. Call match() to evaluate it - setting
1384 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end
1385 of an assertion. */
1387 default:
1388 md->match_function_type = MATCH_CONDASSERT;
1389 RMATCH(eptr, ecode, offset_top, md, NULL, RM3);
1390 if (rrc == MATCH_MATCH)
1392 if (md->end_offset_top > offset_top)
1393 offset_top = md->end_offset_top; /* Captures may have happened */
1394 condition = TRUE;
1396 /* Advance ecode past the assertion to the start of the first branch,
1397 but adjust it so that the general choosing code below works. */
1399 ecode += GET(ecode, 1);
1400 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1401 ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode];
1404 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1405 assertion; it is therefore treated as NOMATCH. Any other return is an
1406 error. */
1408 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1410 RRETURN(rrc); /* Need braces because of following else */
1412 break;
1415 /* Choose branch according to the condition */
1417 ecode += condition? PRIV(OP_lengths)[condcode] : codelink;
1419 /* We are now at the branch that is to be obeyed. As there is only one, we
1420 can use tail recursion to avoid using another stack frame, except when
1421 there is unlimited repeat of a possibly empty group. In the latter case, a
1422 recursive call to match() is always required, unless the second alternative
1423 doesn't exist, in which case we can just plough on. Note that, for
1424 compatibility with Perl, the | in a conditional group is NOT treated as
1425 creating two alternatives. If a THEN is encountered in the branch, it
1426 propagates out to the enclosing alternative (unless nested in a deeper set
1427 of alternatives, of course). */
1429 if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT)
1431 if (op != OP_SCOND)
1433 goto TAIL_RECURSE;
1436 md->match_function_type = MATCH_CBEGROUP;
1437 RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
1438 RRETURN(rrc);
1441 /* Condition false & no alternative; continue after the group. */
1443 else
1446 break;
1449 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1450 to close any currently open capturing brackets. */
1452 case OP_CLOSE:
1453 number = GET2(ecode, 1); /* Must be less than 65536 */
1454 offset = number << 1;
1456 #ifdef PCRE_DEBUG
1457 printf("end bracket %d at *ACCEPT", number);
1458 printf("\n");
1459 #endif
1461 md->capture_last = (md->capture_last & OVFLMASK) | number;
1462 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1464 md->offset_vector[offset] =
1465 md->offset_vector[md->offset_end - number];
1466 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1467 if (offset_top <= offset) offset_top = offset + 2;
1469 ecode += 1 + IMM2_SIZE;
1470 break;
1473 /* End of the pattern, either real or forced. */
1475 case OP_END:
1476 case OP_ACCEPT:
1477 case OP_ASSERT_ACCEPT:
1479 /* If we have matched an empty string, fail if not in an assertion and not
1480 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1481 is set and we have matched at the start of the subject. In both cases,
1482 backtracking will then try other alternatives, if any. */
1484 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1485 md->recursive == NULL &&
1486 (md->notempty ||
1487 (md->notempty_atstart &&
1488 mstart == md->start_subject + md->start_offset)))
1489 RRETURN(MATCH_NOMATCH);
1491 /* Otherwise, we have a match. */
1493 md->end_match_ptr = eptr; /* Record where we ended */
1494 md->end_offset_top = offset_top; /* and how many extracts were taken */
1495 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1497 /* For some reason, the macros don't work properly if an expression is
1498 given as the argument to RRETURN when the heap is in use. */
1500 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1501 RRETURN(rrc);
1503 /* Assertion brackets. Check the alternative branches in turn - the
1504 matching won't pass the KET for an assertion. If any one branch matches,
1505 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1506 start of each branch to move the current point backwards, so the code at
1507 this level is identical to the lookahead case. When the assertion is part
1508 of a condition, we want to return immediately afterwards. The caller of
1509 this incarnation of the match() function will have set MATCH_CONDASSERT in
1510 md->match_function_type, and one of these opcodes will be the first opcode
1511 that is processed. We use a local variable that is preserved over calls to
1512 match() to remember this case. */
1514 case OP_ASSERT:
1515 case OP_ASSERTBACK:
1516 save_mark = md->mark;
1517 if (md->match_function_type == MATCH_CONDASSERT)
1519 condassert = TRUE;
1520 md->match_function_type = 0;
1522 else condassert = FALSE;
1524 /* Loop for each branch */
1528 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1530 /* A match means that the assertion is true; break out of the loop
1531 that matches its alternatives. */
1533 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1535 mstart = md->start_match_ptr; /* In case \K reset it */
1536 break;
1539 /* If not matched, restore the previous mark setting. */
1541 md->mark = save_mark;
1543 /* See comment in the code for capturing groups above about handling
1544 THEN. */
1546 if (rrc == MATCH_THEN)
1548 next = ecode + GET(ecode,1);
1549 if (md->start_match_ptr < next &&
1550 (*ecode == OP_ALT || *next == OP_ALT))
1551 rrc = MATCH_NOMATCH;
1554 /* Anything other than NOMATCH causes the entire assertion to fail,
1555 passing back the return code. This includes COMMIT, SKIP, PRUNE and an
1556 uncaptured THEN, which means they take their normal effect. This
1557 consistent approach does not always have exactly the same effect as in
1558 Perl. */
1560 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1561 ecode += GET(ecode, 1);
1563 while (*ecode == OP_ALT); /* Continue for next alternative */
1565 /* If we have tried all the alternative branches, the assertion has
1566 failed. If not, we broke out after a match. */
1568 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1570 /* If checking an assertion for a condition, return MATCH_MATCH. */
1572 if (condassert) RRETURN(MATCH_MATCH);
1574 /* Continue from after a successful assertion, updating the offsets high
1575 water mark, since extracts may have been taken during the assertion. */
1577 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1578 ecode += 1 + LINK_SIZE;
1579 offset_top = md->end_offset_top;
1580 continue;
1582 /* Negative assertion: all branches must fail to match for the assertion to
1583 succeed. */
1585 case OP_ASSERT_NOT:
1586 case OP_ASSERTBACK_NOT:
1587 save_mark = md->mark;
1588 if (md->match_function_type == MATCH_CONDASSERT)
1590 condassert = TRUE;
1591 md->match_function_type = 0;
1593 else condassert = FALSE;
1595 /* Loop for each alternative branch. */
1599 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1600 md->mark = save_mark; /* Always restore the mark setting */
1602 switch(rrc)
1604 case MATCH_MATCH: /* A successful match means */
1605 case MATCH_ACCEPT: /* the assertion has failed. */
1606 RRETURN(MATCH_NOMATCH);
1608 case MATCH_NOMATCH: /* Carry on with next branch */
1609 break;
1611 /* See comment in the code for capturing groups above about handling
1612 THEN. */
1614 case MATCH_THEN:
1615 next = ecode + GET(ecode,1);
1616 if (md->start_match_ptr < next &&
1617 (*ecode == OP_ALT || *next == OP_ALT))
1619 rrc = MATCH_NOMATCH;
1620 break;
1622 /* Otherwise fall through. */
1624 /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole
1625 assertion to fail to match, without considering any more alternatives.
1626 Failing to match means the assertion is true. This is a consistent
1627 approach, but does not always have the same effect as in Perl. */
1629 case MATCH_COMMIT:
1630 case MATCH_SKIP:
1631 case MATCH_SKIP_ARG:
1632 case MATCH_PRUNE:
1633 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1634 goto NEG_ASSERT_TRUE; /* Break out of alternation loop */
1636 /* Anything else is an error */
1638 default:
1639 RRETURN(rrc);
1642 /* Continue with next branch */
1644 ecode += GET(ecode,1);
1646 while (*ecode == OP_ALT);
1648 /* All branches in the assertion failed to match. */
1650 NEG_ASSERT_TRUE:
1651 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1652 ecode += 1 + LINK_SIZE; /* Continue with current branch */
1653 continue;
1655 /* Move the subject pointer back. This occurs only at the start of
1656 each branch of a lookbehind assertion. If we are too close to the start to
1657 move back, this match function fails. When working with UTF-8 we move
1658 back a number of characters, not bytes. */
1660 case OP_REVERSE:
1661 #ifdef SUPPORT_UTF
1662 if (utf)
1664 i = GET(ecode, 1);
1665 while (i-- > 0)
1667 eptr--;
1668 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1669 BACKCHAR(eptr);
1672 else
1673 #endif
1675 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1678 eptr -= GET(ecode, 1);
1679 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1682 /* Save the earliest consulted character, then skip to next op code */
1684 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1685 ecode += 1 + LINK_SIZE;
1686 break;
1688 /* The callout item calls an external function, if one is provided, passing
1689 details of the match so far. This is mainly for debugging, though the
1690 function is able to force a failure. */
1692 case OP_CALLOUT:
1693 if (PUBL(callout) != NULL)
1695 PUBL(callout_block) cb;
1696 cb.version = 2; /* Version 1 of the callout block */
1697 cb.callout_number = ecode[1];
1698 cb.offset_vector = md->offset_vector;
1699 #if defined COMPILE_PCRE8
1700 cb.subject = (PCRE_SPTR)md->start_subject;
1701 #elif defined COMPILE_PCRE16
1702 cb.subject = (PCRE_SPTR16)md->start_subject;
1703 #elif defined COMPILE_PCRE32
1704 cb.subject = (PCRE_SPTR32)md->start_subject;
1705 #endif
1706 cb.subject_length = (int)(md->end_subject - md->start_subject);
1707 cb.start_match = (int)(mstart - md->start_subject);
1708 cb.current_position = (int)(eptr - md->start_subject);
1709 cb.pattern_position = GET(ecode, 2);
1710 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1711 cb.capture_top = offset_top/2;
1712 cb.capture_last = md->capture_last & CAPLMASK;
1713 /* Internal change requires this for API compatibility. */
1714 if (cb.capture_last == 0) cb.capture_last = -1;
1715 cb.callout_data = md->callout_data;
1716 cb.mark = md->nomatch_mark;
1717 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1718 if (rrc < 0) RRETURN(rrc);
1720 ecode += 2 + 2*LINK_SIZE;
1721 break;
1723 /* Recursion either matches the current regex, or some subexpression. The
1724 offset data is the offset to the starting bracket from the start of the
1725 whole pattern. (This is so that it works from duplicated subpatterns.)
1727 The state of the capturing groups is preserved over recursion, and
1728 re-instated afterwards. We don't know how many are started and not yet
1729 finished (offset_top records the completed total) so we just have to save
1730 all the potential data. There may be up to 65535 such values, which is too
1731 large to put on the stack, but using malloc for small numbers seems
1732 expensive. As a compromise, the stack is used when there are no more than
1733 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1735 There are also other values that have to be saved. We use a chained
1736 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1737 for the original version of this logic. It has, however, been hacked around
1738 a lot, so he is not to blame for the current way it works. */
1740 case OP_RECURSE:
1742 recursion_info *ri;
1743 unsigned int recno;
1745 callpat = md->start_code + GET(ecode, 1);
1746 recno = (callpat == md->start_code)? 0 :
1747 GET2(callpat, 1 + LINK_SIZE);
1749 /* Check for repeating a recursion without advancing the subject pointer.
1750 This should catch convoluted mutual recursions. (Some simple cases are
1751 caught at compile time.) */
1753 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1754 if (recno == ri->group_num && eptr == ri->subject_position)
1755 RRETURN(PCRE_ERROR_RECURSELOOP);
1757 /* Add to "recursing stack" */
1759 new_recursive.group_num = recno;
1760 new_recursive.saved_capture_last = md->capture_last;
1761 new_recursive.subject_position = eptr;
1762 new_recursive.prevrec = md->recursive;
1763 md->recursive = &new_recursive;
1765 /* Where to continue from afterwards */
1767 ecode += 1 + LINK_SIZE;
1769 /* Now save the offset data */
1771 new_recursive.saved_max = md->offset_end;
1772 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1773 new_recursive.offset_save = stacksave;
1774 else
1776 new_recursive.offset_save =
1777 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1778 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1780 memcpy(new_recursive.offset_save, md->offset_vector,
1781 new_recursive.saved_max * sizeof(int));
1783 /* OK, now we can do the recursion. After processing each alternative,
1784 restore the offset data and the last captured value. If there were nested
1785 recursions, md->recursive might be changed, so reset it before looping. */
1788 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1789 cbegroup = (*callpat >= OP_SBRA);
1792 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1793 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1794 md, eptrb, RM6);
1795 memcpy(md->offset_vector, new_recursive.offset_save,
1796 new_recursive.saved_max * sizeof(int));
1797 md->capture_last = new_recursive.saved_capture_last;
1798 md->recursive = new_recursive.prevrec;
1799 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1801 DPRINTF(("Recursion matched\n"));
1802 if (new_recursive.offset_save != stacksave)
1803 (PUBL(free))(new_recursive.offset_save);
1805 /* Set where we got to in the subject, and reset the start in case
1806 it was changed by \K. This *is* propagated back out of a recursion,
1807 for Perl compatibility. */
1809 eptr = md->end_match_ptr;
1810 mstart = md->start_match_ptr;
1811 goto RECURSION_MATCHED; /* Exit loop; end processing */
1814 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
1815 recursion; they cause a NOMATCH for the entire recursion. These codes
1816 are defined in a range that can be tested for. */
1818 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
1819 RRETURN(MATCH_NOMATCH);
1821 /* Any return code other than NOMATCH is an error. */
1823 if (rrc != MATCH_NOMATCH)
1825 DPRINTF(("Recursion gave error %d\n", rrc));
1826 if (new_recursive.offset_save != stacksave)
1827 (PUBL(free))(new_recursive.offset_save);
1828 RRETURN(rrc);
1831 md->recursive = &new_recursive;
1832 callpat += GET(callpat, 1);
1834 while (*callpat == OP_ALT);
1836 DPRINTF(("Recursion didn't match\n"));
1837 md->recursive = new_recursive.prevrec;
1838 if (new_recursive.offset_save != stacksave)
1839 (PUBL(free))(new_recursive.offset_save);
1840 RRETURN(MATCH_NOMATCH);
1843 RECURSION_MATCHED:
1844 break;
1846 /* An alternation is the end of a branch; scan along to find the end of the
1847 bracketed group and go to there. */
1849 case OP_ALT:
1850 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1851 break;
1853 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1854 indicating that it may occur zero times. It may repeat infinitely, or not
1855 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1856 with fixed upper repeat limits are compiled as a number of copies, with the
1857 optional ones preceded by BRAZERO or BRAMINZERO. */
1859 case OP_BRAZERO:
1860 next = ecode + 1;
1861 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1862 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1863 do next += GET(next, 1); while (*next == OP_ALT);
1864 ecode = next + 1 + LINK_SIZE;
1865 break;
1867 case OP_BRAMINZERO:
1868 next = ecode + 1;
1869 do next += GET(next, 1); while (*next == OP_ALT);
1870 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1871 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1872 ecode++;
1873 break;
1875 case OP_SKIPZERO:
1876 next = ecode+1;
1877 do next += GET(next,1); while (*next == OP_ALT);
1878 ecode = next + 1 + LINK_SIZE;
1879 break;
1881 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1882 here; just jump to the group, with allow_zero set TRUE. */
1884 case OP_BRAPOSZERO:
1885 op = *(++ecode);
1886 allow_zero = TRUE;
1887 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1888 goto POSSESSIVE_NON_CAPTURE;
1890 /* End of a group, repeated or non-repeating. */
1892 case OP_KET:
1893 case OP_KETRMIN:
1894 case OP_KETRMAX:
1895 case OP_KETRPOS:
1896 prev = ecode - GET(ecode, 1);
1898 /* If this was a group that remembered the subject start, in order to break
1899 infinite repeats of empty string matches, retrieve the subject start from
1900 the chain. Otherwise, set it NULL. */
1902 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1904 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1905 eptrb = eptrb->epb_prev; /* Backup to previous group */
1907 else saved_eptr = NULL;
1909 /* If we are at the end of an assertion group or a non-capturing atomic
1910 group, stop matching and return MATCH_MATCH, but record the current high
1911 water mark for use by positive assertions. We also need to record the match
1912 start in case it was changed by \K. */
1914 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1915 *prev == OP_ONCE_NC)
1917 md->end_match_ptr = eptr; /* For ONCE_NC */
1918 md->end_offset_top = offset_top;
1919 md->start_match_ptr = mstart;
1920 RRETURN(MATCH_MATCH); /* Sets md->mark */
1923 /* For capturing groups we have to check the group number back at the start
1924 and if necessary complete handling an extraction by setting the offsets and
1925 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1926 into group 0, so it won't be picked up here. Instead, we catch it when the
1927 OP_END is reached. Other recursion is handled here. We just have to record
1928 the current subject position and start match pointer and give a MATCH
1929 return. */
1931 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1932 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1934 number = GET2(prev, 1+LINK_SIZE);
1935 offset = number << 1;
1937 #ifdef PCRE_DEBUG
1938 printf("end bracket %d", number);
1939 printf("\n");
1940 #endif
1942 /* Handle a recursively called group. */
1944 if (md->recursive != NULL && md->recursive->group_num == number)
1946 md->end_match_ptr = eptr;
1947 md->start_match_ptr = mstart;
1948 RRETURN(MATCH_MATCH);
1951 /* Deal with capturing */
1953 md->capture_last = (md->capture_last & OVFLMASK) | number;
1954 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1956 /* If offset is greater than offset_top, it means that we are
1957 "skipping" a capturing group, and that group's offsets must be marked
1958 unset. In earlier versions of PCRE, all the offsets were unset at the
1959 start of matching, but this doesn't work because atomic groups and
1960 assertions can cause a value to be set that should later be unset.
1961 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1962 part of the atomic group, but this is not on the final matching path,
1963 so must be unset when 2 is set. (If there is no group 2, there is no
1964 problem, because offset_top will then be 2, indicating no capture.) */
1966 if (offset > offset_top)
1968 register int *iptr = md->offset_vector + offset_top;
1969 register int *iend = md->offset_vector + offset;
1970 while (iptr < iend) *iptr++ = -1;
1973 /* Now make the extraction */
1975 md->offset_vector[offset] =
1976 md->offset_vector[md->offset_end - number];
1977 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1978 if (offset_top <= offset) offset_top = offset + 2;
1982 /* For an ordinary non-repeating ket, just continue at this level. This
1983 also happens for a repeating ket if no characters were matched in the
1984 group. This is the forcible breaking of infinite loops as implemented in
1985 Perl 5.005. For a non-repeating atomic group that includes captures,
1986 establish a backup point by processing the rest of the pattern at a lower
1987 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1988 original OP_ONCE level, thereby bypassing intermediate backup points, but
1989 resetting any captures that happened along the way. */
1991 if (*ecode == OP_KET || eptr == saved_eptr)
1993 if (*prev == OP_ONCE)
1995 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1996 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1997 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1998 RRETURN(MATCH_ONCE);
2000 ecode += 1 + LINK_SIZE; /* Carry on at this level */
2001 break;
2004 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2005 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2006 at a time from the outer level, thus saving stack. */
2008 if (*ecode == OP_KETRPOS)
2010 md->start_match_ptr = mstart; /* In case \K reset it */
2011 md->end_match_ptr = eptr;
2012 md->end_offset_top = offset_top;
2013 RRETURN(MATCH_KETRPOS);
2016 /* The normal repeating kets try the rest of the pattern or restart from
2017 the preceding bracket, in the appropriate order. In the second case, we can
2018 use tail recursion to avoid using another stack frame, unless we have an
2019 atomic group or an unlimited repeat of a group that can match an empty
2020 string. */
2022 if (*ecode == OP_KETRMIN)
2024 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2025 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2026 if (*prev == OP_ONCE)
2028 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2029 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2030 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2031 RRETURN(MATCH_ONCE);
2033 if (*prev >= OP_SBRA) /* Could match an empty string */
2035 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2036 RRETURN(rrc);
2038 ecode = prev;
2039 goto TAIL_RECURSE;
2041 else /* OP_KETRMAX */
2043 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2044 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2045 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2046 if (*prev == OP_ONCE)
2048 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2049 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2050 md->once_target = prev;
2051 RRETURN(MATCH_ONCE);
2053 ecode += 1 + LINK_SIZE;
2054 goto TAIL_RECURSE;
2056 /* Control never gets here */
2058 /* Not multiline mode: start of subject assertion, unless notbol. */
2060 case OP_CIRC:
2061 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2063 /* Start of subject assertion */
2065 case OP_SOD:
2066 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2067 ecode++;
2068 break;
2070 /* Multiline mode: start of subject unless notbol, or after any newline. */
2072 case OP_CIRCM:
2073 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2074 if (eptr != md->start_subject &&
2075 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2076 RRETURN(MATCH_NOMATCH);
2077 ecode++;
2078 break;
2080 /* Start of match assertion */
2082 case OP_SOM:
2083 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2084 ecode++;
2085 break;
2087 /* Reset the start of match point */
2089 case OP_SET_SOM:
2090 mstart = eptr;
2091 ecode++;
2092 break;
2094 /* Multiline mode: assert before any newline, or before end of subject
2095 unless noteol is set. */
2097 case OP_DOLLM:
2098 if (eptr < md->end_subject)
2100 if (!IS_NEWLINE(eptr))
2102 if (md->partial != 0 &&
2103 eptr + 1 >= md->end_subject &&
2104 NLBLOCK->nltype == NLTYPE_FIXED &&
2105 NLBLOCK->nllen == 2 &&
2106 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2108 md->hitend = TRUE;
2109 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2111 RRETURN(MATCH_NOMATCH);
2114 else
2116 if (md->noteol) RRETURN(MATCH_NOMATCH);
2117 SCHECK_PARTIAL();
2119 ecode++;
2120 break;
2122 /* Not multiline mode: assert before a terminating newline or before end of
2123 subject unless noteol is set. */
2125 case OP_DOLL:
2126 if (md->noteol) RRETURN(MATCH_NOMATCH);
2127 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2129 /* ... else fall through for endonly */
2131 /* End of subject assertion (\z) */
2133 case OP_EOD:
2134 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2135 SCHECK_PARTIAL();
2136 ecode++;
2137 break;
2139 /* End of subject or ending \n assertion (\Z) */
2141 case OP_EODN:
2142 ASSERT_NL_OR_EOS:
2143 if (eptr < md->end_subject &&
2144 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2146 if (md->partial != 0 &&
2147 eptr + 1 >= md->end_subject &&
2148 NLBLOCK->nltype == NLTYPE_FIXED &&
2149 NLBLOCK->nllen == 2 &&
2150 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2152 md->hitend = TRUE;
2153 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2155 RRETURN(MATCH_NOMATCH);
2158 /* Either at end of string or \n before end. */
2160 SCHECK_PARTIAL();
2161 ecode++;
2162 break;
2164 /* Word boundary assertions */
2166 case OP_NOT_WORD_BOUNDARY:
2167 case OP_WORD_BOUNDARY:
2170 /* Find out if the previous and current characters are "word" characters.
2171 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2172 be "non-word" characters. Remember the earliest consulted character for
2173 partial matching. */
2175 #ifdef SUPPORT_UTF
2176 if (utf)
2178 /* Get status of previous character */
2180 if (eptr == md->start_subject) prev_is_word = FALSE; else
2182 PCRE_PUCHAR lastptr = eptr - 1;
2183 BACKCHAR(lastptr);
2184 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2185 GETCHAR(c, lastptr);
2186 #ifdef SUPPORT_UCP
2187 if (md->use_ucp)
2189 if (c == '_') prev_is_word = TRUE; else
2191 int cat = UCD_CATEGORY(c);
2192 prev_is_word = (cat == ucp_L || cat == ucp_N);
2195 else
2196 #endif
2197 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2200 /* Get status of next character */
2202 if (eptr >= md->end_subject)
2204 SCHECK_PARTIAL();
2205 cur_is_word = FALSE;
2207 else
2209 GETCHAR(c, eptr);
2210 #ifdef SUPPORT_UCP
2211 if (md->use_ucp)
2213 if (c == '_') cur_is_word = TRUE; else
2215 int cat = UCD_CATEGORY(c);
2216 cur_is_word = (cat == ucp_L || cat == ucp_N);
2219 else
2220 #endif
2221 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2224 else
2225 #endif
2227 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2228 consistency with the behaviour of \w we do use it in this case. */
2231 /* Get status of previous character */
2233 if (eptr == md->start_subject) prev_is_word = FALSE; else
2235 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2236 #ifdef SUPPORT_UCP
2237 if (md->use_ucp)
2239 c = eptr[-1];
2240 if (c == '_') prev_is_word = TRUE; else
2242 int cat = UCD_CATEGORY(c);
2243 prev_is_word = (cat == ucp_L || cat == ucp_N);
2246 else
2247 #endif
2248 prev_is_word = MAX_255(eptr[-1])
2249 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2252 /* Get status of next character */
2254 if (eptr >= md->end_subject)
2256 SCHECK_PARTIAL();
2257 cur_is_word = FALSE;
2259 else
2260 #ifdef SUPPORT_UCP
2261 if (md->use_ucp)
2263 c = *eptr;
2264 if (c == '_') cur_is_word = TRUE; else
2266 int cat = UCD_CATEGORY(c);
2267 cur_is_word = (cat == ucp_L || cat == ucp_N);
2270 else
2271 #endif
2272 cur_is_word = MAX_255(*eptr)
2273 && ((md->ctypes[*eptr] & ctype_word) != 0);
2276 /* Now see if the situation is what we want */
2278 if ((*ecode++ == OP_WORD_BOUNDARY)?
2279 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2280 RRETURN(MATCH_NOMATCH);
2282 break;
2284 /* Match any single character type except newline; have to take care with
2285 CRLF newlines and partial matching. */
2287 case OP_ANY:
2288 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2289 if (md->partial != 0 &&
2290 eptr + 1 >= md->end_subject &&
2291 NLBLOCK->nltype == NLTYPE_FIXED &&
2292 NLBLOCK->nllen == 2 &&
2293 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2295 md->hitend = TRUE;
2296 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2299 /* Fall through */
2301 /* Match any single character whatsoever. */
2303 case OP_ALLANY:
2304 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2305 { /* not be updated before SCHECK_PARTIAL. */
2306 SCHECK_PARTIAL();
2307 RRETURN(MATCH_NOMATCH);
2309 eptr++;
2310 #ifdef SUPPORT_UTF
2311 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2312 #endif
2313 ecode++;
2314 break;
2316 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2317 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2319 case OP_ANYBYTE:
2320 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2321 { /* not be updated before SCHECK_PARTIAL. */
2322 SCHECK_PARTIAL();
2323 RRETURN(MATCH_NOMATCH);
2325 eptr++;
2326 ecode++;
2327 break;
2329 case OP_NOT_DIGIT:
2330 if (eptr >= md->end_subject)
2332 SCHECK_PARTIAL();
2333 RRETURN(MATCH_NOMATCH);
2335 GETCHARINCTEST(c, eptr);
2336 if (
2337 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2338 c < 256 &&
2339 #endif
2340 (md->ctypes[c] & ctype_digit) != 0
2342 RRETURN(MATCH_NOMATCH);
2343 ecode++;
2344 break;
2346 case OP_DIGIT:
2347 if (eptr >= md->end_subject)
2349 SCHECK_PARTIAL();
2350 RRETURN(MATCH_NOMATCH);
2352 GETCHARINCTEST(c, eptr);
2353 if (
2354 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2355 c > 255 ||
2356 #endif
2357 (md->ctypes[c] & ctype_digit) == 0
2359 RRETURN(MATCH_NOMATCH);
2360 ecode++;
2361 break;
2363 case OP_NOT_WHITESPACE:
2364 if (eptr >= md->end_subject)
2366 SCHECK_PARTIAL();
2367 RRETURN(MATCH_NOMATCH);
2369 GETCHARINCTEST(c, eptr);
2370 if (
2371 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2372 c < 256 &&
2373 #endif
2374 (md->ctypes[c] & ctype_space) != 0
2376 RRETURN(MATCH_NOMATCH);
2377 ecode++;
2378 break;
2380 case OP_WHITESPACE:
2381 if (eptr >= md->end_subject)
2383 SCHECK_PARTIAL();
2384 RRETURN(MATCH_NOMATCH);
2386 GETCHARINCTEST(c, eptr);
2387 if (
2388 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2389 c > 255 ||
2390 #endif
2391 (md->ctypes[c] & ctype_space) == 0
2393 RRETURN(MATCH_NOMATCH);
2394 ecode++;
2395 break;
2397 case OP_NOT_WORDCHAR:
2398 if (eptr >= md->end_subject)
2400 SCHECK_PARTIAL();
2401 RRETURN(MATCH_NOMATCH);
2403 GETCHARINCTEST(c, eptr);
2404 if (
2405 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2406 c < 256 &&
2407 #endif
2408 (md->ctypes[c] & ctype_word) != 0
2410 RRETURN(MATCH_NOMATCH);
2411 ecode++;
2412 break;
2414 case OP_WORDCHAR:
2415 if (eptr >= md->end_subject)
2417 SCHECK_PARTIAL();
2418 RRETURN(MATCH_NOMATCH);
2420 GETCHARINCTEST(c, eptr);
2421 if (
2422 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2423 c > 255 ||
2424 #endif
2425 (md->ctypes[c] & ctype_word) == 0
2427 RRETURN(MATCH_NOMATCH);
2428 ecode++;
2429 break;
2431 case OP_ANYNL:
2432 if (eptr >= md->end_subject)
2434 SCHECK_PARTIAL();
2435 RRETURN(MATCH_NOMATCH);
2437 GETCHARINCTEST(c, eptr);
2438 switch(c)
2440 default: RRETURN(MATCH_NOMATCH);
2442 case CHAR_CR:
2443 if (eptr >= md->end_subject)
2445 SCHECK_PARTIAL();
2447 else if (UCHAR21TEST(eptr) == CHAR_LF) eptr++;
2448 break;
2450 case CHAR_LF:
2451 break;
2453 case CHAR_VT:
2454 case CHAR_FF:
2455 case CHAR_NEL:
2456 #ifndef EBCDIC
2457 case 0x2028:
2458 case 0x2029:
2459 #endif /* Not EBCDIC */
2460 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2461 break;
2463 ecode++;
2464 break;
2466 case OP_NOT_HSPACE:
2467 if (eptr >= md->end_subject)
2469 SCHECK_PARTIAL();
2470 RRETURN(MATCH_NOMATCH);
2472 GETCHARINCTEST(c, eptr);
2473 switch(c)
2475 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2476 default: break;
2478 ecode++;
2479 break;
2481 case OP_HSPACE:
2482 if (eptr >= md->end_subject)
2484 SCHECK_PARTIAL();
2485 RRETURN(MATCH_NOMATCH);
2487 GETCHARINCTEST(c, eptr);
2488 switch(c)
2490 HSPACE_CASES: break; /* Byte and multibyte cases */
2491 default: RRETURN(MATCH_NOMATCH);
2493 ecode++;
2494 break;
2496 case OP_NOT_VSPACE:
2497 if (eptr >= md->end_subject)
2499 SCHECK_PARTIAL();
2500 RRETURN(MATCH_NOMATCH);
2502 GETCHARINCTEST(c, eptr);
2503 switch(c)
2505 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2506 default: break;
2508 ecode++;
2509 break;
2511 case OP_VSPACE:
2512 if (eptr >= md->end_subject)
2514 SCHECK_PARTIAL();
2515 RRETURN(MATCH_NOMATCH);
2517 GETCHARINCTEST(c, eptr);
2518 switch(c)
2520 VSPACE_CASES: break;
2521 default: RRETURN(MATCH_NOMATCH);
2523 ecode++;
2524 break;
2526 #ifdef SUPPORT_UCP
2527 /* Check the next character by Unicode property. We will get here only
2528 if the support is in the binary; otherwise a compile-time error occurs. */
2530 case OP_PROP:
2531 case OP_NOTPROP:
2532 if (eptr >= md->end_subject)
2534 SCHECK_PARTIAL();
2535 RRETURN(MATCH_NOMATCH);
2537 GETCHARINCTEST(c, eptr);
2539 const pcre_uint32 *cp;
2540 const ucd_record *prop = GET_UCD(c);
2542 switch(ecode[1])
2544 case PT_ANY:
2545 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2546 break;
2548 case PT_LAMP:
2549 if ((prop->chartype == ucp_Lu ||
2550 prop->chartype == ucp_Ll ||
2551 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2552 RRETURN(MATCH_NOMATCH);
2553 break;
2555 case PT_GC:
2556 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2557 RRETURN(MATCH_NOMATCH);
2558 break;
2560 case PT_PC:
2561 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2562 RRETURN(MATCH_NOMATCH);
2563 break;
2565 case PT_SC:
2566 if ((ecode[2] != prop->script) == (op == OP_PROP))
2567 RRETURN(MATCH_NOMATCH);
2568 break;
2570 /* These are specials */
2572 case PT_ALNUM:
2573 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2574 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2575 RRETURN(MATCH_NOMATCH);
2576 break;
2578 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2579 which means that Perl space and POSIX space are now identical. PCRE
2580 was changed at release 8.34. */
2582 case PT_SPACE: /* Perl space */
2583 case PT_PXSPACE: /* POSIX space */
2584 switch(c)
2586 HSPACE_CASES:
2587 VSPACE_CASES:
2588 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2589 break;
2591 default:
2592 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
2593 (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
2594 break;
2596 break;
2598 case PT_WORD:
2599 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2600 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2601 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2602 RRETURN(MATCH_NOMATCH);
2603 break;
2605 case PT_CLIST:
2606 cp = PRIV(ucd_caseless_sets) + ecode[2];
2607 for (;;)
2609 if (c < *cp)
2610 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2611 if (c == *cp++)
2612 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2614 break;
2616 case PT_UCNC:
2617 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2618 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2619 c >= 0xe000) == (op == OP_NOTPROP))
2620 RRETURN(MATCH_NOMATCH);
2621 break;
2623 /* This should never occur */
2625 default:
2626 RRETURN(PCRE_ERROR_INTERNAL);
2629 ecode += 3;
2631 break;
2633 /* Match an extended Unicode sequence. We will get here only if the support
2634 is in the binary; otherwise a compile-time error occurs. */
2636 case OP_EXTUNI:
2637 if (eptr >= md->end_subject)
2639 SCHECK_PARTIAL();
2640 RRETURN(MATCH_NOMATCH);
2642 else
2644 int lgb, rgb;
2645 GETCHARINCTEST(c, eptr);
2646 lgb = UCD_GRAPHBREAK(c);
2647 while (eptr < md->end_subject)
2649 int len = 1;
2650 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2651 rgb = UCD_GRAPHBREAK(c);
2652 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2653 lgb = rgb;
2654 eptr += len;
2657 CHECK_PARTIAL();
2658 ecode++;
2659 break;
2660 #endif /* SUPPORT_UCP */
2663 /* Match a back reference, possibly repeatedly. Look past the end of the
2664 item to see if there is repeat information following. The code is similar
2665 to that for character classes, but repeated for efficiency. Then obey
2666 similar code to character type repeats - written out again for speed.
2667 However, if the referenced string is the empty string, always treat
2668 it as matched, any number of times (otherwise there could be infinite
2669 loops). If the reference is unset, there are two possibilities:
2671 (a) In the default, Perl-compatible state, set the length negative;
2672 this ensures that every attempt at a match fails. We can't just fail
2673 here, because of the possibility of quantifiers with zero minima.
2675 (b) If the JavaScript compatibility flag is set, set the length to zero
2676 so that the back reference matches an empty string.
2678 Otherwise, set the length to the length of what was matched by the
2679 referenced subpattern.
2681 The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
2682 or to a non-duplicated named group. For a duplicated named group, OP_DNREF
2683 and OP_DNREFI are used. In this case we must scan the list of groups to
2684 which the name refers, and use the first one that is set. */
2686 case OP_DNREF:
2687 case OP_DNREFI:
2688 caseless = op == OP_DNREFI;
2690 int count = GET2(ecode, 1+IMM2_SIZE);
2691 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
2692 ecode += 1 + 2*IMM2_SIZE;
2694 /* Setting the default length first and initializing 'offset' avoids
2695 compiler warnings in the REF_REPEAT code. */
2697 length = (md->jscript_compat)? 0 : -1;
2698 offset = 0;
2700 while (count-- > 0)
2702 offset = GET2(slot, 0) << 1;
2703 if (offset < offset_top && md->offset_vector[offset] >= 0)
2705 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2706 break;
2708 slot += md->name_entry_size;
2711 goto REF_REPEAT;
2713 case OP_REF:
2714 case OP_REFI:
2715 caseless = op == OP_REFI;
2716 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2717 ecode += 1 + IMM2_SIZE;
2718 if (offset >= offset_top || md->offset_vector[offset] < 0)
2719 length = (md->jscript_compat)? 0 : -1;
2720 else
2721 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2723 /* Set up for repetition, or handle the non-repeated case */
2725 REF_REPEAT:
2726 switch (*ecode)
2728 case OP_CRSTAR:
2729 case OP_CRMINSTAR:
2730 case OP_CRPLUS:
2731 case OP_CRMINPLUS:
2732 case OP_CRQUERY:
2733 case OP_CRMINQUERY:
2734 c = *ecode++ - OP_CRSTAR;
2735 minimize = (c & 1) != 0;
2736 min = rep_min[c]; /* Pick up values from tables; */
2737 max = rep_max[c]; /* zero for max => infinity */
2738 if (max == 0) max = INT_MAX;
2739 break;
2741 case OP_CRRANGE:
2742 case OP_CRMINRANGE:
2743 minimize = (*ecode == OP_CRMINRANGE);
2744 min = GET2(ecode, 1);
2745 max = GET2(ecode, 1 + IMM2_SIZE);
2746 if (max == 0) max = INT_MAX;
2747 ecode += 1 + 2 * IMM2_SIZE;
2748 break;
2750 default: /* No repeat follows */
2751 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2753 if (length == -2) eptr = md->end_subject; /* Partial match */
2754 CHECK_PARTIAL();
2755 RRETURN(MATCH_NOMATCH);
2757 eptr += length;
2758 continue; /* With the main loop */
2761 /* Handle repeated back references. If the length of the reference is
2762 zero, just continue with the main loop. If the length is negative, it
2763 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2764 zero, we can continue at the same level without recursion. For any other
2765 minimum, carrying on will result in NOMATCH. */
2767 if (length == 0) continue;
2768 if (length < 0 && min == 0) continue;
2770 /* First, ensure the minimum number of matches are present. We get back
2771 the length of the reference string explicitly rather than passing the
2772 address of eptr, so that eptr can be a register variable. */
2774 for (i = 1; i <= min; i++)
2776 int slength;
2777 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2779 if (slength == -2) eptr = md->end_subject; /* Partial match */
2780 CHECK_PARTIAL();
2781 RRETURN(MATCH_NOMATCH);
2783 eptr += slength;
2786 /* If min = max, continue at the same level without recursion.
2787 They are not both allowed to be zero. */
2789 if (min == max) continue;
2791 /* If minimizing, keep trying and advancing the pointer */
2793 if (minimize)
2795 for (fi = min;; fi++)
2797 int slength;
2798 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2799 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2800 if (fi >= max) RRETURN(MATCH_NOMATCH);
2801 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2803 if (slength == -2) eptr = md->end_subject; /* Partial match */
2804 CHECK_PARTIAL();
2805 RRETURN(MATCH_NOMATCH);
2807 eptr += slength;
2809 /* Control never gets here */
2812 /* If maximizing, find the longest string and work backwards */
2814 else
2816 pp = eptr;
2817 for (i = min; i < max; i++)
2819 int slength;
2820 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2822 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2823 the soft partial matching case. */
2825 if (slength == -2 && md->partial != 0 &&
2826 md->end_subject > md->start_used_ptr)
2828 md->hitend = TRUE;
2829 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2831 break;
2833 eptr += slength;
2836 while (eptr >= pp)
2838 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2839 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2840 eptr -= length;
2842 RRETURN(MATCH_NOMATCH);
2844 /* Control never gets here */
2846 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2847 used when all the characters in the class have values in the range 0-255,
2848 and either the matching is caseful, or the characters are in the range
2849 0-127 when UTF-8 processing is enabled. The only difference between
2850 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2851 encountered.
2853 First, look past the end of the item to see if there is repeat information
2854 following. Then obey similar code to character type repeats - written out
2855 again for speed. */
2857 case OP_NCLASS:
2858 case OP_CLASS:
2860 /* The data variable is saved across frames, so the byte map needs to
2861 be stored there. */
2862 #define BYTE_MAP ((pcre_uint8 *)data)
2863 data = ecode + 1; /* Save for matching */
2864 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2866 switch (*ecode)
2868 case OP_CRSTAR:
2869 case OP_CRMINSTAR:
2870 case OP_CRPLUS:
2871 case OP_CRMINPLUS:
2872 case OP_CRQUERY:
2873 case OP_CRMINQUERY:
2874 case OP_CRPOSSTAR:
2875 case OP_CRPOSPLUS:
2876 case OP_CRPOSQUERY:
2877 c = *ecode++ - OP_CRSTAR;
2878 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
2879 else possessive = TRUE;
2880 min = rep_min[c]; /* Pick up values from tables; */
2881 max = rep_max[c]; /* zero for max => infinity */
2882 if (max == 0) max = INT_MAX;
2883 break;
2885 case OP_CRRANGE:
2886 case OP_CRMINRANGE:
2887 case OP_CRPOSRANGE:
2888 minimize = (*ecode == OP_CRMINRANGE);
2889 possessive = (*ecode == OP_CRPOSRANGE);
2890 min = GET2(ecode, 1);
2891 max = GET2(ecode, 1 + IMM2_SIZE);
2892 if (max == 0) max = INT_MAX;
2893 ecode += 1 + 2 * IMM2_SIZE;
2894 break;
2896 default: /* No repeat follows */
2897 min = max = 1;
2898 break;
2901 /* First, ensure the minimum number of matches are present. */
2903 #ifdef SUPPORT_UTF
2904 if (utf)
2906 for (i = 1; i <= min; i++)
2908 if (eptr >= md->end_subject)
2910 SCHECK_PARTIAL();
2911 RRETURN(MATCH_NOMATCH);
2913 GETCHARINC(c, eptr);
2914 if (c > 255)
2916 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2918 else
2919 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2922 else
2923 #endif
2924 /* Not UTF mode */
2926 for (i = 1; i <= min; i++)
2928 if (eptr >= md->end_subject)
2930 SCHECK_PARTIAL();
2931 RRETURN(MATCH_NOMATCH);
2933 c = *eptr++;
2934 #ifndef COMPILE_PCRE8
2935 if (c > 255)
2937 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2939 else
2940 #endif
2941 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2945 /* If max == min we can continue with the main loop without the
2946 need to recurse. */
2948 if (min == max) continue;
2950 /* If minimizing, keep testing the rest of the expression and advancing
2951 the pointer while it matches the class. */
2953 if (minimize)
2955 #ifdef SUPPORT_UTF
2956 if (utf)
2958 for (fi = min;; fi++)
2960 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2961 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2962 if (fi >= max) RRETURN(MATCH_NOMATCH);
2963 if (eptr >= md->end_subject)
2965 SCHECK_PARTIAL();
2966 RRETURN(MATCH_NOMATCH);
2968 GETCHARINC(c, eptr);
2969 if (c > 255)
2971 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2973 else
2974 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2977 else
2978 #endif
2979 /* Not UTF mode */
2981 for (fi = min;; fi++)
2983 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2984 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2985 if (fi >= max) RRETURN(MATCH_NOMATCH);
2986 if (eptr >= md->end_subject)
2988 SCHECK_PARTIAL();
2989 RRETURN(MATCH_NOMATCH);
2991 c = *eptr++;
2992 #ifndef COMPILE_PCRE8
2993 if (c > 255)
2995 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2997 else
2998 #endif
2999 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3002 /* Control never gets here */
3005 /* If maximizing, find the longest possible run, then work backwards. */
3007 else
3009 pp = eptr;
3011 #ifdef SUPPORT_UTF
3012 if (utf)
3014 for (i = min; i < max; i++)
3016 int len = 1;
3017 if (eptr >= md->end_subject)
3019 SCHECK_PARTIAL();
3020 break;
3022 GETCHARLEN(c, eptr, len);
3023 if (c > 255)
3025 if (op == OP_CLASS) break;
3027 else
3028 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3029 eptr += len;
3032 if (possessive) continue; /* No backtracking */
3034 for (;;)
3036 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3037 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3038 if (eptr-- == pp) break; /* Stop if tried at original pos */
3039 BACKCHAR(eptr);
3042 else
3043 #endif
3044 /* Not UTF mode */
3046 for (i = min; i < max; i++)
3048 if (eptr >= md->end_subject)
3050 SCHECK_PARTIAL();
3051 break;
3053 c = *eptr;
3054 #ifndef COMPILE_PCRE8
3055 if (c > 255)
3057 if (op == OP_CLASS) break;
3059 else
3060 #endif
3061 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3062 eptr++;
3065 if (possessive) continue; /* No backtracking */
3067 while (eptr >= pp)
3069 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3070 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3071 eptr--;
3075 RRETURN(MATCH_NOMATCH);
3077 #undef BYTE_MAP
3079 /* Control never gets here */
3082 /* Match an extended character class. In the 8-bit library, this opcode is
3083 encountered only when UTF-8 mode is supported. In the 16-bit and
3084 32-bit libraries, codepoints greater than 255 may be encountered even when
3085 UTF is not supported. */
3087 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3088 case OP_XCLASS:
3090 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3091 ecode += GET(ecode, 1); /* Advance past the item */
3093 switch (*ecode)
3095 case OP_CRSTAR:
3096 case OP_CRMINSTAR:
3097 case OP_CRPLUS:
3098 case OP_CRMINPLUS:
3099 case OP_CRQUERY:
3100 case OP_CRMINQUERY:
3101 case OP_CRPOSSTAR:
3102 case OP_CRPOSPLUS:
3103 case OP_CRPOSQUERY:
3104 c = *ecode++ - OP_CRSTAR;
3105 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
3106 else possessive = TRUE;
3107 min = rep_min[c]; /* Pick up values from tables; */
3108 max = rep_max[c]; /* zero for max => infinity */
3109 if (max == 0) max = INT_MAX;
3110 break;
3112 case OP_CRRANGE:
3113 case OP_CRMINRANGE:
3114 case OP_CRPOSRANGE:
3115 minimize = (*ecode == OP_CRMINRANGE);
3116 possessive = (*ecode == OP_CRPOSRANGE);
3117 min = GET2(ecode, 1);
3118 max = GET2(ecode, 1 + IMM2_SIZE);
3119 if (max == 0) max = INT_MAX;
3120 ecode += 1 + 2 * IMM2_SIZE;
3121 break;
3123 default: /* No repeat follows */
3124 min = max = 1;
3125 break;
3128 /* First, ensure the minimum number of matches are present. */
3130 for (i = 1; i <= min; i++)
3132 if (eptr >= md->end_subject)
3134 SCHECK_PARTIAL();
3135 RRETURN(MATCH_NOMATCH);
3137 GETCHARINCTEST(c, eptr);
3138 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3141 /* If max == min we can continue with the main loop without the
3142 need to recurse. */
3144 if (min == max) continue;
3146 /* If minimizing, keep testing the rest of the expression and advancing
3147 the pointer while it matches the class. */
3149 if (minimize)
3151 for (fi = min;; fi++)
3153 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3154 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3155 if (fi >= max) RRETURN(MATCH_NOMATCH);
3156 if (eptr >= md->end_subject)
3158 SCHECK_PARTIAL();
3159 RRETURN(MATCH_NOMATCH);
3161 GETCHARINCTEST(c, eptr);
3162 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3164 /* Control never gets here */
3167 /* If maximizing, find the longest possible run, then work backwards. */
3169 else
3171 pp = eptr;
3172 for (i = min; i < max; i++)
3174 int len = 1;
3175 if (eptr >= md->end_subject)
3177 SCHECK_PARTIAL();
3178 break;
3180 #ifdef SUPPORT_UTF
3181 GETCHARLENTEST(c, eptr, len);
3182 #else
3183 c = *eptr;
3184 #endif
3185 if (!PRIV(xclass)(c, data, utf)) break;
3186 eptr += len;
3189 if (possessive) continue; /* No backtracking */
3191 for(;;)
3193 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3194 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3195 if (eptr-- == pp) break; /* Stop if tried at original pos */
3196 #ifdef SUPPORT_UTF
3197 if (utf) BACKCHAR(eptr);
3198 #endif
3200 RRETURN(MATCH_NOMATCH);
3203 /* Control never gets here */
3205 #endif /* End of XCLASS */
3207 /* Match a single character, casefully */
3209 case OP_CHAR:
3210 #ifdef SUPPORT_UTF
3211 if (utf)
3213 length = 1;
3214 ecode++;
3215 GETCHARLEN(fc, ecode, length);
3216 if (length > md->end_subject - eptr)
3218 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3219 RRETURN(MATCH_NOMATCH);
3221 while (length-- > 0) if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH);
3223 else
3224 #endif
3225 /* Not UTF mode */
3227 if (md->end_subject - eptr < 1)
3229 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3230 RRETURN(MATCH_NOMATCH);
3232 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3233 ecode += 2;
3235 break;
3237 /* Match a single character, caselessly. If we are at the end of the
3238 subject, give up immediately. */
3240 case OP_CHARI:
3241 if (eptr >= md->end_subject)
3243 SCHECK_PARTIAL();
3244 RRETURN(MATCH_NOMATCH);
3247 #ifdef SUPPORT_UTF
3248 if (utf)
3250 length = 1;
3251 ecode++;
3252 GETCHARLEN(fc, ecode, length);
3254 /* If the pattern character's value is < 128, we have only one byte, and
3255 we know that its other case must also be one byte long, so we can use the
3256 fast lookup table. We know that there is at least one byte left in the
3257 subject. */
3259 if (fc < 128)
3261 pcre_uint32 cc = UCHAR21(eptr);
3262 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3263 ecode++;
3264 eptr++;
3267 /* Otherwise we must pick up the subject character. Note that we cannot
3268 use the value of "length" to check for sufficient bytes left, because the
3269 other case of the character may have more or fewer bytes. */
3271 else
3273 pcre_uint32 dc;
3274 GETCHARINC(dc, eptr);
3275 ecode += length;
3277 /* If we have Unicode property support, we can use it to test the other
3278 case of the character, if there is one. */
3280 if (fc != dc)
3282 #ifdef SUPPORT_UCP
3283 if (dc != UCD_OTHERCASE(fc))
3284 #endif
3285 RRETURN(MATCH_NOMATCH);
3289 else
3290 #endif /* SUPPORT_UTF */
3292 /* Not UTF mode */
3294 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3295 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3296 eptr++;
3297 ecode += 2;
3299 break;
3301 /* Match a single character repeatedly. */
3303 case OP_EXACT:
3304 case OP_EXACTI:
3305 min = max = GET2(ecode, 1);
3306 ecode += 1 + IMM2_SIZE;
3307 goto REPEATCHAR;
3309 case OP_POSUPTO:
3310 case OP_POSUPTOI:
3311 possessive = TRUE;
3312 /* Fall through */
3314 case OP_UPTO:
3315 case OP_UPTOI:
3316 case OP_MINUPTO:
3317 case OP_MINUPTOI:
3318 min = 0;
3319 max = GET2(ecode, 1);
3320 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3321 ecode += 1 + IMM2_SIZE;
3322 goto REPEATCHAR;
3324 case OP_POSSTAR:
3325 case OP_POSSTARI:
3326 possessive = TRUE;
3327 min = 0;
3328 max = INT_MAX;
3329 ecode++;
3330 goto REPEATCHAR;
3332 case OP_POSPLUS:
3333 case OP_POSPLUSI:
3334 possessive = TRUE;
3335 min = 1;
3336 max = INT_MAX;
3337 ecode++;
3338 goto REPEATCHAR;
3340 case OP_POSQUERY:
3341 case OP_POSQUERYI:
3342 possessive = TRUE;
3343 min = 0;
3344 max = 1;
3345 ecode++;
3346 goto REPEATCHAR;
3348 case OP_STAR:
3349 case OP_STARI:
3350 case OP_MINSTAR:
3351 case OP_MINSTARI:
3352 case OP_PLUS:
3353 case OP_PLUSI:
3354 case OP_MINPLUS:
3355 case OP_MINPLUSI:
3356 case OP_QUERY:
3357 case OP_QUERYI:
3358 case OP_MINQUERY:
3359 case OP_MINQUERYI:
3360 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3361 minimize = (c & 1) != 0;
3362 min = rep_min[c]; /* Pick up values from tables; */
3363 max = rep_max[c]; /* zero for max => infinity */
3364 if (max == 0) max = INT_MAX;
3366 /* Common code for all repeated single-character matches. We first check
3367 for the minimum number of characters. If the minimum equals the maximum, we
3368 are done. Otherwise, if minimizing, check the rest of the pattern for a
3369 match; if there isn't one, advance up to the maximum, one character at a
3370 time.
3372 If maximizing, advance up to the maximum number of matching characters,
3373 until eptr is past the end of the maximum run. If possessive, we are
3374 then done (no backing up). Otherwise, match at this position; anything
3375 other than no match is immediately returned. For nomatch, back up one
3376 character, unless we are matching \R and the last thing matched was
3377 \r\n, in which case, back up two bytes. When we reach the first optional
3378 character position, we can save stack by doing a tail recurse.
3380 The various UTF/non-UTF and caseful/caseless cases are handled separately,
3381 for speed. */
3383 REPEATCHAR:
3384 #ifdef SUPPORT_UTF
3385 if (utf)
3387 length = 1;
3388 charptr = ecode;
3389 GETCHARLEN(fc, ecode, length);
3390 ecode += length;
3392 /* Handle multibyte character matching specially here. There is
3393 support for caseless matching if UCP support is present. */
3395 if (length > 1)
3397 #ifdef SUPPORT_UCP
3398 pcre_uint32 othercase;
3399 if (op >= OP_STARI && /* Caseless */
3400 (othercase = UCD_OTHERCASE(fc)) != fc)
3401 oclength = PRIV(ord2utf)(othercase, occhars);
3402 else oclength = 0;
3403 #endif /* SUPPORT_UCP */
3405 for (i = 1; i <= min; i++)
3407 if (eptr <= md->end_subject - length &&
3408 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3409 #ifdef SUPPORT_UCP
3410 else if (oclength > 0 &&
3411 eptr <= md->end_subject - oclength &&
3412 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3413 #endif /* SUPPORT_UCP */
3414 else
3416 CHECK_PARTIAL();
3417 RRETURN(MATCH_NOMATCH);
3421 if (min == max) continue;
3423 if (minimize)
3425 for (fi = min;; fi++)
3427 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3428 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3429 if (fi >= max) RRETURN(MATCH_NOMATCH);
3430 if (eptr <= md->end_subject - length &&
3431 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3432 #ifdef SUPPORT_UCP
3433 else if (oclength > 0 &&
3434 eptr <= md->end_subject - oclength &&
3435 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3436 #endif /* SUPPORT_UCP */
3437 else
3439 CHECK_PARTIAL();
3440 RRETURN(MATCH_NOMATCH);
3443 /* Control never gets here */
3446 else /* Maximize */
3448 pp = eptr;
3449 for (i = min; i < max; i++)
3451 if (eptr <= md->end_subject - length &&
3452 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3453 #ifdef SUPPORT_UCP
3454 else if (oclength > 0 &&
3455 eptr <= md->end_subject - oclength &&
3456 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3457 #endif /* SUPPORT_UCP */
3458 else
3460 CHECK_PARTIAL();
3461 break;
3465 if (possessive) continue; /* No backtracking */
3466 for(;;)
3468 if (eptr == pp) goto TAIL_RECURSE;
3469 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3470 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3471 #ifdef SUPPORT_UCP
3472 eptr--;
3473 BACKCHAR(eptr);
3474 #else /* without SUPPORT_UCP */
3475 eptr -= length;
3476 #endif /* SUPPORT_UCP */
3479 /* Control never gets here */
3482 /* If the length of a UTF-8 character is 1, we fall through here, and
3483 obey the code as for non-UTF-8 characters below, though in this case the
3484 value of fc will always be < 128. */
3486 else
3487 #endif /* SUPPORT_UTF */
3488 /* When not in UTF-8 mode, load a single-byte character. */
3489 fc = *ecode++;
3491 /* The value of fc at this point is always one character, though we may
3492 or may not be in UTF mode. The code is duplicated for the caseless and
3493 caseful cases, for speed, since matching characters is likely to be quite
3494 common. First, ensure the minimum number of matches are present. If min =
3495 max, continue at the same level without recursing. Otherwise, if
3496 minimizing, keep trying the rest of the expression and advancing one
3497 matching character if failing, up to the maximum. Alternatively, if
3498 maximizing, find the maximum number of characters and work backwards. */
3500 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3501 max, (char *)eptr));
3503 if (op >= OP_STARI) /* Caseless */
3505 #ifdef COMPILE_PCRE8
3506 /* fc must be < 128 if UTF is enabled. */
3507 foc = md->fcc[fc];
3508 #else
3509 #ifdef SUPPORT_UTF
3510 #ifdef SUPPORT_UCP
3511 if (utf && fc > 127)
3512 foc = UCD_OTHERCASE(fc);
3513 #else
3514 if (utf && fc > 127)
3515 foc = fc;
3516 #endif /* SUPPORT_UCP */
3517 else
3518 #endif /* SUPPORT_UTF */
3519 foc = TABLE_GET(fc, md->fcc, fc);
3520 #endif /* COMPILE_PCRE8 */
3522 for (i = 1; i <= min; i++)
3524 pcre_uint32 cc; /* Faster than pcre_uchar */
3525 if (eptr >= md->end_subject)
3527 SCHECK_PARTIAL();
3528 RRETURN(MATCH_NOMATCH);
3530 cc = UCHAR21TEST(eptr);
3531 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3532 eptr++;
3534 if (min == max) continue;
3535 if (minimize)
3537 for (fi = min;; fi++)
3539 pcre_uint32 cc; /* Faster than pcre_uchar */
3540 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3541 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3542 if (fi >= max) RRETURN(MATCH_NOMATCH);
3543 if (eptr >= md->end_subject)
3545 SCHECK_PARTIAL();
3546 RRETURN(MATCH_NOMATCH);
3548 cc = UCHAR21TEST(eptr);
3549 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3550 eptr++;
3552 /* Control never gets here */
3554 else /* Maximize */
3556 pp = eptr;
3557 for (i = min; i < max; i++)
3559 pcre_uint32 cc; /* Faster than pcre_uchar */
3560 if (eptr >= md->end_subject)
3562 SCHECK_PARTIAL();
3563 break;
3565 cc = UCHAR21TEST(eptr);
3566 if (fc != cc && foc != cc) break;
3567 eptr++;
3569 if (possessive) continue; /* No backtracking */
3570 for (;;)
3572 if (eptr == pp) goto TAIL_RECURSE;
3573 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3574 eptr--;
3575 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3577 /* Control never gets here */
3581 /* Caseful comparisons (includes all multi-byte characters) */
3583 else
3585 for (i = 1; i <= min; i++)
3587 if (eptr >= md->end_subject)
3589 SCHECK_PARTIAL();
3590 RRETURN(MATCH_NOMATCH);
3592 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3595 if (min == max) continue;
3597 if (minimize)
3599 for (fi = min;; fi++)
3601 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3602 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3603 if (fi >= max) RRETURN(MATCH_NOMATCH);
3604 if (eptr >= md->end_subject)
3606 SCHECK_PARTIAL();
3607 RRETURN(MATCH_NOMATCH);
3609 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3611 /* Control never gets here */
3613 else /* Maximize */
3615 pp = eptr;
3616 for (i = min; i < max; i++)
3618 if (eptr >= md->end_subject)
3620 SCHECK_PARTIAL();
3621 break;
3623 if (fc != UCHAR21TEST(eptr)) break;
3624 eptr++;
3626 if (possessive) continue; /* No backtracking */
3627 for (;;)
3629 if (eptr == pp) goto TAIL_RECURSE;
3630 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3631 eptr--;
3632 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3634 /* Control never gets here */
3637 /* Control never gets here */
3639 /* Match a negated single one-byte character. The character we are
3640 checking can be multibyte. */
3642 case OP_NOT:
3643 case OP_NOTI:
3644 if (eptr >= md->end_subject)
3646 SCHECK_PARTIAL();
3647 RRETURN(MATCH_NOMATCH);
3649 #ifdef SUPPORT_UTF
3650 if (utf)
3652 register pcre_uint32 ch, och;
3654 ecode++;
3655 GETCHARINC(ch, ecode);
3656 GETCHARINC(c, eptr);
3658 if (op == OP_NOT)
3660 if (ch == c) RRETURN(MATCH_NOMATCH);
3662 else
3664 #ifdef SUPPORT_UCP
3665 if (ch > 127)
3666 och = UCD_OTHERCASE(ch);
3667 #else
3668 if (ch > 127)
3669 och = ch;
3670 #endif /* SUPPORT_UCP */
3671 else
3672 och = TABLE_GET(ch, md->fcc, ch);
3673 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3676 else
3677 #endif
3679 register pcre_uint32 ch = ecode[1];
3680 c = *eptr++;
3681 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3682 RRETURN(MATCH_NOMATCH);
3683 ecode += 2;
3685 break;
3687 /* Match a negated single one-byte character repeatedly. This is almost a
3688 repeat of the code for a repeated single character, but I haven't found a
3689 nice way of commoning these up that doesn't require a test of the
3690 positive/negative option for each character match. Maybe that wouldn't add
3691 very much to the time taken, but character matching *is* what this is all
3692 about... */
3694 case OP_NOTEXACT:
3695 case OP_NOTEXACTI:
3696 min = max = GET2(ecode, 1);
3697 ecode += 1 + IMM2_SIZE;
3698 goto REPEATNOTCHAR;
3700 case OP_NOTUPTO:
3701 case OP_NOTUPTOI:
3702 case OP_NOTMINUPTO:
3703 case OP_NOTMINUPTOI:
3704 min = 0;
3705 max = GET2(ecode, 1);
3706 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3707 ecode += 1 + IMM2_SIZE;
3708 goto REPEATNOTCHAR;
3710 case OP_NOTPOSSTAR:
3711 case OP_NOTPOSSTARI:
3712 possessive = TRUE;
3713 min = 0;
3714 max = INT_MAX;
3715 ecode++;
3716 goto REPEATNOTCHAR;
3718 case OP_NOTPOSPLUS:
3719 case OP_NOTPOSPLUSI:
3720 possessive = TRUE;
3721 min = 1;
3722 max = INT_MAX;
3723 ecode++;
3724 goto REPEATNOTCHAR;
3726 case OP_NOTPOSQUERY:
3727 case OP_NOTPOSQUERYI:
3728 possessive = TRUE;
3729 min = 0;
3730 max = 1;
3731 ecode++;
3732 goto REPEATNOTCHAR;
3734 case OP_NOTPOSUPTO:
3735 case OP_NOTPOSUPTOI:
3736 possessive = TRUE;
3737 min = 0;
3738 max = GET2(ecode, 1);
3739 ecode += 1 + IMM2_SIZE;
3740 goto REPEATNOTCHAR;
3742 case OP_NOTSTAR:
3743 case OP_NOTSTARI:
3744 case OP_NOTMINSTAR:
3745 case OP_NOTMINSTARI:
3746 case OP_NOTPLUS:
3747 case OP_NOTPLUSI:
3748 case OP_NOTMINPLUS:
3749 case OP_NOTMINPLUSI:
3750 case OP_NOTQUERY:
3751 case OP_NOTQUERYI:
3752 case OP_NOTMINQUERY:
3753 case OP_NOTMINQUERYI:
3754 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3755 minimize = (c & 1) != 0;
3756 min = rep_min[c]; /* Pick up values from tables; */
3757 max = rep_max[c]; /* zero for max => infinity */
3758 if (max == 0) max = INT_MAX;
3760 /* Common code for all repeated single-byte matches. */
3762 REPEATNOTCHAR:
3763 GETCHARINCTEST(fc, ecode);
3765 /* The code is duplicated for the caseless and caseful cases, for speed,
3766 since matching characters is likely to be quite common. First, ensure the
3767 minimum number of matches are present. If min = max, continue at the same
3768 level without recursing. Otherwise, if minimizing, keep trying the rest of
3769 the expression and advancing one matching character if failing, up to the
3770 maximum. Alternatively, if maximizing, find the maximum number of
3771 characters and work backwards. */
3773 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3774 max, (char *)eptr));
3776 if (op >= OP_NOTSTARI) /* Caseless */
3778 #ifdef SUPPORT_UTF
3779 #ifdef SUPPORT_UCP
3780 if (utf && fc > 127)
3781 foc = UCD_OTHERCASE(fc);
3782 #else
3783 if (utf && fc > 127)
3784 foc = fc;
3785 #endif /* SUPPORT_UCP */
3786 else
3787 #endif /* SUPPORT_UTF */
3788 foc = TABLE_GET(fc, md->fcc, fc);
3790 #ifdef SUPPORT_UTF
3791 if (utf)
3793 register pcre_uint32 d;
3794 for (i = 1; i <= min; i++)
3796 if (eptr >= md->end_subject)
3798 SCHECK_PARTIAL();
3799 RRETURN(MATCH_NOMATCH);
3801 GETCHARINC(d, eptr);
3802 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3805 else
3806 #endif /* SUPPORT_UTF */
3807 /* Not UTF mode */
3809 for (i = 1; i <= min; i++)
3811 if (eptr >= md->end_subject)
3813 SCHECK_PARTIAL();
3814 RRETURN(MATCH_NOMATCH);
3816 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3817 eptr++;
3821 if (min == max) continue;
3823 if (minimize)
3825 #ifdef SUPPORT_UTF
3826 if (utf)
3828 register pcre_uint32 d;
3829 for (fi = min;; fi++)
3831 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3832 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3833 if (fi >= max) RRETURN(MATCH_NOMATCH);
3834 if (eptr >= md->end_subject)
3836 SCHECK_PARTIAL();
3837 RRETURN(MATCH_NOMATCH);
3839 GETCHARINC(d, eptr);
3840 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3843 else
3844 #endif /*SUPPORT_UTF */
3845 /* Not UTF mode */
3847 for (fi = min;; fi++)
3849 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3850 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3851 if (fi >= max) RRETURN(MATCH_NOMATCH);
3852 if (eptr >= md->end_subject)
3854 SCHECK_PARTIAL();
3855 RRETURN(MATCH_NOMATCH);
3857 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3858 eptr++;
3861 /* Control never gets here */
3864 /* Maximize case */
3866 else
3868 pp = eptr;
3870 #ifdef SUPPORT_UTF
3871 if (utf)
3873 register pcre_uint32 d;
3874 for (i = min; i < max; i++)
3876 int len = 1;
3877 if (eptr >= md->end_subject)
3879 SCHECK_PARTIAL();
3880 break;
3882 GETCHARLEN(d, eptr, len);
3883 if (fc == d || (unsigned int)foc == d) break;
3884 eptr += len;
3886 if (possessive) continue; /* No backtracking */
3887 for(;;)
3889 if (eptr == pp) goto TAIL_RECURSE;
3890 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3891 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3892 eptr--;
3893 BACKCHAR(eptr);
3896 else
3897 #endif /* SUPPORT_UTF */
3898 /* Not UTF mode */
3900 for (i = min; i < max; i++)
3902 if (eptr >= md->end_subject)
3904 SCHECK_PARTIAL();
3905 break;
3907 if (fc == *eptr || foc == *eptr) break;
3908 eptr++;
3910 if (possessive) continue; /* No backtracking */
3911 for (;;)
3913 if (eptr == pp) goto TAIL_RECURSE;
3914 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3915 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3916 eptr--;
3919 /* Control never gets here */
3923 /* Caseful comparisons */
3925 else
3927 #ifdef SUPPORT_UTF
3928 if (utf)
3930 register pcre_uint32 d;
3931 for (i = 1; i <= min; i++)
3933 if (eptr >= md->end_subject)
3935 SCHECK_PARTIAL();
3936 RRETURN(MATCH_NOMATCH);
3938 GETCHARINC(d, eptr);
3939 if (fc == d) RRETURN(MATCH_NOMATCH);
3942 else
3943 #endif
3944 /* Not UTF mode */
3946 for (i = 1; i <= min; i++)
3948 if (eptr >= md->end_subject)
3950 SCHECK_PARTIAL();
3951 RRETURN(MATCH_NOMATCH);
3953 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3957 if (min == max) continue;
3959 if (minimize)
3961 #ifdef SUPPORT_UTF
3962 if (utf)
3964 register pcre_uint32 d;
3965 for (fi = min;; fi++)
3967 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3968 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3969 if (fi >= max) RRETURN(MATCH_NOMATCH);
3970 if (eptr >= md->end_subject)
3972 SCHECK_PARTIAL();
3973 RRETURN(MATCH_NOMATCH);
3975 GETCHARINC(d, eptr);
3976 if (fc == d) RRETURN(MATCH_NOMATCH);
3979 else
3980 #endif
3981 /* Not UTF mode */
3983 for (fi = min;; fi++)
3985 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3986 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3987 if (fi >= max) RRETURN(MATCH_NOMATCH);
3988 if (eptr >= md->end_subject)
3990 SCHECK_PARTIAL();
3991 RRETURN(MATCH_NOMATCH);
3993 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3996 /* Control never gets here */
3999 /* Maximize case */
4001 else
4003 pp = eptr;
4005 #ifdef SUPPORT_UTF
4006 if (utf)
4008 register pcre_uint32 d;
4009 for (i = min; i < max; i++)
4011 int len = 1;
4012 if (eptr >= md->end_subject)
4014 SCHECK_PARTIAL();
4015 break;
4017 GETCHARLEN(d, eptr, len);
4018 if (fc == d) break;
4019 eptr += len;
4021 if (possessive) continue; /* No backtracking */
4022 for(;;)
4024 if (eptr == pp) goto TAIL_RECURSE;
4025 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
4026 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4027 eptr--;
4028 BACKCHAR(eptr);
4031 else
4032 #endif
4033 /* Not UTF mode */
4035 for (i = min; i < max; i++)
4037 if (eptr >= md->end_subject)
4039 SCHECK_PARTIAL();
4040 break;
4042 if (fc == *eptr) break;
4043 eptr++;
4045 if (possessive) continue; /* No backtracking */
4046 for (;;)
4048 if (eptr == pp) goto TAIL_RECURSE;
4049 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
4050 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4051 eptr--;
4054 /* Control never gets here */
4057 /* Control never gets here */
4059 /* Match a single character type repeatedly; several different opcodes
4060 share code. This is very similar to the code for single characters, but we
4061 repeat it in the interests of efficiency. */
4063 case OP_TYPEEXACT:
4064 min = max = GET2(ecode, 1);
4065 minimize = TRUE;
4066 ecode += 1 + IMM2_SIZE;
4067 goto REPEATTYPE;
4069 case OP_TYPEUPTO:
4070 case OP_TYPEMINUPTO:
4071 min = 0;
4072 max = GET2(ecode, 1);
4073 minimize = *ecode == OP_TYPEMINUPTO;
4074 ecode += 1 + IMM2_SIZE;
4075 goto REPEATTYPE;
4077 case OP_TYPEPOSSTAR:
4078 possessive = TRUE;
4079 min = 0;
4080 max = INT_MAX;
4081 ecode++;
4082 goto REPEATTYPE;
4084 case OP_TYPEPOSPLUS:
4085 possessive = TRUE;
4086 min = 1;
4087 max = INT_MAX;
4088 ecode++;
4089 goto REPEATTYPE;
4091 case OP_TYPEPOSQUERY:
4092 possessive = TRUE;
4093 min = 0;
4094 max = 1;
4095 ecode++;
4096 goto REPEATTYPE;
4098 case OP_TYPEPOSUPTO:
4099 possessive = TRUE;
4100 min = 0;
4101 max = GET2(ecode, 1);
4102 ecode += 1 + IMM2_SIZE;
4103 goto REPEATTYPE;
4105 case OP_TYPESTAR:
4106 case OP_TYPEMINSTAR:
4107 case OP_TYPEPLUS:
4108 case OP_TYPEMINPLUS:
4109 case OP_TYPEQUERY:
4110 case OP_TYPEMINQUERY:
4111 c = *ecode++ - OP_TYPESTAR;
4112 minimize = (c & 1) != 0;
4113 min = rep_min[c]; /* Pick up values from tables; */
4114 max = rep_max[c]; /* zero for max => infinity */
4115 if (max == 0) max = INT_MAX;
4117 /* Common code for all repeated single character type matches. Note that
4118 in UTF-8 mode, '.' matches a character of any length, but for the other
4119 character types, the valid characters are all one-byte long. */
4121 REPEATTYPE:
4122 ctype = *ecode++; /* Code for the character type */
4124 #ifdef SUPPORT_UCP
4125 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4127 prop_fail_result = ctype == OP_NOTPROP;
4128 prop_type = *ecode++;
4129 prop_value = *ecode++;
4131 else prop_type = -1;
4132 #endif
4134 /* First, ensure the minimum number of matches are present. Use inline
4135 code for maximizing the speed, and do the type test once at the start
4136 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4137 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4138 and single-bytes. */
4140 if (min > 0)
4142 #ifdef SUPPORT_UCP
4143 if (prop_type >= 0)
4145 switch(prop_type)
4147 case PT_ANY:
4148 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4149 for (i = 1; i <= min; i++)
4151 if (eptr >= md->end_subject)
4153 SCHECK_PARTIAL();
4154 RRETURN(MATCH_NOMATCH);
4156 GETCHARINCTEST(c, eptr);
4158 break;
4160 case PT_LAMP:
4161 for (i = 1; i <= min; i++)
4163 int chartype;
4164 if (eptr >= md->end_subject)
4166 SCHECK_PARTIAL();
4167 RRETURN(MATCH_NOMATCH);
4169 GETCHARINCTEST(c, eptr);
4170 chartype = UCD_CHARTYPE(c);
4171 if ((chartype == ucp_Lu ||
4172 chartype == ucp_Ll ||
4173 chartype == ucp_Lt) == prop_fail_result)
4174 RRETURN(MATCH_NOMATCH);
4176 break;
4178 case PT_GC:
4179 for (i = 1; i <= min; i++)
4181 if (eptr >= md->end_subject)
4183 SCHECK_PARTIAL();
4184 RRETURN(MATCH_NOMATCH);
4186 GETCHARINCTEST(c, eptr);
4187 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4188 RRETURN(MATCH_NOMATCH);
4190 break;
4192 case PT_PC:
4193 for (i = 1; i <= min; i++)
4195 if (eptr >= md->end_subject)
4197 SCHECK_PARTIAL();
4198 RRETURN(MATCH_NOMATCH);
4200 GETCHARINCTEST(c, eptr);
4201 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4202 RRETURN(MATCH_NOMATCH);
4204 break;
4206 case PT_SC:
4207 for (i = 1; i <= min; i++)
4209 if (eptr >= md->end_subject)
4211 SCHECK_PARTIAL();
4212 RRETURN(MATCH_NOMATCH);
4214 GETCHARINCTEST(c, eptr);
4215 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4216 RRETURN(MATCH_NOMATCH);
4218 break;
4220 case PT_ALNUM:
4221 for (i = 1; i <= min; i++)
4223 int category;
4224 if (eptr >= md->end_subject)
4226 SCHECK_PARTIAL();
4227 RRETURN(MATCH_NOMATCH);
4229 GETCHARINCTEST(c, eptr);
4230 category = UCD_CATEGORY(c);
4231 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4232 RRETURN(MATCH_NOMATCH);
4234 break;
4236 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4237 which means that Perl space and POSIX space are now identical. PCRE
4238 was changed at release 8.34. */
4240 case PT_SPACE: /* Perl space */
4241 case PT_PXSPACE: /* POSIX space */
4242 for (i = 1; i <= min; i++)
4244 if (eptr >= md->end_subject)
4246 SCHECK_PARTIAL();
4247 RRETURN(MATCH_NOMATCH);
4249 GETCHARINCTEST(c, eptr);
4250 switch(c)
4252 HSPACE_CASES:
4253 VSPACE_CASES:
4254 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4255 break;
4257 default:
4258 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
4259 RRETURN(MATCH_NOMATCH);
4260 break;
4263 break;
4265 case PT_WORD:
4266 for (i = 1; i <= min; i++)
4268 int category;
4269 if (eptr >= md->end_subject)
4271 SCHECK_PARTIAL();
4272 RRETURN(MATCH_NOMATCH);
4274 GETCHARINCTEST(c, eptr);
4275 category = UCD_CATEGORY(c);
4276 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4277 == prop_fail_result)
4278 RRETURN(MATCH_NOMATCH);
4280 break;
4282 case PT_CLIST:
4283 for (i = 1; i <= min; i++)
4285 const pcre_uint32 *cp;
4286 if (eptr >= md->end_subject)
4288 SCHECK_PARTIAL();
4289 RRETURN(MATCH_NOMATCH);
4291 GETCHARINCTEST(c, eptr);
4292 cp = PRIV(ucd_caseless_sets) + prop_value;
4293 for (;;)
4295 if (c < *cp)
4296 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4297 if (c == *cp++)
4298 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4301 break;
4303 case PT_UCNC:
4304 for (i = 1; i <= min; i++)
4306 if (eptr >= md->end_subject)
4308 SCHECK_PARTIAL();
4309 RRETURN(MATCH_NOMATCH);
4311 GETCHARINCTEST(c, eptr);
4312 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4313 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4314 c >= 0xe000) == prop_fail_result)
4315 RRETURN(MATCH_NOMATCH);
4317 break;
4319 /* This should not occur */
4321 default:
4322 RRETURN(PCRE_ERROR_INTERNAL);
4326 /* Match extended Unicode sequences. We will get here only if the
4327 support is in the binary; otherwise a compile-time error occurs. */
4329 else if (ctype == OP_EXTUNI)
4331 for (i = 1; i <= min; i++)
4333 if (eptr >= md->end_subject)
4335 SCHECK_PARTIAL();
4336 RRETURN(MATCH_NOMATCH);
4338 else
4340 int lgb, rgb;
4341 GETCHARINCTEST(c, eptr);
4342 lgb = UCD_GRAPHBREAK(c);
4343 while (eptr < md->end_subject)
4345 int len = 1;
4346 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4347 rgb = UCD_GRAPHBREAK(c);
4348 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4349 lgb = rgb;
4350 eptr += len;
4353 CHECK_PARTIAL();
4357 else
4358 #endif /* SUPPORT_UCP */
4360 /* Handle all other cases when the coding is UTF-8 */
4362 #ifdef SUPPORT_UTF
4363 if (utf) switch(ctype)
4365 case OP_ANY:
4366 for (i = 1; i <= min; i++)
4368 if (eptr >= md->end_subject)
4370 SCHECK_PARTIAL();
4371 RRETURN(MATCH_NOMATCH);
4373 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4374 if (md->partial != 0 &&
4375 eptr + 1 >= md->end_subject &&
4376 NLBLOCK->nltype == NLTYPE_FIXED &&
4377 NLBLOCK->nllen == 2 &&
4378 UCHAR21(eptr) == NLBLOCK->nl[0])
4380 md->hitend = TRUE;
4381 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4383 eptr++;
4384 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4386 break;
4388 case OP_ALLANY:
4389 for (i = 1; i <= min; i++)
4391 if (eptr >= md->end_subject)
4393 SCHECK_PARTIAL();
4394 RRETURN(MATCH_NOMATCH);
4396 eptr++;
4397 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4399 break;
4401 case OP_ANYBYTE:
4402 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4403 eptr += min;
4404 break;
4406 case OP_ANYNL:
4407 for (i = 1; i <= min; i++)
4409 if (eptr >= md->end_subject)
4411 SCHECK_PARTIAL();
4412 RRETURN(MATCH_NOMATCH);
4414 GETCHARINC(c, eptr);
4415 switch(c)
4417 default: RRETURN(MATCH_NOMATCH);
4419 case CHAR_CR:
4420 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
4421 break;
4423 case CHAR_LF:
4424 break;
4426 case CHAR_VT:
4427 case CHAR_FF:
4428 case CHAR_NEL:
4429 #ifndef EBCDIC
4430 case 0x2028:
4431 case 0x2029:
4432 #endif /* Not EBCDIC */
4433 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4434 break;
4437 break;
4439 case OP_NOT_HSPACE:
4440 for (i = 1; i <= min; i++)
4442 if (eptr >= md->end_subject)
4444 SCHECK_PARTIAL();
4445 RRETURN(MATCH_NOMATCH);
4447 GETCHARINC(c, eptr);
4448 switch(c)
4450 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4451 default: break;
4454 break;
4456 case OP_HSPACE:
4457 for (i = 1; i <= min; i++)
4459 if (eptr >= md->end_subject)
4461 SCHECK_PARTIAL();
4462 RRETURN(MATCH_NOMATCH);
4464 GETCHARINC(c, eptr);
4465 switch(c)
4467 HSPACE_CASES: break; /* Byte and multibyte cases */
4468 default: RRETURN(MATCH_NOMATCH);
4471 break;
4473 case OP_NOT_VSPACE:
4474 for (i = 1; i <= min; i++)
4476 if (eptr >= md->end_subject)
4478 SCHECK_PARTIAL();
4479 RRETURN(MATCH_NOMATCH);
4481 GETCHARINC(c, eptr);
4482 switch(c)
4484 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4485 default: break;
4488 break;
4490 case OP_VSPACE:
4491 for (i = 1; i <= min; i++)
4493 if (eptr >= md->end_subject)
4495 SCHECK_PARTIAL();
4496 RRETURN(MATCH_NOMATCH);
4498 GETCHARINC(c, eptr);
4499 switch(c)
4501 VSPACE_CASES: break;
4502 default: RRETURN(MATCH_NOMATCH);
4505 break;
4507 case OP_NOT_DIGIT:
4508 for (i = 1; i <= min; i++)
4510 if (eptr >= md->end_subject)
4512 SCHECK_PARTIAL();
4513 RRETURN(MATCH_NOMATCH);
4515 GETCHARINC(c, eptr);
4516 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4517 RRETURN(MATCH_NOMATCH);
4519 break;
4521 case OP_DIGIT:
4522 for (i = 1; i <= min; i++)
4524 pcre_uint32 cc;
4525 if (eptr >= md->end_subject)
4527 SCHECK_PARTIAL();
4528 RRETURN(MATCH_NOMATCH);
4530 cc = UCHAR21(eptr);
4531 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4532 RRETURN(MATCH_NOMATCH);
4533 eptr++;
4534 /* No need to skip more bytes - we know it's a 1-byte character */
4536 break;
4538 case OP_NOT_WHITESPACE:
4539 for (i = 1; i <= min; i++)
4541 pcre_uint32 cc;
4542 if (eptr >= md->end_subject)
4544 SCHECK_PARTIAL();
4545 RRETURN(MATCH_NOMATCH);
4547 cc = UCHAR21(eptr);
4548 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4549 RRETURN(MATCH_NOMATCH);
4550 eptr++;
4551 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4553 break;
4555 case OP_WHITESPACE:
4556 for (i = 1; i <= min; i++)
4558 pcre_uint32 cc;
4559 if (eptr >= md->end_subject)
4561 SCHECK_PARTIAL();
4562 RRETURN(MATCH_NOMATCH);
4564 cc = UCHAR21(eptr);
4565 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4566 RRETURN(MATCH_NOMATCH);
4567 eptr++;
4568 /* No need to skip more bytes - we know it's a 1-byte character */
4570 break;
4572 case OP_NOT_WORDCHAR:
4573 for (i = 1; i <= min; i++)
4575 pcre_uint32 cc;
4576 if (eptr >= md->end_subject)
4578 SCHECK_PARTIAL();
4579 RRETURN(MATCH_NOMATCH);
4581 cc = UCHAR21(eptr);
4582 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4583 RRETURN(MATCH_NOMATCH);
4584 eptr++;
4585 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4587 break;
4589 case OP_WORDCHAR:
4590 for (i = 1; i <= min; i++)
4592 pcre_uint32 cc;
4593 if (eptr >= md->end_subject)
4595 SCHECK_PARTIAL();
4596 RRETURN(MATCH_NOMATCH);
4598 cc = UCHAR21(eptr);
4599 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4600 RRETURN(MATCH_NOMATCH);
4601 eptr++;
4602 /* No need to skip more bytes - we know it's a 1-byte character */
4604 break;
4606 default:
4607 RRETURN(PCRE_ERROR_INTERNAL);
4608 } /* End switch(ctype) */
4610 else
4611 #endif /* SUPPORT_UTF */
4613 /* Code for the non-UTF-8 case for minimum matching of operators other
4614 than OP_PROP and OP_NOTPROP. */
4616 switch(ctype)
4618 case OP_ANY:
4619 for (i = 1; i <= min; i++)
4621 if (eptr >= md->end_subject)
4623 SCHECK_PARTIAL();
4624 RRETURN(MATCH_NOMATCH);
4626 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4627 if (md->partial != 0 &&
4628 eptr + 1 >= md->end_subject &&
4629 NLBLOCK->nltype == NLTYPE_FIXED &&
4630 NLBLOCK->nllen == 2 &&
4631 *eptr == NLBLOCK->nl[0])
4633 md->hitend = TRUE;
4634 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4636 eptr++;
4638 break;
4640 case OP_ALLANY:
4641 if (eptr > md->end_subject - min)
4643 SCHECK_PARTIAL();
4644 RRETURN(MATCH_NOMATCH);
4646 eptr += min;
4647 break;
4649 case OP_ANYBYTE:
4650 if (eptr > md->end_subject - min)
4652 SCHECK_PARTIAL();
4653 RRETURN(MATCH_NOMATCH);
4655 eptr += min;
4656 break;
4658 case OP_ANYNL:
4659 for (i = 1; i <= min; i++)
4661 if (eptr >= md->end_subject)
4663 SCHECK_PARTIAL();
4664 RRETURN(MATCH_NOMATCH);
4666 switch(*eptr++)
4668 default: RRETURN(MATCH_NOMATCH);
4670 case CHAR_CR:
4671 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4672 break;
4674 case CHAR_LF:
4675 break;
4677 case CHAR_VT:
4678 case CHAR_FF:
4679 case CHAR_NEL:
4680 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4681 case 0x2028:
4682 case 0x2029:
4683 #endif
4684 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4685 break;
4688 break;
4690 case OP_NOT_HSPACE:
4691 for (i = 1; i <= min; i++)
4693 if (eptr >= md->end_subject)
4695 SCHECK_PARTIAL();
4696 RRETURN(MATCH_NOMATCH);
4698 switch(*eptr++)
4700 default: break;
4701 HSPACE_BYTE_CASES:
4702 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4703 HSPACE_MULTIBYTE_CASES:
4704 #endif
4705 RRETURN(MATCH_NOMATCH);
4708 break;
4710 case OP_HSPACE:
4711 for (i = 1; i <= min; i++)
4713 if (eptr >= md->end_subject)
4715 SCHECK_PARTIAL();
4716 RRETURN(MATCH_NOMATCH);
4718 switch(*eptr++)
4720 default: RRETURN(MATCH_NOMATCH);
4721 HSPACE_BYTE_CASES:
4722 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4723 HSPACE_MULTIBYTE_CASES:
4724 #endif
4725 break;
4728 break;
4730 case OP_NOT_VSPACE:
4731 for (i = 1; i <= min; i++)
4733 if (eptr >= md->end_subject)
4735 SCHECK_PARTIAL();
4736 RRETURN(MATCH_NOMATCH);
4738 switch(*eptr++)
4740 VSPACE_BYTE_CASES:
4741 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4742 VSPACE_MULTIBYTE_CASES:
4743 #endif
4744 RRETURN(MATCH_NOMATCH);
4745 default: break;
4748 break;
4750 case OP_VSPACE:
4751 for (i = 1; i <= min; i++)
4753 if (eptr >= md->end_subject)
4755 SCHECK_PARTIAL();
4756 RRETURN(MATCH_NOMATCH);
4758 switch(*eptr++)
4760 default: RRETURN(MATCH_NOMATCH);
4761 VSPACE_BYTE_CASES:
4762 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4763 VSPACE_MULTIBYTE_CASES:
4764 #endif
4765 break;
4768 break;
4770 case OP_NOT_DIGIT:
4771 for (i = 1; i <= min; i++)
4773 if (eptr >= md->end_subject)
4775 SCHECK_PARTIAL();
4776 RRETURN(MATCH_NOMATCH);
4778 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4779 RRETURN(MATCH_NOMATCH);
4780 eptr++;
4782 break;
4784 case OP_DIGIT:
4785 for (i = 1; i <= min; i++)
4787 if (eptr >= md->end_subject)
4789 SCHECK_PARTIAL();
4790 RRETURN(MATCH_NOMATCH);
4792 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4793 RRETURN(MATCH_NOMATCH);
4794 eptr++;
4796 break;
4798 case OP_NOT_WHITESPACE:
4799 for (i = 1; i <= min; i++)
4801 if (eptr >= md->end_subject)
4803 SCHECK_PARTIAL();
4804 RRETURN(MATCH_NOMATCH);
4806 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4807 RRETURN(MATCH_NOMATCH);
4808 eptr++;
4810 break;
4812 case OP_WHITESPACE:
4813 for (i = 1; i <= min; i++)
4815 if (eptr >= md->end_subject)
4817 SCHECK_PARTIAL();
4818 RRETURN(MATCH_NOMATCH);
4820 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4821 RRETURN(MATCH_NOMATCH);
4822 eptr++;
4824 break;
4826 case OP_NOT_WORDCHAR:
4827 for (i = 1; i <= min; i++)
4829 if (eptr >= md->end_subject)
4831 SCHECK_PARTIAL();
4832 RRETURN(MATCH_NOMATCH);
4834 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4835 RRETURN(MATCH_NOMATCH);
4836 eptr++;
4838 break;
4840 case OP_WORDCHAR:
4841 for (i = 1; i <= min; i++)
4843 if (eptr >= md->end_subject)
4845 SCHECK_PARTIAL();
4846 RRETURN(MATCH_NOMATCH);
4848 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4849 RRETURN(MATCH_NOMATCH);
4850 eptr++;
4852 break;
4854 default:
4855 RRETURN(PCRE_ERROR_INTERNAL);
4859 /* If min = max, continue at the same level without recursing */
4861 if (min == max) continue;
4863 /* If minimizing, we have to test the rest of the pattern before each
4864 subsequent match. Again, separate the UTF-8 case for speed, and also
4865 separate the UCP cases. */
4867 if (minimize)
4869 #ifdef SUPPORT_UCP
4870 if (prop_type >= 0)
4872 switch(prop_type)
4874 case PT_ANY:
4875 for (fi = min;; fi++)
4877 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4878 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4879 if (fi >= max) RRETURN(MATCH_NOMATCH);
4880 if (eptr >= md->end_subject)
4882 SCHECK_PARTIAL();
4883 RRETURN(MATCH_NOMATCH);
4885 GETCHARINCTEST(c, eptr);
4886 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4888 /* Control never gets here */
4890 case PT_LAMP:
4891 for (fi = min;; fi++)
4893 int chartype;
4894 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4895 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4896 if (fi >= max) RRETURN(MATCH_NOMATCH);
4897 if (eptr >= md->end_subject)
4899 SCHECK_PARTIAL();
4900 RRETURN(MATCH_NOMATCH);
4902 GETCHARINCTEST(c, eptr);
4903 chartype = UCD_CHARTYPE(c);
4904 if ((chartype == ucp_Lu ||
4905 chartype == ucp_Ll ||
4906 chartype == ucp_Lt) == prop_fail_result)
4907 RRETURN(MATCH_NOMATCH);
4909 /* Control never gets here */
4911 case PT_GC:
4912 for (fi = min;; fi++)
4914 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4915 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4916 if (fi >= max) RRETURN(MATCH_NOMATCH);
4917 if (eptr >= md->end_subject)
4919 SCHECK_PARTIAL();
4920 RRETURN(MATCH_NOMATCH);
4922 GETCHARINCTEST(c, eptr);
4923 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4924 RRETURN(MATCH_NOMATCH);
4926 /* Control never gets here */
4928 case PT_PC:
4929 for (fi = min;; fi++)
4931 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4932 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4933 if (fi >= max) RRETURN(MATCH_NOMATCH);
4934 if (eptr >= md->end_subject)
4936 SCHECK_PARTIAL();
4937 RRETURN(MATCH_NOMATCH);
4939 GETCHARINCTEST(c, eptr);
4940 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4941 RRETURN(MATCH_NOMATCH);
4943 /* Control never gets here */
4945 case PT_SC:
4946 for (fi = min;; fi++)
4948 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4949 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4950 if (fi >= max) RRETURN(MATCH_NOMATCH);
4951 if (eptr >= md->end_subject)
4953 SCHECK_PARTIAL();
4954 RRETURN(MATCH_NOMATCH);
4956 GETCHARINCTEST(c, eptr);
4957 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4958 RRETURN(MATCH_NOMATCH);
4960 /* Control never gets here */
4962 case PT_ALNUM:
4963 for (fi = min;; fi++)
4965 int category;
4966 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4967 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4968 if (fi >= max) RRETURN(MATCH_NOMATCH);
4969 if (eptr >= md->end_subject)
4971 SCHECK_PARTIAL();
4972 RRETURN(MATCH_NOMATCH);
4974 GETCHARINCTEST(c, eptr);
4975 category = UCD_CATEGORY(c);
4976 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4977 RRETURN(MATCH_NOMATCH);
4979 /* Control never gets here */
4981 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4982 which means that Perl space and POSIX space are now identical. PCRE
4983 was changed at release 8.34. */
4985 case PT_SPACE: /* Perl space */
4986 case PT_PXSPACE: /* POSIX space */
4987 for (fi = min;; fi++)
4989 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4990 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4991 if (fi >= max) RRETURN(MATCH_NOMATCH);
4992 if (eptr >= md->end_subject)
4994 SCHECK_PARTIAL();
4995 RRETURN(MATCH_NOMATCH);
4997 GETCHARINCTEST(c, eptr);
4998 switch(c)
5000 HSPACE_CASES:
5001 VSPACE_CASES:
5002 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
5003 break;
5005 default:
5006 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5007 RRETURN(MATCH_NOMATCH);
5008 break;
5011 /* Control never gets here */
5013 case PT_WORD:
5014 for (fi = min;; fi++)
5016 int category;
5017 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
5018 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5019 if (fi >= max) RRETURN(MATCH_NOMATCH);
5020 if (eptr >= md->end_subject)
5022 SCHECK_PARTIAL();
5023 RRETURN(MATCH_NOMATCH);
5025 GETCHARINCTEST(c, eptr);
5026 category = UCD_CATEGORY(c);
5027 if ((category == ucp_L ||
5028 category == ucp_N ||
5029 c == CHAR_UNDERSCORE)
5030 == prop_fail_result)
5031 RRETURN(MATCH_NOMATCH);
5033 /* Control never gets here */
5035 case PT_CLIST:
5036 for (fi = min;; fi++)
5038 const pcre_uint32 *cp;
5039 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
5040 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5041 if (fi >= max) RRETURN(MATCH_NOMATCH);
5042 if (eptr >= md->end_subject)
5044 SCHECK_PARTIAL();
5045 RRETURN(MATCH_NOMATCH);
5047 GETCHARINCTEST(c, eptr);
5048 cp = PRIV(ucd_caseless_sets) + prop_value;
5049 for (;;)
5051 if (c < *cp)
5052 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5053 if (c == *cp++)
5054 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5057 /* Control never gets here */
5059 case PT_UCNC:
5060 for (fi = min;; fi++)
5062 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
5063 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5064 if (fi >= max) RRETURN(MATCH_NOMATCH);
5065 if (eptr >= md->end_subject)
5067 SCHECK_PARTIAL();
5068 RRETURN(MATCH_NOMATCH);
5070 GETCHARINCTEST(c, eptr);
5071 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5072 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5073 c >= 0xe000) == prop_fail_result)
5074 RRETURN(MATCH_NOMATCH);
5076 /* Control never gets here */
5078 /* This should never occur */
5079 default:
5080 RRETURN(PCRE_ERROR_INTERNAL);
5084 /* Match extended Unicode sequences. We will get here only if the
5085 support is in the binary; otherwise a compile-time error occurs. */
5087 else if (ctype == OP_EXTUNI)
5089 for (fi = min;; fi++)
5091 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5092 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5093 if (fi >= max) RRETURN(MATCH_NOMATCH);
5094 if (eptr >= md->end_subject)
5096 SCHECK_PARTIAL();
5097 RRETURN(MATCH_NOMATCH);
5099 else
5101 int lgb, rgb;
5102 GETCHARINCTEST(c, eptr);
5103 lgb = UCD_GRAPHBREAK(c);
5104 while (eptr < md->end_subject)
5106 int len = 1;
5107 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5108 rgb = UCD_GRAPHBREAK(c);
5109 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5110 lgb = rgb;
5111 eptr += len;
5114 CHECK_PARTIAL();
5117 else
5118 #endif /* SUPPORT_UCP */
5120 #ifdef SUPPORT_UTF
5121 if (utf)
5123 for (fi = min;; fi++)
5125 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5126 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5127 if (fi >= max) RRETURN(MATCH_NOMATCH);
5128 if (eptr >= md->end_subject)
5130 SCHECK_PARTIAL();
5131 RRETURN(MATCH_NOMATCH);
5133 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5134 RRETURN(MATCH_NOMATCH);
5135 GETCHARINC(c, eptr);
5136 switch(ctype)
5138 case OP_ANY: /* This is the non-NL case */
5139 if (md->partial != 0 && /* Take care with CRLF partial */
5140 eptr >= md->end_subject &&
5141 NLBLOCK->nltype == NLTYPE_FIXED &&
5142 NLBLOCK->nllen == 2 &&
5143 c == NLBLOCK->nl[0])
5145 md->hitend = TRUE;
5146 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5148 break;
5150 case OP_ALLANY:
5151 case OP_ANYBYTE:
5152 break;
5154 case OP_ANYNL:
5155 switch(c)
5157 default: RRETURN(MATCH_NOMATCH);
5158 case CHAR_CR:
5159 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
5160 break;
5162 case CHAR_LF:
5163 break;
5165 case CHAR_VT:
5166 case CHAR_FF:
5167 case CHAR_NEL:
5168 #ifndef EBCDIC
5169 case 0x2028:
5170 case 0x2029:
5171 #endif /* Not EBCDIC */
5172 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5173 break;
5175 break;
5177 case OP_NOT_HSPACE:
5178 switch(c)
5180 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5181 default: break;
5183 break;
5185 case OP_HSPACE:
5186 switch(c)
5188 HSPACE_CASES: break;
5189 default: RRETURN(MATCH_NOMATCH);
5191 break;
5193 case OP_NOT_VSPACE:
5194 switch(c)
5196 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5197 default: break;
5199 break;
5201 case OP_VSPACE:
5202 switch(c)
5204 VSPACE_CASES: break;
5205 default: RRETURN(MATCH_NOMATCH);
5207 break;
5209 case OP_NOT_DIGIT:
5210 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5211 RRETURN(MATCH_NOMATCH);
5212 break;
5214 case OP_DIGIT:
5215 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5216 RRETURN(MATCH_NOMATCH);
5217 break;
5219 case OP_NOT_WHITESPACE:
5220 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5221 RRETURN(MATCH_NOMATCH);
5222 break;
5224 case OP_WHITESPACE:
5225 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5226 RRETURN(MATCH_NOMATCH);
5227 break;
5229 case OP_NOT_WORDCHAR:
5230 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5231 RRETURN(MATCH_NOMATCH);
5232 break;
5234 case OP_WORDCHAR:
5235 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5236 RRETURN(MATCH_NOMATCH);
5237 break;
5239 default:
5240 RRETURN(PCRE_ERROR_INTERNAL);
5244 else
5245 #endif
5246 /* Not UTF mode */
5248 for (fi = min;; fi++)
5250 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5251 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5252 if (fi >= max) RRETURN(MATCH_NOMATCH);
5253 if (eptr >= md->end_subject)
5255 SCHECK_PARTIAL();
5256 RRETURN(MATCH_NOMATCH);
5258 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5259 RRETURN(MATCH_NOMATCH);
5260 c = *eptr++;
5261 switch(ctype)
5263 case OP_ANY: /* This is the non-NL case */
5264 if (md->partial != 0 && /* Take care with CRLF partial */
5265 eptr >= md->end_subject &&
5266 NLBLOCK->nltype == NLTYPE_FIXED &&
5267 NLBLOCK->nllen == 2 &&
5268 c == NLBLOCK->nl[0])
5270 md->hitend = TRUE;
5271 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5273 break;
5275 case OP_ALLANY:
5276 case OP_ANYBYTE:
5277 break;
5279 case OP_ANYNL:
5280 switch(c)
5282 default: RRETURN(MATCH_NOMATCH);
5283 case CHAR_CR:
5284 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5285 break;
5287 case CHAR_LF:
5288 break;
5290 case CHAR_VT:
5291 case CHAR_FF:
5292 case CHAR_NEL:
5293 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5294 case 0x2028:
5295 case 0x2029:
5296 #endif
5297 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5298 break;
5300 break;
5302 case OP_NOT_HSPACE:
5303 switch(c)
5305 default: break;
5306 HSPACE_BYTE_CASES:
5307 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5308 HSPACE_MULTIBYTE_CASES:
5309 #endif
5310 RRETURN(MATCH_NOMATCH);
5312 break;
5314 case OP_HSPACE:
5315 switch(c)
5317 default: RRETURN(MATCH_NOMATCH);
5318 HSPACE_BYTE_CASES:
5319 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5320 HSPACE_MULTIBYTE_CASES:
5321 #endif
5322 break;
5324 break;
5326 case OP_NOT_VSPACE:
5327 switch(c)
5329 default: break;
5330 VSPACE_BYTE_CASES:
5331 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5332 VSPACE_MULTIBYTE_CASES:
5333 #endif
5334 RRETURN(MATCH_NOMATCH);
5336 break;
5338 case OP_VSPACE:
5339 switch(c)
5341 default: RRETURN(MATCH_NOMATCH);
5342 VSPACE_BYTE_CASES:
5343 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5344 VSPACE_MULTIBYTE_CASES:
5345 #endif
5346 break;
5348 break;
5350 case OP_NOT_DIGIT:
5351 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5352 break;
5354 case OP_DIGIT:
5355 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5356 break;
5358 case OP_NOT_WHITESPACE:
5359 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5360 break;
5362 case OP_WHITESPACE:
5363 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5364 break;
5366 case OP_NOT_WORDCHAR:
5367 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5368 break;
5370 case OP_WORDCHAR:
5371 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5372 break;
5374 default:
5375 RRETURN(PCRE_ERROR_INTERNAL);
5379 /* Control never gets here */
5382 /* If maximizing, it is worth using inline code for speed, doing the type
5383 test once at the start (i.e. keep it out of the loop). Again, keep the
5384 UTF-8 and UCP stuff separate. */
5386 else
5388 pp = eptr; /* Remember where we started */
5390 #ifdef SUPPORT_UCP
5391 if (prop_type >= 0)
5393 switch(prop_type)
5395 case PT_ANY:
5396 for (i = min; i < max; i++)
5398 int len = 1;
5399 if (eptr >= md->end_subject)
5401 SCHECK_PARTIAL();
5402 break;
5404 GETCHARLENTEST(c, eptr, len);
5405 if (prop_fail_result) break;
5406 eptr+= len;
5408 break;
5410 case PT_LAMP:
5411 for (i = min; i < max; i++)
5413 int chartype;
5414 int len = 1;
5415 if (eptr >= md->end_subject)
5417 SCHECK_PARTIAL();
5418 break;
5420 GETCHARLENTEST(c, eptr, len);
5421 chartype = UCD_CHARTYPE(c);
5422 if ((chartype == ucp_Lu ||
5423 chartype == ucp_Ll ||
5424 chartype == ucp_Lt) == prop_fail_result)
5425 break;
5426 eptr+= len;
5428 break;
5430 case PT_GC:
5431 for (i = min; i < max; i++)
5433 int len = 1;
5434 if (eptr >= md->end_subject)
5436 SCHECK_PARTIAL();
5437 break;
5439 GETCHARLENTEST(c, eptr, len);
5440 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5441 eptr+= len;
5443 break;
5445 case PT_PC:
5446 for (i = min; i < max; i++)
5448 int len = 1;
5449 if (eptr >= md->end_subject)
5451 SCHECK_PARTIAL();
5452 break;
5454 GETCHARLENTEST(c, eptr, len);
5455 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5456 eptr+= len;
5458 break;
5460 case PT_SC:
5461 for (i = min; i < max; i++)
5463 int len = 1;
5464 if (eptr >= md->end_subject)
5466 SCHECK_PARTIAL();
5467 break;
5469 GETCHARLENTEST(c, eptr, len);
5470 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5471 eptr+= len;
5473 break;
5475 case PT_ALNUM:
5476 for (i = min; i < max; i++)
5478 int category;
5479 int len = 1;
5480 if (eptr >= md->end_subject)
5482 SCHECK_PARTIAL();
5483 break;
5485 GETCHARLENTEST(c, eptr, len);
5486 category = UCD_CATEGORY(c);
5487 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5488 break;
5489 eptr+= len;
5491 break;
5493 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5494 which means that Perl space and POSIX space are now identical. PCRE
5495 was changed at release 8.34. */
5497 case PT_SPACE: /* Perl space */
5498 case PT_PXSPACE: /* POSIX space */
5499 for (i = min; i < max; i++)
5501 int len = 1;
5502 if (eptr >= md->end_subject)
5504 SCHECK_PARTIAL();
5505 break;
5507 GETCHARLENTEST(c, eptr, len);
5508 switch(c)
5510 HSPACE_CASES:
5511 VSPACE_CASES:
5512 if (prop_fail_result) goto ENDLOOP99; /* Break the loop */
5513 break;
5515 default:
5516 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5517 goto ENDLOOP99; /* Break the loop */
5518 break;
5520 eptr+= len;
5522 ENDLOOP99:
5523 break;
5525 case PT_WORD:
5526 for (i = min; i < max; i++)
5528 int category;
5529 int len = 1;
5530 if (eptr >= md->end_subject)
5532 SCHECK_PARTIAL();
5533 break;
5535 GETCHARLENTEST(c, eptr, len);
5536 category = UCD_CATEGORY(c);
5537 if ((category == ucp_L || category == ucp_N ||
5538 c == CHAR_UNDERSCORE) == prop_fail_result)
5539 break;
5540 eptr+= len;
5542 break;
5544 case PT_CLIST:
5545 for (i = min; i < max; i++)
5547 const pcre_uint32 *cp;
5548 int len = 1;
5549 if (eptr >= md->end_subject)
5551 SCHECK_PARTIAL();
5552 break;
5554 GETCHARLENTEST(c, eptr, len);
5555 cp = PRIV(ucd_caseless_sets) + prop_value;
5556 for (;;)
5558 if (c < *cp)
5559 { if (prop_fail_result) break; else goto GOT_MAX; }
5560 if (c == *cp++)
5561 { if (prop_fail_result) goto GOT_MAX; else break; }
5563 eptr += len;
5565 GOT_MAX:
5566 break;
5568 case PT_UCNC:
5569 for (i = min; i < max; i++)
5571 int len = 1;
5572 if (eptr >= md->end_subject)
5574 SCHECK_PARTIAL();
5575 break;
5577 GETCHARLENTEST(c, eptr, len);
5578 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5579 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5580 c >= 0xe000) == prop_fail_result)
5581 break;
5582 eptr += len;
5584 break;
5586 default:
5587 RRETURN(PCRE_ERROR_INTERNAL);
5590 /* eptr is now past the end of the maximum run */
5592 if (possessive) continue; /* No backtracking */
5593 for(;;)
5595 if (eptr == pp) goto TAIL_RECURSE;
5596 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5597 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5598 eptr--;
5599 if (utf) BACKCHAR(eptr);
5603 /* Match extended Unicode grapheme clusters. We will get here only if the
5604 support is in the binary; otherwise a compile-time error occurs. */
5606 else if (ctype == OP_EXTUNI)
5608 for (i = min; i < max; i++)
5610 if (eptr >= md->end_subject)
5612 SCHECK_PARTIAL();
5613 break;
5615 else
5617 int lgb, rgb;
5618 GETCHARINCTEST(c, eptr);
5619 lgb = UCD_GRAPHBREAK(c);
5620 while (eptr < md->end_subject)
5622 int len = 1;
5623 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5624 rgb = UCD_GRAPHBREAK(c);
5625 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5626 lgb = rgb;
5627 eptr += len;
5630 CHECK_PARTIAL();
5633 /* eptr is now past the end of the maximum run */
5635 if (possessive) continue; /* No backtracking */
5637 for(;;)
5639 int lgb, rgb;
5640 PCRE_PUCHAR fptr;
5642 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
5643 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5644 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5646 /* Backtracking over an extended grapheme cluster involves inspecting
5647 the previous two characters (if present) to see if a break is
5648 permitted between them. */
5650 eptr--;
5651 if (!utf) c = *eptr; else
5653 BACKCHAR(eptr);
5654 GETCHAR(c, eptr);
5656 rgb = UCD_GRAPHBREAK(c);
5658 for (;;)
5660 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
5661 fptr = eptr - 1;
5662 if (!utf) c = *fptr; else
5664 BACKCHAR(fptr);
5665 GETCHAR(c, fptr);
5667 lgb = UCD_GRAPHBREAK(c);
5668 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5669 eptr = fptr;
5670 rgb = lgb;
5675 else
5676 #endif /* SUPPORT_UCP */
5678 #ifdef SUPPORT_UTF
5679 if (utf)
5681 switch(ctype)
5683 case OP_ANY:
5684 if (max < INT_MAX)
5686 for (i = min; i < max; i++)
5688 if (eptr >= md->end_subject)
5690 SCHECK_PARTIAL();
5691 break;
5693 if (IS_NEWLINE(eptr)) break;
5694 if (md->partial != 0 && /* Take care with CRLF partial */
5695 eptr + 1 >= md->end_subject &&
5696 NLBLOCK->nltype == NLTYPE_FIXED &&
5697 NLBLOCK->nllen == 2 &&
5698 UCHAR21(eptr) == NLBLOCK->nl[0])
5700 md->hitend = TRUE;
5701 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5703 eptr++;
5704 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5708 /* Handle unlimited UTF-8 repeat */
5710 else
5712 for (i = min; i < max; i++)
5714 if (eptr >= md->end_subject)
5716 SCHECK_PARTIAL();
5717 break;
5719 if (IS_NEWLINE(eptr)) break;
5720 if (md->partial != 0 && /* Take care with CRLF partial */
5721 eptr + 1 >= md->end_subject &&
5722 NLBLOCK->nltype == NLTYPE_FIXED &&
5723 NLBLOCK->nllen == 2 &&
5724 UCHAR21(eptr) == NLBLOCK->nl[0])
5726 md->hitend = TRUE;
5727 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5729 eptr++;
5730 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5733 break;
5735 case OP_ALLANY:
5736 if (max < INT_MAX)
5738 for (i = min; i < max; i++)
5740 if (eptr >= md->end_subject)
5742 SCHECK_PARTIAL();
5743 break;
5745 eptr++;
5746 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5749 else
5751 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5752 SCHECK_PARTIAL();
5754 break;
5756 /* The byte case is the same as non-UTF8 */
5758 case OP_ANYBYTE:
5759 c = max - min;
5760 if (c > (unsigned int)(md->end_subject - eptr))
5762 eptr = md->end_subject;
5763 SCHECK_PARTIAL();
5765 else eptr += c;
5766 break;
5768 case OP_ANYNL:
5769 for (i = min; i < max; i++)
5771 int len = 1;
5772 if (eptr >= md->end_subject)
5774 SCHECK_PARTIAL();
5775 break;
5777 GETCHARLEN(c, eptr, len);
5778 if (c == CHAR_CR)
5780 if (++eptr >= md->end_subject) break;
5781 if (UCHAR21(eptr) == CHAR_LF) eptr++;
5783 else
5785 if (c != CHAR_LF &&
5786 (md->bsr_anycrlf ||
5787 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5788 #ifndef EBCDIC
5789 && c != 0x2028 && c != 0x2029
5790 #endif /* Not EBCDIC */
5792 break;
5793 eptr += len;
5796 break;
5798 case OP_NOT_HSPACE:
5799 case OP_HSPACE:
5800 for (i = min; i < max; i++)
5802 BOOL gotspace;
5803 int len = 1;
5804 if (eptr >= md->end_subject)
5806 SCHECK_PARTIAL();
5807 break;
5809 GETCHARLEN(c, eptr, len);
5810 switch(c)
5812 HSPACE_CASES: gotspace = TRUE; break;
5813 default: gotspace = FALSE; break;
5815 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5816 eptr += len;
5818 break;
5820 case OP_NOT_VSPACE:
5821 case OP_VSPACE:
5822 for (i = min; i < max; i++)
5824 BOOL gotspace;
5825 int len = 1;
5826 if (eptr >= md->end_subject)
5828 SCHECK_PARTIAL();
5829 break;
5831 GETCHARLEN(c, eptr, len);
5832 switch(c)
5834 VSPACE_CASES: gotspace = TRUE; break;
5835 default: gotspace = FALSE; break;
5837 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5838 eptr += len;
5840 break;
5842 case OP_NOT_DIGIT:
5843 for (i = min; i < max; i++)
5845 int len = 1;
5846 if (eptr >= md->end_subject)
5848 SCHECK_PARTIAL();
5849 break;
5851 GETCHARLEN(c, eptr, len);
5852 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5853 eptr+= len;
5855 break;
5857 case OP_DIGIT:
5858 for (i = min; i < max; i++)
5860 int len = 1;
5861 if (eptr >= md->end_subject)
5863 SCHECK_PARTIAL();
5864 break;
5866 GETCHARLEN(c, eptr, len);
5867 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5868 eptr+= len;
5870 break;
5872 case OP_NOT_WHITESPACE:
5873 for (i = min; i < max; i++)
5875 int len = 1;
5876 if (eptr >= md->end_subject)
5878 SCHECK_PARTIAL();
5879 break;
5881 GETCHARLEN(c, eptr, len);
5882 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5883 eptr+= len;
5885 break;
5887 case OP_WHITESPACE:
5888 for (i = min; i < max; i++)
5890 int len = 1;
5891 if (eptr >= md->end_subject)
5893 SCHECK_PARTIAL();
5894 break;
5896 GETCHARLEN(c, eptr, len);
5897 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5898 eptr+= len;
5900 break;
5902 case OP_NOT_WORDCHAR:
5903 for (i = min; i < max; i++)
5905 int len = 1;
5906 if (eptr >= md->end_subject)
5908 SCHECK_PARTIAL();
5909 break;
5911 GETCHARLEN(c, eptr, len);
5912 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5913 eptr+= len;
5915 break;
5917 case OP_WORDCHAR:
5918 for (i = min; i < max; i++)
5920 int len = 1;
5921 if (eptr >= md->end_subject)
5923 SCHECK_PARTIAL();
5924 break;
5926 GETCHARLEN(c, eptr, len);
5927 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5928 eptr+= len;
5930 break;
5932 default:
5933 RRETURN(PCRE_ERROR_INTERNAL);
5936 if (possessive) continue; /* No backtracking */
5937 for(;;)
5939 if (eptr == pp) goto TAIL_RECURSE;
5940 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5941 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5942 eptr--;
5943 BACKCHAR(eptr);
5944 if (ctype == OP_ANYNL && eptr > pp && UCHAR21(eptr) == CHAR_NL &&
5945 UCHAR21(eptr - 1) == CHAR_CR) eptr--;
5948 else
5949 #endif /* SUPPORT_UTF */
5950 /* Not UTF mode */
5952 switch(ctype)
5954 case OP_ANY:
5955 for (i = min; i < max; i++)
5957 if (eptr >= md->end_subject)
5959 SCHECK_PARTIAL();
5960 break;
5962 if (IS_NEWLINE(eptr)) break;
5963 if (md->partial != 0 && /* Take care with CRLF partial */
5964 eptr + 1 >= md->end_subject &&
5965 NLBLOCK->nltype == NLTYPE_FIXED &&
5966 NLBLOCK->nllen == 2 &&
5967 *eptr == NLBLOCK->nl[0])
5969 md->hitend = TRUE;
5970 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5972 eptr++;
5974 break;
5976 case OP_ALLANY:
5977 case OP_ANYBYTE:
5978 c = max - min;
5979 if (c > (unsigned int)(md->end_subject - eptr))
5981 eptr = md->end_subject;
5982 SCHECK_PARTIAL();
5984 else eptr += c;
5985 break;
5987 case OP_ANYNL:
5988 for (i = min; i < max; i++)
5990 if (eptr >= md->end_subject)
5992 SCHECK_PARTIAL();
5993 break;
5995 c = *eptr;
5996 if (c == CHAR_CR)
5998 if (++eptr >= md->end_subject) break;
5999 if (*eptr == CHAR_LF) eptr++;
6001 else
6003 if (c != CHAR_LF && (md->bsr_anycrlf ||
6004 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
6005 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6006 && c != 0x2028 && c != 0x2029
6007 #endif
6008 ))) break;
6009 eptr++;
6012 break;
6014 case OP_NOT_HSPACE:
6015 for (i = min; i < max; i++)
6017 if (eptr >= md->end_subject)
6019 SCHECK_PARTIAL();
6020 break;
6022 switch(*eptr)
6024 default: eptr++; break;
6025 HSPACE_BYTE_CASES:
6026 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6027 HSPACE_MULTIBYTE_CASES:
6028 #endif
6029 goto ENDLOOP00;
6032 ENDLOOP00:
6033 break;
6035 case OP_HSPACE:
6036 for (i = min; i < max; i++)
6038 if (eptr >= md->end_subject)
6040 SCHECK_PARTIAL();
6041 break;
6043 switch(*eptr)
6045 default: goto ENDLOOP01;
6046 HSPACE_BYTE_CASES:
6047 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6048 HSPACE_MULTIBYTE_CASES:
6049 #endif
6050 eptr++; break;
6053 ENDLOOP01:
6054 break;
6056 case OP_NOT_VSPACE:
6057 for (i = min; i < max; i++)
6059 if (eptr >= md->end_subject)
6061 SCHECK_PARTIAL();
6062 break;
6064 switch(*eptr)
6066 default: eptr++; break;
6067 VSPACE_BYTE_CASES:
6068 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6069 VSPACE_MULTIBYTE_CASES:
6070 #endif
6071 goto ENDLOOP02;
6074 ENDLOOP02:
6075 break;
6077 case OP_VSPACE:
6078 for (i = min; i < max; i++)
6080 if (eptr >= md->end_subject)
6082 SCHECK_PARTIAL();
6083 break;
6085 switch(*eptr)
6087 default: goto ENDLOOP03;
6088 VSPACE_BYTE_CASES:
6089 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6090 VSPACE_MULTIBYTE_CASES:
6091 #endif
6092 eptr++; break;
6095 ENDLOOP03:
6096 break;
6098 case OP_NOT_DIGIT:
6099 for (i = min; i < max; i++)
6101 if (eptr >= md->end_subject)
6103 SCHECK_PARTIAL();
6104 break;
6106 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6107 eptr++;
6109 break;
6111 case OP_DIGIT:
6112 for (i = min; i < max; i++)
6114 if (eptr >= md->end_subject)
6116 SCHECK_PARTIAL();
6117 break;
6119 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6120 eptr++;
6122 break;
6124 case OP_NOT_WHITESPACE:
6125 for (i = min; i < max; i++)
6127 if (eptr >= md->end_subject)
6129 SCHECK_PARTIAL();
6130 break;
6132 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6133 eptr++;
6135 break;
6137 case OP_WHITESPACE:
6138 for (i = min; i < max; i++)
6140 if (eptr >= md->end_subject)
6142 SCHECK_PARTIAL();
6143 break;
6145 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6146 eptr++;
6148 break;
6150 case OP_NOT_WORDCHAR:
6151 for (i = min; i < max; i++)
6153 if (eptr >= md->end_subject)
6155 SCHECK_PARTIAL();
6156 break;
6158 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6159 eptr++;
6161 break;
6163 case OP_WORDCHAR:
6164 for (i = min; i < max; i++)
6166 if (eptr >= md->end_subject)
6168 SCHECK_PARTIAL();
6169 break;
6171 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6172 eptr++;
6174 break;
6176 default:
6177 RRETURN(PCRE_ERROR_INTERNAL);
6180 if (possessive) continue; /* No backtracking */
6181 for (;;)
6183 if (eptr == pp) goto TAIL_RECURSE;
6184 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6185 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6186 eptr--;
6187 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6188 eptr[-1] == CHAR_CR) eptr--;
6192 /* Control never gets here */
6195 /* There's been some horrible disaster. Arrival here can only mean there is
6196 something seriously wrong in the code above or the OP_xxx definitions. */
6198 default:
6199 DPRINTF(("Unknown opcode %d\n", *ecode));
6200 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6203 /* Do not stick any code in here without much thought; it is assumed
6204 that "continue" in the code above comes out to here to repeat the main
6205 loop. */
6207 } /* End of main loop */
6208 /* Control never reaches here */
6211 /* When compiling to use the heap rather than the stack for recursive calls to
6212 match(), the RRETURN() macro jumps here. The number that is saved in
6213 frame->Xwhere indicates which label we actually want to return to. */
6215 #ifdef NO_RECURSE
6216 #define LBL(val) case val: goto L_RM##val;
6217 HEAP_RETURN:
6218 switch (frame->Xwhere)
6220 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6221 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6222 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6223 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6224 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6225 LBL(65) LBL(66)
6226 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6227 LBL(20) LBL(21)
6228 #endif
6229 #ifdef SUPPORT_UTF
6230 LBL(16) LBL(18)
6231 LBL(22) LBL(23) LBL(28) LBL(30)
6232 LBL(32) LBL(34) LBL(42) LBL(46)
6233 #ifdef SUPPORT_UCP
6234 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6235 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67)
6236 #endif /* SUPPORT_UCP */
6237 #endif /* SUPPORT_UTF */
6238 default:
6239 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6240 return PCRE_ERROR_INTERNAL;
6242 #undef LBL
6243 #endif /* NO_RECURSE */
6247 /***************************************************************************
6248 ****************************************************************************
6249 RECURSION IN THE match() FUNCTION
6251 Undefine all the macros that were defined above to handle this. */
/* Macro clean-up after match(). The names below were #defined for the
NO_RECURSE build of match(); they are removed here so that they cannot leak
into the code that follows. The order of the #undef lines is of no
significance. */

#ifdef NO_RECURSE

/* Names standing in for the arguments of match() */

#undef ecode
#undef eptr
#undef eptrb
#undef flags
#undef mstart
#undef offset_top

/* Names standing in for pointer working variables */

#undef callpat
#undef charptr
#undef data
#undef newptrb
#undef next
#undef pp
#undef prev
#undef saved_eptr

/* Names standing in for the remaining working variables */

#undef condition
#undef ctype
#undef cur_is_word
#undef length
#undef max
#undef min
#undef new_recursive
#undef number
#undef offset
#undef op
#undef prev_is_word
#undef save_capture_last
#undef save_offset1
#undef save_offset2
#undef save_offset3
#undef stacksave

#endif  /* NO_RECURSE */

/* These two are defined as macros whether or not NO_RECURSE is set */

#undef fi
#undef fc
6297 /***************************************************************************
6298 ***************************************************************************/
#ifdef NO_RECURSE
/*************************************************
*          Release allocated heap frames         *
*************************************************/

/* Walk the chain of frames that hangs off the base frame and hand each one
back via the stack_free function. The base frame itself is never passed to
stack_free: it lives on the machine stack, not on the heap.

Argument: the address of the base frame
Returns:  nothing
*/

static void
release_match_heapframes (heapframe *frame_base)
{
heapframe *frame;
heapframe *following;

/* The link to the next frame must be read before the current frame is
released, because the link is stored inside the frame being freed. */

for (frame = frame_base->Xnextframe; frame != NULL; frame = following)
  {
  following = frame->Xnextframe;
  (PUBL(stack_free))(frame);
  }
}
#endif
6327 /*************************************************
6328 * Execute a Regular Expression *
6329 *************************************************/
6331 /* This function applies a compiled re to a subject string and picks out
6332 portions of the string if it matches. Two elements in the vector are set for
6333 each substring: the offsets to the start and end of the substring.
6335 Arguments:
6336 argument_re points to the compiled expression
6337 extra_data points to extra data or is NULL
6338 subject points to the subject string
6339 length length of subject string (may contain binary zeros)
6340 start_offset where to start in the subject string
6341 options option bits
6342 offsets points to a vector of ints to be filled in with offsets
6343 offsetcount the number of elements in the vector
6345 Returns: > 0 => success; value is the number of elements filled in
6346 = 0 => success, but offsets is not big enough
6347 -1 => failed to match
6348 < -1 => some kind of unexpected problem
6351 #if defined COMPILE_PCRE8
6352 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6353 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6354 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6355 int offsetcount)
6356 #elif defined COMPILE_PCRE16
6357 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6358 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6359 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6360 int offsetcount)
6361 #elif defined COMPILE_PCRE32
6362 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6363 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6364 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6365 int offsetcount)
6366 #endif
6368 int rc, ocount, arg_offset_max;
6369 int newline;
6370 BOOL using_temporary_offsets = FALSE;
6371 BOOL anchored;
6372 BOOL startline;
6373 BOOL firstline;
6374 BOOL utf;
6375 BOOL has_first_char = FALSE;
6376 BOOL has_req_char = FALSE;
6377 pcre_uchar first_char = 0;
6378 pcre_uchar first_char2 = 0;
6379 pcre_uchar req_char = 0;
6380 pcre_uchar req_char2 = 0;
6381 match_data match_block;
6382 match_data *md = &match_block;
6383 const pcre_uint8 *tables;
6384 const pcre_uint8 *start_bits = NULL;
6385 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6386 PCRE_PUCHAR end_subject;
6387 PCRE_PUCHAR start_partial = NULL;
6388 PCRE_PUCHAR match_partial = NULL;
6389 PCRE_PUCHAR req_char_ptr = start_match - 1;
6391 const pcre_study_data *study;
6392 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6394 #ifdef NO_RECURSE
6395 heapframe frame_zero;
6396 frame_zero.Xprevframe = NULL; /* Marks the top level */
6397 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6398 md->match_frames_base = &frame_zero;
6399 #endif
6401 /* Check for the special magic call that measures the size of the stack used
6402 per recursive call of match(). Without the funny casting for sizeof, a Windows
6403 compiler gave this error: "unary minus operator applied to unsigned type,
6404 result still unsigned". Hopefully the cast fixes that. */
6406 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6407 start_offset == -999)
6408 #ifdef NO_RECURSE
6409 return -((int)sizeof(heapframe));
6410 #else
6411 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6412 #endif
6414 /* Plausibility checks */
6416 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6417 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6418 return PCRE_ERROR_NULL;
6419 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6420 if (length < 0) return PCRE_ERROR_BADLENGTH;
6421 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6423 /* Check that the first field in the block is the magic number. If it is not,
6424 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6425 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6426 means that the pattern is likely compiled with different endianness. */
6428 if (re->magic_number != MAGIC_NUMBER)
6429 return re->magic_number == REVERSED_MAGIC_NUMBER?
6430 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6431 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6433 /* These two settings are used in the code for checking a UTF-8 string that
6434 follows immediately afterwards. Other values in the md block are used only
6435 during "normal" pcre_exec() processing, not when the JIT support is in use,
6436 so they are set up later. */
6438 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6439 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6440 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6441 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6443 /* Check a UTF-8 string if required. Pass back the character offset and error
6444 code for an invalid string if a results vector is available. */
6446 #ifdef SUPPORT_UTF
6447 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6449 int erroroffset;
6450 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6451 if (errorcode != 0)
6453 if (offsetcount >= 2)
6455 offsets[0] = erroroffset;
6456 offsets[1] = errorcode;
6458 #if defined COMPILE_PCRE8
6459 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6460 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6461 #elif defined COMPILE_PCRE16
6462 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6463 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6464 #elif defined COMPILE_PCRE32
6465 return PCRE_ERROR_BADUTF32;
6466 #endif
6468 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6469 /* Check that a start_offset points to the start of a UTF character. */
6470 if (start_offset > 0 && start_offset < length &&
6471 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6472 return PCRE_ERROR_BADUTF8_OFFSET;
6473 #endif
6475 #endif
6477 /* If the pattern was successfully studied with JIT support, run the JIT
6478 executable instead of the rest of this function. Most options must be set at
6479 compile time for the JIT code to be usable. Fallback to the normal code path if
6480 an unsupported flag is set. */
6482 #ifdef SUPPORT_JIT
6483 if (extra_data != NULL
6484 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6485 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6486 && extra_data->executable_jit != NULL
6487 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6489 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6490 start_offset, options, offsets, offsetcount);
6492 /* PCRE_ERROR_NULL means that the selected normal or partial matching
6493 mode is not compiled. In this case we simply fallback to interpreter. */
6495 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6497 #endif
6499 /* Carry on with non-JIT matching. This information is for finding all the
6500 numbers associated with a given name, for condition testing. */
6502 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6503 md->name_count = re->name_count;
6504 md->name_entry_size = re->name_entry_size;
6506 /* Fish out the optional data from the extra_data structure, first setting
6507 the default values. */
6509 study = NULL;
6510 md->match_limit = MATCH_LIMIT;
6511 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6512 md->callout_data = NULL;
6514 /* The table pointer is always in native byte order. */
6516 tables = re->tables;
6518 /* The two limit values override the defaults, whatever their value. */
6520 if (extra_data != NULL)
6522 register unsigned int flags = extra_data->flags;
6523 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6524 study = (const pcre_study_data *)extra_data->study_data;
6525 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6526 md->match_limit = extra_data->match_limit;
6527 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6528 md->match_limit_recursion = extra_data->match_limit_recursion;
6529 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6530 md->callout_data = extra_data->callout_data;
6531 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6534 /* Limits in the regex override only if they are smaller. */
6536 if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit)
6537 md->match_limit = re->limit_match;
6539 if ((re->flags & PCRE_RLSET) != 0 &&
6540 re->limit_recursion < md->match_limit_recursion)
6541 md->match_limit_recursion = re->limit_recursion;
6543 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6544 is a feature that makes it possible to save compiled regex and re-use them
6545 in other programs later. */
6547 if (tables == NULL) tables = PRIV(default_tables);
6549 /* Set up other data */
6551 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6552 startline = (re->flags & PCRE_STARTLINE) != 0;
6553 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6555 /* The code starts after the real_pcre block and the capture name table. */
6557 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6558 re->name_count * re->name_entry_size;
6560 md->start_subject = (PCRE_PUCHAR)subject;
6561 md->start_offset = start_offset;
6562 md->end_subject = md->start_subject + length;
6563 end_subject = md->end_subject;
6565 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6566 md->use_ucp = (re->options & PCRE_UCP) != 0;
6567 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6568 md->ignore_skip_arg = 0;
6570 /* Some options are unpacked into BOOL variables in the hope that testing
6571 them will be faster than individual option bits. */
6573 md->notbol = (options & PCRE_NOTBOL) != 0;
6574 md->noteol = (options & PCRE_NOTEOL) != 0;
6575 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6576 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6578 md->hitend = FALSE;
6579 md->mark = md->nomatch_mark = NULL; /* In case never set */
6581 md->recursive = NULL; /* No recursion at top level */
6582 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6584 md->lcc = tables + lcc_offset;
6585 md->fcc = tables + fcc_offset;
6586 md->ctypes = tables + ctypes_offset;
6588 /* Handle different \R options. */
6590 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6592 case 0:
6593 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6594 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6595 else
6596 #ifdef BSR_ANYCRLF
6597 md->bsr_anycrlf = TRUE;
6598 #else
6599 md->bsr_anycrlf = FALSE;
6600 #endif
6601 break;
6603 case PCRE_BSR_ANYCRLF:
6604 md->bsr_anycrlf = TRUE;
6605 break;
6607 case PCRE_BSR_UNICODE:
6608 md->bsr_anycrlf = FALSE;
6609 break;
6611 default: return PCRE_ERROR_BADNEWLINE;
6614 /* Handle different types of newline. The three bits give eight cases. If
6615 nothing is set at run time, whatever was used at compile time applies. */
6617 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6618 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6620 case 0: newline = NEWLINE; break; /* Compile-time default */
6621 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6622 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6623 case PCRE_NEWLINE_CR+
6624 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6625 case PCRE_NEWLINE_ANY: newline = -1; break;
6626 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6627 default: return PCRE_ERROR_BADNEWLINE;
6630 if (newline == -2)
6632 md->nltype = NLTYPE_ANYCRLF;
6634 else if (newline < 0)
6636 md->nltype = NLTYPE_ANY;
6638 else
6640 md->nltype = NLTYPE_FIXED;
6641 if (newline > 255)
6643 md->nllen = 2;
6644 md->nl[0] = (newline >> 8) & 255;
6645 md->nl[1] = newline & 255;
6647 else
6649 md->nllen = 1;
6650 md->nl[0] = newline;
6654 /* Partial matching was originally supported only for a restricted set of
6655 regexes; from release 8.00 there are no restrictions, but the bits are still
6656 defined (though never set). So there's no harm in leaving this code. */
6658 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6659 return PCRE_ERROR_BADPARTIAL;
6661 /* If the expression has got more back references than the offsets supplied can
6662 hold, we get a temporary chunk of working store to use during the matching.
6663 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6664 of 3. */
6666 ocount = offsetcount - (offsetcount % 3);
6667 arg_offset_max = (2*ocount)/3;
6669 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6671 ocount = re->top_backref * 3 + 3;
6672 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6673 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6674 using_temporary_offsets = TRUE;
6675 DPRINTF(("Got memory to hold back references\n"));
6677 else md->offset_vector = offsets;
6678 md->offset_end = ocount;
6679 md->offset_max = (2*ocount)/3;
6680 md->capture_last = 0;
6682 /* Reset the working variable associated with each extraction. These should
6683 never be used unless previously set, but they get saved and restored, and so we
6684 initialize them to avoid reading uninitialized locations. Also, unset the
6685 offsets for the matched string. This is really just for tidiness with callouts,
6686 in case they inspect these fields. */
6688 if (md->offset_vector != NULL)
6690 register int *iptr = md->offset_vector + ocount;
6691 register int *iend = iptr - re->top_bracket;
6692 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6693 while (--iptr >= iend) *iptr = -1;
6694 md->offset_vector[0] = md->offset_vector[1] = -1;
6697 /* Set up the first character to match, if available. The first_char value is
6698 never set for an anchored regular expression, but the anchoring may be forced
6699 at run time, so we have to test for anchoring. The first char may be unset for
6700 an unanchored pattern, of course. If there's no first char and the pattern was
6701 studied, there may be a bitmap of possible first characters. */
6703 if (!anchored)
6705 if ((re->flags & PCRE_FIRSTSET) != 0)
6707 has_first_char = TRUE;
6708 first_char = first_char2 = (pcre_uchar)(re->first_char);
6709 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6711 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6712 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6713 if (utf && first_char > 127)
6714 first_char2 = UCD_OTHERCASE(first_char);
6715 #endif
6718 else
6719 if (!startline && study != NULL &&
6720 (study->flags & PCRE_STUDY_MAPPED) != 0)
6721 start_bits = study->start_bits;
6724 /* For anchored or unanchored matches, there may be a "last known required
6725 character" set. */
6727 if ((re->flags & PCRE_REQCHSET) != 0)
6729 has_req_char = TRUE;
6730 req_char = req_char2 = (pcre_uchar)(re->req_char);
6731 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6733 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6734 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6735 if (utf && req_char > 127)
6736 req_char2 = UCD_OTHERCASE(req_char);
6737 #endif
6742 /* ==========================================================================*/
6744 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6745 the loop runs just once. */
6747 for(;;)
6749 PCRE_PUCHAR save_end_subject = end_subject;
6750 PCRE_PUCHAR new_start_match;
6752 /* If firstline is TRUE, the start of the match is constrained to the first
6753 line of a multiline string. That is, the match must be before or at the first
6754 newline. Implement this by temporarily adjusting end_subject so that we stop
6755 scanning at a newline. If the match fails at the newline, later code breaks
6756 this loop. */
6758 if (firstline)
6760 PCRE_PUCHAR t = start_match;
6761 #ifdef SUPPORT_UTF
6762 if (utf)
6764 while (t < md->end_subject && !IS_NEWLINE(t))
6766 t++;
6767 ACROSSCHAR(t < end_subject, *t, t++);
6770 else
6771 #endif
6772 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6773 end_subject = t;
6776 /* There are some optimizations that avoid running the match if a known
6777 starting point is not found, or if a known later character is not present.
6778 However, there is an option that disables these, for testing and for ensuring
6779 that all callouts do actually occur. The option can be set in the regex by
6780 (*NO_START_OPT) or passed in match-time options. */
6782 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6784 /* Advance to a unique first char if there is one. */
6786 if (has_first_char)
6788 pcre_uchar smc;
6790 if (first_char != first_char2)
6791 while (start_match < end_subject &&
6792 (smc = UCHAR21TEST(start_match)) != first_char && smc != first_char2)
6793 start_match++;
6794 else
6795 while (start_match < end_subject && UCHAR21TEST(start_match) != first_char)
6796 start_match++;
6799 /* Or to just after a linebreak for a multiline match */
6801 else if (startline)
6803 if (start_match > md->start_subject + start_offset)
6805 #ifdef SUPPORT_UTF
6806 if (utf)
6808 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6810 start_match++;
6811 ACROSSCHAR(start_match < end_subject, *start_match,
6812 start_match++);
6815 else
6816 #endif
6817 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6818 start_match++;
6820 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6821 and we are now at a LF, advance the match position by one more character.
6824 if (start_match[-1] == CHAR_CR &&
6825 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6826 start_match < end_subject &&
6827 UCHAR21TEST(start_match) == CHAR_NL)
6828 start_match++;
6832 /* Or to a non-unique first byte after study */
6834 else if (start_bits != NULL)
6836 while (start_match < end_subject)
6838 register pcre_uint32 c = UCHAR21TEST(start_match);
6839 #ifndef COMPILE_PCRE8
6840 if (c > 255) c = 255;
6841 #endif
6842 if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
6843 start_match++;
6846 } /* Starting optimizations */
6848 /* Restore fudged end_subject */
6850 end_subject = save_end_subject;
6852 /* The following two optimizations are disabled for partial matching or if
6853 disabling is explicitly requested. */
6855 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6857 /* If the pattern was studied, a minimum subject length may be set. This is
6858 a lower bound; no actual string of that length may actually match the
6859 pattern. Although the value is, strictly, in characters, we treat it as
6860 bytes to avoid spending too much time in this optimization. */
6862 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6863 (pcre_uint32)(end_subject - start_match) < study->minlength)
6865 rc = MATCH_NOMATCH;
6866 break;
6869 /* If req_char is set, we know that that character must appear in the
6870 subject for the match to succeed. If the first character is set, req_char
6871 must be later in the subject; otherwise the test starts at the match point.
6872 This optimization can save a huge amount of backtracking in patterns with
6873 nested unlimited repeats that aren't going to match. Writing separate code
6874 for cased/caseless versions makes it go faster, as does using an
6875 autoincrement and backing off on a match.
6877 HOWEVER: when the subject string is very, very long, searching to its end
6878 can take a long time, and give bad performance on quite ordinary patterns.
6879 This showed up when somebody was matching something like /^\d+C/ on a
6880 32-megabyte string... so we don't do this when the string is sufficiently
6881 long. */
6883 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6885 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6887 /* We don't need to repeat the search if we haven't yet reached the
6888 place we found it at last time. */
6890 if (p > req_char_ptr)
6892 if (req_char != req_char2)
6894 while (p < end_subject)
6896 register pcre_uint32 pp = UCHAR21INCTEST(p);
6897 if (pp == req_char || pp == req_char2) { p--; break; }
6900 else
6902 while (p < end_subject)
6904 if (UCHAR21INCTEST(p) == req_char) { p--; break; }
6908 /* If we can't find the required character, break the matching loop,
6909 forcing a match failure. */
6911 if (p >= end_subject)
6913 rc = MATCH_NOMATCH;
6914 break;
6917 /* If we have found the required character, save the point where we
6918 found it, so that we don't search again next time round the loop if
6919 the start hasn't passed this character yet. */
6921 req_char_ptr = p;
6926 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6927 printf(">>>> Match against: ");
6928 pchars(start_match, end_subject - start_match, TRUE, md);
6929 printf("\n");
6930 #endif
6932 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6933 first starting point for which a partial match was found. */
6935 md->start_match_ptr = start_match;
6936 md->start_used_ptr = start_match;
6937 md->match_call_count = 0;
6938 md->match_function_type = 0;
6939 md->end_offset_top = 0;
6940 md->skip_arg_count = 0;
6941 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
6942 if (md->hitend && start_partial == NULL)
6944 start_partial = md->start_used_ptr;
6945 match_partial = start_match;
6948 switch(rc)
6950 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6951 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6952 entirely. The only way we can do that is to re-do the match at the same
6953 point, with a flag to force SKIP with an argument to be ignored. Just
6954 treating this case as NOMATCH does not work because it does not check other
6955 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6957 case MATCH_SKIP_ARG:
6958 new_start_match = start_match;
6959 md->ignore_skip_arg = md->skip_arg_count;
6960 break;
6962 /* SKIP passes back the next starting point explicitly, but if it is no
6963 greater than the match we have just done, treat it as NOMATCH. */
6965 case MATCH_SKIP:
6966 if (md->start_match_ptr > start_match)
6968 new_start_match = md->start_match_ptr;
6969 break;
6971 /* Fall through */
6973 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6974 exactly like PRUNE. Unset ignore SKIP-with-argument. */
6976 case MATCH_NOMATCH:
6977 case MATCH_PRUNE:
6978 case MATCH_THEN:
6979 md->ignore_skip_arg = 0;
6980 new_start_match = start_match + 1;
6981 #ifdef SUPPORT_UTF
6982 if (utf)
6983 ACROSSCHAR(new_start_match < end_subject, *new_start_match,
6984 new_start_match++);
6985 #endif
6986 break;
6988 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6990 case MATCH_COMMIT:
6991 rc = MATCH_NOMATCH;
6992 goto ENDLOOP;
6994 /* Any other return is either a match, or some kind of error. */
6996 default:
6997 goto ENDLOOP;
7000 /* Control reaches here for the various types of "no match at this point"
7001 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
7003 rc = MATCH_NOMATCH;
7005 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
7006 newline in the subject (though it may continue over the newline). Therefore,
7007 if we have just failed to match, starting at a newline, do not continue. */
7009 if (firstline && IS_NEWLINE(start_match)) break;
7011 /* Advance to new matching position */
7013 start_match = new_start_match;
7015 /* Break the loop if the pattern is anchored or if we have passed the end of
7016 the subject. */
7018 if (anchored || start_match > end_subject) break;
7020 /* If we have just passed a CR and we are now at a LF, and the pattern does
7021 not contain any explicit matches for \r or \n, and the newline option is CRLF
7022 or ANY or ANYCRLF, advance the match position by one more character. In
7023 normal matching start_match will aways be greater than the first position at
7024 this stage, but a failed *SKIP can cause a return at the same point, which is
7025 why the first test exists. */
7027 if (start_match > (PCRE_PUCHAR)subject + start_offset &&
7028 start_match[-1] == CHAR_CR &&
7029 start_match < end_subject &&
7030 *start_match == CHAR_NL &&
7031 (re->flags & PCRE_HASCRORLF) == 0 &&
7032 (md->nltype == NLTYPE_ANY ||
7033 md->nltype == NLTYPE_ANYCRLF ||
7034 md->nllen == 2))
7035 start_match++;
7037 md->mark = NULL; /* Reset for start of next match attempt */
7038 } /* End of for(;;) "bumpalong" loop */
7040 /* ==========================================================================*/
7042 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
7043 conditions is true:
7045 (1) The pattern is anchored or the match was failed by (*COMMIT);
7047 (2) We are past the end of the subject;
7049 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
7050 this option requests that a match occur at or before the first newline in
7051 the subject.
7053 When we have a match and the offset vector is big enough to deal with any
7054 backreferences, captured substring offsets will already be set up. In the case
7055 where we had to get some local store to hold offsets for backreference
7056 processing, copy those that we can. In this case there need not be overflow if
7057 certain parts of the pattern were not used, even though there are more
7058 capturing parentheses than vector slots. */
7060 ENDLOOP:
7062 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
7064 if (using_temporary_offsets)
7066 if (arg_offset_max >= 4)
7068 memcpy(offsets + 2, md->offset_vector + 2,
7069 (arg_offset_max - 2) * sizeof(int));
7070 DPRINTF(("Copied offsets from temporary memory\n"));
7072 if (md->end_offset_top > arg_offset_max) md->capture_last |= OVFLBIT;
7073 DPRINTF(("Freeing temporary memory\n"));
7074 (PUBL(free))(md->offset_vector);
7077 /* Set the return code to the number of captured strings, or 0 if there were
7078 too many to fit into the vector. */
7080 rc = ((md->capture_last & OVFLBIT) != 0 &&
7081 md->end_offset_top >= arg_offset_max)?
7082 0 : md->end_offset_top/2;
7084 /* If there is space in the offset vector, set any unused pairs at the end of
7085 the pattern to -1 for backwards compatibility. It is documented that this
7086 happens. In earlier versions, the whole set of potential capturing offsets
7087 was set to -1 each time round the loop, but this is handled differently now.
7088 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
7089 those at the end that need unsetting here. We can't just unset them all at
7090 the start of the whole thing because they may get set in one branch that is
7091 not the final matching branch. */
7093 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
7095 register int *iptr, *iend;
7096 int resetcount = 2 + re->top_bracket * 2;
7097 if (resetcount > offsetcount) resetcount = offsetcount;
7098 iptr = offsets + md->end_offset_top;
7099 iend = offsets + resetcount;
7100 while (iptr < iend) *iptr++ = -1;
7103 /* If there is space, set up the whole thing as substring 0. The value of
7104 md->start_match_ptr might be modified if \K was encountered on the success
7105 matching path. */
7107 if (offsetcount < 2) rc = 0; else
7109 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
7110 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
7113 /* Return MARK data if requested */
7115 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
7116 *(extra_data->mark) = (pcre_uchar *)md->mark;
7117 DPRINTF((">>>> returning %d\n", rc));
7118 #ifdef NO_RECURSE
7119 release_match_heapframes(&frame_zero);
7120 #endif
7121 return rc;
7124 /* Control gets here if there has been an error, or if the overall match
7125 attempt has failed at all permitted starting positions. */
7127 if (using_temporary_offsets)
7129 DPRINTF(("Freeing temporary memory\n"));
7130 (PUBL(free))(md->offset_vector);
7133 /* For anything other than nomatch or partial match, just return the code. */
7135 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
7137 DPRINTF((">>>> error: returning %d\n", rc));
7138 #ifdef NO_RECURSE
7139 release_match_heapframes(&frame_zero);
7140 #endif
7141 return rc;
7144 /* Handle partial matches - disable any mark data */
7146 if (match_partial != NULL)
7148 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
7149 md->mark = NULL;
7150 if (offsetcount > 1)
7152 offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject);
7153 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
7154 if (offsetcount > 2)
7155 offsets[2] = (int)(match_partial - (PCRE_PUCHAR)subject);
7157 rc = PCRE_ERROR_PARTIAL;
7160 /* This is the classic nomatch case */
7162 else
7164 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
7165 rc = PCRE_ERROR_NOMATCH;
7168 /* Return the MARK data if it has been requested. */
7170 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
7171 *(extra_data->mark) = (pcre_uchar *)md->nomatch_mark;
7172 #ifdef NO_RECURSE
7173 release_match_heapframes(&frame_zero);
7174 #endif
7175 return rc;
7178 /* End of pcre_exec.c */