Updates to Tomato RAF including NGINX && PHP
[tomato.git] / release / src / router / php / ext / pcre / pcrelib / pcre_exec.c
blob05d0e52d33c91df2b014a55fcc56c87e49730142
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
52 #include "pcre_internal.h"
54 /* Undefine some potentially clashing cpp symbols */
56 #undef min
57 #undef max
59 /* Values for setting in md->match_function_type to indicate two special types
60 of call to match(). We do it this way to save on using another stack variable,
61 as stack usage is to be discouraged. */
63 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
64 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66 /* Non-error returns from the match() function. Error returns are externally
67 defined PCRE_ERROR_xxx codes, which are all negative. */
69 #define MATCH_MATCH 1
70 #define MATCH_NOMATCH 0
72 /* Special internal returns from the match() function. Make them sufficiently
73 negative to avoid the external error codes. */
75 #define MATCH_ACCEPT (-999)
76 #define MATCH_COMMIT (-998)
77 #define MATCH_KETRPOS (-997)
78 #define MATCH_ONCE (-996)
79 #define MATCH_PRUNE (-995)
80 #define MATCH_SKIP (-994)
81 #define MATCH_SKIP_ARG (-993)
82 #define MATCH_THEN (-992)
84 /* Maximum number of ints of offset to save on the stack for recursive calls.
85 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
86 because the offset vector is always a multiple of 3 long. */
88 #define REC_STACK_SAVE_MAX 30
/* Lower and upper repeat counts for the six common repeat forms. The two
tables are indexed in step with each other. In rep_max, a value of 0 means
that there is no upper limit. */

static const char rep_min[] = {
  0, 0,     /* entries 0-1: minimum 0 */
  1, 1,     /* entries 2-3: minimum 1 */
  0, 0      /* entries 4-5: minimum 0 */
};

static const char rep_max[] = {
  0, 0,     /* entries 0-1: unlimited */
  0, 0,     /* entries 2-3: unlimited */
  1, 1      /* entries 4-5: at most once */
};
95 #ifdef PCRE_DEBUG
96 /*************************************************
97 * Debugging function to print chars *
98 *************************************************/
100 /* Print a sequence of chars in printable format, stopping at the end of the
101 subject if the requested.
103 Arguments:
104 p points to characters
105 length number to print
106 is_subject TRUE if printing from within md->start_subject
107 md pointer to matching data block, if is_subject is TRUE
109 Returns: nothing
112 static void
113 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
115 pcre_uint32 c;
116 BOOL utf = md->utf;
117 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
118 while (length-- > 0)
119 if (isprint(c = RAWUCHARINCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
121 #endif
125 /*************************************************
126 * Match a back-reference *
127 *************************************************/
129 /* Normally, if a back reference hasn't been set, the length that is passed is
130 negative, so the match always fails. However, in JavaScript compatibility mode,
131 the length passed is zero. Note that in caseless UTF-8 mode, the number of
132 subject bytes matched may be different to the number of reference bytes.
134 Arguments:
135 offset index into the offset vector
136 eptr pointer into the subject
137 length length of reference to be matched (number of bytes)
138 md points to match data block
139 caseless TRUE if caseless
141 Returns: >= 0 the number of subject bytes matched
142 -1 no match
143 -2 partial match; always given if at end subject
146 static int
147 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
148 BOOL caseless)
150 PCRE_PUCHAR eptr_start = eptr;
151 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
152 #ifdef SUPPORT_UTF
153 BOOL utf = md->utf;
154 #endif
156 #ifdef PCRE_DEBUG
157 if (eptr >= md->end_subject)
158 printf("matching subject <null>");
159 else
161 printf("matching subject ");
162 pchars(eptr, length, TRUE, md);
164 printf(" against backref ");
165 pchars(p, length, FALSE, md);
166 printf("\n");
167 #endif
169 /* Always fail if reference not set (and not JavaScript compatible - in that
170 case the length is passed as zero). */
172 if (length < 0) return -1;
174 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
175 properly if Unicode properties are supported. Otherwise, we can check only
176 ASCII characters. */
178 if (caseless)
180 #ifdef SUPPORT_UTF
181 #ifdef SUPPORT_UCP
182 if (utf)
184 /* Match characters up to the end of the reference. NOTE: the number of
185 data units matched may differ, because in UTF-8 there are some characters
186 whose upper and lower case versions code have different numbers of bytes.
187 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
188 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
189 sequence of two of the latter. It is important, therefore, to check the
190 length along the reference, not along the subject (earlier code did this
191 wrong). */
193 PCRE_PUCHAR endptr = p + length;
194 while (p < endptr)
196 pcre_uint32 c, d;
197 const ucd_record *ur;
198 if (eptr >= md->end_subject) return -2; /* Partial match */
199 GETCHARINC(c, eptr);
200 GETCHARINC(d, p);
201 ur = GET_UCD(d);
202 if (c != d && c != d + ur->other_case)
204 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
205 for (;;)
207 if (c < *pp) return -1;
208 if (c == *pp++) break;
213 else
214 #endif
215 #endif
217 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
218 is no UCP support. */
220 while (length-- > 0)
222 pcre_uchar cc, cp;
223 if (eptr >= md->end_subject) return -2; /* Partial match */
224 cc = RAWUCHARTEST(eptr);
225 cp = RAWUCHARTEST(p);
226 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
227 p++;
228 eptr++;
233 /* In the caseful case, we can just compare the bytes, whether or not we
234 are in UTF-8 mode. */
236 else
238 while (length-- > 0)
240 if (eptr >= md->end_subject) return -2; /* Partial match */
241 if (RAWUCHARINCTEST(p) != RAWUCHARINCTEST(eptr)) return -1;
245 return (int)(eptr - eptr_start);
250 /***************************************************************************
251 ****************************************************************************
252 RECURSION IN THE match() FUNCTION
254 The match() function is highly recursive, though not every recursive call
255 increases the recursive depth. Nevertheless, some regular expressions can cause
256 it to recurse to a great depth. I was writing for Unix, so I just let it call
257 itself recursively. This uses the stack for saving everything that has to be
258 saved for a recursive call. On Unix, the stack can be large, and this works
259 fine.
261 It turns out that on some non-Unix-like systems there are problems with
262 programs that use a lot of stack. (This despite the fact that every last chip
263 has oodles of memory these days, and techniques for extending the stack have
264 been known for decades.) So....
266 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
267 calls by keeping local variables that need to be preserved in blocks of memory
268 obtained from malloc() instead of on the stack. Macros are used to
269 achieve this so that the actual code doesn't look very different to what it
270 always used to.
272 The original heap-recursive code used longjmp(). However, it seems that this
273 can be very slow on some operating systems. Following a suggestion from Stan
274 Switzer, the use of longjmp() has been abolished, at the cost of having to
275 provide a unique number for each call to RMATCH. There is no way of generating
276 a sequence of numbers at compile time in C. I have given them names, to make
277 them stand out more clearly.
279 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
280 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
281 tests. Furthermore, not using longjmp() means that local dynamic variables
282 don't have indeterminate values; this has meant that the frame size can be
283 reduced because the result can be "passed back" by straight setting of the
284 variable instead of being passed in the frame.
285 ****************************************************************************
286 ***************************************************************************/
288 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
289 below must be updated in sync. */
291 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
292 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
293 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
294 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
295 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
296 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
297 RM61, RM62, RM63, RM64, RM65, RM66, RM67 };
299 /* These versions of the macros use the stack, as normal. There are debugging
300 versions and production versions. Note that the "rw" argument of RMATCH isn't
301 actually used in this definition. */
303 #ifndef NO_RECURSE
304 #define REGISTER register
306 #ifdef PCRE_DEBUG
307 #define RMATCH(ra,rb,rc,rd,re,rw) \
309 printf("match() called in line %d\n", __LINE__); \
310 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
311 printf("to line %d\n", __LINE__); \
313 #define RRETURN(ra) \
315 printf("match() returned %d from line %d\n", ra, __LINE__); \
316 return ra; \
318 #else
319 #define RMATCH(ra,rb,rc,rd,re,rw) \
320 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
321 #define RRETURN(ra) return ra
322 #endif
324 #else
327 /* These versions of the macros manage a private stack on the heap. Note that
328 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
329 argument of match(), which never changes. */
331 #define REGISTER
333 #define RMATCH(ra,rb,rc,rd,re,rw)\
335 heapframe *newframe = frame->Xnextframe;\
336 if (newframe == NULL)\
338 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
339 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
340 newframe->Xnextframe = NULL;\
341 frame->Xnextframe = newframe;\
343 frame->Xwhere = rw;\
344 newframe->Xeptr = ra;\
345 newframe->Xecode = rb;\
346 newframe->Xmstart = mstart;\
347 newframe->Xoffset_top = rc;\
348 newframe->Xeptrb = re;\
349 newframe->Xrdepth = frame->Xrdepth + 1;\
350 newframe->Xprevframe = frame;\
351 frame = newframe;\
352 DPRINTF(("restarting from line %d\n", __LINE__));\
353 goto HEAP_RECURSE;\
354 L_##rw:\
355 DPRINTF(("jumped back to line %d\n", __LINE__));\
358 #define RRETURN(ra)\
360 heapframe *oldframe = frame;\
361 frame = oldframe->Xprevframe;\
362 if (frame != NULL)\
364 rrc = ra;\
365 goto HEAP_RETURN;\
367 return ra;\
371 /* Structure for remembering the local variables in a private frame */
373 typedef struct heapframe {
374 struct heapframe *Xprevframe;
375 struct heapframe *Xnextframe;
377 /* Function arguments that may change */
379 PCRE_PUCHAR Xeptr;
380 const pcre_uchar *Xecode;
381 PCRE_PUCHAR Xmstart;
382 int Xoffset_top;
383 eptrblock *Xeptrb;
384 unsigned int Xrdepth;
386 /* Function local variables */
388 PCRE_PUCHAR Xcallpat;
389 #ifdef SUPPORT_UTF
390 PCRE_PUCHAR Xcharptr;
391 #endif
392 PCRE_PUCHAR Xdata;
393 PCRE_PUCHAR Xnext;
394 PCRE_PUCHAR Xpp;
395 PCRE_PUCHAR Xprev;
396 PCRE_PUCHAR Xsaved_eptr;
398 recursion_info Xnew_recursive;
400 BOOL Xcur_is_word;
401 BOOL Xcondition;
402 BOOL Xprev_is_word;
404 #ifdef SUPPORT_UCP
405 int Xprop_type;
406 unsigned int Xprop_value;
407 int Xprop_fail_result;
408 int Xoclength;
409 pcre_uchar Xocchars[6];
410 #endif
412 int Xcodelink;
413 int Xctype;
414 unsigned int Xfc;
415 int Xfi;
416 int Xlength;
417 int Xmax;
418 int Xmin;
419 int Xnumber;
420 int Xoffset;
421 int Xop;
422 int Xsave_capture_last;
423 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
424 int Xstacksave[REC_STACK_SAVE_MAX];
426 eptrblock Xnewptrb;
428 /* Where to jump back to */
430 int Xwhere;
432 } heapframe;
434 #endif
437 /***************************************************************************
438 ***************************************************************************/
442 /*************************************************
443 * Match from current position *
444 *************************************************/
446 /* This function is called recursively in many circumstances. Whenever it
447 returns a negative (error) response, the outer incarnation must also return the
448 same response. */
450 /* These macros pack up tests that are used for partial matching, and which
451 appear several times in the code. We set the "hit end" flag if the pointer is
452 at the end of the subject and also past the start of the subject (i.e.
453 something has been matched). For hard partial matching, we then return
454 immediately. The second one is used when we already know we are past the end of
455 the subject. */
457 #define CHECK_PARTIAL()\
458 if (md->partial != 0 && eptr >= md->end_subject && \
459 eptr > md->start_used_ptr) \
461 md->hitend = TRUE; \
462 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
465 #define SCHECK_PARTIAL()\
466 if (md->partial != 0 && eptr > md->start_used_ptr) \
468 md->hitend = TRUE; \
469 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
473 /* Performance note: It might be tempting to extract commonly used fields from
474 the md structure (e.g. utf, end_subject) into individual variables to improve
475 performance. Tests using gcc on a SPARC disproved this; in the first case, it
476 made performance worse.
478 Arguments:
479 eptr pointer to current character in subject
480 ecode pointer to current position in compiled code
481 mstart pointer to the current match start position (can be modified
482 by encountering \K)
483 offset_top current top pointer
484 md pointer to "static" info for the match
485 eptrb pointer to chain of blocks containing eptr at start of
486 brackets - for testing for empty matches
487 rdepth the recursion depth
489 Returns: MATCH_MATCH if matched ) these values are >= 0
490 MATCH_NOMATCH if failed to match )
491 a negative MATCH_xxx value for PRUNE, SKIP, etc
492 a negative PCRE_ERROR_xxx value if aborted by an error condition
493 (e.g. stopped by repeated call or recursion limit)
496 static int
497 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
498 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
499 unsigned int rdepth)
501 /* These variables do not need to be preserved over recursion in this function,
502 so they can be ordinary variables in all cases. Mark some of them with
503 "register" because they are used a lot in loops. */
505 register int rrc; /* Returns from recursive calls */
506 register int i; /* Used for loops not involving calls to RMATCH() */
507 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
508 register BOOL utf; /* Local copy of UTF flag for speed */
510 BOOL minimize, possessive; /* Quantifier options */
511 BOOL caseless;
512 int condcode;
514 /* When recursion is not being used, all "local" variables that have to be
515 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
516 frame on the stack here; subsequent instantiations are obtained from the heap
517 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
518 the top-level on the stack rather than malloc-ing them all gives a performance
519 boost in many cases where there is not much "recursion". */
521 #ifdef NO_RECURSE
522 heapframe *frame = (heapframe *)md->match_frames_base;
524 /* Copy in the original argument variables */
526 frame->Xeptr = eptr;
527 frame->Xecode = ecode;
528 frame->Xmstart = mstart;
529 frame->Xoffset_top = offset_top;
530 frame->Xeptrb = eptrb;
531 frame->Xrdepth = rdepth;
533 /* This is where control jumps back to effect "recursion" */
535 HEAP_RECURSE:
537 /* Macros make the argument variables come from the current frame */
539 #define eptr frame->Xeptr
540 #define ecode frame->Xecode
541 #define mstart frame->Xmstart
542 #define offset_top frame->Xoffset_top
543 #define eptrb frame->Xeptrb
544 #define rdepth frame->Xrdepth
546 /* Ditto for the local variables */
548 #ifdef SUPPORT_UTF
549 #define charptr frame->Xcharptr
550 #endif
551 #define callpat frame->Xcallpat
552 #define codelink frame->Xcodelink
553 #define data frame->Xdata
554 #define next frame->Xnext
555 #define pp frame->Xpp
556 #define prev frame->Xprev
557 #define saved_eptr frame->Xsaved_eptr
559 #define new_recursive frame->Xnew_recursive
561 #define cur_is_word frame->Xcur_is_word
562 #define condition frame->Xcondition
563 #define prev_is_word frame->Xprev_is_word
565 #ifdef SUPPORT_UCP
566 #define prop_type frame->Xprop_type
567 #define prop_value frame->Xprop_value
568 #define prop_fail_result frame->Xprop_fail_result
569 #define oclength frame->Xoclength
570 #define occhars frame->Xocchars
571 #endif
573 #define ctype frame->Xctype
574 #define fc frame->Xfc
575 #define fi frame->Xfi
576 #define length frame->Xlength
577 #define max frame->Xmax
578 #define min frame->Xmin
579 #define number frame->Xnumber
580 #define offset frame->Xoffset
581 #define op frame->Xop
582 #define save_capture_last frame->Xsave_capture_last
583 #define save_offset1 frame->Xsave_offset1
584 #define save_offset2 frame->Xsave_offset2
585 #define save_offset3 frame->Xsave_offset3
586 #define stacksave frame->Xstacksave
588 #define newptrb frame->Xnewptrb
590 /* When recursion is being used, local variables are allocated on the stack and
591 get preserved during recursion in the normal way. In this environment, fi and
592 i, and fc and c, can be the same variables. */
594 #else /* NO_RECURSE not defined */
595 #define fi i
596 #define fc c
598 /* Many of the following variables are used only in small blocks of the code.
599 My normal style of coding would have declared them within each of those blocks.
600 However, in order to accommodate the version of this code that uses an external
601 "stack" implemented on the heap, it is easier to declare them all here, so the
602 declarations can be cut out in a block. The only declarations within blocks
603 below are for variables that do not have to be preserved over a recursive call
604 to RMATCH(). */
606 #ifdef SUPPORT_UTF
607 const pcre_uchar *charptr;
608 #endif
609 const pcre_uchar *callpat;
610 const pcre_uchar *data;
611 const pcre_uchar *next;
612 PCRE_PUCHAR pp;
613 const pcre_uchar *prev;
614 PCRE_PUCHAR saved_eptr;
616 recursion_info new_recursive;
618 BOOL cur_is_word;
619 BOOL condition;
620 BOOL prev_is_word;
622 #ifdef SUPPORT_UCP
623 int prop_type;
624 unsigned int prop_value;
625 int prop_fail_result;
626 int oclength;
627 pcre_uchar occhars[6];
628 #endif
630 int codelink;
631 int ctype;
632 int length;
633 int max;
634 int min;
635 unsigned int number;
636 int offset;
637 pcre_uchar op;
638 int save_capture_last;
639 int save_offset1, save_offset2, save_offset3;
640 int stacksave[REC_STACK_SAVE_MAX];
642 eptrblock newptrb;
644 /* There is a special fudge for calling match() in a way that causes it to
645 measure the size of its basic stack frame when the stack is being used for
646 recursion. The second argument (ecode) being NULL triggers this behaviour. It
647 cannot normally ever be NULL. The return is the negated value of the frame
648 size. */
650 if (ecode == NULL)
652 if (rdepth == 0)
653 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
654 else
656 int len = (char *)&rdepth - (char *)eptr;
657 return (len > 0)? -len : len;
660 #endif /* NO_RECURSE */
662 /* To save space on the stack and in the heap frame, I have doubled up on some
663 of the local variables that are used only in localised parts of the code, but
664 still need to be preserved over recursive calls of match(). These macros define
665 the alternative names that are used. */
667 #define allow_zero cur_is_word
668 #define cbegroup condition
669 #define code_offset codelink
670 #define condassert condition
671 #define matched_once prev_is_word
672 #define foc number
673 #define save_mark data
675 /* These statements are here to stop the compiler complaining about uninitialized
676 variables. */
678 #ifdef SUPPORT_UCP
679 prop_value = 0;
680 prop_fail_result = 0;
681 #endif
684 /* This label is used for tail recursion, which is used in a few cases even
685 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
686 used. Thanks to Ian Taylor for noticing this possibility and sending the
687 original patch. */
689 TAIL_RECURSE:
691 /* OK, now we can get on with the real code of the function. Recursive calls
692 are specified by the macro RMATCH and RRETURN is used to return. When
693 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
694 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
695 defined). However, RMATCH isn't like a function call because it's quite a
696 complicated macro. It has to be used in one particular way. This shouldn't,
697 however, impact performance when true recursion is being used. */
699 #ifdef SUPPORT_UTF
700 utf = md->utf; /* Local copy of the flag */
701 #else
702 utf = FALSE;
703 #endif
705 /* First check that we haven't called match() too many times, or that we
706 haven't exceeded the recursive call limit. */
708 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
709 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
711 /* At the start of a group with an unlimited repeat that may match an empty
712 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
713 done this way to save having to use another function argument, which would take
714 up space on the stack. See also MATCH_CONDASSERT below.
716 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
717 such remembered pointers, to be checked when we hit the closing ket, in order
718 to break infinite loops that match no characters. When match() is called in
719 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
720 NOT be used with tail recursion, because the memory block that is used is on
721 the stack, so a new one may be required for each match(). */
723 if (md->match_function_type == MATCH_CBEGROUP)
725 newptrb.epb_saved_eptr = eptr;
726 newptrb.epb_prev = eptrb;
727 eptrb = &newptrb;
728 md->match_function_type = 0;
731 /* Now start processing the opcodes. */
733 for (;;)
735 minimize = possessive = FALSE;
736 op = *ecode;
738 switch(op)
740 case OP_MARK:
741 md->nomatch_mark = ecode + 2;
742 md->mark = NULL; /* In case previously set by assertion */
743 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
744 eptrb, RM55);
745 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
746 md->mark == NULL) md->mark = ecode + 2;
748 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
749 argument, and we must check whether that argument matches this MARK's
750 argument. It is passed back in md->start_match_ptr (an overloading of that
751 variable). If it does match, we reset that variable to the current subject
752 position and return MATCH_SKIP. Otherwise, pass back the return code
753 unaltered. */
755 else if (rrc == MATCH_SKIP_ARG &&
756 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
758 md->start_match_ptr = eptr;
759 RRETURN(MATCH_SKIP);
761 RRETURN(rrc);
763 case OP_FAIL:
764 RRETURN(MATCH_NOMATCH);
766 /* COMMIT overrides PRUNE, SKIP, and THEN */
768 case OP_COMMIT:
769 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
770 eptrb, RM52);
771 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
772 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
773 rrc != MATCH_THEN)
774 RRETURN(rrc);
775 RRETURN(MATCH_COMMIT);
777 /* PRUNE overrides THEN */
779 case OP_PRUNE:
780 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
781 eptrb, RM51);
782 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
783 RRETURN(MATCH_PRUNE);
785 case OP_PRUNE_ARG:
786 md->nomatch_mark = ecode + 2;
787 md->mark = NULL; /* In case previously set by assertion */
788 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
789 eptrb, RM56);
790 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
791 md->mark == NULL) md->mark = ecode + 2;
792 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
793 RRETURN(MATCH_PRUNE);
795 /* SKIP overrides PRUNE and THEN */
797 case OP_SKIP:
798 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
799 eptrb, RM53);
800 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
801 RRETURN(rrc);
802 md->start_match_ptr = eptr; /* Pass back current position */
803 RRETURN(MATCH_SKIP);
805 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
806 nomatch_mark. There is a flag that disables this opcode when re-matching a
807 pattern that ended with a SKIP for which there was not a matching MARK. */
809 case OP_SKIP_ARG:
810 if (md->ignore_skip_arg)
812 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
813 break;
815 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
816 eptrb, RM57);
817 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
818 RRETURN(rrc);
820 /* Pass back the current skip name by overloading md->start_match_ptr and
821 returning the special MATCH_SKIP_ARG return code. This will either be
822 caught by a matching MARK, or get to the top, where it causes a rematch
823 with the md->ignore_skip_arg flag set. */
825 md->start_match_ptr = ecode + 2;
826 RRETURN(MATCH_SKIP_ARG);
828 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
829 the branch in which it occurs can be determined. Overload the start of
830 match pointer to do this. */
832 case OP_THEN:
833 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
834 eptrb, RM54);
835 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
836 md->start_match_ptr = ecode;
837 RRETURN(MATCH_THEN);
839 case OP_THEN_ARG:
840 md->nomatch_mark = ecode + 2;
841 md->mark = NULL; /* In case previously set by assertion */
842 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
843 md, eptrb, RM58);
844 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
845 md->mark == NULL) md->mark = ecode + 2;
846 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
847 md->start_match_ptr = ecode;
848 RRETURN(MATCH_THEN);
850 /* Handle an atomic group that does not contain any capturing parentheses.
851 This can be handled like an assertion. Prior to 8.13, all atomic groups
852 were handled this way. In 8.13, the code was changed as below for ONCE, so
853 that backups pass through the group and thereby reset captured values.
854 However, this uses a lot more stack, so in 8.20, atomic groups that do not
855 contain any captures generate OP_ONCE_NC, which can be handled in the old,
856 less stack intensive way.
858 Check the alternative branches in turn - the matching won't pass the KET
859 for this kind of subpattern. If any one branch matches, we carry on as at
860 the end of a normal bracket, leaving the subject pointer, but resetting
861 the start-of-match value in case it was changed by \K. */
863 case OP_ONCE_NC:
864 prev = ecode;
865 saved_eptr = eptr;
866 save_mark = md->mark;
869 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
870 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
872 mstart = md->start_match_ptr;
873 break;
875 if (rrc == MATCH_THEN)
877 next = ecode + GET(ecode,1);
878 if (md->start_match_ptr < next &&
879 (*ecode == OP_ALT || *next == OP_ALT))
880 rrc = MATCH_NOMATCH;
883 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
884 ecode += GET(ecode,1);
885 md->mark = save_mark;
887 while (*ecode == OP_ALT);
889 /* If hit the end of the group (which could be repeated), fail */
891 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
893 /* Continue as from after the group, updating the offsets high water
894 mark, since extracts may have been taken. */
896 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
898 offset_top = md->end_offset_top;
899 eptr = md->end_match_ptr;
901 /* For a non-repeating ket, just continue at this level. This also
902 happens for a repeating ket if no characters were matched in the group.
903 This is the forcible breaking of infinite loops as implemented in Perl
904 5.005. */
906 if (*ecode == OP_KET || eptr == saved_eptr)
908 ecode += 1+LINK_SIZE;
909 break;
912 /* The repeating kets try the rest of the pattern or restart from the
913 preceding bracket, in the appropriate order. The second "call" of match()
914 uses tail recursion, to avoid using another stack frame. */
916 if (*ecode == OP_KETRMIN)
918 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
919 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
920 ecode = prev;
921 goto TAIL_RECURSE;
923 else /* OP_KETRMAX */
925 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
926 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
927 ecode += 1 + LINK_SIZE;
928 goto TAIL_RECURSE;
930 /* Control never gets here */
932 /* Handle a capturing bracket, other than those that are possessive with an
933 unlimited repeat. If there is space in the offset vector, save the current
934 subject position in the working slot at the top of the vector. We mustn't
935 change the current values of the data slot, because they may be set from a
936 previous iteration of this group, and be referred to by a reference inside
937 the group. A failure to match might occur after the group has succeeded,
938 if something later on doesn't match. For this reason, we need to restore
939 the working value and also the values of the final offsets, in case they
940 were set by a previous iteration of the same bracket.
942 If there isn't enough space in the offset vector, treat this as if it were
943 a non-capturing bracket. Don't worry about setting the flag for the error
944 case here; that is handled in the code for KET. */
946 case OP_CBRA:
947 case OP_SCBRA:
948 number = GET2(ecode, 1+LINK_SIZE);
949 offset = number << 1;
951 #ifdef PCRE_DEBUG
952 printf("start bracket %d\n", number);
953 printf("subject=");
954 pchars(eptr, 16, TRUE, md);
955 printf("\n");
956 #endif
958 if (offset < md->offset_max)
960 save_offset1 = md->offset_vector[offset];
961 save_offset2 = md->offset_vector[offset+1];
962 save_offset3 = md->offset_vector[md->offset_end - number];
963 save_capture_last = md->capture_last;
964 save_mark = md->mark;
966 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
967 md->offset_vector[md->offset_end - number] =
968 (int)(eptr - md->start_subject);
970 for (;;)
972 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
973 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
974 eptrb, RM1);
975 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
977 /* If we backed up to a THEN, check whether it is within the current
978 branch by comparing the address of the THEN that is passed back with
979 the end of the branch. If it is within the current branch, and the
980 branch is one of two or more alternatives (it either starts or ends
981 with OP_ALT), we have reached the limit of THEN's action, so convert
982 the return code to NOMATCH, which will cause normal backtracking to
983 happen from now on. Otherwise, THEN is passed back to an outer
984 alternative. This implements Perl's treatment of parenthesized groups,
985 where a group not containing | does not affect the current alternative,
986 that is, (X) is NOT the same as (X|(*F)). */
988 if (rrc == MATCH_THEN)
990 next = ecode + GET(ecode,1);
991 if (md->start_match_ptr < next &&
992 (*ecode == OP_ALT || *next == OP_ALT))
993 rrc = MATCH_NOMATCH;
996 /* Anything other than NOMATCH is passed back. */
998 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
999 md->capture_last = save_capture_last;
1000 ecode += GET(ecode, 1);
1001 md->mark = save_mark;
1002 if (*ecode != OP_ALT) break;
1005 DPRINTF(("bracket %d failed\n", number));
1006 md->offset_vector[offset] = save_offset1;
1007 md->offset_vector[offset+1] = save_offset2;
1008 md->offset_vector[md->offset_end - number] = save_offset3;
1010 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1012 RRETURN(rrc);
1015 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1016 as a non-capturing bracket. */
1018 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1019 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1021 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1023 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1024 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1026 /* Non-capturing or atomic group, except for possessive with unlimited
1027 repeat and ONCE group with no captures. Loop for all the alternatives.
1029 When we get to the final alternative within the brackets, we used to return
1030 the result of a recursive call to match() whatever happened so it was
1031 possible to reduce stack usage by turning this into a tail recursion,
1032 except in the case of a possibly empty group. However, now that there is
1033 the possibility of (*THEN) occurring in the final alternative, this
1034 optimization is no longer always possible.
1036 We can optimize if we know there are no (*THEN)s in the pattern; at present
1037 this is the best that can be done.
1039 MATCH_ONCE is returned when the end of an atomic group is successfully
1040 reached, but subsequent matching fails. It passes back up the tree (causing
1041 captured values to be reset) until the original atomic group level is
1042 reached. This is tested by comparing md->once_target with the start of the
1043 group. At this point, the return is converted into MATCH_NOMATCH so that
1044 previous backup points can be taken. */
1046 case OP_ONCE:
1047 case OP_BRA:
1048 case OP_SBRA:
1049 DPRINTF(("start non-capturing bracket\n"));
1051 for (;;)
1053 if (op >= OP_SBRA || op == OP_ONCE)
1054 md->match_function_type = MATCH_CBEGROUP;
1056 /* If this is not a possibly empty group, and there are no (*THEN)s in
1057 the pattern, and this is the final alternative, optimize as described
1058 above. */
1060 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1062 ecode += PRIV(OP_lengths)[*ecode];
1063 goto TAIL_RECURSE;
1066 /* In all other cases, we have to make another call to match(). */
1068 save_mark = md->mark;
1069 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1070 RM2);
1072 /* See comment in the code for capturing groups above about handling
1073 THEN. */
1075 if (rrc == MATCH_THEN)
1077 next = ecode + GET(ecode,1);
1078 if (md->start_match_ptr < next &&
1079 (*ecode == OP_ALT || *next == OP_ALT))
1080 rrc = MATCH_NOMATCH;
1083 if (rrc != MATCH_NOMATCH)
1085 if (rrc == MATCH_ONCE)
1087 const pcre_uchar *scode = ecode;
1088 if (*scode != OP_ONCE) /* If not at start, find it */
1090 while (*scode == OP_ALT) scode += GET(scode, 1);
1091 scode -= GET(scode, 1);
1093 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1095 RRETURN(rrc);
1097 ecode += GET(ecode, 1);
1098 md->mark = save_mark;
1099 if (*ecode != OP_ALT) break;
1102 RRETURN(MATCH_NOMATCH);
1104 /* Handle possessive capturing brackets with an unlimited repeat. We come
1105 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1106 handled similarly to the normal case above. However, the matching is
1107 different. The end of these brackets will always be OP_KETRPOS, which
1108 returns MATCH_KETRPOS without going further in the pattern. By this means
1109 we can handle the group by iteration rather than recursion, thereby
1110 reducing the amount of stack needed. */
1112 case OP_CBRAPOS:
1113 case OP_SCBRAPOS:
1114 allow_zero = FALSE;
1116 POSSESSIVE_CAPTURE:
1117 number = GET2(ecode, 1+LINK_SIZE);
1118 offset = number << 1;
1120 #ifdef PCRE_DEBUG
1121 printf("start possessive bracket %d\n", number);
1122 printf("subject=");
1123 pchars(eptr, 16, TRUE, md);
1124 printf("\n");
1125 #endif
1127 if (offset < md->offset_max)
1129 matched_once = FALSE;
1130 code_offset = (int)(ecode - md->start_code);
1132 save_offset1 = md->offset_vector[offset];
1133 save_offset2 = md->offset_vector[offset+1];
1134 save_offset3 = md->offset_vector[md->offset_end - number];
1135 save_capture_last = md->capture_last;
1137 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1139 /* Each time round the loop, save the current subject position for use
1140 when the group matches. For MATCH_MATCH, the group has matched, so we
1141 restart it with a new subject starting position, remembering that we had
1142 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1143 usual. If we haven't matched any alternatives in any iteration, check to
1144 see if a previous iteration matched. If so, the group has matched;
1145 continue from afterwards. Otherwise it has failed; restore the previous
1146 capture values before returning NOMATCH. */
1148 for (;;)
1150 md->offset_vector[md->offset_end - number] =
1151 (int)(eptr - md->start_subject);
1152 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1153 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1154 eptrb, RM63);
1155 if (rrc == MATCH_KETRPOS)
1157 offset_top = md->end_offset_top;
1158 eptr = md->end_match_ptr;
1159 ecode = md->start_code + code_offset;
1160 save_capture_last = md->capture_last;
1161 matched_once = TRUE;
1162 continue;
1165 /* See comment in the code for capturing groups above about handling
1166 THEN. */
1168 if (rrc == MATCH_THEN)
1170 next = ecode + GET(ecode,1);
1171 if (md->start_match_ptr < next &&
1172 (*ecode == OP_ALT || *next == OP_ALT))
1173 rrc = MATCH_NOMATCH;
1176 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1177 md->capture_last = save_capture_last;
1178 ecode += GET(ecode, 1);
1179 if (*ecode != OP_ALT) break;
1182 if (!matched_once)
1184 md->offset_vector[offset] = save_offset1;
1185 md->offset_vector[offset+1] = save_offset2;
1186 md->offset_vector[md->offset_end - number] = save_offset3;
1189 if (allow_zero || matched_once)
1191 ecode += 1 + LINK_SIZE;
1192 break;
1195 RRETURN(MATCH_NOMATCH);
1198 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1199 as a non-capturing bracket. */
1201 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1202 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1204 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1206 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1207 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1209 /* Non-capturing possessive bracket with unlimited repeat. We come here
1210 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1211 without the capturing complication. It is written out separately for speed
1212 and cleanliness. */
1214 case OP_BRAPOS:
1215 case OP_SBRAPOS:
1216 allow_zero = FALSE;
1218 POSSESSIVE_NON_CAPTURE:
1219 matched_once = FALSE;
1220 code_offset = (int)(ecode - md->start_code);
1222 for (;;)
1224 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1225 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1226 eptrb, RM48);
1227 if (rrc == MATCH_KETRPOS)
1229 offset_top = md->end_offset_top;
1230 eptr = md->end_match_ptr;
1231 ecode = md->start_code + code_offset;
1232 matched_once = TRUE;
1233 continue;
1236 /* See comment in the code for capturing groups above about handling
1237 THEN. */
1239 if (rrc == MATCH_THEN)
1241 next = ecode + GET(ecode,1);
1242 if (md->start_match_ptr < next &&
1243 (*ecode == OP_ALT || *next == OP_ALT))
1244 rrc = MATCH_NOMATCH;
1247 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1248 ecode += GET(ecode, 1);
1249 if (*ecode != OP_ALT) break;
1252 if (matched_once || allow_zero)
1254 ecode += 1 + LINK_SIZE;
1255 break;
1257 RRETURN(MATCH_NOMATCH);
1259 /* Control never reaches here. */
1261 /* Conditional group: compilation checked that there are no more than
1262 two branches. If the condition is false, skipping the first branch takes us
1263 past the end if there is only one branch, but that's OK because that is
1264 exactly what going to the ket would do. */
1266 case OP_COND:
1267 case OP_SCOND:
1268 codelink = GET(ecode, 1);
1270 /* Because of the way auto-callout works during compile, a callout item is
1271 inserted between OP_COND and an assertion condition. */
1273 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1275 if (PUBL(callout) != NULL)
1277 PUBL(callout_block) cb;
1278 cb.version = 2; /* Version 2 of the callout block */
1279 cb.callout_number = ecode[LINK_SIZE+2];
1280 cb.offset_vector = md->offset_vector;
1281 #if defined COMPILE_PCRE8
1282 cb.subject = (PCRE_SPTR)md->start_subject;
1283 #elif defined COMPILE_PCRE16
1284 cb.subject = (PCRE_SPTR16)md->start_subject;
1285 #elif defined COMPILE_PCRE32
1286 cb.subject = (PCRE_SPTR32)md->start_subject;
1287 #endif
1288 cb.subject_length = (int)(md->end_subject - md->start_subject);
1289 cb.start_match = (int)(mstart - md->start_subject);
1290 cb.current_position = (int)(eptr - md->start_subject);
1291 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1292 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1293 cb.capture_top = offset_top/2;
1294 cb.capture_last = md->capture_last;
1295 cb.callout_data = md->callout_data;
1296 cb.mark = md->nomatch_mark;
1297 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1298 if (rrc < 0) RRETURN(rrc);
1300 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1303 condcode = ecode[LINK_SIZE+1];
1305 /* Now see what the actual condition is */
1307 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1309 if (md->recursive == NULL) /* Not recursing => FALSE */
1311 condition = FALSE;
1312 ecode += GET(ecode, 1);
1314 else
1316 unsigned int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1317 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1319 /* If the test is for recursion into a specific subpattern, and it is
1320 false, but the test was set up by name, scan the table to see if the
1321 name refers to any other numbers, and test them. The condition is true
1322 if any one is set. */
1324 if (!condition && condcode == OP_NRREF)
1326 pcre_uchar *slotA = md->name_table;
1327 for (i = 0; i < md->name_count; i++)
1329 if (GET2(slotA, 0) == recno) break;
1330 slotA += md->name_entry_size;
1333 /* Found a name for the number - there can be only one; duplicate
1334 names for different numbers are allowed, but not vice versa. First
1335 scan down for duplicates. */
1337 if (i < md->name_count)
1339 pcre_uchar *slotB = slotA;
1340 while (slotB > md->name_table)
1342 slotB -= md->name_entry_size;
1343 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1345 condition = GET2(slotB, 0) == md->recursive->group_num;
1346 if (condition) break;
1348 else break;
1351 /* Scan up for duplicates */
1353 if (!condition)
1355 slotB = slotA;
1356 for (i++; i < md->name_count; i++)
1358 slotB += md->name_entry_size;
1359 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1361 condition = GET2(slotB, 0) == md->recursive->group_num;
1362 if (condition) break;
1364 else break;
1370 /* Choose the branch according to the condition */
1372 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1376 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1378 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1379 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1381 /* If the numbered capture is unset, but the reference was by name,
1382 scan the table to see if the name refers to any other numbers, and test
1383 them. The condition is true if any one is set. This is tediously similar
1384 to the code above, but not close enough to try to amalgamate. */
1386 if (!condition && condcode == OP_NCREF)
1388 unsigned int refno = offset >> 1;
1389 pcre_uchar *slotA = md->name_table;
1391 for (i = 0; i < md->name_count; i++)
1393 if (GET2(slotA, 0) == refno) break;
1394 slotA += md->name_entry_size;
1397 /* Found a name for the number - there can be only one; duplicate names
1398 for different numbers are allowed, but not vice versa. First scan down
1399 for duplicates. */
1401 if (i < md->name_count)
1403 pcre_uchar *slotB = slotA;
1404 while (slotB > md->name_table)
1406 slotB -= md->name_entry_size;
1407 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1409 offset = GET2(slotB, 0) << 1;
1410 condition = offset < offset_top &&
1411 md->offset_vector[offset] >= 0;
1412 if (condition) break;
1414 else break;
1417 /* Scan up for duplicates */
1419 if (!condition)
1421 slotB = slotA;
1422 for (i++; i < md->name_count; i++)
1424 slotB += md->name_entry_size;
1425 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1427 offset = GET2(slotB, 0) << 1;
1428 condition = offset < offset_top &&
1429 md->offset_vector[offset] >= 0;
1430 if (condition) break;
1432 else break;
1438 /* Choose the branch according to the condition */
1440 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1443 else if (condcode == OP_DEF) /* DEFINE - always false */
1445 condition = FALSE;
1446 ecode += GET(ecode, 1);
1449 /* The condition is an assertion. Call match() to evaluate it - setting
1450 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1451 an assertion. */
1453 else
1455 md->match_function_type = MATCH_CONDASSERT;
1456 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1457 if (rrc == MATCH_MATCH)
1459 if (md->end_offset_top > offset_top)
1460 offset_top = md->end_offset_top; /* Captures may have happened */
1461 condition = TRUE;
1462 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1463 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1466 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1467 assertion; it is therefore treated as NOMATCH. */
1469 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1471 RRETURN(rrc); /* Need braces because of following else */
1473 else
1475 condition = FALSE;
1476 ecode += codelink;
1480 /* We are now at the branch that is to be obeyed. As there is only one, can
1481 use tail recursion to avoid using another stack frame, except when there is
1482 unlimited repeat of a possibly empty group. In the latter case, a recursive
1483 call to match() is always required, unless the second alternative doesn't
1484 exist, in which case we can just plough on. Note that, for compatibility
1485 with Perl, the | in a conditional group is NOT treated as creating two
1486 alternatives. If a THEN is encountered in the branch, it propagates out to
1487 the enclosing alternative (unless nested in a deeper set of alternatives,
1488 of course). */
1490 if (condition || *ecode == OP_ALT)
1492 if (op != OP_SCOND)
1494 ecode += 1 + LINK_SIZE;
1495 goto TAIL_RECURSE;
1498 md->match_function_type = MATCH_CBEGROUP;
1499 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1500 RRETURN(rrc);
1503 /* Condition false & no alternative; continue after the group. */
1505 else
1507 ecode += 1 + LINK_SIZE;
1509 break;
1512 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1513 to close any currently open capturing brackets. */
1515 case OP_CLOSE:
1516 number = GET2(ecode, 1);
1517 offset = number << 1;
1519 #ifdef PCRE_DEBUG
1520 printf("end bracket %d at *ACCEPT", number);
1521 printf("\n");
1522 #endif
1524 md->capture_last = number;
1525 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1527 md->offset_vector[offset] =
1528 md->offset_vector[md->offset_end - number];
1529 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1530 if (offset_top <= offset) offset_top = offset + 2;
1532 ecode += 1 + IMM2_SIZE;
1533 break;
1536 /* End of the pattern, either real or forced. */
1538 case OP_END:
1539 case OP_ACCEPT:
1540 case OP_ASSERT_ACCEPT:
1542 /* If we have matched an empty string, fail if not in an assertion and not
1543 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1544 is set and we have matched at the start of the subject. In both cases,
1545 backtracking will then try other alternatives, if any. */
1547 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1548 md->recursive == NULL &&
1549 (md->notempty ||
1550 (md->notempty_atstart &&
1551 mstart == md->start_subject + md->start_offset)))
1552 RRETURN(MATCH_NOMATCH);
1554 /* Otherwise, we have a match. */
1556 md->end_match_ptr = eptr; /* Record where we ended */
1557 md->end_offset_top = offset_top; /* and how many extracts were taken */
1558 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1560 /* For some reason, the macros don't work properly if an expression is
1561 given as the argument to RRETURN when the heap is in use. */
1563 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1564 RRETURN(rrc);
1566 /* Assertion brackets. Check the alternative branches in turn - the
1567 matching won't pass the KET for an assertion. If any one branch matches,
1568 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1569 start of each branch to move the current point backwards, so the code at
1570 this level is identical to the lookahead case. When the assertion is part
1571 of a condition, we want to return immediately afterwards. The caller of
1572 this incarnation of the match() function will have set MATCH_CONDASSERT in
1573 md->match_function_type, and one of these opcodes will be the first opcode
1574 that is processed. We use a local variable that is preserved over calls to
1575 match() to remember this case. */
1577 case OP_ASSERT:
1578 case OP_ASSERTBACK:
1579 save_mark = md->mark;
1580 if (md->match_function_type == MATCH_CONDASSERT)
1582 condassert = TRUE;
1583 md->match_function_type = 0;
1585 else condassert = FALSE;
1589 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1590 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1592 mstart = md->start_match_ptr; /* In case \K reset it */
1593 break;
1595 md->mark = save_mark;
1597 /* A COMMIT failure must fail the entire assertion, without trying any
1598 subsequent branches. */
1600 if (rrc == MATCH_COMMIT) RRETURN(MATCH_NOMATCH);
1602 /* PCRE does not allow THEN to escape beyond an assertion; it
1603 is treated as NOMATCH. */
1605 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1606 ecode += GET(ecode, 1);
1608 while (*ecode == OP_ALT);
1610 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1612 /* If checking an assertion for a condition, return MATCH_MATCH. */
1614 if (condassert) RRETURN(MATCH_MATCH);
1616 /* Continue from after the assertion, updating the offsets high water
1617 mark, since extracts may have been taken during the assertion. */
1619 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1620 ecode += 1 + LINK_SIZE;
1621 offset_top = md->end_offset_top;
1622 continue;
1624 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1625 PRUNE, or COMMIT means we must assume failure without checking subsequent
1626 branches. */
1628 case OP_ASSERT_NOT:
1629 case OP_ASSERTBACK_NOT:
1630 save_mark = md->mark;
1631 if (md->match_function_type == MATCH_CONDASSERT)
1633 condassert = TRUE;
1634 md->match_function_type = 0;
1636 else condassert = FALSE;
1640 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1641 md->mark = save_mark;
1642 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1643 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1645 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1646 break;
1649 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1650 as NOMATCH. */
1652 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1653 ecode += GET(ecode,1);
1655 while (*ecode == OP_ALT);
1657 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1659 ecode += 1 + LINK_SIZE;
1660 continue;
1662 /* Move the subject pointer back. This occurs only at the start of
1663 each branch of a lookbehind assertion. If we are too close to the start to
1664 move back, this match function fails. When working with UTF-8 we move
1665 back a number of characters, not bytes. */
1667 case OP_REVERSE:
1668 #ifdef SUPPORT_UTF
1669 if (utf)
1671 i = GET(ecode, 1);
1672 while (i-- > 0)
1674 eptr--;
1675 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1676 BACKCHAR(eptr);
1679 else
1680 #endif
1682 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1685 eptr -= GET(ecode, 1);
1686 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1689 /* Save the earliest consulted character, then skip to next op code */
1691 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1692 ecode += 1 + LINK_SIZE;
1693 break;
1695 /* The callout item calls an external function, if one is provided, passing
1696 details of the match so far. This is mainly for debugging, though the
1697 function is able to force a failure. */
1699 case OP_CALLOUT:
1700 if (PUBL(callout) != NULL)
1702 PUBL(callout_block) cb;
1703 cb.version = 2; /* Version 2 of the callout block */
1704 cb.callout_number = ecode[1];
1705 cb.offset_vector = md->offset_vector;
1706 #if defined COMPILE_PCRE8
1707 cb.subject = (PCRE_SPTR)md->start_subject;
1708 #elif defined COMPILE_PCRE16
1709 cb.subject = (PCRE_SPTR16)md->start_subject;
1710 #elif defined COMPILE_PCRE32
1711 cb.subject = (PCRE_SPTR32)md->start_subject;
1712 #endif
1713 cb.subject_length = (int)(md->end_subject - md->start_subject);
1714 cb.start_match = (int)(mstart - md->start_subject);
1715 cb.current_position = (int)(eptr - md->start_subject);
1716 cb.pattern_position = GET(ecode, 2);
1717 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1718 cb.capture_top = offset_top/2;
1719 cb.capture_last = md->capture_last;
1720 cb.callout_data = md->callout_data;
1721 cb.mark = md->nomatch_mark;
1722 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1723 if (rrc < 0) RRETURN(rrc);
1725 ecode += 2 + 2*LINK_SIZE;
1726 break;
1728 /* Recursion either matches the current regex, or some subexpression. The
1729 offset data is the offset to the starting bracket from the start of the
1730 whole pattern. (This is so that it works from duplicated subpatterns.)
1732 The state of the capturing groups is preserved over recursion, and
1733 re-instated afterwards. We don't know how many are started and not yet
1734 finished (offset_top records the completed total) so we just have to save
1735 all the potential data. There may be up to 65535 such values, which is too
1736 large to put on the stack, but using malloc for small numbers seems
1737 expensive. As a compromise, the stack is used when there are no more than
1738 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1740 There are also other values that have to be saved. We use a chained
1741 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1742 for the original version of this logic. It has, however, been hacked around
1743 a lot, so he is not to blame for the current way it works. */
1745 case OP_RECURSE:
1747 recursion_info *ri;
1748 unsigned int recno;
1750 callpat = md->start_code + GET(ecode, 1);
1751 recno = (callpat == md->start_code)? 0 :
1752 GET2(callpat, 1 + LINK_SIZE);
1754 /* Check for repeating a recursion without advancing the subject pointer.
1755 This should catch convoluted mutual recursions. (Some simple cases are
1756 caught at compile time.) */
1758 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1759 if (recno == ri->group_num && eptr == ri->subject_position)
1760 RRETURN(PCRE_ERROR_RECURSELOOP);
1762 /* Add to "recursing stack" */
1764 new_recursive.group_num = recno;
1765 new_recursive.subject_position = eptr;
1766 new_recursive.prevrec = md->recursive;
1767 md->recursive = &new_recursive;
1769 /* Where to continue from afterwards */
1771 ecode += 1 + LINK_SIZE;
1773 /* Now save the offset data */
1775 new_recursive.saved_max = md->offset_end;
1776 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1777 new_recursive.offset_save = stacksave;
1778 else
1780 new_recursive.offset_save =
1781 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1782 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1784 memcpy(new_recursive.offset_save, md->offset_vector,
1785 new_recursive.saved_max * sizeof(int));
1787 /* OK, now we can do the recursion. After processing each alternative,
1788 restore the offset data. If there were nested recursions, md->recursive
1789 might be changed, so reset it before looping. */
1791 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1792 cbegroup = (*callpat >= OP_SBRA);
1795 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1796 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1797 md, eptrb, RM6);
1798 memcpy(md->offset_vector, new_recursive.offset_save,
1799 new_recursive.saved_max * sizeof(int));
1800 md->recursive = new_recursive.prevrec;
1801 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1803 DPRINTF(("Recursion matched\n"));
1804 if (new_recursive.offset_save != stacksave)
1805 (PUBL(free))(new_recursive.offset_save);
1807 /* Set where we got to in the subject, and reset the start in case
1808 it was changed by \K. This *is* propagated back out of a recursion,
1809 for Perl compatibility. */
1811 eptr = md->end_match_ptr;
1812 mstart = md->start_match_ptr;
1813 goto RECURSION_MATCHED; /* Exit loop; end processing */
1816 /* PCRE does not allow THEN or COMMIT to escape beyond a recursion; it
1817 is treated as NOMATCH. */
1819 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN &&
1820 rrc != MATCH_COMMIT)
1822 DPRINTF(("Recursion gave error %d\n", rrc));
1823 if (new_recursive.offset_save != stacksave)
1824 (PUBL(free))(new_recursive.offset_save);
1825 RRETURN(rrc);
1828 md->recursive = &new_recursive;
1829 callpat += GET(callpat, 1);
1831 while (*callpat == OP_ALT);
1833 DPRINTF(("Recursion didn't match\n"));
1834 md->recursive = new_recursive.prevrec;
1835 if (new_recursive.offset_save != stacksave)
1836 (PUBL(free))(new_recursive.offset_save);
1837 RRETURN(MATCH_NOMATCH);
1840 RECURSION_MATCHED:
1841 break;
1843 /* An alternation is the end of a branch; scan along to find the end of the
1844 bracketed group and go to there. */
1846 case OP_ALT:
1847 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1848 break;
1850 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1851 indicating that it may occur zero times. It may repeat infinitely, or not
1852 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1853 with fixed upper repeat limits are compiled as a number of copies, with the
1854 optional ones preceded by BRAZERO or BRAMINZERO. */
1856 case OP_BRAZERO:
1857 next = ecode + 1;
1858 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1859 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1860 do next += GET(next, 1); while (*next == OP_ALT);
1861 ecode = next + 1 + LINK_SIZE;
1862 break;
1864 case OP_BRAMINZERO:
1865 next = ecode + 1;
1866 do next += GET(next, 1); while (*next == OP_ALT);
1867 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1868 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1869 ecode++;
1870 break;
1872 case OP_SKIPZERO:
1873 next = ecode+1;
1874 do next += GET(next,1); while (*next == OP_ALT);
1875 ecode = next + 1 + LINK_SIZE;
1876 break;
1878 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1879 here; just jump to the group, with allow_zero set TRUE. */
1881 case OP_BRAPOSZERO:
1882 op = *(++ecode);
1883 allow_zero = TRUE;
1884 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1885 goto POSSESSIVE_NON_CAPTURE;
1887 /* End of a group, repeated or non-repeating. */
1889 case OP_KET:
1890 case OP_KETRMIN:
1891 case OP_KETRMAX:
1892 case OP_KETRPOS:
1893 prev = ecode - GET(ecode, 1);
1895 /* If this was a group that remembered the subject start, in order to break
1896 infinite repeats of empty string matches, retrieve the subject start from
1897 the chain. Otherwise, set it NULL. */
1899 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1901 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1902 eptrb = eptrb->epb_prev; /* Backup to previous group */
1904 else saved_eptr = NULL;
1906 /* If we are at the end of an assertion group or a non-capturing atomic
1907 group, stop matching and return MATCH_MATCH, but record the current high
1908 water mark for use by positive assertions. We also need to record the match
1909 start in case it was changed by \K. */
1911 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1912 *prev == OP_ONCE_NC)
1914 md->end_match_ptr = eptr; /* For ONCE_NC */
1915 md->end_offset_top = offset_top;
1916 md->start_match_ptr = mstart;
1917 RRETURN(MATCH_MATCH); /* Sets md->mark */
1920 /* For capturing groups we have to check the group number back at the start
1921 and if necessary complete handling an extraction by setting the offsets and
1922 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1923 into group 0, so it won't be picked up here. Instead, we catch it when the
1924 OP_END is reached. Other recursion is handled here. We just have to record
1925 the current subject position and start match pointer and give a MATCH
1926 return. */
1928 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1929 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1931 number = GET2(prev, 1+LINK_SIZE);
1932 offset = number << 1;
1934 #ifdef PCRE_DEBUG
1935 printf("end bracket %d", number);
1936 printf("\n");
1937 #endif
1939 /* Handle a recursively called group. */
1941 if (md->recursive != NULL && md->recursive->group_num == number)
1943 md->end_match_ptr = eptr;
1944 md->start_match_ptr = mstart;
1945 RRETURN(MATCH_MATCH);
1948 /* Deal with capturing */
1950 md->capture_last = number;
1951 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1953 /* If offset is greater than offset_top, it means that we are
1954 "skipping" a capturing group, and that group's offsets must be marked
1955 unset. In earlier versions of PCRE, all the offsets were unset at the
1956 start of matching, but this doesn't work because atomic groups and
1957 assertions can cause a value to be set that should later be unset.
1958 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1959 part of the atomic group, but this is not on the final matching path,
1960 so must be unset when 2 is set. (If there is no group 2, there is no
1961 problem, because offset_top will then be 2, indicating no capture.) */
1963 if (offset > offset_top)
1965 register int *iptr = md->offset_vector + offset_top;
1966 register int *iend = md->offset_vector + offset;
1967 while (iptr < iend) *iptr++ = -1;
1970 /* Now make the extraction */
1972 md->offset_vector[offset] =
1973 md->offset_vector[md->offset_end - number];
1974 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1975 if (offset_top <= offset) offset_top = offset + 2;
1979 /* For an ordinary non-repeating ket, just continue at this level. This
1980 also happens for a repeating ket if no characters were matched in the
1981 group. This is the forcible breaking of infinite loops as implemented in
1982 Perl 5.005. For a non-repeating atomic group that includes captures,
1983 establish a backup point by processing the rest of the pattern at a lower
1984 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1985 original OP_ONCE level, thereby bypassing intermediate backup points, but
1986 resetting any captures that happened along the way. */
1988 if (*ecode == OP_KET || eptr == saved_eptr)
1990 if (*prev == OP_ONCE)
1992 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1993 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1994 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1995 RRETURN(MATCH_ONCE);
1997 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1998 break;
2001 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2002 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2003 at a time from the outer level, thus saving stack. */
2005 if (*ecode == OP_KETRPOS)
2007 md->end_match_ptr = eptr;
2008 md->end_offset_top = offset_top;
2009 RRETURN(MATCH_KETRPOS);
2012 /* The normal repeating kets try the rest of the pattern or restart from
2013 the preceding bracket, in the appropriate order. In the second case, we can
2014 use tail recursion to avoid using another stack frame, unless we have an
2015 atomic group or an unlimited repeat of a group that can match an empty
2016 string. */
2018 if (*ecode == OP_KETRMIN)
2020 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2021 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2022 if (*prev == OP_ONCE)
2024 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2025 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2026 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2027 RRETURN(MATCH_ONCE);
2029 if (*prev >= OP_SBRA) /* Could match an empty string */
2031 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2032 RRETURN(rrc);
2034 ecode = prev;
2035 goto TAIL_RECURSE;
2037 else /* OP_KETRMAX */
2039 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2040 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2041 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2042 if (*prev == OP_ONCE)
2044 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2045 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2046 md->once_target = prev;
2047 RRETURN(MATCH_ONCE);
2049 ecode += 1 + LINK_SIZE;
2050 goto TAIL_RECURSE;
2052 /* Control never gets here */
2054 /* Not multiline mode: start of subject assertion, unless notbol. */
2056 case OP_CIRC:
2057 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2059 /* Start of subject assertion */
2061 case OP_SOD:
2062 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2063 ecode++;
2064 break;
2066 /* Multiline mode: start of subject unless notbol, or after any newline. */
2068 case OP_CIRCM:
2069 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2070 if (eptr != md->start_subject &&
2071 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2072 RRETURN(MATCH_NOMATCH);
2073 ecode++;
2074 break;
2076 /* Start of match assertion */
2078 case OP_SOM:
2079 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2080 ecode++;
2081 break;
2083 /* Reset the start of match point */
2085 case OP_SET_SOM:
2086 mstart = eptr;
2087 ecode++;
2088 break;
2090 /* Multiline mode: assert before any newline, or before end of subject
2091 unless noteol is set. */
2093 case OP_DOLLM:
2094 if (eptr < md->end_subject)
2096 if (!IS_NEWLINE(eptr))
2098 if (md->partial != 0 &&
2099 eptr + 1 >= md->end_subject &&
2100 NLBLOCK->nltype == NLTYPE_FIXED &&
2101 NLBLOCK->nllen == 2 &&
2102 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2104 md->hitend = TRUE;
2105 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2107 RRETURN(MATCH_NOMATCH);
2110 else
2112 if (md->noteol) RRETURN(MATCH_NOMATCH);
2113 SCHECK_PARTIAL();
2115 ecode++;
2116 break;
2118 /* Not multiline mode: assert before a terminating newline or before end of
2119 subject unless noteol is set. */
2121 case OP_DOLL:
2122 if (md->noteol) RRETURN(MATCH_NOMATCH);
2123 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2125 /* ... else fall through for endonly */
2127 /* End of subject assertion (\z) */
2129 case OP_EOD:
2130 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2131 SCHECK_PARTIAL();
2132 ecode++;
2133 break;
2135 /* End of subject or ending \n assertion (\Z) */
2137 case OP_EODN:
2138 ASSERT_NL_OR_EOS:
2139 if (eptr < md->end_subject &&
2140 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2142 if (md->partial != 0 &&
2143 eptr + 1 >= md->end_subject &&
2144 NLBLOCK->nltype == NLTYPE_FIXED &&
2145 NLBLOCK->nllen == 2 &&
2146 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2148 md->hitend = TRUE;
2149 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2151 RRETURN(MATCH_NOMATCH);
2154 /* Either at end of string or \n before end. */
2156 SCHECK_PARTIAL();
2157 ecode++;
2158 break;
2160 /* Word boundary assertions */
2162 case OP_NOT_WORD_BOUNDARY:
2163 case OP_WORD_BOUNDARY:
2166 /* Find out if the previous and current characters are "word" characters.
2167 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2168 be "non-word" characters. Remember the earliest consulted character for
2169 partial matching. */
2171 #ifdef SUPPORT_UTF
2172 if (utf)
2174 /* Get status of previous character */
2176 if (eptr == md->start_subject) prev_is_word = FALSE; else
2178 PCRE_PUCHAR lastptr = eptr - 1;
2179 BACKCHAR(lastptr);
2180 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2181 GETCHAR(c, lastptr);
2182 #ifdef SUPPORT_UCP
2183 if (md->use_ucp)
2185 if (c == '_') prev_is_word = TRUE; else
2187 int cat = UCD_CATEGORY(c);
2188 prev_is_word = (cat == ucp_L || cat == ucp_N);
2191 else
2192 #endif
2193 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2196 /* Get status of next character */
2198 if (eptr >= md->end_subject)
2200 SCHECK_PARTIAL();
2201 cur_is_word = FALSE;
2203 else
2205 GETCHAR(c, eptr);
2206 #ifdef SUPPORT_UCP
2207 if (md->use_ucp)
2209 if (c == '_') cur_is_word = TRUE; else
2211 int cat = UCD_CATEGORY(c);
2212 cur_is_word = (cat == ucp_L || cat == ucp_N);
2215 else
2216 #endif
2217 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2220 else
2221 #endif
2223 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2224 consistency with the behaviour of \w we do use it in this case. */
2227 /* Get status of previous character */
2229 if (eptr == md->start_subject) prev_is_word = FALSE; else
2231 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2232 #ifdef SUPPORT_UCP
2233 if (md->use_ucp)
2235 c = eptr[-1];
2236 if (c == '_') prev_is_word = TRUE; else
2238 int cat = UCD_CATEGORY(c);
2239 prev_is_word = (cat == ucp_L || cat == ucp_N);
2242 else
2243 #endif
2244 prev_is_word = MAX_255(eptr[-1])
2245 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2248 /* Get status of next character */
2250 if (eptr >= md->end_subject)
2252 SCHECK_PARTIAL();
2253 cur_is_word = FALSE;
2255 else
2256 #ifdef SUPPORT_UCP
2257 if (md->use_ucp)
2259 c = *eptr;
2260 if (c == '_') cur_is_word = TRUE; else
2262 int cat = UCD_CATEGORY(c);
2263 cur_is_word = (cat == ucp_L || cat == ucp_N);
2266 else
2267 #endif
2268 cur_is_word = MAX_255(*eptr)
2269 && ((md->ctypes[*eptr] & ctype_word) != 0);
2272 /* Now see if the situation is what we want */
2274 if ((*ecode++ == OP_WORD_BOUNDARY)?
2275 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2276 RRETURN(MATCH_NOMATCH);
2278 break;
2280 /* Match any single character type except newline; have to take care with
2281 CRLF newlines and partial matching. */
2283 case OP_ANY:
2284 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2285 if (md->partial != 0 &&
2286 eptr + 1 >= md->end_subject &&
2287 NLBLOCK->nltype == NLTYPE_FIXED &&
2288 NLBLOCK->nllen == 2 &&
2289 RAWUCHARTEST(eptr) == NLBLOCK->nl[0])
2291 md->hitend = TRUE;
2292 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2295 /* Fall through */
2297 /* Match any single character whatsoever. */
2299 case OP_ALLANY:
2300 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2301 { /* not be updated before SCHECK_PARTIAL. */
2302 SCHECK_PARTIAL();
2303 RRETURN(MATCH_NOMATCH);
2305 eptr++;
2306 #ifdef SUPPORT_UTF
2307 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2308 #endif
2309 ecode++;
2310 break;
2312 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2313 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2315 case OP_ANYBYTE:
2316 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2317 { /* not be updated before SCHECK_PARTIAL. */
2318 SCHECK_PARTIAL();
2319 RRETURN(MATCH_NOMATCH);
2321 eptr++;
2322 ecode++;
2323 break;
2325 case OP_NOT_DIGIT:
2326 if (eptr >= md->end_subject)
2328 SCHECK_PARTIAL();
2329 RRETURN(MATCH_NOMATCH);
2331 GETCHARINCTEST(c, eptr);
2332 if (
2333 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2334 c < 256 &&
2335 #endif
2336 (md->ctypes[c] & ctype_digit) != 0
2338 RRETURN(MATCH_NOMATCH);
2339 ecode++;
2340 break;
2342 case OP_DIGIT:
2343 if (eptr >= md->end_subject)
2345 SCHECK_PARTIAL();
2346 RRETURN(MATCH_NOMATCH);
2348 GETCHARINCTEST(c, eptr);
2349 if (
2350 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2351 c > 255 ||
2352 #endif
2353 (md->ctypes[c] & ctype_digit) == 0
2355 RRETURN(MATCH_NOMATCH);
2356 ecode++;
2357 break;
2359 case OP_NOT_WHITESPACE:
2360 if (eptr >= md->end_subject)
2362 SCHECK_PARTIAL();
2363 RRETURN(MATCH_NOMATCH);
2365 GETCHARINCTEST(c, eptr);
2366 if (
2367 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2368 c < 256 &&
2369 #endif
2370 (md->ctypes[c] & ctype_space) != 0
2372 RRETURN(MATCH_NOMATCH);
2373 ecode++;
2374 break;
2376 case OP_WHITESPACE:
2377 if (eptr >= md->end_subject)
2379 SCHECK_PARTIAL();
2380 RRETURN(MATCH_NOMATCH);
2382 GETCHARINCTEST(c, eptr);
2383 if (
2384 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2385 c > 255 ||
2386 #endif
2387 (md->ctypes[c] & ctype_space) == 0
2389 RRETURN(MATCH_NOMATCH);
2390 ecode++;
2391 break;
2393 case OP_NOT_WORDCHAR:
2394 if (eptr >= md->end_subject)
2396 SCHECK_PARTIAL();
2397 RRETURN(MATCH_NOMATCH);
2399 GETCHARINCTEST(c, eptr);
2400 if (
2401 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2402 c < 256 &&
2403 #endif
2404 (md->ctypes[c] & ctype_word) != 0
2406 RRETURN(MATCH_NOMATCH);
2407 ecode++;
2408 break;
2410 case OP_WORDCHAR:
2411 if (eptr >= md->end_subject)
2413 SCHECK_PARTIAL();
2414 RRETURN(MATCH_NOMATCH);
2416 GETCHARINCTEST(c, eptr);
2417 if (
2418 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2419 c > 255 ||
2420 #endif
2421 (md->ctypes[c] & ctype_word) == 0
2423 RRETURN(MATCH_NOMATCH);
2424 ecode++;
2425 break;
2427 case OP_ANYNL:
2428 if (eptr >= md->end_subject)
2430 SCHECK_PARTIAL();
2431 RRETURN(MATCH_NOMATCH);
2433 GETCHARINCTEST(c, eptr);
2434 switch(c)
2436 default: RRETURN(MATCH_NOMATCH);
2438 case CHAR_CR:
2439 if (eptr >= md->end_subject)
2441 SCHECK_PARTIAL();
2443 else if (RAWUCHARTEST(eptr) == CHAR_LF) eptr++;
2444 break;
2446 case CHAR_LF:
2447 break;
2449 case CHAR_VT:
2450 case CHAR_FF:
2451 case CHAR_NEL:
2452 #ifndef EBCDIC
2453 case 0x2028:
2454 case 0x2029:
2455 #endif /* Not EBCDIC */
2456 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2457 break;
2459 ecode++;
2460 break;
2462 case OP_NOT_HSPACE:
2463 if (eptr >= md->end_subject)
2465 SCHECK_PARTIAL();
2466 RRETURN(MATCH_NOMATCH);
2468 GETCHARINCTEST(c, eptr);
2469 switch(c)
2471 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2472 default: break;
2474 ecode++;
2475 break;
2477 case OP_HSPACE:
2478 if (eptr >= md->end_subject)
2480 SCHECK_PARTIAL();
2481 RRETURN(MATCH_NOMATCH);
2483 GETCHARINCTEST(c, eptr);
2484 switch(c)
2486 HSPACE_CASES: break; /* Byte and multibyte cases */
2487 default: RRETURN(MATCH_NOMATCH);
2489 ecode++;
2490 break;
2492 case OP_NOT_VSPACE:
2493 if (eptr >= md->end_subject)
2495 SCHECK_PARTIAL();
2496 RRETURN(MATCH_NOMATCH);
2498 GETCHARINCTEST(c, eptr);
2499 switch(c)
2501 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2502 default: break;
2504 ecode++;
2505 break;
2507 case OP_VSPACE:
2508 if (eptr >= md->end_subject)
2510 SCHECK_PARTIAL();
2511 RRETURN(MATCH_NOMATCH);
2513 GETCHARINCTEST(c, eptr);
2514 switch(c)
2516 VSPACE_CASES: break;
2517 default: RRETURN(MATCH_NOMATCH);
2519 ecode++;
2520 break;
2522 #ifdef SUPPORT_UCP
2523 /* Check the next character by Unicode property. We will get here only
2524 if the support is in the binary; otherwise a compile-time error occurs. */
2526 case OP_PROP:
2527 case OP_NOTPROP:
2528 if (eptr >= md->end_subject)
2530 SCHECK_PARTIAL();
2531 RRETURN(MATCH_NOMATCH);
2533 GETCHARINCTEST(c, eptr);
2535 const pcre_uint32 *cp;
2536 const ucd_record *prop = GET_UCD(c);
2538 switch(ecode[1])
2540 case PT_ANY:
2541 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2542 break;
2544 case PT_LAMP:
2545 if ((prop->chartype == ucp_Lu ||
2546 prop->chartype == ucp_Ll ||
2547 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2548 RRETURN(MATCH_NOMATCH);
2549 break;
2551 case PT_GC:
2552 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2553 RRETURN(MATCH_NOMATCH);
2554 break;
2556 case PT_PC:
2557 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2558 RRETURN(MATCH_NOMATCH);
2559 break;
2561 case PT_SC:
2562 if ((ecode[2] != prop->script) == (op == OP_PROP))
2563 RRETURN(MATCH_NOMATCH);
2564 break;
2566 /* These are specials */
2568 case PT_ALNUM:
2569 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2570 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2571 RRETURN(MATCH_NOMATCH);
2572 break;
2574 case PT_SPACE: /* Perl space */
2575 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2576 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2577 == (op == OP_NOTPROP))
2578 RRETURN(MATCH_NOMATCH);
2579 break;
2581 case PT_PXSPACE: /* POSIX space */
2582 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2583 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2584 c == CHAR_FF || c == CHAR_CR)
2585 == (op == OP_NOTPROP))
2586 RRETURN(MATCH_NOMATCH);
2587 break;
2589 case PT_WORD:
2590 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2591 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2592 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2593 RRETURN(MATCH_NOMATCH);
2594 break;
2596 case PT_CLIST:
2597 cp = PRIV(ucd_caseless_sets) + ecode[2];
2598 for (;;)
2600 if (c < *cp)
2601 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2602 if (c == *cp++)
2603 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2605 break;
2607 /* This should never occur */
2609 default:
2610 RRETURN(PCRE_ERROR_INTERNAL);
2613 ecode += 3;
2615 break;
2617 /* Match an extended Unicode sequence. We will get here only if the support
2618 is in the binary; otherwise a compile-time error occurs. */
2620 case OP_EXTUNI:
2621 if (eptr >= md->end_subject)
2623 SCHECK_PARTIAL();
2624 RRETURN(MATCH_NOMATCH);
2626 else
2628 int lgb, rgb;
2629 GETCHARINCTEST(c, eptr);
2630 lgb = UCD_GRAPHBREAK(c);
2631 while (eptr < md->end_subject)
2633 int len = 1;
2634 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2635 rgb = UCD_GRAPHBREAK(c);
2636 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2637 lgb = rgb;
2638 eptr += len;
2641 CHECK_PARTIAL();
2642 ecode++;
2643 break;
2644 #endif /* SUPPORT_UCP */
2647 /* Match a back reference, possibly repeatedly. Look past the end of the
2648 item to see if there is repeat information following. The code is similar
2649 to that for character classes, but repeated for efficiency. Then obey
2650 similar code to character type repeats - written out again for speed.
2651 However, if the referenced string is the empty string, always treat
2652 it as matched, any number of times (otherwise there could be infinite
2653 loops). */
2655 case OP_REF:
2656 case OP_REFI:
2657 caseless = op == OP_REFI;
2658 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2659 ecode += 1 + IMM2_SIZE;
2661 /* If the reference is unset, there are two possibilities:
2663 (a) In the default, Perl-compatible state, set the length negative;
2664 this ensures that every attempt at a match fails. We can't just fail
2665 here, because of the possibility of quantifiers with zero minima.
2667 (b) If the JavaScript compatibility flag is set, set the length to zero
2668 so that the back reference matches an empty string.
2670 Otherwise, set the length to the length of what was matched by the
2671 referenced subpattern. */
2673 if (offset >= offset_top || md->offset_vector[offset] < 0)
2674 length = (md->jscript_compat)? 0 : -1;
2675 else
2676 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2678 /* Set up for repetition, or handle the non-repeated case */
2680 switch (*ecode)
2682 case OP_CRSTAR:
2683 case OP_CRMINSTAR:
2684 case OP_CRPLUS:
2685 case OP_CRMINPLUS:
2686 case OP_CRQUERY:
2687 case OP_CRMINQUERY:
2688 c = *ecode++ - OP_CRSTAR;
2689 minimize = (c & 1) != 0;
2690 min = rep_min[c]; /* Pick up values from tables; */
2691 max = rep_max[c]; /* zero for max => infinity */
2692 if (max == 0) max = INT_MAX;
2693 break;
2695 case OP_CRRANGE:
2696 case OP_CRMINRANGE:
2697 minimize = (*ecode == OP_CRMINRANGE);
2698 min = GET2(ecode, 1);
2699 max = GET2(ecode, 1 + IMM2_SIZE);
2700 if (max == 0) max = INT_MAX;
2701 ecode += 1 + 2 * IMM2_SIZE;
2702 break;
2704 default: /* No repeat follows */
2705 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2707 if (length == -2) eptr = md->end_subject; /* Partial match */
2708 CHECK_PARTIAL();
2709 RRETURN(MATCH_NOMATCH);
2711 eptr += length;
2712 continue; /* With the main loop */
2715 /* Handle repeated back references. If the length of the reference is
2716 zero, just continue with the main loop. If the length is negative, it
2717 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2718 zero, we can continue at the same level without recursion. For any other
2719 minimum, carrying on will result in NOMATCH. */
2721 if (length == 0) continue;
2722 if (length < 0 && min == 0) continue;
2724 /* First, ensure the minimum number of matches are present. We get back
2725 the length of the reference string explicitly rather than passing the
2726 address of eptr, so that eptr can be a register variable. */
2728 for (i = 1; i <= min; i++)
2730 int slength;
2731 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2733 if (slength == -2) eptr = md->end_subject; /* Partial match */
2734 CHECK_PARTIAL();
2735 RRETURN(MATCH_NOMATCH);
2737 eptr += slength;
2740 /* If min = max, continue at the same level without recursion.
2741 They are not both allowed to be zero. */
2743 if (min == max) continue;
2745 /* If minimizing, keep trying and advancing the pointer */
2747 if (minimize)
2749 for (fi = min;; fi++)
2751 int slength;
2752 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2753 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2754 if (fi >= max) RRETURN(MATCH_NOMATCH);
2755 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2757 if (slength == -2) eptr = md->end_subject; /* Partial match */
2758 CHECK_PARTIAL();
2759 RRETURN(MATCH_NOMATCH);
2761 eptr += slength;
2763 /* Control never gets here */
2766 /* If maximizing, find the longest string and work backwards */
2768 else
2770 pp = eptr;
2771 for (i = min; i < max; i++)
2773 int slength;
2774 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2776 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2777 the soft partial matching case. */
2779 if (slength == -2 && md->partial != 0 &&
2780 md->end_subject > md->start_used_ptr)
2782 md->hitend = TRUE;
2783 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2785 break;
2787 eptr += slength;
2790 while (eptr >= pp)
2792 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2793 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2794 eptr -= length;
2796 RRETURN(MATCH_NOMATCH);
2798 /* Control never gets here */
2800 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2801 used when all the characters in the class have values in the range 0-255,
2802 and either the matching is caseful, or the characters are in the range
2803 0-127 when UTF-8 processing is enabled. The only difference between
2804 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2805 encountered.
2807 First, look past the end of the item to see if there is repeat information
2808 following. Then obey similar code to character type repeats - written out
2809 again for speed. */
2811 case OP_NCLASS:
2812 case OP_CLASS:
2814 /* The data variable is saved across frames, so the byte map needs to
2815 be stored there. */
2816 #define BYTE_MAP ((pcre_uint8 *)data)
2817 data = ecode + 1; /* Save for matching */
2818 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2820 switch (*ecode)
2822 case OP_CRSTAR:
2823 case OP_CRMINSTAR:
2824 case OP_CRPLUS:
2825 case OP_CRMINPLUS:
2826 case OP_CRQUERY:
2827 case OP_CRMINQUERY:
2828 c = *ecode++ - OP_CRSTAR;
2829 minimize = (c & 1) != 0;
2830 min = rep_min[c]; /* Pick up values from tables; */
2831 max = rep_max[c]; /* zero for max => infinity */
2832 if (max == 0) max = INT_MAX;
2833 break;
2835 case OP_CRRANGE:
2836 case OP_CRMINRANGE:
2837 minimize = (*ecode == OP_CRMINRANGE);
2838 min = GET2(ecode, 1);
2839 max = GET2(ecode, 1 + IMM2_SIZE);
2840 if (max == 0) max = INT_MAX;
2841 ecode += 1 + 2 * IMM2_SIZE;
2842 break;
2844 default: /* No repeat follows */
2845 min = max = 1;
2846 break;
2849 /* First, ensure the minimum number of matches are present. */
2851 #ifdef SUPPORT_UTF
2852 if (utf)
2854 for (i = 1; i <= min; i++)
2856 if (eptr >= md->end_subject)
2858 SCHECK_PARTIAL();
2859 RRETURN(MATCH_NOMATCH);
2861 GETCHARINC(c, eptr);
2862 if (c > 255)
2864 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2866 else
2867 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2870 else
2871 #endif
2872 /* Not UTF mode */
2874 for (i = 1; i <= min; i++)
2876 if (eptr >= md->end_subject)
2878 SCHECK_PARTIAL();
2879 RRETURN(MATCH_NOMATCH);
2881 c = *eptr++;
2882 #ifndef COMPILE_PCRE8
2883 if (c > 255)
2885 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2887 else
2888 #endif
2889 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2893 /* If max == min we can continue with the main loop without the
2894 need to recurse. */
2896 if (min == max) continue;
2898 /* If minimizing, keep testing the rest of the expression and advancing
2899 the pointer while it matches the class. */
2901 if (minimize)
2903 #ifdef SUPPORT_UTF
2904 if (utf)
2906 for (fi = min;; fi++)
2908 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2909 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2910 if (fi >= max) RRETURN(MATCH_NOMATCH);
2911 if (eptr >= md->end_subject)
2913 SCHECK_PARTIAL();
2914 RRETURN(MATCH_NOMATCH);
2916 GETCHARINC(c, eptr);
2917 if (c > 255)
2919 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2921 else
2922 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2925 else
2926 #endif
2927 /* Not UTF mode */
2929 for (fi = min;; fi++)
2931 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2932 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2933 if (fi >= max) RRETURN(MATCH_NOMATCH);
2934 if (eptr >= md->end_subject)
2936 SCHECK_PARTIAL();
2937 RRETURN(MATCH_NOMATCH);
2939 c = *eptr++;
2940 #ifndef COMPILE_PCRE8
2941 if (c > 255)
2943 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2945 else
2946 #endif
2947 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2950 /* Control never gets here */
2953 /* If maximizing, find the longest possible run, then work backwards. */
2955 else
2957 pp = eptr;
2959 #ifdef SUPPORT_UTF
2960 if (utf)
2962 for (i = min; i < max; i++)
2964 int len = 1;
2965 if (eptr >= md->end_subject)
2967 SCHECK_PARTIAL();
2968 break;
2970 GETCHARLEN(c, eptr, len);
2971 if (c > 255)
2973 if (op == OP_CLASS) break;
2975 else
2976 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2977 eptr += len;
2979 for (;;)
2981 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2982 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2983 if (eptr-- == pp) break; /* Stop if tried at original pos */
2984 BACKCHAR(eptr);
2987 else
2988 #endif
2989 /* Not UTF mode */
2991 for (i = min; i < max; i++)
2993 if (eptr >= md->end_subject)
2995 SCHECK_PARTIAL();
2996 break;
2998 c = *eptr;
2999 #ifndef COMPILE_PCRE8
3000 if (c > 255)
3002 if (op == OP_CLASS) break;
3004 else
3005 #endif
3006 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3007 eptr++;
3009 while (eptr >= pp)
3011 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3012 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3013 eptr--;
3017 RRETURN(MATCH_NOMATCH);
3019 #undef BYTE_MAP
3021 /* Control never gets here */
3024 /* Match an extended character class. This opcode is encountered only
3025 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
3026 mode, because Unicode properties are supported in non-UTF-8 mode. */
3028 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3029 case OP_XCLASS:
3031 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3032 ecode += GET(ecode, 1); /* Advance past the item */
3034 switch (*ecode)
3036 case OP_CRSTAR:
3037 case OP_CRMINSTAR:
3038 case OP_CRPLUS:
3039 case OP_CRMINPLUS:
3040 case OP_CRQUERY:
3041 case OP_CRMINQUERY:
3042 c = *ecode++ - OP_CRSTAR;
3043 minimize = (c & 1) != 0;
3044 min = rep_min[c]; /* Pick up values from tables; */
3045 max = rep_max[c]; /* zero for max => infinity */
3046 if (max == 0) max = INT_MAX;
3047 break;
3049 case OP_CRRANGE:
3050 case OP_CRMINRANGE:
3051 minimize = (*ecode == OP_CRMINRANGE);
3052 min = GET2(ecode, 1);
3053 max = GET2(ecode, 1 + IMM2_SIZE);
3054 if (max == 0) max = INT_MAX;
3055 ecode += 1 + 2 * IMM2_SIZE;
3056 break;
3058 default: /* No repeat follows */
3059 min = max = 1;
3060 break;
3063 /* First, ensure the minimum number of matches are present. */
3065 for (i = 1; i <= min; i++)
3067 if (eptr >= md->end_subject)
3069 SCHECK_PARTIAL();
3070 RRETURN(MATCH_NOMATCH);
3072 GETCHARINCTEST(c, eptr);
3073 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3076 /* If max == min we can continue with the main loop without the
3077 need to recurse. */
3079 if (min == max) continue;
3081 /* If minimizing, keep testing the rest of the expression and advancing
3082 the pointer while it matches the class. */
3084 if (minimize)
3086 for (fi = min;; fi++)
3088 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3089 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3090 if (fi >= max) RRETURN(MATCH_NOMATCH);
3091 if (eptr >= md->end_subject)
3093 SCHECK_PARTIAL();
3094 RRETURN(MATCH_NOMATCH);
3096 GETCHARINCTEST(c, eptr);
3097 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3099 /* Control never gets here */
3102 /* If maximizing, find the longest possible run, then work backwards. */
3104 else
3106 pp = eptr;
3107 for (i = min; i < max; i++)
3109 int len = 1;
3110 if (eptr >= md->end_subject)
3112 SCHECK_PARTIAL();
3113 break;
3115 #ifdef SUPPORT_UTF
3116 GETCHARLENTEST(c, eptr, len);
3117 #else
3118 c = *eptr;
3119 #endif
3120 if (!PRIV(xclass)(c, data, utf)) break;
3121 eptr += len;
3123 for(;;)
3125 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3126 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3127 if (eptr-- == pp) break; /* Stop if tried at original pos */
3128 #ifdef SUPPORT_UTF
3129 if (utf) BACKCHAR(eptr);
3130 #endif
3132 RRETURN(MATCH_NOMATCH);
3135 /* Control never gets here */
3137 #endif /* End of XCLASS */
3139 /* Match a single character, casefully */
3141 case OP_CHAR:
3142 #ifdef SUPPORT_UTF
3143 if (utf)
3145 length = 1;
3146 ecode++;
3147 GETCHARLEN(fc, ecode, length);
3148 if (length > md->end_subject - eptr)
3150 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3151 RRETURN(MATCH_NOMATCH);
3153 while (length-- > 0) if (*ecode++ != RAWUCHARINC(eptr)) RRETURN(MATCH_NOMATCH);
3155 else
3156 #endif
3157 /* Not UTF mode */
3159 if (md->end_subject - eptr < 1)
3161 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3162 RRETURN(MATCH_NOMATCH);
3164 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3165 ecode += 2;
3167 break;
3169 /* Match a single character, caselessly. If we are at the end of the
3170 subject, give up immediately. */
3172 case OP_CHARI:
3173 if (eptr >= md->end_subject)
3175 SCHECK_PARTIAL();
3176 RRETURN(MATCH_NOMATCH);
3179 #ifdef SUPPORT_UTF
3180 if (utf)
3182 length = 1;
3183 ecode++;
3184 GETCHARLEN(fc, ecode, length);
3186 /* If the pattern character's value is < 128, we have only one byte, and
3187 we know that its other case must also be one byte long, so we can use the
3188 fast lookup table. We know that there is at least one byte left in the
3189 subject. */
3191 if (fc < 128)
3193 pcre_uchar cc = RAWUCHAR(eptr);
3194 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3195 ecode++;
3196 eptr++;
3199 /* Otherwise we must pick up the subject character. Note that we cannot
3200 use the value of "length" to check for sufficient bytes left, because the
3201 other case of the character may have more or fewer bytes. */
3203 else
3205 pcre_uint32 dc;
3206 GETCHARINC(dc, eptr);
3207 ecode += length;
3209 /* If we have Unicode property support, we can use it to test the other
3210 case of the character, if there is one. */
3212 if (fc != dc)
3214 #ifdef SUPPORT_UCP
3215 if (dc != UCD_OTHERCASE(fc))
3216 #endif
3217 RRETURN(MATCH_NOMATCH);
3221 else
3222 #endif /* SUPPORT_UTF */
3224 /* Not UTF mode */
3226 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3227 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3228 eptr++;
3229 ecode += 2;
3231 break;
3233 /* Match a single character repeatedly. */
3235 case OP_EXACT:
3236 case OP_EXACTI:
3237 min = max = GET2(ecode, 1);
3238 ecode += 1 + IMM2_SIZE;
3239 goto REPEATCHAR;
3241 case OP_POSUPTO:
3242 case OP_POSUPTOI:
3243 possessive = TRUE;
3244 /* Fall through */
3246 case OP_UPTO:
3247 case OP_UPTOI:
3248 case OP_MINUPTO:
3249 case OP_MINUPTOI:
3250 min = 0;
3251 max = GET2(ecode, 1);
3252 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3253 ecode += 1 + IMM2_SIZE;
3254 goto REPEATCHAR;
3256 case OP_POSSTAR:
3257 case OP_POSSTARI:
3258 possessive = TRUE;
3259 min = 0;
3260 max = INT_MAX;
3261 ecode++;
3262 goto REPEATCHAR;
3264 case OP_POSPLUS:
3265 case OP_POSPLUSI:
3266 possessive = TRUE;
3267 min = 1;
3268 max = INT_MAX;
3269 ecode++;
3270 goto REPEATCHAR;
3272 case OP_POSQUERY:
3273 case OP_POSQUERYI:
3274 possessive = TRUE;
3275 min = 0;
3276 max = 1;
3277 ecode++;
3278 goto REPEATCHAR;
3280 case OP_STAR:
3281 case OP_STARI:
3282 case OP_MINSTAR:
3283 case OP_MINSTARI:
3284 case OP_PLUS:
3285 case OP_PLUSI:
3286 case OP_MINPLUS:
3287 case OP_MINPLUSI:
3288 case OP_QUERY:
3289 case OP_QUERYI:
3290 case OP_MINQUERY:
3291 case OP_MINQUERYI:
3292 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3293 minimize = (c & 1) != 0;
3294 min = rep_min[c]; /* Pick up values from tables; */
3295 max = rep_max[c]; /* zero for max => infinity */
3296 if (max == 0) max = INT_MAX;
3298 /* Common code for all repeated single-character matches. */
3300 REPEATCHAR:
3301 #ifdef SUPPORT_UTF
3302 if (utf)
3304 length = 1;
3305 charptr = ecode;
3306 GETCHARLEN(fc, ecode, length);
3307 ecode += length;
3309 /* Handle multibyte character matching specially here. There is
3310 support for caseless matching if UCP support is present. */
3312 if (length > 1)
3314 #ifdef SUPPORT_UCP
3315 pcre_uint32 othercase;
3316 if (op >= OP_STARI && /* Caseless */
3317 (othercase = UCD_OTHERCASE(fc)) != fc)
3318 oclength = PRIV(ord2utf)(othercase, occhars);
3319 else oclength = 0;
3320 #endif /* SUPPORT_UCP */
3322 for (i = 1; i <= min; i++)
3324 if (eptr <= md->end_subject - length &&
3325 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3326 #ifdef SUPPORT_UCP
3327 else if (oclength > 0 &&
3328 eptr <= md->end_subject - oclength &&
3329 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3330 #endif /* SUPPORT_UCP */
3331 else
3333 CHECK_PARTIAL();
3334 RRETURN(MATCH_NOMATCH);
3338 if (min == max) continue;
3340 if (minimize)
3342 for (fi = min;; fi++)
3344 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3345 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3346 if (fi >= max) RRETURN(MATCH_NOMATCH);
3347 if (eptr <= md->end_subject - length &&
3348 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3349 #ifdef SUPPORT_UCP
3350 else if (oclength > 0 &&
3351 eptr <= md->end_subject - oclength &&
3352 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3353 #endif /* SUPPORT_UCP */
3354 else
3356 CHECK_PARTIAL();
3357 RRETURN(MATCH_NOMATCH);
3360 /* Control never gets here */
3363 else /* Maximize */
3365 pp = eptr;
3366 for (i = min; i < max; i++)
3368 if (eptr <= md->end_subject - length &&
3369 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3370 #ifdef SUPPORT_UCP
3371 else if (oclength > 0 &&
3372 eptr <= md->end_subject - oclength &&
3373 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3374 #endif /* SUPPORT_UCP */
3375 else
3377 CHECK_PARTIAL();
3378 break;
3382 if (possessive) continue;
3384 for(;;)
3386 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3387 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3388 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3389 #ifdef SUPPORT_UCP
3390 eptr--;
3391 BACKCHAR(eptr);
3392 #else /* without SUPPORT_UCP */
3393 eptr -= length;
3394 #endif /* SUPPORT_UCP */
3397 /* Control never gets here */
3400 /* If the length of a UTF-8 character is 1, we fall through here, and
3401 obey the code as for non-UTF-8 characters below, though in this case the
3402 value of fc will always be < 128. */
3404 else
3405 #endif /* SUPPORT_UTF */
3406 /* When not in UTF-8 mode, load a single-byte character. */
3407 fc = *ecode++;
3409 /* The value of fc at this point is always one character, though we may
3410 or may not be in UTF mode. The code is duplicated for the caseless and
3411 caseful cases, for speed, since matching characters is likely to be quite
3412 common. First, ensure the minimum number of matches are present. If min =
3413 max, continue at the same level without recursing. Otherwise, if
3414 minimizing, keep trying the rest of the expression and advancing one
3415 matching character if failing, up to the maximum. Alternatively, if
3416 maximizing, find the maximum number of characters and work backwards. */
3418 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3419 max, (char *)eptr));
3421 if (op >= OP_STARI) /* Caseless */
3423 #ifdef COMPILE_PCRE8
3424 /* fc must be < 128 if UTF is enabled. */
3425 foc = md->fcc[fc];
3426 #else
3427 #ifdef SUPPORT_UTF
3428 #ifdef SUPPORT_UCP
3429 if (utf && fc > 127)
3430 foc = UCD_OTHERCASE(fc);
3431 #else
3432 if (utf && fc > 127)
3433 foc = fc;
3434 #endif /* SUPPORT_UCP */
3435 else
3436 #endif /* SUPPORT_UTF */
3437 foc = TABLE_GET(fc, md->fcc, fc);
3438 #endif /* COMPILE_PCRE8 */
3440 for (i = 1; i <= min; i++)
3442 pcre_uchar cc;
3444 if (eptr >= md->end_subject)
3446 SCHECK_PARTIAL();
3447 RRETURN(MATCH_NOMATCH);
3449 cc = RAWUCHARTEST(eptr);
3450 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3451 eptr++;
3453 if (min == max) continue;
3454 if (minimize)
3456 for (fi = min;; fi++)
3458 pcre_uchar cc;
3460 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3461 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3462 if (fi >= max) RRETURN(MATCH_NOMATCH);
3463 if (eptr >= md->end_subject)
3465 SCHECK_PARTIAL();
3466 RRETURN(MATCH_NOMATCH);
3468 cc = RAWUCHARTEST(eptr);
3469 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3470 eptr++;
3472 /* Control never gets here */
3474 else /* Maximize */
3476 pp = eptr;
3477 for (i = min; i < max; i++)
3479 pcre_uchar cc;
3481 if (eptr >= md->end_subject)
3483 SCHECK_PARTIAL();
3484 break;
3486 cc = RAWUCHARTEST(eptr);
3487 if (fc != cc && foc != cc) break;
3488 eptr++;
3491 if (possessive) continue;
3493 while (eptr >= pp)
3495 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3496 eptr--;
3497 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3499 RRETURN(MATCH_NOMATCH);
3501 /* Control never gets here */
3504 /* Caseful comparisons (includes all multi-byte characters) */
3506 else
3508 for (i = 1; i <= min; i++)
3510 if (eptr >= md->end_subject)
3512 SCHECK_PARTIAL();
3513 RRETURN(MATCH_NOMATCH);
3515 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3518 if (min == max) continue;
3520 if (minimize)
3522 for (fi = min;; fi++)
3524 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3525 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3526 if (fi >= max) RRETURN(MATCH_NOMATCH);
3527 if (eptr >= md->end_subject)
3529 SCHECK_PARTIAL();
3530 RRETURN(MATCH_NOMATCH);
3532 if (fc != RAWUCHARINCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3534 /* Control never gets here */
3536 else /* Maximize */
3538 pp = eptr;
3539 for (i = min; i < max; i++)
3541 if (eptr >= md->end_subject)
3543 SCHECK_PARTIAL();
3544 break;
3546 if (fc != RAWUCHARTEST(eptr)) break;
3547 eptr++;
3549 if (possessive) continue;
3551 while (eptr >= pp)
3553 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3554 eptr--;
3555 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3557 RRETURN(MATCH_NOMATCH);
3560 /* Control never gets here */
3562 /* Match a negated single one-byte character. The character we are
3563 checking can be multibyte. */
3565 case OP_NOT:
3566 case OP_NOTI:
3567 if (eptr >= md->end_subject)
3569 SCHECK_PARTIAL();
3570 RRETURN(MATCH_NOMATCH);
3572 #ifdef SUPPORT_UTF
3573 if (utf)
3575 register pcre_uint32 ch, och;
3577 ecode++;
3578 GETCHARINC(ch, ecode);
3579 GETCHARINC(c, eptr);
3581 if (op == OP_NOT)
3583 if (ch == c) RRETURN(MATCH_NOMATCH);
3585 else
3587 #ifdef SUPPORT_UCP
3588 if (ch > 127)
3589 och = UCD_OTHERCASE(ch);
3590 #else
3591 if (ch > 127)
3592 och = ch;
3593 #endif /* SUPPORT_UCP */
3594 else
3595 och = TABLE_GET(ch, md->fcc, ch);
3596 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3599 else
3600 #endif
3602 register pcre_uint32 ch = ecode[1];
3603 c = *eptr++;
3604 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3605 RRETURN(MATCH_NOMATCH);
3606 ecode += 2;
3608 break;
3610 /* Match a negated single one-byte character repeatedly. This is almost a
3611 repeat of the code for a repeated single character, but I haven't found a
3612 nice way of commoning these up that doesn't require a test of the
3613 positive/negative option for each character match. Maybe that wouldn't add
3614 very much to the time taken, but character matching *is* what this is all
3615 about... */
3617 case OP_NOTEXACT:
3618 case OP_NOTEXACTI:
3619 min = max = GET2(ecode, 1);
3620 ecode += 1 + IMM2_SIZE;
3621 goto REPEATNOTCHAR;
3623 case OP_NOTUPTO:
3624 case OP_NOTUPTOI:
3625 case OP_NOTMINUPTO:
3626 case OP_NOTMINUPTOI:
3627 min = 0;
3628 max = GET2(ecode, 1);
3629 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3630 ecode += 1 + IMM2_SIZE;
3631 goto REPEATNOTCHAR;
3633 case OP_NOTPOSSTAR:
3634 case OP_NOTPOSSTARI:
3635 possessive = TRUE;
3636 min = 0;
3637 max = INT_MAX;
3638 ecode++;
3639 goto REPEATNOTCHAR;
3641 case OP_NOTPOSPLUS:
3642 case OP_NOTPOSPLUSI:
3643 possessive = TRUE;
3644 min = 1;
3645 max = INT_MAX;
3646 ecode++;
3647 goto REPEATNOTCHAR;
3649 case OP_NOTPOSQUERY:
3650 case OP_NOTPOSQUERYI:
3651 possessive = TRUE;
3652 min = 0;
3653 max = 1;
3654 ecode++;
3655 goto REPEATNOTCHAR;
3657 case OP_NOTPOSUPTO:
3658 case OP_NOTPOSUPTOI:
3659 possessive = TRUE;
3660 min = 0;
3661 max = GET2(ecode, 1);
3662 ecode += 1 + IMM2_SIZE;
3663 goto REPEATNOTCHAR;
3665 case OP_NOTSTAR:
3666 case OP_NOTSTARI:
3667 case OP_NOTMINSTAR:
3668 case OP_NOTMINSTARI:
3669 case OP_NOTPLUS:
3670 case OP_NOTPLUSI:
3671 case OP_NOTMINPLUS:
3672 case OP_NOTMINPLUSI:
3673 case OP_NOTQUERY:
3674 case OP_NOTQUERYI:
3675 case OP_NOTMINQUERY:
3676 case OP_NOTMINQUERYI:
3677 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3678 minimize = (c & 1) != 0;
3679 min = rep_min[c]; /* Pick up values from tables; */
3680 max = rep_max[c]; /* zero for max => infinity */
3681 if (max == 0) max = INT_MAX;
3683 /* Common code for all repeated single-byte matches. */
3685 REPEATNOTCHAR:
3686 GETCHARINCTEST(fc, ecode);
3688 /* The code is duplicated for the caseless and caseful cases, for speed,
3689 since matching characters is likely to be quite common. First, ensure the
3690 minimum number of matches are present. If min = max, continue at the same
3691 level without recursing. Otherwise, if minimizing, keep trying the rest of
3692 the expression and advancing one matching character if failing, up to the
3693 maximum. Alternatively, if maximizing, find the maximum number of
3694 characters and work backwards. */
3696 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3697 max, (char *)eptr));
3699 if (op >= OP_NOTSTARI) /* Caseless */
3701 #ifdef SUPPORT_UTF
3702 #ifdef SUPPORT_UCP
3703 if (utf && fc > 127)
3704 foc = UCD_OTHERCASE(fc);
3705 #else
3706 if (utf && fc > 127)
3707 foc = fc;
3708 #endif /* SUPPORT_UCP */
3709 else
3710 #endif /* SUPPORT_UTF */
3711 foc = TABLE_GET(fc, md->fcc, fc);
3713 #ifdef SUPPORT_UTF
3714 if (utf)
3716 register pcre_uint32 d;
3717 for (i = 1; i <= min; i++)
3719 if (eptr >= md->end_subject)
3721 SCHECK_PARTIAL();
3722 RRETURN(MATCH_NOMATCH);
3724 GETCHARINC(d, eptr);
3725 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3728 else
3729 #endif
3730 /* Not UTF mode */
3732 for (i = 1; i <= min; i++)
3734 if (eptr >= md->end_subject)
3736 SCHECK_PARTIAL();
3737 RRETURN(MATCH_NOMATCH);
3739 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3740 eptr++;
3744 if (min == max) continue;
3746 if (minimize)
3748 #ifdef SUPPORT_UTF
3749 if (utf)
3751 register pcre_uint32 d;
3752 for (fi = min;; fi++)
3754 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3755 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3756 if (fi >= max) RRETURN(MATCH_NOMATCH);
3757 if (eptr >= md->end_subject)
3759 SCHECK_PARTIAL();
3760 RRETURN(MATCH_NOMATCH);
3762 GETCHARINC(d, eptr);
3763 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3766 else
3767 #endif
3768 /* Not UTF mode */
3770 for (fi = min;; fi++)
3772 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3773 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3774 if (fi >= max) RRETURN(MATCH_NOMATCH);
3775 if (eptr >= md->end_subject)
3777 SCHECK_PARTIAL();
3778 RRETURN(MATCH_NOMATCH);
3780 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3781 eptr++;
3784 /* Control never gets here */
3787 /* Maximize case */
3789 else
3791 pp = eptr;
3793 #ifdef SUPPORT_UTF
3794 if (utf)
3796 register pcre_uint32 d;
3797 for (i = min; i < max; i++)
3799 int len = 1;
3800 if (eptr >= md->end_subject)
3802 SCHECK_PARTIAL();
3803 break;
3805 GETCHARLEN(d, eptr, len);
3806 if (fc == d || (unsigned int)foc == d) break;
3807 eptr += len;
3809 if (possessive) continue;
3810 for(;;)
3812 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3813 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3814 if (eptr-- == pp) break; /* Stop if tried at original pos */
3815 BACKCHAR(eptr);
3818 else
3819 #endif
3820 /* Not UTF mode */
3822 for (i = min; i < max; i++)
3824 if (eptr >= md->end_subject)
3826 SCHECK_PARTIAL();
3827 break;
3829 if (fc == *eptr || foc == *eptr) break;
3830 eptr++;
3832 if (possessive) continue;
3833 while (eptr >= pp)
3835 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3836 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3837 eptr--;
3841 RRETURN(MATCH_NOMATCH);
3843 /* Control never gets here */
3846 /* Caseful comparisons */
3848 else
3850 #ifdef SUPPORT_UTF
3851 if (utf)
3853 register pcre_uint32 d;
3854 for (i = 1; i <= min; i++)
3856 if (eptr >= md->end_subject)
3858 SCHECK_PARTIAL();
3859 RRETURN(MATCH_NOMATCH);
3861 GETCHARINC(d, eptr);
3862 if (fc == d) RRETURN(MATCH_NOMATCH);
3865 else
3866 #endif
3867 /* Not UTF mode */
3869 for (i = 1; i <= min; i++)
3871 if (eptr >= md->end_subject)
3873 SCHECK_PARTIAL();
3874 RRETURN(MATCH_NOMATCH);
3876 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3880 if (min == max) continue;
3882 if (minimize)
3884 #ifdef SUPPORT_UTF
3885 if (utf)
3887 register pcre_uint32 d;
3888 for (fi = min;; fi++)
3890 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3891 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3892 if (fi >= max) RRETURN(MATCH_NOMATCH);
3893 if (eptr >= md->end_subject)
3895 SCHECK_PARTIAL();
3896 RRETURN(MATCH_NOMATCH);
3898 GETCHARINC(d, eptr);
3899 if (fc == d) RRETURN(MATCH_NOMATCH);
3902 else
3903 #endif
3904 /* Not UTF mode */
3906 for (fi = min;; fi++)
3908 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3909 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3910 if (fi >= max) RRETURN(MATCH_NOMATCH);
3911 if (eptr >= md->end_subject)
3913 SCHECK_PARTIAL();
3914 RRETURN(MATCH_NOMATCH);
3916 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3919 /* Control never gets here */
3922 /* Maximize case */
3924 else
3926 pp = eptr;
3928 #ifdef SUPPORT_UTF
3929 if (utf)
3931 register pcre_uint32 d;
3932 for (i = min; i < max; i++)
3934 int len = 1;
3935 if (eptr >= md->end_subject)
3937 SCHECK_PARTIAL();
3938 break;
3940 GETCHARLEN(d, eptr, len);
3941 if (fc == d) break;
3942 eptr += len;
3944 if (possessive) continue;
3945 for(;;)
3947 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3948 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3949 if (eptr-- == pp) break; /* Stop if tried at original pos */
3950 BACKCHAR(eptr);
3953 else
3954 #endif
3955 /* Not UTF mode */
3957 for (i = min; i < max; i++)
3959 if (eptr >= md->end_subject)
3961 SCHECK_PARTIAL();
3962 break;
3964 if (fc == *eptr) break;
3965 eptr++;
3967 if (possessive) continue;
3968 while (eptr >= pp)
3970 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3971 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3972 eptr--;
3976 RRETURN(MATCH_NOMATCH);
3979 /* Control never gets here */
3981 /* Match a single character type repeatedly; several different opcodes
3982 share code. This is very similar to the code for single characters, but we
3983 repeat it in the interests of efficiency. */
3985 case OP_TYPEEXACT:
3986 min = max = GET2(ecode, 1);
3987 minimize = TRUE;
3988 ecode += 1 + IMM2_SIZE;
3989 goto REPEATTYPE;
3991 case OP_TYPEUPTO:
3992 case OP_TYPEMINUPTO:
3993 min = 0;
3994 max = GET2(ecode, 1);
3995 minimize = *ecode == OP_TYPEMINUPTO;
3996 ecode += 1 + IMM2_SIZE;
3997 goto REPEATTYPE;
3999 case OP_TYPEPOSSTAR:
4000 possessive = TRUE;
4001 min = 0;
4002 max = INT_MAX;
4003 ecode++;
4004 goto REPEATTYPE;
4006 case OP_TYPEPOSPLUS:
4007 possessive = TRUE;
4008 min = 1;
4009 max = INT_MAX;
4010 ecode++;
4011 goto REPEATTYPE;
4013 case OP_TYPEPOSQUERY:
4014 possessive = TRUE;
4015 min = 0;
4016 max = 1;
4017 ecode++;
4018 goto REPEATTYPE;
4020 case OP_TYPEPOSUPTO:
4021 possessive = TRUE;
4022 min = 0;
4023 max = GET2(ecode, 1);
4024 ecode += 1 + IMM2_SIZE;
4025 goto REPEATTYPE;
4027 case OP_TYPESTAR:
4028 case OP_TYPEMINSTAR:
4029 case OP_TYPEPLUS:
4030 case OP_TYPEMINPLUS:
4031 case OP_TYPEQUERY:
4032 case OP_TYPEMINQUERY:
4033 c = *ecode++ - OP_TYPESTAR;
4034 minimize = (c & 1) != 0;
4035 min = rep_min[c]; /* Pick up values from tables; */
4036 max = rep_max[c]; /* zero for max => infinity */
4037 if (max == 0) max = INT_MAX;
4039 /* Common code for all repeated single character type matches. Note that
4040 in UTF-8 mode, '.' matches a character of any length, but for the other
4041 character types, the valid characters are all one-byte long. */
4043 REPEATTYPE:
4044 ctype = *ecode++; /* Code for the character type */
4046 #ifdef SUPPORT_UCP
4047 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4049 prop_fail_result = ctype == OP_NOTPROP;
4050 prop_type = *ecode++;
4051 prop_value = *ecode++;
4053 else prop_type = -1;
4054 #endif
4056 /* First, ensure the minimum number of matches are present. Use inline
4057 code for maximizing the speed, and do the type test once at the start
4058 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4059 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4060 and single-bytes. */
4062 if (min > 0)
4064 #ifdef SUPPORT_UCP
4065 if (prop_type >= 0)
4067 switch(prop_type)
4069 case PT_ANY:
4070 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4071 for (i = 1; i <= min; i++)
4073 if (eptr >= md->end_subject)
4075 SCHECK_PARTIAL();
4076 RRETURN(MATCH_NOMATCH);
4078 GETCHARINCTEST(c, eptr);
4080 break;
4082 case PT_LAMP:
4083 for (i = 1; i <= min; i++)
4085 int chartype;
4086 if (eptr >= md->end_subject)
4088 SCHECK_PARTIAL();
4089 RRETURN(MATCH_NOMATCH);
4091 GETCHARINCTEST(c, eptr);
4092 chartype = UCD_CHARTYPE(c);
4093 if ((chartype == ucp_Lu ||
4094 chartype == ucp_Ll ||
4095 chartype == ucp_Lt) == prop_fail_result)
4096 RRETURN(MATCH_NOMATCH);
4098 break;
4100 case PT_GC:
4101 for (i = 1; i <= min; i++)
4103 if (eptr >= md->end_subject)
4105 SCHECK_PARTIAL();
4106 RRETURN(MATCH_NOMATCH);
4108 GETCHARINCTEST(c, eptr);
4109 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4110 RRETURN(MATCH_NOMATCH);
4112 break;
4114 case PT_PC:
4115 for (i = 1; i <= min; i++)
4117 if (eptr >= md->end_subject)
4119 SCHECK_PARTIAL();
4120 RRETURN(MATCH_NOMATCH);
4122 GETCHARINCTEST(c, eptr);
4123 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4124 RRETURN(MATCH_NOMATCH);
4126 break;
4128 case PT_SC:
4129 for (i = 1; i <= min; i++)
4131 if (eptr >= md->end_subject)
4133 SCHECK_PARTIAL();
4134 RRETURN(MATCH_NOMATCH);
4136 GETCHARINCTEST(c, eptr);
4137 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4138 RRETURN(MATCH_NOMATCH);
4140 break;
4142 case PT_ALNUM:
4143 for (i = 1; i <= min; i++)
4145 int category;
4146 if (eptr >= md->end_subject)
4148 SCHECK_PARTIAL();
4149 RRETURN(MATCH_NOMATCH);
4151 GETCHARINCTEST(c, eptr);
4152 category = UCD_CATEGORY(c);
4153 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4154 RRETURN(MATCH_NOMATCH);
4156 break;
4158 case PT_SPACE: /* Perl space */
4159 for (i = 1; i <= min; i++)
4161 if (eptr >= md->end_subject)
4163 SCHECK_PARTIAL();
4164 RRETURN(MATCH_NOMATCH);
4166 GETCHARINCTEST(c, eptr);
4167 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4168 c == CHAR_FF || c == CHAR_CR)
4169 == prop_fail_result)
4170 RRETURN(MATCH_NOMATCH);
4172 break;
4174 case PT_PXSPACE: /* POSIX space */
4175 for (i = 1; i <= min; i++)
4177 if (eptr >= md->end_subject)
4179 SCHECK_PARTIAL();
4180 RRETURN(MATCH_NOMATCH);
4182 GETCHARINCTEST(c, eptr);
4183 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4184 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4185 == prop_fail_result)
4186 RRETURN(MATCH_NOMATCH);
4188 break;
4190 case PT_WORD:
4191 for (i = 1; i <= min; i++)
4193 int category;
4194 if (eptr >= md->end_subject)
4196 SCHECK_PARTIAL();
4197 RRETURN(MATCH_NOMATCH);
4199 GETCHARINCTEST(c, eptr);
4200 category = UCD_CATEGORY(c);
4201 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4202 == prop_fail_result)
4203 RRETURN(MATCH_NOMATCH);
4205 break;
4207 case PT_CLIST:
4208 for (i = 1; i <= min; i++)
4210 const pcre_uint32 *cp;
4211 if (eptr >= md->end_subject)
4213 SCHECK_PARTIAL();
4214 RRETURN(MATCH_NOMATCH);
4216 GETCHARINCTEST(c, eptr);
4217 cp = PRIV(ucd_caseless_sets) + prop_value;
4218 for (;;)
4220 if (c < *cp)
4221 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4222 if (c == *cp++)
4223 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4226 break;
4228 /* This should not occur */
4230 default:
4231 RRETURN(PCRE_ERROR_INTERNAL);
4235 /* Match extended Unicode sequences. We will get here only if the
4236 support is in the binary; otherwise a compile-time error occurs. */
4238 else if (ctype == OP_EXTUNI)
4240 for (i = 1; i <= min; i++)
4242 if (eptr >= md->end_subject)
4244 SCHECK_PARTIAL();
4245 RRETURN(MATCH_NOMATCH);
4247 else
4249 int lgb, rgb;
4250 GETCHARINCTEST(c, eptr);
4251 lgb = UCD_GRAPHBREAK(c);
4252 while (eptr < md->end_subject)
4254 int len = 1;
4255 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4256 rgb = UCD_GRAPHBREAK(c);
4257 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4258 lgb = rgb;
4259 eptr += len;
4262 CHECK_PARTIAL();
4266 else
4267 #endif /* SUPPORT_UCP */
4269 /* Handle all other cases when the coding is UTF-8 */
4271 #ifdef SUPPORT_UTF
4272 if (utf) switch(ctype)
4274 case OP_ANY:
4275 for (i = 1; i <= min; i++)
4277 if (eptr >= md->end_subject)
4279 SCHECK_PARTIAL();
4280 RRETURN(MATCH_NOMATCH);
4282 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4283 if (md->partial != 0 &&
4284 eptr + 1 >= md->end_subject &&
4285 NLBLOCK->nltype == NLTYPE_FIXED &&
4286 NLBLOCK->nllen == 2 &&
4287 RAWUCHAR(eptr) == NLBLOCK->nl[0])
4289 md->hitend = TRUE;
4290 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4292 eptr++;
4293 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4295 break;
4297 case OP_ALLANY:
4298 for (i = 1; i <= min; i++)
4300 if (eptr >= md->end_subject)
4302 SCHECK_PARTIAL();
4303 RRETURN(MATCH_NOMATCH);
4305 eptr++;
4306 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4308 break;
4310 case OP_ANYBYTE:
4311 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4312 eptr += min;
4313 break;
4315 case OP_ANYNL:
4316 for (i = 1; i <= min; i++)
4318 if (eptr >= md->end_subject)
4320 SCHECK_PARTIAL();
4321 RRETURN(MATCH_NOMATCH);
4323 GETCHARINC(c, eptr);
4324 switch(c)
4326 default: RRETURN(MATCH_NOMATCH);
4328 case CHAR_CR:
4329 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
4330 break;
4332 case CHAR_LF:
4333 break;
4335 case CHAR_VT:
4336 case CHAR_FF:
4337 case CHAR_NEL:
4338 #ifndef EBCDIC
4339 case 0x2028:
4340 case 0x2029:
4341 #endif /* Not EBCDIC */
4342 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4343 break;
4346 break;
4348 case OP_NOT_HSPACE:
4349 for (i = 1; i <= min; i++)
4351 if (eptr >= md->end_subject)
4353 SCHECK_PARTIAL();
4354 RRETURN(MATCH_NOMATCH);
4356 GETCHARINC(c, eptr);
4357 switch(c)
4359 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4360 default: break;
4363 break;
4365 case OP_HSPACE:
4366 for (i = 1; i <= min; i++)
4368 if (eptr >= md->end_subject)
4370 SCHECK_PARTIAL();
4371 RRETURN(MATCH_NOMATCH);
4373 GETCHARINC(c, eptr);
4374 switch(c)
4376 HSPACE_CASES: break; /* Byte and multibyte cases */
4377 default: RRETURN(MATCH_NOMATCH);
4380 break;
4382 case OP_NOT_VSPACE:
4383 for (i = 1; i <= min; i++)
4385 if (eptr >= md->end_subject)
4387 SCHECK_PARTIAL();
4388 RRETURN(MATCH_NOMATCH);
4390 GETCHARINC(c, eptr);
4391 switch(c)
4393 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4394 default: break;
4397 break;
4399 case OP_VSPACE:
4400 for (i = 1; i <= min; i++)
4402 if (eptr >= md->end_subject)
4404 SCHECK_PARTIAL();
4405 RRETURN(MATCH_NOMATCH);
4407 GETCHARINC(c, eptr);
4408 switch(c)
4410 VSPACE_CASES: break;
4411 default: RRETURN(MATCH_NOMATCH);
4414 break;
4416 case OP_NOT_DIGIT:
4417 for (i = 1; i <= min; i++)
4419 if (eptr >= md->end_subject)
4421 SCHECK_PARTIAL();
4422 RRETURN(MATCH_NOMATCH);
4424 GETCHARINC(c, eptr);
4425 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4426 RRETURN(MATCH_NOMATCH);
4428 break;
4430 case OP_DIGIT:
4431 for (i = 1; i <= min; i++)
4433 pcre_uchar cc;
4435 if (eptr >= md->end_subject)
4437 SCHECK_PARTIAL();
4438 RRETURN(MATCH_NOMATCH);
4440 cc = RAWUCHAR(eptr);
4441 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4442 RRETURN(MATCH_NOMATCH);
4443 eptr++;
4444 /* No need to skip more bytes - we know it's a 1-byte character */
4446 break;
4448 case OP_NOT_WHITESPACE:
4449 for (i = 1; i <= min; i++)
4451 pcre_uchar cc;
4453 if (eptr >= md->end_subject)
4455 SCHECK_PARTIAL();
4456 RRETURN(MATCH_NOMATCH);
4458 cc = RAWUCHAR(eptr);
4459 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4460 RRETURN(MATCH_NOMATCH);
4461 eptr++;
4462 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4464 break;
4466 case OP_WHITESPACE:
4467 for (i = 1; i <= min; i++)
4469 pcre_uchar cc;
4471 if (eptr >= md->end_subject)
4473 SCHECK_PARTIAL();
4474 RRETURN(MATCH_NOMATCH);
4476 cc = RAWUCHAR(eptr);
4477 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4478 RRETURN(MATCH_NOMATCH);
4479 eptr++;
4480 /* No need to skip more bytes - we know it's a 1-byte character */
4482 break;
4484 case OP_NOT_WORDCHAR:
4485 for (i = 1; i <= min; i++)
4487 pcre_uchar cc;
4489 if (eptr >= md->end_subject)
4491 SCHECK_PARTIAL();
4492 RRETURN(MATCH_NOMATCH);
4494 cc = RAWUCHAR(eptr);
4495 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4496 RRETURN(MATCH_NOMATCH);
4497 eptr++;
4498 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4500 break;
4502 case OP_WORDCHAR:
4503 for (i = 1; i <= min; i++)
4505 pcre_uchar cc;
4507 if (eptr >= md->end_subject)
4509 SCHECK_PARTIAL();
4510 RRETURN(MATCH_NOMATCH);
4512 cc = RAWUCHAR(eptr);
4513 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4514 RRETURN(MATCH_NOMATCH);
4515 eptr++;
4516 /* No need to skip more bytes - we know it's a 1-byte character */
4518 break;
4520 default:
4521 RRETURN(PCRE_ERROR_INTERNAL);
4522 } /* End switch(ctype) */
4524 else
4525 #endif /* SUPPORT_UTF */
4527 /* Code for the non-UTF-8 case for minimum matching of operators other
4528 than OP_PROP and OP_NOTPROP. */
4530 switch(ctype)
4532 case OP_ANY:
4533 for (i = 1; i <= min; i++)
4535 if (eptr >= md->end_subject)
4537 SCHECK_PARTIAL();
4538 RRETURN(MATCH_NOMATCH);
4540 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4541 if (md->partial != 0 &&
4542 eptr + 1 >= md->end_subject &&
4543 NLBLOCK->nltype == NLTYPE_FIXED &&
4544 NLBLOCK->nllen == 2 &&
4545 *eptr == NLBLOCK->nl[0])
4547 md->hitend = TRUE;
4548 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4550 eptr++;
4552 break;
4554 case OP_ALLANY:
4555 if (eptr > md->end_subject - min)
4557 SCHECK_PARTIAL();
4558 RRETURN(MATCH_NOMATCH);
4560 eptr += min;
4561 break;
4563 case OP_ANYBYTE:
4564 if (eptr > md->end_subject - min)
4566 SCHECK_PARTIAL();
4567 RRETURN(MATCH_NOMATCH);
4569 eptr += min;
4570 break;
4572 case OP_ANYNL:
4573 for (i = 1; i <= min; i++)
4575 if (eptr >= md->end_subject)
4577 SCHECK_PARTIAL();
4578 RRETURN(MATCH_NOMATCH);
4580 switch(*eptr++)
4582 default: RRETURN(MATCH_NOMATCH);
4584 case CHAR_CR:
4585 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4586 break;
4588 case CHAR_LF:
4589 break;
4591 case CHAR_VT:
4592 case CHAR_FF:
4593 case CHAR_NEL:
4594 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4595 case 0x2028:
4596 case 0x2029:
4597 #endif
4598 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4599 break;
4602 break;
4604 case OP_NOT_HSPACE:
4605 for (i = 1; i <= min; i++)
4607 if (eptr >= md->end_subject)
4609 SCHECK_PARTIAL();
4610 RRETURN(MATCH_NOMATCH);
4612 switch(*eptr++)
4614 default: break;
4615 HSPACE_BYTE_CASES:
4616 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4617 HSPACE_MULTIBYTE_CASES:
4618 #endif
4619 RRETURN(MATCH_NOMATCH);
4622 break;
4624 case OP_HSPACE:
4625 for (i = 1; i <= min; i++)
4627 if (eptr >= md->end_subject)
4629 SCHECK_PARTIAL();
4630 RRETURN(MATCH_NOMATCH);
4632 switch(*eptr++)
4634 default: RRETURN(MATCH_NOMATCH);
4635 HSPACE_BYTE_CASES:
4636 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4637 HSPACE_MULTIBYTE_CASES:
4638 #endif
4639 break;
4642 break;
4644 case OP_NOT_VSPACE:
4645 for (i = 1; i <= min; i++)
4647 if (eptr >= md->end_subject)
4649 SCHECK_PARTIAL();
4650 RRETURN(MATCH_NOMATCH);
4652 switch(*eptr++)
4654 VSPACE_BYTE_CASES:
4655 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4656 VSPACE_MULTIBYTE_CASES:
4657 #endif
4658 RRETURN(MATCH_NOMATCH);
4659 default: break;
4662 break;
4664 case OP_VSPACE:
4665 for (i = 1; i <= min; i++)
4667 if (eptr >= md->end_subject)
4669 SCHECK_PARTIAL();
4670 RRETURN(MATCH_NOMATCH);
4672 switch(*eptr++)
4674 default: RRETURN(MATCH_NOMATCH);
4675 VSPACE_BYTE_CASES:
4676 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4677 VSPACE_MULTIBYTE_CASES:
4678 #endif
4679 break;
4682 break;
4684 case OP_NOT_DIGIT:
4685 for (i = 1; i <= min; i++)
4687 if (eptr >= md->end_subject)
4689 SCHECK_PARTIAL();
4690 RRETURN(MATCH_NOMATCH);
4692 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4693 RRETURN(MATCH_NOMATCH);
4694 eptr++;
4696 break;
4698 case OP_DIGIT:
4699 for (i = 1; i <= min; i++)
4701 if (eptr >= md->end_subject)
4703 SCHECK_PARTIAL();
4704 RRETURN(MATCH_NOMATCH);
4706 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4707 RRETURN(MATCH_NOMATCH);
4708 eptr++;
4710 break;
4712 case OP_NOT_WHITESPACE:
4713 for (i = 1; i <= min; i++)
4715 if (eptr >= md->end_subject)
4717 SCHECK_PARTIAL();
4718 RRETURN(MATCH_NOMATCH);
4720 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4721 RRETURN(MATCH_NOMATCH);
4722 eptr++;
4724 break;
4726 case OP_WHITESPACE:
4727 for (i = 1; i <= min; i++)
4729 if (eptr >= md->end_subject)
4731 SCHECK_PARTIAL();
4732 RRETURN(MATCH_NOMATCH);
4734 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4735 RRETURN(MATCH_NOMATCH);
4736 eptr++;
4738 break;
4740 case OP_NOT_WORDCHAR:
4741 for (i = 1; i <= min; i++)
4743 if (eptr >= md->end_subject)
4745 SCHECK_PARTIAL();
4746 RRETURN(MATCH_NOMATCH);
4748 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4749 RRETURN(MATCH_NOMATCH);
4750 eptr++;
4752 break;
4754 case OP_WORDCHAR:
4755 for (i = 1; i <= min; i++)
4757 if (eptr >= md->end_subject)
4759 SCHECK_PARTIAL();
4760 RRETURN(MATCH_NOMATCH);
4762 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4763 RRETURN(MATCH_NOMATCH);
4764 eptr++;
4766 break;
4768 default:
4769 RRETURN(PCRE_ERROR_INTERNAL);
4773 /* If min = max, continue at the same level without recursing */
4775 if (min == max) continue;
4777 /* If minimizing, we have to test the rest of the pattern before each
4778 subsequent match. Again, separate the UTF-8 case for speed, and also
4779 separate the UCP cases. */
4781 if (minimize)
4783 #ifdef SUPPORT_UCP
4784 if (prop_type >= 0)
4786 switch(prop_type)
4788 case PT_ANY:
4789 for (fi = min;; fi++)
4791 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4792 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4793 if (fi >= max) RRETURN(MATCH_NOMATCH);
4794 if (eptr >= md->end_subject)
4796 SCHECK_PARTIAL();
4797 RRETURN(MATCH_NOMATCH);
4799 GETCHARINCTEST(c, eptr);
4800 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4802 /* Control never gets here */
4804 case PT_LAMP:
4805 for (fi = min;; fi++)
4807 int chartype;
4808 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4809 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4810 if (fi >= max) RRETURN(MATCH_NOMATCH);
4811 if (eptr >= md->end_subject)
4813 SCHECK_PARTIAL();
4814 RRETURN(MATCH_NOMATCH);
4816 GETCHARINCTEST(c, eptr);
4817 chartype = UCD_CHARTYPE(c);
4818 if ((chartype == ucp_Lu ||
4819 chartype == ucp_Ll ||
4820 chartype == ucp_Lt) == prop_fail_result)
4821 RRETURN(MATCH_NOMATCH);
4823 /* Control never gets here */
4825 case PT_GC:
4826 for (fi = min;; fi++)
4828 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4829 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4830 if (fi >= max) RRETURN(MATCH_NOMATCH);
4831 if (eptr >= md->end_subject)
4833 SCHECK_PARTIAL();
4834 RRETURN(MATCH_NOMATCH);
4836 GETCHARINCTEST(c, eptr);
4837 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4838 RRETURN(MATCH_NOMATCH);
4840 /* Control never gets here */
4842 case PT_PC:
4843 for (fi = min;; fi++)
4845 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4846 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4847 if (fi >= max) RRETURN(MATCH_NOMATCH);
4848 if (eptr >= md->end_subject)
4850 SCHECK_PARTIAL();
4851 RRETURN(MATCH_NOMATCH);
4853 GETCHARINCTEST(c, eptr);
4854 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4855 RRETURN(MATCH_NOMATCH);
4857 /* Control never gets here */
4859 case PT_SC:
4860 for (fi = min;; fi++)
4862 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4863 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4864 if (fi >= max) RRETURN(MATCH_NOMATCH);
4865 if (eptr >= md->end_subject)
4867 SCHECK_PARTIAL();
4868 RRETURN(MATCH_NOMATCH);
4870 GETCHARINCTEST(c, eptr);
4871 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4872 RRETURN(MATCH_NOMATCH);
4874 /* Control never gets here */
4876 case PT_ALNUM:
4877 for (fi = min;; fi++)
4879 int category;
4880 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4881 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4882 if (fi >= max) RRETURN(MATCH_NOMATCH);
4883 if (eptr >= md->end_subject)
4885 SCHECK_PARTIAL();
4886 RRETURN(MATCH_NOMATCH);
4888 GETCHARINCTEST(c, eptr);
4889 category = UCD_CATEGORY(c);
4890 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4891 RRETURN(MATCH_NOMATCH);
4893 /* Control never gets here */
4895 case PT_SPACE: /* Perl space */
4896 for (fi = min;; fi++)
4898 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4899 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4900 if (fi >= max) RRETURN(MATCH_NOMATCH);
4901 if (eptr >= md->end_subject)
4903 SCHECK_PARTIAL();
4904 RRETURN(MATCH_NOMATCH);
4906 GETCHARINCTEST(c, eptr);
4907 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4908 c == CHAR_FF || c == CHAR_CR)
4909 == prop_fail_result)
4910 RRETURN(MATCH_NOMATCH);
4912 /* Control never gets here */
4914 case PT_PXSPACE: /* POSIX space */
4915 for (fi = min;; fi++)
4917 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4918 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4919 if (fi >= max) RRETURN(MATCH_NOMATCH);
4920 if (eptr >= md->end_subject)
4922 SCHECK_PARTIAL();
4923 RRETURN(MATCH_NOMATCH);
4925 GETCHARINCTEST(c, eptr);
4926 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4927 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4928 == prop_fail_result)
4929 RRETURN(MATCH_NOMATCH);
4931 /* Control never gets here */
4933 case PT_WORD:
4934 for (fi = min;; fi++)
4936 int category;
4937 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4938 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4939 if (fi >= max) RRETURN(MATCH_NOMATCH);
4940 if (eptr >= md->end_subject)
4942 SCHECK_PARTIAL();
4943 RRETURN(MATCH_NOMATCH);
4945 GETCHARINCTEST(c, eptr);
4946 category = UCD_CATEGORY(c);
4947 if ((category == ucp_L ||
4948 category == ucp_N ||
4949 c == CHAR_UNDERSCORE)
4950 == prop_fail_result)
4951 RRETURN(MATCH_NOMATCH);
4953 /* Control never gets here */
4955 case PT_CLIST:
4956 for (fi = min;; fi++)
4958 const pcre_uint32 *cp;
4959 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
4960 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4961 if (fi >= max) RRETURN(MATCH_NOMATCH);
4962 if (eptr >= md->end_subject)
4964 SCHECK_PARTIAL();
4965 RRETURN(MATCH_NOMATCH);
4967 GETCHARINCTEST(c, eptr);
4968 cp = PRIV(ucd_caseless_sets) + prop_value;
4969 for (;;)
4971 if (c < *cp)
4972 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4973 if (c == *cp++)
4974 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4977 /* Control never gets here */
4979 /* This should never occur */
4980 default:
4981 RRETURN(PCRE_ERROR_INTERNAL);
4985 /* Match extended Unicode sequences. We will get here only if the
4986 support is in the binary; otherwise a compile-time error occurs. */
4988 else if (ctype == OP_EXTUNI)
4990 for (fi = min;; fi++)
4992 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4993 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4994 if (fi >= max) RRETURN(MATCH_NOMATCH);
4995 if (eptr >= md->end_subject)
4997 SCHECK_PARTIAL();
4998 RRETURN(MATCH_NOMATCH);
5000 else
5002 int lgb, rgb;
5003 GETCHARINCTEST(c, eptr);
5004 lgb = UCD_GRAPHBREAK(c);
5005 while (eptr < md->end_subject)
5007 int len = 1;
5008 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5009 rgb = UCD_GRAPHBREAK(c);
5010 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5011 lgb = rgb;
5012 eptr += len;
5015 CHECK_PARTIAL();
5018 else
5019 #endif /* SUPPORT_UCP */
5021 #ifdef SUPPORT_UTF
5022 if (utf)
5024 for (fi = min;; fi++)
5026 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5027 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5028 if (fi >= max) RRETURN(MATCH_NOMATCH);
5029 if (eptr >= md->end_subject)
5031 SCHECK_PARTIAL();
5032 RRETURN(MATCH_NOMATCH);
5034 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5035 RRETURN(MATCH_NOMATCH);
5036 GETCHARINC(c, eptr);
5037 switch(ctype)
5039 case OP_ANY: /* This is the non-NL case */
5040 if (md->partial != 0 && /* Take care with CRLF partial */
5041 eptr >= md->end_subject &&
5042 NLBLOCK->nltype == NLTYPE_FIXED &&
5043 NLBLOCK->nllen == 2 &&
5044 c == NLBLOCK->nl[0])
5046 md->hitend = TRUE;
5047 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5049 break;
5051 case OP_ALLANY:
5052 case OP_ANYBYTE:
5053 break;
5055 case OP_ANYNL:
5056 switch(c)
5058 default: RRETURN(MATCH_NOMATCH);
5059 case CHAR_CR:
5060 if (eptr < md->end_subject && RAWUCHAR(eptr) == CHAR_LF) eptr++;
5061 break;
5063 case CHAR_LF:
5064 break;
5066 case CHAR_VT:
5067 case CHAR_FF:
5068 case CHAR_NEL:
5069 #ifndef EBCDIC
5070 case 0x2028:
5071 case 0x2029:
5072 #endif /* Not EBCDIC */
5073 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5074 break;
5076 break;
5078 case OP_NOT_HSPACE:
5079 switch(c)
5081 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5082 default: break;
5084 break;
5086 case OP_HSPACE:
5087 switch(c)
5089 HSPACE_CASES: break;
5090 default: RRETURN(MATCH_NOMATCH);
5092 break;
5094 case OP_NOT_VSPACE:
5095 switch(c)
5097 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5098 default: break;
5100 break;
5102 case OP_VSPACE:
5103 switch(c)
5105 VSPACE_CASES: break;
5106 default: RRETURN(MATCH_NOMATCH);
5108 break;
5110 case OP_NOT_DIGIT:
5111 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5112 RRETURN(MATCH_NOMATCH);
5113 break;
5115 case OP_DIGIT:
5116 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5117 RRETURN(MATCH_NOMATCH);
5118 break;
5120 case OP_NOT_WHITESPACE:
5121 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5122 RRETURN(MATCH_NOMATCH);
5123 break;
5125 case OP_WHITESPACE:
5126 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5127 RRETURN(MATCH_NOMATCH);
5128 break;
5130 case OP_NOT_WORDCHAR:
5131 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5132 RRETURN(MATCH_NOMATCH);
5133 break;
5135 case OP_WORDCHAR:
5136 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5137 RRETURN(MATCH_NOMATCH);
5138 break;
5140 default:
5141 RRETURN(PCRE_ERROR_INTERNAL);
5145 else
5146 #endif
5147 /* Not UTF mode */
5149 for (fi = min;; fi++)
5151 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5152 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5153 if (fi >= max) RRETURN(MATCH_NOMATCH);
5154 if (eptr >= md->end_subject)
5156 SCHECK_PARTIAL();
5157 RRETURN(MATCH_NOMATCH);
5159 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5160 RRETURN(MATCH_NOMATCH);
5161 c = *eptr++;
5162 switch(ctype)
5164 case OP_ANY: /* This is the non-NL case */
5165 if (md->partial != 0 && /* Take care with CRLF partial */
5166 eptr >= md->end_subject &&
5167 NLBLOCK->nltype == NLTYPE_FIXED &&
5168 NLBLOCK->nllen == 2 &&
5169 c == NLBLOCK->nl[0])
5171 md->hitend = TRUE;
5172 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5174 break;
5176 case OP_ALLANY:
5177 case OP_ANYBYTE:
5178 break;
5180 case OP_ANYNL:
5181 switch(c)
5183 default: RRETURN(MATCH_NOMATCH);
5184 case CHAR_CR:
5185 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5186 break;
5188 case CHAR_LF:
5189 break;
5191 case CHAR_VT:
5192 case CHAR_FF:
5193 case CHAR_NEL:
5194 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5195 case 0x2028:
5196 case 0x2029:
5197 #endif
5198 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5199 break;
5201 break;
5203 case OP_NOT_HSPACE:
5204 switch(c)
5206 default: break;
5207 HSPACE_BYTE_CASES:
5208 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5209 HSPACE_MULTIBYTE_CASES:
5210 #endif
5211 RRETURN(MATCH_NOMATCH);
5213 break;
5215 case OP_HSPACE:
5216 switch(c)
5218 default: RRETURN(MATCH_NOMATCH);
5219 HSPACE_BYTE_CASES:
5220 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5221 HSPACE_MULTIBYTE_CASES:
5222 #endif
5223 break;
5225 break;
5227 case OP_NOT_VSPACE:
5228 switch(c)
5230 default: break;
5231 VSPACE_BYTE_CASES:
5232 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5233 VSPACE_MULTIBYTE_CASES:
5234 #endif
5235 RRETURN(MATCH_NOMATCH);
5237 break;
5239 case OP_VSPACE:
5240 switch(c)
5242 default: RRETURN(MATCH_NOMATCH);
5243 VSPACE_BYTE_CASES:
5244 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5245 VSPACE_MULTIBYTE_CASES:
5246 #endif
5247 break;
5249 break;
5251 case OP_NOT_DIGIT:
5252 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5253 break;
5255 case OP_DIGIT:
5256 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5257 break;
5259 case OP_NOT_WHITESPACE:
5260 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5261 break;
5263 case OP_WHITESPACE:
5264 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5265 break;
5267 case OP_NOT_WORDCHAR:
5268 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5269 break;
5271 case OP_WORDCHAR:
5272 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5273 break;
5275 default:
5276 RRETURN(PCRE_ERROR_INTERNAL);
5280 /* Control never gets here */
5283 /* If maximizing, it is worth using inline code for speed, doing the type
5284 test once at the start (i.e. keep it out of the loop). Again, keep the
5285 UTF-8 and UCP stuff separate. */
5287 else
5289 pp = eptr; /* Remember where we started */
5291 #ifdef SUPPORT_UCP
5292 if (prop_type >= 0)
5294 switch(prop_type)
5296 case PT_ANY:
5297 for (i = min; i < max; i++)
5299 int len = 1;
5300 if (eptr >= md->end_subject)
5302 SCHECK_PARTIAL();
5303 break;
5305 GETCHARLENTEST(c, eptr, len);
5306 if (prop_fail_result) break;
5307 eptr+= len;
5309 break;
5311 case PT_LAMP:
5312 for (i = min; i < max; i++)
5314 int chartype;
5315 int len = 1;
5316 if (eptr >= md->end_subject)
5318 SCHECK_PARTIAL();
5319 break;
5321 GETCHARLENTEST(c, eptr, len);
5322 chartype = UCD_CHARTYPE(c);
5323 if ((chartype == ucp_Lu ||
5324 chartype == ucp_Ll ||
5325 chartype == ucp_Lt) == prop_fail_result)
5326 break;
5327 eptr+= len;
5329 break;
5331 case PT_GC:
5332 for (i = min; i < max; i++)
5334 int len = 1;
5335 if (eptr >= md->end_subject)
5337 SCHECK_PARTIAL();
5338 break;
5340 GETCHARLENTEST(c, eptr, len);
5341 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5342 eptr+= len;
5344 break;
5346 case PT_PC:
5347 for (i = min; i < max; i++)
5349 int len = 1;
5350 if (eptr >= md->end_subject)
5352 SCHECK_PARTIAL();
5353 break;
5355 GETCHARLENTEST(c, eptr, len);
5356 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5357 eptr+= len;
5359 break;
5361 case PT_SC:
5362 for (i = min; i < max; i++)
5364 int len = 1;
5365 if (eptr >= md->end_subject)
5367 SCHECK_PARTIAL();
5368 break;
5370 GETCHARLENTEST(c, eptr, len);
5371 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5372 eptr+= len;
5374 break;
5376 case PT_ALNUM:
5377 for (i = min; i < max; i++)
5379 int category;
5380 int len = 1;
5381 if (eptr >= md->end_subject)
5383 SCHECK_PARTIAL();
5384 break;
5386 GETCHARLENTEST(c, eptr, len);
5387 category = UCD_CATEGORY(c);
5388 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5389 break;
5390 eptr+= len;
5392 break;
5394 case PT_SPACE: /* Perl space */
5395 for (i = min; i < max; i++)
5397 int len = 1;
5398 if (eptr >= md->end_subject)
5400 SCHECK_PARTIAL();
5401 break;
5403 GETCHARLENTEST(c, eptr, len);
5404 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5405 c == CHAR_FF || c == CHAR_CR)
5406 == prop_fail_result)
5407 break;
5408 eptr+= len;
5410 break;
5412 case PT_PXSPACE: /* POSIX space */
5413 for (i = min; i < max; i++)
5415 int len = 1;
5416 if (eptr >= md->end_subject)
5418 SCHECK_PARTIAL();
5419 break;
5421 GETCHARLENTEST(c, eptr, len);
5422 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5423 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5424 == prop_fail_result)
5425 break;
5426 eptr+= len;
5428 break;
5430 case PT_WORD:
5431 for (i = min; i < max; i++)
5433 int category;
5434 int len = 1;
5435 if (eptr >= md->end_subject)
5437 SCHECK_PARTIAL();
5438 break;
5440 GETCHARLENTEST(c, eptr, len);
5441 category = UCD_CATEGORY(c);
5442 if ((category == ucp_L || category == ucp_N ||
5443 c == CHAR_UNDERSCORE) == prop_fail_result)
5444 break;
5445 eptr+= len;
5447 break;
5449 case PT_CLIST:
5450 for (i = min; i < max; i++)
5452 const pcre_uint32 *cp;
5453 int len = 1;
5454 if (eptr >= md->end_subject)
5456 SCHECK_PARTIAL();
5457 break;
5459 GETCHARLENTEST(c, eptr, len);
5460 cp = PRIV(ucd_caseless_sets) + prop_value;
5461 for (;;)
5463 if (c < *cp)
5464 { if (prop_fail_result) break; else goto GOT_MAX; }
5465 if (c == *cp++)
5466 { if (prop_fail_result) goto GOT_MAX; else break; }
5468 eptr += len;
5470 GOT_MAX:
5471 break;
5473 default:
5474 RRETURN(PCRE_ERROR_INTERNAL);
5477 /* eptr is now past the end of the maximum run */
5479 if (possessive) continue;
5480 for(;;)
5482 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5483 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5484 if (eptr-- == pp) break; /* Stop if tried at original pos */
5485 if (utf) BACKCHAR(eptr);
5489 /* Match extended Unicode sequences. We will get here only if the
5490 support is in the binary; otherwise a compile-time error occurs. */
5492 else if (ctype == OP_EXTUNI)
5494 for (i = min; i < max; i++)
5496 if (eptr >= md->end_subject)
5498 SCHECK_PARTIAL();
5499 break;
5501 else
5503 int lgb, rgb;
5504 GETCHARINCTEST(c, eptr);
5505 lgb = UCD_GRAPHBREAK(c);
5506 while (eptr < md->end_subject)
5508 int len = 1;
5509 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5510 rgb = UCD_GRAPHBREAK(c);
5511 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5512 lgb = rgb;
5513 eptr += len;
5516 CHECK_PARTIAL();
5519 /* eptr is now past the end of the maximum run */
5521 if (possessive) continue;
5523 for(;;)
5525 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5526 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5527 if (eptr-- == pp) break; /* Stop if tried at original pos */
5528 for (;;) /* Move back over one extended */
5530 if (!utf) c = *eptr; else
5532 BACKCHAR(eptr);
5533 GETCHAR(c, eptr);
5535 if (UCD_CATEGORY(c) != ucp_M) break;
5536 eptr--;
5541 else
5542 #endif /* SUPPORT_UCP */
5544 #ifdef SUPPORT_UTF
5545 if (utf)
5547 switch(ctype)
5549 case OP_ANY:
5550 if (max < INT_MAX)
5552 for (i = min; i < max; i++)
5554 if (eptr >= md->end_subject)
5556 SCHECK_PARTIAL();
5557 break;
5559 if (IS_NEWLINE(eptr)) break;
5560 if (md->partial != 0 && /* Take care with CRLF partial */
5561 eptr + 1 >= md->end_subject &&
5562 NLBLOCK->nltype == NLTYPE_FIXED &&
5563 NLBLOCK->nllen == 2 &&
5564 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5566 md->hitend = TRUE;
5567 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5569 eptr++;
5570 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5574 /* Handle unlimited UTF-8 repeat */
5576 else
5578 for (i = min; i < max; i++)
5580 if (eptr >= md->end_subject)
5582 SCHECK_PARTIAL();
5583 break;
5585 if (IS_NEWLINE(eptr)) break;
5586 if (md->partial != 0 && /* Take care with CRLF partial */
5587 eptr + 1 >= md->end_subject &&
5588 NLBLOCK->nltype == NLTYPE_FIXED &&
5589 NLBLOCK->nllen == 2 &&
5590 RAWUCHAR(eptr) == NLBLOCK->nl[0])
5592 md->hitend = TRUE;
5593 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5595 eptr++;
5596 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5599 break;
5601 case OP_ALLANY:
5602 if (max < INT_MAX)
5604 for (i = min; i < max; i++)
5606 if (eptr >= md->end_subject)
5608 SCHECK_PARTIAL();
5609 break;
5611 eptr++;
5612 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5615 else
5617 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5618 SCHECK_PARTIAL();
5620 break;
5622 /* The byte case is the same as non-UTF8 */
5624 case OP_ANYBYTE:
5625 c = max - min;
5626 if (c > (unsigned int)(md->end_subject - eptr))
5628 eptr = md->end_subject;
5629 SCHECK_PARTIAL();
5631 else eptr += c;
5632 break;
5634 case OP_ANYNL:
5635 for (i = min; i < max; i++)
5637 int len = 1;
5638 if (eptr >= md->end_subject)
5640 SCHECK_PARTIAL();
5641 break;
5643 GETCHARLEN(c, eptr, len);
5644 if (c == CHAR_CR)
5646 if (++eptr >= md->end_subject) break;
5647 if (RAWUCHAR(eptr) == CHAR_LF) eptr++;
5649 else
5651 if (c != CHAR_LF &&
5652 (md->bsr_anycrlf ||
5653 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5654 #ifndef EBCDIC
5655 && c != 0x2028 && c != 0x2029
5656 #endif /* Not EBCDIC */
5658 break;
5659 eptr += len;
5662 break;
5664 case OP_NOT_HSPACE:
5665 case OP_HSPACE:
5666 for (i = min; i < max; i++)
5668 BOOL gotspace;
5669 int len = 1;
5670 if (eptr >= md->end_subject)
5672 SCHECK_PARTIAL();
5673 break;
5675 GETCHARLEN(c, eptr, len);
5676 switch(c)
5678 HSPACE_CASES: gotspace = TRUE; break;
5679 default: gotspace = FALSE; break;
5681 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5682 eptr += len;
5684 break;
5686 case OP_NOT_VSPACE:
5687 case OP_VSPACE:
5688 for (i = min; i < max; i++)
5690 BOOL gotspace;
5691 int len = 1;
5692 if (eptr >= md->end_subject)
5694 SCHECK_PARTIAL();
5695 break;
5697 GETCHARLEN(c, eptr, len);
5698 switch(c)
5700 VSPACE_CASES: gotspace = TRUE; break;
5701 default: gotspace = FALSE; break;
5703 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5704 eptr += len;
5706 break;
5708 case OP_NOT_DIGIT:
5709 for (i = min; i < max; i++)
5711 int len = 1;
5712 if (eptr >= md->end_subject)
5714 SCHECK_PARTIAL();
5715 break;
5717 GETCHARLEN(c, eptr, len);
5718 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5719 eptr+= len;
5721 break;
5723 case OP_DIGIT:
5724 for (i = min; i < max; i++)
5726 int len = 1;
5727 if (eptr >= md->end_subject)
5729 SCHECK_PARTIAL();
5730 break;
5732 GETCHARLEN(c, eptr, len);
5733 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5734 eptr+= len;
5736 break;
5738 case OP_NOT_WHITESPACE:
5739 for (i = min; i < max; i++)
5741 int len = 1;
5742 if (eptr >= md->end_subject)
5744 SCHECK_PARTIAL();
5745 break;
5747 GETCHARLEN(c, eptr, len);
5748 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5749 eptr+= len;
5751 break;
5753 case OP_WHITESPACE:
5754 for (i = min; i < max; i++)
5756 int len = 1;
5757 if (eptr >= md->end_subject)
5759 SCHECK_PARTIAL();
5760 break;
5762 GETCHARLEN(c, eptr, len);
5763 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5764 eptr+= len;
5766 break;
5768 case OP_NOT_WORDCHAR:
5769 for (i = min; i < max; i++)
5771 int len = 1;
5772 if (eptr >= md->end_subject)
5774 SCHECK_PARTIAL();
5775 break;
5777 GETCHARLEN(c, eptr, len);
5778 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5779 eptr+= len;
5781 break;
5783 case OP_WORDCHAR:
5784 for (i = min; i < max; i++)
5786 int len = 1;
5787 if (eptr >= md->end_subject)
5789 SCHECK_PARTIAL();
5790 break;
5792 GETCHARLEN(c, eptr, len);
5793 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5794 eptr+= len;
5796 break;
5798 default:
5799 RRETURN(PCRE_ERROR_INTERNAL);
5802 /* eptr is now past the end of the maximum run. If possessive, we are
5803 done (no backing up). Otherwise, match at this position; anything other
5804 than no match is immediately returned. For nomatch, back up one
5805 character, unless we are matching \R and the last thing matched was
5806 \r\n, in which case, back up two bytes. */
5808 if (possessive) continue;
5809 for(;;)
5811 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5812 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5813 if (eptr-- == pp) break; /* Stop if tried at original pos */
5814 BACKCHAR(eptr);
5815 if (ctype == OP_ANYNL && eptr > pp && RAWUCHAR(eptr) == CHAR_NL &&
5816 RAWUCHAR(eptr - 1) == CHAR_CR) eptr--;
5819 else
5820 #endif /* SUPPORT_UTF */
5821 /* Not UTF mode */
5823 switch(ctype)
5825 case OP_ANY:
5826 for (i = min; i < max; i++)
5828 if (eptr >= md->end_subject)
5830 SCHECK_PARTIAL();
5831 break;
5833 if (IS_NEWLINE(eptr)) break;
5834 if (md->partial != 0 && /* Take care with CRLF partial */
5835 eptr + 1 >= md->end_subject &&
5836 NLBLOCK->nltype == NLTYPE_FIXED &&
5837 NLBLOCK->nllen == 2 &&
5838 *eptr == NLBLOCK->nl[0])
5840 md->hitend = TRUE;
5841 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5843 eptr++;
5845 break;
5847 case OP_ALLANY:
5848 case OP_ANYBYTE:
5849 c = max - min;
5850 if (c > (unsigned int)(md->end_subject - eptr))
5852 eptr = md->end_subject;
5853 SCHECK_PARTIAL();
5855 else eptr += c;
5856 break;
5858 case OP_ANYNL:
5859 for (i = min; i < max; i++)
5861 if (eptr >= md->end_subject)
5863 SCHECK_PARTIAL();
5864 break;
5866 c = *eptr;
5867 if (c == CHAR_CR)
5869 if (++eptr >= md->end_subject) break;
5870 if (*eptr == CHAR_LF) eptr++;
5872 else
5874 if (c != CHAR_LF && (md->bsr_anycrlf ||
5875 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5876 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5877 && c != 0x2028 && c != 0x2029
5878 #endif
5879 ))) break;
5880 eptr++;
5883 break;
5885 case OP_NOT_HSPACE:
5886 for (i = min; i < max; i++)
5888 if (eptr >= md->end_subject)
5890 SCHECK_PARTIAL();
5891 break;
5893 switch(*eptr)
5895 default: eptr++; break;
5896 HSPACE_BYTE_CASES:
5897 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5898 HSPACE_MULTIBYTE_CASES:
5899 #endif
5900 goto ENDLOOP00;
5903 ENDLOOP00:
5904 break;
5906 case OP_HSPACE:
5907 for (i = min; i < max; i++)
5909 if (eptr >= md->end_subject)
5911 SCHECK_PARTIAL();
5912 break;
5914 switch(*eptr)
5916 default: goto ENDLOOP01;
5917 HSPACE_BYTE_CASES:
5918 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5919 HSPACE_MULTIBYTE_CASES:
5920 #endif
5921 eptr++; break;
5924 ENDLOOP01:
5925 break;
5927 case OP_NOT_VSPACE:
5928 for (i = min; i < max; i++)
5930 if (eptr >= md->end_subject)
5932 SCHECK_PARTIAL();
5933 break;
5935 switch(*eptr)
5937 default: eptr++; break;
5938 VSPACE_BYTE_CASES:
5939 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5940 VSPACE_MULTIBYTE_CASES:
5941 #endif
5942 goto ENDLOOP02;
5945 ENDLOOP02:
5946 break;
5948 case OP_VSPACE:
5949 for (i = min; i < max; i++)
5951 if (eptr >= md->end_subject)
5953 SCHECK_PARTIAL();
5954 break;
5956 switch(*eptr)
5958 default: goto ENDLOOP03;
5959 VSPACE_BYTE_CASES:
5960 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5961 VSPACE_MULTIBYTE_CASES:
5962 #endif
5963 eptr++; break;
5966 ENDLOOP03:
5967 break;
5969 case OP_NOT_DIGIT:
5970 for (i = min; i < max; i++)
5972 if (eptr >= md->end_subject)
5974 SCHECK_PARTIAL();
5975 break;
5977 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
5978 eptr++;
5980 break;
5982 case OP_DIGIT:
5983 for (i = min; i < max; i++)
5985 if (eptr >= md->end_subject)
5987 SCHECK_PARTIAL();
5988 break;
5990 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
5991 eptr++;
5993 break;
5995 case OP_NOT_WHITESPACE:
5996 for (i = min; i < max; i++)
5998 if (eptr >= md->end_subject)
6000 SCHECK_PARTIAL();
6001 break;
6003 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6004 eptr++;
6006 break;
6008 case OP_WHITESPACE:
6009 for (i = min; i < max; i++)
6011 if (eptr >= md->end_subject)
6013 SCHECK_PARTIAL();
6014 break;
6016 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6017 eptr++;
6019 break;
6021 case OP_NOT_WORDCHAR:
6022 for (i = min; i < max; i++)
6024 if (eptr >= md->end_subject)
6026 SCHECK_PARTIAL();
6027 break;
6029 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6030 eptr++;
6032 break;
6034 case OP_WORDCHAR:
6035 for (i = min; i < max; i++)
6037 if (eptr >= md->end_subject)
6039 SCHECK_PARTIAL();
6040 break;
6042 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6043 eptr++;
6045 break;
6047 default:
6048 RRETURN(PCRE_ERROR_INTERNAL);
6051 /* eptr is now past the end of the maximum run. If possessive, we are
6052 done (no backing up). Otherwise, match at this position; anything other
6053 than no match is immediately returned. For nomatch, back up one
6054 character (byte), unless we are matching \R and the last thing matched
6055 was \r\n, in which case, back up two bytes. */
6057 if (possessive) continue;
6058 while (eptr >= pp)
6060 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6061 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6062 eptr--;
6063 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6064 eptr[-1] == CHAR_CR) eptr--;
6068 /* Get here if we can't make it match with any permitted repetitions */
6070 RRETURN(MATCH_NOMATCH);
6072 /* Control never gets here */
6074 /* There's been some horrible disaster. Arrival here can only mean there is
6075 something seriously wrong in the code above or the OP_xxx definitions. */
6077 default:
6078 DPRINTF(("Unknown opcode %d\n", *ecode));
6079 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6082 /* Do not stick any code in here without much thought; it is assumed
6083 that "continue" in the code above comes out to here to repeat the main
6084 loop. */
6086 } /* End of main loop */
6087 /* Control never reaches here */
6090 /* When compiling to use the heap rather than the stack for recursive calls to
6091 match(), the RRETURN() macro jumps here. The number that is saved in
6092 frame->Xwhere indicates which label we actually want to return to. */
6094 #ifdef NO_RECURSE
6095 #define LBL(val) case val: goto L_RM##val;
6096 HEAP_RETURN:
6097 switch (frame->Xwhere)
6099 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6100 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6101 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6102 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6103 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6104 LBL(65) LBL(66)
6105 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6106 LBL(21)
6107 #endif
6108 #ifdef SUPPORT_UTF
6109 LBL(16) LBL(18) LBL(20)
6110 LBL(22) LBL(23) LBL(28) LBL(30)
6111 LBL(32) LBL(34) LBL(42) LBL(46)
6112 #ifdef SUPPORT_UCP
6113 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6114 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67)
6115 #endif /* SUPPORT_UCP */
6116 #endif /* SUPPORT_UTF */
6117 default:
6118 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6119 return PCRE_ERROR_INTERNAL;
6121 #undef LBL
6122 #endif /* NO_RECURSE */
6126 /***************************************************************************
6127 ****************************************************************************
6128 RECURSION IN THE match() FUNCTION
6130 Undefine all the macros that were defined above to handle this. */
6132 #ifdef NO_RECURSE
/* This list is compiled only in the NO_RECURSE build. Every name below is
dropped as a macro from this point on; code further down the file (for example
the "length" parameter and the "flags" structure member used by the exec
function) spells several of these names as ordinary identifiers. */
6133 #undef eptr
6134 #undef ecode
6135 #undef mstart
6136 #undef offset_top
6137 #undef eptrb
6138 #undef flags
6140 #undef callpat
6141 #undef charptr
6142 #undef data
6143 #undef next
6144 #undef pp
6145 #undef prev
6146 #undef saved_eptr
6148 #undef new_recursive
6150 #undef cur_is_word
6151 #undef condition
6152 #undef prev_is_word
6154 #undef ctype
6155 #undef length
6156 #undef max
6157 #undef min
6158 #undef number
6159 #undef offset
6160 #undef op
6161 #undef save_capture_last
6162 #undef save_offset1
6163 #undef save_offset2
6164 #undef save_offset3
6165 #undef stacksave
6167 #undef newptrb
6169 #endif
6171 /* These two are defined as macros in both cases */
/* Hence fc and fi are undefined outside the NO_RECURSE conditional above. */
6173 #undef fc
6174 #undef fi
6176 /***************************************************************************
6177 ***************************************************************************/
6180 #ifdef NO_RECURSE
6181 /*************************************************
6182 * Release allocated heap frames *
6183 *************************************************/
6185 /* This function releases all the allocated frames. The base frame is on the
6186 machine stack, and so must not be freed.
6188 Argument: the address of the base frame
6189 Returns: nothing
6192 static void
6193 release_match_heapframes (heapframe *frame_base)
6195 heapframe *nextframe = frame_base->Xnextframe;
6196 while (nextframe != NULL)
6198 heapframe *oldframe = nextframe;
6199 nextframe = nextframe->Xnextframe;
6200 (PUBL(stack_free))(oldframe);
6203 #endif
6206 /*************************************************
6207 * Execute a Regular Expression *
6208 *************************************************/
6210 /* This function applies a compiled re to a subject string and picks out
6211 portions of the string if it matches. Two elements in the vector are set for
6212 each substring: the offsets to the start and end of the substring.
6214 Arguments:
6215 argument_re points to the compiled expression
6216 extra_data points to extra data or is NULL
6217 subject points to the subject string
6218 length length of subject string (may contain binary zeros)
6219 start_offset where to start in the subject string
6220 options option bits
6221 offsets points to a vector of ints to be filled in with offsets
6222 offsetcount the number of elements in the vector
6224 Returns: > 0 => success; value is the number of elements filled in
6225 = 0 => success, but offsets is not big enough
6226 -1 => failed to match
6227 < -1 => some kind of unexpected problem
/* Entry point for matching a compiled pattern against a subject string. The
code below validates the arguments, optionally hands the work to the JIT
executable, and otherwise fills in the match_data block and calls match() at
successive starting positions (the "bumpalong" loop) until a match, an error,
or a stopping condition is reached. The return value is the number of captured
substrings (0 if the offsets vector was too small) or a negative error code. */
6230 #if defined COMPILE_PCRE8
6231 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6232 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6233 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6234 int offsetcount)
6235 #elif defined COMPILE_PCRE16
6236 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6237 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6238 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6239 int offsetcount)
6240 #elif defined COMPILE_PCRE32
6241 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6242 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6243 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6244 int offsetcount)
6245 #endif
6247 int rc, ocount, arg_offset_max;
6248 int newline;
6249 BOOL using_temporary_offsets = FALSE;
6250 BOOL anchored;
6251 BOOL startline;
6252 BOOL firstline;
6253 BOOL utf;
6254 BOOL has_first_char = FALSE;
6255 BOOL has_req_char = FALSE;
6256 pcre_uchar first_char = 0;
6257 pcre_uchar first_char2 = 0;
6258 pcre_uchar req_char = 0;
6259 pcre_uchar req_char2 = 0;
6260 match_data match_block;
6261 match_data *md = &match_block;
6262 const pcre_uint8 *tables;
6263 const pcre_uint8 *start_bits = NULL;
6264 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6265 PCRE_PUCHAR end_subject;
6266 PCRE_PUCHAR start_partial = NULL;
/* Initialised to one position before the first start point so that the first
required-character search is never skipped: the search below only runs when
the scan start is beyond req_char_ptr. */
6267 PCRE_PUCHAR req_char_ptr = start_match - 1;
6269 const pcre_study_data *study;
6270 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6272 #ifdef NO_RECURSE
6273 heapframe frame_zero;
6274 frame_zero.Xprevframe = NULL; /* Marks the top level */
6275 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6276 md->match_frames_base = &frame_zero;
6277 #endif
6279 /* Check for the special magic call that measures the size of the stack used
6280 per recursive call of match(). Without the funny casting for sizeof, a Windows
6281 compiler gave this error: "unary minus operator applied to unsigned type,
6282 result still unsigned". Hopefully the cast fixes that. */
6284 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6285 start_offset == -999)
6286 #ifdef NO_RECURSE
6287 return -((int)sizeof(heapframe));
6288 #else
6289 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6290 #endif
6292 /* Plausibility checks */
6294 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6295 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6296 return PCRE_ERROR_NULL;
6297 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6298 if (length < 0) return PCRE_ERROR_BADLENGTH;
6299 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6301 /* Check that the first field in the block is the magic number. If it is not,
6302 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6303 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6304 means that the pattern is likely compiled with different endianness. */
6306 if (re->magic_number != MAGIC_NUMBER)
6307 return re->magic_number == REVERSED_MAGIC_NUMBER?
6308 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6309 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6311 /* These two settings are used in the code for checking a UTF string that
6312 follows immediately afterwards. Other values in the md block are used only
6313 during "normal" pcre_exec() processing, not when the JIT support is in use,
6314 so they are set up later. */
6316 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6317 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6318 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6319 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6321 /* Check a UTF string if required. Pass back the character offset and error
6322 code for an invalid string if a results vector is available. */
6324 #ifdef SUPPORT_UTF
6325 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6327 int erroroffset;
6328 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6329 if (errorcode != 0)
6331 if (offsetcount >= 2)
6333 offsets[0] = erroroffset;
6334 offsets[1] = errorcode;
6336 #if defined COMPILE_PCRE8
6337 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6338 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6339 #elif defined COMPILE_PCRE16
6340 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6341 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6342 #elif defined COMPILE_PCRE32
6343 return PCRE_ERROR_BADUTF32;
6344 #endif
6346 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6347 /* Check that a start_offset points to the start of a UTF character. */
6348 if (start_offset > 0 && start_offset < length &&
6349 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6350 return PCRE_ERROR_BADUTF8_OFFSET;
6351 #endif
6353 #endif
6355 /* If the pattern was successfully studied with JIT support, run the JIT
6356 executable instead of the rest of this function. Most options must be set at
6357 compile time for the JIT code to be usable. Fall back to the normal code path if
6358 an unsupported flag is set. */
6360 #ifdef SUPPORT_JIT
6361 if (extra_data != NULL
6362 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6363 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6364 && extra_data->executable_jit != NULL
6365 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6367 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6368 start_offset, options, offsets, offsetcount);
6370 /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial matching
6371 mode is not compiled. In this case we simply fall back to the interpreter. */
6373 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6375 #endif
6377 /* Carry on with non-JIT matching. This information is for finding all the
6378 numbers associated with a given name, for condition testing. */
6380 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6381 md->name_count = re->name_count;
6382 md->name_entry_size = re->name_entry_size;
6384 /* Fish out the optional data from the extra_data structure, first setting
6385 the default values. */
6387 study = NULL;
6388 md->match_limit = MATCH_LIMIT;
6389 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6390 md->callout_data = NULL;
6392 /* The table pointer is always in native byte order. */
6394 tables = re->tables;
6396 if (extra_data != NULL)
6398 register unsigned int flags = extra_data->flags;
6399 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6400 study = (const pcre_study_data *)extra_data->study_data;
6401 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6402 md->match_limit = extra_data->match_limit;
6403 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6404 md->match_limit_recursion = extra_data->match_limit_recursion;
6405 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6406 md->callout_data = extra_data->callout_data;
6407 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6410 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6411 is a feature that makes it possible to save compiled regex and re-use them
6412 in other programs later. */
6414 if (tables == NULL) tables = PRIV(default_tables);
6416 /* Set up other data */
6418 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6419 startline = (re->flags & PCRE_STARTLINE) != 0;
6420 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6422 /* The code starts after the real_pcre block and the capture name table. */
6424 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6425 re->name_count * re->name_entry_size;
6427 md->start_subject = (PCRE_PUCHAR)subject;
6428 md->start_offset = start_offset;
6429 md->end_subject = md->start_subject + length;
6430 end_subject = md->end_subject;
6432 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6433 md->use_ucp = (re->options & PCRE_UCP) != 0;
6434 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6435 md->ignore_skip_arg = FALSE;
6437 /* Some options are unpacked into BOOL variables in the hope that testing
6438 them will be faster than individual option bits. */
6440 md->notbol = (options & PCRE_NOTBOL) != 0;
6441 md->noteol = (options & PCRE_NOTEOL) != 0;
6442 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6443 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6445 md->hitend = FALSE;
6446 md->mark = md->nomatch_mark = NULL; /* In case never set */
6448 md->recursive = NULL; /* No recursion at top level */
6449 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6451 md->lcc = tables + lcc_offset;
6452 md->fcc = tables + fcc_offset;
6453 md->ctypes = tables + ctypes_offset;
6455 /* Handle different \R options. */
6457 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6459 case 0:
6460 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6461 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6462 else
6463 #ifdef BSR_ANYCRLF
6464 md->bsr_anycrlf = TRUE;
6465 #else
6466 md->bsr_anycrlf = FALSE;
6467 #endif
6468 break;
6470 case PCRE_BSR_ANYCRLF:
6471 md->bsr_anycrlf = TRUE;
6472 break;
6474 case PCRE_BSR_UNICODE:
6475 md->bsr_anycrlf = FALSE;
6476 break;
6478 default: return PCRE_ERROR_BADNEWLINE;
6481 /* Handle different types of newline. The three bits give eight cases. If
6482 nothing is set at run time, whatever was used at compile time applies. */
6484 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6485 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6487 case 0: newline = NEWLINE; break; /* Compile-time default */
6488 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6489 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6490 case PCRE_NEWLINE_CR+
6491 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6492 case PCRE_NEWLINE_ANY: newline = -1; break;
6493 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6494 default: return PCRE_ERROR_BADNEWLINE;
6497 if (newline == -2)
6499 md->nltype = NLTYPE_ANYCRLF;
6501 else if (newline < 0)
6503 md->nltype = NLTYPE_ANY;
6505 else
6507 md->nltype = NLTYPE_FIXED;
/* A value above 255 is the two-character form packed above as
(first << 8) | second; unpack it into md->nl[0] and md->nl[1]. */
6508 if (newline > 255)
6510 md->nllen = 2;
6511 md->nl[0] = (newline >> 8) & 255;
6512 md->nl[1] = newline & 255;
6514 else
6516 md->nllen = 1;
6517 md->nl[0] = newline;
6521 /* Partial matching was originally supported only for a restricted set of
6522 regexes; from release 8.00 there are no restrictions, but the bits are still
6523 defined (though never set). So there's no harm in leaving this code. */
6525 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6526 return PCRE_ERROR_BADPARTIAL;
6528 /* If the expression has got more back references than the offsets supplied can
6529 hold, we get a temporary chunk of working store to use during the matching.
6530 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6531 of 3. */
6533 ocount = offsetcount - (offsetcount % 3);
6534 arg_offset_max = (2*ocount)/3;
6536 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6538 ocount = re->top_backref * 3 + 3;
6539 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6540 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6541 using_temporary_offsets = TRUE;
6542 DPRINTF(("Got memory to hold back references\n"));
6544 else md->offset_vector = offsets;
6546 md->offset_end = ocount;
6547 md->offset_max = (2*ocount)/3;
6548 md->offset_overflow = FALSE;
6549 md->capture_last = -1;
6551 /* Reset the working variable associated with each extraction. These should
6552 never be used unless previously set, but they get saved and restored, and so we
6553 initialize them to avoid reading uninitialized locations. Also, unset the
6554 offsets for the matched string. This is really just for tidiness with callouts,
6555 in case they inspect these fields. */
6557 if (md->offset_vector != NULL)
6559 register int *iptr = md->offset_vector + ocount;
6560 register int *iend = iptr - re->top_bracket;
6561 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6562 while (--iptr >= iend) *iptr = -1;
6563 md->offset_vector[0] = md->offset_vector[1] = -1;
6566 /* Set up the first character to match, if available. The first_char value is
6567 never set for an anchored regular expression, but the anchoring may be forced
6568 at run time, so we have to test for anchoring. The first char may be unset for
6569 an unanchored pattern, of course. If there's no first char and the pattern was
6570 studied, there may be a bitmap of possible first characters. */
6572 if (!anchored)
6574 if ((re->flags & PCRE_FIRSTSET) != 0)
6576 has_first_char = TRUE;
6577 first_char = first_char2 = (pcre_uchar)(re->first_char);
6578 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6580 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6581 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6582 if (utf && first_char > 127)
6583 first_char2 = UCD_OTHERCASE(first_char);
6584 #endif
6587 else
6588 if (!startline && study != NULL &&
6589 (study->flags & PCRE_STUDY_MAPPED) != 0)
6590 start_bits = study->start_bits;
6593 /* For anchored or unanchored matches, there may be a "last known required
6594 character" set. */
6596 if ((re->flags & PCRE_REQCHSET) != 0)
6598 has_req_char = TRUE;
6599 req_char = req_char2 = (pcre_uchar)(re->req_char);
6600 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6602 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6603 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6604 if (utf && req_char > 127)
6605 req_char2 = UCD_OTHERCASE(req_char);
6606 #endif
6611 /* ==========================================================================*/
6613 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6614 the loop runs just once. */
6616 for(;;)
6618 PCRE_PUCHAR save_end_subject = end_subject;
6619 PCRE_PUCHAR new_start_match;
6621 /* If firstline is TRUE, the start of the match is constrained to the first
6622 line of a multiline string. That is, the match must be before or at the first
6623 newline. Implement this by temporarily adjusting end_subject so that we stop
6624 scanning at a newline. If the match fails at the newline, later code breaks
6625 this loop. */
6627 if (firstline)
6629 PCRE_PUCHAR t = start_match;
6630 #ifdef SUPPORT_UTF
6631 if (utf)
6633 while (t < md->end_subject && !IS_NEWLINE(t))
6635 t++;
6636 ACROSSCHAR(t < end_subject, *t, t++);
6639 else
6640 #endif
6641 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6642 end_subject = t;
6645 /* There are some optimizations that avoid running the match if a known
6646 starting point is not found, or if a known later character is not present.
6647 However, there is an option that disables these, for testing and for ensuring
6648 that all callouts do actually occur. The option can be set in the regex by
6649 (*NO_START_OPT) or passed in match-time options. */
6651 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6653 /* Advance to a unique first char if there is one. */
6655 if (has_first_char)
6657 pcre_uchar smc;
6659 if (first_char != first_char2)
6660 while (start_match < end_subject &&
6661 (smc = RAWUCHARTEST(start_match)) != first_char && smc != first_char2)
6662 start_match++;
6663 else
6664 while (start_match < end_subject && RAWUCHARTEST(start_match) != first_char)
6665 start_match++;
6668 /* Or to just after a linebreak for a multiline match */
6670 else if (startline)
6672 if (start_match > md->start_subject + start_offset)
6674 #ifdef SUPPORT_UTF
6675 if (utf)
6677 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6679 start_match++;
6680 ACROSSCHAR(start_match < end_subject, *start_match,
6681 start_match++);
6684 else
6685 #endif
6686 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6687 start_match++;
6689 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6690 and we are now at a LF, advance the match position by one more character.
6693 if (start_match[-1] == CHAR_CR &&
6694 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6695 start_match < end_subject &&
6696 RAWUCHARTEST(start_match) == CHAR_NL)
6697 start_match++;
6701 /* Or to a non-unique first byte after study */
6703 else if (start_bits != NULL)
6705 while (start_match < end_subject)
6707 register pcre_uint32 c = RAWUCHARTEST(start_match);
6708 #ifndef COMPILE_PCRE8
6709 if (c > 255) c = 255;
6710 #endif
6711 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6713 start_match++;
6714 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6715 /* In non 8-bit mode, the iteration will stop for
6716 characters > 255 at the beginning or not stop at all. */
6717 if (utf)
6718 ACROSSCHAR(start_match < end_subject, *start_match,
6719 start_match++);
6720 #endif
6722 else break;
6725 } /* Starting optimizations */
6727 /* Restore fudged end_subject */
6729 end_subject = save_end_subject;
6731 /* The following two optimizations are disabled for partial matching or if
6732 disabling is explicitly requested. */
6734 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6736 /* If the pattern was studied, a minimum subject length may be set. This is
6737 a lower bound; no actual string of that length may actually match the
6738 pattern. Although the value is, strictly, in characters, we treat it as
6739 bytes to avoid spending too much time in this optimization. */
6741 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6742 (pcre_uint32)(end_subject - start_match) < study->minlength)
6744 rc = MATCH_NOMATCH;
6745 break;
6748 /* If req_char is set, we know that that character must appear in the
6749 subject for the match to succeed. If the first character is set, req_char
6750 must be later in the subject; otherwise the test starts at the match point.
6751 This optimization can save a huge amount of backtracking in patterns with
6752 nested unlimited repeats that aren't going to match. Writing separate code
6753 for cased/caseless versions makes it go faster, as does using an
6754 autoincrement and backing off on a match.
6756 HOWEVER: when the subject string is very, very long, searching to its end
6757 can take a long time, and give bad performance on quite ordinary patterns.
6758 This showed up when somebody was matching something like /^\d+C/ on a
6759 32-megabyte string... so we don't do this when the string is sufficiently
6760 long. */
6762 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6764 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6766 /* We don't need to repeat the search if we haven't yet reached the
6767 place we found it at last time. */
6769 if (p > req_char_ptr)
6771 if (req_char != req_char2)
6773 while (p < end_subject)
6775 register pcre_uint32 pp = RAWUCHARINCTEST(p);
6776 if (pp == req_char || pp == req_char2) { p--; break; }
6779 else
6781 while (p < end_subject)
6783 if (RAWUCHARINCTEST(p) == req_char) { p--; break; }
6787 /* If we can't find the required character, break the matching loop,
6788 forcing a match failure. */
6790 if (p >= end_subject)
6792 rc = MATCH_NOMATCH;
6793 break;
6796 /* If we have found the required character, save the point where we
6797 found it, so that we don't search again next time round the loop if
6798 the start hasn't passed this character yet. */
6800 req_char_ptr = p;
6805 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6806 printf(">>>> Match against: ");
6807 pchars(start_match, end_subject - start_match, TRUE, md);
6808 printf("\n");
6809 #endif
6811 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6812 first starting point for which a partial match was found. */
6814 md->start_match_ptr = start_match;
6815 md->start_used_ptr = start_match;
6816 md->match_call_count = 0;
6817 md->match_function_type = 0;
6818 md->end_offset_top = 0;
6819 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
6820 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6822 switch(rc)
6824 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6825 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6826 entirely. The only way we can do that is to re-do the match at the same
6827 point, with a flag to force SKIP with an argument to be ignored. Just
6828 treating this case as NOMATCH does not work because it does not check other
6829 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6831 case MATCH_SKIP_ARG:
6832 new_start_match = start_match;
6833 md->ignore_skip_arg = TRUE;
6834 break;
6836 /* SKIP passes back the next starting point explicitly, but if it is the
6837 same as the match we have just done, treat it as NOMATCH. */
6839 case MATCH_SKIP:
6840 if (md->start_match_ptr != start_match)
6842 new_start_match = md->start_match_ptr;
6843 break;
6845 /* Fall through */
6847 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6848 exactly like PRUNE. Unset the ignore SKIP-with-argument flag. */
6850 case MATCH_NOMATCH:
6851 case MATCH_PRUNE:
6852 case MATCH_THEN:
6853 md->ignore_skip_arg = FALSE;
6854 new_start_match = start_match + 1;
6855 #ifdef SUPPORT_UTF
6856 if (utf)
6857 ACROSSCHAR(new_start_match < end_subject, *new_start_match,
6858 new_start_match++);
6859 #endif
6860 break;
6862 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6864 case MATCH_COMMIT:
6865 rc = MATCH_NOMATCH;
6866 goto ENDLOOP;
6868 /* Any other return is either a match, or some kind of error. */
6870 default:
6871 goto ENDLOOP;
6874 /* Control reaches here for the various types of "no match at this point"
6875 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6877 rc = MATCH_NOMATCH;
6879 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6880 newline in the subject (though it may continue over the newline). Therefore,
6881 if we have just failed to match, starting at a newline, do not continue. */
6883 if (firstline && IS_NEWLINE(start_match)) break;
6885 /* Advance to new matching position */
6887 start_match = new_start_match;
6889 /* Break the loop if the pattern is anchored or if we have passed the end of
6890 the subject. */
6892 if (anchored || start_match > end_subject) break;
6894 /* If we have just passed a CR and we are now at a LF, and the pattern does
6895 not contain any explicit matches for \r or \n, and the newline option is CRLF
6896 or ANY or ANYCRLF, advance the match position by one more character. In
6897 normal matching start_match will always be greater than the first position at
6898 this stage, but a failed *SKIP can cause a return at the same point, which is
6899 why the first test exists. */
6901 if (start_match > (PCRE_PUCHAR)subject + start_offset &&
6902 start_match[-1] == CHAR_CR &&
6903 start_match < end_subject &&
6904 *start_match == CHAR_NL &&
6905 (re->flags & PCRE_HASCRORLF) == 0 &&
6906 (md->nltype == NLTYPE_ANY ||
6907 md->nltype == NLTYPE_ANYCRLF ||
6908 md->nllen == 2))
6909 start_match++;
6911 md->mark = NULL; /* Reset for start of next match attempt */
6912 } /* End of for(;;) "bumpalong" loop */
6914 /* ==========================================================================*/
6916 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6917 conditions is true:
6919 (1) The pattern is anchored or the match was failed by (*COMMIT);
6921 (2) We are past the end of the subject;
6923 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6924 this option requests that a match occur at or before the first newline in
6925 the subject.
6927 When we have a match and the offset vector is big enough to deal with any
6928 backreferences, captured substring offsets will already be set up. In the case
6929 where we had to get some local store to hold offsets for backreference
6930 processing, copy those that we can. In this case there need not be overflow if
6931 certain parts of the pattern were not used, even though there are more
6932 capturing parentheses than vector slots. */
6934 ENDLOOP:
6936 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6938 if (using_temporary_offsets)
6940 if (arg_offset_max >= 4)
6942 memcpy(offsets + 2, md->offset_vector + 2,
6943 (arg_offset_max - 2) * sizeof(int));
6944 DPRINTF(("Copied offsets from temporary memory\n"));
6946 if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
6947 DPRINTF(("Freeing temporary memory\n"));
6948 (PUBL(free))(md->offset_vector);
6951 /* Set the return code to the number of captured strings, or 0 if there were
6952 too many to fit into the vector. */
6954 rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
6955 0 : md->end_offset_top/2;
6957 /* If there is space in the offset vector, set any unused pairs at the end of
6958 the pattern to -1 for backwards compatibility. It is documented that this
6959 happens. In earlier versions, the whole set of potential capturing offsets
6960 was set to -1 each time round the loop, but this is handled differently now.
6961 "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
6962 those at the end that need unsetting here. We can't just unset them all at
6963 the start of the whole thing because they may get set in one branch that is
6964 not the final matching branch. */
6966 if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
6968 register int *iptr, *iend;
6969 int resetcount = 2 + re->top_bracket * 2;
6970 if (resetcount > offsetcount) resetcount = offsetcount;
6971 iptr = offsets + md->end_offset_top;
6972 iend = offsets + resetcount;
6973 while (iptr < iend) *iptr++ = -1;
6976 /* If there is space, set up the whole thing as substring 0. The value of
6977 md->start_match_ptr might be modified if \K was encountered on the successful
6978 matching path. */
6980 if (offsetcount < 2) rc = 0; else
6982 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6983 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6986 /* Return MARK data if requested */
6988 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6989 *(extra_data->mark) = (pcre_uchar *)md->mark;
6990 DPRINTF((">>>> returning %d\n", rc));
6991 #ifdef NO_RECURSE
6992 release_match_heapframes(&frame_zero);
6993 #endif
6994 return rc;
6997 /* Control gets here if there has been an error, or if the overall match
6998 attempt has failed at all permitted starting positions. */
7000 if (using_temporary_offsets)
7002 DPRINTF(("Freeing temporary memory\n"));
7003 (PUBL(free))(md->offset_vector);
7006 /* For anything other than nomatch or partial match, just return the code. */
7008 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
7010 DPRINTF((">>>> error: returning %d\n", rc));
7011 #ifdef NO_RECURSE
7012 release_match_heapframes(&frame_zero);
7013 #endif
7014 return rc;
7017 /* Handle partial matches - disable any mark data */
7019 if (start_partial != NULL)
7021 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
7022 md->mark = NULL;
7023 if (offsetcount > 1)
7025 offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject);
7026 offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
7028 rc = PCRE_ERROR_PARTIAL;
7031 /* This is the classic nomatch case */
7033 else
7035 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
7036 rc = PCRE_ERROR_NOMATCH;
7039 /* Return the MARK data if it has been requested. */
7041 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
7042 *(extra_data->mark) = (pcre_uchar *)md->nomatch_mark;
7043 #ifdef NO_RECURSE
7044 release_match_heapframes(&frame_zero);
7045 #endif
7046 return rc;
7049 /* End of pcre_exec.c */