Clean and tidy up files.
[tomato.git] / release / src / router / pcre / pcre_dfa_exec.c
blob91fb730b68de1e75ebdbd9ffc6e234964db02845
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language (but see
7 below for why this module is different).
9 Written by Philip Hazel
10 Copyright (c) 1997-2012 University of Cambridge
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
41 /* This module contains the external function pcre_dfa_exec(), which is an
42 alternative matching function that uses a sort of DFA algorithm (not a true
43 FSM). This is NOT Perl-compatible, but it has advantages in certain
44 applications. */
47 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
48 the performance of his patterns greatly. I could not use it as it stood, as it
49 was not thread safe, and made assumptions about pattern sizes. Also, it caused
50 test 7 to loop, and test 9 to crash with a segfault.
52 The issue is the check for duplicate states, which is done by a simple linear
53 search up the state list. (Grep for "duplicate" below to find the code.) For
54 many patterns, there will never be many states active at one time, so a simple
55 linear search is fine. In patterns that have many active states, it might be a
56 bottleneck. The suggested code used an indexing scheme to remember which states
57 had previously been used for each character, and avoided the linear search when
58 it knew there was no chance of a duplicate. This was implemented when adding
59 states to the state lists.
61 I wrote some thread-safe, not-limited code to try something similar at the time
62 of checking for duplicates (instead of when adding states), using index vectors
63 on the stack. It did give a 13% improvement with one specially constructed
64 pattern for certain subject strings, but on other strings and on many of the
65 simpler patterns in the test suite it did worse. The major problem, I think,
66 was the extra time to initialize the index. This had to be done for each call
67 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
68 only once - I suspect this was the cause of the problems with the tests.)
70 Overall, I concluded that the gains in some cases did not outweigh the losses
71 in others, so I abandoned this code. */
75 #ifdef HAVE_CONFIG_H
76 #include "config.h"
77 #endif
79 #define NLBLOCK md /* Block containing newline information */
80 #define PSSTART start_subject /* Field containing processed string start */
81 #define PSEND end_subject /* Field containing processed string end */
83 #include "pcre_internal.h"
86 /* For use to indent debugging output */
88 #define SP " "
91 /*************************************************
92 * Code parameters and static tables *
93 *************************************************/
95 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
96 into others, under special conditions. A gap of 20 between the blocks should be
97 enough. The resulting opcodes don't have to be less than 256 because they are
98 never stored, so we push them well clear of the normal opcodes. */
100 #define OP_PROP_EXTRA 300
101 #define OP_EXTUNI_EXTRA 320
102 #define OP_ANYNL_EXTRA 340
103 #define OP_HSPACE_EXTRA 360
104 #define OP_VSPACE_EXTRA 380
107 /* This table identifies those opcodes that are followed immediately by a
108 character that is to be tested in some way. This makes it possible to
109 centralize the loading of these characters. In the case of Type * etc, the
110 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111 small value. Non-zero values in the table are the offsets from the opcode where
112 the character is to be found. ***NOTE*** If the start of this table is
113 modified, the three tables that follow must also be modified. */
115 static const pcre_uint8 coptable[] = {
116 0, /* End */
117 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
118 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
119 0, 0, 0, /* Any, AllAny, Anybyte */
120 0, 0, /* \P, \p */
121 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
122 0, /* \X */
123 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
124 1, /* Char */
125 1, /* Chari */
126 1, /* not */
127 1, /* noti */
128 /* Positive single-char repeats */
129 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
130 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
131 1+IMM2_SIZE, /* exact */
132 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
133 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
134 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
135 1+IMM2_SIZE, /* exact I */
136 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
137 /* Negative single-char repeats - only for chars < 256 */
138 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
139 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
140 1+IMM2_SIZE, /* NOT exact */
141 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
142 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
143 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
144 1+IMM2_SIZE, /* NOT exact I */
145 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
146 /* Positive type repeats */
147 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
148 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
149 1+IMM2_SIZE, /* Type exact */
150 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
151 /* Character class & ref repeats */
152 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
153 0, 0, /* CRRANGE, CRMINRANGE */
154 0, /* CLASS */
155 0, /* NCLASS */
156 0, /* XCLASS - variable length */
157 0, /* REF */
158 0, /* REFI */
159 0, /* RECURSE */
160 0, /* CALLOUT */
161 0, /* Alt */
162 0, /* Ket */
163 0, /* KetRmax */
164 0, /* KetRmin */
165 0, /* KetRpos */
166 0, /* Reverse */
167 0, /* Assert */
168 0, /* Assert not */
169 0, /* Assert behind */
170 0, /* Assert behind not */
171 0, 0, /* ONCE, ONCE_NC */
172 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
173 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
174 0, 0, /* CREF, NCREF */
175 0, 0, /* RREF, NRREF */
176 0, /* DEF */
177 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
178 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
179 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
180 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
181 0, 0 /* CLOSE, SKIPZERO */
184 /* This table identifies those opcodes that inspect a character. It is used to
185 remember the fact that a character could have been inspected when the end of
186 the subject is reached. ***NOTE*** If the start of this table is modified, the
187 two tables that follow must also be modified. */
189 static const pcre_uint8 poptable[] = {
190 0, /* End */
191 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
192 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
193 1, 1, 1, /* Any, AllAny, Anybyte */
194 1, 1, /* \P, \p */
195 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
196 1, /* \X */
197 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
198 1, /* Char */
199 1, /* Chari */
200 1, /* not */
201 1, /* noti */
202 /* Positive single-char repeats */
203 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
204 1, 1, 1, /* upto, minupto, exact */
205 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
206 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
207 1, 1, 1, /* upto I, minupto I, exact I */
208 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
209 /* Negative single-char repeats - only for chars < 256 */
210 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
211 1, 1, 1, /* NOT upto, minupto, exact */
212 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
213 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
214 1, 1, 1, /* NOT upto I, minupto I, exact I */
215 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
216 /* Positive type repeats */
217 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
218 1, 1, 1, /* Type upto, minupto, exact */
219 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
220 /* Character class & ref repeats */
221 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
222 1, 1, /* CRRANGE, CRMINRANGE */
223 1, /* CLASS */
224 1, /* NCLASS */
225 1, /* XCLASS - variable length */
226 0, /* REF */
227 0, /* REFI */
228 0, /* RECURSE */
229 0, /* CALLOUT */
230 0, /* Alt */
231 0, /* Ket */
232 0, /* KetRmax */
233 0, /* KetRmin */
234 0, /* KetRpos */
235 0, /* Reverse */
236 0, /* Assert */
237 0, /* Assert not */
238 0, /* Assert behind */
239 0, /* Assert behind not */
240 0, 0, /* ONCE, ONCE_NC */
241 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
242 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
243 0, 0, /* CREF, NCREF */
244 0, 0, /* RREF, NRREF */
245 0, /* DEF */
246 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
247 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
248 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
249 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
250 0, 0 /* CLOSE, SKIPZERO */
253 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
254 and \w */
256 static const pcre_uint8 toptable1[] = {
257 0, 0, 0, 0, 0, 0,
258 ctype_digit, ctype_digit,
259 ctype_space, ctype_space,
260 ctype_word, ctype_word,
261 0, 0 /* OP_ANY, OP_ALLANY */
264 static const pcre_uint8 toptable2[] = {
265 0, 0, 0, 0, 0, 0,
266 ctype_digit, 0,
267 ctype_space, 0,
268 ctype_word, 0,
269 1, 1 /* OP_ANY, OP_ALLANY */
/* A stateblock records the current data for one active path through the match
tree. Every member has to be an int: the blocks are laid out inside the
working vector that is passed in, and that vector is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode */
  int count;                      /* Count for repeats */
  int data;                       /* Some use extra data */
} stateblock;

/* The number of ints that one stateblock takes up. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
287 #ifdef PCRE_DEBUG
288 /*************************************************
289 * Print character string *
290 *************************************************/
292 /* Character string printing function for debugging.
294 Arguments:
295 p points to string
296 length number of bytes
297 f where to print
299 Returns: nothing
302 static void
303 pchars(const pcre_uchar *p, int length, FILE *f)
305 pcre_uint32 c;
306 while (length-- > 0)
308 if (isprint(c = *(p++)))
309 fprintf(f, "%c", c);
310 else
311 fprintf(f, "\\x{%02x}", c);
314 #endif
318 /*************************************************
319 * Execute a Regular Expression - DFA engine *
320 *************************************************/
322 /* This internal function applies a compiled pattern to a subject string,
323 starting at a given point, using a DFA engine. This function is called from the
324 external one, possibly multiple times if the pattern is not anchored. The
325 function calls itself recursively for some kinds of subpattern.
327 Arguments:
328 md the match_data block with fixed information
329 this_start_code the opening bracket of this subexpression's code
330 current_subject where we currently are in the subject string
331 start_offset start offset in the subject string
332 offsets vector to contain the matching string offsets
333 offsetcount size of same
334 workspace vector of workspace
335 wscount size of same
336 rlevel function call recursion level
338 Returns: > 0 => number of match offset pairs placed in offsets
339 = 0 => offsets overflowed; longest matches are present
340 -1 => failed to match
341 < -1 => some kind of unexpected problem
343 The following macros are used for adding states to the two state vectors (one
344 for the current character, one for the following character). */
/* Append a state with opcode offset x and repeat count y to the list of
active states (the data field is not written). If active_count has already
reached wscount, nothing is stored and the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. The expansion is a single if/else statement without its
final semicolon, which the invocation supplies. The braces around the guarded
statements are essential: without them only the first assignment would be
conditional and the "else" would have no matching "if". */

#define ADD_ACTIVE(x,y) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
/* Append a state with opcode offset x, repeat count y and extra data z to the
list of active states. If active_count has already reached wscount, nothing is
stored and the enclosing function returns PCRE_ERROR_DFA_WSSIZE. The expansion
is a single if/else statement without its final semicolon, which the
invocation supplies. The braces around the guarded statements are essential:
without them only the first assignment would be conditional and the "else"
would have no matching "if". */

#define ADD_ACTIVE_DATA(x,y,z) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->data   = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
/* Append a state with opcode offset x and repeat count y to the list of new
states, i.e. those for the following character (the data field is not
written). If new_count has already reached wscount, nothing is stored and the
enclosing function returns PCRE_ERROR_DFA_WSSIZE. The expansion is a single
if/else statement without its final semicolon, which the invocation supplies.
The braces around the guarded statements are essential: without them only the
first assignment would be conditional and the "else" would have no matching
"if". */

#define ADD_NEW(x,y) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
/* Append a state with opcode offset x, repeat count y and extra data z to the
list of new states, i.e. those for the following character. If new_count has
already reached wscount, nothing is stored and the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. The expansion is a single if/else statement without its
final semicolon, which the invocation supplies. The braces around the guarded
statements are essential: without them only the first assignment would be
conditional and the "else" would have no matching "if". */

#define ADD_NEW_DATA(x,y,z) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->data   = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
      (x), (y), (z), __LINE__)); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
389 /* And now, here is the code */
391 static int
392 internal_dfa_exec(
393 dfa_match_data *md,
394 const pcre_uchar *this_start_code,
395 const pcre_uchar *current_subject,
396 int start_offset,
397 int *offsets,
398 int offsetcount,
399 int *workspace,
400 int wscount,
401 int rlevel)
403 stateblock *active_states, *new_states, *temp_states;
404 stateblock *next_active_state, *next_new_state;
406 const pcre_uint8 *ctypes, *lcc, *fcc;
407 const pcre_uchar *ptr;
408 const pcre_uchar *end_code, *first_op;
410 dfa_recursion_info new_recursive;
412 int active_count, new_count, match_count;
414 /* Some fields in the md block are frequently referenced, so we load them into
415 independent variables in the hope that this will perform better. */
417 const pcre_uchar *start_subject = md->start_subject;
418 const pcre_uchar *end_subject = md->end_subject;
419 const pcre_uchar *start_code = md->start_code;
421 #ifdef SUPPORT_UTF
422 BOOL utf = (md->poptions & PCRE_UTF8) != 0;
423 #else
424 BOOL utf = FALSE;
425 #endif
427 BOOL reset_could_continue = FALSE;
429 rlevel++;
430 offsetcount &= (-2);
432 wscount -= 2;
433 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
434 (2 * INTS_PER_STATEBLOCK);
436 DPRINTF(("\n%.*s---------------------\n"
437 "%.*sCall to internal_dfa_exec f=%d\n",
438 rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
440 ctypes = md->tables + ctypes_offset;
441 lcc = md->tables + lcc_offset;
442 fcc = md->tables + fcc_offset;
444 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
446 active_states = (stateblock *)(workspace + 2);
447 next_new_state = new_states = active_states + wscount;
448 new_count = 0;
450 first_op = this_start_code + 1 + LINK_SIZE +
451 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
452 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
453 ? IMM2_SIZE:0);
455 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
456 the alternative states onto the list, and find out where the end is. This
457 makes it possible to use this function recursively, when we want to stop at a
458 matching internal ket rather than at the end.
460 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
461 a backward assertion. In that case, we have to find out the maximum amount to
462 move back, and set up each alternative appropriately. */
464 if (*first_op == OP_REVERSE)
466 int max_back = 0;
467 int gone_back;
469 end_code = this_start_code;
472 int back = GET(end_code, 2+LINK_SIZE);
473 if (back > max_back) max_back = back;
474 end_code += GET(end_code, 1);
476 while (*end_code == OP_ALT);
478 /* If we can't go back the amount required for the longest lookbehind
479 pattern, go back as far as we can; some alternatives may still be viable. */
481 #ifdef SUPPORT_UTF
482 /* In character mode we have to step back character by character */
484 if (utf)
486 for (gone_back = 0; gone_back < max_back; gone_back++)
488 if (current_subject <= start_subject) break;
489 current_subject--;
490 ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
493 else
494 #endif
496 /* In byte-mode we can do this quickly. */
499 gone_back = (current_subject - max_back < start_subject)?
500 (int)(current_subject - start_subject) : max_back;
501 current_subject -= gone_back;
504 /* Save the earliest consulted character */
506 if (current_subject < md->start_used_ptr)
507 md->start_used_ptr = current_subject;
509 /* Now we can process the individual branches. */
511 end_code = this_start_code;
514 int back = GET(end_code, 2+LINK_SIZE);
515 if (back <= gone_back)
517 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
518 ADD_NEW_DATA(-bstate, 0, gone_back - back);
520 end_code += GET(end_code, 1);
522 while (*end_code == OP_ALT);
525 /* This is the code for a "normal" subpattern (not a backward assertion). The
526 start of a whole pattern is always one of these. If we are at the top level,
527 we may be asked to restart matching from the same point that we reached for a
528 previous partial match. We still have to scan through the top-level branches to
529 find the end state. */
531 else
533 end_code = this_start_code;
535 /* Restarting */
537 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
539 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
540 new_count = workspace[1];
541 if (!workspace[0])
542 memcpy(new_states, active_states, new_count * sizeof(stateblock));
545 /* Not restarting */
547 else
549 int length = 1 + LINK_SIZE +
550 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
551 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
552 ? IMM2_SIZE:0);
555 ADD_NEW((int)(end_code - start_code + length), 0);
556 end_code += GET(end_code, 1);
557 length = 1 + LINK_SIZE;
559 while (*end_code == OP_ALT);
563 workspace[0] = 0; /* Bit indicating which vector is current */
565 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
567 /* Loop for scanning the subject */
569 ptr = current_subject;
570 for (;;)
572 int i, j;
573 int clen, dlen;
574 pcre_uint32 c, d;
575 int forced_fail = 0;
576 BOOL partial_newline = FALSE;
577 BOOL could_continue = reset_could_continue;
578 reset_could_continue = FALSE;
580 /* Make the new state list into the active state list and empty the
581 new state list. */
583 temp_states = active_states;
584 active_states = new_states;
585 new_states = temp_states;
586 active_count = new_count;
587 new_count = 0;
589 workspace[0] ^= 1; /* Remember for the restarting feature */
590 workspace[1] = active_count;
592 #ifdef PCRE_DEBUG
593 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
594 pchars(ptr, STRLEN_UC(ptr), stdout);
595 printf("\"\n");
597 printf("%.*sActive states: ", rlevel*2-2, SP);
598 for (i = 0; i < active_count; i++)
599 printf("%d/%d ", active_states[i].offset, active_states[i].count);
600 printf("\n");
601 #endif
603 /* Set the pointers for adding new states */
605 next_active_state = active_states + active_count;
606 next_new_state = new_states;
608 /* Load the current character from the subject outside the loop, as many
609 different states may want to look at it, and we assume that at least one
610 will. */
612 if (ptr < end_subject)
614 clen = 1; /* Number of data items in the character */
615 #ifdef SUPPORT_UTF
616 GETCHARLENTEST(c, ptr, clen);
617 #else
618 c = *ptr;
619 #endif /* SUPPORT_UTF */
621 else
623 clen = 0; /* This indicates the end of the subject */
624 c = NOTACHAR; /* This value should never actually be used */
627 /* Scan up the active states and act on each one. The result of an action
628 may be to add more states to the currently active list (e.g. on hitting a
629 parenthesis) or it may be to put states on the new list, for considering
630 when we move the character pointer on. */
632 for (i = 0; i < active_count; i++)
634 stateblock *current_state = active_states + i;
635 BOOL caseless = FALSE;
636 const pcre_uchar *code;
637 int state_offset = current_state->offset;
638 int codevalue, rrc;
639 unsigned int count;
641 #ifdef PCRE_DEBUG
642 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
643 if (clen == 0) printf("EOL\n");
644 else if (c > 32 && c < 127) printf("'%c'\n", c);
645 else printf("0x%02x\n", c);
646 #endif
648 /* A negative offset is a special case meaning "hold off going to this
649 (negated) state until the number of characters in the data field have
650 been skipped". If the could_continue flag was passed over from a previous
651 state, arrange for it to be passed on. */
653 if (state_offset < 0)
655 if (current_state->data > 0)
657 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
658 ADD_NEW_DATA(state_offset, current_state->count,
659 current_state->data - 1);
660 if (could_continue) reset_could_continue = TRUE;
661 continue;
663 else
665 current_state->offset = state_offset = -state_offset;
669 /* Check for a duplicate state with the same count, and skip if found.
670 See the note at the head of this module about the possibility of improving
671 performance here. */
673 for (j = 0; j < i; j++)
675 if (active_states[j].offset == state_offset &&
676 active_states[j].count == current_state->count)
678 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
679 goto NEXT_ACTIVE_STATE;
683 /* The state offset is the offset to the opcode */
685 code = start_code + state_offset;
686 codevalue = *code;
688 /* If this opcode inspects a character, but we are at the end of the
689 subject, remember the fact for use when testing for a partial match. */
691 if (clen == 0 && poptable[codevalue] != 0)
692 could_continue = TRUE;
694 /* If this opcode is followed by an inline character, load it. It is
695 tempting to test for the presence of a subject character here, but that
696 is wrong, because sometimes zero repetitions of the subject are
697 permitted.
699 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
700 argument that is not a data character - but is always one byte long because
701 the values are small. We have to take special action to deal with \P, \p,
702 \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
703 these ones to new opcodes. */
705 if (coptable[codevalue] > 0)
707 dlen = 1;
708 #ifdef SUPPORT_UTF
709 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
710 #endif /* SUPPORT_UTF */
711 d = code[coptable[codevalue]];
712 if (codevalue >= OP_TYPESTAR)
714 switch(d)
716 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
717 case OP_NOTPROP:
718 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
719 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
720 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
721 case OP_NOT_HSPACE:
722 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
723 case OP_NOT_VSPACE:
724 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
725 default: break;
729 else
731 dlen = 0; /* Not strictly necessary, but compilers moan */
732 d = NOTACHAR; /* if these variables are not set. */
736 /* Now process the individual opcodes */
738 switch (codevalue)
740 /* ========================================================================== */
741 /* These cases are never obeyed. This is a fudge that causes a compile-
742 time error if the vectors coptable or poptable, which are indexed by
743 opcode, are not the correct length. It seems to be the only way to do
744 such a check at compile time, as the sizeof() operator does not work
745 in the C preprocessor. */
747 case OP_TABLE_LENGTH:
748 case OP_TABLE_LENGTH +
749 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
750 (sizeof(poptable) == OP_TABLE_LENGTH)):
751 break;
753 /* ========================================================================== */
754 /* Reached a closing bracket. If not at the end of the pattern, carry
755 on with the next opcode. For repeating opcodes, also add the repeat
756 state. Note that KETRPOS will always be encountered at the end of the
757 subpattern, because the possessive subpattern repeats are always handled
758 using recursive calls. Thus, it never adds any new states.
760 At the end of the (sub)pattern, unless we have an empty string and
761 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
762 start of the subject, save the match data, shifting up all previous
763 matches so we always have the longest first. */
765 case OP_KET:
766 case OP_KETRMIN:
767 case OP_KETRMAX:
768 case OP_KETRPOS:
769 if (code != end_code)
771 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
772 if (codevalue != OP_KET)
774 ADD_ACTIVE(state_offset - GET(code, 1), 0);
777 else
779 if (ptr > current_subject ||
780 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
781 ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
782 current_subject > start_subject + md->start_offset)))
784 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
785 else if (match_count > 0 && ++match_count * 2 > offsetcount)
786 match_count = 0;
787 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
788 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
789 if (offsetcount >= 2)
791 offsets[0] = (int)(current_subject - start_subject);
792 offsets[1] = (int)(ptr - start_subject);
793 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
794 offsets[1] - offsets[0], (char *)current_subject));
796 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
798 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
799 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
800 match_count, rlevel*2-2, SP));
801 return match_count;
805 break;
807 /* ========================================================================== */
808 /* These opcodes add to the current list of states without looking
809 at the current character. */
811 /*-----------------------------------------------------------------*/
812 case OP_ALT:
813 do { code += GET(code, 1); } while (*code == OP_ALT);
814 ADD_ACTIVE((int)(code - start_code), 0);
815 break;
817 /*-----------------------------------------------------------------*/
818 case OP_BRA:
819 case OP_SBRA:
822 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
823 code += GET(code, 1);
825 while (*code == OP_ALT);
826 break;
828 /*-----------------------------------------------------------------*/
829 case OP_CBRA:
830 case OP_SCBRA:
831 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
832 code += GET(code, 1);
833 while (*code == OP_ALT)
835 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
836 code += GET(code, 1);
838 break;
840 /*-----------------------------------------------------------------*/
841 case OP_BRAZERO:
842 case OP_BRAMINZERO:
843 ADD_ACTIVE(state_offset + 1, 0);
844 code += 1 + GET(code, 2);
845 while (*code == OP_ALT) code += GET(code, 1);
846 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
847 break;
849 /*-----------------------------------------------------------------*/
850 case OP_SKIPZERO:
851 code += 1 + GET(code, 2);
852 while (*code == OP_ALT) code += GET(code, 1);
853 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
854 break;
856 /*-----------------------------------------------------------------*/
857 case OP_CIRC:
858 if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
859 { ADD_ACTIVE(state_offset + 1, 0); }
860 break;
862 /*-----------------------------------------------------------------*/
863 case OP_CIRCM:
864 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
865 (ptr != end_subject && WAS_NEWLINE(ptr)))
866 { ADD_ACTIVE(state_offset + 1, 0); }
867 break;
869 /*-----------------------------------------------------------------*/
870 case OP_EOD:
871 if (ptr >= end_subject)
873 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
874 could_continue = TRUE;
875 else { ADD_ACTIVE(state_offset + 1, 0); }
877 break;
879 /*-----------------------------------------------------------------*/
880 case OP_SOD:
881 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
882 break;
884 /*-----------------------------------------------------------------*/
885 case OP_SOM:
886 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
887 break;
890 /* ========================================================================== */
891 /* These opcodes inspect the next subject character, and sometimes
892 the previous one as well, but do not have an argument. The variable
893 clen contains the length of the current character and is zero if we are
894 at the end of the subject. */
896 /*-----------------------------------------------------------------*/
897 case OP_ANY:
898 if (clen > 0 && !IS_NEWLINE(ptr))
900 if (ptr + 1 >= md->end_subject &&
901 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
902 NLBLOCK->nltype == NLTYPE_FIXED &&
903 NLBLOCK->nllen == 2 &&
904 c == NLBLOCK->nl[0])
906 could_continue = partial_newline = TRUE;
908 else
910 ADD_NEW(state_offset + 1, 0);
913 break;
915 /*-----------------------------------------------------------------*/
916 case OP_ALLANY:
917 if (clen > 0)
918 { ADD_NEW(state_offset + 1, 0); }
919 break;
921 /*-----------------------------------------------------------------*/
922 case OP_EODN:
923 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
924 could_continue = TRUE;
925 else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
926 { ADD_ACTIVE(state_offset + 1, 0); }
927 break;
929 /*-----------------------------------------------------------------*/
930 case OP_DOLL:
931 if ((md->moptions & PCRE_NOTEOL) == 0)
933 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
934 could_continue = TRUE;
935 else if (clen == 0 ||
936 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
937 (ptr == end_subject - md->nllen)
939 { ADD_ACTIVE(state_offset + 1, 0); }
940 else if (ptr + 1 >= md->end_subject &&
941 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
942 NLBLOCK->nltype == NLTYPE_FIXED &&
943 NLBLOCK->nllen == 2 &&
944 c == NLBLOCK->nl[0])
946 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
948 reset_could_continue = TRUE;
949 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
951 else could_continue = partial_newline = TRUE;
954 break;
956 /*-----------------------------------------------------------------*/
957 case OP_DOLLM:
958 if ((md->moptions & PCRE_NOTEOL) == 0)
960 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
961 could_continue = TRUE;
962 else if (clen == 0 ||
963 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
964 { ADD_ACTIVE(state_offset + 1, 0); }
965 else if (ptr + 1 >= md->end_subject &&
966 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
967 NLBLOCK->nltype == NLTYPE_FIXED &&
968 NLBLOCK->nllen == 2 &&
969 c == NLBLOCK->nl[0])
971 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
973 reset_could_continue = TRUE;
974 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
976 else could_continue = partial_newline = TRUE;
979 else if (IS_NEWLINE(ptr))
980 { ADD_ACTIVE(state_offset + 1, 0); }
981 break;
983 /*-----------------------------------------------------------------*/
985 case OP_DIGIT:
986 case OP_WHITESPACE:
987 case OP_WORDCHAR:
988 if (clen > 0 && c < 256 &&
989 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
990 { ADD_NEW(state_offset + 1, 0); }
991 break;
993 /*-----------------------------------------------------------------*/
994 case OP_NOT_DIGIT:
995 case OP_NOT_WHITESPACE:
996 case OP_NOT_WORDCHAR:
997 if (clen > 0 && (c >= 256 ||
998 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
999 { ADD_NEW(state_offset + 1, 0); }
1000 break;
1002 /*-----------------------------------------------------------------*/
1003 case OP_WORD_BOUNDARY:
1004 case OP_NOT_WORD_BOUNDARY:
1006 int left_word, right_word;
1008 if (ptr > start_subject)
1010 const pcre_uchar *temp = ptr - 1;
1011 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1012 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1013 if (utf) { BACKCHAR(temp); }
1014 #endif
1015 GETCHARTEST(d, temp);
1016 #ifdef SUPPORT_UCP
1017 if ((md->poptions & PCRE_UCP) != 0)
1019 if (d == '_') left_word = TRUE; else
1021 int cat = UCD_CATEGORY(d);
1022 left_word = (cat == ucp_L || cat == ucp_N);
1025 else
1026 #endif
1027 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1029 else left_word = FALSE;
1031 if (clen > 0)
1033 #ifdef SUPPORT_UCP
1034 if ((md->poptions & PCRE_UCP) != 0)
1036 if (c == '_') right_word = TRUE; else
1038 int cat = UCD_CATEGORY(c);
1039 right_word = (cat == ucp_L || cat == ucp_N);
1042 else
1043 #endif
1044 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1046 else right_word = FALSE;
1048 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1049 { ADD_ACTIVE(state_offset + 1, 0); }
1051 break;
1054 /*-----------------------------------------------------------------*/
1055 /* Check the next character by Unicode property. We will get here only
1056 if the support is in the binary; otherwise a compile-time error occurs.
1059 #ifdef SUPPORT_UCP
1060 case OP_PROP:
1061 case OP_NOTPROP:
1062 if (clen > 0)
1064 BOOL OK;
1065 const pcre_uint32 *cp;
1066 const ucd_record * prop = GET_UCD(c);
1067 switch(code[1])
1069 case PT_ANY:
1070 OK = TRUE;
1071 break;
1073 case PT_LAMP:
1074 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1075 prop->chartype == ucp_Lt;
1076 break;
1078 case PT_GC:
1079 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1080 break;
1082 case PT_PC:
1083 OK = prop->chartype == code[2];
1084 break;
1086 case PT_SC:
1087 OK = prop->script == code[2];
1088 break;
1090 /* These are specials for combination cases. */
1092 case PT_ALNUM:
1093 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1094 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1095 break;
1097 case PT_SPACE: /* Perl space */
1098 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1099 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1100 break;
1102 case PT_PXSPACE: /* POSIX space */
1103 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1104 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1105 c == CHAR_FF || c == CHAR_CR;
1106 break;
1108 case PT_WORD:
1109 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1110 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1111 c == CHAR_UNDERSCORE;
1112 break;
1114 case PT_CLIST:
1115 cp = PRIV(ucd_caseless_sets) + code[2];
1116 for (;;)
1118 if (c < *cp) { OK = FALSE; break; }
1119 if (c == *cp++) { OK = TRUE; break; }
1121 break;
1123 /* Should never occur, but keep compilers from grumbling. */
1125 default:
1126 OK = codevalue != OP_PROP;
1127 break;
1130 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1132 break;
1133 #endif
1137 /* ========================================================================== */
1138 /* These opcodes likewise inspect the subject character, but have an
1139 argument that is not a data character. It is one of these opcodes:
1140 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1141 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1143 case OP_TYPEPLUS:
1144 case OP_TYPEMINPLUS:
1145 case OP_TYPEPOSPLUS:
1146 count = current_state->count; /* Already matched */
1147 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1148 if (clen > 0)
1150 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1151 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1152 NLBLOCK->nltype == NLTYPE_FIXED &&
1153 NLBLOCK->nllen == 2 &&
1154 c == NLBLOCK->nl[0])
1156 could_continue = partial_newline = TRUE;
1158 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1159 (c < 256 &&
1160 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1161 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1163 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1165 active_count--; /* Remove non-match possibility */
1166 next_active_state--;
1168 count++;
1169 ADD_NEW(state_offset, count);
1172 break;
1174 /*-----------------------------------------------------------------*/
1175 case OP_TYPEQUERY:
1176 case OP_TYPEMINQUERY:
1177 case OP_TYPEPOSQUERY:
1178 ADD_ACTIVE(state_offset + 2, 0);
1179 if (clen > 0)
1181 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1182 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1183 NLBLOCK->nltype == NLTYPE_FIXED &&
1184 NLBLOCK->nllen == 2 &&
1185 c == NLBLOCK->nl[0])
1187 could_continue = partial_newline = TRUE;
1189 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1190 (c < 256 &&
1191 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1192 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1194 if (codevalue == OP_TYPEPOSQUERY)
1196 active_count--; /* Remove non-match possibility */
1197 next_active_state--;
1199 ADD_NEW(state_offset + 2, 0);
1202 break;
1204 /*-----------------------------------------------------------------*/
1205 case OP_TYPESTAR:
1206 case OP_TYPEMINSTAR:
1207 case OP_TYPEPOSSTAR:
1208 ADD_ACTIVE(state_offset + 2, 0);
1209 if (clen > 0)
1211 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1212 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1213 NLBLOCK->nltype == NLTYPE_FIXED &&
1214 NLBLOCK->nllen == 2 &&
1215 c == NLBLOCK->nl[0])
1217 could_continue = partial_newline = TRUE;
1219 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1220 (c < 256 &&
1221 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1222 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1224 if (codevalue == OP_TYPEPOSSTAR)
1226 active_count--; /* Remove non-match possibility */
1227 next_active_state--;
1229 ADD_NEW(state_offset, 0);
1232 break;
1234 /*-----------------------------------------------------------------*/
1235 case OP_TYPEEXACT:
1236 count = current_state->count; /* Number already matched */
1237 if (clen > 0)
1239 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1240 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1241 NLBLOCK->nltype == NLTYPE_FIXED &&
1242 NLBLOCK->nllen == 2 &&
1243 c == NLBLOCK->nl[0])
1245 could_continue = partial_newline = TRUE;
1247 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1248 (c < 256 &&
1249 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1250 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1252 if (++count >= GET2(code, 1))
1253 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1254 else
1255 { ADD_NEW(state_offset, count); }
1258 break;
1260 /*-----------------------------------------------------------------*/
1261 case OP_TYPEUPTO:
1262 case OP_TYPEMINUPTO:
1263 case OP_TYPEPOSUPTO:
1264 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1265 count = current_state->count; /* Number already matched */
1266 if (clen > 0)
1268 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1269 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1270 NLBLOCK->nltype == NLTYPE_FIXED &&
1271 NLBLOCK->nllen == 2 &&
1272 c == NLBLOCK->nl[0])
1274 could_continue = partial_newline = TRUE;
1276 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1277 (c < 256 &&
1278 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1279 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1281 if (codevalue == OP_TYPEPOSUPTO)
1283 active_count--; /* Remove non-match possibility */
1284 next_active_state--;
1286 if (++count >= GET2(code, 1))
1287 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1288 else
1289 { ADD_NEW(state_offset, count); }
1292 break;
1294 /* ========================================================================== */
1295 /* These are virtual opcodes that are used when something like
1296 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1297 argument. It keeps the code above fast for the other cases. The argument
1298 is in the d variable. */
1300 #ifdef SUPPORT_UCP
1301 case OP_PROP_EXTRA + OP_TYPEPLUS:
1302 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1303 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1304 count = current_state->count; /* Already matched */
1305 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1306 if (clen > 0)
1308 BOOL OK;
1309 const pcre_uint32 *cp;
1310 const ucd_record * prop = GET_UCD(c);
1311 switch(code[2])
1313 case PT_ANY:
1314 OK = TRUE;
1315 break;
1317 case PT_LAMP:
1318 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1319 prop->chartype == ucp_Lt;
1320 break;
1322 case PT_GC:
1323 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1324 break;
1326 case PT_PC:
1327 OK = prop->chartype == code[3];
1328 break;
1330 case PT_SC:
1331 OK = prop->script == code[3];
1332 break;
1334 /* These are specials for combination cases. */
1336 case PT_ALNUM:
1337 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1338 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1339 break;
1341 case PT_SPACE: /* Perl space */
1342 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1343 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1344 break;
1346 case PT_PXSPACE: /* POSIX space */
1347 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1348 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1349 c == CHAR_FF || c == CHAR_CR;
1350 break;
1352 case PT_WORD:
1353 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1354 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1355 c == CHAR_UNDERSCORE;
1356 break;
1358 case PT_CLIST:
1359 cp = PRIV(ucd_caseless_sets) + code[3];
1360 for (;;)
1362 if (c < *cp) { OK = FALSE; break; }
1363 if (c == *cp++) { OK = TRUE; break; }
1365 break;
1367 /* Should never occur, but keep compilers from grumbling. */
1369 default:
1370 OK = codevalue != OP_PROP;
1371 break;
1374 if (OK == (d == OP_PROP))
1376 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1378 active_count--; /* Remove non-match possibility */
1379 next_active_state--;
1381 count++;
1382 ADD_NEW(state_offset, count);
1385 break;
1387 /*-----------------------------------------------------------------*/
1388 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1389 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1390 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1391 count = current_state->count; /* Already matched */
1392 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1393 if (clen > 0)
1395 int lgb, rgb;
1396 const pcre_uchar *nptr = ptr + clen;
1397 int ncount = 0;
1398 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1400 active_count--; /* Remove non-match possibility */
1401 next_active_state--;
1403 lgb = UCD_GRAPHBREAK(c);
1404 while (nptr < end_subject)
1406 dlen = 1;
1407 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1408 rgb = UCD_GRAPHBREAK(d);
1409 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1410 ncount++;
1411 lgb = rgb;
1412 nptr += dlen;
1414 count++;
1415 ADD_NEW_DATA(-state_offset, count, ncount);
1417 break;
1418 #endif
1420 /*-----------------------------------------------------------------*/
1421 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1422 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1423 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1424 count = current_state->count; /* Already matched */
1425 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1426 if (clen > 0)
1428 int ncount = 0;
1429 switch (c)
1431 case CHAR_VT:
1432 case CHAR_FF:
1433 case CHAR_NEL:
1434 #ifndef EBCDIC
1435 case 0x2028:
1436 case 0x2029:
1437 #endif /* Not EBCDIC */
1438 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1439 goto ANYNL01;
1441 case CHAR_CR:
1442 if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1443 /* Fall through */
1445 ANYNL01:
1446 case CHAR_LF:
1447 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1449 active_count--; /* Remove non-match possibility */
1450 next_active_state--;
1452 count++;
1453 ADD_NEW_DATA(-state_offset, count, ncount);
1454 break;
1456 default:
1457 break;
1460 break;
1462 /*-----------------------------------------------------------------*/
1463 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1464 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1465 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1466 count = current_state->count; /* Already matched */
1467 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1468 if (clen > 0)
1470 BOOL OK;
1471 switch (c)
1473 VSPACE_CASES:
1474 OK = TRUE;
1475 break;
1477 default:
1478 OK = FALSE;
1479 break;
1482 if (OK == (d == OP_VSPACE))
1484 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1486 active_count--; /* Remove non-match possibility */
1487 next_active_state--;
1489 count++;
1490 ADD_NEW_DATA(-state_offset, count, 0);
1493 break;
1495 /*-----------------------------------------------------------------*/
1496 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1497 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1498 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1499 count = current_state->count; /* Already matched */
1500 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1501 if (clen > 0)
1503 BOOL OK;
1504 switch (c)
1506 HSPACE_CASES:
1507 OK = TRUE;
1508 break;
1510 default:
1511 OK = FALSE;
1512 break;
1515 if (OK == (d == OP_HSPACE))
1517 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1519 active_count--; /* Remove non-match possibility */
1520 next_active_state--;
1522 count++;
1523 ADD_NEW_DATA(-state_offset, count, 0);
1526 break;
1528 /*-----------------------------------------------------------------*/
1529 #ifdef SUPPORT_UCP
1530 case OP_PROP_EXTRA + OP_TYPEQUERY:
1531 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1532 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1533 count = 4;
1534 goto QS1;
1536 case OP_PROP_EXTRA + OP_TYPESTAR:
1537 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1538 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1539 count = 0;
1541 QS1:
1543 ADD_ACTIVE(state_offset + 4, 0);
1544 if (clen > 0)
1546 BOOL OK;
1547 const pcre_uint32 *cp;
1548 const ucd_record * prop = GET_UCD(c);
1549 switch(code[2])
1551 case PT_ANY:
1552 OK = TRUE;
1553 break;
1555 case PT_LAMP:
1556 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1557 prop->chartype == ucp_Lt;
1558 break;
1560 case PT_GC:
1561 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1562 break;
1564 case PT_PC:
1565 OK = prop->chartype == code[3];
1566 break;
1568 case PT_SC:
1569 OK = prop->script == code[3];
1570 break;
1572 /* These are specials for combination cases. */
1574 case PT_ALNUM:
1575 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1576 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1577 break;
1579 case PT_SPACE: /* Perl space */
1580 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1581 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1582 break;
1584 case PT_PXSPACE: /* POSIX space */
1585 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1586 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1587 c == CHAR_FF || c == CHAR_CR;
1588 break;
1590 case PT_WORD:
1591 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1592 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1593 c == CHAR_UNDERSCORE;
1594 break;
1596 case PT_CLIST:
1597 cp = PRIV(ucd_caseless_sets) + code[3];
1598 for (;;)
1600 if (c < *cp) { OK = FALSE; break; }
1601 if (c == *cp++) { OK = TRUE; break; }
1603 break;
1605 /* Should never occur, but keep compilers from grumbling. */
1607 default:
1608 OK = codevalue != OP_PROP;
1609 break;
1612 if (OK == (d == OP_PROP))
1614 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1615 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1617 active_count--; /* Remove non-match possibility */
1618 next_active_state--;
1620 ADD_NEW(state_offset + count, 0);
1623 break;
1625 /*-----------------------------------------------------------------*/
1626 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1627 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1628 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1629 count = 2;
1630 goto QS2;
1632 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1633 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1634 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1635 count = 0;
1637 QS2:
1639 ADD_ACTIVE(state_offset + 2, 0);
1640 if (clen > 0)
1642 int lgb, rgb;
1643 const pcre_uchar *nptr = ptr + clen;
1644 int ncount = 0;
1645 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1646 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1648 active_count--; /* Remove non-match possibility */
1649 next_active_state--;
1651 lgb = UCD_GRAPHBREAK(c);
1652 while (nptr < end_subject)
1654 dlen = 1;
1655 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1656 rgb = UCD_GRAPHBREAK(d);
1657 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1658 ncount++;
1659 lgb = rgb;
1660 nptr += dlen;
1662 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1664 break;
1665 #endif
1667 /*-----------------------------------------------------------------*/
1668 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1669 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1670 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1671 count = 2;
1672 goto QS3;
1674 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1675 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1676 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1677 count = 0;
1679 QS3:
1680 ADD_ACTIVE(state_offset + 2, 0);
1681 if (clen > 0)
1683 int ncount = 0;
1684 switch (c)
1686 case CHAR_VT:
1687 case CHAR_FF:
1688 case CHAR_NEL:
1689 #ifndef EBCDIC
1690 case 0x2028:
1691 case 0x2029:
1692 #endif /* Not EBCDIC */
1693 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1694 goto ANYNL02;
1696 case CHAR_CR:
1697 if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1698 /* Fall through */
1700 ANYNL02:
1701 case CHAR_LF:
1702 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1703 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1705 active_count--; /* Remove non-match possibility */
1706 next_active_state--;
1708 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1709 break;
1711 default:
1712 break;
1715 break;
1717 /*-----------------------------------------------------------------*/
1718 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1719 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1720 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1721 count = 2;
1722 goto QS4;
1724 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1725 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1726 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1727 count = 0;
1729 QS4:
1730 ADD_ACTIVE(state_offset + 2, 0);
1731 if (clen > 0)
1733 BOOL OK;
1734 switch (c)
1736 VSPACE_CASES:
1737 OK = TRUE;
1738 break;
1740 default:
1741 OK = FALSE;
1742 break;
1744 if (OK == (d == OP_VSPACE))
1746 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1747 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1749 active_count--; /* Remove non-match possibility */
1750 next_active_state--;
1752 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1755 break;
1757 /*-----------------------------------------------------------------*/
1758 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1759 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1760 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1761 count = 2;
1762 goto QS5;
1764 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1765 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1766 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1767 count = 0;
1769 QS5:
1770 ADD_ACTIVE(state_offset + 2, 0);
1771 if (clen > 0)
1773 BOOL OK;
1774 switch (c)
1776 HSPACE_CASES:
1777 OK = TRUE;
1778 break;
1780 default:
1781 OK = FALSE;
1782 break;
1785 if (OK == (d == OP_HSPACE))
1787 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1788 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1790 active_count--; /* Remove non-match possibility */
1791 next_active_state--;
1793 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1796 break;
1798 /*-----------------------------------------------------------------*/
1799 #ifdef SUPPORT_UCP
1800 case OP_PROP_EXTRA + OP_TYPEEXACT:
1801 case OP_PROP_EXTRA + OP_TYPEUPTO:
1802 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1803 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1804 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1805 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1806 count = current_state->count; /* Number already matched */
1807 if (clen > 0)
1809 BOOL OK;
1810 const pcre_uint32 *cp;
1811 const ucd_record * prop = GET_UCD(c);
1812 switch(code[1 + IMM2_SIZE + 1])
1814 case PT_ANY:
1815 OK = TRUE;
1816 break;
1818 case PT_LAMP:
1819 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1820 prop->chartype == ucp_Lt;
1821 break;
1823 case PT_GC:
1824 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1825 break;
1827 case PT_PC:
1828 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1829 break;
1831 case PT_SC:
1832 OK = prop->script == code[1 + IMM2_SIZE + 2];
1833 break;
1835 /* These are specials for combination cases. */
1837 case PT_ALNUM:
1838 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1839 PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1840 break;
1842 case PT_SPACE: /* Perl space */
1843 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1844 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1845 break;
1847 case PT_PXSPACE: /* POSIX space */
1848 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1849 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1850 c == CHAR_FF || c == CHAR_CR;
1851 break;
1853 case PT_WORD:
1854 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1855 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1856 c == CHAR_UNDERSCORE;
1857 break;
1859 case PT_CLIST:
1860 cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1861 for (;;)
1863 if (c < *cp) { OK = FALSE; break; }
1864 if (c == *cp++) { OK = TRUE; break; }
1866 break;
1868 /* Should never occur, but keep compilers from grumbling. */
1870 default:
1871 OK = codevalue != OP_PROP;
1872 break;
1875 if (OK == (d == OP_PROP))
1877 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1879 active_count--; /* Remove non-match possibility */
1880 next_active_state--;
1882 if (++count >= GET2(code, 1))
1883 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1884 else
1885 { ADD_NEW(state_offset, count); }
1888 break;
1890 /*-----------------------------------------------------------------*/
1891 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1892 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1893 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1894 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1895 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1896 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1897 count = current_state->count; /* Number already matched */
1898 if (clen > 0)
1900 int lgb, rgb;
1901 const pcre_uchar *nptr = ptr + clen;
1902 int ncount = 0;
1903 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1905 active_count--; /* Remove non-match possibility */
1906 next_active_state--;
1908 lgb = UCD_GRAPHBREAK(c);
1909 while (nptr < end_subject)
1911 dlen = 1;
1912 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1913 rgb = UCD_GRAPHBREAK(d);
1914 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1915 ncount++;
1916 lgb = rgb;
1917 nptr += dlen;
1919 if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1920 reset_could_continue = TRUE;
1921 if (++count >= GET2(code, 1))
1922 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1923 else
1924 { ADD_NEW_DATA(-state_offset, count, ncount); }
1926 break;
1927 #endif
1929 /*-----------------------------------------------------------------*/
1930 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1931 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1932 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1933 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1934 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1935 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1936 count = current_state->count; /* Number already matched */
1937 if (clen > 0)
1939 int ncount = 0;
1940 switch (c)
1942 case CHAR_VT:
1943 case CHAR_FF:
1944 case CHAR_NEL:
1945 #ifndef EBCDIC
1946 case 0x2028:
1947 case 0x2029:
1948 #endif /* Not EBCDIC */
1949 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1950 goto ANYNL03;
1952 case CHAR_CR:
1953 if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1954 /* Fall through */
1956 ANYNL03:
1957 case CHAR_LF:
1958 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1960 active_count--; /* Remove non-match possibility */
1961 next_active_state--;
1963 if (++count >= GET2(code, 1))
1964 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1965 else
1966 { ADD_NEW_DATA(-state_offset, count, ncount); }
1967 break;
1969 default:
1970 break;
1973 break;
1975 /*-----------------------------------------------------------------*/
1976 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1977 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1978 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1979 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1980 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1981 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1982 count = current_state->count; /* Number already matched */
1983 if (clen > 0)
1985 BOOL OK;
1986 switch (c)
1988 VSPACE_CASES:
1989 OK = TRUE;
1990 break;
1992 default:
1993 OK = FALSE;
1996 if (OK == (d == OP_VSPACE))
1998 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2000 active_count--; /* Remove non-match possibility */
2001 next_active_state--;
2003 if (++count >= GET2(code, 1))
2004 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2005 else
2006 { ADD_NEW_DATA(-state_offset, count, 0); }
2009 break;
2011 /*-----------------------------------------------------------------*/
2012 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2013 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2014 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2015 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2016 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2017 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2018 count = current_state->count; /* Number already matched */
2019 if (clen > 0)
2021 BOOL OK;
2022 switch (c)
2024 HSPACE_CASES:
2025 OK = TRUE;
2026 break;
2028 default:
2029 OK = FALSE;
2030 break;
2033 if (OK == (d == OP_HSPACE))
2035 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2037 active_count--; /* Remove non-match possibility */
2038 next_active_state--;
2040 if (++count >= GET2(code, 1))
2041 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2042 else
2043 { ADD_NEW_DATA(-state_offset, count, 0); }
2046 break;
2048 /* ========================================================================== */
2049 /* These opcodes are followed by a character that is usually compared
2050 to the current subject character; it is loaded into d. We still get
2051 here even if there is no subject character, because in some cases zero
2052 repetitions are permitted. */
2054 /*-----------------------------------------------------------------*/
2055 case OP_CHAR:
2056 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2057 break;
2059 /*-----------------------------------------------------------------*/
2060 case OP_CHARI:
2061 if (clen == 0) break;
2063 #ifdef SUPPORT_UTF
2064 if (utf)
2066 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2068 unsigned int othercase;
2069 if (c < 128)
2070 othercase = fcc[c];
2071 else
2072 /* If we have Unicode property support, we can use it to test the
2073 other case of the character. */
2074 #ifdef SUPPORT_UCP
2075 othercase = UCD_OTHERCASE(c);
2076 #else
2077 othercase = NOTACHAR;
2078 #endif
2080 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2083 else
2084 #endif /* SUPPORT_UTF */
2085 /* Not UTF mode */
2087 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2088 { ADD_NEW(state_offset + 2, 0); }
2090 break;
2093 #ifdef SUPPORT_UCP
2094 /*-----------------------------------------------------------------*/
2095 /* This is a tricky one because it can match more than one character.
2096 Find out how many characters to skip, and then set up a negative state
2097 to wait for them to pass before continuing. */
2099 case OP_EXTUNI:
2100 if (clen > 0)
2102 int lgb, rgb;
2103 const pcre_uchar *nptr = ptr + clen;
2104 int ncount = 0;
2105 lgb = UCD_GRAPHBREAK(c);
2106 while (nptr < end_subject)
2108 dlen = 1;
2109 if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2110 rgb = UCD_GRAPHBREAK(d);
2111 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2112 ncount++;
2113 lgb = rgb;
2114 nptr += dlen;
2116 if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2117 reset_could_continue = TRUE;
2118 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2120 break;
2121 #endif
2123 /*-----------------------------------------------------------------*/
2124 /* This is tricky, like EXTUNI, because it too can match more than one
2125 character (when CR is followed by LF). In this case, set up a negative
2126 state to wait for one character to pass before continuing. */
2128 case OP_ANYNL:
2129 if (clen > 0) switch(c)
2131 case CHAR_VT:
2132 case CHAR_FF:
2133 case CHAR_NEL:
2134 #ifndef EBCDIC
2135 case 0x2028:
2136 case 0x2029:
2137 #endif /* Not EBCDIC */
2138 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2140 case CHAR_LF:
2141 ADD_NEW(state_offset + 1, 0);
2142 break;
2144 case CHAR_CR:
2145 if (ptr + 1 >= end_subject)
2147 ADD_NEW(state_offset + 1, 0);
2148 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2149 reset_could_continue = TRUE;
2151 else if (RAWUCHARTEST(ptr + 1) == CHAR_LF)
2153 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2155 else
2157 ADD_NEW(state_offset + 1, 0);
2159 break;
2161 break;
2163 /*-----------------------------------------------------------------*/
2164 case OP_NOT_VSPACE:
2165 if (clen > 0) switch(c)
2167 VSPACE_CASES:
2168 break;
2170 default:
2171 ADD_NEW(state_offset + 1, 0);
2172 break;
2174 break;
2176 /*-----------------------------------------------------------------*/
2177 case OP_VSPACE:
2178 if (clen > 0) switch(c)
2180 VSPACE_CASES:
2181 ADD_NEW(state_offset + 1, 0);
2182 break;
2184 default:
2185 break;
2187 break;
2189 /*-----------------------------------------------------------------*/
2190 case OP_NOT_HSPACE:
2191 if (clen > 0) switch(c)
2193 HSPACE_CASES:
2194 break;
2196 default:
2197 ADD_NEW(state_offset + 1, 0);
2198 break;
2200 break;
2202 /*-----------------------------------------------------------------*/
2203 case OP_HSPACE:
2204 if (clen > 0) switch(c)
2206 HSPACE_CASES:
2207 ADD_NEW(state_offset + 1, 0);
2208 break;
2210 default:
2211 break;
2213 break;
2215 /*-----------------------------------------------------------------*/
2216 /* Match a negated single character casefully. */
2218 case OP_NOT:
2219 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2220 break;
2222 /*-----------------------------------------------------------------*/
2223 /* Match a negated single character caselessly. */
2225 case OP_NOTI:
2226 if (clen > 0)
2228 unsigned int otherd;
2229 #ifdef SUPPORT_UTF
2230 if (utf && d >= 128)
2232 #ifdef SUPPORT_UCP
2233 otherd = UCD_OTHERCASE(d);
2234 #endif /* SUPPORT_UCP */
2236 else
2237 #endif /* SUPPORT_UTF */
2238 otherd = TABLE_GET(d, fcc, d);
2239 if (c != d && c != otherd)
2240 { ADD_NEW(state_offset + dlen + 1, 0); }
2242 break;
2244 /*-----------------------------------------------------------------*/
2245 case OP_PLUSI:
2246 case OP_MINPLUSI:
2247 case OP_POSPLUSI:
2248 case OP_NOTPLUSI:
2249 case OP_NOTMINPLUSI:
2250 case OP_NOTPOSPLUSI:
2251 caseless = TRUE;
2252 codevalue -= OP_STARI - OP_STAR;
2254 /* Fall through */
2255 case OP_PLUS:
2256 case OP_MINPLUS:
2257 case OP_POSPLUS:
2258 case OP_NOTPLUS:
2259 case OP_NOTMINPLUS:
2260 case OP_NOTPOSPLUS:
2261 count = current_state->count; /* Already matched */
2262 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2263 if (clen > 0)
2265 pcre_uint32 otherd = NOTACHAR;
2266 if (caseless)
2268 #ifdef SUPPORT_UTF
2269 if (utf && d >= 128)
2271 #ifdef SUPPORT_UCP
2272 otherd = UCD_OTHERCASE(d);
2273 #endif /* SUPPORT_UCP */
2275 else
2276 #endif /* SUPPORT_UTF */
2277 otherd = TABLE_GET(d, fcc, d);
2279 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2281 if (count > 0 &&
2282 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2284 active_count--; /* Remove non-match possibility */
2285 next_active_state--;
2287 count++;
2288 ADD_NEW(state_offset, count);
2291 break;
2293 /*-----------------------------------------------------------------*/
2294 case OP_QUERYI:
2295 case OP_MINQUERYI:
2296 case OP_POSQUERYI:
2297 case OP_NOTQUERYI:
2298 case OP_NOTMINQUERYI:
2299 case OP_NOTPOSQUERYI:
2300 caseless = TRUE;
2301 codevalue -= OP_STARI - OP_STAR;
2302 /* Fall through */
2303 case OP_QUERY:
2304 case OP_MINQUERY:
2305 case OP_POSQUERY:
2306 case OP_NOTQUERY:
2307 case OP_NOTMINQUERY:
2308 case OP_NOTPOSQUERY:
2309 ADD_ACTIVE(state_offset + dlen + 1, 0);
2310 if (clen > 0)
2312 pcre_uint32 otherd = NOTACHAR;
2313 if (caseless)
2315 #ifdef SUPPORT_UTF
2316 if (utf && d >= 128)
2318 #ifdef SUPPORT_UCP
2319 otherd = UCD_OTHERCASE(d);
2320 #endif /* SUPPORT_UCP */
2322 else
2323 #endif /* SUPPORT_UTF */
2324 otherd = TABLE_GET(d, fcc, d);
2326 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2328 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2330 active_count--; /* Remove non-match possibility */
2331 next_active_state--;
2333 ADD_NEW(state_offset + dlen + 1, 0);
2336 break;
2338 /*-----------------------------------------------------------------*/
2339 case OP_STARI:
2340 case OP_MINSTARI:
2341 case OP_POSSTARI:
2342 case OP_NOTSTARI:
2343 case OP_NOTMINSTARI:
2344 case OP_NOTPOSSTARI:
2345 caseless = TRUE;
2346 codevalue -= OP_STARI - OP_STAR;
2347 /* Fall through */
2348 case OP_STAR:
2349 case OP_MINSTAR:
2350 case OP_POSSTAR:
2351 case OP_NOTSTAR:
2352 case OP_NOTMINSTAR:
2353 case OP_NOTPOSSTAR:
2354 ADD_ACTIVE(state_offset + dlen + 1, 0);
2355 if (clen > 0)
2357 pcre_uint32 otherd = NOTACHAR;
2358 if (caseless)
2360 #ifdef SUPPORT_UTF
2361 if (utf && d >= 128)
2363 #ifdef SUPPORT_UCP
2364 otherd = UCD_OTHERCASE(d);
2365 #endif /* SUPPORT_UCP */
2367 else
2368 #endif /* SUPPORT_UTF */
2369 otherd = TABLE_GET(d, fcc, d);
2371 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2373 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2375 active_count--; /* Remove non-match possibility */
2376 next_active_state--;
2378 ADD_NEW(state_offset, 0);
2381 break;
2383 /*-----------------------------------------------------------------*/
2384 case OP_EXACTI:
2385 case OP_NOTEXACTI:
2386 caseless = TRUE;
2387 codevalue -= OP_STARI - OP_STAR;
2388 /* Fall through */
2389 case OP_EXACT:
2390 case OP_NOTEXACT:
2391 count = current_state->count; /* Number already matched */
2392 if (clen > 0)
2394 pcre_uint32 otherd = NOTACHAR;
2395 if (caseless)
2397 #ifdef SUPPORT_UTF
2398 if (utf && d >= 128)
2400 #ifdef SUPPORT_UCP
2401 otherd = UCD_OTHERCASE(d);
2402 #endif /* SUPPORT_UCP */
2404 else
2405 #endif /* SUPPORT_UTF */
2406 otherd = TABLE_GET(d, fcc, d);
2408 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2410 if (++count >= GET2(code, 1))
2411 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2412 else
2413 { ADD_NEW(state_offset, count); }
2416 break;
2418 /*-----------------------------------------------------------------*/
2419 case OP_UPTOI:
2420 case OP_MINUPTOI:
2421 case OP_POSUPTOI:
2422 case OP_NOTUPTOI:
2423 case OP_NOTMINUPTOI:
2424 case OP_NOTPOSUPTOI:
2425 caseless = TRUE;
2426 codevalue -= OP_STARI - OP_STAR;
2427 /* Fall through */
2428 case OP_UPTO:
2429 case OP_MINUPTO:
2430 case OP_POSUPTO:
2431 case OP_NOTUPTO:
2432 case OP_NOTMINUPTO:
2433 case OP_NOTPOSUPTO:
2434 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2435 count = current_state->count; /* Number already matched */
2436 if (clen > 0)
2438 pcre_uint32 otherd = NOTACHAR;
2439 if (caseless)
2441 #ifdef SUPPORT_UTF
2442 if (utf && d >= 128)
2444 #ifdef SUPPORT_UCP
2445 otherd = UCD_OTHERCASE(d);
2446 #endif /* SUPPORT_UCP */
2448 else
2449 #endif /* SUPPORT_UTF */
2450 otherd = TABLE_GET(d, fcc, d);
2452 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2454 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2456 active_count--; /* Remove non-match possibility */
2457 next_active_state--;
2459 if (++count >= GET2(code, 1))
2460 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2461 else
2462 { ADD_NEW(state_offset, count); }
2465 break;
2468 /* ========================================================================== */
2469 /* These are the class-handling opcodes */
2471 case OP_CLASS:
2472 case OP_NCLASS:
2473 case OP_XCLASS:
2475 BOOL isinclass = FALSE;
2476 int next_state_offset;
2477 const pcre_uchar *ecode;
2479 /* For a simple class, there is always just a 32-byte table, and we
2480 can set isinclass from it. */
2482 if (codevalue != OP_XCLASS)
2484 ecode = code + 1 + (32 / sizeof(pcre_uchar));
2485 if (clen > 0)
2487 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2488 ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2492 /* An extended class may have a table or a list of single characters,
2493 ranges, or both, and it may be positive or negative. There's a
2494 function that sorts all this out. */
2496 else
2498 ecode = code + GET(code, 1);
2499 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2502 /* At this point, isinclass is set for all kinds of class, and ecode
2503 points to the byte after the end of the class. If there is a
2504 quantifier, this is where it will be. */
2506 next_state_offset = (int)(ecode - start_code);
2508 switch (*ecode)
2510 case OP_CRSTAR:
2511 case OP_CRMINSTAR:
2512 ADD_ACTIVE(next_state_offset + 1, 0);
2513 if (isinclass) { ADD_NEW(state_offset, 0); }
2514 break;
2516 case OP_CRPLUS:
2517 case OP_CRMINPLUS:
2518 count = current_state->count; /* Already matched */
2519 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2520 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2521 break;
2523 case OP_CRQUERY:
2524 case OP_CRMINQUERY:
2525 ADD_ACTIVE(next_state_offset + 1, 0);
2526 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2527 break;
2529 case OP_CRRANGE:
2530 case OP_CRMINRANGE:
2531 count = current_state->count; /* Already matched */
2532 if (count >= GET2(ecode, 1))
2533 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2534 if (isinclass)
2536 unsigned int max = GET2(ecode, 1 + IMM2_SIZE);
2537 if (++count >= max && max != 0) /* Max 0 => no limit */
2538 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2539 else
2540 { ADD_NEW(state_offset, count); }
2542 break;
2544 default:
2545 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2546 break;
2549 break;
2551 /* ========================================================================== */
2552 /* These are the opcodes for fancy brackets of various kinds. We have
2553 to use recursion in order to handle them. The "always failing" assertion
2554 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2555 though the other "backtracking verbs" are not supported. */
2557 case OP_FAIL:
2558 forced_fail++; /* Count FAILs for multiple states */
2559 break;
2561 case OP_ASSERT:
2562 case OP_ASSERT_NOT:
2563 case OP_ASSERTBACK:
2564 case OP_ASSERTBACK_NOT:
2566 int rc;
2567 int local_offsets[2];
2568 int local_workspace[1000];
2569 const pcre_uchar *endasscode = code + GET(code, 1);
2571 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2573 rc = internal_dfa_exec(
2574 md, /* static match data */
2575 code, /* this subexpression's code */
2576 ptr, /* where we currently are */
2577 (int)(ptr - start_subject), /* start offset */
2578 local_offsets, /* offset vector */
2579 sizeof(local_offsets)/sizeof(int), /* size of same */
2580 local_workspace, /* workspace vector */
2581 sizeof(local_workspace)/sizeof(int), /* size of same */
2582 rlevel); /* function recursion level */
2584 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2585 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2586 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2588 break;
2590 /*-----------------------------------------------------------------*/
2591 case OP_COND:
2592 case OP_SCOND:
2594 int local_offsets[1000];
2595 int local_workspace[1000];
2596 int codelink = GET(code, 1);
2597 int condcode;
2599 /* Because of the way auto-callout works during compile, a callout item
2600 is inserted between OP_COND and an assertion condition. This does not
2601 happen for the other conditions. */
2603 if (code[LINK_SIZE+1] == OP_CALLOUT)
2605 rrc = 0;
2606 if (PUBL(callout) != NULL)
2608 PUBL(callout_block) cb;
2609 cb.version = 1; /* Version 1 of the callout block */
2610 cb.callout_number = code[LINK_SIZE+2];
2611 cb.offset_vector = offsets;
2612 #if defined COMPILE_PCRE8
2613 cb.subject = (PCRE_SPTR)start_subject;
2614 #elif defined COMPILE_PCRE16
2615 cb.subject = (PCRE_SPTR16)start_subject;
2616 #elif defined COMPILE_PCRE32
2617 cb.subject = (PCRE_SPTR32)start_subject;
2618 #endif
2619 cb.subject_length = (int)(end_subject - start_subject);
2620 cb.start_match = (int)(current_subject - start_subject);
2621 cb.current_position = (int)(ptr - start_subject);
2622 cb.pattern_position = GET(code, LINK_SIZE + 3);
2623 cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2624 cb.capture_top = 1;
2625 cb.capture_last = -1;
2626 cb.callout_data = md->callout_data;
2627 cb.mark = NULL; /* No (*MARK) support */
2628 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
2630 if (rrc > 0) break; /* Fail this thread */
2631 code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
2634 condcode = code[LINK_SIZE+1];
2636 /* Back reference conditions are not supported */
2638 if (condcode == OP_CREF || condcode == OP_NCREF)
2639 return PCRE_ERROR_DFA_UCOND;
2641 /* The DEFINE condition is always false */
2643 if (condcode == OP_DEF)
2644 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2646 /* The only supported version of OP_RREF is for the value RREF_ANY,
2647 which means "test if in any recursion". We can't test for specifically
2648 recursed groups. */
2650 else if (condcode == OP_RREF || condcode == OP_NRREF)
2652 int value = GET2(code, LINK_SIZE + 2);
2653 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2654 if (md->recursive != NULL)
2655 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2656 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2659 /* Otherwise, the condition is an assertion */
2661 else
2663 int rc;
2664 const pcre_uchar *asscode = code + LINK_SIZE + 1;
2665 const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2667 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2669 rc = internal_dfa_exec(
2670 md, /* fixed match data */
2671 asscode, /* this subexpression's code */
2672 ptr, /* where we currently are */
2673 (int)(ptr - start_subject), /* start offset */
2674 local_offsets, /* offset vector */
2675 sizeof(local_offsets)/sizeof(int), /* size of same */
2676 local_workspace, /* workspace vector */
2677 sizeof(local_workspace)/sizeof(int), /* size of same */
2678 rlevel); /* function recursion level */
2680 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2681 if ((rc >= 0) ==
2682 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2683 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2684 else
2685 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2688 break;
2690 /*-----------------------------------------------------------------*/
2691 case OP_RECURSE:
2693 dfa_recursion_info *ri;
2694 int local_offsets[1000];
2695 int local_workspace[1000];
2696 const pcre_uchar *callpat = start_code + GET(code, 1);
2697 int recno = (callpat == md->start_code)? 0 :
2698 GET2(callpat, 1 + LINK_SIZE);
2699 int rc;
2701 DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2703 /* Check for repeating a recursion without advancing the subject
2704 pointer. This should catch convoluted mutual recursions. (Some simple
2705 cases are caught at compile time.) */
2707 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2708 if (recno == ri->group_num && ptr == ri->subject_position)
2709 return PCRE_ERROR_RECURSELOOP;
2711 /* Remember this recursion and where we started it so as to
2712 catch infinite loops. */
2714 new_recursive.group_num = recno;
2715 new_recursive.subject_position = ptr;
2716 new_recursive.prevrec = md->recursive;
2717 md->recursive = &new_recursive;
2719 rc = internal_dfa_exec(
2720 md, /* fixed match data */
2721 callpat, /* this subexpression's code */
2722 ptr, /* where we currently are */
2723 (int)(ptr - start_subject), /* start offset */
2724 local_offsets, /* offset vector */
2725 sizeof(local_offsets)/sizeof(int), /* size of same */
2726 local_workspace, /* workspace vector */
2727 sizeof(local_workspace)/sizeof(int), /* size of same */
2728 rlevel); /* function recursion level */
2730 md->recursive = new_recursive.prevrec; /* Done this recursion */
2732 DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2733 rc));
2735 /* Ran out of internal offsets */
2737 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2739 /* For each successful matched substring, set up the next state with a
2740 count of characters to skip before trying it. Note that the count is in
2741 characters, not bytes. */
2743 if (rc > 0)
2745 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2747 int charcount = local_offsets[rc+1] - local_offsets[rc];
2748 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2749 if (utf)
2751 const pcre_uchar *p = start_subject + local_offsets[rc];
2752 const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2753 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2755 #endif
2756 if (charcount > 0)
2758 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2760 else
2762 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2766 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2768 break;
2770 /*-----------------------------------------------------------------*/
2771 case OP_BRAPOS:
2772 case OP_SBRAPOS:
2773 case OP_CBRAPOS:
2774 case OP_SCBRAPOS:
2775 case OP_BRAPOSZERO:
2777 int charcount, matched_count;
2778 const pcre_uchar *local_ptr = ptr;
2779 BOOL allow_zero;
2781 if (codevalue == OP_BRAPOSZERO)
2783 allow_zero = TRUE;
2784 codevalue = *(++code); /* Codevalue will be one of above BRAs */
2786 else allow_zero = FALSE;
2788 /* Loop to match the subpattern as many times as possible as if it were
2789 a complete pattern. */
2791 for (matched_count = 0;; matched_count++)
2793 int local_offsets[2];
2794 int local_workspace[1000];
2796 int rc = internal_dfa_exec(
2797 md, /* fixed match data */
2798 code, /* this subexpression's code */
2799 local_ptr, /* where we currently are */
2800 (int)(ptr - start_subject), /* start offset */
2801 local_offsets, /* offset vector */
2802 sizeof(local_offsets)/sizeof(int), /* size of same */
2803 local_workspace, /* workspace vector */
2804 sizeof(local_workspace)/sizeof(int), /* size of same */
2805 rlevel); /* function recursion level */
2807 /* Failed to match */
2809 if (rc < 0)
2811 if (rc != PCRE_ERROR_NOMATCH) return rc;
2812 break;
2815 /* Matched: break the loop if zero characters matched. */
2817 charcount = local_offsets[1] - local_offsets[0];
2818 if (charcount == 0) break;
2819 local_ptr += charcount; /* Advance temporary position ptr */
2822 /* At this point we have matched the subpattern matched_count
2823 times, and local_ptr is pointing to the character after the end of the
2824 last match. */
2826 if (matched_count > 0 || allow_zero)
2828 const pcre_uchar *end_subpattern = code;
2829 int next_state_offset;
2831 do { end_subpattern += GET(end_subpattern, 1); }
2832 while (*end_subpattern == OP_ALT);
2833 next_state_offset =
2834 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2836 /* Optimization: if there are no more active states, and there
2837 are no new states yet set up, then skip over the subject string
2838 right here, to save looping. Otherwise, set up the new state to swing
2839 into action when the end of the matched substring is reached. */
2841 if (i + 1 >= active_count && new_count == 0)
2843 ptr = local_ptr;
2844 clen = 0;
2845 ADD_NEW(next_state_offset, 0);
2847 else
2849 const pcre_uchar *p = ptr;
2850 const pcre_uchar *pp = local_ptr;
2851 charcount = (int)(pp - p);
2852 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2853 if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2854 #endif
2855 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2859 break;
2861 /*-----------------------------------------------------------------*/
2862 case OP_ONCE:
2863 case OP_ONCE_NC:
2865 int local_offsets[2];
2866 int local_workspace[1000];
2868 int rc = internal_dfa_exec(
2869 md, /* fixed match data */
2870 code, /* this subexpression's code */
2871 ptr, /* where we currently are */
2872 (int)(ptr - start_subject), /* start offset */
2873 local_offsets, /* offset vector */
2874 sizeof(local_offsets)/sizeof(int), /* size of same */
2875 local_workspace, /* workspace vector */
2876 sizeof(local_workspace)/sizeof(int), /* size of same */
2877 rlevel); /* function recursion level */
2879 if (rc >= 0)
2881 const pcre_uchar *end_subpattern = code;
2882 int charcount = local_offsets[1] - local_offsets[0];
2883 int next_state_offset, repeat_state_offset;
2885 do { end_subpattern += GET(end_subpattern, 1); }
2886 while (*end_subpattern == OP_ALT);
2887 next_state_offset =
2888 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2890 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2891 arrange for the repeat state also to be added to the relevant list.
2892 Calculate the offset, or set -1 for no repeat. */
2894 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2895 *end_subpattern == OP_KETRMIN)?
2896 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2898 /* If we have matched an empty string, add the next state at the
2899 current character pointer. This is important so that the duplicate
2900 checking kicks in, which is what breaks infinite loops that match an
2901 empty string. */
2903 if (charcount == 0)
2905 ADD_ACTIVE(next_state_offset, 0);
2908 /* Optimization: if there are no more active states, and there
2909 are no new states yet set up, then skip over the subject string
2910 right here, to save looping. Otherwise, set up the new state to swing
2911 into action when the end of the matched substring is reached. */
2913 else if (i + 1 >= active_count && new_count == 0)
2915 ptr += charcount;
2916 clen = 0;
2917 ADD_NEW(next_state_offset, 0);
2919 /* If we are adding a repeat state at the new character position,
2920 we must fudge things so that it is the only current state.
2921 Otherwise, it might be a duplicate of one we processed before, and
2922 that would cause it to be skipped. */
2924 if (repeat_state_offset >= 0)
2926 next_active_state = active_states;
2927 active_count = 0;
2928 i = -1;
2929 ADD_ACTIVE(repeat_state_offset, 0);
2932 else
2934 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2935 if (utf)
2937 const pcre_uchar *p = start_subject + local_offsets[0];
2938 const pcre_uchar *pp = start_subject + local_offsets[1];
2939 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2941 #endif
2942 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2943 if (repeat_state_offset >= 0)
2944 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2947 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2949 break;
2952 /* ========================================================================== */
2953 /* Handle callouts */
2955 case OP_CALLOUT:
2956 rrc = 0;
2957 if (PUBL(callout) != NULL)
2959 PUBL(callout_block) cb;
2960 cb.version = 1; /* Version 1 of the callout block */
2961 cb.callout_number = code[1];
2962 cb.offset_vector = offsets;
2963 #if defined COMPILE_PCRE8
2964 cb.subject = (PCRE_SPTR)start_subject;
2965 #elif defined COMPILE_PCRE16
2966 cb.subject = (PCRE_SPTR16)start_subject;
2967 #elif defined COMPILE_PCRE32
2968 cb.subject = (PCRE_SPTR32)start_subject;
2969 #endif
2970 cb.subject_length = (int)(end_subject - start_subject);
2971 cb.start_match = (int)(current_subject - start_subject);
2972 cb.current_position = (int)(ptr - start_subject);
2973 cb.pattern_position = GET(code, 2);
2974 cb.next_item_length = GET(code, 2 + LINK_SIZE);
2975 cb.capture_top = 1;
2976 cb.capture_last = -1;
2977 cb.callout_data = md->callout_data;
2978 cb.mark = NULL; /* No (*MARK) support */
2979 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
2981 if (rrc == 0)
2982 { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
2983 break;
2986 /* ========================================================================== */
2987 default: /* Unsupported opcode */
2988 return PCRE_ERROR_DFA_UITEM;
2991 NEXT_ACTIVE_STATE: continue;
2993 } /* End of loop scanning active states */
2995 /* We have finished the processing at the current subject character. If no
2996 new states have been set for the next character, we have found all the
2997 matches that we are going to find. If we are at the top level and partial
2998 matching has been requested, check for appropriate conditions.
3000 The "forced_fail" variable counts the number of (*F) encountered for the
3001 character. If it is equal to the original active_count (saved in
3002 workspace[1]) it means that (*F) was found on every active state. In this
3003 case we don't want to give a partial match.
3005 The "could_continue" variable is true if a state could have continued but
3006 for the fact that the end of the subject was reached. */
3008 if (new_count <= 0)
3010 if (rlevel == 1 && /* Top level, and */
3011 could_continue && /* Some could go on, and */
3012 forced_fail != workspace[1] && /* Not all forced fail & */
3013 ( /* either... */
3014 (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
3015 || /* or... */
3016 ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
3017 match_count < 0) /* no matches */
3018 ) && /* And... */
3020 partial_newline || /* Either partial NL */
3021 ( /* or ... */
3022 ptr >= end_subject && /* End of subject and */
3023 ptr > md->start_used_ptr) /* Inspected non-empty string */
3027 if (offsetcount >= 2)
3029 offsets[0] = (int)(md->start_used_ptr - start_subject);
3030 offsets[1] = (int)(end_subject - start_subject);
3032 match_count = PCRE_ERROR_PARTIAL;
3035 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3036 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
3037 rlevel*2-2, SP));
3038 break; /* In effect, "return", but see the comment below */
3041 /* One or more states are active for the next character. */
3043 ptr += clen; /* Advance to next subject character */
3044 } /* Loop to move along the subject string */
3046 /* Control gets here from "break" a few lines above. We do it this way because
3047 if we use "return" above, we have compiler trouble. Some compilers warn if
3048 there's nothing here because they think the function doesn't return a value. On
3049 the other hand, if we put a dummy statement here, some more clever compilers
3050 complain that it can't be reached. Sigh. */
3052 return match_count;
3058 /*************************************************
3059 * Execute a Regular Expression - DFA engine *
3060 *************************************************/
3062 /* This external function applies a compiled re to a subject string using a DFA
3063 engine. This function calls the internal function multiple times if the pattern
3064 is not anchored.
3066 Arguments:
3067 argument_re points to the compiled expression
3068 extra_data points to extra data or is NULL
3069 subject points to the subject string
3070 length length of subject string (may contain binary zeros)
3071 start_offset where to start in the subject string
3072 options option bits
3073 offsets vector of match offsets
3074 offsetcount size of same
3075 workspace workspace vector
3076 wscount size of same
3078 Returns: > 0 => number of match offset pairs placed in offsets
3079 = 0 => offsets overflowed; longest matches are present
3080 -1 => failed to match
3081 < -1 => some kind of unexpected problem
3084 #if defined COMPILE_PCRE8
3085 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3086 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3087 const char *subject, int length, int start_offset, int options, int *offsets,
3088 int offsetcount, int *workspace, int wscount)
3089 #elif defined COMPILE_PCRE16
3090 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3091 pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3092 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3093 int offsetcount, int *workspace, int wscount)
3094 #elif defined COMPILE_PCRE32
3095 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3096 pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
3097 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
3098 int offsetcount, int *workspace, int wscount)
3099 #endif
3101 REAL_PCRE *re = (REAL_PCRE *)argument_re;
3102 dfa_match_data match_block;
3103 dfa_match_data *md = &match_block;
3104 BOOL utf, anchored, startline, firstline;
3105 const pcre_uchar *current_subject, *end_subject;
3106 const pcre_study_data *study = NULL;
3108 const pcre_uchar *req_char_ptr;
3109 const pcre_uint8 *start_bits = NULL;
3110 BOOL has_first_char = FALSE;
3111 BOOL has_req_char = FALSE;
3112 pcre_uchar first_char = 0;
3113 pcre_uchar first_char2 = 0;
3114 pcre_uchar req_char = 0;
3115 pcre_uchar req_char2 = 0;
3116 int newline;
3118 /* Plausibility checks */
3120 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3121 if (re == NULL || subject == NULL || workspace == NULL ||
3122 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3123 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3124 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3125 if (length < 0) return PCRE_ERROR_BADLENGTH;
3126 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3128 /* Check that the first field in the block is the magic number. If it is not,
3129 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3130 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3131 means that the pattern is likely compiled with different endianness. */
3133 if (re->magic_number != MAGIC_NUMBER)
3134 return re->magic_number == REVERSED_MAGIC_NUMBER?
3135 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3136 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3138 /* If restarting after a partial match, do some sanity checks on the contents
3139 of the workspace. */
3141 if ((options & PCRE_DFA_RESTART) != 0)
3143 if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3144 workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3145 return PCRE_ERROR_DFA_BADRESTART;
3148 /* Set up study, callout, and table data */
3150 md->tables = re->tables;
3151 md->callout_data = NULL;
3153 if (extra_data != NULL)
3155 unsigned int flags = extra_data->flags;
3156 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3157 study = (const pcre_study_data *)extra_data->study_data;
3158 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3159 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3160 return PCRE_ERROR_DFA_UMLIMIT;
3161 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3162 md->callout_data = extra_data->callout_data;
3163 if ((flags & PCRE_EXTRA_TABLES) != 0)
3164 md->tables = extra_data->tables;
3167 /* Set some local values */
3169 current_subject = (const pcre_uchar *)subject + start_offset;
3170 end_subject = (const pcre_uchar *)subject + length;
3171 req_char_ptr = current_subject - 1;
3173 #ifdef SUPPORT_UTF
3174 /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
3175 utf = (re->options & PCRE_UTF8) != 0;
3176 #else
3177 utf = FALSE;
3178 #endif
3180 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3181 (re->options & PCRE_ANCHORED) != 0;
3183 /* The remaining fixed data for passing around. */
3185 md->start_code = (const pcre_uchar *)argument_re +
3186 re->name_table_offset + re->name_count * re->name_entry_size;
3187 md->start_subject = (const pcre_uchar *)subject;
3188 md->end_subject = end_subject;
3189 md->start_offset = start_offset;
3190 md->moptions = options;
3191 md->poptions = re->options;
3193 /* If the BSR option is not set at match time, copy what was set
3194 at compile time. */
3196 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3198 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3199 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3200 #ifdef BSR_ANYCRLF
3201 else md->moptions |= PCRE_BSR_ANYCRLF;
3202 #endif
3205 /* Handle different types of newline. The three bits give eight cases. If
3206 nothing is set at run time, whatever was used at compile time applies. */
3208 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3209 PCRE_NEWLINE_BITS)
3211 case 0: newline = NEWLINE; break; /* Compile-time default */
3212 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3213 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3214 case PCRE_NEWLINE_CR+
3215 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3216 case PCRE_NEWLINE_ANY: newline = -1; break;
3217 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3218 default: return PCRE_ERROR_BADNEWLINE;
3221 if (newline == -2)
3223 md->nltype = NLTYPE_ANYCRLF;
3225 else if (newline < 0)
3227 md->nltype = NLTYPE_ANY;
3229 else
3231 md->nltype = NLTYPE_FIXED;
3232 if (newline > 255)
3234 md->nllen = 2;
3235 md->nl[0] = (newline >> 8) & 255;
3236 md->nl[1] = newline & 255;
3238 else
3240 md->nllen = 1;
3241 md->nl[0] = newline;
3245 /* Check a UTF string if required. If it is invalid and the offsets vector
3246 has room (offsetcount >= 2), pass back the error offset and the error code. */
3248 #ifdef SUPPORT_UTF
3249 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3251 int erroroffset;
3252 int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3253 if (errorcode != 0)
3255 if (offsetcount >= 2)
3257 offsets[0] = erroroffset;
3258 offsets[1] = errorcode;
3260 #if defined COMPILE_PCRE8
3261 return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ?
3262 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3263 #elif defined COMPILE_PCRE16
3264 return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ?
3265 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
3266 #elif defined COMPILE_PCRE32
3267 return PCRE_ERROR_BADUTF32;
3268 #endif
3270 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
3271 if (start_offset > 0 && start_offset < length &&
3272 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3273 return PCRE_ERROR_BADUTF8_OFFSET;
3274 #endif
3276 #endif
3278 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3279 is a feature that makes it possible to save compiled regex and re-use them
3280 in other programs later. */
3282 if (md->tables == NULL) md->tables = PRIV(default_tables);
3284 /* The "must be at the start of a line" flags are used in a loop when finding
3285 where to start. */
3287 startline = (re->flags & PCRE_STARTLINE) != 0;
3288 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3290 /* Set up the first character to match, if available. The first_char value is
3291 never set for an anchored regular expression, but the anchoring may be forced
3292 at run time, so we have to test for anchoring. The first char may be unset for
3293 an unanchored pattern, of course. If there's no first char and the pattern was
3294 studied, there may be a bitmap of possible first characters. */
3296 if (!anchored)
3298 if ((re->flags & PCRE_FIRSTSET) != 0)
3300 has_first_char = TRUE;
3301 first_char = first_char2 = (pcre_uchar)(re->first_char);
3302 if ((re->flags & PCRE_FCH_CASELESS) != 0)
3304 first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3305 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3306 if (utf && first_char > 127)
3307 first_char2 = UCD_OTHERCASE(first_char);
3308 #endif
3311 else
3313 if (!startline && study != NULL &&
3314 (study->flags & PCRE_STUDY_MAPPED) != 0)
3315 start_bits = study->start_bits;
3319 /* For anchored or unanchored matches, there may be a "last known required
3320 character" set. */
3322 if ((re->flags & PCRE_REQCHSET) != 0)
3324 has_req_char = TRUE;
3325 req_char = req_char2 = (pcre_uchar)(re->req_char);
3326 if ((re->flags & PCRE_RCH_CASELESS) != 0)
3328 req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3329 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3330 if (utf && req_char > 127)
3331 req_char2 = UCD_OTHERCASE(req_char);
3332 #endif
3336 /* Call the main matching function, looping for a non-anchored regex after a
3337 failed match. If not restarting, perform certain optimizations at the start of
3338 a match. */
3340 for (;;)
3342 int rc;
3344 if ((options & PCRE_DFA_RESTART) == 0)
3346 const pcre_uchar *save_end_subject = end_subject;
3348 /* If firstline is TRUE, the start of the match is constrained to the first
3349 line of a multiline string. Implement this by temporarily adjusting
3350 end_subject so that we stop scanning at a newline. If the match fails at
3351 the newline, later code breaks this loop. */
3353 if (firstline)
3355 PCRE_PUCHAR t = current_subject;
3356 #ifdef SUPPORT_UTF
3357 if (utf)
3359 while (t < md->end_subject && !IS_NEWLINE(t))
3361 t++;
3362 ACROSSCHAR(t < end_subject, *t, t++);
3365 else
3366 #endif
3367 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3368 end_subject = t;
3371 /* There are some optimizations that avoid running the match if a known
3372 starting point is not found. However, there is an option that disables
3373 these, for testing and for ensuring that all callouts do actually occur.
3374 The option can be set in the regex by (*NO_START_OPT) or passed in
3375 match-time options. */
3377 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3379 /* Advance to a known first char. */
3381 if (has_first_char)
3383 if (first_char != first_char2)
3385 pcre_uchar csc;
3386 while (current_subject < end_subject &&
3387 (csc = RAWUCHARTEST(current_subject)) != first_char && csc != first_char2)
3388 current_subject++;
3390 else
3391 while (current_subject < end_subject &&
3392 RAWUCHARTEST(current_subject) != first_char)
3393 current_subject++;
3396 /* Or to just after a linebreak for a multiline match if possible */
3398 else if (startline)
3400 if (current_subject > md->start_subject + start_offset)
3402 #ifdef SUPPORT_UTF
3403 if (utf)
3405 while (current_subject < end_subject &&
3406 !WAS_NEWLINE(current_subject))
3408 current_subject++;
3409 ACROSSCHAR(current_subject < end_subject, *current_subject,
3410 current_subject++);
3413 else
3414 #endif
3415 while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3416 current_subject++;
3418 /* If we have just passed a CR and the newline option is ANY or
3419 ANYCRLF, and we are now at a LF, advance the match position by one
3420 more character. */
3422 if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
3423 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3424 current_subject < end_subject &&
3425 RAWUCHARTEST(current_subject) == CHAR_NL)
3426 current_subject++;
3430 /* Or to a non-unique first char after study */
3432 else if (start_bits != NULL)
3434 while (current_subject < end_subject)
3436 register pcre_uint32 c = RAWUCHARTEST(current_subject);
3437 #ifndef COMPILE_PCRE8
3438 if (c > 255) c = 255;
3439 #endif
3440 if ((start_bits[c/8] & (1 << (c&7))) == 0)
3442 current_subject++;
3443 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3444 /* In non 8-bit mode, the iteration will stop for
3445 characters > 255 at the beginning or not stop at all. */
3446 if (utf)
3447 ACROSSCHAR(current_subject < end_subject, *current_subject,
3448 current_subject++);
3449 #endif
3451 else break;
3456 /* Restore fudged end_subject */
3458 end_subject = save_end_subject;
3460 /* The following two optimizations are disabled for partial matching or if
3461 disabling is explicitly requested (and of course, by the test above, this
3462 code is not obeyed when restarting after a partial match). */
3464 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3465 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3467 /* If the pattern was studied, a minimum subject length may be set. This
3468 is a lower bound; no actual string of that length may actually match the
3469 pattern. Although the value is, strictly, in characters, we treat it as
3470 bytes to avoid spending too much time in this optimization. */
3472 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3473 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3474 return PCRE_ERROR_NOMATCH;
3476 /* If req_char is set, we know that that character must appear in the
3477 subject for the match to succeed. If the first character is set, req_char
3478 must be later in the subject; otherwise the test starts at the match
3479 point. This optimization can save a huge amount of work in patterns with
3480 nested unlimited repeats that aren't going to match. Writing separate
3481 code for cased/caseless versions makes it go faster, as does using an
3482 autoincrement and backing off on a match.
3484 HOWEVER: when the subject string is very, very long, searching to its end
3485 can take a long time, and give bad performance on quite ordinary
3486 patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3487 string... so we don't do this when the string is sufficiently long. */
3489 if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3491 register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3493 /* We don't need to repeat the search if we haven't yet reached the
3494 place we found it at last time. */
3496 if (p > req_char_ptr)
3498 if (req_char != req_char2)
3500 while (p < end_subject)
3502 register pcre_uint32 pp = RAWUCHARINCTEST(p);
3503 if (pp == req_char || pp == req_char2) { p--; break; }
3506 else
3508 while (p < end_subject)
3510 if (RAWUCHARINCTEST(p) == req_char) { p--; break; }
3514 /* If we can't find the required character, break the matching loop,
3515 which will cause a return of PCRE_ERROR_NOMATCH. */
3517 if (p >= end_subject) break;
3519 /* If we have found the required character, save the point where we
3520 found it, so that we don't search again next time round the loop if
3521 the start hasn't passed this character yet. */
3523 req_char_ptr = p;
3527 } /* End of optimizations that are done when not restarting */
3529 /* OK, now we can do the business */
3531 md->start_used_ptr = current_subject;
3532 md->recursive = NULL;
3534 rc = internal_dfa_exec(
3535 md, /* fixed match data */
3536 md->start_code, /* this subexpression's code */
3537 current_subject, /* where we currently are */
3538 start_offset, /* start offset in subject */
3539 offsets, /* offset vector */
3540 offsetcount, /* size of same */
3541 workspace, /* workspace vector */
3542 wscount, /* size of same */
3543 0); /* function recurse level */
3545 /* Anything other than "no match" means we are done, always; otherwise, carry
3546 on only if not anchored. */
3548 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3550 /* Advance to the next subject character unless we are at the end of a line
3551 and firstline is set. */
3553 if (firstline && IS_NEWLINE(current_subject)) break;
3554 current_subject++;
3555 #ifdef SUPPORT_UTF
3556 if (utf)
3558 ACROSSCHAR(current_subject < end_subject, *current_subject,
3559 current_subject++);
3561 #endif
3562 if (current_subject > end_subject) break;
3564 /* If we have just passed a CR and we are now at a LF, and the pattern does
3565 not contain any explicit matches for \r or \n, and the newline option is CRLF
3566 or ANY or ANYCRLF, advance the match position by one more character. */
3568 if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
3569 current_subject < end_subject &&
3570 RAWUCHARTEST(current_subject) == CHAR_NL &&
3571 (re->flags & PCRE_HASCRORLF) == 0 &&
3572 (md->nltype == NLTYPE_ANY ||
3573 md->nltype == NLTYPE_ANYCRLF ||
3574 md->nllen == 2))
3575 current_subject++;
3577 } /* "Bumpalong" loop */
3579 return PCRE_ERROR_NOMATCH;
3582 /* End of pcre_dfa_exec.c */