tgupdate: merge pcreposix-compat base into pcreposix-compat
[pcreposix-compat.git] / pcre_study.c
blobd9d4960d84ecbac0ad16228cf56bf6d40a4b6cbc
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
41 /* This module contains the external function pcre_study(), along with local
42 supporting functions. */
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
49 #include "pcre_internal.h"
51 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
53 /* Returns from set_start_bits() */
55 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
59 /*************************************************
60 * Find the minimum subject length for a group *
61 *************************************************/
63 /* Scan a parenthesized group and compute the minimum length of subject that
64 is needed to match it. This is a lower bound; it does not mean there is a
65 string of that length that matches. In UTF8 mode, the result is in characters
66 rather than bytes.
68 Arguments:
69 re compiled pattern block
70 code pointer to start of group (the bracket)
71 startcode pointer to start of the whole pattern's code
72 options the compiling options
73 recurses chain of recurse_check to catch mutual recursion
74 countptr pointer to call count (to catch over complexity)
76 Returns: the minimum length
77 -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
78 -2 internal error (missing capturing bracket)
79 -3 internal error (opcode not listed)
82 static int
83 find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
84 const pcre_uchar *startcode, int options, recurse_check *recurses,
85 int *countptr)
87 int length = -1;
88 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
89 BOOL utf = (options & PCRE_UTF8) != 0;
90 BOOL had_recurse = FALSE;
91 recurse_check this_recurse;
92 register int branchlength = 0;
93 register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
95 if ((*countptr)++ > 1000) return -1; /* too complex */
97 if (*code == OP_CBRA || *code == OP_SCBRA ||
98 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
100 /* Scan along the opcodes for this branch. If we get to the end of the
101 branch, check the length against that of the other branches. */
103 for (;;)
105 int d, min;
106 pcre_uchar *cs, *ce;
107 register pcre_uchar op = *cc;
109 switch (op)
111 case OP_COND:
112 case OP_SCOND:
114 /* If there is only one branch in a condition, the implied branch has zero
115 length, so we don't add anything. This covers the DEFINE "condition"
116 automatically. */
118 cs = cc + GET(cc, 1);
119 if (*cs != OP_ALT)
121 cc = cs + 1 + LINK_SIZE;
122 break;
125 /* Otherwise we can fall through and treat it the same as any other
126 subpattern. */
128 case OP_CBRA:
129 case OP_SCBRA:
130 case OP_BRA:
131 case OP_SBRA:
132 case OP_CBRAPOS:
133 case OP_SCBRAPOS:
134 case OP_BRAPOS:
135 case OP_SBRAPOS:
136 case OP_ONCE:
137 case OP_ONCE_NC:
138 d = find_minlength(re, cc, startcode, options, recurses, countptr);
139 if (d < 0) return d;
140 branchlength += d;
141 do cc += GET(cc, 1); while (*cc == OP_ALT);
142 cc += 1 + LINK_SIZE;
143 break;
145 /* ACCEPT makes things far too complicated; we have to give up. */
147 case OP_ACCEPT:
148 case OP_ASSERT_ACCEPT:
149 return -1;
151 /* Reached end of a branch; if it's a ket it is the end of a nested
152 call. If it's ALT it is an alternation in a nested call. If it is END it's
153 the end of the outer call. All can be handled by the same code. If an
154 ACCEPT was previously encountered, use the length that was in force at that
155 time, and pass back the shortest ACCEPT length. */
157 case OP_ALT:
158 case OP_KET:
159 case OP_KETRMAX:
160 case OP_KETRMIN:
161 case OP_KETRPOS:
162 case OP_END:
163 if (length < 0 || (!had_recurse && branchlength < length))
164 length = branchlength;
165 if (op != OP_ALT) return length;
166 cc += 1 + LINK_SIZE;
167 branchlength = 0;
168 had_recurse = FALSE;
169 break;
171 /* Skip over assertive subpatterns */
173 case OP_ASSERT:
174 case OP_ASSERT_NOT:
175 case OP_ASSERTBACK:
176 case OP_ASSERTBACK_NOT:
177 do cc += GET(cc, 1); while (*cc == OP_ALT);
178 /* Fall through */
180 /* Skip over things that don't match chars */
182 case OP_REVERSE:
183 case OP_CREF:
184 case OP_DNCREF:
185 case OP_RREF:
186 case OP_DNRREF:
187 case OP_DEF:
188 case OP_CALLOUT:
189 case OP_SOD:
190 case OP_SOM:
191 case OP_EOD:
192 case OP_EODN:
193 case OP_CIRC:
194 case OP_CIRCM:
195 case OP_DOLL:
196 case OP_DOLLM:
197 case OP_NOT_WORD_BOUNDARY:
198 case OP_WORD_BOUNDARY:
199 cc += PRIV(OP_lengths)[*cc];
200 break;
202 /* Skip over a subpattern that has a {0} or {0,x} quantifier */
204 case OP_BRAZERO:
205 case OP_BRAMINZERO:
206 case OP_BRAPOSZERO:
207 case OP_SKIPZERO:
208 cc += PRIV(OP_lengths)[*cc];
209 do cc += GET(cc, 1); while (*cc == OP_ALT);
210 cc += 1 + LINK_SIZE;
211 break;
213 /* Handle literal characters and + repetitions */
215 case OP_CHAR:
216 case OP_CHARI:
217 case OP_NOT:
218 case OP_NOTI:
219 case OP_PLUS:
220 case OP_PLUSI:
221 case OP_MINPLUS:
222 case OP_MINPLUSI:
223 case OP_POSPLUS:
224 case OP_POSPLUSI:
225 case OP_NOTPLUS:
226 case OP_NOTPLUSI:
227 case OP_NOTMINPLUS:
228 case OP_NOTMINPLUSI:
229 case OP_NOTPOSPLUS:
230 case OP_NOTPOSPLUSI:
231 branchlength++;
232 cc += 2;
233 #ifdef SUPPORT_UTF
234 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
235 #endif
236 break;
238 case OP_TYPEPLUS:
239 case OP_TYPEMINPLUS:
240 case OP_TYPEPOSPLUS:
241 branchlength++;
242 cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
243 break;
245 /* Handle exact repetitions. The count is already in characters, but we
246 need to skip over a multibyte character in UTF8 mode. */
248 case OP_EXACT:
249 case OP_EXACTI:
250 case OP_NOTEXACT:
251 case OP_NOTEXACTI:
252 branchlength += GET2(cc,1);
253 cc += 2 + IMM2_SIZE;
254 #ifdef SUPPORT_UTF
255 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
256 #endif
257 break;
259 case OP_TYPEEXACT:
260 branchlength += GET2(cc,1);
261 cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
262 || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
263 break;
265 /* Handle single-char non-literal matchers */
267 case OP_PROP:
268 case OP_NOTPROP:
269 cc += 2;
270 /* Fall through */
272 case OP_NOT_DIGIT:
273 case OP_DIGIT:
274 case OP_NOT_WHITESPACE:
275 case OP_WHITESPACE:
276 case OP_NOT_WORDCHAR:
277 case OP_WORDCHAR:
278 case OP_ANY:
279 case OP_ALLANY:
280 case OP_EXTUNI:
281 case OP_HSPACE:
282 case OP_NOT_HSPACE:
283 case OP_VSPACE:
284 case OP_NOT_VSPACE:
285 branchlength++;
286 cc++;
287 break;
289 /* "Any newline" might match two characters, but it also might match just
290 one. */
292 case OP_ANYNL:
293 branchlength += 1;
294 cc++;
295 break;
297 /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
298 non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
299 appear, but leave the code, just in case.) */
301 case OP_ANYBYTE:
302 #ifdef SUPPORT_UTF
303 if (utf) return -1;
304 #endif
305 branchlength++;
306 cc++;
307 break;
309 /* For repeated character types, we have to test for \p and \P, which have
310 an extra two bytes of parameters. */
312 case OP_TYPESTAR:
313 case OP_TYPEMINSTAR:
314 case OP_TYPEQUERY:
315 case OP_TYPEMINQUERY:
316 case OP_TYPEPOSSTAR:
317 case OP_TYPEPOSQUERY:
318 if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
319 cc += PRIV(OP_lengths)[op];
320 break;
322 case OP_TYPEUPTO:
323 case OP_TYPEMINUPTO:
324 case OP_TYPEPOSUPTO:
325 if (cc[1 + IMM2_SIZE] == OP_PROP
326 || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
327 cc += PRIV(OP_lengths)[op];
328 break;
330 /* Check a class for variable quantification */
332 case OP_CLASS:
333 case OP_NCLASS:
334 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
335 case OP_XCLASS:
336 /* The original code caused an unsigned overflow in 64 bit systems,
337 so now we use a conditional statement. */
338 if (op == OP_XCLASS)
339 cc += GET(cc, 1);
340 else
341 cc += PRIV(OP_lengths)[OP_CLASS];
342 #else
343 cc += PRIV(OP_lengths)[OP_CLASS];
344 #endif
346 switch (*cc)
348 case OP_CRPLUS:
349 case OP_CRMINPLUS:
350 case OP_CRPOSPLUS:
351 branchlength++;
352 /* Fall through */
354 case OP_CRSTAR:
355 case OP_CRMINSTAR:
356 case OP_CRQUERY:
357 case OP_CRMINQUERY:
358 case OP_CRPOSSTAR:
359 case OP_CRPOSQUERY:
360 cc++;
361 break;
363 case OP_CRRANGE:
364 case OP_CRMINRANGE:
365 case OP_CRPOSRANGE:
366 branchlength += GET2(cc,1);
367 cc += 1 + 2 * IMM2_SIZE;
368 break;
370 default:
371 branchlength++;
372 break;
374 break;
376 /* Backreferences and subroutine calls are treated in the same way: we find
377 the minimum length for the subpattern. A recursion, however, causes an
378 a flag to be set that causes the length of this branch to be ignored. The
379 logic is that a recursion can only make sense if there is another
380 alternation that stops the recursing. That will provide the minimum length
381 (when no recursion happens). A backreference within the group that it is
382 referencing behaves in the same way.
384 If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
385 matches an empty string (by default it causes a matching failure), so in
386 that case we must set the minimum length to zero. */
388 case OP_DNREF: /* Duplicate named pattern back reference */
389 case OP_DNREFI:
390 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
392 int count = GET2(cc, 1+IMM2_SIZE);
393 pcre_uchar *slot = (pcre_uchar *)re +
394 re->name_table_offset + GET2(cc, 1) * re->name_entry_size;
395 d = INT_MAX;
396 while (count-- > 0)
398 ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
399 if (cs == NULL) return -2;
400 do ce += GET(ce, 1); while (*ce == OP_ALT);
401 if (cc > cs && cc < ce) /* Simple recursion */
403 d = 0;
404 had_recurse = TRUE;
405 break;
407 else
409 recurse_check *r = recurses;
410 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
411 if (r != NULL) /* Mutual recursion */
413 d = 0;
414 had_recurse = TRUE;
415 break;
417 else
419 int dd;
420 this_recurse.prev = recurses;
421 this_recurse.group = cs;
422 dd = find_minlength(re, cs, startcode, options, &this_recurse,
423 countptr);
424 if (dd < d) d = dd;
427 slot += re->name_entry_size;
430 else d = 0;
431 cc += 1 + 2*IMM2_SIZE;
432 goto REPEAT_BACK_REFERENCE;
434 case OP_REF: /* Single back reference */
435 case OP_REFI:
436 if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
438 ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
439 if (cs == NULL) return -2;
440 do ce += GET(ce, 1); while (*ce == OP_ALT);
441 if (cc > cs && cc < ce) /* Simple recursion */
443 d = 0;
444 had_recurse = TRUE;
446 else
448 recurse_check *r = recurses;
449 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
450 if (r != NULL) /* Mutual recursion */
452 d = 0;
453 had_recurse = TRUE;
455 else
457 this_recurse.prev = recurses;
458 this_recurse.group = cs;
459 d = find_minlength(re, cs, startcode, options, &this_recurse,
460 countptr);
464 else d = 0;
465 cc += 1 + IMM2_SIZE;
467 /* Handle repeated back references */
469 REPEAT_BACK_REFERENCE:
470 switch (*cc)
472 case OP_CRSTAR:
473 case OP_CRMINSTAR:
474 case OP_CRQUERY:
475 case OP_CRMINQUERY:
476 case OP_CRPOSSTAR:
477 case OP_CRPOSQUERY:
478 min = 0;
479 cc++;
480 break;
482 case OP_CRPLUS:
483 case OP_CRMINPLUS:
484 case OP_CRPOSPLUS:
485 min = 1;
486 cc++;
487 break;
489 case OP_CRRANGE:
490 case OP_CRMINRANGE:
491 case OP_CRPOSRANGE:
492 min = GET2(cc, 1);
493 cc += 1 + 2 * IMM2_SIZE;
494 break;
496 default:
497 min = 1;
498 break;
501 branchlength += min * d;
502 break;
504 /* We can easily detect direct recursion, but not mutual recursion. This is
505 caught by a recursion depth count. */
507 case OP_RECURSE:
508 cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
509 do ce += GET(ce, 1); while (*ce == OP_ALT);
510 if (cc > cs && cc < ce) /* Simple recursion */
511 had_recurse = TRUE;
512 else
514 recurse_check *r = recurses;
515 for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
516 if (r != NULL) /* Mutual recursion */
517 had_recurse = TRUE;
518 else
520 this_recurse.prev = recurses;
521 this_recurse.group = cs;
522 branchlength += find_minlength(re, cs, startcode, options,
523 &this_recurse, countptr);
526 cc += 1 + LINK_SIZE;
527 break;
529 /* Anything else does not or need not match a character. We can get the
530 item's length from the table, but for those that can match zero occurrences
531 of a character, we must take special action for UTF-8 characters. As it
532 happens, the "NOT" versions of these opcodes are used at present only for
533 ASCII characters, so they could be omitted from this list. However, in
534 future that may change, so we include them here so as not to leave a
535 gotcha for a future maintainer. */
537 case OP_UPTO:
538 case OP_UPTOI:
539 case OP_NOTUPTO:
540 case OP_NOTUPTOI:
541 case OP_MINUPTO:
542 case OP_MINUPTOI:
543 case OP_NOTMINUPTO:
544 case OP_NOTMINUPTOI:
545 case OP_POSUPTO:
546 case OP_POSUPTOI:
547 case OP_NOTPOSUPTO:
548 case OP_NOTPOSUPTOI:
550 case OP_STAR:
551 case OP_STARI:
552 case OP_NOTSTAR:
553 case OP_NOTSTARI:
554 case OP_MINSTAR:
555 case OP_MINSTARI:
556 case OP_NOTMINSTAR:
557 case OP_NOTMINSTARI:
558 case OP_POSSTAR:
559 case OP_POSSTARI:
560 case OP_NOTPOSSTAR:
561 case OP_NOTPOSSTARI:
563 case OP_QUERY:
564 case OP_QUERYI:
565 case OP_NOTQUERY:
566 case OP_NOTQUERYI:
567 case OP_MINQUERY:
568 case OP_MINQUERYI:
569 case OP_NOTMINQUERY:
570 case OP_NOTMINQUERYI:
571 case OP_POSQUERY:
572 case OP_POSQUERYI:
573 case OP_NOTPOSQUERY:
574 case OP_NOTPOSQUERYI:
576 cc += PRIV(OP_lengths)[op];
577 #ifdef SUPPORT_UTF
578 if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
579 #endif
580 break;
582 /* Skip these, but we need to add in the name length. */
584 case OP_MARK:
585 case OP_PRUNE_ARG:
586 case OP_SKIP_ARG:
587 case OP_THEN_ARG:
588 cc += PRIV(OP_lengths)[op] + cc[1];
589 break;
591 /* The remaining opcodes are just skipped over. */
593 case OP_CLOSE:
594 case OP_COMMIT:
595 case OP_FAIL:
596 case OP_PRUNE:
597 case OP_SET_SOM:
598 case OP_SKIP:
599 case OP_THEN:
600 cc += PRIV(OP_lengths)[op];
601 break;
603 /* This should not occur: we list all opcodes explicitly so that when
604 new ones get added they are properly considered. */
606 default:
607 return -3;
610 /* Control never gets here */
615 /*************************************************
616 * Set a bit and maybe its alternate case *
617 *************************************************/
619 /* Given a character, set its first byte's bit in the table, and also the
620 corresponding bit for the other version of a letter if we are caseless. In
621 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
622 when Unicode property support is available.
624 Arguments:
625 start_bits points to the bit map
626 p points to the character
627 caseless the caseless flag
628 cd the block with char table pointers
629 utf TRUE for UTF-8 / UTF-16 / UTF-32 mode
631 Returns: pointer after the character
634 static const pcre_uchar *
635 set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
636 compile_data *cd, BOOL utf)
638 pcre_uint32 c = *p;
640 #ifdef COMPILE_PCRE8
641 SET_BIT(c);
643 #ifdef SUPPORT_UTF
644 if (utf && c > 127)
646 GETCHARINC(c, p);
647 #ifdef SUPPORT_UCP
648 if (caseless)
650 pcre_uchar buff[6];
651 c = UCD_OTHERCASE(c);
652 (void)PRIV(ord2utf)(c, buff);
653 SET_BIT(buff[0]);
655 #endif /* Not SUPPORT_UCP */
656 return p;
658 #else /* Not SUPPORT_UTF */
659 (void)(utf); /* Stops warning for unused parameter */
660 #endif /* SUPPORT_UTF */
662 /* Not UTF-8 mode, or character is less than 127. */
664 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
665 return p + 1;
666 #endif /* COMPILE_PCRE8 */
668 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
669 if (c > 0xff)
671 c = 0xff;
672 caseless = FALSE;
674 SET_BIT(c);
676 #ifdef SUPPORT_UTF
677 if (utf && c > 127)
679 GETCHARINC(c, p);
680 #ifdef SUPPORT_UCP
681 if (caseless)
683 c = UCD_OTHERCASE(c);
684 if (c > 0xff)
685 c = 0xff;
686 SET_BIT(c);
688 #endif /* SUPPORT_UCP */
689 return p;
691 #else /* Not SUPPORT_UTF */
692 (void)(utf); /* Stops warning for unused parameter */
693 #endif /* SUPPORT_UTF */
695 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
696 return p + 1;
697 #endif
702 /*************************************************
703 * Set bits for a positive character type *
704 *************************************************/
706 /* This function sets starting bits for a character type. In UTF-8 mode, we can
707 only do a direct setting for bytes less than 128, as otherwise there can be
708 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
709 environment, the tables will only recognize ASCII characters anyway, but in at
710 least one Windows environment, some higher bytes bits were set in the tables.
711 So we deal with that case by considering the UTF-8 encoding.
713 Arguments:
714 start_bits the starting bitmap
715 cbit type the type of character wanted
716 table_limit 32 for non-UTF-8; 16 for UTF-8
717 cd the block with char table pointers
719 Returns: nothing
722 static void
723 set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
724 compile_data *cd)
726 register pcre_uint32 c;
727 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
728 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
729 if (table_limit == 32) return;
730 for (c = 128; c < 256; c++)
732 if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
734 pcre_uchar buff[6];
735 (void)PRIV(ord2utf)(c, buff);
736 SET_BIT(buff[0]);
739 #endif
743 /*************************************************
744 * Set bits for a negative character type *
745 *************************************************/
747 /* This function sets starting bits for a negative character type such as \D.
748 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
749 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
750 Unlike in the positive case, where we can set appropriate starting bits for
751 specific high-valued UTF-8 characters, in this case we have to set the bits for
752 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
753 0xc0 (192) for simplicity.
755 Arguments:
756 start_bits the starting bitmap
757 cbit type the type of character wanted
758 table_limit 32 for non-UTF-8; 16 for UTF-8
759 cd the block with char table pointers
761 Returns: nothing
764 static void
765 set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
766 compile_data *cd)
768 register pcre_uint32 c;
769 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
770 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
771 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
772 #endif
777 /*************************************************
778 * Create bitmap of starting bytes *
779 *************************************************/
781 /* This function scans a compiled unanchored expression recursively and
782 attempts to build a bitmap of the set of possible starting bytes. As time goes
783 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
784 useful for parenthesized groups in patterns such as (a*)b where the group
785 provides some optional starting bytes but scanning must continue at the outer
786 level to find at least one mandatory byte. At the outermost level, this
787 function fails unless the result is SSB_DONE.
789 Arguments:
790 code points to an expression
791 start_bits points to a 32-byte table, initialized to 0
792 utf TRUE if in UTF-8 / UTF-16 / UTF-32 mode
793 cd the block with char table pointers
795 Returns: SSB_FAIL => Failed to find any starting bytes
796 SSB_DONE => Found mandatory starting bytes
797 SSB_CONTINUE => Found optional starting bytes
798 SSB_UNKNOWN => Hit an unrecognized opcode
801 static int
802 set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
803 compile_data *cd)
805 register pcre_uint32 c;
806 int yield = SSB_DONE;
807 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
808 int table_limit = utf? 16:32;
809 #else
810 int table_limit = 32;
811 #endif
813 #if 0
814 /* ========================================================================= */
815 /* The following comment and code was inserted in January 1999. In May 2006,
816 when it was observed to cause compiler warnings about unused values, I took it
817 out again. If anybody is still using OS/2, they will have to put it back
818 manually. */
820 /* This next statement and the later reference to dummy are here in order to
821 trick the optimizer of the IBM C compiler for OS/2 into generating correct
822 code. Apparently IBM isn't going to fix the problem, and we would rather not
823 disable optimization (in this module it actually makes a big difference, and
824 the pcre module can use all the optimization it can get). */
826 volatile int dummy;
827 /* ========================================================================= */
828 #endif
832 BOOL try_next = TRUE;
833 const pcre_uchar *tcode = code + 1 + LINK_SIZE;
835 if (*code == OP_CBRA || *code == OP_SCBRA ||
836 *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
838 while (try_next) /* Loop for items in this branch */
840 int rc;
842 switch(*tcode)
844 /* If we reach something we don't understand, it means a new opcode has
845 been created that hasn't been added to this code. Hopefully this problem
846 will be discovered during testing. */
848 default:
849 return SSB_UNKNOWN;
851 /* Fail for a valid opcode that implies no starting bits. */
853 case OP_ACCEPT:
854 case OP_ASSERT_ACCEPT:
855 case OP_ALLANY:
856 case OP_ANY:
857 case OP_ANYBYTE:
858 case OP_CIRC:
859 case OP_CIRCM:
860 case OP_CLOSE:
861 case OP_COMMIT:
862 case OP_COND:
863 case OP_CREF:
864 case OP_DEF:
865 case OP_DNCREF:
866 case OP_DNREF:
867 case OP_DNREFI:
868 case OP_DNRREF:
869 case OP_DOLL:
870 case OP_DOLLM:
871 case OP_END:
872 case OP_EOD:
873 case OP_EODN:
874 case OP_EXTUNI:
875 case OP_FAIL:
876 case OP_MARK:
877 case OP_NOT:
878 case OP_NOTEXACT:
879 case OP_NOTEXACTI:
880 case OP_NOTI:
881 case OP_NOTMINPLUS:
882 case OP_NOTMINPLUSI:
883 case OP_NOTMINQUERY:
884 case OP_NOTMINQUERYI:
885 case OP_NOTMINSTAR:
886 case OP_NOTMINSTARI:
887 case OP_NOTMINUPTO:
888 case OP_NOTMINUPTOI:
889 case OP_NOTPLUS:
890 case OP_NOTPLUSI:
891 case OP_NOTPOSPLUS:
892 case OP_NOTPOSPLUSI:
893 case OP_NOTPOSQUERY:
894 case OP_NOTPOSQUERYI:
895 case OP_NOTPOSSTAR:
896 case OP_NOTPOSSTARI:
897 case OP_NOTPOSUPTO:
898 case OP_NOTPOSUPTOI:
899 case OP_NOTPROP:
900 case OP_NOTQUERY:
901 case OP_NOTQUERYI:
902 case OP_NOTSTAR:
903 case OP_NOTSTARI:
904 case OP_NOTUPTO:
905 case OP_NOTUPTOI:
906 case OP_NOT_HSPACE:
907 case OP_NOT_VSPACE:
908 case OP_PRUNE:
909 case OP_PRUNE_ARG:
910 case OP_RECURSE:
911 case OP_REF:
912 case OP_REFI:
913 case OP_REVERSE:
914 case OP_RREF:
915 case OP_SCOND:
916 case OP_SET_SOM:
917 case OP_SKIP:
918 case OP_SKIP_ARG:
919 case OP_SOD:
920 case OP_SOM:
921 case OP_THEN:
922 case OP_THEN_ARG:
923 return SSB_FAIL;
925 /* A "real" property test implies no starting bits, but the fake property
926 PT_CLIST identifies a list of characters. These lists are short, as they
927 are used for characters with more than one "other case", so there is no
928 point in recognizing them for OP_NOTPROP. */
930 case OP_PROP:
931 if (tcode[1] != PT_CLIST) return SSB_FAIL;
933 const pcre_uint32 *p = PRIV(ucd_caseless_sets) + tcode[2];
934 while ((c = *p++) < NOTACHAR)
936 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
937 if (utf)
939 pcre_uchar buff[6];
940 (void)PRIV(ord2utf)(c, buff);
941 c = buff[0];
943 #endif
944 if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
947 try_next = FALSE;
948 break;
950 /* We can ignore word boundary tests. */
952 case OP_WORD_BOUNDARY:
953 case OP_NOT_WORD_BOUNDARY:
954 tcode++;
955 break;
957 /* If we hit a bracket or a positive lookahead assertion, recurse to set
958 bits from within the subpattern. If it can't find anything, we have to
959 give up. If it finds some mandatory character(s), we are done for this
960 branch. Otherwise, carry on scanning after the subpattern. */
962 case OP_BRA:
963 case OP_SBRA:
964 case OP_CBRA:
965 case OP_SCBRA:
966 case OP_BRAPOS:
967 case OP_SBRAPOS:
968 case OP_CBRAPOS:
969 case OP_SCBRAPOS:
970 case OP_ONCE:
971 case OP_ONCE_NC:
972 case OP_ASSERT:
973 rc = set_start_bits(tcode, start_bits, utf, cd);
974 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
975 if (rc == SSB_DONE) try_next = FALSE; else
977 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
978 tcode += 1 + LINK_SIZE;
980 break;
982 /* If we hit ALT or KET, it means we haven't found anything mandatory in
983 this branch, though we might have found something optional. For ALT, we
984 continue with the next alternative, but we have to arrange that the final
985 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
986 return SSB_CONTINUE: if this is the top level, that indicates failure,
987 but after a nested subpattern, it causes scanning to continue. */
989 case OP_ALT:
990 yield = SSB_CONTINUE;
991 try_next = FALSE;
992 break;
994 case OP_KET:
995 case OP_KETRMAX:
996 case OP_KETRMIN:
997 case OP_KETRPOS:
998 return SSB_CONTINUE;
1000 /* Skip over callout */
1002 case OP_CALLOUT:
1003 tcode += 2 + 2*LINK_SIZE;
1004 break;
1006 /* Skip over lookbehind and negative lookahead assertions */
1008 case OP_ASSERT_NOT:
1009 case OP_ASSERTBACK:
1010 case OP_ASSERTBACK_NOT:
1011 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1012 tcode += 1 + LINK_SIZE;
1013 break;
1015 /* BRAZERO does the bracket, but carries on. */
1017 case OP_BRAZERO:
1018 case OP_BRAMINZERO:
1019 case OP_BRAPOSZERO:
1020 rc = set_start_bits(++tcode, start_bits, utf, cd);
1021 if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1022 /* =========================================================================
1023 See the comment at the head of this function concerning the next line,
1024 which was an old fudge for the benefit of OS/2.
1025 dummy = 1;
1026 ========================================================================= */
1027 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1028 tcode += 1 + LINK_SIZE;
1029 break;
1031 /* SKIPZERO skips the bracket. */
1033 case OP_SKIPZERO:
1034 tcode++;
1035 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1036 tcode += 1 + LINK_SIZE;
1037 break;
1039 /* Single-char * or ? sets the bit and tries the next item */
1041 case OP_STAR:
1042 case OP_MINSTAR:
1043 case OP_POSSTAR:
1044 case OP_QUERY:
1045 case OP_MINQUERY:
1046 case OP_POSQUERY:
1047 tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1048 break;
1050 case OP_STARI:
1051 case OP_MINSTARI:
1052 case OP_POSSTARI:
1053 case OP_QUERYI:
1054 case OP_MINQUERYI:
1055 case OP_POSQUERYI:
1056 tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1057 break;
1059 /* Single-char upto sets the bit and tries the next */
1061 case OP_UPTO:
1062 case OP_MINUPTO:
1063 case OP_POSUPTO:
1064 tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
1065 break;
1067 case OP_UPTOI:
1068 case OP_MINUPTOI:
1069 case OP_POSUPTOI:
1070 tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
1071 break;
1073 /* At least one single char sets the bit and stops */
1075 case OP_EXACT:
1076 tcode += IMM2_SIZE;
1077 /* Fall through */
1078 case OP_CHAR:
1079 case OP_PLUS:
1080 case OP_MINPLUS:
1081 case OP_POSPLUS:
1082 (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
1083 try_next = FALSE;
1084 break;
1086 case OP_EXACTI:
1087 tcode += IMM2_SIZE;
1088 /* Fall through */
1089 case OP_CHARI:
1090 case OP_PLUSI:
1091 case OP_MINPLUSI:
1092 case OP_POSPLUSI:
1093 (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1094 try_next = FALSE;
1095 break;
1097 /* Special spacing and line-terminating items. These recognize specific
1098 lists of characters. The difference between VSPACE and ANYNL is that the
1099 latter can match the two-character CRLF sequence, but that is not
1100 relevant for finding the first character, so their code here is
1101 identical. */
1103 case OP_HSPACE:
1104 SET_BIT(CHAR_HT);
1105 SET_BIT(CHAR_SPACE);
1106 #ifdef SUPPORT_UTF
1107 if (utf)
1109 #ifdef COMPILE_PCRE8
1110 SET_BIT(0xC2); /* For U+00A0 */
1111 SET_BIT(0xE1); /* For U+1680, U+180E */
1112 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1113 SET_BIT(0xE3); /* For U+3000 */
1114 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1115 SET_BIT(0xA0);
1116 SET_BIT(0xFF); /* For characters > 255 */
1117 #endif /* COMPILE_PCRE[8|16|32] */
1119 else
1120 #endif /* SUPPORT_UTF */
1122 #ifndef EBCDIC
1123 SET_BIT(0xA0);
1124 #endif /* Not EBCDIC */
1125 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1126 SET_BIT(0xFF); /* For characters > 255 */
1127 #endif /* COMPILE_PCRE[16|32] */
1129 try_next = FALSE;
1130 break;
1132 case OP_ANYNL:
1133 case OP_VSPACE:
1134 SET_BIT(CHAR_LF);
1135 SET_BIT(CHAR_VT);
1136 SET_BIT(CHAR_FF);
1137 SET_BIT(CHAR_CR);
1138 #ifdef SUPPORT_UTF
1139 if (utf)
1141 #ifdef COMPILE_PCRE8
1142 SET_BIT(0xC2); /* For U+0085 */
1143 SET_BIT(0xE2); /* For U+2028, U+2029 */
1144 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1145 SET_BIT(CHAR_NEL);
1146 SET_BIT(0xFF); /* For characters > 255 */
1147 #endif /* COMPILE_PCRE[8|16|32] */
1149 else
1150 #endif /* SUPPORT_UTF */
1152 SET_BIT(CHAR_NEL);
1153 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1154 SET_BIT(0xFF); /* For characters > 255 */
1155 #endif
1157 try_next = FALSE;
1158 break;
1160 /* Single character types set the bits and stop. Note that if PCRE_UCP
1161 is set, we do not see these op codes because \d etc are converted to
1162 properties. Therefore, these apply in the case when only characters less
1163 than 256 are recognized to match the types. */
1165 case OP_NOT_DIGIT:
1166 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1167 try_next = FALSE;
1168 break;
1170 case OP_DIGIT:
1171 set_type_bits(start_bits, cbit_digit, table_limit, cd);
1172 try_next = FALSE;
1173 break;
1175 /* The cbit_space table has vertical tab as whitespace; we no longer
1176 have to play fancy tricks because Perl added VT to its whitespace at
1177 release 5.18. PCRE added it at release 8.34. */
1179 case OP_NOT_WHITESPACE:
1180 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1181 try_next = FALSE;
1182 break;
1184 case OP_WHITESPACE:
1185 set_type_bits(start_bits, cbit_space, table_limit, cd);
1186 try_next = FALSE;
1187 break;
1189 case OP_NOT_WORDCHAR:
1190 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1191 try_next = FALSE;
1192 break;
1194 case OP_WORDCHAR:
1195 set_type_bits(start_bits, cbit_word, table_limit, cd);
1196 try_next = FALSE;
1197 break;
1199 /* One or more character type fudges the pointer and restarts, knowing
1200 it will hit a single character type and stop there. */
1202 case OP_TYPEPLUS:
1203 case OP_TYPEMINPLUS:
1204 case OP_TYPEPOSPLUS:
1205 tcode++;
1206 break;
1208 case OP_TYPEEXACT:
1209 tcode += 1 + IMM2_SIZE;
1210 break;
1212 /* Zero or more repeats of character types set the bits and then
1213 try again. */
1215 case OP_TYPEUPTO:
1216 case OP_TYPEMINUPTO:
1217 case OP_TYPEPOSUPTO:
1218 tcode += IMM2_SIZE; /* Fall through */
1220 case OP_TYPESTAR:
1221 case OP_TYPEMINSTAR:
1222 case OP_TYPEPOSSTAR:
1223 case OP_TYPEQUERY:
1224 case OP_TYPEMINQUERY:
1225 case OP_TYPEPOSQUERY:
1226 switch(tcode[1])
1228 default:
1229 case OP_ANY:
1230 case OP_ALLANY:
1231 return SSB_FAIL;
1233 case OP_HSPACE:
1234 SET_BIT(CHAR_HT);
1235 SET_BIT(CHAR_SPACE);
1236 #ifdef SUPPORT_UTF
1237 if (utf)
1239 #ifdef COMPILE_PCRE8
1240 SET_BIT(0xC2); /* For U+00A0 */
1241 SET_BIT(0xE1); /* For U+1680, U+180E */
1242 SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1243 SET_BIT(0xE3); /* For U+3000 */
1244 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1245 SET_BIT(0xA0);
1246 SET_BIT(0xFF); /* For characters > 255 */
1247 #endif /* COMPILE_PCRE[8|16|32] */
1249 else
1250 #endif /* SUPPORT_UTF */
1251 #ifndef EBCDIC
1252 SET_BIT(0xA0);
1253 #endif /* Not EBCDIC */
1254 break;
1256 case OP_ANYNL:
1257 case OP_VSPACE:
1258 SET_BIT(CHAR_LF);
1259 SET_BIT(CHAR_VT);
1260 SET_BIT(CHAR_FF);
1261 SET_BIT(CHAR_CR);
1262 #ifdef SUPPORT_UTF
1263 if (utf)
1265 #ifdef COMPILE_PCRE8
1266 SET_BIT(0xC2); /* For U+0085 */
1267 SET_BIT(0xE2); /* For U+2028, U+2029 */
1268 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1269 SET_BIT(CHAR_NEL);
1270 SET_BIT(0xFF); /* For characters > 255 */
1271 #endif /* COMPILE_PCRE16 */
1273 else
1274 #endif /* SUPPORT_UTF */
1275 SET_BIT(CHAR_NEL);
1276 break;
1278 case OP_NOT_DIGIT:
1279 set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1280 break;
1282 case OP_DIGIT:
1283 set_type_bits(start_bits, cbit_digit, table_limit, cd);
1284 break;
1286 /* The cbit_space table has vertical tab as whitespace; we no longer
1287 have to play fancy tricks because Perl added VT to its whitespace at
1288 release 5.18. PCRE added it at release 8.34. */
1290 case OP_NOT_WHITESPACE:
1291 set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1292 break;
1294 case OP_WHITESPACE:
1295 set_type_bits(start_bits, cbit_space, table_limit, cd);
1296 break;
1298 case OP_NOT_WORDCHAR:
1299 set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1300 break;
1302 case OP_WORDCHAR:
1303 set_type_bits(start_bits, cbit_word, table_limit, cd);
1304 break;
1307 tcode += 2;
1308 break;
1310 /* Character class where all the information is in a bit map: set the
1311 bits and either carry on or not, according to the repeat count. If it was
1312 a negative class, and we are operating with UTF-8 characters, any byte
1313 with a value >= 0xc4 is a potentially valid starter because it starts a
1314 character with a value > 255. */
1316 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1317 case OP_XCLASS:
1318 if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0)
1319 return SSB_FAIL;
1320 /* All bits are set. */
1321 if ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0 && (tcode[1 + LINK_SIZE] & XCL_NOT) != 0)
1322 return SSB_FAIL;
1323 #endif
1324 /* Fall through */
1326 case OP_NCLASS:
1327 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1328 if (utf)
1330 start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1331 memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1333 #endif
1334 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1335 SET_BIT(0xFF); /* For characters > 255 */
1336 #endif
1337 /* Fall through */
1339 case OP_CLASS:
1341 pcre_uint8 *map;
1342 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1343 map = NULL;
1344 if (*tcode == OP_XCLASS)
1346 if ((tcode[1 + LINK_SIZE] & XCL_MAP) != 0)
1347 map = (pcre_uint8 *)(tcode + 1 + LINK_SIZE + 1);
1348 tcode += GET(tcode, 1);
1350 else
1351 #endif
1353 tcode++;
1354 map = (pcre_uint8 *)tcode;
1355 tcode += 32 / sizeof(pcre_uchar);
1358 /* In UTF-8 mode, the bits in a bit map correspond to character
1359 values, not to byte values. However, the bit map we are constructing is
1360 for byte values. So we have to do a conversion for characters whose
1361 value is > 127. In fact, there are only two possible starting bytes for
1362 characters in the range 128 - 255. */
1364 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1365 if (map != NULL)
1366 #endif
1368 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1369 if (utf)
1371 for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1372 for (c = 128; c < 256; c++)
1374 if ((map[c/8] & (1 << (c&7))) != 0)
1376 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1377 start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
1378 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1382 else
1383 #endif
1385 /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1386 for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1390 /* Advance past the bit map, and act on what follows. For a zero
1391 minimum repeat, continue; otherwise stop processing. */
1393 switch (*tcode)
1395 case OP_CRSTAR:
1396 case OP_CRMINSTAR:
1397 case OP_CRQUERY:
1398 case OP_CRMINQUERY:
1399 case OP_CRPOSSTAR:
1400 case OP_CRPOSQUERY:
1401 tcode++;
1402 break;
1404 case OP_CRRANGE:
1405 case OP_CRMINRANGE:
1406 case OP_CRPOSRANGE:
1407 if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1408 else try_next = FALSE;
1409 break;
1411 default:
1412 try_next = FALSE;
1413 break;
1416 break; /* End of bitmap class handling */
1418 } /* End of switch */
1419 } /* End of try_next loop */
1421 code += GET(code, 1); /* Advance to next branch */
1423 while (*code == OP_ALT);
1424 return yield;
1431 /*************************************************
1432 * Study a compiled expression *
1433 *************************************************/
1435 /* This function is handed a compiled expression that it must study to produce
1436 information that will speed up the matching. It returns a pcre[16]_extra block
1437 which then gets handed back to pcre_exec().
1439 Arguments:
1440 re points to the compiled expression
1441 options contains option bits
1442 errorptr points to where to place error messages;
1443 set NULL unless error
1445 Returns: pointer to a pcre[16]_extra block, with study_data filled in and
1446 the appropriate flags set;
1447 NULL on error or if no optimization possible
1450 #if defined COMPILE_PCRE8
1451 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1452 pcre_study(const pcre *external_re, int options, const char **errorptr)
1453 #elif defined COMPILE_PCRE16
1454 PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1455 pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1456 #elif defined COMPILE_PCRE32
1457 PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
1458 pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
1459 #endif
1461 int min;
1462 int count = 0;
1463 BOOL bits_set = FALSE;
1464 pcre_uint8 start_bits[32];
1465 PUBL(extra) *extra = NULL;
1466 pcre_study_data *study;
1467 const pcre_uint8 *tables;
1468 pcre_uchar *code;
1469 compile_data compile_block;
1470 const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1473 *errorptr = NULL;
1475 if (re == NULL || re->magic_number != MAGIC_NUMBER)
1477 *errorptr = "argument is not a compiled regular expression";
1478 return NULL;
1481 if ((re->flags & PCRE_MODE) == 0)
1483 #if defined COMPILE_PCRE8
1484 *errorptr = "argument not compiled in 8 bit mode";
1485 #elif defined COMPILE_PCRE16
1486 *errorptr = "argument not compiled in 16 bit mode";
1487 #elif defined COMPILE_PCRE32
1488 *errorptr = "argument not compiled in 32 bit mode";
1489 #endif
1490 return NULL;
1493 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1495 *errorptr = "unknown or incorrect option bit(s) set";
1496 return NULL;
1499 code = (pcre_uchar *)re + re->name_table_offset +
1500 (re->name_count * re->name_entry_size);
1502 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1503 a multiline pattern that matches only at "line starts", there is no point in
1504 seeking a list of starting bytes. */
1506 if ((re->options & PCRE_ANCHORED) == 0 &&
1507 (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1509 int rc;
1511 /* Set the character tables in the block that is passed around */
1513 tables = re->tables;
1515 #if defined COMPILE_PCRE8
1516 if (tables == NULL)
1517 (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1518 (void *)(&tables));
1519 #elif defined COMPILE_PCRE16
1520 if (tables == NULL)
1521 (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1522 (void *)(&tables));
1523 #elif defined COMPILE_PCRE32
1524 if (tables == NULL)
1525 (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1526 (void *)(&tables));
1527 #endif
1529 compile_block.lcc = tables + lcc_offset;
1530 compile_block.fcc = tables + fcc_offset;
1531 compile_block.cbits = tables + cbits_offset;
1532 compile_block.ctypes = tables + ctypes_offset;
1534 /* See if we can find a fixed set of initial characters for the pattern. */
1536 memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1537 rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1538 &compile_block);
1539 bits_set = rc == SSB_DONE;
1540 if (rc == SSB_UNKNOWN)
1542 *errorptr = "internal error: opcode not recognized";
1543 return NULL;
1547 /* Find the minimum length of subject string. */
1549 switch(min = find_minlength(re, code, code, re->options, NULL, &count))
1551 case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1552 case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1553 default: break;
1556 /* If a set of starting bytes has been identified, or if the minimum length is
1557 greater than zero, or if JIT optimization has been requested, or if
1558 PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1559 pcre_study_data block. The study data is put in the latter, which is pointed to
1560 by the former, which may also get additional data set later by the calling
1561 program. At the moment, the size of pcre_study_data is fixed. We nevertheless
1562 save it in a field for returning via the pcre_fullinfo() function so that if it
1563 becomes variable in the future, we don't have to change that code. */
1565 if (bits_set || min > 0 || (options & (
1566 #ifdef SUPPORT_JIT
1567 PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
1568 PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
1569 #endif
1570 PCRE_STUDY_EXTRA_NEEDED)) != 0)
1572 extra = (PUBL(extra) *)(PUBL(malloc))
1573 (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1574 if (extra == NULL)
1576 *errorptr = "failed to get memory";
1577 return NULL;
1580 study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1581 extra->flags = PCRE_EXTRA_STUDY_DATA;
1582 extra->study_data = study;
1584 study->size = sizeof(pcre_study_data);
1585 study->flags = 0;
1587 /* Set the start bits always, to avoid unset memory errors if the
1588 study data is written to a file, but set the flag only if any of the bits
1589 are set, to save time looking when none are. */
1591 if (bits_set)
1593 study->flags |= PCRE_STUDY_MAPPED;
1594 memcpy(study->start_bits, start_bits, sizeof(start_bits));
1596 else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1598 #ifdef PCRE_DEBUG
1599 if (bits_set)
1601 pcre_uint8 *ptr = start_bits;
1602 int i;
1604 printf("Start bits:\n");
1605 for (i = 0; i < 32; i++)
1606 printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1608 #endif
1610 /* Always set the minlength value in the block, because the JIT compiler
1611 makes use of it. However, don't set the bit unless the length is greater than
1612 zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1613 checking the zero case. */
1615 if (min > 0)
1617 study->flags |= PCRE_STUDY_MINLEN;
1618 study->minlength = min;
1620 else study->minlength = 0;
1622 /* If JIT support was compiled and requested, attempt the JIT compilation.
1623 If no starting bytes were found, and the minimum length is zero, and JIT
1624 compilation fails, abandon the extra block and return NULL, unless
1625 PCRE_STUDY_EXTRA_NEEDED is set. */
1627 #ifdef SUPPORT_JIT
1628 extra->executable_jit = NULL;
1629 if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1630 PRIV(jit_compile)(re, extra, JIT_COMPILE);
1631 if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1632 PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1633 if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1634 PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1636 if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
1637 (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
1639 #if defined COMPILE_PCRE8
1640 pcre_free_study(extra);
1641 #elif defined COMPILE_PCRE16
1642 pcre16_free_study(extra);
1643 #elif defined COMPILE_PCRE32
1644 pcre32_free_study(extra);
1645 #endif
1646 extra = NULL;
1648 #endif
1651 return extra;
1655 /*************************************************
1656 * Free the study data *
1657 *************************************************/
1659 /* This function frees the memory that was obtained by pcre_study().
1661 Argument: a pointer to the pcre[16]_extra block
1662 Returns: nothing
1665 #if defined COMPILE_PCRE8
1666 PCRE_EXP_DEFN void
1667 pcre_free_study(pcre_extra *extra)
1668 #elif defined COMPILE_PCRE16
1669 PCRE_EXP_DEFN void
1670 pcre16_free_study(pcre16_extra *extra)
1671 #elif defined COMPILE_PCRE32
1672 PCRE_EXP_DEFN void
1673 pcre32_free_study(pcre32_extra *extra)
1674 #endif
1676 if (extra == NULL)
1677 return;
1678 #ifdef SUPPORT_JIT
1679 if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1680 extra->executable_jit != NULL)
1681 PRIV(jit_free)(extra->executable_jit);
1682 #endif
1683 PUBL(free)(extra);
1686 /* End of pcre_study.c */