release/src/router/pcre/pcre_study.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /* PCRE is a library of functions to support regular expressions whose syntax
   6 and semantics are as close as possible to those of the Perl 5 language.
   7
   8                        Written by Philip Hazel
   9            Copyright (c) 1997-2012 University of Cambridge
  10
  11 -----------------------------------------------------------------------------
  12 Redistribution and use in source and binary forms, with or without
  13 modification, are permitted provided that the following conditions are met:
  14
  15     * Redistributions of source code must retain the above copyright notice,
  16       this list of conditions and the following disclaimer.
  17
  18     * Redistributions in binary form must reproduce the above copyright
  19       notice, this list of conditions and the following disclaimer in the
  20       documentation and/or other materials provided with the distribution.
  21
  22     * Neither the name of the University of Cambridge nor the names of its
  23       contributors may be used to endorse or promote products derived from
  24       this software without specific prior written permission.
  25
  26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  36 POSSIBILITY OF SUCH DAMAGE.
  37 -----------------------------------------------------------------------------
  38 */
  39
  40
  41 /* This module contains the external function pcre_study(), along with local
  42 supporting functions. */
  43
  44
  45 #ifdef HAVE_CONFIG_H
  46 #include "config.h"
  47 #endif
  48
  49 #include "pcre_internal.h"
  50
  51 #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
  52
  53 /* Returns from set_start_bits() */
  54
  55 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
  56
  57
  58
  59 /*************************************************
  60 *   Find the minimum subject length for a group  *
  61 *************************************************/
  62
  63 /* Scan a parenthesized group and compute the minimum length of subject that
  64 is needed to match it. This is a lower bound; it does not mean there is a
  65 string of that length that matches. In UTF8 mode, the result is in characters
  66 rather than bytes.
  67
  68 Arguments:
  69   code            pointer to start of group (the bracket)
  70   startcode       pointer to start of the whole pattern
  71   options         the compiling options
  72   int             RECURSE depth
  73
  74 Returns:   the minimum length
  75            -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
  76            -2 internal error (missing capturing bracket)
  77            -3 internal error (opcode not listed)
  78 */
  79
  80 static int
  81 find_minlength(const pcre_uchar *code, const pcre_uchar *startcode, int options,
  82   int recurse_depth)
  83 {
  84 int length = -1;
  85 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
  86 BOOL utf = (options & PCRE_UTF8) != 0;
  87 BOOL had_recurse = FALSE;
  88 register int branchlength = 0;
  89 register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
  90
  91 if (*code == OP_CBRA || *code == OP_SCBRA ||
  92     *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
  93
  94 /* Scan along the opcodes for this branch. If we get to the end of the
  95 branch, check the length against that of the other branches. */
  96
  97 for (;;)
  98   {
  99   int d, min;
 100   pcre_uchar *cs, *ce;
 101   register pcre_uchar op = *cc;
 102
 103   switch (op)
 104     {
 105     case OP_COND:
 106     case OP_SCOND:
 107
 108     /* If there is only one branch in a condition, the implied branch has zero
 109     length, so we don't add anything. This covers the DEFINE "condition"
 110     automatically. */
 111
 112     cs = cc + GET(cc, 1);
 113     if (*cs != OP_ALT)
 114       {
 115       cc = cs + 1 + LINK_SIZE;
 116       break;
 117       }
 118
 119     /* Otherwise we can fall through and treat it the same as any other
 120     subpattern. */
 121
 122     case OP_CBRA:
 123     case OP_SCBRA:
 124     case OP_BRA:
 125     case OP_SBRA:
 126     case OP_CBRAPOS:
 127     case OP_SCBRAPOS:
 128     case OP_BRAPOS:
 129     case OP_SBRAPOS:
 130     case OP_ONCE:
 131     case OP_ONCE_NC:
 132     d = find_minlength(cc, startcode, options, recurse_depth);
 133     if (d < 0) return d;
 134     branchlength += d;
 135     do cc += GET(cc, 1); while (*cc == OP_ALT);
 136     cc += 1 + LINK_SIZE;
 137     break;
 138
 139     /* ACCEPT makes things far too complicated; we have to give up. */
 140
 141     case OP_ACCEPT:
 142     case OP_ASSERT_ACCEPT:
 143     return -1;
 144
 145     /* Reached end of a branch; if it's a ket it is the end of a nested
 146     call. If it's ALT it is an alternation in a nested call. If it is END it's
 147     the end of the outer call. All can be handled by the same code. If an
 148     ACCEPT was previously encountered, use the length that was in force at that
 149     time, and pass back the shortest ACCEPT length. */
 150
 151     case OP_ALT:
 152     case OP_KET:
 153     case OP_KETRMAX:
 154     case OP_KETRMIN:
 155     case OP_KETRPOS:
 156     case OP_END:
 157     if (length < 0 || (!had_recurse && branchlength < length))
 158       length = branchlength;
 159     if (op != OP_ALT) return length;
 160     cc += 1 + LINK_SIZE;
 161     branchlength = 0;
 162     had_recurse = FALSE;
 163     break;
 164
 165     /* Skip over assertive subpatterns */
 166
 167     case OP_ASSERT:
 168     case OP_ASSERT_NOT:
 169     case OP_ASSERTBACK:
 170     case OP_ASSERTBACK_NOT:
 171     do cc += GET(cc, 1); while (*cc == OP_ALT);
 172     /* Fall through */
 173
 174     /* Skip over things that don't match chars */
 175
 176     case OP_REVERSE:
 177     case OP_CREF:
 178     case OP_NCREF:
 179     case OP_RREF:
 180     case OP_NRREF:
 181     case OP_DEF:
 182     case OP_CALLOUT:
 183     case OP_SOD:
 184     case OP_SOM:
 185     case OP_EOD:
 186     case OP_EODN:
 187     case OP_CIRC:
 188     case OP_CIRCM:
 189     case OP_DOLL:
 190     case OP_DOLLM:
 191     case OP_NOT_WORD_BOUNDARY:
 192     case OP_WORD_BOUNDARY:
 193     cc += PRIV(OP_lengths)[*cc];
 194     break;
 195
 196     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
 197
 198     case OP_BRAZERO:
 199     case OP_BRAMINZERO:
 200     case OP_BRAPOSZERO:
 201     case OP_SKIPZERO:
 202     cc += PRIV(OP_lengths)[*cc];
 203     do cc += GET(cc, 1); while (*cc == OP_ALT);
 204     cc += 1 + LINK_SIZE;
 205     break;
 206
 207     /* Handle literal characters and + repetitions */
 208
 209     case OP_CHAR:
 210     case OP_CHARI:
 211     case OP_NOT:
 212     case OP_NOTI:
 213     case OP_PLUS:
 214     case OP_PLUSI:
 215     case OP_MINPLUS:
 216     case OP_MINPLUSI:
 217     case OP_POSPLUS:
 218     case OP_POSPLUSI:
 219     case OP_NOTPLUS:
 220     case OP_NOTPLUSI:
 221     case OP_NOTMINPLUS:
 222     case OP_NOTMINPLUSI:
 223     case OP_NOTPOSPLUS:
 224     case OP_NOTPOSPLUSI:
 225     branchlength++;
 226     cc += 2;
 227 #ifdef SUPPORT_UTF
 228     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 229 #endif
 230     break;
 231
 232     case OP_TYPEPLUS:
 233     case OP_TYPEMINPLUS:
 234     case OP_TYPEPOSPLUS:
 235     branchlength++;
 236     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
 237     break;
 238
 239     /* Handle exact repetitions. The count is already in characters, but we
 240     need to skip over a multibyte character in UTF8 mode.  */
 241
 242     case OP_EXACT:
 243     case OP_EXACTI:
 244     case OP_NOTEXACT:
 245     case OP_NOTEXACTI:
 246     branchlength += GET2(cc,1);
 247     cc += 2 + IMM2_SIZE;
 248 #ifdef SUPPORT_UTF
 249     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 250 #endif
 251     break;
 252
 253     case OP_TYPEEXACT:
 254     branchlength += GET2(cc,1);
 255     cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
 256       || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
 257     break;
 258
 259     /* Handle single-char non-literal matchers */
 260
 261     case OP_PROP:
 262     case OP_NOTPROP:
 263     cc += 2;
 264     /* Fall through */
 265
 266     case OP_NOT_DIGIT:
 267     case OP_DIGIT:
 268     case OP_NOT_WHITESPACE:
 269     case OP_WHITESPACE:
 270     case OP_NOT_WORDCHAR:
 271     case OP_WORDCHAR:
 272     case OP_ANY:
 273     case OP_ALLANY:
 274     case OP_EXTUNI:
 275     case OP_HSPACE:
 276     case OP_NOT_HSPACE:
 277     case OP_VSPACE:
 278     case OP_NOT_VSPACE:
 279     branchlength++;
 280     cc++;
 281     break;
 282
 283     /* "Any newline" might match two characters, but it also might match just
 284     one. */
 285
 286     case OP_ANYNL:
 287     branchlength += 1;
 288     cc++;
 289     break;
 290
 291     /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
 292     non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
 293     appear, but leave the code, just in case.) */
 294
 295     case OP_ANYBYTE:
 296 #ifdef SUPPORT_UTF
 297     if (utf) return -1;
 298 #endif
 299     branchlength++;
 300     cc++;
 301     break;
 302
 303     /* For repeated character types, we have to test for \p and \P, which have
 304     an extra two bytes of parameters. */
 305
 306     case OP_TYPESTAR:
 307     case OP_TYPEMINSTAR:
 308     case OP_TYPEQUERY:
 309     case OP_TYPEMINQUERY:
 310     case OP_TYPEPOSSTAR:
 311     case OP_TYPEPOSQUERY:
 312     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
 313     cc += PRIV(OP_lengths)[op];
 314     break;
 315
 316     case OP_TYPEUPTO:
 317     case OP_TYPEMINUPTO:
 318     case OP_TYPEPOSUPTO:
 319     if (cc[1 + IMM2_SIZE] == OP_PROP
 320       || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
 321     cc += PRIV(OP_lengths)[op];
 322     break;
 323
 324     /* Check a class for variable quantification */
 325
 326     case OP_CLASS:
 327     case OP_NCLASS:
 328 #if defined SUPPORT_UTF || defined COMPILE_PCRE16 || defined COMPILE_PCRE32
 329     case OP_XCLASS:
 330     /* The original code caused an unsigned overflow in 64 bit systems,
 331     so now we use a conditional statement. */
 332     if (op == OP_XCLASS)
 333       cc += GET(cc, 1);
 334     else
 335       cc += PRIV(OP_lengths)[OP_CLASS];
 336 #else
 337     cc += PRIV(OP_lengths)[OP_CLASS];
 338 #endif
 339
 340     switch (*cc)
 341       {
 342       case OP_CRPLUS:
 343       case OP_CRMINPLUS:
 344       branchlength++;
 345       /* Fall through */
 346
 347       case OP_CRSTAR:
 348       case OP_CRMINSTAR:
 349       case OP_CRQUERY:
 350       case OP_CRMINQUERY:
 351       cc++;
 352       break;
 353
 354       case OP_CRRANGE:
 355       case OP_CRMINRANGE:
 356       branchlength += GET2(cc,1);
 357       cc += 1 + 2 * IMM2_SIZE;
 358       break;
 359
 360       default:
 361       branchlength++;
 362       break;
 363       }
 364     break;
 365
 366     /* Backreferences and subroutine calls are treated in the same way: we find
 367     the minimum length for the subpattern. A recursion, however, causes an
 368     a flag to be set that causes the length of this branch to be ignored. The
 369     logic is that a recursion can only make sense if there is another
 370     alternation that stops the recursing. That will provide the minimum length
 371     (when no recursion happens). A backreference within the group that it is
 372     referencing behaves in the same way.
 373
 374     If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
 375     matches an empty string (by default it causes a matching failure), so in
 376     that case we must set the minimum length to zero. */
 377
 378     case OP_REF:
 379     case OP_REFI:
 380     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
 381       {
 382       ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
 383       if (cs == NULL) return -2;
 384       do ce += GET(ce, 1); while (*ce == OP_ALT);
 385       if (cc > cs && cc < ce)
 386         {
 387         d = 0;
 388         had_recurse = TRUE;
 389         }
 390       else
 391         {
 392         d = find_minlength(cs, startcode, options, recurse_depth);
 393         }
 394       }
 395     else d = 0;
 396     cc += 1 + IMM2_SIZE;
 397
 398     /* Handle repeated back references */
 399
 400     switch (*cc)
 401       {
 402       case OP_CRSTAR:
 403       case OP_CRMINSTAR:
 404       case OP_CRQUERY:
 405       case OP_CRMINQUERY:
 406       min = 0;
 407       cc++;
 408       break;
 409
 410       case OP_CRPLUS:
 411       case OP_CRMINPLUS:
 412       min = 1;
 413       cc++;
 414       break;
 415
 416       case OP_CRRANGE:
 417       case OP_CRMINRANGE:
 418       min = GET2(cc, 1);
 419       cc += 1 + 2 * IMM2_SIZE;
 420       break;
 421
 422       default:
 423       min = 1;
 424       break;
 425       }
 426
 427     branchlength += min * d;
 428     break;
 429
 430     /* We can easily detect direct recursion, but not mutual recursion. This is
 431     caught by a recursion depth count. */
 432
 433     case OP_RECURSE:
 434     cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
 435     do ce += GET(ce, 1); while (*ce == OP_ALT);
 436     if ((cc > cs && cc < ce) || recurse_depth > 10)
 437       had_recurse = TRUE;
 438     else
 439       {
 440       branchlength += find_minlength(cs, startcode, options, recurse_depth + 1);
 441       }
 442     cc += 1 + LINK_SIZE;
 443     break;
 444
 445     /* Anything else does not or need not match a character. We can get the
 446     item's length from the table, but for those that can match zero occurrences
 447     of a character, we must take special action for UTF-8 characters. As it
 448     happens, the "NOT" versions of these opcodes are used at present only for
 449     ASCII characters, so they could be omitted from this list. However, in
 450     future that may change, so we include them here so as not to leave a
 451     gotcha for a future maintainer. */
 452
 453     case OP_UPTO:
 454     case OP_UPTOI:
 455     case OP_NOTUPTO:
 456     case OP_NOTUPTOI:
 457     case OP_MINUPTO:
 458     case OP_MINUPTOI:
 459     case OP_NOTMINUPTO:
 460     case OP_NOTMINUPTOI:
 461     case OP_POSUPTO:
 462     case OP_POSUPTOI:
 463     case OP_NOTPOSUPTO:
 464     case OP_NOTPOSUPTOI:
 465
 466     case OP_STAR:
 467     case OP_STARI:
 468     case OP_NOTSTAR:
 469     case OP_NOTSTARI:
 470     case OP_MINSTAR:
 471     case OP_MINSTARI:
 472     case OP_NOTMINSTAR:
 473     case OP_NOTMINSTARI:
 474     case OP_POSSTAR:
 475     case OP_POSSTARI:
 476     case OP_NOTPOSSTAR:
 477     case OP_NOTPOSSTARI:
 478
 479     case OP_QUERY:
 480     case OP_QUERYI:
 481     case OP_NOTQUERY:
 482     case OP_NOTQUERYI:
 483     case OP_MINQUERY:
 484     case OP_MINQUERYI:
 485     case OP_NOTMINQUERY:
 486     case OP_NOTMINQUERYI:
 487     case OP_POSQUERY:
 488     case OP_POSQUERYI:
 489     case OP_NOTPOSQUERY:
 490     case OP_NOTPOSQUERYI:
 491
 492     cc += PRIV(OP_lengths)[op];
 493 #ifdef SUPPORT_UTF
 494     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
 495 #endif
 496     break;
 497
 498     /* Skip these, but we need to add in the name length. */
 499
 500     case OP_MARK:
 501     case OP_PRUNE_ARG:
 502     case OP_SKIP_ARG:
 503     case OP_THEN_ARG:
 504     cc += PRIV(OP_lengths)[op] + cc[1];
 505     break;
 506
 507     /* The remaining opcodes are just skipped over. */
 508
 509     case OP_CLOSE:
 510     case OP_COMMIT:
 511     case OP_FAIL:
 512     case OP_PRUNE:
 513     case OP_SET_SOM:
 514     case OP_SKIP:
 515     case OP_THEN:
 516     cc += PRIV(OP_lengths)[op];
 517     break;
 518
 519     /* This should not occur: we list all opcodes explicitly so that when
 520     new ones get added they are properly considered. */
 521
 522     default:
 523     return -3;
 524     }
 525   }
 526 /* Control never gets here */
 527 }
 528
 529
 530
 531 /*************************************************
 532 *      Set a bit and maybe its alternate case    *
 533 *************************************************/
 534
 535 /* Given a character, set its first byte's bit in the table, and also the
 536 corresponding bit for the other version of a letter if we are caseless. In
 537 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
 538 when Unicode property support is available.
 539
 540 Arguments:
 541   start_bits    points to the bit map
 542   p             points to the character
 543   caseless      the caseless flag
 544   cd            the block with char table pointers
 545   utf           TRUE for UTF-8 / UTF-16 / UTF-32 mode
 546
 547 Returns:        pointer after the character
 548 */
 549
 550 static const pcre_uchar *
 551 set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
 552   compile_data *cd, BOOL utf)
 553 {
 554 pcre_uint32 c = *p;
 555
 556 #ifdef COMPILE_PCRE8
 557 SET_BIT(c);
 558
 559 #ifdef SUPPORT_UTF
 560 if (utf && c > 127)
 561   {
 562   GETCHARINC(c, p);
 563 #ifdef SUPPORT_UCP
 564   if (caseless)
 565     {
 566     pcre_uchar buff[6];
 567     c = UCD_OTHERCASE(c);
 568     (void)PRIV(ord2utf)(c, buff);
 569     SET_BIT(buff[0]);
 570     }
 571 #endif  /* Not SUPPORT_UCP */
 572   return p;
 573   }
 574 #else   /* Not SUPPORT_UTF */
 575 (void)(utf);   /* Stops warning for unused parameter */
 576 #endif  /* SUPPORT_UTF */
 577
 578 /* Not UTF-8 mode, or character is less than 127. */
 579
 580 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
 581 return p + 1;
 582 #endif  /* COMPILE_PCRE8 */
 583
 584 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
 585 if (c > 0xff)
 586   {
 587   c = 0xff;
 588   caseless = FALSE;
 589   }
 590 SET_BIT(c);
 591
 592 #ifdef SUPPORT_UTF
 593 if (utf && c > 127)
 594   {
 595   GETCHARINC(c, p);
 596 #ifdef SUPPORT_UCP
 597   if (caseless)
 598     {
 599     c = UCD_OTHERCASE(c);
 600     if (c > 0xff)
 601       c = 0xff;
 602     SET_BIT(c);
 603     }
 604 #endif  /* SUPPORT_UCP */
 605   return p;
 606   }
 607 #else   /* Not SUPPORT_UTF */
 608 (void)(utf);   /* Stops warning for unused parameter */
 609 #endif  /* SUPPORT_UTF */
 610
 611 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
 612 return p + 1;
 613 #endif
 614 }
 615
 616
 617
 618 /*************************************************
 619 *     Set bits for a positive character type     *
 620 *************************************************/
 621
 622 /* This function sets starting bits for a character type. In UTF-8 mode, we can
 623 only do a direct setting for bytes less than 128, as otherwise there can be
 624 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
 625 environment, the tables will only recognize ASCII characters anyway, but in at
 626 least one Windows environment, some higher bytes bits were set in the tables.
 627 So we deal with that case by considering the UTF-8 encoding.
 628
 629 Arguments:
 630   start_bits     the starting bitmap
 631   cbit type      the type of character wanted
 632   table_limit    32 for non-UTF-8; 16 for UTF-8
 633   cd             the block with char table pointers
 634
 635 Returns:         nothing
 636 */
 637
 638 static void
 639 set_type_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
 640   compile_data *cd)
 641 {
 642 register pcre_uint32 c;
 643 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
 644 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
 645 if (table_limit == 32) return;
 646 for (c = 128; c < 256; c++)
 647   {
 648   if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
 649     {
 650     pcre_uchar buff[6];
 651     (void)PRIV(ord2utf)(c, buff);
 652     SET_BIT(buff[0]);
 653     }
 654   }
 655 #endif
 656 }
 657
 658
 659 /*************************************************
 660 *     Set bits for a negative character type     *
 661 *************************************************/
 662
 663 /* This function sets starting bits for a negative character type such as \D.
 664 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
 665 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
 666 Unlike in the positive case, where we can set appropriate starting bits for
 667 specific high-valued UTF-8 characters, in this case we have to set the bits for
 668 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
 669 0xc0 (192) for simplicity.
 670
 671 Arguments:
 672   start_bits     the starting bitmap
 673   cbit type      the type of character wanted
 674   table_limit    32 for non-UTF-8; 16 for UTF-8
 675   cd             the block with char table pointers
 676
 677 Returns:         nothing
 678 */
 679
 680 static void
 681 set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, unsigned int table_limit,
 682   compile_data *cd)
 683 {
 684 register pcre_uint32 c;
 685 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
 686 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
 687 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
 688 #endif
 689 }
 690
 691
 692
 693 /*************************************************
 694 *          Create bitmap of starting bytes       *
 695 *************************************************/
 696
 697 /* This function scans a compiled unanchored expression recursively and
 698 attempts to build a bitmap of the set of possible starting bytes. As time goes
 699 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
 700 useful for parenthesized groups in patterns such as (a*)b where the group
 701 provides some optional starting bytes but scanning must continue at the outer
 702 level to find at least one mandatory byte. At the outermost level, this
 703 function fails unless the result is SSB_DONE.
 704
 705 Arguments:
 706   code         points to an expression
 707   start_bits   points to a 32-byte table, initialized to 0
 708   utf          TRUE if in UTF-8 / UTF-16 / UTF-32 mode
 709   cd           the block with char table pointers
 710
 711 Returns:       SSB_FAIL     => Failed to find any starting bytes
 712                SSB_DONE     => Found mandatory starting bytes
 713                SSB_CONTINUE => Found optional starting bytes
 714                SSB_UNKNOWN  => Hit an unrecognized opcode
 715 */
 716
 717 static int
 718 set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
 719   compile_data *cd)
 720 {
 721 register pcre_uint32 c;
 722 int yield = SSB_DONE;
 723 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
 724 int table_limit = utf? 16:32;
 725 #else
 726 int table_limit = 32;
 727 #endif
 728
 729 #if 0
 730 /* ========================================================================= */
 731 /* The following comment and code was inserted in January 1999. In May 2006,
 732 when it was observed to cause compiler warnings about unused values, I took it
 733 out again. If anybody is still using OS/2, they will have to put it back
 734 manually. */
 735
 736 /* This next statement and the later reference to dummy are here in order to
 737 trick the optimizer of the IBM C compiler for OS/2 into generating correct
 738 code. Apparently IBM isn't going to fix the problem, and we would rather not
 739 disable optimization (in this module it actually makes a big difference, and
 740 the pcre module can use all the optimization it can get). */
 741
 742 volatile int dummy;
 743 /* ========================================================================= */
 744 #endif
 745
 746 do
 747   {
 748   BOOL try_next = TRUE;
 749   const pcre_uchar *tcode = code + 1 + LINK_SIZE;
 750
 751   if (*code == OP_CBRA || *code == OP_SCBRA ||
 752       *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
 753
 754   while (try_next)    /* Loop for items in this branch */
 755     {
 756     int rc;
 757
 758     switch(*tcode)
 759       {
 760       /* If we reach something we don't understand, it means a new opcode has
 761       been created that hasn't been added to this code. Hopefully this problem
 762       will be discovered during testing. */
 763
 764       default:
 765       return SSB_UNKNOWN;
 766
 767       /* Fail for a valid opcode that implies no starting bits. */
 768
 769       case OP_ACCEPT:
 770       case OP_ASSERT_ACCEPT:
 771       case OP_ALLANY:
 772       case OP_ANY:
 773       case OP_ANYBYTE:
 774       case OP_CIRC:
 775       case OP_CIRCM:
 776       case OP_CLOSE:
 777       case OP_COMMIT:
 778       case OP_COND:
 779       case OP_CREF:
 780       case OP_DEF:
 781       case OP_DOLL:
 782       case OP_DOLLM:
 783       case OP_END:
 784       case OP_EOD:
 785       case OP_EODN:
 786       case OP_EXTUNI:
 787       case OP_FAIL:
 788       case OP_MARK:
 789       case OP_NCREF:
 790       case OP_NOT:
 791       case OP_NOTEXACT:
 792       case OP_NOTEXACTI:
 793       case OP_NOTI:
 794       case OP_NOTMINPLUS:
 795       case OP_NOTMINPLUSI:
 796       case OP_NOTMINQUERY:
 797       case OP_NOTMINQUERYI:
 798       case OP_NOTMINSTAR:
 799       case OP_NOTMINSTARI:
 800       case OP_NOTMINUPTO:
 801       case OP_NOTMINUPTOI:
 802       case OP_NOTPLUS:
 803       case OP_NOTPLUSI:
 804       case OP_NOTPOSPLUS:
 805       case OP_NOTPOSPLUSI:
 806       case OP_NOTPOSQUERY:
 807       case OP_NOTPOSQUERYI:
 808       case OP_NOTPOSSTAR:
 809       case OP_NOTPOSSTARI:
 810       case OP_NOTPOSUPTO:
 811       case OP_NOTPOSUPTOI:
 812       case OP_NOTPROP:
 813       case OP_NOTQUERY:
 814       case OP_NOTQUERYI:
 815       case OP_NOTSTAR:
 816       case OP_NOTSTARI:
 817       case OP_NOTUPTO:
 818       case OP_NOTUPTOI:
 819       case OP_NOT_HSPACE:
 820       case OP_NOT_VSPACE:
 821       case OP_NRREF:
 822       case OP_PROP:
 823       case OP_PRUNE:
 824       case OP_PRUNE_ARG:
 825       case OP_RECURSE:
 826       case OP_REF:
 827       case OP_REFI:
 828       case OP_REVERSE:
 829       case OP_RREF:
 830       case OP_SCOND:
 831       case OP_SET_SOM:
 832       case OP_SKIP:
 833       case OP_SKIP_ARG:
 834       case OP_SOD:
 835       case OP_SOM:
 836       case OP_THEN:
 837       case OP_THEN_ARG:
 838 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
 839       case OP_XCLASS:
 840 #endif
 841       return SSB_FAIL;
 842
 843       /* We can ignore word boundary tests. */
 844
 845       case OP_WORD_BOUNDARY:
 846       case OP_NOT_WORD_BOUNDARY:
 847       tcode++;
 848       break;
 849
 850       /* If we hit a bracket or a positive lookahead assertion, recurse to set
 851       bits from within the subpattern. If it can't find anything, we have to
 852       give up. If it finds some mandatory character(s), we are done for this
 853       branch. Otherwise, carry on scanning after the subpattern. */
 854
 855       case OP_BRA:
 856       case OP_SBRA:
 857       case OP_CBRA:
 858       case OP_SCBRA:
 859       case OP_BRAPOS:
 860       case OP_SBRAPOS:
 861       case OP_CBRAPOS:
 862       case OP_SCBRAPOS:
 863       case OP_ONCE:
 864       case OP_ONCE_NC:
 865       case OP_ASSERT:
 866       rc = set_start_bits(tcode, start_bits, utf, cd);
 867       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
 868       if (rc == SSB_DONE) try_next = FALSE; else
 869         {
 870         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
 871         tcode += 1 + LINK_SIZE;
 872         }
 873       break;
 874
 875       /* If we hit ALT or KET, it means we haven't found anything mandatory in
 876       this branch, though we might have found something optional. For ALT, we
 877       continue with the next alternative, but we have to arrange that the final
 878       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
 879       return SSB_CONTINUE: if this is the top level, that indicates failure,
 880       but after a nested subpattern, it causes scanning to continue. */
 881
 882       case OP_ALT:
 883       yield = SSB_CONTINUE;
 884       try_next = FALSE;
 885       break;
 886
 887       case OP_KET:
 888       case OP_KETRMAX:
 889       case OP_KETRMIN:
 890       case OP_KETRPOS:
 891       return SSB_CONTINUE;
 892
 893       /* Skip over callout */
 894
 895       case OP_CALLOUT:
 896       tcode += 2 + 2*LINK_SIZE;
 897       break;
 898
 899       /* Skip over lookbehind and negative lookahead assertions */
 900
 901       case OP_ASSERT_NOT:
 902       case OP_ASSERTBACK:
 903       case OP_ASSERTBACK_NOT:
 904       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
 905       tcode += 1 + LINK_SIZE;
 906       break;
 907
 908       /* BRAZERO does the bracket, but carries on. */
 909
 910       case OP_BRAZERO:
 911       case OP_BRAMINZERO:
 912       case OP_BRAPOSZERO:
 913       rc = set_start_bits(++tcode, start_bits, utf, cd);
 914       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
 915 /* =========================================================================
 916       See the comment at the head of this function concerning the next line,
 917       which was an old fudge for the benefit of OS/2.
 918       dummy = 1;
 919   ========================================================================= */
 920       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
 921       tcode += 1 + LINK_SIZE;
 922       break;
 923
 924       /* SKIPZERO skips the bracket. */
 925
 926       case OP_SKIPZERO:
 927       tcode++;
 928       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
 929       tcode += 1 + LINK_SIZE;
 930       break;
 931
 932       /* Single-char * or ? sets the bit and tries the next item */
 933
 934       case OP_STAR:
 935       case OP_MINSTAR:
 936       case OP_POSSTAR:
 937       case OP_QUERY:
 938       case OP_MINQUERY:
 939       case OP_POSQUERY:
 940       tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
 941       break;
 942
 943       case OP_STARI:
 944       case OP_MINSTARI:
 945       case OP_POSSTARI:
 946       case OP_QUERYI:
 947       case OP_MINQUERYI:
 948       case OP_POSQUERYI:
 949       tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
 950       break;
 951
 952       /* Single-char upto sets the bit and tries the next */
 953
 954       case OP_UPTO:
 955       case OP_MINUPTO:
 956       case OP_POSUPTO:
 957       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
 958       break;
 959
 960       case OP_UPTOI:
 961       case OP_MINUPTOI:
 962       case OP_POSUPTOI:
 963       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
 964       break;
 965
 966       /* At least one single char sets the bit and stops */
 967
 968       case OP_EXACT:
 969       tcode += IMM2_SIZE;
 970       /* Fall through */
 971       case OP_CHAR:
 972       case OP_PLUS:
 973       case OP_MINPLUS:
 974       case OP_POSPLUS:
 975       (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
 976       try_next = FALSE;
 977       break;
 978
 979       case OP_EXACTI:
 980       tcode += IMM2_SIZE;
 981       /* Fall through */
 982       case OP_CHARI:
 983       case OP_PLUSI:
 984       case OP_MINPLUSI:
 985       case OP_POSPLUSI:
 986       (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
 987       try_next = FALSE;
 988       break;
 989
 990       /* Special spacing and line-terminating items. These recognize specific
 991       lists of characters. The difference between VSPACE and ANYNL is that the
 992       latter can match the two-character CRLF sequence, but that is not
 993       relevant for finding the first character, so their code here is
 994       identical. */
 995
 996       case OP_HSPACE:
 997       SET_BIT(CHAR_HT);
 998       SET_BIT(CHAR_SPACE);
 999 #ifdef SUPPORT_UTF
1000       if (utf)
1001         {
1002 #ifdef COMPILE_PCRE8
1003         SET_BIT(0xC2);  /* For U+00A0 */
1004         SET_BIT(0xE1);  /* For U+1680, U+180E */
1005         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1006         SET_BIT(0xE3);  /* For U+3000 */
1007 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1008         SET_BIT(0xA0);
1009         SET_BIT(0xFF);  /* For characters > 255 */
1010 #endif  /* COMPILE_PCRE[8|16|32] */
1011         }
1012       else
1013 #endif /* SUPPORT_UTF */
1014         {
1015 #ifndef EBCDIC
1016         SET_BIT(0xA0);
1017 #endif  /* Not EBCDIC */
1018 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1019         SET_BIT(0xFF);  /* For characters > 255 */
1020 #endif  /* COMPILE_PCRE[16|32] */
1021         }
1022       try_next = FALSE;
1023       break;
1024
1025       case OP_ANYNL:
1026       case OP_VSPACE:
1027       SET_BIT(CHAR_LF);
1028       SET_BIT(CHAR_VT);
1029       SET_BIT(CHAR_FF);
1030       SET_BIT(CHAR_CR);
1031 #ifdef SUPPORT_UTF
1032       if (utf)
1033         {
1034 #ifdef COMPILE_PCRE8
1035         SET_BIT(0xC2);  /* For U+0085 */
1036         SET_BIT(0xE2);  /* For U+2028, U+2029 */
1037 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1038         SET_BIT(CHAR_NEL);
1039         SET_BIT(0xFF);  /* For characters > 255 */
1040 #endif  /* COMPILE_PCRE[8|16|32] */
1041         }
1042       else
1043 #endif /* SUPPORT_UTF */
1044         {
1045         SET_BIT(CHAR_NEL);
1046 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1047         SET_BIT(0xFF);  /* For characters > 255 */
1048 #endif
1049         }
1050       try_next = FALSE;
1051       break;
1052
1053       /* Single character types set the bits and stop. Note that if PCRE_UCP
1054       is set, we do not see these op codes because \d etc are converted to
1055       properties. Therefore, these apply in the case when only characters less
1056       than 256 are recognized to match the types. */
1057
1058       case OP_NOT_DIGIT:
1059       set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1060       try_next = FALSE;
1061       break;
1062
1063       case OP_DIGIT:
1064       set_type_bits(start_bits, cbit_digit, table_limit, cd);
1065       try_next = FALSE;
1066       break;
1067
1068       /* The cbit_space table has vertical tab as whitespace; we have to
1069       ensure it is set as not whitespace. Luckily, the code value is the same
1070       (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate bit. */
1071
1072       case OP_NOT_WHITESPACE:
1073       set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1074       start_bits[1] |= 0x08;
1075       try_next = FALSE;
1076       break;
1077
1078       /* The cbit_space table has vertical tab as whitespace; we have to not
1079       set it from the table. Luckily, the code value is the same (0x0b) in
1080       ASCII and EBCDIC, so we can just adjust the appropriate bit. */
1081
1082       case OP_WHITESPACE:
1083       c = start_bits[1];    /* Save in case it was already set */
1084       set_type_bits(start_bits, cbit_space, table_limit, cd);
1085       start_bits[1] = (start_bits[1] & ~0x08) | c;
1086       try_next = FALSE;
1087       break;
1088
1089       case OP_NOT_WORDCHAR:
1090       set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1091       try_next = FALSE;
1092       break;
1093
1094       case OP_WORDCHAR:
1095       set_type_bits(start_bits, cbit_word, table_limit, cd);
1096       try_next = FALSE;
1097       break;
1098
1099       /* One or more character type fudges the pointer and restarts, knowing
1100       it will hit a single character type and stop there. */
1101
1102       case OP_TYPEPLUS:
1103       case OP_TYPEMINPLUS:
1104       case OP_TYPEPOSPLUS:
1105       tcode++;
1106       break;
1107
1108       case OP_TYPEEXACT:
1109       tcode += 1 + IMM2_SIZE;
1110       break;
1111
1112       /* Zero or more repeats of character types set the bits and then
1113       try again. */
1114
1115       case OP_TYPEUPTO:
1116       case OP_TYPEMINUPTO:
1117       case OP_TYPEPOSUPTO:
1118       tcode += IMM2_SIZE;  /* Fall through */
1119
1120       case OP_TYPESTAR:
1121       case OP_TYPEMINSTAR:
1122       case OP_TYPEPOSSTAR:
1123       case OP_TYPEQUERY:
1124       case OP_TYPEMINQUERY:
1125       case OP_TYPEPOSQUERY:
1126       switch(tcode[1])
1127         {
1128         default:
1129         case OP_ANY:
1130         case OP_ALLANY:
1131         return SSB_FAIL;
1132
1133         case OP_HSPACE:
1134         SET_BIT(CHAR_HT);
1135         SET_BIT(CHAR_SPACE);
1136 #ifdef SUPPORT_UTF
1137         if (utf)
1138           {
1139 #ifdef COMPILE_PCRE8
1140           SET_BIT(0xC2);  /* For U+00A0 */
1141           SET_BIT(0xE1);  /* For U+1680, U+180E */
1142           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1143           SET_BIT(0xE3);  /* For U+3000 */
1144 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1145           SET_BIT(0xA0);
1146           SET_BIT(0xFF);  /* For characters > 255 */
1147 #endif  /* COMPILE_PCRE[8|16|32] */
1148           }
1149         else
1150 #endif /* SUPPORT_UTF */
1151 #ifndef EBCDIC
1152           SET_BIT(0xA0);
1153 #endif  /* Not EBCDIC */
1154         break;
1155
1156         case OP_ANYNL:
1157         case OP_VSPACE:
1158         SET_BIT(CHAR_LF);
1159         SET_BIT(CHAR_VT);
1160         SET_BIT(CHAR_FF);
1161         SET_BIT(CHAR_CR);
1162 #ifdef SUPPORT_UTF
1163         if (utf)
1164           {
1165 #ifdef COMPILE_PCRE8
1166           SET_BIT(0xC2);  /* For U+0085 */
1167           SET_BIT(0xE2);  /* For U+2028, U+2029 */
1168 #elif defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1169           SET_BIT(CHAR_NEL);
1170           SET_BIT(0xFF);  /* For characters > 255 */
1171 #endif  /* COMPILE_PCRE16 */
1172           }
1173         else
1174 #endif /* SUPPORT_UTF */
1175           SET_BIT(CHAR_NEL);
1176         break;
1177
1178         case OP_NOT_DIGIT:
1179         set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1180         break;
1181
1182         case OP_DIGIT:
1183         set_type_bits(start_bits, cbit_digit, table_limit, cd);
1184         break;
1185
1186         /* The cbit_space table has vertical tab as whitespace; we have to
1187         ensure it gets set as not whitespace. Luckily, the code value is the
1188         same (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate
1189         bit. */
1190
1191         case OP_NOT_WHITESPACE:
1192         set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1193         start_bits[1] |= 0x08;
1194         break;
1195
1196         /* The cbit_space table has vertical tab as whitespace; we have to
1197         avoid setting it. Luckily, the code value is the same (0x0b) in ASCII
1198         and EBCDIC, so we can just adjust the appropriate bit. */
1199
1200         case OP_WHITESPACE:
1201         c = start_bits[1];    /* Save in case it was already set */
1202         set_type_bits(start_bits, cbit_space, table_limit, cd);
1203         start_bits[1] = (start_bits[1] & ~0x08) | c;
1204         break;
1205
1206         case OP_NOT_WORDCHAR:
1207         set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1208         break;
1209
1210         case OP_WORDCHAR:
1211         set_type_bits(start_bits, cbit_word, table_limit, cd);
1212         break;
1213         }
1214
1215       tcode += 2;
1216       break;
1217
1218       /* Character class where all the information is in a bit map: set the
1219       bits and either carry on or not, according to the repeat count. If it was
1220       a negative class, and we are operating with UTF-8 characters, any byte
1221       with a value >= 0xc4 is a potentially valid starter because it starts a
1222       character with a value > 255. */
1223
1224       case OP_NCLASS:
1225 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1226       if (utf)
1227         {
1228         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
1229         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
1230         }
1231 #endif
1232 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1233       SET_BIT(0xFF);                         /* For characters > 255 */
1234 #endif
1235       /* Fall through */
1236
1237       case OP_CLASS:
1238         {
1239         pcre_uint8 *map;
1240         tcode++;
1241         map = (pcre_uint8 *)tcode;
1242
1243         /* In UTF-8 mode, the bits in a bit map correspond to character
1244         values, not to byte values. However, the bit map we are constructing is
1245         for byte values. So we have to do a conversion for characters whose
1246         value is > 127. In fact, there are only two possible starting bytes for
1247         characters in the range 128 - 255. */
1248
1249 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1250         if (utf)
1251           {
1252           for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1253           for (c = 128; c < 256; c++)
1254             {
1255             if ((map[c/8] && (1 << (c&7))) != 0)
1256               {
1257               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
1258               start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
1259               c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
1260               }
1261             }
1262           }
1263         else
1264 #endif
1265           {
1266           /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1267           for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1268           }
1269
1270         /* Advance past the bit map, and act on what follows. For a zero
1271         minimum repeat, continue; otherwise stop processing. */
1272
1273         tcode += 32 / sizeof(pcre_uchar);
1274         switch (*tcode)
1275           {
1276           case OP_CRSTAR:
1277           case OP_CRMINSTAR:
1278           case OP_CRQUERY:
1279           case OP_CRMINQUERY:
1280           tcode++;
1281           break;
1282
1283           case OP_CRRANGE:
1284           case OP_CRMINRANGE:
1285           if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1286             else try_next = FALSE;
1287           break;
1288
1289           default:
1290           try_next = FALSE;
1291           break;
1292           }
1293         }
1294       break; /* End of bitmap class handling */
1295
1296       }      /* End of switch */
1297     }        /* End of try_next loop */
1298
1299   code += GET(code, 1);   /* Advance to next branch */
1300   }
1301 while (*code == OP_ALT);
1302 return yield;
1303 }
1304
1305
1306
1307
1308
1309 /*************************************************
1310 *          Study a compiled expression           *
1311 *************************************************/
1312
1313 /* This function is handed a compiled expression that it must study to produce
1314 information that will speed up the matching. It returns a pcre[16]_extra block
1315 which then gets handed back to pcre_exec().
1316
1317 Arguments:
1318   re        points to the compiled expression
1319   options   contains option bits
1320   errorptr  points to where to place error messages;
1321             set NULL unless error
1322
1323 Returns:    pointer to a pcre[16]_extra block, with study_data filled in and
1324               the appropriate flags set;
1325             NULL on error or if no optimization possible
1326 */
1327
1328 #if defined COMPILE_PCRE8
1329 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1330 pcre_study(const pcre *external_re, int options, const char **errorptr)
1331 #elif defined COMPILE_PCRE16
1332 PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1333 pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1334 #elif defined COMPILE_PCRE32
1335 PCRE_EXP_DEFN pcre32_extra * PCRE_CALL_CONVENTION
1336 pcre32_study(const pcre32 *external_re, int options, const char **errorptr)
1337 #endif
1338 {
1339 int min;
1340 BOOL bits_set = FALSE;
1341 pcre_uint8 start_bits[32];
1342 PUBL(extra) *extra = NULL;
1343 pcre_study_data *study;
1344 const pcre_uint8 *tables;
1345 pcre_uchar *code;
1346 compile_data compile_block;
1347 const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1348
1349 *errorptr = NULL;
1350
1351 if (re == NULL || re->magic_number != MAGIC_NUMBER)
1352   {
1353   *errorptr = "argument is not a compiled regular expression";
1354   return NULL;
1355   }
1356
1357 if ((re->flags & PCRE_MODE) == 0)
1358   {
1359 #if defined COMPILE_PCRE8
1360   *errorptr = "argument not compiled in 8 bit mode";
1361 #elif defined COMPILE_PCRE16
1362   *errorptr = "argument not compiled in 16 bit mode";
1363 #elif defined COMPILE_PCRE32
1364   *errorptr = "argument not compiled in 32 bit mode";
1365 #endif
1366   return NULL;
1367   }
1368
1369 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1370   {
1371   *errorptr = "unknown or incorrect option bit(s) set";
1372   return NULL;
1373   }
1374
1375 code = (pcre_uchar *)re + re->name_table_offset +
1376   (re->name_count * re->name_entry_size);
1377
1378 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1379 a multiline pattern that matches only at "line starts", there is no point in
1380 seeking a list of starting bytes. */
1381
1382 if ((re->options & PCRE_ANCHORED) == 0 &&
1383     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1384   {
1385   int rc;
1386
1387   /* Set the character tables in the block that is passed around */
1388
1389   tables = re->tables;
1390
1391 #if defined COMPILE_PCRE8
1392   if (tables == NULL)
1393     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1394     (void *)(&tables));
1395 #elif defined COMPILE_PCRE16
1396   if (tables == NULL)
1397     (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1398     (void *)(&tables));
1399 #elif defined COMPILE_PCRE32
1400   if (tables == NULL)
1401     (void)pcre32_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1402     (void *)(&tables));
1403 #endif
1404
1405   compile_block.lcc = tables + lcc_offset;
1406   compile_block.fcc = tables + fcc_offset;
1407   compile_block.cbits = tables + cbits_offset;
1408   compile_block.ctypes = tables + ctypes_offset;
1409
1410   /* See if we can find a fixed set of initial characters for the pattern. */
1411
1412   memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1413   rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1414     &compile_block);
1415   bits_set = rc == SSB_DONE;
1416   if (rc == SSB_UNKNOWN)
1417     {
1418     *errorptr = "internal error: opcode not recognized";
1419     return NULL;
1420     }
1421   }
1422
1423 /* Find the minimum length of subject string. */
1424
1425 switch(min = find_minlength(code, code, re->options, 0))
1426   {
1427   case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1428   case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1429   default: break;
1430   }
1431
1432 /* If a set of starting bytes has been identified, or if the minimum length is
1433 greater than zero, or if JIT optimization has been requested, or if
1434 PCRE_STUDY_EXTRA_NEEDED is set, get a pcre[16]_extra block and a
1435 pcre_study_data block. The study data is put in the latter, which is pointed to
1436 by the former, which may also get additional data set later by the calling
1437 program. At the moment, the size of pcre_study_data is fixed. We nevertheless
1438 save it in a field for returning via the pcre_fullinfo() function so that if it
1439 becomes variable in the future, we don't have to change that code. */
1440
1441 if (bits_set || min > 0 || (options & (
1442 #ifdef SUPPORT_JIT
1443     PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE |
1444     PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE |
1445 #endif
1446     PCRE_STUDY_EXTRA_NEEDED)) != 0)
1447   {
1448   extra = (PUBL(extra) *)(PUBL(malloc))
1449     (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1450   if (extra == NULL)
1451     {
1452     *errorptr = "failed to get memory";
1453     return NULL;
1454     }
1455
1456   study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1457   extra->flags = PCRE_EXTRA_STUDY_DATA;
1458   extra->study_data = study;
1459
1460   study->size = sizeof(pcre_study_data);
1461   study->flags = 0;
1462
1463   /* Set the start bits always, to avoid unset memory errors if the
1464   study data is written to a file, but set the flag only if any of the bits
1465   are set, to save time looking when none are. */
1466
1467   if (bits_set)
1468     {
1469     study->flags |= PCRE_STUDY_MAPPED;
1470     memcpy(study->start_bits, start_bits, sizeof(start_bits));
1471     }
1472   else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1473
1474 #ifdef PCRE_DEBUG
1475   if (bits_set)
1476     {
1477     pcre_uint8 *ptr = start_bits;
1478     int i;
1479
1480     printf("Start bits:\n");
1481     for (i = 0; i < 32; i++)
1482       printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1483     }
1484 #endif
1485
1486   /* Always set the minlength value in the block, because the JIT compiler
1487   makes use of it. However, don't set the bit unless the length is greater than
1488   zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1489   checking the zero case. */
1490
1491   if (min > 0)
1492     {
1493     study->flags |= PCRE_STUDY_MINLEN;
1494     study->minlength = min;
1495     }
1496   else study->minlength = 0;
1497
1498   /* If JIT support was compiled and requested, attempt the JIT compilation.
1499   If no starting bytes were found, and the minimum length is zero, and JIT
1500   compilation fails, abandon the extra block and return NULL, unless
1501   PCRE_STUDY_EXTRA_NEEDED is set. */
1502
1503 #ifdef SUPPORT_JIT
1504   extra->executable_jit = NULL;
1505   if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1506     PRIV(jit_compile)(re, extra, JIT_COMPILE);
1507   if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1508     PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1509   if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1510     PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1511
1512   if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0 &&
1513       (options & PCRE_STUDY_EXTRA_NEEDED) == 0)
1514     {
1515 #if defined COMPILE_PCRE8
1516     pcre_free_study(extra);
1517 #elif defined COMPILE_PCRE16
1518     pcre16_free_study(extra);
1519 #elif defined COMPILE_PCRE32
1520     pcre32_free_study(extra);
1521 #endif
1522     extra = NULL;
1523     }
1524 #endif
1525   }
1526
1527 return extra;
1528 }
1529
1530
1531 /*************************************************
1532 *          Free the study data                   *
1533 *************************************************/
1534
1535 /* This function frees the memory that was obtained by pcre_study().
1536
1537 Argument:   a pointer to the pcre[16]_extra block
1538 Returns:    nothing
1539 */
1540
1541 #if defined COMPILE_PCRE8
1542 PCRE_EXP_DEFN void
1543 pcre_free_study(pcre_extra *extra)
1544 #elif defined COMPILE_PCRE16
1545 PCRE_EXP_DEFN void
1546 pcre16_free_study(pcre16_extra *extra)
1547 #elif defined COMPILE_PCRE32
1548 PCRE_EXP_DEFN void
1549 pcre32_free_study(pcre32_extra *extra)
1550 #endif
1551 {
1552 if (extra == NULL)
1553   return;
1554 #ifdef SUPPORT_JIT
1555 if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1556      extra->executable_jit != NULL)
1557   PRIV(jit_free)(extra->executable_jit);
1558 #endif
1559 PUBL(free)(extra);
1560 }
1561
1562 /* End of pcre_study.c */