release/src/router/pcre/pcre_dfa_exec.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /* PCRE is a library of functions to support regular expressions whose syntax
   6 and semantics are as close as possible to those of the Perl 5 language (but see
   7 below for why this module is different).
   8
   9                        Written by Philip Hazel
  10            Copyright (c) 1997-2012 University of Cambridge
  11
  12 -----------------------------------------------------------------------------
  13 Redistribution and use in source and binary forms, with or without
  14 modification, are permitted provided that the following conditions are met:
  15
  16     * Redistributions of source code must retain the above copyright notice,
  17       this list of conditions and the following disclaimer.
  18
  19     * Redistributions in binary form must reproduce the above copyright
  20       notice, this list of conditions and the following disclaimer in the
  21       documentation and/or other materials provided with the distribution.
  22
  23     * Neither the name of the University of Cambridge nor the names of its
  24       contributors may be used to endorse or promote products derived from
  25       this software without specific prior written permission.
  26
  27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  37 POSSIBILITY OF SUCH DAMAGE.
  38 -----------------------------------------------------------------------------
  39 */
  40
  41 /* This module contains the external function pcre_dfa_exec(), which is an
  42 alternative matching function that uses a sort of DFA algorithm (not a true
  43 FSM). This is NOT Perl-compatible, but it has advantages in certain
  44 applications. */
  45
  46
  47 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
  48 the performance of his patterns greatly. I could not use it as it stood, as it
  49 was not thread safe, and made assumptions about pattern sizes. Also, it caused
  50 test 7 to loop, and test 9 to crash with a segfault.
  51
  52 The issue is the check for duplicate states, which is done by a simple linear
  53 search up the state list. (Grep for "duplicate" below to find the code.) For
  54 many patterns, there will never be many states active at one time, so a simple
  55 linear search is fine. In patterns that have many active states, it might be a
  56 bottleneck. The suggested code used an indexing scheme to remember which states
  57 had previously been used for each character, and avoided the linear search when
  58 it knew there was no chance of a duplicate. This was implemented when adding
  59 states to the state lists.
  60
  61 I wrote some thread-safe, not-limited code to try something similar at the time
  62 of checking for duplicates (instead of when adding states), using index vectors
  63 on the stack. It did give a 13% improvement with one specially constructed
  64 pattern for certain subject strings, but on other strings and on many of the
  65 simpler patterns in the test suite it did worse. The major problem, I think,
  66 was the extra time to initialize the index. This had to be done for each call
  67 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
  68 only once - I suspect this was the cause of the problems with the tests.)
  69
  70 Overall, I concluded that the gains in some cases did not outweigh the losses
  71 in others, so I abandoned this code. */
  72
  73
  74
  75 #ifdef HAVE_CONFIG_H
  76 #include "config.h"
  77 #endif
  78
  79 #define NLBLOCK md             /* Block containing newline information */
  80 #define PSSTART start_subject  /* Field containing processed string start */
  81 #define PSEND   end_subject    /* Field containing processed string end */
  82
  83 #include "pcre_internal.h"
  84
  85
  86 /* For use to indent debugging output */
  87
  88 #define SP "                   "   /* Blank padding: debug output prints it via "%.*s" with a precision of rlevel*2-2, so deeper recursion levels are indented further */
  89
  90
  91 /*************************************************
  92 *      Code parameters and static tables         *
  93 *************************************************/
  94
  95 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
  96 into others, under special conditions. A gap of 20 between the blocks should be
  97 enough. The resulting opcodes don't have to be less than 256 because they are
  98 never stored, so we push them well clear of the normal opcodes. */
  99
 100 #define OP_PROP_EXTRA       300   /* Added to codevalue when the type item is OP_PROP or OP_NOTPROP */
 101 #define OP_EXTUNI_EXTRA     320   /* Added to codevalue when the type item is OP_EXTUNI */
 102 #define OP_ANYNL_EXTRA      340   /* Added to codevalue when the type item is OP_ANYNL */
 103 #define OP_HSPACE_EXTRA     360   /* Added to codevalue when the type item is OP_HSPACE or OP_NOT_HSPACE */
 104 #define OP_VSPACE_EXTRA     380   /* Added to codevalue when the type item is OP_VSPACE or OP_NOT_VSPACE */
 105
 106
 107 /* This table identifies those opcodes that are followed immediately by a
 108 character that is to be tested in some way. This makes it possible to
 109 centralize the loading of these characters. In the case of Type * etc, the
 110 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
 111 small value. Non-zero values in the table are the offsets from the opcode where
 112 the character is to be found. ***NOTE*** If the start of this table is
 113 modified, the three tables that follow must also be modified. */
 114
 115 static const pcre_uint8 coptable[] = {
  /* Indexed by opcode value: internal_dfa_exec looks up coptable[codevalue].
  A zero entry means that no inline character is loaded for that opcode. A
  non-zero entry is the offset from the opcode at which the character (or,
  for the Type rows, the type opcode) is read. Rows labelled upto, minupto
  and exact use 1+IMM2_SIZE, so the character sits IMM2_SIZE units further
  on - presumably after an inline repeat count, which this table does not
  show. The order of the entries must follow the opcode numbering. */
 116   0,                             /* End                                    */
 117   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
 118   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
 119   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
 120   0, 0,                          /* \P, \p                                 */
 121   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
 122   0,                             /* \X                                     */
 123   0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
 124   1,                             /* Char                                   */
 125   1,                             /* Chari                                  */
 126   1,                             /* not                                    */
 127   1,                             /* noti                                   */
 128   /* Positive single-char repeats                                          */
 129   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
 130   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto, minupto                          */
 131   1+IMM2_SIZE,                   /* exact                                  */
 132   1, 1, 1, 1+IMM2_SIZE,          /* *+, ++, ?+, upto+                      */
 133   1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
 134   1+IMM2_SIZE, 1+IMM2_SIZE,      /* upto I, minupto I                      */
 135   1+IMM2_SIZE,                   /* exact I                                */
 136   1, 1, 1, 1+IMM2_SIZE,          /* *+I, ++I, ?+I, upto+I                  */
 137   /* Negative single-char repeats - only for chars < 256                   */
 138   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
 139   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto, minupto                      */
 140   1+IMM2_SIZE,                   /* NOT exact                              */
 141   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+, ++, ?+, upto+                  */
 142   1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
 143   1+IMM2_SIZE, 1+IMM2_SIZE,      /* NOT upto I, minupto I                  */
 144   1+IMM2_SIZE,                   /* NOT exact I                            */
 145   1, 1, 1, 1+IMM2_SIZE,          /* NOT *+I, ++I, ?+I, upto+I              */
 146   /* Positive type repeats                                                 */
 147   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
 148   1+IMM2_SIZE, 1+IMM2_SIZE,      /* Type upto, minupto                     */
 149   1+IMM2_SIZE,                   /* Type exact                             */
 150   1, 1, 1, 1+IMM2_SIZE,          /* Type *+, ++, ?+, upto+                 */
 151   /* Character class & ref repeats                                         */
 152   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
 153   0, 0,                          /* CRRANGE, CRMINRANGE                    */
 154   0,                             /* CLASS                                  */
 155   0,                             /* NCLASS                                 */
 156   0,                             /* XCLASS - variable length               */
 157   0,                             /* REF                                    */
 158   0,                             /* REFI                                   */
 159   0,                             /* RECURSE                                */
 160   0,                             /* CALLOUT                                */
 161   0,                             /* Alt                                    */
 162   0,                             /* Ket                                    */
 163   0,                             /* KetRmax                                */
 164   0,                             /* KetRmin                                */
 165   0,                             /* KetRpos                                */
 166   0,                             /* Reverse                                */
 167   0,                             /* Assert                                 */
 168   0,                             /* Assert not                             */
 169   0,                             /* Assert behind                          */
 170   0,                             /* Assert behind not                      */
 171   0, 0,                          /* ONCE, ONCE_NC                          */
 172   0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
 173   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
 174   0, 0,                          /* CREF, NCREF                            */
 175   0, 0,                          /* RREF, NRREF                            */
 176   0,                             /* DEF                                    */
 177   0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
 178   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
 179   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
 180   0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
 181   0, 0                           /* CLOSE, SKIPZERO                        */
 182 };
 183
 184 /* This table identifies those opcodes that inspect a character. It is used to
 185 remember the fact that a character could have been inspected when the end of
 186 the subject is reached. ***NOTE*** If the start of this table is modified, the
 187 two tables that follow must also be modified. */
 188
 189 static const pcre_uint8 poptable[] = {
  /* Indexed by opcode value. A non-zero entry marks an opcode that inspects
  a subject character. internal_dfa_exec tests poptable[codevalue] when the
  end of the subject has been reached (clen == 0) and, if it is non-zero,
  sets could_continue. The order of the entries must follow the opcode
  numbering. */
 190   0,                             /* End                                    */
 191   0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
 192   1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
 193   1, 1, 1,                       /* Any, AllAny, Anybyte                   */
 194   1, 1,                          /* \P, \p                                 */
 195   1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
 196   1,                             /* \X                                     */
 197   0, 0, 0, 0, 0, 0,              /* \Z, \z, ^, ^M, $, $M                   */
 198   1,                             /* Char                                   */
 199   1,                             /* Chari                                  */
 200   1,                             /* not                                    */
 201   1,                             /* noti                                   */
 202   /* Positive single-char repeats                                          */
 203   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
 204   1, 1, 1,                       /* upto, minupto, exact                   */
 205   1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
 206   1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
 207   1, 1, 1,                       /* upto I, minupto I, exact I             */
 208   1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
 209   /* Negative single-char repeats - only for chars < 256                   */
 210   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
 211   1, 1, 1,                       /* NOT upto, minupto, exact               */
 212   1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
 213   1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
 214   1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
 215   1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
 216   /* Positive type repeats                                                 */
 217   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
 218   1, 1, 1,                       /* Type upto, minupto, exact              */
 219   1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
 220   /* Character class & ref repeats                                         */
 221   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
 222   1, 1,                          /* CRRANGE, CRMINRANGE                    */
 223   1,                             /* CLASS                                  */
 224   1,                             /* NCLASS                                 */
 225   1,                             /* XCLASS - variable length               */
 226   0,                             /* REF                                    */
 227   0,                             /* REFI                                   */
 228   0,                             /* RECURSE                                */
 229   0,                             /* CALLOUT                                */
 230   0,                             /* Alt                                    */
 231   0,                             /* Ket                                    */
 232   0,                             /* KetRmax                                */
 233   0,                             /* KetRmin                                */
 234   0,                             /* KetRpos                                */
 235   0,                             /* Reverse                                */
 236   0,                             /* Assert                                 */
 237   0,                             /* Assert not                             */
 238   0,                             /* Assert behind                          */
 239   0,                             /* Assert behind not                      */
 240   0, 0,                          /* ONCE, ONCE_NC                          */
 241   0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
 242   0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
 243   0, 0,                          /* CREF, NCREF                            */
 244   0, 0,                          /* RREF, NRREF                            */
 245   0,                             /* DEF                                    */
 246   0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
 247   0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
 248   0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
 249   0, 0, 0, 0,                    /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT    */
 250   0, 0                           /* CLOSE, SKIPZERO                        */
 251 };
 252
 253 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
 254 and \w */
 255
 256 static const pcre_uint8 toptable1[] = {
 257   0, 0, 0, 0, 0, 0,
 258   ctype_digit, ctype_digit,
 259   ctype_space, ctype_space,
 260   ctype_word,  ctype_word,
 261   0, 0                            /* OP_ANY, OP_ALLANY */
 262 };
 263
 264 static const pcre_uint8 toptable2[] = {
 265   0, 0, 0, 0, 0, 0,
 266   ctype_digit, 0,
 267   ctype_space, 0,
 268   ctype_word,  0,
 269   1, 1                            /* OP_ANY, OP_ALLANY */
 270 };
 271
 272
 273 /* Structure for holding data about a particular state, which is in effect the
 274 current data for an active path through the match tree. It must consist
 275 entirely of ints because the working vector we are passed, and which we put
 276 these structures in, is a vector of ints. */
 277
 278 typedef struct stateblock {
 279   int offset;                     /* Offset to opcode */
 280   int count;                      /* Count for repeats */
 281   int data;                       /* Some use extra data */
 282 } stateblock;
 283
 284 #define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
 285
 286
 287 #ifdef PCRE_DEBUG
 288 /*************************************************
 289 *             Print character string             *
 290 *************************************************/
 291
 292 /* Character string printing function for debugging.
 293
 294 Arguments:
 295   p            points to string
 296   length       number of bytes
 297   f            where to print
 298
 299 Returns:       nothing
 300 */
 301
 302 static void
 303 pchars(const pcre_uchar *p, int length, FILE *f)
 304 {
 305 pcre_uint32 c;
 306 while (length-- > 0)
 307   {
 308   if (isprint(c = *(p++)))
 309     fprintf(f, "%c", c);
 310   else
 311     fprintf(f, "\\x{%02x}", c);
 312   }
 313 }
 314 #endif
 315
 316
 317
 318 /*************************************************
 319 *    Execute a Regular Expression - DFA engine   *
 320 *************************************************/
 321
 322 /* This internal function applies a compiled pattern to a subject string,
 323 starting at a given point, using a DFA engine. This function is called from the
 324 external one, possibly multiple times if the pattern is not anchored. The
 325 function calls itself recursively for some kinds of subpattern.
 326
 327 Arguments:
 328   md                the match_data block with fixed information
 329   this_start_code   the opening bracket of this subexpression's code
 330   current_subject   where we currently are in the subject string
 331   start_offset      start offset in the subject string
 332   offsets           vector to contain the matching string offsets
 333   offsetcount       size of same
 334   workspace         vector of workspace
 335   wscount           size of same
 336   rlevel            function call recursion level
 337
 338 Returns:            > 0 => number of match offset pairs placed in offsets
 339                     = 0 => offsets overflowed; longest matches are present
 340                      -1 => failed to match
 341                    < -1 => some kind of unexpected problem
 342
 343 The following macros are used for adding states to the two state vectors (one
 344 for the current character, one for the following character). */
 345
 346 #define ADD_ACTIVE(x,y) \
 347   if (active_count++ < wscount) \
 348     { \
 349     next_active_state->offset = (x); \
 350     next_active_state->count  = (y); \
 351     next_active_state++; \
 352     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
 353     } \
 354   else return PCRE_ERROR_DFA_WSSIZE
 355
 356 #define ADD_ACTIVE_DATA(x,y,z) \
 357   if (active_count++ < wscount) \
 358     { \
 359     next_active_state->offset = (x); \
 360     next_active_state->count  = (y); \
 361     next_active_state->data   = (z); \
 362     next_active_state++; \
 363     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
 364     } \
 365   else return PCRE_ERROR_DFA_WSSIZE
 366
 367 #define ADD_NEW(x,y) \
 368   if (new_count++ < wscount) \
 369     { \
 370     next_new_state->offset = (x); \
 371     next_new_state->count  = (y); \
 372     next_new_state++; \
 373     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
 374     } \
 375   else return PCRE_ERROR_DFA_WSSIZE
 376
 377 #define ADD_NEW_DATA(x,y,z) \
 378   if (new_count++ < wscount) \
 379     { \
 380     next_new_state->offset = (x); \
 381     next_new_state->count  = (y); \
 382     next_new_state->data   = (z); \
 383     next_new_state++; \
 384     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
 385       (x), (y), (z), __LINE__)); \
 386     } \
 387   else return PCRE_ERROR_DFA_WSSIZE
 388
 389 /* And now, here is the code */
 390
 391 static int
 392 internal_dfa_exec(
 393   dfa_match_data *md,
 394   const pcre_uchar *this_start_code,
 395   const pcre_uchar *current_subject,
 396   int start_offset,
 397   int *offsets,
 398   int offsetcount,
 399   int *workspace,
 400   int wscount,
 401   int  rlevel)
 402 {
 403 stateblock *active_states, *new_states, *temp_states;
 404 stateblock *next_active_state, *next_new_state;
 405
 406 const pcre_uint8 *ctypes, *lcc, *fcc;
 407 const pcre_uchar *ptr;
 408 const pcre_uchar *end_code, *first_op;
 409
 410 dfa_recursion_info new_recursive;
 411
 412 int active_count, new_count, match_count;
 413
 414 /* Some fields in the md block are frequently referenced, so we load them into
 415 independent variables in the hope that this will perform better. */
 416
 417 const pcre_uchar *start_subject = md->start_subject;
 418 const pcre_uchar *end_subject = md->end_subject;
 419 const pcre_uchar *start_code = md->start_code;
 420
 421 #ifdef SUPPORT_UTF
 422 BOOL utf = (md->poptions & PCRE_UTF8) != 0;
 423 #else
 424 BOOL utf = FALSE;
 425 #endif
 426
 427 BOOL reset_could_continue = FALSE;
 428
 429 rlevel++;
 430 offsetcount &= (-2);
 431
 432 wscount -= 2;
 433 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
 434           (2 * INTS_PER_STATEBLOCK);
 435
 436 DPRINTF(("\n%.*s---------------------\n"
 437   "%.*sCall to internal_dfa_exec f=%d\n",
 438   rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
 439
 440 ctypes = md->tables + ctypes_offset;
 441 lcc = md->tables + lcc_offset;
 442 fcc = md->tables + fcc_offset;
 443
 444 match_count = PCRE_ERROR_NOMATCH;   /* A negative number */
 445
 446 active_states = (stateblock *)(workspace + 2);
 447 next_new_state = new_states = active_states + wscount;
 448 new_count = 0;
 449
 450 first_op = this_start_code + 1 + LINK_SIZE +
 451   ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
 452     *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
 453     ? IMM2_SIZE:0);
 454
 455 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
 456 the alternative states onto the list, and find out where the end is. This
 457 makes it possible to use this function recursively, when we want to stop at a
 458 matching internal ket rather than at the end.
 459
 460 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
 461 a backward assertion. In that case, we have to find out the maximum amount to
 462 move back, and set up each alternative appropriately. */
 463
 464 if (*first_op == OP_REVERSE)
 465   {
 466   int max_back = 0;
 467   int gone_back;
 468
 469   end_code = this_start_code;
 470   do
 471     {
 472     int back = GET(end_code, 2+LINK_SIZE);
 473     if (back > max_back) max_back = back;
 474     end_code += GET(end_code, 1);
 475     }
 476   while (*end_code == OP_ALT);
 477
 478   /* If we can't go back the amount required for the longest lookbehind
 479   pattern, go back as far as we can; some alternatives may still be viable. */
 480
 481 #ifdef SUPPORT_UTF
 482   /* In character mode we have to step back character by character */
 483
 484   if (utf)
 485     {
 486     for (gone_back = 0; gone_back < max_back; gone_back++)
 487       {
 488       if (current_subject <= start_subject) break;
 489       current_subject--;
 490       ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
 491       }
 492     }
 493   else
 494 #endif
 495
 496   /* In byte-mode we can do this quickly. */
 497
 498     {
 499     gone_back = (current_subject - max_back < start_subject)?
 500       (int)(current_subject - start_subject) : max_back;
 501     current_subject -= gone_back;
 502     }
 503
 504   /* Save the earliest consulted character */
 505
 506   if (current_subject < md->start_used_ptr)
 507     md->start_used_ptr = current_subject;
 508
 509   /* Now we can process the individual branches. */
 510
 511   end_code = this_start_code;
 512   do
 513     {
 514     int back = GET(end_code, 2+LINK_SIZE);
 515     if (back <= gone_back)
 516       {
 517       int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
 518       ADD_NEW_DATA(-bstate, 0, gone_back - back);
 519       }
 520     end_code += GET(end_code, 1);
 521     }
 522   while (*end_code == OP_ALT);
 523  }
 524
 525 /* This is the code for a "normal" subpattern (not a backward assertion). The
 526 start of a whole pattern is always one of these. If we are at the top level,
 527 we may be asked to restart matching from the same point that we reached for a
 528 previous partial match. We still have to scan through the top-level branches to
 529 find the end state. */
 530
 531 else
 532   {
 533   end_code = this_start_code;
 534
 535   /* Restarting */
 536
 537   if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
 538     {
 539     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
 540     new_count = workspace[1];
 541     if (!workspace[0])
 542       memcpy(new_states, active_states, new_count * sizeof(stateblock));
 543     }
 544
 545   /* Not restarting */
 546
 547   else
 548     {
 549     int length = 1 + LINK_SIZE +
 550       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
 551         *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
 552         ? IMM2_SIZE:0);
 553     do
 554       {
 555       ADD_NEW((int)(end_code - start_code + length), 0);
 556       end_code += GET(end_code, 1);
 557       length = 1 + LINK_SIZE;
 558       }
 559     while (*end_code == OP_ALT);
 560     }
 561   }
 562
 563 workspace[0] = 0;    /* Bit indicating which vector is current */
 564
 565 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
 566
 567 /* Loop for scanning the subject */
 568
 569 ptr = current_subject;
 570 for (;;)
 571   {
 572   int i, j;
 573   int clen, dlen;
 574   pcre_uint32 c, d;
 575   int forced_fail = 0;
 576   BOOL partial_newline = FALSE;
 577   BOOL could_continue = reset_could_continue;
 578   reset_could_continue = FALSE;
 579
 580   /* Make the new state list into the active state list and empty the
 581   new state list. */
 582
 583   temp_states = active_states;
 584   active_states = new_states;
 585   new_states = temp_states;
 586   active_count = new_count;
 587   new_count = 0;
 588
 589   workspace[0] ^= 1;              /* Remember for the restarting feature */
 590   workspace[1] = active_count;
 591
 592 #ifdef PCRE_DEBUG
 593   printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
 594   pchars(ptr, STRLEN_UC(ptr), stdout);
 595   printf("\"\n");
 596
 597   printf("%.*sActive states: ", rlevel*2-2, SP);
 598   for (i = 0; i < active_count; i++)
 599     printf("%d/%d ", active_states[i].offset, active_states[i].count);
 600   printf("\n");
 601 #endif
 602
 603   /* Set the pointers for adding new states */
 604
 605   next_active_state = active_states + active_count;
 606   next_new_state = new_states;
 607
 608   /* Load the current character from the subject outside the loop, as many
 609   different states may want to look at it, and we assume that at least one
 610   will. */
 611
 612   if (ptr < end_subject)
 613     {
 614     clen = 1;        /* Number of data items in the character */
 615 #ifdef SUPPORT_UTF
 616     GETCHARLENTEST(c, ptr, clen);
 617 #else
 618     c = *ptr;
 619 #endif  /* SUPPORT_UTF */
 620     }
 621   else
 622     {
 623     clen = 0;        /* This indicates the end of the subject */
 624     c = NOTACHAR;    /* This value should never actually be used */
 625     }
 626
 627   /* Scan up the active states and act on each one. The result of an action
 628   may be to add more states to the currently active list (e.g. on hitting a
 629   parenthesis) or it may be to put states on the new list, for considering
 630   when we move the character pointer on. */
 631
 632   for (i = 0; i < active_count; i++)
 633     {
 634     stateblock *current_state = active_states + i;
 635     BOOL caseless = FALSE;
 636     const pcre_uchar *code;
 637     int state_offset = current_state->offset;
 638     int codevalue, rrc;
 639     unsigned int count;
 640
 641 #ifdef PCRE_DEBUG
 642     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
 643     if (clen == 0) printf("EOL\n");
 644       else if (c > 32 && c < 127) printf("'%c'\n", c);
 645         else printf("0x%02x\n", c);
 646 #endif
 647
 648     /* A negative offset is a special case meaning "hold off going to this
 649     (negated) state until the number of characters in the data field have
 650     been skipped". If the could_continue flag was passed over from a previous
 651     state, arrange for it to be passed on. */
 652
 653     if (state_offset < 0)
 654       {
 655       if (current_state->data > 0)
 656         {
 657         DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
 658         ADD_NEW_DATA(state_offset, current_state->count,
 659           current_state->data - 1);
 660         if (could_continue) reset_could_continue = TRUE;
 661         continue;
 662         }
 663       else
 664         {
 665         current_state->offset = state_offset = -state_offset;
 666         }
 667       }
 668
 669     /* Check for a duplicate state with the same count, and skip if found.
 670     See the note at the head of this module about the possibility of improving
 671     performance here. */
 672
 673     for (j = 0; j < i; j++)
 674       {
 675       if (active_states[j].offset == state_offset &&
 676           active_states[j].count == current_state->count)
 677         {
 678         DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
 679         goto NEXT_ACTIVE_STATE;
 680         }
 681       }
 682
 683     /* The state offset is the offset to the opcode */
 684
 685     code = start_code + state_offset;
 686     codevalue = *code;
 687
 688     /* If this opcode inspects a character, but we are at the end of the
 689     subject, remember the fact for use when testing for a partial match. */
 690
 691     if (clen == 0 && poptable[codevalue] != 0)
 692       could_continue = TRUE;
 693
 694     /* If this opcode is followed by an inline character, load it. It is
 695     tempting to test for the presence of a subject character here, but that
 696     is wrong, because sometimes zero repetitions of the subject are
 697     permitted.
 698
 699     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
 700     argument that is not a data character - but is always one byte long because
 701     the values are small. We have to take special action to deal with  \P, \p,
 702     \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
 703     these ones to new opcodes. */
 704
 705     if (coptable[codevalue] > 0)
 706       {
 707       dlen = 1;
 708 #ifdef SUPPORT_UTF
 709       if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
 710 #endif  /* SUPPORT_UTF */
 711       d = code[coptable[codevalue]];
 712       if (codevalue >= OP_TYPESTAR)
 713         {
 714         switch(d)
 715           {
 716           case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
 717           case OP_NOTPROP:
 718           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
 719           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
 720           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
 721           case OP_NOT_HSPACE:
 722           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
 723           case OP_NOT_VSPACE:
 724           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
 725           default: break;
 726           }
 727         }
 728       }
 729     else
 730       {
 731       dlen = 0;         /* Not strictly necessary, but compilers moan */
 732       d = NOTACHAR;     /* if these variables are not set. */
 733       }
 734
 735
 736     /* Now process the individual opcodes */
 737
 738     switch (codevalue)
 739       {
 740 /* ========================================================================== */
 741       /* These cases are never obeyed. This is a fudge that causes a compile-
 742       time error if the vectors coptable or poptable, which are indexed by
 743       opcode, are not the correct length. It seems to be the only way to do
 744       such a check at compile time, as the sizeof() operator does not work
 745       in the C preprocessor. */
 746
 747       case OP_TABLE_LENGTH:
 748       case OP_TABLE_LENGTH +
 749         ((sizeof(coptable) == OP_TABLE_LENGTH) &&
 750          (sizeof(poptable) == OP_TABLE_LENGTH)):
 751       break;
 752
 753 /* ========================================================================== */
 754       /* Reached a closing bracket. If not at the end of the pattern, carry
 755       on with the next opcode. For repeating opcodes, also add the repeat
 756       state. Note that KETRPOS will always be encountered at the end of the
 757       subpattern, because the possessive subpattern repeats are always handled
 758       using recursive calls. Thus, it never adds any new states.
 759
 760       At the end of the (sub)pattern, unless we have an empty string and
 761       PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
 762       start of the subject, save the match data, shifting up all previous
 763       matches so we always have the longest first. */
 764
 765       case OP_KET:
 766       case OP_KETRMIN:
 767       case OP_KETRMAX:
 768       case OP_KETRPOS:
 769       if (code != end_code)
 770         {
 771         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
 772         if (codevalue != OP_KET)
 773           {
 774           ADD_ACTIVE(state_offset - GET(code, 1), 0);
 775           }
 776         }
 777       else
 778         {
 779         if (ptr > current_subject ||
 780             ((md->moptions & PCRE_NOTEMPTY) == 0 &&
 781               ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
 782                 current_subject > start_subject + md->start_offset)))
 783           {
 784           if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
 785             else if (match_count > 0 && ++match_count * 2 > offsetcount)
 786               match_count = 0;
 787           count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
 788           if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
 789           if (offsetcount >= 2)
 790             {
 791             offsets[0] = (int)(current_subject - start_subject);
 792             offsets[1] = (int)(ptr - start_subject);
 793             DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
 794               offsets[1] - offsets[0], (char *)current_subject));
 795             }
 796           if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
 797             {
 798             DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
 799               "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
 800               match_count, rlevel*2-2, SP));
 801             return match_count;
 802             }
 803           }
 804         }
 805       break;
 806
 807 /* ========================================================================== */
 808       /* These opcodes add to the current list of states without looking
 809       at the current character. */
 810
 811       /*-----------------------------------------------------------------*/
 812       case OP_ALT:
 813       do { code += GET(code, 1); } while (*code == OP_ALT);
 814       ADD_ACTIVE((int)(code - start_code), 0);
 815       break;
 816
 817       /*-----------------------------------------------------------------*/
 818       case OP_BRA:
 819       case OP_SBRA:
 820       do
 821         {
 822         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
 823         code += GET(code, 1);
 824         }
 825       while (*code == OP_ALT);
 826       break;
 827
 828       /*-----------------------------------------------------------------*/
 829       case OP_CBRA:
 830       case OP_SCBRA:
 831       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE),  0);
 832       code += GET(code, 1);
 833       while (*code == OP_ALT)
 834         {
 835         ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
 836         code += GET(code, 1);
 837         }
 838       break;
 839
 840       /*-----------------------------------------------------------------*/
 841       case OP_BRAZERO:
 842       case OP_BRAMINZERO:
 843       ADD_ACTIVE(state_offset + 1, 0);
 844       code += 1 + GET(code, 2);
 845       while (*code == OP_ALT) code += GET(code, 1);
 846       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
 847       break;
 848
 849       /*-----------------------------------------------------------------*/
 850       case OP_SKIPZERO:
 851       code += 1 + GET(code, 2);
 852       while (*code == OP_ALT) code += GET(code, 1);
 853       ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
 854       break;
 855
 856       /*-----------------------------------------------------------------*/
 857       case OP_CIRC:
 858       if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
 859         { ADD_ACTIVE(state_offset + 1, 0); }
 860       break;
 861
 862       /*-----------------------------------------------------------------*/
 863       case OP_CIRCM:
 864       if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
 865           (ptr != end_subject && WAS_NEWLINE(ptr)))
 866         { ADD_ACTIVE(state_offset + 1, 0); }
 867       break;
 868
 869       /*-----------------------------------------------------------------*/
 870       case OP_EOD:
 871       if (ptr >= end_subject)
 872         {
 873         if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
 874           could_continue = TRUE;
 875         else { ADD_ACTIVE(state_offset + 1, 0); }
 876         }
 877       break;
 878
 879       /*-----------------------------------------------------------------*/
 880       case OP_SOD:
 881       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
 882       break;
 883
 884       /*-----------------------------------------------------------------*/
 885       case OP_SOM:
 886       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
 887       break;
 888
 889
 890 /* ========================================================================== */
 891       /* These opcodes inspect the next subject character, and sometimes
 892       the previous one as well, but do not have an argument. The variable
 893       clen contains the length of the current character and is zero if we are
 894       at the end of the subject. */
 895
 896       /*-----------------------------------------------------------------*/
 897       case OP_ANY:
 898       if (clen > 0 && !IS_NEWLINE(ptr))
 899         {
 900         if (ptr + 1 >= md->end_subject &&
 901             (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
 902             NLBLOCK->nltype == NLTYPE_FIXED &&
 903             NLBLOCK->nllen == 2 &&
 904             c == NLBLOCK->nl[0])
 905           {
 906           could_continue = partial_newline = TRUE;
 907           }
 908         else
 909           {
 910           ADD_NEW(state_offset + 1, 0);
 911           }
 912         }
 913       break;
 914
 915       /*-----------------------------------------------------------------*/
 916       case OP_ALLANY:
 917       if (clen > 0)
 918         { ADD_NEW(state_offset + 1, 0); }
 919       break;
 920
 921       /*-----------------------------------------------------------------*/
 922       case OP_EODN:
 923       if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
 924         could_continue = TRUE;
 925       else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
 926         { ADD_ACTIVE(state_offset + 1, 0); }
 927       break;
 928
 929       /*-----------------------------------------------------------------*/
 930       case OP_DOLL:
 931       if ((md->moptions & PCRE_NOTEOL) == 0)
 932         {
 933         if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
 934           could_continue = TRUE;
 935         else if (clen == 0 ||
 936             ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
 937                (ptr == end_subject - md->nllen)
 938             ))
 939           { ADD_ACTIVE(state_offset + 1, 0); }
 940         else if (ptr + 1 >= md->end_subject &&
 941                  (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
 942                  NLBLOCK->nltype == NLTYPE_FIXED &&
 943                  NLBLOCK->nllen == 2 &&
 944                  c == NLBLOCK->nl[0])
 945           {
 946           if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
 947             {
 948             reset_could_continue = TRUE;
 949             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
 950             }
 951           else could_continue = partial_newline = TRUE;
 952           }
 953         }
 954       break;
 955
 956       /*-----------------------------------------------------------------*/
 957       case OP_DOLLM:
 958       if ((md->moptions & PCRE_NOTEOL) == 0)
 959         {
 960         if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
 961           could_continue = TRUE;
 962         else if (clen == 0 ||
 963             ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
 964           { ADD_ACTIVE(state_offset + 1, 0); }
 965         else if (ptr + 1 >= md->end_subject &&
 966                  (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
 967                  NLBLOCK->nltype == NLTYPE_FIXED &&
 968                  NLBLOCK->nllen == 2 &&
 969                  c == NLBLOCK->nl[0])
 970           {
 971           if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
 972             {
 973             reset_could_continue = TRUE;
 974             ADD_NEW_DATA(-(state_offset + 1), 0, 1);
 975             }
 976           else could_continue = partial_newline = TRUE;
 977           }
 978         }
 979       else if (IS_NEWLINE(ptr))
 980         { ADD_ACTIVE(state_offset + 1, 0); }
 981       break;
 982
 983       /*-----------------------------------------------------------------*/
 984
 985       case OP_DIGIT:
 986       case OP_WHITESPACE:
 987       case OP_WORDCHAR:
 988       if (clen > 0 && c < 256 &&
 989             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
 990         { ADD_NEW(state_offset + 1, 0); }
 991       break;
 992
 993       /*-----------------------------------------------------------------*/
 994       case OP_NOT_DIGIT:
 995       case OP_NOT_WHITESPACE:
 996       case OP_NOT_WORDCHAR:
 997       if (clen > 0 && (c >= 256 ||
 998             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
 999         { ADD_NEW(state_offset + 1, 0); }
1000       break;
1001
1002       /*-----------------------------------------------------------------*/
1003       case OP_WORD_BOUNDARY:
1004       case OP_NOT_WORD_BOUNDARY:
1005         {
1006         int left_word, right_word;
1007
1008         if (ptr > start_subject)
1009           {
1010           const pcre_uchar *temp = ptr - 1;
1011           if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1012 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1013           if (utf) { BACKCHAR(temp); }
1014 #endif
1015           GETCHARTEST(d, temp);
1016 #ifdef SUPPORT_UCP
1017           if ((md->poptions & PCRE_UCP) != 0)
1018             {
1019             if (d == '_') left_word = TRUE; else
1020               {
1021               int cat = UCD_CATEGORY(d);
1022               left_word = (cat == ucp_L || cat == ucp_N);
1023               }
1024             }
1025           else
1026 #endif
1027           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1028           }
1029         else left_word = FALSE;
1030
1031         if (clen > 0)
1032           {
1033 #ifdef SUPPORT_UCP
1034           if ((md->poptions & PCRE_UCP) != 0)
1035             {
1036             if (c == '_') right_word = TRUE; else
1037               {
1038               int cat = UCD_CATEGORY(c);
1039               right_word = (cat == ucp_L || cat == ucp_N);
1040               }
1041             }
1042           else
1043 #endif
1044           right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1045           }
1046         else right_word = FALSE;
1047
1048         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1049           { ADD_ACTIVE(state_offset + 1, 0); }
1050         }
1051       break;
1052
1053
1054       /*-----------------------------------------------------------------*/
1055       /* Check the next character by Unicode property. We will get here only
1056       if the support is in the binary; otherwise a compile-time error occurs.
1057       */
1058
1059 #ifdef SUPPORT_UCP
1060       case OP_PROP:
1061       case OP_NOTPROP:
1062       if (clen > 0)
1063         {
1064         BOOL OK;
1065         const pcre_uint32 *cp;
1066         const ucd_record * prop = GET_UCD(c);
1067         switch(code[1])
1068           {
1069           case PT_ANY:
1070           OK = TRUE;
1071           break;
1072
1073           case PT_LAMP:
1074           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1075                prop->chartype == ucp_Lt;
1076           break;
1077
1078           case PT_GC:
1079           OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1080           break;
1081
1082           case PT_PC:
1083           OK = prop->chartype == code[2];
1084           break;
1085
1086           case PT_SC:
1087           OK = prop->script == code[2];
1088           break;
1089
1090           /* These are specials for combination cases. */
1091
1092           case PT_ALNUM:
1093           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1094                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1095           break;
1096
1097           case PT_SPACE:    /* Perl space */
1098           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1099                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1100           break;
1101
1102           case PT_PXSPACE:  /* POSIX space */
1103           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1104                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1105                c == CHAR_FF || c == CHAR_CR;
1106           break;
1107
1108           case PT_WORD:
1109           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1110                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1111                c == CHAR_UNDERSCORE;
1112           break;
1113
1114           case PT_CLIST:
1115           cp = PRIV(ucd_caseless_sets) + code[2];
1116           for (;;)
1117             {
1118             if (c < *cp) { OK = FALSE; break; }
1119             if (c == *cp++) { OK = TRUE; break; }
1120             }
1121           break;
1122
1123           /* Should never occur, but keep compilers from grumbling. */
1124
1125           default:
1126           OK = codevalue != OP_PROP;
1127           break;
1128           }
1129
1130         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1131         }
1132       break;
1133 #endif
1134
1135
1136
1137 /* ========================================================================== */
1138       /* These opcodes likewise inspect the subject character, but have an
1139       argument that is not a data character. It is one of these opcodes:
1140       OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1141       OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1142
1143       case OP_TYPEPLUS:
1144       case OP_TYPEMINPLUS:
1145       case OP_TYPEPOSPLUS:
1146       count = current_state->count;  /* Already matched */
1147       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1148       if (clen > 0)
1149         {
1150         if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1151             (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1152             NLBLOCK->nltype == NLTYPE_FIXED &&
1153             NLBLOCK->nllen == 2 &&
1154             c == NLBLOCK->nl[0])
1155           {
1156           could_continue = partial_newline = TRUE;
1157           }
1158         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1159             (c < 256 &&
1160               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1161               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1162           {
1163           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1164             {
1165             active_count--;            /* Remove non-match possibility */
1166             next_active_state--;
1167             }
1168           count++;
1169           ADD_NEW(state_offset, count);
1170           }
1171         }
1172       break;
1173
1174       /*-----------------------------------------------------------------*/
1175       case OP_TYPEQUERY:
1176       case OP_TYPEMINQUERY:
1177       case OP_TYPEPOSQUERY:
1178       ADD_ACTIVE(state_offset + 2, 0);
1179       if (clen > 0)
1180         {
1181         if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1182             (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1183             NLBLOCK->nltype == NLTYPE_FIXED &&
1184             NLBLOCK->nllen == 2 &&
1185             c == NLBLOCK->nl[0])
1186           {
1187           could_continue = partial_newline = TRUE;
1188           }
1189         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1190             (c < 256 &&
1191               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1192               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1193           {
1194           if (codevalue == OP_TYPEPOSQUERY)
1195             {
1196             active_count--;            /* Remove non-match possibility */
1197             next_active_state--;
1198             }
1199           ADD_NEW(state_offset + 2, 0);
1200           }
1201         }
1202       break;
1203
1204       /*-----------------------------------------------------------------*/
1205       case OP_TYPESTAR:
1206       case OP_TYPEMINSTAR:
1207       case OP_TYPEPOSSTAR:
1208       ADD_ACTIVE(state_offset + 2, 0);
1209       if (clen > 0)
1210         {
1211         if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1212             (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1213             NLBLOCK->nltype == NLTYPE_FIXED &&
1214             NLBLOCK->nllen == 2 &&
1215             c == NLBLOCK->nl[0])
1216           {
1217           could_continue = partial_newline = TRUE;
1218           }
1219         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1220             (c < 256 &&
1221               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1222               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1223           {
1224           if (codevalue == OP_TYPEPOSSTAR)
1225             {
1226             active_count--;            /* Remove non-match possibility */
1227             next_active_state--;
1228             }
1229           ADD_NEW(state_offset, 0);
1230           }
1231         }
1232       break;
1233
1234       /*-----------------------------------------------------------------*/
1235       case OP_TYPEEXACT:
1236       count = current_state->count;  /* Number already matched */
1237       if (clen > 0)
1238         {
1239         if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1240             (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1241             NLBLOCK->nltype == NLTYPE_FIXED &&
1242             NLBLOCK->nllen == 2 &&
1243             c == NLBLOCK->nl[0])
1244           {
1245           could_continue = partial_newline = TRUE;
1246           }
1247         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1248             (c < 256 &&
1249               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1250               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1251           {
1252           if (++count >= GET2(code, 1))
1253             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1254           else
1255             { ADD_NEW(state_offset, count); }
1256           }
1257         }
1258       break;
1259
1260       /*-----------------------------------------------------------------*/
1261       case OP_TYPEUPTO:
1262       case OP_TYPEMINUPTO:
1263       case OP_TYPEPOSUPTO:
1264       ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1265       count = current_state->count;  /* Number already matched */
1266       if (clen > 0)
1267         {
1268         if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1269             (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1270             NLBLOCK->nltype == NLTYPE_FIXED &&
1271             NLBLOCK->nllen == 2 &&
1272             c == NLBLOCK->nl[0])
1273           {
1274           could_continue = partial_newline = TRUE;
1275           }
1276         else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1277             (c < 256 &&
1278               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1279               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1280           {
1281           if (codevalue == OP_TYPEPOSUPTO)
1282             {
1283             active_count--;           /* Remove non-match possibility */
1284             next_active_state--;
1285             }
1286           if (++count >= GET2(code, 1))
1287             { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1288           else
1289             { ADD_NEW(state_offset, count); }
1290           }
1291         }
1292       break;
1293
1294 /* ========================================================================== */
1295       /* These are virtual opcodes that are used when something like
1296       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1297       argument. It keeps the code above fast for the other cases. The argument
1298       is in the d variable. */
1299
1300 #ifdef SUPPORT_UCP
1301       case OP_PROP_EXTRA + OP_TYPEPLUS:
1302       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1303       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1304       count = current_state->count;           /* Already matched */
1305       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1306       if (clen > 0)
1307         {
1308         BOOL OK;
1309         const pcre_uint32 *cp;
1310         const ucd_record * prop = GET_UCD(c);
1311         switch(code[2])
1312           {
1313           case PT_ANY:
1314           OK = TRUE;
1315           break;
1316
1317           case PT_LAMP:
1318           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1319             prop->chartype == ucp_Lt;
1320           break;
1321
1322           case PT_GC:
1323           OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1324           break;
1325
1326           case PT_PC:
1327           OK = prop->chartype == code[3];
1328           break;
1329
1330           case PT_SC:
1331           OK = prop->script == code[3];
1332           break;
1333
1334           /* These are specials for combination cases. */
1335
1336           case PT_ALNUM:
1337           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1338                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1339           break;
1340
1341           case PT_SPACE:    /* Perl space */
1342           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1343                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1344           break;
1345
1346           case PT_PXSPACE:  /* POSIX space */
1347           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1348                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1349                c == CHAR_FF || c == CHAR_CR;
1350           break;
1351
1352           case PT_WORD:
1353           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1354                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1355                c == CHAR_UNDERSCORE;
1356           break;
1357
1358           case PT_CLIST:
1359           cp = PRIV(ucd_caseless_sets) + code[3];
1360           for (;;)
1361             {
1362             if (c < *cp) { OK = FALSE; break; }
1363             if (c == *cp++) { OK = TRUE; break; }
1364             }
1365           break;
1366
1367           /* Should never occur, but keep compilers from grumbling. */
1368
1369           default:
1370           OK = codevalue != OP_PROP;
1371           break;
1372           }
1373
1374         if (OK == (d == OP_PROP))
1375           {
1376           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1377             {
1378             active_count--;           /* Remove non-match possibility */
1379             next_active_state--;
1380             }
1381           count++;
1382           ADD_NEW(state_offset, count);
1383           }
1384         }
1385       break;
1386
1387       /*-----------------------------------------------------------------*/
1388       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1389       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1390       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1391       count = current_state->count;  /* Already matched */
1392       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1393       if (clen > 0)
1394         {
1395         int lgb, rgb;
1396         const pcre_uchar *nptr = ptr + clen;
1397         int ncount = 0;
1398         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1399           {
1400           active_count--;           /* Remove non-match possibility */
1401           next_active_state--;
1402           }
1403         lgb = UCD_GRAPHBREAK(c);
1404         while (nptr < end_subject)
1405           {
1406           dlen = 1;
1407           if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1408           rgb = UCD_GRAPHBREAK(d);
1409           if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1410           ncount++;
1411           lgb = rgb;
1412           nptr += dlen;
1413           }
1414         count++;
1415         ADD_NEW_DATA(-state_offset, count, ncount);
1416         }
1417       break;
1418 #endif
1419
1420       /*-----------------------------------------------------------------*/
1421       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1422       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1423       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1424       count = current_state->count;  /* Already matched */
1425       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1426       if (clen > 0)
1427         {
1428         int ncount = 0;
1429         switch (c)
1430           {
1431           case CHAR_VT:
1432           case CHAR_FF:
1433           case CHAR_NEL:
1434 #ifndef EBCDIC
1435           case 0x2028:
1436           case 0x2029:
1437 #endif  /* Not EBCDIC */
1438           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1439           goto ANYNL01;
1440
1441           case CHAR_CR:
1442           if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1443           /* Fall through */
1444
1445           ANYNL01:
1446           case CHAR_LF:
1447           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1448             {
1449             active_count--;           /* Remove non-match possibility */
1450             next_active_state--;
1451             }
1452           count++;
1453           ADD_NEW_DATA(-state_offset, count, ncount);
1454           break;
1455
1456           default:
1457           break;
1458           }
1459         }
1460       break;
1461
1462       /*-----------------------------------------------------------------*/
1463       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1464       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1465       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1466       count = current_state->count;  /* Already matched */
1467       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1468       if (clen > 0)
1469         {
1470         BOOL OK;
1471         switch (c)
1472           {
1473           VSPACE_CASES:
1474           OK = TRUE;
1475           break;
1476
1477           default:
1478           OK = FALSE;
1479           break;
1480           }
1481
1482         if (OK == (d == OP_VSPACE))
1483           {
1484           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1485             {
1486             active_count--;           /* Remove non-match possibility */
1487             next_active_state--;
1488             }
1489           count++;
1490           ADD_NEW_DATA(-state_offset, count, 0);
1491           }
1492         }
1493       break;
1494
1495       /*-----------------------------------------------------------------*/
1496       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1497       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1498       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1499       count = current_state->count;  /* Already matched */
1500       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1501       if (clen > 0)
1502         {
1503         BOOL OK;
1504         switch (c)
1505           {
1506           HSPACE_CASES:
1507           OK = TRUE;
1508           break;
1509
1510           default:
1511           OK = FALSE;
1512           break;
1513           }
1514
1515         if (OK == (d == OP_HSPACE))
1516           {
1517           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1518             {
1519             active_count--;           /* Remove non-match possibility */
1520             next_active_state--;
1521             }
1522           count++;
1523           ADD_NEW_DATA(-state_offset, count, 0);
1524           }
1525         }
1526       break;
1527
1528       /*-----------------------------------------------------------------*/
1529 #ifdef SUPPORT_UCP
1530       case OP_PROP_EXTRA + OP_TYPEQUERY:
1531       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1532       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1533       count = 4;
1534       goto QS1;
1535
1536       case OP_PROP_EXTRA + OP_TYPESTAR:
1537       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1538       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1539       count = 0;
1540
1541       QS1:
1542
1543       ADD_ACTIVE(state_offset + 4, 0);
1544       if (clen > 0)
1545         {
1546         BOOL OK;
1547         const pcre_uint32 *cp;
1548         const ucd_record * prop = GET_UCD(c);
1549         switch(code[2])
1550           {
1551           case PT_ANY:
1552           OK = TRUE;
1553           break;
1554
1555           case PT_LAMP:
1556           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1557             prop->chartype == ucp_Lt;
1558           break;
1559
1560           case PT_GC:
1561           OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1562           break;
1563
1564           case PT_PC:
1565           OK = prop->chartype == code[3];
1566           break;
1567
1568           case PT_SC:
1569           OK = prop->script == code[3];
1570           break;
1571
1572           /* These are specials for combination cases. */
1573
1574           case PT_ALNUM:
1575           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1576                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1577           break;
1578
1579           case PT_SPACE:    /* Perl space */
1580           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1581                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1582           break;
1583
1584           case PT_PXSPACE:  /* POSIX space */
1585           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1586                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1587                c == CHAR_FF || c == CHAR_CR;
1588           break;
1589
1590           case PT_WORD:
1591           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1592                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1593                c == CHAR_UNDERSCORE;
1594           break;
1595
1596           case PT_CLIST:
1597           cp = PRIV(ucd_caseless_sets) + code[3];
1598           for (;;)
1599             {
1600             if (c < *cp) { OK = FALSE; break; }
1601             if (c == *cp++) { OK = TRUE; break; }
1602             }
1603           break;
1604
1605           /* Should never occur, but keep compilers from grumbling. */
1606
1607           default:
1608           OK = codevalue != OP_PROP;
1609           break;
1610           }
1611
1612         if (OK == (d == OP_PROP))
1613           {
1614           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1615               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1616             {
1617             active_count--;           /* Remove non-match possibility */
1618             next_active_state--;
1619             }
1620           ADD_NEW(state_offset + count, 0);
1621           }
1622         }
1623       break;
1624
1625       /*-----------------------------------------------------------------*/
1626       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1627       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1628       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1629       count = 2;
1630       goto QS2;
1631
1632       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1633       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1634       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1635       count = 0;
1636
1637       QS2:
1638
1639       ADD_ACTIVE(state_offset + 2, 0);
1640       if (clen > 0)
1641         {
1642         int lgb, rgb;
1643         const pcre_uchar *nptr = ptr + clen;
1644         int ncount = 0;
1645         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1646             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1647           {
1648           active_count--;           /* Remove non-match possibility */
1649           next_active_state--;
1650           }
1651         lgb = UCD_GRAPHBREAK(c);
1652         while (nptr < end_subject)
1653           {
1654           dlen = 1;
1655           if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1656           rgb = UCD_GRAPHBREAK(d);
1657           if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1658           ncount++;
1659           lgb = rgb;
1660           nptr += dlen;
1661           }
1662         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1663         }
1664       break;
1665 #endif
1666
1667       /*-----------------------------------------------------------------*/
1668       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1669       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1670       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1671       count = 2;
1672       goto QS3;
1673
1674       case OP_ANYNL_EXTRA + OP_TYPESTAR:
1675       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1676       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1677       count = 0;
1678
1679       QS3:
1680       ADD_ACTIVE(state_offset + 2, 0);
1681       if (clen > 0)
1682         {
1683         int ncount = 0;
1684         switch (c)
1685           {
1686           case CHAR_VT:
1687           case CHAR_FF:
1688           case CHAR_NEL:
1689 #ifndef EBCDIC
1690           case 0x2028:
1691           case 0x2029:
1692 #endif  /* Not EBCDIC */
1693           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1694           goto ANYNL02;
1695
1696           case CHAR_CR:
1697           if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1698           /* Fall through */
1699
1700           ANYNL02:
1701           case CHAR_LF:
1702           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1703               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1704             {
1705             active_count--;           /* Remove non-match possibility */
1706             next_active_state--;
1707             }
1708           ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1709           break;
1710
1711           default:
1712           break;
1713           }
1714         }
1715       break;
1716
1717       /*-----------------------------------------------------------------*/
1718       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1719       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1720       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1721       count = 2;
1722       goto QS4;
1723
1724       case OP_VSPACE_EXTRA + OP_TYPESTAR:
1725       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1726       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1727       count = 0;
1728
1729       QS4:
1730       ADD_ACTIVE(state_offset + 2, 0);
1731       if (clen > 0)
1732         {
1733         BOOL OK;
1734         switch (c)
1735           {
1736           VSPACE_CASES:
1737           OK = TRUE;
1738           break;
1739
1740           default:
1741           OK = FALSE;
1742           break;
1743           }
1744         if (OK == (d == OP_VSPACE))
1745           {
1746           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1747               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1748             {
1749             active_count--;           /* Remove non-match possibility */
1750             next_active_state--;
1751             }
1752           ADD_NEW_DATA(-(state_offset + count), 0, 0);
1753           }
1754         }
1755       break;
1756
1757       /*-----------------------------------------------------------------*/
1758       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1759       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1760       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1761       count = 2;
1762       goto QS5;
1763
1764       case OP_HSPACE_EXTRA + OP_TYPESTAR:
1765       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1766       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1767       count = 0;
1768
1769       QS5:
1770       ADD_ACTIVE(state_offset + 2, 0);
1771       if (clen > 0)
1772         {
1773         BOOL OK;
1774         switch (c)
1775           {
1776           HSPACE_CASES:
1777           OK = TRUE;
1778           break;
1779
1780           default:
1781           OK = FALSE;
1782           break;
1783           }
1784
1785         if (OK == (d == OP_HSPACE))
1786           {
1787           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1788               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1789             {
1790             active_count--;           /* Remove non-match possibility */
1791             next_active_state--;
1792             }
1793           ADD_NEW_DATA(-(state_offset + count), 0, 0);
1794           }
1795         }
1796       break;
1797
1798       /*-----------------------------------------------------------------*/
1799 #ifdef SUPPORT_UCP
1800       case OP_PROP_EXTRA + OP_TYPEEXACT:
1801       case OP_PROP_EXTRA + OP_TYPEUPTO:
1802       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1803       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1804       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1805         { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1806       count = current_state->count;  /* Number already matched */
1807       if (clen > 0)
1808         {
1809         BOOL OK;
1810         const pcre_uint32 *cp;
1811         const ucd_record * prop = GET_UCD(c);
1812         switch(code[1 + IMM2_SIZE + 1])
1813           {
1814           case PT_ANY:
1815           OK = TRUE;
1816           break;
1817
1818           case PT_LAMP:
1819           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1820             prop->chartype == ucp_Lt;
1821           break;
1822
1823           case PT_GC:
1824           OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1825           break;
1826
1827           case PT_PC:
1828           OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1829           break;
1830
1831           case PT_SC:
1832           OK = prop->script == code[1 + IMM2_SIZE + 2];
1833           break;
1834
1835           /* These are specials for combination cases. */
1836
1837           case PT_ALNUM:
1838           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1839                PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1840           break;
1841
1842           case PT_SPACE:    /* Perl space */
1843           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1844                c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1845           break;
1846
1847           case PT_PXSPACE:  /* POSIX space */
1848           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1849                c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1850                c == CHAR_FF || c == CHAR_CR;
1851           break;
1852
1853           case PT_WORD:
1854           OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1855                PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1856                c == CHAR_UNDERSCORE;
1857           break;
1858
1859           case PT_CLIST:
1860           cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1861           for (;;)
1862             {
1863             if (c < *cp) { OK = FALSE; break; }
1864             if (c == *cp++) { OK = TRUE; break; }
1865             }
1866           break;
1867
1868           /* Should never occur, but keep compilers from grumbling. */
1869
1870           default:
1871           OK = codevalue != OP_PROP;
1872           break;
1873           }
1874
1875         if (OK == (d == OP_PROP))
1876           {
1877           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1878             {
1879             active_count--;           /* Remove non-match possibility */
1880             next_active_state--;
1881             }
1882           if (++count >= GET2(code, 1))
1883             { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1884           else
1885             { ADD_NEW(state_offset, count); }
1886           }
1887         }
1888       break;
1889
1890       /*-----------------------------------------------------------------*/
1891       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1892       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1893       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1894       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1895       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1896         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1897       count = current_state->count;  /* Number already matched */
1898       if (clen > 0)
1899         {
1900         int lgb, rgb;
1901         const pcre_uchar *nptr = ptr + clen;
1902         int ncount = 0;
1903         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1904           {
1905           active_count--;           /* Remove non-match possibility */
1906           next_active_state--;
1907           }
1908         lgb = UCD_GRAPHBREAK(c);
1909         while (nptr < end_subject)
1910           {
1911           dlen = 1;
1912           if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1913           rgb = UCD_GRAPHBREAK(d);
1914           if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1915           ncount++;
1916           lgb = rgb;
1917           nptr += dlen;
1918           }
1919         if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1920             reset_could_continue = TRUE;
1921         if (++count >= GET2(code, 1))
1922           { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1923         else
1924           { ADD_NEW_DATA(-state_offset, count, ncount); }
1925         }
1926       break;
1927 #endif
1928
1929       /*-----------------------------------------------------------------*/
1930       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1931       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1932       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1933       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1934       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1935         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1936       count = current_state->count;  /* Number already matched */
1937       if (clen > 0)
1938         {
1939         int ncount = 0;
1940         switch (c)
1941           {
1942           case CHAR_VT:
1943           case CHAR_FF:
1944           case CHAR_NEL:
1945 #ifndef EBCDIC
1946           case 0x2028:
1947           case 0x2029:
1948 #endif  /* Not EBCDIC */
1949           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1950           goto ANYNL03;
1951
1952           case CHAR_CR:
1953           if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1954           /* Fall through */
1955
1956           ANYNL03:
1957           case CHAR_LF:
1958           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1959             {
1960             active_count--;           /* Remove non-match possibility */
1961             next_active_state--;
1962             }
1963           if (++count >= GET2(code, 1))
1964             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1965           else
1966             { ADD_NEW_DATA(-state_offset, count, ncount); }
1967           break;
1968
1969           default:
1970           break;
1971           }
1972         }
1973       break;
1974
1975       /*-----------------------------------------------------------------*/
1976       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1977       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1978       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1979       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1980       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1981         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1982       count = current_state->count;  /* Number already matched */
1983       if (clen > 0)
1984         {
1985         BOOL OK;
1986         switch (c)
1987           {
1988           VSPACE_CASES:
1989           OK = TRUE;
1990           break;
1991
1992           default:
1993           OK = FALSE;
1994           }
1995
1996         if (OK == (d == OP_VSPACE))
1997           {
1998           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1999             {
2000             active_count--;           /* Remove non-match possibility */
2001             next_active_state--;
2002             }
2003           if (++count >= GET2(code, 1))
2004             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2005           else
2006             { ADD_NEW_DATA(-state_offset, count, 0); }
2007           }
2008         }
2009       break;
2010
2011       /*-----------------------------------------------------------------*/
2012       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2013       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2014       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2015       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2016       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2017         { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2018       count = current_state->count;  /* Number already matched */
2019       if (clen > 0)
2020         {
2021         BOOL OK;
2022         switch (c)
2023           {
2024           HSPACE_CASES:
2025           OK = TRUE;
2026           break;
2027
2028           default:
2029           OK = FALSE;
2030           break;
2031           }
2032
2033         if (OK == (d == OP_HSPACE))
2034           {
2035           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2036             {
2037             active_count--;           /* Remove non-match possibility */
2038             next_active_state--;
2039             }
2040           if (++count >= GET2(code, 1))
2041             { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2042           else
2043             { ADD_NEW_DATA(-state_offset, count, 0); }
2044           }
2045         }
2046       break;
2047
2048 /* ========================================================================== */
2049       /* These opcodes are followed by a character that is usually compared
2050       to the current subject character; it is loaded into d. We still get
2051       here even if there is no subject character, because in some cases zero
2052       repetitions are permitted. */
2053
2054       /*-----------------------------------------------------------------*/
2055       case OP_CHAR:
2056       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2057       break;
2058
2059       /*-----------------------------------------------------------------*/
2060       case OP_CHARI:
2061       if (clen == 0) break;
2062
2063 #ifdef SUPPORT_UTF
2064       if (utf)
2065         {
2066         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2067           {
2068           unsigned int othercase;
2069           if (c < 128)
2070             othercase = fcc[c];
2071           else
2072             /* If we have Unicode property support, we can use it to test the
2073             other case of the character. */
2074 #ifdef SUPPORT_UCP
2075             othercase = UCD_OTHERCASE(c);
2076 #else
2077             othercase = NOTACHAR;
2078 #endif
2079
2080           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2081           }
2082         }
2083       else
2084 #endif  /* SUPPORT_UTF */
2085       /* Not UTF mode */
2086         {
2087         if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2088           { ADD_NEW(state_offset + 2, 0); }
2089         }
2090       break;
2091
2092
2093 #ifdef SUPPORT_UCP
2094       /*-----------------------------------------------------------------*/
2095       /* This is a tricky one because it can match more than one character.
2096       Find out how many characters to skip, and then set up a negative state
2097       to wait for them to pass before continuing. */
2098
2099       case OP_EXTUNI:
2100       if (clen > 0)
2101         {
2102         int lgb, rgb;
2103         const pcre_uchar *nptr = ptr + clen;
2104         int ncount = 0;
2105         lgb = UCD_GRAPHBREAK(c);
2106         while (nptr < end_subject)
2107           {
2108           dlen = 1;
2109           if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2110           rgb = UCD_GRAPHBREAK(d);
2111           if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2112           ncount++;
2113           lgb = rgb;
2114           nptr += dlen;
2115           }
2116         if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2117             reset_could_continue = TRUE;
2118         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2119         }
2120       break;
2121 #endif
2122
2123       /*-----------------------------------------------------------------*/
2124       /* This is tricky, like EXTUNI, because it too can match more than one
2125       character (when CR is followed by LF). In this case, set up a negative
2126       state to wait for one character to pass before continuing. */
2127
2128       case OP_ANYNL:
2129       if (clen > 0) switch(c)
2130         {
2131         case CHAR_VT:
2132         case CHAR_FF:
2133         case CHAR_NEL:
2134 #ifndef EBCDIC
2135         case 0x2028:
2136         case 0x2029:
2137 #endif  /* Not EBCDIC */
2138         if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2139
2140         case CHAR_LF:
2141         ADD_NEW(state_offset + 1, 0);
2142         break;
2143
2144         case CHAR_CR:
2145         if (ptr + 1 >= end_subject)
2146           {
2147           ADD_NEW(state_offset + 1, 0);
2148           if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2149             reset_could_continue = TRUE;
2150           }
2151         else if (RAWUCHARTEST(ptr + 1) == CHAR_LF)
2152           {
2153           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2154           }
2155         else
2156           {
2157           ADD_NEW(state_offset + 1, 0);
2158           }
2159         break;
2160         }
2161       break;
2162
2163       /*-----------------------------------------------------------------*/
2164       case OP_NOT_VSPACE:
2165       if (clen > 0) switch(c)
2166         {
2167         VSPACE_CASES:
2168         break;
2169
2170         default:
2171         ADD_NEW(state_offset + 1, 0);
2172         break;
2173         }
2174       break;
2175
2176       /*-----------------------------------------------------------------*/
2177       case OP_VSPACE:
2178       if (clen > 0) switch(c)
2179         {
2180         VSPACE_CASES:
2181         ADD_NEW(state_offset + 1, 0);
2182         break;
2183
2184         default:
2185         break;
2186         }
2187       break;
2188
2189       /*-----------------------------------------------------------------*/
2190       case OP_NOT_HSPACE:
2191       if (clen > 0) switch(c)
2192         {
2193         HSPACE_CASES:
2194         break;
2195
2196         default:
2197         ADD_NEW(state_offset + 1, 0);
2198         break;
2199         }
2200       break;
2201
2202       /*-----------------------------------------------------------------*/
2203       case OP_HSPACE:
2204       if (clen > 0) switch(c)
2205         {
2206         HSPACE_CASES:
2207         ADD_NEW(state_offset + 1, 0);
2208         break;
2209
2210         default:
2211         break;
2212         }
2213       break;
2214
2215       /*-----------------------------------------------------------------*/
2216       /* Match a negated single character casefully. */
2217
2218       case OP_NOT:
2219       if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2220       break;
2221
2222       /*-----------------------------------------------------------------*/
2223       /* Match a negated single character caselessly. */
2224
2225       case OP_NOTI:
2226       if (clen > 0)
2227         {
2228         unsigned int otherd;
2229 #ifdef SUPPORT_UTF
2230         if (utf && d >= 128)
2231           {
2232 #ifdef SUPPORT_UCP
2233           otherd = UCD_OTHERCASE(d);
2234 #endif  /* SUPPORT_UCP */
2235           }
2236         else
2237 #endif  /* SUPPORT_UTF */
2238         otherd = TABLE_GET(d, fcc, d);
2239         if (c != d && c != otherd)
2240           { ADD_NEW(state_offset + dlen + 1, 0); }
2241         }
2242       break;
2243
2244       /*-----------------------------------------------------------------*/
2245       case OP_PLUSI:
2246       case OP_MINPLUSI:
2247       case OP_POSPLUSI:
2248       case OP_NOTPLUSI:
2249       case OP_NOTMINPLUSI:
2250       case OP_NOTPOSPLUSI:
2251       caseless = TRUE;
2252       codevalue -= OP_STARI - OP_STAR;
2253
2254       /* Fall through */
2255       case OP_PLUS:
2256       case OP_MINPLUS:
2257       case OP_POSPLUS:
2258       case OP_NOTPLUS:
2259       case OP_NOTMINPLUS:
2260       case OP_NOTPOSPLUS:
2261       count = current_state->count;  /* Already matched */
2262       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2263       if (clen > 0)
2264         {
2265         pcre_uint32 otherd = NOTACHAR;
2266         if (caseless)
2267           {
2268 #ifdef SUPPORT_UTF
2269           if (utf && d >= 128)
2270             {
2271 #ifdef SUPPORT_UCP
2272             otherd = UCD_OTHERCASE(d);
2273 #endif  /* SUPPORT_UCP */
2274             }
2275           else
2276 #endif  /* SUPPORT_UTF */
2277           otherd = TABLE_GET(d, fcc, d);
2278           }
2279         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2280           {
2281           if (count > 0 &&
2282               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2283             {
2284             active_count--;             /* Remove non-match possibility */
2285             next_active_state--;
2286             }
2287           count++;
2288           ADD_NEW(state_offset, count);
2289           }
2290         }
2291       break;
2292
2293       /*-----------------------------------------------------------------*/
2294       case OP_QUERYI:
2295       case OP_MINQUERYI:
2296       case OP_POSQUERYI:
2297       case OP_NOTQUERYI:
2298       case OP_NOTMINQUERYI:
2299       case OP_NOTPOSQUERYI:
2300       caseless = TRUE;
2301       codevalue -= OP_STARI - OP_STAR;
2302       /* Fall through */
2303       case OP_QUERY:
2304       case OP_MINQUERY:
2305       case OP_POSQUERY:
2306       case OP_NOTQUERY:
2307       case OP_NOTMINQUERY:
2308       case OP_NOTPOSQUERY:
2309       ADD_ACTIVE(state_offset + dlen + 1, 0);
2310       if (clen > 0)
2311         {
2312         pcre_uint32 otherd = NOTACHAR;
2313         if (caseless)
2314           {
2315 #ifdef SUPPORT_UTF
2316           if (utf && d >= 128)
2317             {
2318 #ifdef SUPPORT_UCP
2319             otherd = UCD_OTHERCASE(d);
2320 #endif  /* SUPPORT_UCP */
2321             }
2322           else
2323 #endif  /* SUPPORT_UTF */
2324           otherd = TABLE_GET(d, fcc, d);
2325           }
2326         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2327           {
2328           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2329             {
2330             active_count--;            /* Remove non-match possibility */
2331             next_active_state--;
2332             }
2333           ADD_NEW(state_offset + dlen + 1, 0);
2334           }
2335         }
2336       break;
2337
2338       /*-----------------------------------------------------------------*/
2339       case OP_STARI:
2340       case OP_MINSTARI:
2341       case OP_POSSTARI:
2342       case OP_NOTSTARI:
2343       case OP_NOTMINSTARI:
2344       case OP_NOTPOSSTARI:
2345       caseless = TRUE;
2346       codevalue -= OP_STARI - OP_STAR;
2347       /* Fall through */
2348       case OP_STAR:
2349       case OP_MINSTAR:
2350       case OP_POSSTAR:
2351       case OP_NOTSTAR:
2352       case OP_NOTMINSTAR:
2353       case OP_NOTPOSSTAR:
2354       ADD_ACTIVE(state_offset + dlen + 1, 0);
2355       if (clen > 0)
2356         {
2357         pcre_uint32 otherd = NOTACHAR;
2358         if (caseless)
2359           {
2360 #ifdef SUPPORT_UTF
2361           if (utf && d >= 128)
2362             {
2363 #ifdef SUPPORT_UCP
2364             otherd = UCD_OTHERCASE(d);
2365 #endif  /* SUPPORT_UCP */
2366             }
2367           else
2368 #endif  /* SUPPORT_UTF */
2369           otherd = TABLE_GET(d, fcc, d);
2370           }
2371         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2372           {
2373           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2374             {
2375             active_count--;            /* Remove non-match possibility */
2376             next_active_state--;
2377             }
2378           ADD_NEW(state_offset, 0);
2379           }
2380         }
2381       break;
2382
2383       /*-----------------------------------------------------------------*/
2384       case OP_EXACTI:
2385       case OP_NOTEXACTI:
2386       caseless = TRUE;
2387       codevalue -= OP_STARI - OP_STAR;
2388       /* Fall through */
2389       case OP_EXACT:
2390       case OP_NOTEXACT:
2391       count = current_state->count;  /* Number already matched */
2392       if (clen > 0)
2393         {
2394         pcre_uint32 otherd = NOTACHAR;
2395         if (caseless)
2396           {
2397 #ifdef SUPPORT_UTF
2398           if (utf && d >= 128)
2399             {
2400 #ifdef SUPPORT_UCP
2401             otherd = UCD_OTHERCASE(d);
2402 #endif  /* SUPPORT_UCP */
2403             }
2404           else
2405 #endif  /* SUPPORT_UTF */
2406           otherd = TABLE_GET(d, fcc, d);
2407           }
2408         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2409           {
2410           if (++count >= GET2(code, 1))
2411             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2412           else
2413             { ADD_NEW(state_offset, count); }
2414           }
2415         }
2416       break;
2417
2418       /*-----------------------------------------------------------------*/
2419       case OP_UPTOI:
2420       case OP_MINUPTOI:
2421       case OP_POSUPTOI:
2422       case OP_NOTUPTOI:
2423       case OP_NOTMINUPTOI:
2424       case OP_NOTPOSUPTOI:
2425       caseless = TRUE;
2426       codevalue -= OP_STARI - OP_STAR;
2427       /* Fall through */
2428       case OP_UPTO:
2429       case OP_MINUPTO:
2430       case OP_POSUPTO:
2431       case OP_NOTUPTO:
2432       case OP_NOTMINUPTO:
2433       case OP_NOTPOSUPTO:
2434       ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2435       count = current_state->count;  /* Number already matched */
2436       if (clen > 0)
2437         {
2438         pcre_uint32 otherd = NOTACHAR;
2439         if (caseless)
2440           {
2441 #ifdef SUPPORT_UTF
2442           if (utf && d >= 128)
2443             {
2444 #ifdef SUPPORT_UCP
2445             otherd = UCD_OTHERCASE(d);
2446 #endif  /* SUPPORT_UCP */
2447             }
2448           else
2449 #endif  /* SUPPORT_UTF */
2450           otherd = TABLE_GET(d, fcc, d);
2451           }
2452         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2453           {
2454           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2455             {
2456             active_count--;             /* Remove non-match possibility */
2457             next_active_state--;
2458             }
2459           if (++count >= GET2(code, 1))
2460             { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2461           else
2462             { ADD_NEW(state_offset, count); }
2463           }
2464         }
2465       break;
2466
2467
2468 /* ========================================================================== */
2469       /* These are the class-handling opcodes */
2470
2471       case OP_CLASS:
2472       case OP_NCLASS:
2473       case OP_XCLASS:
2474         {
2475         BOOL isinclass = FALSE;
2476         int next_state_offset;
2477         const pcre_uchar *ecode;
2478
2479         /* For a simple class, there is always just a 32-byte table, and we
2480         can set isinclass from it. */
2481
2482         if (codevalue != OP_XCLASS)
2483           {
2484           ecode = code + 1 + (32 / sizeof(pcre_uchar));
2485           if (clen > 0)
2486             {
2487             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2488               ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2489             }
2490           }
2491
2492         /* An extended class may have a table or a list of single characters,
2493         ranges, or both, and it may be positive or negative. There's a
2494         function that sorts all this out. */
2495
2496         else
2497          {
2498          ecode = code + GET(code, 1);
2499          if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2500          }
2501
2502         /* At this point, isinclass is set for all kinds of class, and ecode
2503         points to the byte after the end of the class. If there is a
2504         quantifier, this is where it will be. */
2505
2506         next_state_offset = (int)(ecode - start_code);
2507
2508         switch (*ecode)
2509           {
2510           case OP_CRSTAR:
2511           case OP_CRMINSTAR:
2512           ADD_ACTIVE(next_state_offset + 1, 0);
2513           if (isinclass) { ADD_NEW(state_offset, 0); }
2514           break;
2515
2516           case OP_CRPLUS:
2517           case OP_CRMINPLUS:
2518           count = current_state->count;  /* Already matched */
2519           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2520           if (isinclass) { count++; ADD_NEW(state_offset, count); }
2521           break;
2522
2523           case OP_CRQUERY:
2524           case OP_CRMINQUERY:
2525           ADD_ACTIVE(next_state_offset + 1, 0);
2526           if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2527           break;
2528
2529           case OP_CRRANGE:
2530           case OP_CRMINRANGE:
2531           count = current_state->count;  /* Already matched */
2532           if (count >= GET2(ecode, 1))
2533             { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2534           if (isinclass)
2535             {
2536             unsigned int max = GET2(ecode, 1 + IMM2_SIZE);
2537             if (++count >= max && max != 0)   /* Max 0 => no limit */
2538               { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2539             else
2540               { ADD_NEW(state_offset, count); }
2541             }
2542           break;
2543
2544           default:
2545           if (isinclass) { ADD_NEW(next_state_offset, 0); }
2546           break;
2547           }
2548         }
2549       break;
2550
2551 /* ========================================================================== */
2552       /* These are the opcodes for fancy brackets of various kinds. We have
2553       to use recursion in order to handle them. The "always failing" assertion
2554       (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2555       though the other "backtracking verbs" are not supported. */
2556
2557       case OP_FAIL:
2558       forced_fail++;    /* Count FAILs for multiple states */
2559       break;
2560
2561       case OP_ASSERT:
2562       case OP_ASSERT_NOT:
2563       case OP_ASSERTBACK:
2564       case OP_ASSERTBACK_NOT:
2565         {
2566         int rc;
2567         int local_offsets[2];
2568         int local_workspace[1000];
2569         const pcre_uchar *endasscode = code + GET(code, 1);
2570
2571         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2572
2573         rc = internal_dfa_exec(
2574           md,                                   /* static match data */
2575           code,                                 /* this subexpression's code */
2576           ptr,                                  /* where we currently are */
2577           (int)(ptr - start_subject),           /* start offset */
2578           local_offsets,                        /* offset vector */
2579           sizeof(local_offsets)/sizeof(int),    /* size of same */
2580           local_workspace,                      /* workspace vector */
2581           sizeof(local_workspace)/sizeof(int),  /* size of same */
2582           rlevel);                              /* function recursion level */
2583
2584         if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2585         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2586             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2587         }
2588       break;
2589
2590       /*-----------------------------------------------------------------*/
2591       case OP_COND:
2592       case OP_SCOND:
2593         {
2594         int local_offsets[1000];
2595         int local_workspace[1000];
2596         int codelink = GET(code, 1);
2597         int condcode;
2598
2599         /* Because of the way auto-callout works during compile, a callout item
2600         is inserted between OP_COND and an assertion condition. This does not
2601         happen for the other conditions. */
2602
2603         if (code[LINK_SIZE+1] == OP_CALLOUT)
2604           {
2605           rrc = 0;
2606           if (PUBL(callout) != NULL)
2607             {
2608             PUBL(callout_block) cb;
2609             cb.version          = 1;   /* Version 1 of the callout block */
2610             cb.callout_number   = code[LINK_SIZE+2];
2611             cb.offset_vector    = offsets;
2612 #if defined COMPILE_PCRE8
2613             cb.subject          = (PCRE_SPTR)start_subject;
2614 #elif defined COMPILE_PCRE16
2615             cb.subject          = (PCRE_SPTR16)start_subject;
2616 #elif defined COMPILE_PCRE32
2617             cb.subject          = (PCRE_SPTR32)start_subject;
2618 #endif
2619             cb.subject_length   = (int)(end_subject - start_subject);
2620             cb.start_match      = (int)(current_subject - start_subject);
2621             cb.current_position = (int)(ptr - start_subject);
2622             cb.pattern_position = GET(code, LINK_SIZE + 3);
2623             cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2624             cb.capture_top      = 1;
2625             cb.capture_last     = -1;
2626             cb.callout_data     = md->callout_data;
2627             cb.mark             = NULL;   /* No (*MARK) support */
2628             if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
2629             }
2630           if (rrc > 0) break;                      /* Fail this thread */
2631           code += PRIV(OP_lengths)[OP_CALLOUT];    /* Skip callout data */
2632           }
2633
2634         condcode = code[LINK_SIZE+1];
2635
2636         /* Back reference conditions are not supported */
2637
2638         if (condcode == OP_CREF || condcode == OP_NCREF)
2639           return PCRE_ERROR_DFA_UCOND;
2640
2641         /* The DEFINE condition is always false */
2642
2643         if (condcode == OP_DEF)
2644           { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2645
2646         /* The only supported version of OP_RREF is for the value RREF_ANY,
2647         which means "test if in any recursion". We can't test for specifically
2648         recursed groups. */
2649
2650         else if (condcode == OP_RREF || condcode == OP_NRREF)
2651           {
2652           int value = GET2(code, LINK_SIZE + 2);
2653           if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2654           if (md->recursive != NULL)
2655             { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2656           else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2657           }
2658
2659         /* Otherwise, the condition is an assertion */
2660
2661         else
2662           {
2663           int rc;
2664           const pcre_uchar *asscode = code + LINK_SIZE + 1;
2665           const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2666
2667           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2668
2669           rc = internal_dfa_exec(
2670             md,                                   /* fixed match data */
2671             asscode,                              /* this subexpression's code */
2672             ptr,                                  /* where we currently are */
2673             (int)(ptr - start_subject),           /* start offset */
2674             local_offsets,                        /* offset vector */
2675             sizeof(local_offsets)/sizeof(int),    /* size of same */
2676             local_workspace,                      /* workspace vector */
2677             sizeof(local_workspace)/sizeof(int),  /* size of same */
2678             rlevel);                              /* function recursion level */
2679
2680           if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2681           if ((rc >= 0) ==
2682                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2683             { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2684           else
2685             { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2686           }
2687         }
2688       break;
2689
2690       /*-----------------------------------------------------------------*/
2691       case OP_RECURSE:
2692         {
2693         dfa_recursion_info *ri;
2694         int local_offsets[1000];
2695         int local_workspace[1000];
2696         const pcre_uchar *callpat = start_code + GET(code, 1);
2697         int recno = (callpat == md->start_code)? 0 :
2698           GET2(callpat, 1 + LINK_SIZE);
2699         int rc;
2700
2701         DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2702
2703         /* Check for repeating a recursion without advancing the subject
2704         pointer. This should catch convoluted mutual recursions. (Some simple
2705         cases are caught at compile time.) */
2706
2707         for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2708           if (recno == ri->group_num && ptr == ri->subject_position)
2709             return PCRE_ERROR_RECURSELOOP;
2710
2711         /* Remember this recursion and where we started it so as to
2712         catch infinite loops. */
2713
2714         new_recursive.group_num = recno;
2715         new_recursive.subject_position = ptr;
2716         new_recursive.prevrec = md->recursive;
2717         md->recursive = &new_recursive;
2718
2719         rc = internal_dfa_exec(
2720           md,                                   /* fixed match data */
2721           callpat,                              /* this subexpression's code */
2722           ptr,                                  /* where we currently are */
2723           (int)(ptr - start_subject),           /* start offset */
2724           local_offsets,                        /* offset vector */
2725           sizeof(local_offsets)/sizeof(int),    /* size of same */
2726           local_workspace,                      /* workspace vector */
2727           sizeof(local_workspace)/sizeof(int),  /* size of same */
2728           rlevel);                              /* function recursion level */
2729
2730         md->recursive = new_recursive.prevrec;  /* Done this recursion */
2731
2732         DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2733           rc));
2734
2735         /* Ran out of internal offsets */
2736
2737         if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2738
2739         /* For each successful matched substring, set up the next state with a
2740         count of characters to skip before trying it. Note that the count is in
2741         characters, not bytes. */
2742
2743         if (rc > 0)
2744           {
2745           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2746             {
2747             int charcount = local_offsets[rc+1] - local_offsets[rc];
2748 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2749             if (utf)
2750               {
2751               const pcre_uchar *p = start_subject + local_offsets[rc];
2752               const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2753               while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2754               }
2755 #endif
2756             if (charcount > 0)
2757               {
2758               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2759               }
2760             else
2761               {
2762               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2763               }
2764             }
2765           }
2766         else if (rc != PCRE_ERROR_NOMATCH) return rc;
2767         }
2768       break;
2769
2770       /*-----------------------------------------------------------------*/
2771       case OP_BRAPOS:
2772       case OP_SBRAPOS:
2773       case OP_CBRAPOS:
2774       case OP_SCBRAPOS:
2775       case OP_BRAPOSZERO:
2776         {
2777         int charcount, matched_count;
2778         const pcre_uchar *local_ptr = ptr;
2779         BOOL allow_zero;
2780
2781         if (codevalue == OP_BRAPOSZERO)
2782           {
2783           allow_zero = TRUE;
2784           codevalue = *(++code);  /* Codevalue will be one of above BRAs */
2785           }
2786         else allow_zero = FALSE;
2787
2788         /* Loop to match the subpattern as many times as possible as if it were
2789         a complete pattern. */
2790
2791         for (matched_count = 0;; matched_count++)
2792           {
2793           int local_offsets[2];
2794           int local_workspace[1000];
2795
2796           int rc = internal_dfa_exec(
2797             md,                                   /* fixed match data */
2798             code,                                 /* this subexpression's code */
2799             local_ptr,                            /* where we currently are */
2800             (int)(ptr - start_subject),           /* start offset */
2801             local_offsets,                        /* offset vector */
2802             sizeof(local_offsets)/sizeof(int),    /* size of same */
2803             local_workspace,                      /* workspace vector */
2804             sizeof(local_workspace)/sizeof(int),  /* size of same */
2805             rlevel);                              /* function recursion level */
2806
2807           /* Failed to match */
2808
2809           if (rc < 0)
2810             {
2811             if (rc != PCRE_ERROR_NOMATCH) return rc;
2812             break;
2813             }
2814
2815           /* Matched: break the loop if zero characters matched. */
2816
2817           charcount = local_offsets[1] - local_offsets[0];
2818           if (charcount == 0) break;
2819           local_ptr += charcount;    /* Advance temporary position ptr */
2820           }
2821
2822         /* At this point we have matched the subpattern matched_count
2823         times, and local_ptr is pointing to the character after the end of the
2824         last match. */
2825
2826         if (matched_count > 0 || allow_zero)
2827           {
2828           const pcre_uchar *end_subpattern = code;
2829           int next_state_offset;
2830
2831           do { end_subpattern += GET(end_subpattern, 1); }
2832             while (*end_subpattern == OP_ALT);
2833           next_state_offset =
2834             (int)(end_subpattern - start_code + LINK_SIZE + 1);
2835
2836           /* Optimization: if there are no more active states, and there
2837           are no new states yet set up, then skip over the subject string
2838           right here, to save looping. Otherwise, set up the new state to swing
2839           into action when the end of the matched substring is reached. */
2840
2841           if (i + 1 >= active_count && new_count == 0)
2842             {
2843             ptr = local_ptr;
2844             clen = 0;
2845             ADD_NEW(next_state_offset, 0);
2846             }
2847           else
2848             {
2849             const pcre_uchar *p = ptr;
2850             const pcre_uchar *pp = local_ptr;
2851             charcount = (int)(pp - p);
2852 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2853             if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2854 #endif
2855             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2856             }
2857           }
2858         }
2859       break;
2860
2861       /*-----------------------------------------------------------------*/
2862       case OP_ONCE:
2863       case OP_ONCE_NC:
2864         {
2865         int local_offsets[2];
2866         int local_workspace[1000];
2867
2868         int rc = internal_dfa_exec(
2869           md,                                   /* fixed match data */
2870           code,                                 /* this subexpression's code */
2871           ptr,                                  /* where we currently are */
2872           (int)(ptr - start_subject),           /* start offset */
2873           local_offsets,                        /* offset vector */
2874           sizeof(local_offsets)/sizeof(int),    /* size of same */
2875           local_workspace,                      /* workspace vector */
2876           sizeof(local_workspace)/sizeof(int),  /* size of same */
2877           rlevel);                              /* function recursion level */
2878
2879         if (rc >= 0)
2880           {
2881           const pcre_uchar *end_subpattern = code;
2882           int charcount = local_offsets[1] - local_offsets[0];
2883           int next_state_offset, repeat_state_offset;
2884
2885           do { end_subpattern += GET(end_subpattern, 1); }
2886             while (*end_subpattern == OP_ALT);
2887           next_state_offset =
2888             (int)(end_subpattern - start_code + LINK_SIZE + 1);
2889
2890           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2891           arrange for the repeat state also to be added to the relevant list.
2892           Calculate the offset, or set -1 for no repeat. */
2893
2894           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2895                                  *end_subpattern == OP_KETRMIN)?
2896             (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2897
2898           /* If we have matched an empty string, add the next state at the
2899           current character pointer. This is important so that the duplicate
2900           checking kicks in, which is what breaks infinite loops that match an
2901           empty string. */
2902
2903           if (charcount == 0)
2904             {
2905             ADD_ACTIVE(next_state_offset, 0);
2906             }
2907
2908           /* Optimization: if there are no more active states, and there
2909           are no new states yet set up, then skip over the subject string
2910           right here, to save looping. Otherwise, set up the new state to swing
2911           into action when the end of the matched substring is reached. */
2912
2913           else if (i + 1 >= active_count && new_count == 0)
2914             {
2915             ptr += charcount;
2916             clen = 0;
2917             ADD_NEW(next_state_offset, 0);
2918
2919             /* If we are adding a repeat state at the new character position,
2920             we must fudge things so that it is the only current state.
2921             Otherwise, it might be a duplicate of one we processed before, and
2922             that would cause it to be skipped. */
2923
2924             if (repeat_state_offset >= 0)
2925               {
2926               next_active_state = active_states;
2927               active_count = 0;
2928               i = -1;
2929               ADD_ACTIVE(repeat_state_offset, 0);
2930               }
2931             }
2932           else
2933             {
2934 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2935             if (utf)
2936               {
2937               const pcre_uchar *p = start_subject + local_offsets[0];
2938               const pcre_uchar *pp = start_subject + local_offsets[1];
2939               while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2940               }
2941 #endif
2942             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2943             if (repeat_state_offset >= 0)
2944               { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2945             }
2946           }
2947         else if (rc != PCRE_ERROR_NOMATCH) return rc;
2948         }
2949       break;
2950
2951
2952 /* ========================================================================== */
2953       /* Handle callouts */
2954
2955       case OP_CALLOUT:
2956       rrc = 0;
2957       if (PUBL(callout) != NULL)
2958         {
2959         PUBL(callout_block) cb;
2960         cb.version          = 1;   /* Version 1 of the callout block */
2961         cb.callout_number   = code[1];
2962         cb.offset_vector    = offsets;
2963 #if defined COMPILE_PCRE8
2964         cb.subject          = (PCRE_SPTR)start_subject;
2965 #elif defined COMPILE_PCRE16
2966         cb.subject          = (PCRE_SPTR16)start_subject;
2967 #elif defined COMPILE_PCRE32
2968         cb.subject          = (PCRE_SPTR32)start_subject;
2969 #endif
2970         cb.subject_length   = (int)(end_subject - start_subject);
2971         cb.start_match      = (int)(current_subject - start_subject);
2972         cb.current_position = (int)(ptr - start_subject);
2973         cb.pattern_position = GET(code, 2);
2974         cb.next_item_length = GET(code, 2 + LINK_SIZE);
2975         cb.capture_top      = 1;
2976         cb.capture_last     = -1;
2977         cb.callout_data     = md->callout_data;
2978         cb.mark             = NULL;   /* No (*MARK) support */
2979         if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc;   /* Abandon */
2980         }
2981       if (rrc == 0)
2982         { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
2983       break;
2984
2985
2986 /* ========================================================================== */
2987       default:        /* Unsupported opcode */
2988       return PCRE_ERROR_DFA_UITEM;
2989       }
2990
2991     NEXT_ACTIVE_STATE: continue;
2992
2993     }      /* End of loop scanning active states */
2994
2995   /* We have finished the processing at the current subject character. If no
2996   new states have been set for the next character, we have found all the
2997   matches that we are going to find. If we are at the top level and partial
2998   matching has been requested, check for appropriate conditions.
2999
3000   The "forced_fail" variable counts the number of (*F) encountered for the
3001   character. If it is equal to the original active_count (saved in
3002   workspace[1]) it means that (*F) was found on every active state. In this
3003   case we don't want to give a partial match.
3004
3005   The "could_continue" variable is true if a state could have continued but
3006   for the fact that the end of the subject was reached. */
3007
3008   if (new_count <= 0)
3009     {
3010     if (rlevel == 1 &&                               /* Top level, and */
3011         could_continue &&                            /* Some could go on, and */
3012         forced_fail != workspace[1] &&               /* Not all forced fail & */
3013         (                                            /* either... */
3014         (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
3015         ||                                           /* or... */
3016         ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
3017          match_count < 0)                            /* no matches */
3018         ) &&                                         /* And... */
3019         (
3020         partial_newline ||                           /* Either partial NL */
3021           (                                          /* or ... */
3022           ptr >= end_subject &&                /* End of subject and */
3023           ptr > md->start_used_ptr)            /* Inspected non-empty string */
3024           )
3025         )
3026       {
3027       if (offsetcount >= 2)
3028         {
3029         offsets[0] = (int)(md->start_used_ptr - start_subject);
3030         offsets[1] = (int)(end_subject - start_subject);
3031         }
3032       match_count = PCRE_ERROR_PARTIAL;
3033       }
3034
3035     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3036       "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
3037       rlevel*2-2, SP));
3038     break;        /* In effect, "return", but see the comment below */
3039     }
3040
3041   /* One or more states are active for the next character. */
3042
3043   ptr += clen;    /* Advance to next subject character */
3044   }               /* Loop to move along the subject string */
3045
3046 /* Control gets here from "break" a few lines above. We do it this way because
3047 if we use "return" above, we have compiler trouble. Some compilers warn if
3048 there's nothing here because they think the function doesn't return a value. On
3049 the other hand, if we put a dummy statement here, some more clever compilers
3050 complain that it can't be reached. Sigh. */
3051
3052 return match_count;
3053 }
3054
3055
3056
3057
3058 /*************************************************
3059 *    Execute a Regular Expression - DFA engine   *
3060 *************************************************/
3061
3062 /* This external function applies a compiled re to a subject string using a DFA
3063 engine. This function calls the internal function multiple times if the pattern
3064 is not anchored.
3065
3066 Arguments:
3067   argument_re     points to the compiled expression
3068   extra_data      points to extra data or is NULL
3069   subject         points to the subject string
3070   length          length of subject string (may contain binary zeros)
3071   start_offset    where to start in the subject string
3072   options         option bits
3073   offsets         vector of match offsets
3074   offsetcount     size of same
3075   workspace       workspace vector
3076   wscount         size of same
3077
3078 Returns:          > 0 => number of match offset pairs placed in offsets
3079                   = 0 => offsets overflowed; longest matches are present
3080                    -1 => failed to match
3081                  < -1 => some kind of unexpected problem
3082 */
3083
3084 #if defined COMPILE_PCRE8
3085 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3086 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3087   const char *subject, int length, int start_offset, int options, int *offsets,
3088   int offsetcount, int *workspace, int wscount)
3089 #elif defined COMPILE_PCRE16
3090 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3091 pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3092   PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3093   int offsetcount, int *workspace, int wscount)
3094 #elif defined COMPILE_PCRE32
3095 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3096 pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
3097   PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
3098   int offsetcount, int *workspace, int wscount)
3099 #endif
3100 {
3101 REAL_PCRE *re = (REAL_PCRE *)argument_re;
3102 dfa_match_data match_block;
3103 dfa_match_data *md = &match_block;
3104 BOOL utf, anchored, startline, firstline;
3105 const pcre_uchar *current_subject, *end_subject;
3106 const pcre_study_data *study = NULL;
3107
3108 const pcre_uchar *req_char_ptr;
3109 const pcre_uint8 *start_bits = NULL;
3110 BOOL has_first_char = FALSE;
3111 BOOL has_req_char = FALSE;
3112 pcre_uchar first_char = 0;
3113 pcre_uchar first_char2 = 0;
3114 pcre_uchar req_char = 0;
3115 pcre_uchar req_char2 = 0;
3116 int newline;
3117
3118 /* Plausibility checks */
3119
3120 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3121 if (re == NULL || subject == NULL || workspace == NULL ||
3122    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3123 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3124 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3125 if (length < 0) return PCRE_ERROR_BADLENGTH;
3126 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3127
3128 /* Check that the first field in the block is the magic number. If it is not,
3129 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3130 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3131 means that the pattern is likely compiled with different endianness. */
3132
3133 if (re->magic_number != MAGIC_NUMBER)
3134   return re->magic_number == REVERSED_MAGIC_NUMBER?
3135     PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3136 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3137
3138 /* If restarting after a partial match, do some sanity checks on the contents
3139 of the workspace. */
3140
3141 if ((options & PCRE_DFA_RESTART) != 0)
3142   {
3143   if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3144     workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3145       return PCRE_ERROR_DFA_BADRESTART;
3146   }
3147
3148 /* Set up study, callout, and table data */
3149
3150 md->tables = re->tables;
3151 md->callout_data = NULL;
3152
3153 if (extra_data != NULL)
3154   {
3155   unsigned int flags = extra_data->flags;
3156   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3157     study = (const pcre_study_data *)extra_data->study_data;
3158   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3159   if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3160     return PCRE_ERROR_DFA_UMLIMIT;
3161   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3162     md->callout_data = extra_data->callout_data;
3163   if ((flags & PCRE_EXTRA_TABLES) != 0)
3164     md->tables = extra_data->tables;
3165   }
3166
3167 /* Set some local values */
3168
3169 current_subject = (const pcre_uchar *)subject + start_offset;
3170 end_subject = (const pcre_uchar *)subject + length;
3171 req_char_ptr = current_subject - 1;
3172
3173 #ifdef SUPPORT_UTF
3174 /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
3175 utf = (re->options & PCRE_UTF8) != 0;
3176 #else
3177 utf = FALSE;
3178 #endif
3179
3180 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3181   (re->options & PCRE_ANCHORED) != 0;
3182
3183 /* The remaining fixed data for passing around. */
3184
3185 md->start_code = (const pcre_uchar *)argument_re +
3186     re->name_table_offset + re->name_count * re->name_entry_size;
3187 md->start_subject = (const pcre_uchar *)subject;
3188 md->end_subject = end_subject;
3189 md->start_offset = start_offset;
3190 md->moptions = options;
3191 md->poptions = re->options;
3192
3193 /* If the BSR option is not set at match time, copy what was set
3194 at compile time. */
3195
3196 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3197   {
3198   if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3199     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3200 #ifdef BSR_ANYCRLF
3201   else md->moptions |= PCRE_BSR_ANYCRLF;
3202 #endif
3203   }
3204
3205 /* Handle different types of newline. The three bits give eight cases. If
3206 nothing is set at run time, whatever was used at compile time applies. */
3207
3208 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3209          PCRE_NEWLINE_BITS)
3210   {
3211   case 0: newline = NEWLINE; break;   /* Compile-time default */
3212   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3213   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3214   case PCRE_NEWLINE_CR+
3215        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3216   case PCRE_NEWLINE_ANY: newline = -1; break;
3217   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3218   default: return PCRE_ERROR_BADNEWLINE;
3219   }
3220
3221 if (newline == -2)
3222   {
3223   md->nltype = NLTYPE_ANYCRLF;
3224   }
3225 else if (newline < 0)
3226   {
3227   md->nltype = NLTYPE_ANY;
3228   }
3229 else
3230   {
3231   md->nltype = NLTYPE_FIXED;
3232   if (newline > 255)
3233     {
3234     md->nllen = 2;
3235     md->nl[0] = (newline >> 8) & 255;
3236     md->nl[1] = newline & 255;
3237     }
3238   else
3239     {
3240     md->nllen = 1;
3241     md->nl[0] = newline;
3242     }
3243   }
3244
3245 /* Check a UTF string if required. Pass back the offset of the error and the
3246 error code for an invalid string if the offsets vector has room for them. */
3247
3248 #ifdef SUPPORT_UTF
3249 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3250   {
3251   int erroroffset;
3252   int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3253   if (errorcode != 0)
3254     {
3255     if (offsetcount >= 2)
3256       {
3257       offsets[0] = erroroffset;
3258       offsets[1] = errorcode;
3259       }
3260 #if defined COMPILE_PCRE8
3261     return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ?
3262       PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3263 #elif defined COMPILE_PCRE16
3264     return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ?
3265       PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
3266 #elif defined COMPILE_PCRE32
3267     return PCRE_ERROR_BADUTF32;
3268 #endif
3269     }
3270 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
3271   if (start_offset > 0 && start_offset < length &&
3272         NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3273     return PCRE_ERROR_BADUTF8_OFFSET;
3274 #endif
3275   }
3276 #endif
3277
3278 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3279 is a feature that makes it possible to save compiled regex and re-use them
3280 in other programs later. */
3281
3282 if (md->tables == NULL) md->tables = PRIV(default_tables);
3283
3284 /* The "must be at the start of a line" flags are used in a loop when finding
3285 where to start. */
3286
3287 startline = (re->flags & PCRE_STARTLINE) != 0;
3288 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3289
3290 /* Set up the first character to match, if available. The first_byte value is
3291 never set for an anchored regular expression, but the anchoring may be forced
3292 at run time, so we have to test for anchoring. The first char may be unset for
3293 an unanchored pattern, of course. If there's no first char and the pattern was
3294 studied, there may be a bitmap of possible first characters. */
3295
3296 if (!anchored)
3297   {
3298   if ((re->flags & PCRE_FIRSTSET) != 0)
3299     {
3300     has_first_char = TRUE;
3301     first_char = first_char2 = (pcre_uchar)(re->first_char);
3302     if ((re->flags & PCRE_FCH_CASELESS) != 0)
3303       {
3304       first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3305 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3306       if (utf && first_char > 127)
3307         first_char2 = UCD_OTHERCASE(first_char);
3308 #endif
3309       }
3310     }
3311   else
3312     {
3313     if (!startline && study != NULL &&
3314          (study->flags & PCRE_STUDY_MAPPED) != 0)
3315       start_bits = study->start_bits;
3316     }
3317   }
3318
3319 /* For anchored or unanchored matches, there may be a "last known required
3320 character" set. */
3321
3322 if ((re->flags & PCRE_REQCHSET) != 0)
3323   {
3324   has_req_char = TRUE;
3325   req_char = req_char2 = (pcre_uchar)(re->req_char);
3326   if ((re->flags & PCRE_RCH_CASELESS) != 0)
3327     {
3328     req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3329 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3330     if (utf && req_char > 127)
3331       req_char2 = UCD_OTHERCASE(req_char);
3332 #endif
3333     }
3334   }
3335
3336 /* Call the main matching function, looping for a non-anchored regex after a
3337 failed match. If not restarting, perform certain optimizations at the start of
3338 a match. */
3339
3340 for (;;)
3341   {
3342   int rc;
3343
3344   if ((options & PCRE_DFA_RESTART) == 0)
3345     {
3346     const pcre_uchar *save_end_subject = end_subject;
3347
3348     /* If firstline is TRUE, the start of the match is constrained to the first
3349     line of a multiline string. Implement this by temporarily adjusting
3350     end_subject so that we stop scanning at a newline. If the match fails at
3351     the newline, later code breaks this loop. */
3352
3353     if (firstline)
3354       {
3355       PCRE_PUCHAR t = current_subject;
3356 #ifdef SUPPORT_UTF
3357       if (utf)
3358         {
3359         while (t < md->end_subject && !IS_NEWLINE(t))
3360           {
3361           t++;
3362           ACROSSCHAR(t < end_subject, *t, t++);
3363           }
3364         }
3365       else
3366 #endif
3367       while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3368       end_subject = t;
3369       }
3370
3371     /* There are some optimizations that avoid running the match if a known
3372     starting point is not found. However, there is an option that disables
3373     these, for testing and for ensuring that all callouts do actually occur.
3374     The option can be set in the regex by (*NO_START_OPT) or passed in
3375     match-time options. */
3376
3377     if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3378       {
3379       /* Advance to a known first char. */
3380
3381       if (has_first_char)
3382         {
3383         if (first_char != first_char2)
3384           {
3385           pcre_uchar csc;
3386           while (current_subject < end_subject &&
3387                  (csc = RAWUCHARTEST(current_subject)) != first_char && csc != first_char2)
3388             current_subject++;
3389           }
3390         else
3391           while (current_subject < end_subject &&
3392                  RAWUCHARTEST(current_subject) != first_char)
3393             current_subject++;
3394         }
3395
3396       /* Or to just after a linebreak for a multiline match if possible */
3397
3398       else if (startline)
3399         {
3400         if (current_subject > md->start_subject + start_offset)
3401           {
3402 #ifdef SUPPORT_UTF
3403           if (utf)
3404             {
3405             while (current_subject < end_subject &&
3406                    !WAS_NEWLINE(current_subject))
3407               {
3408               current_subject++;
3409               ACROSSCHAR(current_subject < end_subject, *current_subject,
3410                 current_subject++);
3411               }
3412             }
3413           else
3414 #endif
3415           while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3416             current_subject++;
3417
3418           /* If we have just passed a CR and the newline option is ANY or
3419           ANYCRLF, and we are now at a LF, advance the match position by one
3420           more character. */
3421
3422           if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
3423                (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3424                current_subject < end_subject &&
3425                RAWUCHARTEST(current_subject) == CHAR_NL)
3426             current_subject++;
3427           }
3428         }
3429
3430       /* Or to a non-unique first char after study */
3431
3432       else if (start_bits != NULL)
3433         {
3434         while (current_subject < end_subject)
3435           {
3436           register pcre_uint32 c = RAWUCHARTEST(current_subject);
3437 #ifndef COMPILE_PCRE8
3438           if (c > 255) c = 255;
3439 #endif
3440           if ((start_bits[c/8] & (1 << (c&7))) == 0)
3441             {
3442             current_subject++;
3443 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3444             /* In non 8-bit mode, the iteration will stop for
3445             characters > 255 at the beginning or not stop at all. */
3446             if (utf)
3447               ACROSSCHAR(current_subject < end_subject, *current_subject,
3448                 current_subject++);
3449 #endif
3450             }
3451           else break;
3452           }
3453         }
3454       }
3455
3456     /* Restore fudged end_subject */
3457
3458     end_subject = save_end_subject;
3459
3460     /* The following two optimizations are disabled for partial matching or if
3461     disabling is explicitly requested (and of course, by the test above, this
3462     code is not obeyed when restarting after a partial match). */
3463
3464     if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3465         (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3466       {
3467       /* If the pattern was studied, a minimum subject length may be set. This
3468       is a lower bound; no actual string of that length may actually match the
3469       pattern. Although the value is, strictly, in characters, we treat it as
3470       bytes to avoid spending too much time in this optimization. */
3471
3472       if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3473           (pcre_uint32)(end_subject - current_subject) < study->minlength)
3474         return PCRE_ERROR_NOMATCH;
3475
3476       /* If req_char is set, we know that that character must appear in the
3477       subject for the match to succeed. If the first character is set, req_char
3478       must be later in the subject; otherwise the test starts at the match
3479       point. This optimization can save a huge amount of work in patterns with
3480       nested unlimited repeats that aren't going to match. Writing separate
3481       code for cased/caseless versions makes it go faster, as does using an
3482       autoincrement and backing off on a match.
3483
3484       HOWEVER: when the subject string is very, very long, searching to its end
3485       can take a long time, and give bad performance on quite ordinary
3486       patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3487       string... so we don't do this when the string is sufficiently long. */
3488
3489       if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3490         {
3491         register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3492
3493         /* We don't need to repeat the search if we haven't yet reached the
3494         place we found it at last time. */
3495
3496         if (p > req_char_ptr)
3497           {
3498           if (req_char != req_char2)
3499             {
3500             while (p < end_subject)
3501               {
3502               register pcre_uint32 pp = RAWUCHARINCTEST(p);
3503               if (pp == req_char || pp == req_char2) { p--; break; }
3504               }
3505             }
3506           else
3507             {
3508             while (p < end_subject)
3509               {
3510               if (RAWUCHARINCTEST(p) == req_char) { p--; break; }
3511               }
3512             }
3513
3514           /* If we can't find the required character, break the matching loop,
3515           which will cause a return of PCRE_ERROR_NOMATCH. */
3516
3517           if (p >= end_subject) break;
3518
3519           /* If we have found the required character, save the point where we
3520           found it, so that we don't search again next time round the loop if
3521           the start hasn't passed this character yet. */
3522
3523           req_char_ptr = p;
3524           }
3525         }
3526       }
3527     }   /* End of optimizations that are done when not restarting */
3528
3529   /* OK, now we can do the business */
3530
3531   md->start_used_ptr = current_subject;
3532   md->recursive = NULL;
3533
3534   rc = internal_dfa_exec(
3535     md,                                /* fixed match data */
3536     md->start_code,                    /* this subexpression's code */
3537     current_subject,                   /* where we currently are */
3538     start_offset,                      /* start offset in subject */
3539     offsets,                           /* offset vector */
3540     offsetcount,                       /* size of same */
3541     workspace,                         /* workspace vector */
3542     wscount,                           /* size of same */
3543     0);                                /* function recurse level */
3544
3545   /* Anything other than "no match" means we are done, always; otherwise, carry
3546   on only if not anchored. */
3547
3548   if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3549
3550   /* Advance to the next subject character unless we are at the end of a line
3551   and firstline is set. */
3552
3553   if (firstline && IS_NEWLINE(current_subject)) break;
3554   current_subject++;
3555 #ifdef SUPPORT_UTF
3556   if (utf)
3557     {
3558     ACROSSCHAR(current_subject < end_subject, *current_subject,
3559       current_subject++);
3560     }
3561 #endif
3562   if (current_subject > end_subject) break;
3563
3564   /* If we have just passed a CR and we are now at a LF, and the pattern does
3565   not contain any explicit matches for \r or \n, and the newline option is CRLF
3566   or ANY or ANYCRLF, advance the match position by one more character. */
3567
3568   if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
3569       current_subject < end_subject &&
3570       RAWUCHARTEST(current_subject) == CHAR_NL &&
3571       (re->flags & PCRE_HASCRORLF) == 0 &&
3572         (md->nltype == NLTYPE_ANY ||
3573          md->nltype == NLTYPE_ANYCRLF ||
3574          md->nllen == 2))
3575     current_subject++;
3576
3577   }   /* "Bumpalong" loop */
3578
3579 return PCRE_ERROR_NOMATCH;
3580 }
3581
3582 /* End of pcre_dfa_exec.c */