glib/pcre/pcre_compile.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /* PCRE is a library of functions to support regular expressions whose syntax
   6 and semantics are as close as possible to those of the Perl 5 language.
   7
   8                        Written by Philip Hazel
   9            Copyright (c) 1997-2012 University of Cambridge
  10
  11 -----------------------------------------------------------------------------
  12 Redistribution and use in source and binary forms, with or without
  13 modification, are permitted provided that the following conditions are met:
  14
  15     * Redistributions of source code must retain the above copyright notice,
  16       this list of conditions and the following disclaimer.
  17
  18     * Redistributions in binary form must reproduce the above copyright
  19       notice, this list of conditions and the following disclaimer in the
  20       documentation and/or other materials provided with the distribution.
  21
  22     * Neither the name of the University of Cambridge nor the names of its
  23       contributors may be used to endorse or promote products derived from
  24       this software without specific prior written permission.
  25
  26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  36 POSSIBILITY OF SUCH DAMAGE.
  37 -----------------------------------------------------------------------------
  38 */
  39
  40
  41 /* This module contains the external function pcre_compile(), along with
  42 supporting internal functions that are not used by other modules. */
  43
  44
  45 #include "config.h"
  46
  47 #define NLBLOCK cd             /* Block containing newline information */
  48 #define PSSTART start_pattern  /* Field containing processed string start */
  49 #define PSEND   end_pattern    /* Field containing processed string end */
  50
  51 #include "pcre_internal.h"
  52
  53 #ifdef GLIB_COMPILATION
  54 #include "gstrfuncs.h"
  55 #else
  56 #include <glib.h>
  57 #endif
  58
  59 /* When PCRE_DEBUG is defined, we need the pcre(16)_printint() function, which
  60 is also used by pcretest. PCRE_DEBUG is not defined when building a production
  61 library. We do not need to select pcre16_printint.c specially, because the
  62 COMPILE_PCREx macro will already be appropriately set. */
  63
  64 #ifdef PCRE_DEBUG
  65 /* pcre_printint.c should not include any headers */
  66 #define PCRE_INCLUDED
  67 #include "pcre_printint.c"
  68 #undef PCRE_INCLUDED
  69 #endif
  70
  71
  72 /* Macro for setting individual bits in class bitmaps. The arguments are
  73 parenthesized so that expressions work; note that b is evaluated twice. */
  74 #define SETBIT(a,b) (a)[(b)/8] |= (1 << ((b)%8))
  75
  76 /* Maximum length value to check against when making sure that the integer that
  77 holds the compiled pattern length does not overflow. We make it a bit less than
  78 INT_MAX to allow for adding in group terminating bytes, so that we don't have
  79 to check them every time. */
  80
  81 #define OFLOW_MAX (INT_MAX - 20)
  82
  83
  84 /*************************************************
  85 *      Code parameters and static tables         *
  86 *************************************************/
  87
  88 /* This value specifies the size of stack workspace that is used during the
  89 first pre-compile phase that determines how much memory is required. The regex
  90 is partly compiled into this space, but the compiled parts are discarded as
  91 soon as they can be, so that hopefully there will never be an overrun. The code
  92 does, however, check for an overrun. The largest amount I've seen used is 218,
  93 so this number is very generous.
  94
  95 The same workspace is used during the second, actual compile phase for
  96 remembering forward references to groups so that they can be filled in at the
  97 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
  98 is 4 there is plenty of room for most patterns. However, the memory can get
  99 filled up by repetitions of forward references, for example patterns like
 100 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
 101 that the workspace is expanded using malloc() in this situation. The value
 102 below is therefore a minimum, and we put a maximum on it for safety. The
 103 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
 104 kicks in at the same number of forward references in all cases. */
 105
 106 #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
 107 #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
 108
 109 /* The overrun tests check for a slightly smaller size so that they detect the
 110 overrun before it actually does run off the end of the data block. */
 111
 112 #define WORK_SIZE_SAFETY_MARGIN (100)
 113
 114 /* Private flags added to firstchar and reqchar. */
 115
 116 #define REQ_CASELESS   0x10000000l      /* Indicates caselessness */
 117 #define REQ_VARY       0x20000000l      /* Reqchar followed non-literal item */
 118
 119 /* Repeated character flags. */
 120
 121 #define UTF_LENGTH     0x10000000l      /* The char contains its length. */
 122
 123 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
 124 are simple data values; negative values are for special things like \d and so
 125 on. Zero means further processing is needed (for things like \x), or the escape
 126 is invalid. */
 127
 128 #ifndef EBCDIC
 129
 130 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
 131 in UTF-8 mode. */
 132
 133 static const short int escapes[] = {
 134      0,                       0,                          /* '0' '1' */
 135      0,                       0,                          /* '2' '3' */
 136      0,                       0,                          /* '4' '5' */
 137      0,                       0,                          /* '6' '7' */
 138      0,                       0,                          /* '8' '9' */
 139      CHAR_COLON,              CHAR_SEMICOLON,             /* ':' ';' */
 140      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,           /* '<' '=' */
 141      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,         /* '>' '?' */
 142      CHAR_COMMERCIAL_AT,      -ESC_A,                     /* '@' 'A' */
 143      -ESC_B,                  -ESC_C,                     /* 'B' 'C' */
 144      -ESC_D,                  -ESC_E,                     /* 'D' 'E' */
 145      0,                       -ESC_G,                     /* 'F' 'G' */
 146      -ESC_H,                  0,                          /* 'H' 'I' */
 147      0,                       -ESC_K,                     /* 'J' 'K' */
 148      0,                       0,                          /* 'L' 'M' */
 149      -ESC_N,                  0,                          /* 'N' 'O' */
 150      -ESC_P,                  -ESC_Q,                     /* 'P' 'Q' */
 151      -ESC_R,                  -ESC_S,                     /* 'R' 'S' */
 152      0,                       0,                          /* 'T' 'U' */
 153      -ESC_V,                  -ESC_W,                     /* 'V' 'W' */
 154      -ESC_X,                  0,                          /* 'X' 'Y' */
 155      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,   /* 'Z' '[' */
 156      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,  /* '\' ']' */
 157      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,            /* '^' '_' */
 158      CHAR_GRAVE_ACCENT,       7,                          /* '`' 'a' */
 159      -ESC_b,                  0,                          /* 'b' 'c' */
 160      -ESC_d,                  ESC_e,                      /* 'd' 'e' */
 161      ESC_f,                   0,                          /* 'f' 'g' */
 162      -ESC_h,                  0,                          /* 'h' 'i' */
 163      0,                       -ESC_k,                     /* 'j' 'k' */
 164      0,                       0,                          /* 'l' 'm' */
 165      ESC_n,                   0,                          /* 'n' 'o' */
 166      -ESC_p,                  0,                          /* 'p' 'q' */
 167      ESC_r,                   -ESC_s,                     /* 'r' 's' */
 168      ESC_tee,                 0,                          /* 't' 'u' */
 169      -ESC_v,                  -ESC_w,                     /* 'v' 'w' */
 170      0,                       0,                          /* 'x' 'y' */
 171      -ESC_z                                               /* 'z' */
 172 };
 173
 174 #else
 175
 176 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
 177
 178 static const short int escapes[] = {
 179 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
 180 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
 181 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
 182 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
 183 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
 184 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
 185 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
 186 /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
 187 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
 188 /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
 189 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
 190 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
 191 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
 192 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
 193 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
 194 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
 195 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
 196 /*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
 197 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
 198 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
 199 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
 200 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
 201 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
 202 };
 203 #endif
 204
 205
 206 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
 207 searched linearly. Put all the names into a single string, in order to reduce
 208 the number of relocations when a shared library is dynamically linked. The
 209 string is built from string macros so that it works in UTF-8 mode on EBCDIC
 210 platforms. */
 211
 212 typedef struct verbitem {
 213   int   len;                 /* Length of verb name */
 214   int   op;                  /* Op when no arg, or -1 if arg mandatory */
 215   int   op_arg;              /* Op when arg present, or -1 if not allowed */
 216 } verbitem;
 217
 218 static const char verbnames[] =
 219   "\0"                       /* Empty name is a shorthand for MARK */
 220   STRING_MARK0
 221   STRING_ACCEPT0
 222   STRING_COMMIT0
 223   STRING_F0
 224   STRING_FAIL0
 225   STRING_PRUNE0
 226   STRING_SKIP0
 227   STRING_THEN;
 228
 229 static const verbitem verbs[] = {
 230   { 0, -1,        OP_MARK },         /* (empty name) */
 231   { 4, -1,        OP_MARK },         /* MARK */
 232   { 6, OP_ACCEPT, -1 },              /* ACCEPT */
 233   { 6, OP_COMMIT, -1 },              /* COMMIT */
 234   { 1, OP_FAIL,   -1 },              /* F */
 235   { 4, OP_FAIL,   -1 },              /* FAIL */
 236   { 5, OP_PRUNE,  OP_PRUNE_ARG },    /* PRUNE */
 237   { 4, OP_SKIP,   OP_SKIP_ARG  },    /* SKIP */
 238   { 4, OP_THEN,   OP_THEN_ARG  }     /* THEN */
 239 };
 240 /* Number of entries in verbs[], one for each name in verbnames. */
 241 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
 242
 243
 244 /* Tables of names of POSIX character classes and their lengths. The names are
 245 now all in a single string, to reduce the number of relocations when a shared
 246 library is dynamically loaded. The list of lengths is terminated by a zero
 247 length entry. The first three must be alpha, lower, upper, as this is assumed
 248 for handling case independence. */
 249
 250 static const char posix_names[] =
 251   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
 252   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
 253   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
 254   STRING_word0  STRING_xdigit;
 255
 256 static const pcre_uint8 posix_name_lengths[] = {
 257   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
 258
 259 /* Table of class bit maps for each POSIX class. Each class is formed from a
 260 base map, with an optional addition or removal of another map. Then, for some
 261 classes, there is some additional tweaking: for [:blank:] the vertical space
 262 characters are removed, and for [:alpha:] and [:alnum:] the underscore
 263 character is removed. The triples in the table consist of the base map offset,
 264 second map offset or -1 if no second map, and a non-negative value for map
 265 addition or a negative value for map subtraction (if there are two maps). The
 266 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
 267 remove vertical space characters, 2 => remove underscore. */
 268
 269 static const int posix_class_maps[] = {
 270   cbit_word,  cbit_digit, -2,             /* alpha */
 271   cbit_lower, -1,          0,             /* lower */
 272   cbit_upper, -1,          0,             /* upper */
 273   cbit_word,  -1,          2,             /* alnum - word without underscore */
 274   cbit_print, cbit_cntrl,  0,             /* ascii */
 275   cbit_space, -1,          1,             /* blank - a GNU extension */
 276   cbit_cntrl, -1,          0,             /* cntrl */
 277   cbit_digit, -1,          0,             /* digit */
 278   cbit_graph, -1,          0,             /* graph */
 279   cbit_print, -1,          0,             /* print */
 280   cbit_punct, -1,          0,             /* punct */
 281   cbit_space, -1,          0,             /* space */
 282   cbit_word,  -1,          0,             /* word - a Perl extension */
 283   cbit_xdigit,-1,          0              /* xdigit */
 284 };
 285
 286 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
 287 substitutes must be in the order of the names, defined above, and there are
 288 both positive and negative cases. NULL means no substitute. */
 289
 290 #ifdef SUPPORT_UCP
 291 static const pcre_uchar string_PNd[]  = {
 292   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
 293   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 294 static const pcre_uchar string_pNd[]  = {
 295   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
 296   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 297 static const pcre_uchar string_PXsp[] = {
 298   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
 299   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 300 static const pcre_uchar string_pXsp[] = {
 301   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
 302   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 303 static const pcre_uchar string_PXwd[] = {
 304   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
 305   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 306 static const pcre_uchar string_pXwd[] = {
 307   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
 308   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 309
 310 static const pcre_uchar *substitutes[] = {
 311   string_PNd,           /* \D */
 312   string_pNd,           /* \d */
 313   string_PXsp,          /* \S */       /* NOTE: Xsp is Perl space */
 314   string_pXsp,          /* \s */
 315   string_PXwd,          /* \W */
 316   string_pXwd           /* \w */
 317 };
 318
 319 static const pcre_uchar string_pL[] =   {
 320   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
 321   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 322 static const pcre_uchar string_pLl[] =  {
 323   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
 324   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 325 static const pcre_uchar string_pLu[] =  {
 326   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
 327   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 328 static const pcre_uchar string_pXan[] = {
 329   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
 330   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 331 static const pcre_uchar string_h[] =    {
 332   CHAR_BACKSLASH, CHAR_h, '\0' };
 333 static const pcre_uchar string_pXps[] = {
 334   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
 335   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 336 static const pcre_uchar string_PL[] =   {
 337   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
 338   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 339 static const pcre_uchar string_PLl[] =  {
 340   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
 341   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 342 static const pcre_uchar string_PLu[] =  {
 343   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
 344   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 345 static const pcre_uchar string_PXan[] = {
 346   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
 347   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 348 static const pcre_uchar string_H[] =    {
 349   CHAR_BACKSLASH, CHAR_H, '\0' };
 350 static const pcre_uchar string_PXps[] = {
 351   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
 352   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
 353
 354 static const pcre_uchar *posix_substitutes[] = {
 355   string_pL,            /* alpha */
 356   string_pLl,           /* lower */
 357   string_pLu,           /* upper */
 358   string_pXan,          /* alnum */
 359   NULL,                 /* ascii */
 360   string_h,             /* blank */
 361   NULL,                 /* cntrl */
 362   string_pNd,           /* digit */
 363   NULL,                 /* graph */
 364   NULL,                 /* print */
 365   NULL,                 /* punct */
 366   string_pXps,          /* space */    /* NOTE: Xps is POSIX space */
 367   string_pXwd,          /* word */
 368   NULL,                 /* xdigit */
 369   /* Negated cases */
 370   string_PL,            /* ^alpha */
 371   string_PLl,           /* ^lower */
 372   string_PLu,           /* ^upper */
 373   string_PXan,          /* ^alnum */
 374   NULL,                 /* ^ascii */
 375   string_H,             /* ^blank */
 376   NULL,                 /* ^cntrl */
 377   string_PNd,           /* ^digit */
 378   NULL,                 /* ^graph */
 379   NULL,                 /* ^print */
 380   NULL,                 /* ^punct */
 381   string_PXps,          /* ^space */   /* NOTE: Xps is POSIX space */
 382   string_PXwd,          /* ^word */
 383   NULL                  /* ^xdigit */
 384 };
 385 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
 386 #endif
 387 /* XSTRING(s) macro-expands s first, then STRING() turns the result into text. */
 388 #define STRING(a)  # a
 389 #define XSTRING(s) STRING(s)
 390
 391 /* The texts of compile-time error messages. These are "char *" because they
 392 are passed to the outside world. Do not ever re-use any error number, because
 393 they are documented. Always add a new error instead. Messages marked DEAD below
 394 are no longer used. This used to be a table of strings, but in order to reduce
 395 the number of relocations needed when a shared library is loaded dynamically,
 396 it is now one long string. We cannot use a table of offsets, because the
 397 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
 398 simply count through to the one we want - this isn't a performance issue
 399 because these strings are used only when there is a compilation error.
 400
 401 Each substring ends with \0 to insert a null character. This includes the final
 402 substring, so that the whole string ends with \0\0, which can be detected when
 403 counting through. */
 404
 405 static const char error_texts[] =
 406   "no error\0"
 407   "\\ at end of pattern\0"
 408   "\\c at end of pattern\0"
 409   "unrecognized character follows \\\0"
 410   "numbers out of order in {} quantifier\0"
 411   /* 5 */
 412   "number too big in {} quantifier\0"
 413   "missing terminating ] for character class\0"
 414   "invalid escape sequence in character class\0"
 415   "range out of order in character class\0"
 416   "nothing to repeat\0"
 417   /* 10 */
 418   "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
 419   "internal error: unexpected repeat\0"
 420   "unrecognized character after (? or (?-\0"
 421   "POSIX named classes are supported only within a class\0"
 422   "missing )\0"
 423   /* 15 */
 424   "reference to non-existent subpattern\0"
 425   "erroffset passed as NULL\0"
 426   "unknown option bit(s) set\0"
 427   "missing ) after comment\0"
 428   "parentheses nested too deeply\0"  /** DEAD **/
 429   /* 20 */
 430   "regular expression is too large\0"
 431   "failed to get memory\0"
 432   "unmatched parentheses\0"
 433   "internal error: code overflow\0"
 434   "unrecognized character after (?<\0"
 435   /* 25 */
 436   "lookbehind assertion is not fixed length\0"
 437   "malformed number or name after (?(\0"
 438   "conditional group contains more than two branches\0"
 439   "assertion expected after (?(\0"
 440   "(?R or (?[+-]digits must be followed by )\0"
 441   /* 30 */
 442   "unknown POSIX class name\0"
 443   "POSIX collating elements are not supported\0"
 444   "this version of PCRE is compiled without UTF support\0"
 445   "spare error\0"  /** DEAD **/
 446   "character value in \\x{...} sequence is too large\0"
 447   /* 35 */
 448   "invalid condition (?(0)\0"
 449   "\\C not allowed in lookbehind assertion\0"
 450   "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
 451   "number after (?C is > 255\0"
 452   "closing ) for (?C expected\0"
 453   /* 40 */
 454   "recursive call could loop indefinitely\0"
 455   "unrecognized character after (?P\0"
 456   "syntax error in subpattern name (missing terminator)\0"
 457   "two named subpatterns have the same name\0"
 458   "invalid UTF-8 string\0"
 459   /* 45 */
 460   "support for \\P, \\p, and \\X has not been compiled\0"
 461   "malformed \\P or \\p sequence\0"
 462   "unknown property name after \\P or \\p\0"
 463   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
 464   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
 465   /* 50 */
 466   "repeated subpattern is too long\0"    /** DEAD **/
 467   "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
 468   "internal error: overran compiling workspace\0"
 469   "internal error: previously-checked referenced subpattern not found\0"
 470   "DEFINE group contains more than one branch\0"
 471   /* 55 */
 472   "repeating a DEFINE group is not allowed\0"  /** DEAD **/
 473   "inconsistent NEWLINE options\0"
 474   "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
 475   "a numbered reference must not be zero\0"
 476   "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
 477   /* 60 */
 478   "(*VERB) not recognized\0"
 479   "number is too big\0"
 480   "subpattern name expected\0"
 481   "digit expected after (?+\0"
 482   "] is an invalid data character in JavaScript compatibility mode\0"
 483   /* 65 */
 484   "different names for subpatterns of the same number are not allowed\0"
 485   "(*MARK) must have an argument\0"
 486   "this version of PCRE is not compiled with Unicode property support\0"
 487   "\\c must be followed by an ASCII character\0"
 488   "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
 489   /* 70 */
 490   "internal error: unknown opcode in find_fixedlength()\0"
 491   "\\N is not supported in a class\0"
 492   "too many forward references\0"
 493   "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
 494   "invalid UTF-16 string\0"
 495   /* 75 */
 496   "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
 497   "character value in \\u.... sequence is too large\0"
 498   ;
 499
 500 /* Table to identify digits and hex digits, for use when compiling patterns.
 501 Note that the tables in chartables are dependent on the locale, and may mark
 502 arbitrary characters as digits - but the PCRE compiling code expects to handle
 503 only 0-9, a-z, and A-Z as digits when compiling. That is why we have a private
 504 table here. It costs 256 bytes, but it is a lot faster than doing character
 505 value tests (at least in some simple cases I timed), and in some applications
 506 one wants PCRE to compile efficiently as well as match efficiently. NOTE: the
 507 table is currently disabled by the "#if 0" below; see IS_DIGIT() instead.
 508
 509 For convenience, we use the same bit definitions as in chartables:
 510
 511   0x04   decimal digit
 512   0x08   hexadecimal digit
 513
 514 Then we can use ctype_digit and ctype_xdigit in the code. */
 515
 516 /* Using a simple comparison for decimal numbers rather than a memory read
 517 is much faster, and the resulting code is simpler (the compiler turns it
 518 into a subtraction and unsigned comparison). */
 519
 520 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
 521
 522 #if 0
 523 #ifndef EBCDIC
 524
 525 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
 526 UTF-8 mode. */
 527
 528 static const pcre_uint8 digitab[] =
 529   {
 530   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
 531   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
 532   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
 533   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
 534   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
 535   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
 536   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
 537   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
 538   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
 539   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
 540   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
 541   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
 542   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
 543   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
 544   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
 545   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
 546   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
 547   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
 548   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
 549   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
 550   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
 551   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
 552   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
 553   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
 554   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
 555   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
 556   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
 557   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
 558   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
 559   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
 560   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
 561   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
 562
 563 #else
 564
 565 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
 566
 567 static const pcre_uint8 digitab[] =
 568   {
 569   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
 570   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
 571   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
 572   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
 573   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
 574   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
 575   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
 576   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
 577   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
 578   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
 579   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
 580   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
 581   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
 582   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
 583   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
 584   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
 585   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
 586   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
 587   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
 588   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
 589   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
 590   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
 591   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
 592   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
 593   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
 594   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
 595   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
 596   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
 597   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
 598   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
 599   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
 600   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
 601
 602 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
 603   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
 604   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
 605   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
 606   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
 607   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
 608   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
 609   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
 610   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
 611   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
 612   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
 613   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
 614   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
 615   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
 616   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
 617   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
 618   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
 619   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
 620   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
 621   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
 622   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
 623   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
 624   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
 625   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
 626   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
 627   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
 628   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
 629   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
 630   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
 631   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
 632   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
 633   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
 634   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
 635 #endif
 636 #endif /* 0 */
 637
 638 /* Forward declaration (prototype only) of compile_regex(), so that code which
 639 appears before the function body can call it; this allows mutual recursion. */
 640 static BOOL
 641   compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
 642     int *, int *, branch_chain *, compile_data *, int *);
 643
 644
 645
 646 /*************************************************
 647 *            Find an error text                  *
 648 *************************************************/
 649
 650 /* The messages sit back to back in error_texts, each ended by a zero byte, so
 651 message n is reached by stepping over n terminators. Meeting an empty string
 652 means n is past the last message, and a fallback text is returned instead.
 653
 654 Argument:   the error number (values below 1 select the first message)
 655 Returns:    pointer to the error string, or to the fallback text
 656 */
 657
 658 static const char *
 659 find_error_text(int n)
 660 {
 661 const char *text = error_texts;
 662 while (n > 0)
 663   {
 664   while (*text != 0) text++;     /* stop on this message's terminator */
 665   if (*++text == 0) return "Error text not found (please report)";
 666   n--;
 667   }
 668 return text;
 669 }
 670
 671
 672 /*************************************************
 673 *           Expand the workspace                 *
 674 *************************************************/
 675
 676 /* This function is called during the second compiling phase, if the number of
 677 forward references fills the existing workspace, which is originally a block on
 678 the stack. A larger block is obtained from malloc() unless the ultimate limit
 679 has been reached or the increase will be rather small.
 680
 681 Argument: pointer to the compile data block
 682 Returns:  0 if all went well, else an error number
 683 */
 684
 685 static int
 686 expand_workspace(compile_data *cd)
 687 {
 688 pcre_uchar *newspace;
 689 int newsize = cd->workspace_size * 2;
 690
 691 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
 692 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
 693     newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
 694  return ERR72;
 695
 696 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
 697 if (newspace == NULL) return ERR21;
 698 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
 699 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
 700 if (cd->workspace_size > COMPILE_WORK_SIZE)
 701   (PUBL(free))((void *)cd->start_workspace);
 702 cd->start_workspace = newspace;
 703 cd->workspace_size = newsize;
 704 return 0;
 705 }
 706
 707
 708
 709 /*************************************************
 710 *            Check for counted repeat            *
 711 *************************************************/
 712
 713 /* This function is called when a '{' is encountered in a place where it might
 714 start a quantifier. It looks ahead to see if it really is a quantifier or not.
 715 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
 716 where the ddds are digits.
 717
 718 Arguments:
 719   p         pointer to the first char after '{'
 720
 721 Returns:    TRUE or FALSE
 722 */
 723
 724 static BOOL
 725 is_counted_repeat(const pcre_uchar *p)
 726 {
 727 if (!IS_DIGIT(*p)) return FALSE;
 728 p++;
 729 while (IS_DIGIT(*p)) p++;
 730 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
 731
 732 if (*p++ != CHAR_COMMA) return FALSE;
 733 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
 734
 735 if (!IS_DIGIT(*p)) return FALSE;
 736 p++;
 737 while (IS_DIGIT(*p)) p++;
 738
 739 return (*p == CHAR_RIGHT_CURLY_BRACKET);
 740 }
 741
 742
 743
 744 /*************************************************
 745 *            Handle escapes                      *
 746 *************************************************/
 747
 748 /* This function is called when a \ has been encountered. It either returns a
 749 positive value for a simple escape such as \n, or a negative value which
 750 encodes one of the more complicated things such as \d. A backreference to group
 751 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
 752 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
 753 ptr is pointing at the \. On exit, it is on the final character of the escape
 754 sequence.
 755
 756 Arguments:
 757   ptrptr         points to the pattern position pointer
 758   errorcodeptr   points to the errorcode variable
 759   bracount       number of previous extracting brackets
 760   options        the options bits
 761   isclass        TRUE if inside a character class
 762
 763 Returns:         zero or positive => a data character
 764                  negative => a special escape sequence
 765                  on error, errorcodeptr is set
 766 */
 767
 768 static int
 769 check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount,
 770   int options, BOOL isclass)
 771 {
 772 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
 773 BOOL utf = (options & PCRE_UTF8) != 0;
 774 const pcre_uchar *ptr = *ptrptr + 1;
 775 pcre_int32 c;
 776 int i;
 777
 778 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
 779 ptr--;                            /* Set pointer back to the last byte */
 780
 781 /* If backslash is at the end of the pattern, it's an error. */
 782
 783 if (c == 0) *errorcodeptr = ERR1;
 784
 785 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
 786 in a table. A non-zero result is something that can be returned immediately.
 787 Otherwise further processing may be required. */
 788
 789 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
 790 /* Not alphanumeric */
 791 else if (c < CHAR_0 || c > CHAR_z) {}
 792 else if ((i = escapes[c - CHAR_0]) != 0) c = i;
 793
 794 #else           /* EBCDIC coding */
 795 /* Not alphanumeric */
 796 else if (c < 'a' || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
 797 else if ((i = escapes[c - 0x48]) != 0)  c = i;
 798 #endif
 799
 800 /* Escapes that need further processing, or are illegal. */
 801
 802 else
 803   {
 804   const pcre_uchar *oldptr;
 805   BOOL braced, negated;
 806
 807   switch (c)
 808     {
 809     /* A number of Perl escapes are not handled by PCRE. We give an explicit
 810     error. */
 811
 812     case CHAR_l:
 813     case CHAR_L:
 814     *errorcodeptr = ERR37;
 815     break;
 816
 817     case CHAR_u:
 818     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
 819       {
 820       /* In JavaScript, \u must be followed by four hexadecimal numbers.
 821       Otherwise it is a lowercase u letter. */
 822       if (MAX_255(ptr[1]) && g_ascii_isxdigit(ptr[1]) != 0
 823         && MAX_255(ptr[2]) && g_ascii_isxdigit(ptr[2]) != 0
 824         && MAX_255(ptr[3]) && g_ascii_isxdigit(ptr[3]) != 0
 825         && MAX_255(ptr[4]) && g_ascii_isxdigit(ptr[4]) != 0)
 826         {
 827         c = 0;
 828         for (i = 0; i < 4; ++i)
 829           {
 830           int cc = *(++ptr);
 831 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
 832           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
 833           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
 834 #else           /* EBCDIC coding */
 835           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
 836           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
 837 #endif
 838           }
 839
 840 #ifdef COMPILE_PCRE8
 841         if (c > (utf ? 0x10ffff : 0xff))
 842 #else
 843 #ifdef COMPILE_PCRE16
 844         if (c > (utf ? 0x10ffff : 0xffff))
 845 #endif
 846 #endif
 847           {
 848           *errorcodeptr = ERR76;
 849           }
 850         else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
 851         }
 852       }
 853     else
 854       *errorcodeptr = ERR37;
 855     break;
 856
 857     case CHAR_U:
 858     /* In JavaScript, \U is an uppercase U letter. */
 859     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
 860     break;
 861
 862     /* In a character class, \g is just a literal "g". Outside a character
 863     class, \g must be followed by one of a number of specific things:
 864
 865     (1) A number, either plain or braced. If positive, it is an absolute
 866     backreference. If negative, it is a relative backreference. This is a Perl
 867     5.10 feature.
 868
 869     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
 870     is part of Perl's movement towards a unified syntax for back references. As
 871     this is synonymous with \k{name}, we fudge it up by pretending it really
 872     was \k.
 873
 874     (3) For Oniguruma compatibility we also support \g followed by a name or a
 875     number either in angle brackets or in single quotes. However, these are
 876     (possibly recursive) subroutine calls, _not_ backreferences. Just return
 877     the -ESC_g code (cf \k). */
 878
 879     case CHAR_g:
 880     if (isclass) break;
 881     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
 882       {
 883       c = -ESC_g;
 884       break;
 885       }
 886
 887     /* Handle the Perl-compatible cases */
 888
 889     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
 890       {
 891       const pcre_uchar *p;
 892       for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
 893         if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
 894       if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
 895         {
 896         c = -ESC_k;
 897         break;
 898         }
 899       braced = TRUE;
 900       ptr++;
 901       }
 902     else braced = FALSE;
 903
 904     if (ptr[1] == CHAR_MINUS)
 905       {
 906       negated = TRUE;
 907       ptr++;
 908       }
 909     else negated = FALSE;
 910
 911     /* The integer range is limited by the machine's int representation. */
 912     c = 0;
 913     while (IS_DIGIT(ptr[1]))
 914       {
 915       if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
 916         {
 917         c = -1;
 918         break;
 919         }
 920       c = c * 10 + *(++ptr) - CHAR_0;
 921       }
 922     if (((unsigned int)c) > INT_MAX) /* Integer overflow */
 923       {
 924       while (IS_DIGIT(ptr[1]))
 925         ptr++;
 926       *errorcodeptr = ERR61;
 927       break;
 928       }
 929
 930     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
 931       {
 932       *errorcodeptr = ERR57;
 933       break;
 934       }
 935
 936     if (c == 0)
 937       {
 938       *errorcodeptr = ERR58;
 939       break;
 940       }
 941
 942     if (negated)
 943       {
 944       if (c > bracount)
 945         {
 946         *errorcodeptr = ERR15;
 947         break;
 948         }
 949       c = bracount - (c - 1);
 950       }
 951
 952     c = -(ESC_REF + c);
 953     break;
 954
 955     /* The handling of escape sequences consisting of a string of digits
 956     starting with one that is not zero is not straightforward. By experiment,
 957     the way Perl works seems to be as follows:
 958
 959     Outside a character class, the digits are read as a decimal number. If the
 960     number is less than 10, or if there are that many previous extracting
 961     left brackets, then it is a back reference. Otherwise, up to three octal
 962     digits are read to form an escaped byte. Thus \123 is likely to be octal
 963     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
 964     value is greater than 377, the least significant 8 bits are taken. Inside a
 965     character class, \ followed by a digit is always an octal number. */
 966
 967     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
 968     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
 969
 970     if (!isclass)
 971       {
 972       oldptr = ptr;
 973       /* The integer range is limited by the machine's int representation. */
 974       c -= CHAR_0;
 975       while (IS_DIGIT(ptr[1]))
 976         {
 977         if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
 978           {
 979           c = -1;
 980           break;
 981           }
 982         c = c * 10 + *(++ptr) - CHAR_0;
 983         }
 984       if (((unsigned int)c) > INT_MAX) /* Integer overflow */
 985         {
 986         while (IS_DIGIT(ptr[1]))
 987           ptr++;
 988         *errorcodeptr = ERR61;
 989         break;
 990         }
 991       if (c < 10 || c <= bracount)
 992         {
 993         c = -(ESC_REF + c);
 994         break;
 995         }
 996       ptr = oldptr;      /* Put the pointer back and fall through */
 997       }
 998
 999     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
1000     generates a binary zero byte and treats the digit as a following literal.
1001     Thus we have to pull back the pointer by one. */
1002
1003     if ((c = *ptr) >= CHAR_8)
1004       {
1005       ptr--;
1006       c = 0;
1007       break;
1008       }
1009
1010     /* \0 always starts an octal number, but we may drop through to here with a
1011     larger first octal digit. The original code used just to take the least
1012     significant 8 bits of octal numbers (I think this is what early Perls used
1013     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1014     but no more than 3 octal digits. */
1015
1016     case CHAR_0:
1017     c -= CHAR_0;
1018     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1019         c = c * 8 + *(++ptr) - CHAR_0;
1020 #ifdef COMPILE_PCRE8
1021     if (!utf && c > 0xff) *errorcodeptr = ERR51;
1022 #endif
1023     break;
1024
1025     /* \x is complicated. \x{ddd} is a character number which can be greater
1026     than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
1027     If not, { is treated as a data character. */
1028
1029     case CHAR_x:
1030     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1031       {
1032       /* In JavaScript, \x must be followed by two hexadecimal numbers.
1033       Otherwise it is a lowercase x letter. */
1034       if (MAX_255(ptr[1]) && g_ascii_isxdigit(ptr[1]) != 0
1035         && MAX_255(ptr[2]) && g_ascii_isxdigit(ptr[2]) != 0)
1036         {
1037         c = 0;
1038         for (i = 0; i < 2; ++i)
1039           {
1040           int cc = *(++ptr);
1041 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1042           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1043           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1044 #else           /* EBCDIC coding */
1045           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1046           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1047 #endif
1048           }
1049         }
1050       break;
1051       }
1052
1053     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1054       {
1055       const pcre_uchar *pt = ptr + 2;
1056
1057       c = 0;
1058       while (MAX_255(*pt) && g_ascii_isxdigit(*pt) != 0)
1059         {
1060         int cc = *pt++;
1061         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1062
1063 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1064         if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1065         c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1066 #else           /* EBCDIC coding */
1067         if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1068         c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1069 #endif
1070
1071 #ifdef COMPILE_PCRE8
1072         if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; }
1073 #else
1074 #ifdef COMPILE_PCRE16
1075         if (c > (utf ? 0x10ffff : 0xffff)) { c = -1; break; }
1076 #endif
1077 #endif
1078         }
1079
1080       if (c < 0)
1081         {
1082         while (MAX_255(*pt) && g_ascii_isxdigit(*pt) != 0) pt++;
1083         *errorcodeptr = ERR34;
1084         }
1085
1086       if (*pt == CHAR_RIGHT_CURLY_BRACKET)
1087         {
1088         if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1089         ptr = pt;
1090         break;
1091         }
1092
1093       /* If the sequence of hex digits does not end with '}', then we don't
1094       recognize this construct; fall through to the normal \x handling. */
1095       }
1096
1097     /* Read just a single-byte hex-defined char */
1098
1099     c = 0;
1100     while (i++ < 2 && MAX_255(ptr[1]) && g_ascii_isxdigit(ptr[1]) != 0)
1101       {
1102       int cc;                                  /* Some compilers don't like */
1103       cc = *(++ptr);                           /* ++ in initializers */
1104 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1105       if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
1106       c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1107 #else           /* EBCDIC coding */
1108       if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
1109       c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1110 #endif
1111       }
1112     break;
1113
1114     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1115     An error is given if the byte following \c is not an ASCII character. This
1116     coding is ASCII-specific, but then the whole concept of \cx is
1117     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1118
1119     case CHAR_c:
1120     c = *(++ptr);
1121     if (c == 0)
1122       {
1123       *errorcodeptr = ERR2;
1124       break;
1125       }
1126 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
1127     if (c > 127)  /* Excludes all non-ASCII in either mode */
1128       {
1129       *errorcodeptr = ERR68;
1130       break;
1131       }
1132     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1133     c ^= 0x40;
1134 #else             /* EBCDIC coding */
1135     if (c >= CHAR_a && c <= CHAR_z) c += 64;
1136     c ^= 0xC0;
1137 #endif
1138     break;
1139
1140     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1141     other alphanumeric following \ is an error if PCRE_EXTRA was set;
1142     otherwise, for Perl compatibility, it is a literal. This code looks a bit
1143     odd, but there used to be some cases other than the default, and there may
1144     be again in future, so I haven't "optimized" it. */
1145
1146     default:
1147     if ((options & PCRE_EXTRA) != 0) switch(c)
1148       {
1149       default:
1150       *errorcodeptr = ERR3;
1151       break;
1152       }
1153     break;
1154     }
1155   }
1156
1157 /* Perl supports \N{name} for character names, as well as plain \N for "not
1158 newline". PCRE does not support \N{name}. However, it does support
1159 quantification such as \N{2,3}. */
1160
1161 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1162      !is_counted_repeat(ptr+2))
1163   *errorcodeptr = ERR37;
1164
1165 /* If PCRE_UCP is set, we change the values for \d etc. */
1166
1167 if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
1168   c -= (ESC_DU - ESC_D);
1169
1170 /* Set the pointer to the final character before returning. */
1171
1172 *ptrptr = ptr;
1173 return c;
1174 }
1175
1176
1177
1178 #ifdef SUPPORT_UCP
1179 /*************************************************
1180 *               Handle \P and \p                 *
1181 *************************************************/
1182
1183 /* This function is called after \P or \p has been encountered, provided that
1184 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1185 pointing at the P or p. On exit, it is pointing at the final character of the
1186 escape sequence.
1187
1188 Argument:
1189   ptrptr         points to the pattern position pointer
1190   negptr         points to a boolean that is set TRUE for negation else FALSE
1191   dptr           points to an int that is set to the detailed property value
1192   errorcodeptr   points to the error code variable
1193
1194 Returns:         type value from ucp_type_table, or -1 for an invalid type
1195 */
1196
1197 static int
1198 get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
1199 {
1200 int c, i, bot, top;
1201 const pcre_uchar *ptr = *ptrptr;
1202 pcre_uchar name[32];
1203
1204 c = *(++ptr);
1205 if (c == 0) goto ERROR_RETURN;
1206
1207 *negptr = FALSE;
1208
1209 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
1210 negation. */
1211
1212 if (c == CHAR_LEFT_CURLY_BRACKET)
1213   {
1214   if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1215     {
1216     *negptr = TRUE;
1217     ptr++;
1218     }
1219   for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
1220     {
1221     c = *(++ptr);
1222     if (c == 0) goto ERROR_RETURN;
1223     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
1224     name[i] = c;
1225     }
1226   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
1227   name[i] = 0;
1228   }
1229
1230 /* Otherwise there is just one following character */
1231
1232 else
1233   {
1234   name[0] = c;
1235   name[1] = 0;
1236   }
1237
1238 *ptrptr = ptr;
1239
1240 /* Search for a recognized property name using binary chop */
1241
1242 bot = 0;
1243 top = PRIV(utt_size);
1244
1245 while (bot < top)
1246   {
1247   i = (bot + top) >> 1;
1248   c = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
1249   if (c == 0)
1250     {
1251     *dptr = PRIV(utt)[i].value;
1252     return PRIV(utt)[i].type;
1253     }
1254   if (c > 0) bot = i + 1; else top = i;
1255   }
1256
1257 *errorcodeptr = ERR47;
1258 *ptrptr = ptr;
1259 return -1;
1260
1261 ERROR_RETURN:
1262 *errorcodeptr = ERR46;
1263 *ptrptr = ptr;
1264 return -1;
1265 }
1266 #endif
1267
1268
1269
1270
1271 /*************************************************
1272 *         Read repeat counts                     *
1273 *************************************************/
1274
1275 /* Read an item of the form {n,m} and return the values. This is called only
1276 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1277 so the syntax is guaranteed to be correct, but we need to check the values.
1278
1279 Arguments:
1280   p              pointer to first char after '{'
1281   minp           pointer to int for min
1282   maxp           pointer to int for max
1283                  returned as -1 if no max
1284   errorcodeptr   points to error code variable
1285
1286 Returns:         pointer to '}' on success;
1287                  current ptr on error, with errorcodeptr set non-zero
1288 */
1289
1290 static const pcre_uchar *
1291 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1292 {
1293 int min = 0;
1294 int max = -1;
1295
1296 /* Read the minimum value and do a paranoid check: a negative value indicates
1297 an integer overflow. */
1298
1299 while (IS_DIGIT(*p)) min = min * 10 + *p++ - CHAR_0;
1300 if (min < 0 || min > 65535)
1301   {
1302   *errorcodeptr = ERR5;
1303   return p;
1304   }
1305
1306 /* Read the maximum value if there is one, and again do a paranoid on its size.
1307 Also, max must not be less than min. */
1308
1309 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1310   {
1311   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1312     {
1313     max = 0;
1314     while(IS_DIGIT(*p)) max = max * 10 + *p++ - CHAR_0;
1315     if (max < 0 || max > 65535)
1316       {
1317       *errorcodeptr = ERR5;
1318       return p;
1319       }
1320     if (max < min)
1321       {
1322       *errorcodeptr = ERR4;
1323       return p;
1324       }
1325     }
1326   }
1327
1328 /* Fill in the required variables, and pass back the pointer to the terminating
1329 '}'. */
1330
1331 *minp = min;
1332 *maxp = max;
1333 return p;
1334 }
1335
1336
1337
1338 /*************************************************
1339 *  Subroutine for finding forward reference      *
1340 *************************************************/
1341
1342 /* This recursive function is called only from find_parens() below. The
1343 top-level call starts at the beginning of the pattern. All other calls must
1344 start at a parenthesis. It scans along a pattern's text looking for capturing
1345 subpatterns, and counting them. If it finds a named pattern that matches the
1346 name it is given, it returns its number. Alternatively, if the name is NULL, it
1347 returns when it reaches a given numbered subpattern. Recursion is used to keep
1348 track of subpatterns that reset the capturing group numbers - the (?| feature.
1349
1350 This function was originally called only from the second pass, in which we know
1351 that if (?< or (?' or (?P< is encountered, the name will be correctly
1352 terminated because that is checked in the first pass. There is now one call to
1353 this function in the first pass, to check for a recursive back reference by
1354 name (so that we can make the whole group atomic). In this case, we need check
1355 only up to the current position in the pattern, and that is still OK because
1356 and previous occurrences will have been checked. To make this work, the test
1357 for "end of pattern" is a check against cd->end_pattern in the main loop,
1358 instead of looking for a binary zero. This means that the special first-pass
1359 call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1360 processing items within the loop are OK, because afterwards the main loop will
1361 terminate.)
1362
1363 Arguments:
1364   ptrptr       address of the current character pointer (updated)
1365   cd           compile background data
1366   name         name to seek, or NULL if seeking a numbered subpattern
1367   lorn         name length, or subpattern number if name is NULL
1368   xmode        TRUE if we are in /x mode
1369   utf          TRUE if we are in UTF-8 / UTF-16 mode
1370   count        pointer to the current capturing subpattern number (updated)
1371
1372 Returns:       the number of the named subpattern, or -1 if not found
1373 */
1374
1375 static int
1376 find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
1377   BOOL xmode, BOOL utf, int *count)
1378 {
1379 pcre_uchar *ptr = *ptrptr;
1380 int start_count = *count;
1381 int hwm_count = start_count;
1382 BOOL dup_parens = FALSE;
1383
1384 /* If the first character is a parenthesis, check on the type of group we are
1385 dealing with. The very first call may not start with a parenthesis. */
1386
1387 if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1388   {
1389   /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1390
1391   if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1392
1393   /* Handle a normal, unnamed capturing parenthesis. */
1394
1395   else if (ptr[1] != CHAR_QUESTION_MARK)
1396     {
1397     *count += 1;
1398     if (name == NULL && *count == lorn) return *count;
1399     ptr++;
1400     }
1401
1402   /* All cases now have (? at the start. Remember when we are in a group
1403   where the parenthesis numbers are duplicated. */
1404
1405   else if (ptr[2] == CHAR_VERTICAL_LINE)
1406     {
1407     ptr += 3;
1408     dup_parens = TRUE;
1409     }
1410
1411   /* Handle comments; all characters are allowed until a ket is reached. */
1412
1413   else if (ptr[2] == CHAR_NUMBER_SIGN)
1414     {
1415     for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1416     goto FAIL_EXIT;
1417     }
1418
1419   /* Handle a condition. If it is an assertion, just carry on so that it
1420   is processed as normal. If not, skip to the closing parenthesis of the
1421   condition (there can't be any nested parens). */
1422
1423   else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1424     {
1425     ptr += 2;
1426     if (ptr[1] != CHAR_QUESTION_MARK)
1427       {
1428       while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1429       if (*ptr != 0) ptr++;
1430       }
1431     }
1432
1433   /* Start with (? but not a condition. */
1434
1435   else
1436     {
1437     ptr += 2;
1438     if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
1439
1440     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1441
1442     if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1443         ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1444       {
1445       int term;
1446       const pcre_uchar *thisname;
1447       *count += 1;
1448       if (name == NULL && *count == lorn) return *count;
1449       term = *ptr++;
1450       if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1451       thisname = ptr;
1452       while (*ptr != term) ptr++;
1453       if (name != NULL && lorn == ptr - thisname &&
1454           STRNCMP_UC_UC(name, thisname, lorn) == 0)
1455         return *count;
1456       term++;
1457       }
1458     }
1459   }
1460
1461 /* Past any initial parenthesis handling, scan for parentheses or vertical
1462 bars. Stop if we get to cd->end_pattern. Note that this is important for the
1463 first-pass call when this value is temporarily adjusted to stop at the current
1464 position. So DO NOT change this to a test for binary zero. */
1465
1466 for (; ptr < cd->end_pattern; ptr++)
1467   {
1468   /* Skip over backslashed characters and also entire \Q...\E */
1469
1470   if (*ptr == CHAR_BACKSLASH)
1471     {
1472     if (*(++ptr) == 0) goto FAIL_EXIT;
1473     if (*ptr == CHAR_Q) for (;;)
1474       {
1475       while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1476       if (*ptr == 0) goto FAIL_EXIT;
1477       if (*(++ptr) == CHAR_E) break;
1478       }
1479     continue;
1480     }
1481
1482   /* Skip over character classes; this logic must be similar to the way they
1483   are handled for real. If the first character is '^', skip it. Also, if the
1484   first few characters (either before or after ^) are \Q\E or \E we skip them
1485   too. This makes for compatibility with Perl. Note the use of STR macros to
1486   encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1487
1488   if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1489     {
1490     BOOL negate_class = FALSE;
1491     for (;;)
1492       {
1493       if (ptr[1] == CHAR_BACKSLASH)
1494         {
1495         if (ptr[2] == CHAR_E)
1496           ptr+= 2;
1497         else if (STRNCMP_UC_C8(ptr + 2,
1498                  STR_Q STR_BACKSLASH STR_E, 3) == 0)
1499           ptr += 4;
1500         else
1501           break;
1502         }
1503       else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1504         {
1505         negate_class = TRUE;
1506         ptr++;
1507         }
1508       else break;
1509       }
1510
1511     /* If the next character is ']', it is a data character that must be
1512     skipped, except in JavaScript compatibility mode. */
1513
1514     if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1515         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1516       ptr++;
1517
1518     while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1519       {
1520       if (*ptr == 0) return -1;
1521       if (*ptr == CHAR_BACKSLASH)
1522         {
1523         if (*(++ptr) == 0) goto FAIL_EXIT;
1524         if (*ptr == CHAR_Q) for (;;)
1525           {
1526           while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1527           if (*ptr == 0) goto FAIL_EXIT;
1528           if (*(++ptr) == CHAR_E) break;
1529           }
1530         continue;
1531         }
1532       }
1533     continue;
1534     }
1535
1536   /* Skip comments in /x mode */
1537
1538   if (xmode && *ptr == CHAR_NUMBER_SIGN)
1539     {
1540     ptr++;
1541     while (*ptr != 0)
1542       {
1543       if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1544       ptr++;
1545 #ifdef SUPPORT_UTF
1546       if (utf) FORWARDCHAR(ptr);
1547 #endif
1548       }
1549     if (*ptr == 0) goto FAIL_EXIT;
1550     continue;
1551     }
1552
1553   /* Check for the special metacharacters */
1554
1555   if (*ptr == CHAR_LEFT_PARENTHESIS)
1556     {
1557     int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);
1558     if (rc > 0) return rc;
1559     if (*ptr == 0) goto FAIL_EXIT;
1560     }
1561
1562   else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1563     {
1564     if (dup_parens && *count < hwm_count) *count = hwm_count;
1565     goto FAIL_EXIT;
1566     }
1567
1568   else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1569     {
1570     if (*count > hwm_count) hwm_count = *count;
1571     *count = start_count;
1572     }
1573   }
1574
1575 FAIL_EXIT:
1576 *ptrptr = ptr;
1577 return -1;
1578 }
1579
1580
1581
1582
1583 /*************************************************
1584 *       Find forward referenced subpattern       *
1585 *************************************************/
1586
1587 /* This function scans along a pattern's text looking for capturing
1588 subpatterns, and counting them. If it finds a named pattern that matches the
1589 name it is given, it returns its number. Alternatively, if the name is NULL, it
1590 returns when it reaches a given numbered subpattern. This is used for forward
1591 references to subpatterns. We used to be able to start this scan from the
1592 current compiling point, using the current count value from cd->bracount, and
1593 do it all in a single loop, but the addition of the possibility of duplicate
1594 subpattern numbers means that we have to scan from the very start, in order to
1595 take account of such duplicates, and to use a recursive function to keep track
1596 of the different types of group.
1597
1598 Arguments:
1599   cd           compile background data
1600   name         name to seek, or NULL if seeking a numbered subpattern
1601   lorn         name length, or subpattern number if name is NULL
1602   xmode        TRUE if we are in /x mode
1603   utf          TRUE if we are in UTF-8 / UTF-16 mode
1604
1605 Returns:       the number of the found subpattern, or -1 if not found
1606 */
1607
1608 static int
1609 find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
1610   BOOL utf)
1611 {
1612 pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
1613 int count = 0;
1614 int rc;
1615
1616 /* If the pattern does not start with an opening parenthesis, the first call
1617 to find_parens_sub() will scan right to the end (if necessary). However, if it
1618 does start with a parenthesis, find_parens_sub() will return when it hits the
1619 matching closing parens. That is why we have to have a loop. */
1620
1621 for (;;)
1622   {
1623   rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);
1624   if (rc > 0 || *ptr++ == 0) break;
1625   }
1626
1627 return rc;
1628 }
1629
1630
1631
1632
1633 /*************************************************
1634 *      Find first significant op code            *
1635 *************************************************/
1636
1637 /* This is called by several functions that scan a compiled expression looking
1638 for a fixed first character, or an anchoring op code etc. It skips over things
1639 that do not influence this. For some calls, it makes sense to skip negative
1640 forward and all backward assertions, and also the \b assertion; for others it
1641 does not.
1642
1643 Arguments:
1644   code         pointer to the start of the group
1645   skipassert   TRUE if certain assertions are to be skipped
1646
1647 Returns:       pointer to the first significant opcode
1648 */
1649
1650 static const pcre_uchar*
1651 first_significant_code(const pcre_uchar *code, BOOL skipassert)
1652 {
1653 for (;;)
1654   {
1655   switch ((int)*code)
1656     {
1657     case OP_ASSERT_NOT:
1658     case OP_ASSERTBACK:
1659     case OP_ASSERTBACK_NOT:
1660     if (!skipassert) return code;
1661     do code += GET(code, 1); while (*code == OP_ALT);
1662     code += PRIV(OP_lengths)[*code];
1663     break;
1664
1665     case OP_WORD_BOUNDARY:
1666     case OP_NOT_WORD_BOUNDARY:
1667     if (!skipassert) return code;
1668     /* Fall through */
1669
1670     case OP_CALLOUT:
1671     case OP_CREF:
1672     case OP_NCREF:
1673     case OP_RREF:
1674     case OP_NRREF:
1675     case OP_DEF:
1676     code += PRIV(OP_lengths)[*code];
1677     break;
1678
1679     default:
1680     return code;
1681     }
1682   }
1683 /* Control never reaches here */
1684 }
1685
1686
1687
1688
1689 /*************************************************
1690 *        Find the fixed length of a branch       *
1691 *************************************************/
1692
1693 /* Scan a branch and compute the fixed length of subject that will match it,
1694 if the length is fixed. This is needed for dealing with backward assertions.
1695 In UTF8 mode, the result is in characters rather than bytes. The branch is
1696 temporarily terminated with OP_END when this function is called.
1697
1698 This function is called when a backward assertion is encountered, so that if it
1699 fails, the error message can point to the correct place in the pattern.
1700 However, we cannot do this when the assertion contains subroutine calls,
1701 because they can be forward references. We solve this by remembering this case
1702 and doing the check at the end; a flag specifies which mode we are running in.
1703
1704 Arguments:
1705   code     points to the start of the pattern (the bracket)
1706   utf      TRUE in UTF-8 / UTF-16 mode
1707   atend    TRUE if called when the pattern is complete
1708   cd       the "compile data" structure
1709
1710 Returns:   the fixed length,
1711              or -1 if there is no fixed length,
1712              or -2 if \C was encountered (in UTF-8 mode only)
1713              or -3 if an OP_RECURSE item was encountered and atend is FALSE
1714              or -4 if an unknown opcode was encountered (internal error)
1715 */
1716
1717 static int
1718 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1719 {
1720 int length = -1;
1721
1722 int branchlength = 0;
1723 pcre_uchar *cc = code + 1 + LINK_SIZE;
1724
1725 /* Scan along the opcodes for this branch. If we get to the end of the
1726 branch, check the length against that of the other branches. */
1727
1728 for (;;)
1729   {
1730   int d;
1731   pcre_uchar *ce, *cs;
1732   int op = *cc;
1733
1734   switch (op)
1735     {
1736     /* We only need to continue for OP_CBRA (normal capturing bracket) and
1737     OP_BRA (normal non-capturing bracket) because the other variants of these
1738     opcodes are all concerned with unlimited repeated groups, which of course
1739     are not of fixed length. */
1740
1741     case OP_CBRA:
1742     case OP_BRA:
1743     case OP_ONCE:
1744     case OP_ONCE_NC:
1745     case OP_COND:
1746     d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1747     if (d < 0) return d;
1748     branchlength += d;
1749     do cc += GET(cc, 1); while (*cc == OP_ALT);
1750     cc += 1 + LINK_SIZE;
1751     break;
1752
1753     /* Reached end of a branch; if it's a ket it is the end of a nested call.
1754     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1755     an ALT. If it is END it's the end of the outer call. All can be handled by
1756     the same code. Note that we must not include the OP_KETRxxx opcodes here,
1757     because they all imply an unlimited repeat. */
1758
1759     case OP_ALT:
1760     case OP_KET:
1761     case OP_END:
1762     case OP_ACCEPT:
1763     case OP_ASSERT_ACCEPT:
1764     if (length < 0) length = branchlength;
1765       else if (length != branchlength) return -1;
1766     if (*cc != OP_ALT) return length;
1767     cc += 1 + LINK_SIZE;
1768     branchlength = 0;
1769     break;
1770
1771     /* A true recursion implies not fixed length, but a subroutine call may
1772     be OK. If the subroutine is a forward reference, we can't deal with
1773     it until the end of the pattern, so return -3. */
1774
1775     case OP_RECURSE:
1776     if (!atend) return -3;
1777     cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1778     do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1779     if (cc > cs && cc < ce) return -1;                    /* Recursion */
1780     d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1781     if (d < 0) return d;
1782     branchlength += d;
1783     cc += 1 + LINK_SIZE;
1784     break;
1785
1786     /* Skip over assertive subpatterns */
1787
1788     case OP_ASSERT:
1789     case OP_ASSERT_NOT:
1790     case OP_ASSERTBACK:
1791     case OP_ASSERTBACK_NOT:
1792     do cc += GET(cc, 1); while (*cc == OP_ALT);
1793     cc += PRIV(OP_lengths)[*cc];
1794     break;
1795
1796     /* Skip over things that don't match chars */
1797
1798     case OP_MARK:
1799     case OP_PRUNE_ARG:
1800     case OP_SKIP_ARG:
1801     case OP_THEN_ARG:
1802     cc += cc[1] + PRIV(OP_lengths)[*cc];
1803     break;
1804
1805     case OP_CALLOUT:
1806     case OP_CIRC:
1807     case OP_CIRCM:
1808     case OP_CLOSE:
1809     case OP_COMMIT:
1810     case OP_CREF:
1811     case OP_DEF:
1812     case OP_DOLL:
1813     case OP_DOLLM:
1814     case OP_EOD:
1815     case OP_EODN:
1816     case OP_FAIL:
1817     case OP_NCREF:
1818     case OP_NRREF:
1819     case OP_NOT_WORD_BOUNDARY:
1820     case OP_PRUNE:
1821     case OP_REVERSE:
1822     case OP_RREF:
1823     case OP_SET_SOM:
1824     case OP_SKIP:
1825     case OP_SOD:
1826     case OP_SOM:
1827     case OP_THEN:
1828     case OP_WORD_BOUNDARY:
1829     cc += PRIV(OP_lengths)[*cc];
1830     break;
1831
1832     /* Handle literal characters */
1833
1834     case OP_CHAR:
1835     case OP_CHARI:
1836     case OP_NOT:
1837     case OP_NOTI:
1838     branchlength++;
1839     cc += 2;
1840 #ifdef SUPPORT_UTF
1841     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1842 #endif
1843     break;
1844
1845     /* Handle exact repetitions. The count is already in characters, but we
1846     need to skip over a multibyte character in UTF8 mode.  */
1847
1848     case OP_EXACT:
1849     case OP_EXACTI:
1850     case OP_NOTEXACT:
1851     case OP_NOTEXACTI:
1852     branchlength += GET2(cc,1);
1853     cc += 2 + IMM2_SIZE;
1854 #ifdef SUPPORT_UTF
1855     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1856 #endif
1857     break;
1858
1859     case OP_TYPEEXACT:
1860     branchlength += GET2(cc,1);
1861     if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
1862     cc += 1 + IMM2_SIZE + 1;
1863     break;
1864
1865     /* Handle single-char matchers */
1866
1867     case OP_PROP:
1868     case OP_NOTPROP:
1869     cc += 2;
1870     /* Fall through */
1871
1872     case OP_HSPACE:
1873     case OP_VSPACE:
1874     case OP_NOT_HSPACE:
1875     case OP_NOT_VSPACE:
1876     case OP_NOT_DIGIT:
1877     case OP_DIGIT:
1878     case OP_NOT_WHITESPACE:
1879     case OP_WHITESPACE:
1880     case OP_NOT_WORDCHAR:
1881     case OP_WORDCHAR:
1882     case OP_ANY:
1883     case OP_ALLANY:
1884     branchlength++;
1885     cc++;
1886     break;
1887
1888     /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1889     otherwise \C is coded as OP_ALLANY. */
1890
1891     case OP_ANYBYTE:
1892     return -2;
1893
1894     /* Check a class for variable quantification */
1895
1896 #if defined SUPPORT_UTF || defined COMPILE_PCRE16
1897     case OP_XCLASS:
1898     cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
1899     /* Fall through */
1900 #endif
1901
1902     case OP_CLASS:
1903     case OP_NCLASS:
1904     cc += PRIV(OP_lengths)[OP_CLASS];
1905
1906     switch (*cc)
1907       {
1908       case OP_CRPLUS:
1909       case OP_CRMINPLUS:
1910       case OP_CRSTAR:
1911       case OP_CRMINSTAR:
1912       case OP_CRQUERY:
1913       case OP_CRMINQUERY:
1914       return -1;
1915
1916       case OP_CRRANGE:
1917       case OP_CRMINRANGE:
1918       if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1919       branchlength += GET2(cc,1);
1920       cc += 1 + 2 * IMM2_SIZE;
1921       break;
1922
1923       default:
1924       branchlength++;
1925       }
1926     break;
1927
1928     /* Anything else is variable length */
1929
1930     case OP_ANYNL:
1931     case OP_BRAMINZERO:
1932     case OP_BRAPOS:
1933     case OP_BRAPOSZERO:
1934     case OP_BRAZERO:
1935     case OP_CBRAPOS:
1936     case OP_EXTUNI:
1937     case OP_KETRMAX:
1938     case OP_KETRMIN:
1939     case OP_KETRPOS:
1940     case OP_MINPLUS:
1941     case OP_MINPLUSI:
1942     case OP_MINQUERY:
1943     case OP_MINQUERYI:
1944     case OP_MINSTAR:
1945     case OP_MINSTARI:
1946     case OP_MINUPTO:
1947     case OP_MINUPTOI:
1948     case OP_NOTMINPLUS:
1949     case OP_NOTMINPLUSI:
1950     case OP_NOTMINQUERY:
1951     case OP_NOTMINQUERYI:
1952     case OP_NOTMINSTAR:
1953     case OP_NOTMINSTARI:
1954     case OP_NOTMINUPTO:
1955     case OP_NOTMINUPTOI:
1956     case OP_NOTPLUS:
1957     case OP_NOTPLUSI:
1958     case OP_NOTPOSPLUS:
1959     case OP_NOTPOSPLUSI:
1960     case OP_NOTPOSQUERY:
1961     case OP_NOTPOSQUERYI:
1962     case OP_NOTPOSSTAR:
1963     case OP_NOTPOSSTARI:
1964     case OP_NOTPOSUPTO:
1965     case OP_NOTPOSUPTOI:
1966     case OP_NOTQUERY:
1967     case OP_NOTQUERYI:
1968     case OP_NOTSTAR:
1969     case OP_NOTSTARI:
1970     case OP_NOTUPTO:
1971     case OP_NOTUPTOI:
1972     case OP_PLUS:
1973     case OP_PLUSI:
1974     case OP_POSPLUS:
1975     case OP_POSPLUSI:
1976     case OP_POSQUERY:
1977     case OP_POSQUERYI:
1978     case OP_POSSTAR:
1979     case OP_POSSTARI:
1980     case OP_POSUPTO:
1981     case OP_POSUPTOI:
1982     case OP_QUERY:
1983     case OP_QUERYI:
1984     case OP_REF:
1985     case OP_REFI:
1986     case OP_SBRA:
1987     case OP_SBRAPOS:
1988     case OP_SCBRA:
1989     case OP_SCBRAPOS:
1990     case OP_SCOND:
1991     case OP_SKIPZERO:
1992     case OP_STAR:
1993     case OP_STARI:
1994     case OP_TYPEMINPLUS:
1995     case OP_TYPEMINQUERY:
1996     case OP_TYPEMINSTAR:
1997     case OP_TYPEMINUPTO:
1998     case OP_TYPEPLUS:
1999     case OP_TYPEPOSPLUS:
2000     case OP_TYPEPOSQUERY:
2001     case OP_TYPEPOSSTAR:
2002     case OP_TYPEPOSUPTO:
2003     case OP_TYPEQUERY:
2004     case OP_TYPESTAR:
2005     case OP_TYPEUPTO:
2006     case OP_UPTO:
2007     case OP_UPTOI:
2008     return -1;
2009
2010     /* Catch unrecognized opcodes so that when new ones are added they
2011     are not forgotten, as has happened in the past. */
2012
2013     default:
2014     return -4;
2015     }
2016   }
2017 /* Control never gets here */
2018 }
2019
2020
2021
2022
2023 /*************************************************
2024 *    Scan compiled regex for specific bracket    *
2025 *************************************************/
2026
2027 /* This little function scans through a compiled pattern until it finds a
2028 capturing bracket with the given number, or, if the number is negative, an
2029 instance of OP_REVERSE for a lookbehind. The function is global in the C sense
2030 so that it can be called from pcre_study() when finding the minimum matching
2031 length.
2032
2033 Arguments:
2034   code        points to start of expression
2035   utf         TRUE in UTF-8 / UTF-16 mode
2036   number      the required bracket number or negative to find a lookbehind
2037
2038 Returns:      pointer to the opcode for the bracket, or NULL if not found
2039 */
2040
2041 const pcre_uchar *
2042 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2043 {
2044 for (;;)
2045   {
2046   int c = *code;
2047
2048   if (c == OP_END) return NULL;
2049
2050   /* XCLASS is used for classes that cannot be represented just by a bit
2051   map. This includes negated single high-valued characters. The length in
2052   the table is zero; the actual length is stored in the compiled code. */
2053
2054   if (c == OP_XCLASS) code += GET(code, 1);
2055
2056   /* Handle recursion */
2057
2058   else if (c == OP_REVERSE)
2059     {
2060     if (number < 0) return (pcre_uchar *)code;
2061     code += PRIV(OP_lengths)[c];
2062     }
2063
2064   /* Handle capturing bracket */
2065
2066   else if (c == OP_CBRA || c == OP_SCBRA ||
2067            c == OP_CBRAPOS || c == OP_SCBRAPOS)
2068     {
2069     int n = GET2(code, 1+LINK_SIZE);
2070     if (n == number) return (pcre_uchar *)code;
2071     code += PRIV(OP_lengths)[c];
2072     }
2073
2074   /* Otherwise, we can get the item's length from the table, except that for
2075   repeated character types, we have to test for \p and \P, which have an extra
2076   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2077   must add in its length. */
2078
2079   else
2080     {
2081     switch(c)
2082       {
2083       case OP_TYPESTAR:
2084       case OP_TYPEMINSTAR:
2085       case OP_TYPEPLUS:
2086       case OP_TYPEMINPLUS:
2087       case OP_TYPEQUERY:
2088       case OP_TYPEMINQUERY:
2089       case OP_TYPEPOSSTAR:
2090       case OP_TYPEPOSPLUS:
2091       case OP_TYPEPOSQUERY:
2092       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2093       break;
2094
2095       case OP_TYPEUPTO:
2096       case OP_TYPEMINUPTO:
2097       case OP_TYPEEXACT:
2098       case OP_TYPEPOSUPTO:
2099       if (code[1 + IMM2_SIZE] == OP_PROP
2100         || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2101       break;
2102
2103       case OP_MARK:
2104       case OP_PRUNE_ARG:
2105       case OP_SKIP_ARG:
2106       code += code[1];
2107       break;
2108
2109       case OP_THEN_ARG:
2110       code += code[1];
2111       break;
2112       }
2113
2114     /* Add in the fixed length from the table */
2115
2116     code += PRIV(OP_lengths)[c];
2117
2118   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2119   a multi-byte character. The length in the table is a minimum, so we have to
2120   arrange to skip the extra bytes. */
2121
2122 #ifdef SUPPORT_UTF
2123     if (utf) switch(c)
2124       {
2125       case OP_CHAR:
2126       case OP_CHARI:
2127       case OP_EXACT:
2128       case OP_EXACTI:
2129       case OP_UPTO:
2130       case OP_UPTOI:
2131       case OP_MINUPTO:
2132       case OP_MINUPTOI:
2133       case OP_POSUPTO:
2134       case OP_POSUPTOI:
2135       case OP_STAR:
2136       case OP_STARI:
2137       case OP_MINSTAR:
2138       case OP_MINSTARI:
2139       case OP_POSSTAR:
2140       case OP_POSSTARI:
2141       case OP_PLUS:
2142       case OP_PLUSI:
2143       case OP_MINPLUS:
2144       case OP_MINPLUSI:
2145       case OP_POSPLUS:
2146       case OP_POSPLUSI:
2147       case OP_QUERY:
2148       case OP_QUERYI:
2149       case OP_MINQUERY:
2150       case OP_MINQUERYI:
2151       case OP_POSQUERY:
2152       case OP_POSQUERYI:
2153       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2154       break;
2155       }
2156 #else
2157     (void)(utf);  /* Keep compiler happy by referencing function argument */
2158 #endif
2159     }
2160   }
2161 }
2162
2163
2164
2165 /*************************************************
2166 *   Scan compiled regex for recursion reference  *
2167 *************************************************/
2168
2169 /* This little function scans through a compiled pattern until it finds an
2170 instance of OP_RECURSE.
2171
2172 Arguments:
2173   code        points to start of expression
2174   utf         TRUE in UTF-8 / UTF-16 mode
2175
2176 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
2177 */
2178
2179 static const pcre_uchar *
2180 find_recurse(const pcre_uchar *code, BOOL utf)
2181 {
2182 for (;;)
2183   {
2184   int c = *code;
2185   if (c == OP_END) return NULL;
2186   if (c == OP_RECURSE) return code;
2187
2188   /* XCLASS is used for classes that cannot be represented just by a bit
2189   map. This includes negated single high-valued characters. The length in
2190   the table is zero; the actual length is stored in the compiled code. */
2191
2192   if (c == OP_XCLASS) code += GET(code, 1);
2193
2194   /* Otherwise, we can get the item's length from the table, except that for
2195   repeated character types, we have to test for \p and \P, which have an extra
2196   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2197   must add in its length. */
2198
2199   else
2200     {
2201     switch(c)
2202       {
2203       case OP_TYPESTAR:
2204       case OP_TYPEMINSTAR:
2205       case OP_TYPEPLUS:
2206       case OP_TYPEMINPLUS:
2207       case OP_TYPEQUERY:
2208       case OP_TYPEMINQUERY:
2209       case OP_TYPEPOSSTAR:
2210       case OP_TYPEPOSPLUS:
2211       case OP_TYPEPOSQUERY:
2212       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2213       break;
2214
2215       case OP_TYPEPOSUPTO:
2216       case OP_TYPEUPTO:
2217       case OP_TYPEMINUPTO:
2218       case OP_TYPEEXACT:
2219       if (code[1 + IMM2_SIZE] == OP_PROP
2220         || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2221       break;
2222
2223       case OP_MARK:
2224       case OP_PRUNE_ARG:
2225       case OP_SKIP_ARG:
2226       code += code[1];
2227       break;
2228
2229       case OP_THEN_ARG:
2230       code += code[1];
2231       break;
2232       }
2233
2234     /* Add in the fixed length from the table */
2235
2236     code += PRIV(OP_lengths)[c];
2237
2238     /* In UTF-8 mode, opcodes that are followed by a character may be followed
2239     by a multi-byte character. The length in the table is a minimum, so we have
2240     to arrange to skip the extra bytes. */
2241
2242 #ifdef SUPPORT_UTF
2243     if (utf) switch(c)
2244       {
2245       case OP_CHAR:
2246       case OP_CHARI:
2247       case OP_NOT:
2248       case OP_NOTI:
2249       case OP_EXACT:
2250       case OP_EXACTI:
2251       case OP_NOTEXACT:
2252       case OP_NOTEXACTI:
2253       case OP_UPTO:
2254       case OP_UPTOI:
2255       case OP_NOTUPTO:
2256       case OP_NOTUPTOI:
2257       case OP_MINUPTO:
2258       case OP_MINUPTOI:
2259       case OP_NOTMINUPTO:
2260       case OP_NOTMINUPTOI:
2261       case OP_POSUPTO:
2262       case OP_POSUPTOI:
2263       case OP_NOTPOSUPTO:
2264       case OP_NOTPOSUPTOI:
2265       case OP_STAR:
2266       case OP_STARI:
2267       case OP_NOTSTAR:
2268       case OP_NOTSTARI:
2269       case OP_MINSTAR:
2270       case OP_MINSTARI:
2271       case OP_NOTMINSTAR:
2272       case OP_NOTMINSTARI:
2273       case OP_POSSTAR:
2274       case OP_POSSTARI:
2275       case OP_NOTPOSSTAR:
2276       case OP_NOTPOSSTARI:
2277       case OP_PLUS:
2278       case OP_PLUSI:
2279       case OP_NOTPLUS:
2280       case OP_NOTPLUSI:
2281       case OP_MINPLUS:
2282       case OP_MINPLUSI:
2283       case OP_NOTMINPLUS:
2284       case OP_NOTMINPLUSI:
2285       case OP_POSPLUS:
2286       case OP_POSPLUSI:
2287       case OP_NOTPOSPLUS:
2288       case OP_NOTPOSPLUSI:
2289       case OP_QUERY:
2290       case OP_QUERYI:
2291       case OP_NOTQUERY:
2292       case OP_NOTQUERYI:
2293       case OP_MINQUERY:
2294       case OP_MINQUERYI:
2295       case OP_NOTMINQUERY:
2296       case OP_NOTMINQUERYI:
2297       case OP_POSQUERY:
2298       case OP_POSQUERYI:
2299       case OP_NOTPOSQUERY:
2300       case OP_NOTPOSQUERYI:
2301       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2302       break;
2303       }
2304 #else
2305     (void)(utf);  /* Keep compiler happy by referencing function argument */
2306 #endif
2307     }
2308   }
2309 }
2310
2311
2312
2313 /*************************************************
2314 *    Scan compiled branch for non-emptiness      *
2315 *************************************************/
2316
2317 /* This function scans through a branch of a compiled pattern to see whether it
2318 can match the empty string or not. It is called from could_be_empty()
2319 below and from compile_branch() when checking for an unlimited repeat of a
2320 group that can match nothing. Note that first_significant_code() skips over
2321 backward and negative forward assertions when its final argument is TRUE. If we
2322 hit an unclosed bracket, we return "empty" - this means we've struck an inner
2323 bracket whose current branch will already have been scanned.
2324
2325 Arguments:
2326   code        points to start of search
2327   endcode     points to where to stop
2328   utf         TRUE if in UTF-8 / UTF-16 mode
2329   cd          contains pointers to tables etc.
2330
2331 Returns:      TRUE if what is matched could be empty
2332 */
2333
2334 static BOOL
2335 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2336   BOOL utf, compile_data *cd)
2337 {
2338 int c;
2339 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2340      code < endcode;
2341      code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2342   {
2343   const pcre_uchar *ccode;
2344
2345   c = *code;
2346
2347   /* Skip over forward assertions; the other assertions are skipped by
2348   first_significant_code() with a TRUE final argument. */
2349
2350   if (c == OP_ASSERT)
2351     {
2352     do code += GET(code, 1); while (*code == OP_ALT);
2353     c = *code;
2354     continue;
2355     }
2356
2357   /* For a recursion/subroutine call, if its end has been reached, which
2358   implies a backward reference subroutine call, we can scan it. If it's a
2359   forward reference subroutine call, we can't. To detect forward reference
2360   we have to scan up the list that is kept in the workspace. This function is
2361   called only when doing the real compile, not during the pre-compile that
2362   measures the size of the compiled pattern. */
2363
2364   if (c == OP_RECURSE)
2365     {
2366     const pcre_uchar *scode;
2367     BOOL empty_branch;
2368
2369     /* Test for forward reference */
2370
2371     for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
2372       if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
2373
2374     /* Not a forward reference, test for completed backward reference */
2375
2376     empty_branch = FALSE;
2377     scode = cd->start_code + GET(code, 1);
2378     if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2379
2380     /* Completed backwards reference */
2381
2382     do
2383       {
2384       if (could_be_empty_branch(scode, endcode, utf, cd))
2385         {
2386         empty_branch = TRUE;
2387         break;
2388         }
2389       scode += GET(scode, 1);
2390       }
2391     while (*scode == OP_ALT);
2392
2393     if (!empty_branch) return FALSE;  /* All branches are non-empty */
2394     continue;
2395     }
2396
2397   /* Groups with zero repeats can of course be empty; skip them. */
2398
2399   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2400       c == OP_BRAPOSZERO)
2401     {
2402     code += PRIV(OP_lengths)[c];
2403     do code += GET(code, 1); while (*code == OP_ALT);
2404     c = *code;
2405     continue;
2406     }
2407
2408   /* A nested group that is already marked as "could be empty" can just be
2409   skipped. */
2410
2411   if (c == OP_SBRA  || c == OP_SBRAPOS ||
2412       c == OP_SCBRA || c == OP_SCBRAPOS)
2413     {
2414     do code += GET(code, 1); while (*code == OP_ALT);
2415     c = *code;
2416     continue;
2417     }
2418
2419   /* For other groups, scan the branches. */
2420
2421   if (c == OP_BRA  || c == OP_BRAPOS ||
2422       c == OP_CBRA || c == OP_CBRAPOS ||
2423       c == OP_ONCE || c == OP_ONCE_NC ||
2424       c == OP_COND)
2425     {
2426     BOOL empty_branch;
2427     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
2428
2429     /* If a conditional group has only one branch, there is a second, implied,
2430     empty branch, so just skip over the conditional, because it could be empty.
2431     Otherwise, scan the individual branches of the group. */
2432
2433     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2434       code += GET(code, 1);
2435     else
2436       {
2437       empty_branch = FALSE;
2438       do
2439         {
2440         if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd))
2441           empty_branch = TRUE;
2442         code += GET(code, 1);
2443         }
2444       while (*code == OP_ALT);
2445       if (!empty_branch) return FALSE;   /* All branches are non-empty */
2446       }
2447
2448     c = *code;
2449     continue;
2450     }
2451
2452   /* Handle the other opcodes */
2453
2454   switch (c)
2455     {
2456     /* Check for quantifiers after a class. XCLASS is used for classes that
2457     cannot be represented just by a bit map. This includes negated single
2458     high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2459     actual length is stored in the compiled code, so we must update "code"
2460     here. */
2461
2462 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2463     case OP_XCLASS:
2464     ccode = code += GET(code, 1);
2465     goto CHECK_CLASS_REPEAT;
2466 #endif
2467
2468     case OP_CLASS:
2469     case OP_NCLASS:
2470     ccode = code + PRIV(OP_lengths)[OP_CLASS];
2471
2472 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2473     CHECK_CLASS_REPEAT:
2474 #endif
2475
2476     switch (*ccode)
2477       {
2478       case OP_CRSTAR:            /* These could be empty; continue */
2479       case OP_CRMINSTAR:
2480       case OP_CRQUERY:
2481       case OP_CRMINQUERY:
2482       break;
2483
2484       default:                   /* Non-repeat => class must match */
2485       case OP_CRPLUS:            /* These repeats aren't empty */
2486       case OP_CRMINPLUS:
2487       return FALSE;
2488
2489       case OP_CRRANGE:
2490       case OP_CRMINRANGE:
2491       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2492       break;
2493       }
2494     break;
2495
2496     /* Opcodes that must match a character */
2497
2498     case OP_PROP:
2499     case OP_NOTPROP:
2500     case OP_EXTUNI:
2501     case OP_NOT_DIGIT:
2502     case OP_DIGIT:
2503     case OP_NOT_WHITESPACE:
2504     case OP_WHITESPACE:
2505     case OP_NOT_WORDCHAR:
2506     case OP_WORDCHAR:
2507     case OP_ANY:
2508     case OP_ALLANY:
2509     case OP_ANYBYTE:
2510     case OP_CHAR:
2511     case OP_CHARI:
2512     case OP_NOT:
2513     case OP_NOTI:
2514     case OP_PLUS:
2515     case OP_MINPLUS:
2516     case OP_POSPLUS:
2517     case OP_EXACT:
2518     case OP_NOTPLUS:
2519     case OP_NOTMINPLUS:
2520     case OP_NOTPOSPLUS:
2521     case OP_NOTEXACT:
2522     case OP_TYPEPLUS:
2523     case OP_TYPEMINPLUS:
2524     case OP_TYPEPOSPLUS:
2525     case OP_TYPEEXACT:
2526     return FALSE;
2527
2528     /* These are going to continue, as they may be empty, but we have to
2529     fudge the length for the \p and \P cases. */
2530
2531     case OP_TYPESTAR:
2532     case OP_TYPEMINSTAR:
2533     case OP_TYPEPOSSTAR:
2534     case OP_TYPEQUERY:
2535     case OP_TYPEMINQUERY:
2536     case OP_TYPEPOSQUERY:
2537     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2538     break;
2539
2540     /* Same for these */
2541
2542     case OP_TYPEUPTO:
2543     case OP_TYPEMINUPTO:
2544     case OP_TYPEPOSUPTO:
2545     if (code[1 + IMM2_SIZE] == OP_PROP
2546       || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2547     break;
2548
2549     /* End of branch */
2550
2551     case OP_KET:
2552     case OP_KETRMAX:
2553     case OP_KETRMIN:
2554     case OP_KETRPOS:
2555     case OP_ALT:
2556     return TRUE;
2557
2558     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2559     MINUPTO, and POSUPTO may be followed by a multibyte character */
2560
2561 #ifdef SUPPORT_UTF
2562     case OP_STAR:
2563     case OP_STARI:
2564     case OP_MINSTAR:
2565     case OP_MINSTARI:
2566     case OP_POSSTAR:
2567     case OP_POSSTARI:
2568     case OP_QUERY:
2569     case OP_QUERYI:
2570     case OP_MINQUERY:
2571     case OP_MINQUERYI:
2572     case OP_POSQUERY:
2573     case OP_POSQUERYI:
2574     if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2575     break;
2576
2577     case OP_UPTO:
2578     case OP_UPTOI:
2579     case OP_MINUPTO:
2580     case OP_MINUPTOI:
2581     case OP_POSUPTO:
2582     case OP_POSUPTOI:
2583     if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2584     break;
2585 #endif
2586
2587     /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2588     string. */
2589
2590     case OP_MARK:
2591     case OP_PRUNE_ARG:
2592     case OP_SKIP_ARG:
2593     code += code[1];
2594     break;
2595
2596     case OP_THEN_ARG:
2597     code += code[1];
2598     break;
2599
2600     /* None of the remaining opcodes are required to match a character. */
2601
2602     default:
2603     break;
2604     }
2605   }
2606
2607 return TRUE;
2608 }
2609
2610
2611
2612 /*************************************************
2613 *    Scan compiled regex for non-emptiness       *
2614 *************************************************/
2615
2616 /* This function is called to check for left recursive calls. We want to check
2617 the current branch of the current pattern to see if it could match the empty
2618 string. If it could, we must look outwards for branches at other levels,
2619 stopping when we pass beyond the bracket which is the subject of the recursion.
2620 This function is called only during the real compile, not during the
2621 pre-compile.
2622
2623 Arguments:
2624   code        points to start of the recursion
2625   endcode     points to where to stop (current RECURSE item)
2626   bcptr       points to the chain of current (unclosed) branch starts
2627   utf         TRUE if in UTF-8 / UTF-16 mode
2628   cd          pointers to tables etc
2629
2630 Returns:      TRUE if what is matched could be empty
2631 */
2632
2633 static BOOL
2634 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2635   branch_chain *bcptr, BOOL utf, compile_data *cd)
2636 {
2637 while (bcptr != NULL && bcptr->current_branch >= code)
2638   {
2639   if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd))
2640     return FALSE;
2641   bcptr = bcptr->outer;
2642   }
2643 return TRUE;
2644 }
2645
2646
2647
2648 /*************************************************
2649 *           Check for POSIX class syntax         *
2650 *************************************************/
2651
2652 /* This function is called when the sequence "[:" or "[." or "[=" is
2653 encountered in a character class. It checks whether this is followed by a
2654 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2655 reach an unescaped ']' without the special preceding character, return FALSE.
2656
2657 Originally, this function only recognized a sequence of letters between the
2658 terminators, but it seems that Perl recognizes any sequence of characters,
2659 though of course unknown POSIX names are subsequently rejected. Perl gives an
2660 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2661 didn't consider this to be a POSIX class. Likewise for [:1234:].
2662
2663 The problem in trying to be exactly like Perl is in the handling of escapes. We
2664 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2665 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2666 below handles the special case of \], but does not try to do any other escape
2667 processing. This makes it different from Perl for cases such as [:l\ower:]
where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
"l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
2670 I think.
2671
2672 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2673 It seems that the appearance of a nested POSIX class supersedes an apparent
2674 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2675 a digit.
2676
2677 In Perl, unescaped square brackets may also appear as part of class names. For
2678 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2679 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2680 seem right at all. PCRE does not allow closing square brackets in POSIX class
2681 names.
2682
2683 Arguments:
2684   ptr      pointer to the initial [
2685   endptr   where to return the end pointer
2686
2687 Returns:   TRUE or FALSE
2688 */
2689
2690 static BOOL
2691 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
2692 {
2693 int terminator;          /* Don't combine these lines; the Solaris cc */
2694 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
2695 for (++ptr; *ptr != 0; ptr++)
2696   {
2697   if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2698     ptr++;
2699   else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2700   else
2701     {
2702     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2703       {
2704       *endptr = ptr;
2705       return TRUE;
2706       }
2707     if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2708          (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2709           ptr[1] == CHAR_EQUALS_SIGN) &&
2710         check_posix_syntax(ptr, endptr))
2711       return FALSE;
2712     }
2713   }
2714 return FALSE;
2715 }
2716
2717
2718
2719
2720 /*************************************************
2721 *          Check POSIX class name                *
2722 *************************************************/
2723
2724 /* This function is called to check the name given in a POSIX-style class entry
2725 such as [:alnum:].
2726
2727 Arguments:
2728   ptr        points to the first letter
2729   len        the length of the name
2730
2731 Returns:     a value representing the name, or -1 if unknown
2732 */
2733
2734 static int
2735 check_posix_name(const pcre_uchar *ptr, int len)
2736 {
2737 const char *pn = posix_names;
2738 int yield = 0;
2739 while (posix_name_lengths[yield] != 0)
2740   {
2741   if (len == posix_name_lengths[yield] &&
2742     STRNCMP_UC_C8(ptr, pn, len) == 0) return yield;
2743   pn += posix_name_lengths[yield] + 1;
2744   yield++;
2745   }
2746 return -1;
2747 }
2748
2749
2750 /*************************************************
2751 *    Adjust OP_RECURSE items in repeated group   *
2752 *************************************************/
2753
2754 /* OP_RECURSE items contain an offset from the start of the regex to the group
2755 that is referenced. This means that groups can be replicated for fixed
2756 repetition simply by copying (because the recursion is allowed to refer to
2757 earlier groups that are outside the current group). However, when a group is
2758 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2759 inserted before it, after it has been compiled. This means that any OP_RECURSE
2760 items within it that refer to the group itself or any contained groups have to
have their offsets adjusted. That is one of the jobs of this function. Before it
2762 is called, the partially compiled regex must be temporarily terminated with
2763 OP_END.
2764
2765 This function has been extended with the possibility of forward references for
2766 recursions and subroutine calls. It must also check the list of such references
2767 for the group we are dealing with. If it finds that one of the recursions in
2768 the current group is on this list, it adjusts the offset in the list, not the
2769 value in the reference (which is a group number).
2770
2771 Arguments:
2772   group      points to the start of the group
2773   adjust     the amount by which the group is to be moved
2774   utf        TRUE in UTF-8 / UTF-16 mode
2775   cd         contains pointers to tables etc.
2776   save_hwm   the hwm forward reference pointer at the start of the group
2777
2778 Returns:     nothing
2779 */
2780
2781 static void
2782 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
2783   pcre_uchar *save_hwm)
2784 {
2785 pcre_uchar *ptr = group;
2786
2787 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
2788   {
2789   int offset;
2790   pcre_uchar *hc;
2791
2792   /* See if this recursion is on the forward reference list. If so, adjust the
2793   reference. */
2794
2795   for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2796     {
2797     offset = GET(hc, 0);
2798     if (cd->start_code + offset == ptr + 1)
2799       {
2800       PUT(hc, 0, offset + adjust);
2801       break;
2802       }
2803     }
2804
2805   /* Otherwise, adjust the recursion offset if it's after the start of this
2806   group. */
2807
2808   if (hc >= cd->hwm)
2809     {
2810     offset = GET(ptr, 1);
2811     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2812     }
2813
2814   ptr += 1 + LINK_SIZE;
2815   }
2816 }
2817
2818
2819
2820 /*************************************************
2821 *        Insert an automatic callout point       *
2822 *************************************************/
2823
2824 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2825 callout points before each pattern item.
2826
2827 Arguments:
2828   code           current code pointer
2829   ptr            current pattern pointer
2830   cd             pointers to tables etc
2831
2832 Returns:         new code pointer
2833 */
2834
2835 static pcre_uchar *
2836 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
2837 {
2838 *code++ = OP_CALLOUT;
2839 *code++ = 255;
2840 PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
2841 PUT(code, LINK_SIZE, 0);                       /* Default length */
2842 return code + 2 * LINK_SIZE;
2843 }
2844
2845
2846
2847 /*************************************************
2848 *         Complete a callout item                *
2849 *************************************************/
2850
2851 /* A callout item contains the length of the next item in the pattern, which
2852 we can't fill in till after we have reached the relevant point. This is used
2853 for both automatic and manual callouts.
2854
2855 Arguments:
2856   previous_callout   points to previous callout item
2857   ptr                current pattern pointer
2858   cd                 pointers to tables etc
2859
2860 Returns:             nothing
2861 */
2862
2863 static void
2864 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
2865 {
2866 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2867 PUT(previous_callout, 2 + LINK_SIZE, length);
2868 }
2869
2870
2871
2872 #ifdef SUPPORT_UCP
2873 /*************************************************
2874 *           Get othercase range                  *
2875 *************************************************/
2876
2877 /* This function is passed the start and end of a class range, in UTF-8 mode
2878 with UCP support. It searches up the characters, looking for internal ranges of
2879 characters in the "other" case. Each call returns the next one, updating the
2880 start address.
2881
2882 Arguments:
2883   cptr        points to starting character value; updated
2884   d           end value
2885   ocptr       where to put start of othercase range
2886   odptr       where to put end of othercase range
2887
2888 Yield:        TRUE when range returned; FALSE when no more
2889 */
2890
2891 static BOOL
2892 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2893   unsigned int *odptr)
2894 {
2895 unsigned int c, othercase, next;
2896
2897 for (c = *cptr; c <= d; c++)
2898   { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2899
2900 if (c > d) return FALSE;
2901
2902 *ocptr = othercase;
2903 next = othercase + 1;
2904
2905 for (++c; c <= d; c++)
2906   {
2907   if (UCD_OTHERCASE(c) != next) break;
2908   next++;
2909   }
2910
2911 *odptr = next - 1;
2912 *cptr = c;
2913
2914 return TRUE;
2915 }
2916
2917
2918
2919 /*************************************************
2920 *        Check a character and a property        *
2921 *************************************************/
2922
2923 /* This function is called by check_auto_possessive() when a property item
2924 is adjacent to a fixed character.
2925
2926 Arguments:
2927   c            the character
2928   ptype        the property type
2929   pdata        the data for the type
2930   negated      TRUE if it's a negated property (\P or \p{^)
2931
2932 Returns:       TRUE if auto-possessifying is OK
2933 */
2934
2935 static BOOL
2936 check_char_prop(int c, int ptype, int pdata, BOOL negated)
2937 {
2938 const pcre_uint8 chartype = UCD_CHARTYPE(c);
2939 switch(ptype)
2940   {
2941   case PT_LAMP:
2942   return (chartype == ucp_Lu ||
2943           chartype == ucp_Ll ||
2944           chartype == ucp_Lt) == negated;
2945
2946   case PT_GC:
2947   return (pdata == PRIV(ucp_gentype)[chartype]) == negated;
2948
2949   case PT_PC:
2950   return (pdata == chartype) == negated;
2951
2952   case PT_SC:
2953   return (pdata == UCD_SCRIPT(c)) == negated;
2954
2955   /* These are specials */
2956
2957   case PT_ALNUM:
2958   return (PRIV(ucp_gentype)[chartype] == ucp_L ||
2959           PRIV(ucp_gentype)[chartype] == ucp_N) == negated;
2960
2961   case PT_SPACE:    /* Perl space */
2962   return (PRIV(ucp_gentype)[chartype] == ucp_Z ||
2963           c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2964           == negated;
2965
2966   case PT_PXSPACE:  /* POSIX space */
2967   return (PRIV(ucp_gentype)[chartype] == ucp_Z ||
2968           c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2969           c == CHAR_FF || c == CHAR_CR)
2970           == negated;
2971
2972   case PT_WORD:
2973   return (PRIV(ucp_gentype)[chartype] == ucp_L ||
2974           PRIV(ucp_gentype)[chartype] == ucp_N ||
2975           c == CHAR_UNDERSCORE) == negated;
2976   }
2977 return FALSE;
2978 }
2979 #endif  /* SUPPORT_UCP */
2980
2981
2982
2983 /*************************************************
2984 *     Check if auto-possessifying is possible    *
2985 *************************************************/
2986
2987 /* This function is called for unlimited repeats of certain items, to see
2988 whether the next thing could possibly match the repeated item. If not, it makes
2989 sense to automatically possessify the repeated item.
2990
2991 Arguments:
2992   previous      pointer to the repeated opcode
2993   utf           TRUE in UTF-8 / UTF-16 mode
2994   ptr           next character in pattern
2995   options       options bits
2996   cd            contains pointers to tables etc.
2997
2998 Returns:        TRUE if possessifying is wanted
2999 */
3000
3001 static BOOL
3002 check_auto_possessive(const pcre_uchar *previous, BOOL utf,
3003   const pcre_uchar *ptr, int options, compile_data *cd)
3004 {
3005 pcre_int32 c, next;
3006 int op_code = *previous++;
3007
3008 /* Skip whitespace and comments in extended mode */
3009
3010 if ((options & PCRE_EXTENDED) != 0)
3011   {
3012   for (;;)
3013     {
3014     while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3015     if (*ptr == CHAR_NUMBER_SIGN)
3016       {
3017       ptr++;
3018       while (*ptr != 0)
3019         {
3020         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3021         ptr++;
3022 #ifdef SUPPORT_UTF
3023         if (utf) FORWARDCHAR(ptr);
3024 #endif
3025         }
3026       }
3027     else break;
3028     }
3029   }
3030
3031 /* If the next item is one that we can handle, get its value. A non-negative
3032 value is a character, a negative value is an escape value. */
3033
3034 if (*ptr == CHAR_BACKSLASH)
3035   {
3036   int temperrorcode = 0;
3037   next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
3038   if (temperrorcode != 0) return FALSE;
3039   ptr++;    /* Point after the escape sequence */
3040   }
3041 else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
3042   {
3043 #ifdef SUPPORT_UTF
3044   if (utf) { GETCHARINC(next, ptr); } else
3045 #endif
3046   next = *ptr++;
3047   }
3048 else return FALSE;
3049
3050 /* Skip whitespace and comments in extended mode */
3051
3052 if ((options & PCRE_EXTENDED) != 0)
3053   {
3054   for (;;)
3055     {
3056     while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3057     if (*ptr == CHAR_NUMBER_SIGN)
3058       {
3059       ptr++;
3060       while (*ptr != 0)
3061         {
3062         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3063         ptr++;
3064 #ifdef SUPPORT_UTF
3065         if (utf) FORWARDCHAR(ptr);
3066 #endif
3067         }
3068       }
3069     else break;
3070     }
3071   }
3072
3073 /* If the next thing is itself optional, we have to give up. */
3074
3075 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3076   STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3077     return FALSE;
3078
3079 /* Now compare the next item with the previous opcode. First, handle cases when
3080 the next item is a character. */
3081
3082 if (next >= 0) switch(op_code)
3083   {
3084   case OP_CHAR:
3085 #ifdef SUPPORT_UTF
3086   GETCHARTEST(c, previous);
3087 #else
3088   c = *previous;
3089 #endif
3090   return c != next;
3091
3092   /* For CHARI (caseless character) we must check the other case. If we have
3093   Unicode property support, we can use it to test the other case of
3094   high-valued characters. */
3095
3096   case OP_CHARI:
3097 #ifdef SUPPORT_UTF
3098   GETCHARTEST(c, previous);
3099 #else
3100   c = *previous;
3101 #endif
3102   if (c == next) return FALSE;
3103 #ifdef SUPPORT_UTF
3104   if (utf)
3105     {
3106     unsigned int othercase;
3107     if (next < 128) othercase = cd->fcc[next]; else
3108 #ifdef SUPPORT_UCP
3109     othercase = UCD_OTHERCASE((unsigned int)next);
3110 #else
3111     othercase = NOTACHAR;
3112 #endif
3113     return (unsigned int)c != othercase;
3114     }
3115   else
3116 #endif  /* SUPPORT_UTF */
3117   return (c != TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
3118
3119   case OP_NOT:
3120 #ifdef SUPPORT_UTF
3121   GETCHARTEST(c, previous);
3122 #else
3123   c = *previous;
3124 #endif
3125   return c == next;
3126
3127   case OP_NOTI:
3128 #ifdef SUPPORT_UTF
3129   GETCHARTEST(c, previous);
3130 #else
3131   c = *previous;
3132 #endif
3133   if (c == next) return TRUE;
3134 #ifdef SUPPORT_UTF
3135   if (utf)
3136     {
3137     unsigned int othercase;
3138     if (next < 128) othercase = cd->fcc[next]; else
3139 #ifdef SUPPORT_UCP
3140     othercase = UCD_OTHERCASE((unsigned int)next);
3141 #else
3142     othercase = NOTACHAR;
3143 #endif
3144     return (unsigned int)c == othercase;
3145     }
3146   else
3147 #endif  /* SUPPORT_UTF */
3148   return (c == TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
3149
3150   /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
3151   When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3152
3153   case OP_DIGIT:
3154   return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;
3155
3156   case OP_NOT_DIGIT:
3157   return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;
3158
3159   case OP_WHITESPACE:
3160   return next > 255 || (cd->ctypes[next] & ctype_space) == 0;
3161
3162   case OP_NOT_WHITESPACE:
3163   return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;
3164
3165   case OP_WORDCHAR:
3166   return next > 255 || (cd->ctypes[next] & ctype_word) == 0;
3167
3168   case OP_NOT_WORDCHAR:
3169   return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;
3170
3171   case OP_HSPACE:
3172   case OP_NOT_HSPACE:
3173   switch(next)
3174     {
3175     case 0x09:
3176     case 0x20:
3177     case 0xa0:
3178     case 0x1680:
3179     case 0x180e:
3180     case 0x2000:
3181     case 0x2001:
3182     case 0x2002:
3183     case 0x2003:
3184     case 0x2004:
3185     case 0x2005:
3186     case 0x2006:
3187     case 0x2007:
3188     case 0x2008:
3189     case 0x2009:
3190     case 0x200A:
3191     case 0x202f:
3192     case 0x205f:
3193     case 0x3000:
3194     return op_code == OP_NOT_HSPACE;
3195     default:
3196     return op_code != OP_NOT_HSPACE;
3197     }
3198
3199   case OP_ANYNL:
3200   case OP_VSPACE:
3201   case OP_NOT_VSPACE:
3202   switch(next)
3203     {
3204     case 0x0a:
3205     case 0x0b:
3206     case 0x0c:
3207     case 0x0d:
3208     case 0x85:
3209     case 0x2028:
3210     case 0x2029:
3211     return op_code == OP_NOT_VSPACE;
3212     default:
3213     return op_code != OP_NOT_VSPACE;
3214     }
3215
3216 #ifdef SUPPORT_UCP
3217   case OP_PROP:
3218   return check_char_prop(next, previous[0], previous[1], FALSE);
3219
3220   case OP_NOTPROP:
3221   return check_char_prop(next, previous[0], previous[1], TRUE);
3222 #endif
3223
3224   default:
3225   return FALSE;
3226   }
3227
3228
3229 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
3230 is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
3231 generated only when PCRE_UCP is *not* set, that is, when only ASCII
3232 characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
3233 replaced by OP_PROP codes when PCRE_UCP is set. */
3234
3235 switch(op_code)
3236   {
3237   case OP_CHAR:
3238   case OP_CHARI:
3239 #ifdef SUPPORT_UTF
3240   GETCHARTEST(c, previous);
3241 #else
3242   c = *previous;
3243 #endif
3244   switch(-next)
3245     {
3246     case ESC_d:
3247     return c > 255 || (cd->ctypes[c] & ctype_digit) == 0;
3248
3249     case ESC_D:
3250     return c <= 255 && (cd->ctypes[c] & ctype_digit) != 0;
3251
3252     case ESC_s:
3253     return c > 255 || (cd->ctypes[c] & ctype_space) == 0;
3254
3255     case ESC_S:
3256     return c <= 255 && (cd->ctypes[c] & ctype_space) != 0;
3257
3258     case ESC_w:
3259     return c > 255 || (cd->ctypes[c] & ctype_word) == 0;
3260
3261     case ESC_W:
3262     return c <= 255 && (cd->ctypes[c] & ctype_word) != 0;
3263
3264     case ESC_h:
3265     case ESC_H:
3266     switch(c)
3267       {
3268       case 0x09:
3269       case 0x20:
3270       case 0xa0:
3271       case 0x1680:
3272       case 0x180e:
3273       case 0x2000:
3274       case 0x2001:
3275       case 0x2002:
3276       case 0x2003:
3277       case 0x2004:
3278       case 0x2005:
3279       case 0x2006:
3280       case 0x2007:
3281       case 0x2008:
3282       case 0x2009:
3283       case 0x200A:
3284       case 0x202f:
3285       case 0x205f:
3286       case 0x3000:
3287       return -next != ESC_h;
3288       default:
3289       return -next == ESC_h;
3290       }
3291
3292     case ESC_v:
3293     case ESC_V:
3294     switch(c)
3295       {
3296       case 0x0a:
3297       case 0x0b:
3298       case 0x0c:
3299       case 0x0d:
3300       case 0x85:
3301       case 0x2028:
3302       case 0x2029:
3303       return -next != ESC_v;
3304       default:
3305       return -next == ESC_v;
3306       }
3307
3308     /* When PCRE_UCP is set, these values get generated for \d etc. Find
3309     their substitutions and process them. The result will always be either
3310     -ESC_p or -ESC_P. Then fall through to process those values. */
3311
3312 #ifdef SUPPORT_UCP
3313     case ESC_du:
3314     case ESC_DU:
3315     case ESC_wu:
3316     case ESC_WU:
3317     case ESC_su:
3318     case ESC_SU:
3319       {
3320       int temperrorcode = 0;
3321       ptr = substitutes[-next - ESC_DU];
3322       next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
3323       if (temperrorcode != 0) return FALSE;
3324       ptr++;    /* For compatibility */
3325       }
3326     /* Fall through */
3327
3328     case ESC_p:
3329     case ESC_P:
3330       {
3331       int ptype, pdata, errorcodeptr;
3332       BOOL negated;
3333
3334       ptr--;      /* Make ptr point at the p or P */
3335       ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
3336       if (ptype < 0) return FALSE;
3337       ptr++;      /* Point past the final curly ket */
3338
3339       /* If the property item is optional, we have to give up. (When generated
3340       from \d etc by PCRE_UCP, this test will have been applied much earlier,
3341       to the original \d etc. At this point, ptr will point to a zero byte. */
3342
3343       if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3344         STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3345           return FALSE;
3346
3347       /* Do the property check. */
3348
3349       return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
3350       }
3351 #endif
3352
3353     default:
3354     return FALSE;
3355     }
3356
3357   /* In principle, support for Unicode properties should be integrated here as
3358   well. It means re-organizing the above code so as to get hold of the property
3359   values before switching on the op-code. However, I wonder how many patterns
3360   combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
3361   these op-codes are never generated.) */
3362
3363   case OP_DIGIT:
3364   return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
3365          next == -ESC_h || next == -ESC_v || next == -ESC_R;
3366
3367   case OP_NOT_DIGIT:
3368   return next == -ESC_d;
3369
3370   case OP_WHITESPACE:
3371   return next == -ESC_S || next == -ESC_d || next == -ESC_w;
3372
3373   case OP_NOT_WHITESPACE:
3374   return next == -ESC_s || next == -ESC_h || next == -ESC_v || next == -ESC_R;
3375
3376   case OP_HSPACE:
3377   return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
3378          next == -ESC_w || next == -ESC_v || next == -ESC_R;
3379
3380   case OP_NOT_HSPACE:
3381   return next == -ESC_h;
3382
3383   /* Can't have \S in here because VT matches \S (Perl anomaly) */
3384   case OP_ANYNL:
3385   case OP_VSPACE:
3386   return next == -ESC_V || next == -ESC_d || next == -ESC_w;
3387
3388   case OP_NOT_VSPACE:
3389   return next == -ESC_v || next == -ESC_R;
3390
3391   case OP_WORDCHAR:
3392   return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
3393          next == -ESC_v || next == -ESC_R;
3394
3395   case OP_NOT_WORDCHAR:
3396   return next == -ESC_w || next == -ESC_d;
3397
3398   default:
3399   return FALSE;
3400   }
3401
3402 /* Control does not reach here */
3403 }
3404
3405
3406
3407 /*************************************************
3408 *           Compile one branch                   *
3409 *************************************************/
3410
/* Scan the pattern, compiling it into a vector. If the options are
3412 changed during the branch, the pointer is used to change the external options
3413 bits. This function is used during the pre-compile phase when we are trying
3414 to find out the amount of memory needed, as well as during the real compile
3415 phase. The value of lengthptr distinguishes the two phases.
3416
3417 Arguments:
3418   optionsptr     pointer to the option bits
3419   codeptr        points to the pointer to the current code point
3420   ptrptr         points to the current pattern pointer
3421   errorcodeptr   points to error code variable
3422   firstcharptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3423   reqcharptr     set to the last literal character required, else < 0
3424   bcptr          points to current branch chain
3425   cond_depth     conditional nesting depth
3426   cd             contains pointers to tables etc.
3427   lengthptr      NULL during the real compile phase
3428                  points to length accumulator during pre-compile phase
3429
3430 Returns:         TRUE on success
3431                  FALSE, with *errorcodeptr set non-zero on error
3432 */
3433
3434 static BOOL
3435 compile_branch(int *optionsptr, pcre_uchar **codeptr,
3436   const pcre_uchar **ptrptr, int *errorcodeptr, pcre_int32 *firstcharptr,
3437   pcre_int32 *reqcharptr, branch_chain *bcptr, int cond_depth,
3438   compile_data *cd, int *lengthptr)
3439 {
3440 int repeat_type, op_type;
3441 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
3442 int bravalue = 0;
3443 int greedy_default, greedy_non_default;
3444 pcre_int32 firstchar, reqchar;
3445 pcre_int32 zeroreqchar, zerofirstchar;
3446 pcre_int32 req_caseopt, reqvary, tempreqvary;
3447 int options = *optionsptr;               /* May change dynamically */
3448 int after_manual_callout = 0;
3449 int length_prevgroup = 0;
3450 int c;
3451 pcre_uchar *code = *codeptr;
3452 pcre_uchar *last_code = code;
3453 pcre_uchar *orig_code = code;
3454 pcre_uchar *tempcode;
3455 BOOL inescq = FALSE;
3456 BOOL groupsetfirstchar = FALSE;
3457 const pcre_uchar *ptr = *ptrptr;
3458 const pcre_uchar *tempptr;
3459 const pcre_uchar *nestptr = NULL;
3460 pcre_uchar *previous = NULL;
3461 pcre_uchar *previous_callout = NULL;
3462 pcre_uchar *save_hwm = NULL;
3463 pcre_uint8 classbits[32];
3464
3465 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3466 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3467 dynamically as we process the pattern. */
3468
3469 #ifdef SUPPORT_UTF
3470 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3471 BOOL utf = (options & PCRE_UTF8) != 0;
3472 pcre_uchar utf_chars[6];
3473 #else
3474 BOOL utf = FALSE;
3475 #endif
3476
3477 /* Helper variables for OP_XCLASS opcode (for characters > 255). */
3478
3479 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3480 BOOL xclass;
3481 pcre_uchar *class_uchardata;
3482 pcre_uchar *class_uchardata_base;
3483 #endif
3484
3485 #ifdef PCRE_DEBUG
3486 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3487 #endif
3488
3489 /* Set up the default and non-default settings for greediness */
3490
3491 greedy_default = ((options & PCRE_UNGREEDY) != 0);
3492 greedy_non_default = greedy_default ^ 1;
3493
3494 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3495 matching encountered yet". It gets changed to REQ_NONE if we hit something that
3496 matches a non-fixed char first char; reqchar just remains unset if we never
3497 find one.
3498
3499 When we hit a repeat whose minimum is zero, we may have to adjust these values
3500 to take the zero repeat into account. This is implemented by setting them to
3501 zerofirstbyte and zeroreqchar when such a repeat is encountered. The individual
3502 item types that can be repeated set these backoff variables appropriately. */
3503
3504 firstchar = reqchar = zerofirstchar = zeroreqchar = REQ_UNSET;
3505
3506 /* The variable req_caseopt contains either the REQ_CASELESS value
3507 or zero, according to the current setting of the caseless flag. The
3508 REQ_CASELESS leaves the lower 28 bit empty. It is added into the
3509 firstchar or reqchar variables to record the case status of the
3510 value. This is used only for ASCII characters. */
3511
3512 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
3513
3514 /* Switch on next character until the end of the branch */
3515
3516 for (;; ptr++)
3517   {
3518   BOOL negate_class;
3519   BOOL should_flip_negation;
3520   BOOL possessive_quantifier;
3521   BOOL is_quantifier;
3522   BOOL is_recurse;
3523   BOOL reset_bracount;
3524   int class_has_8bitchar;
3525   int class_single_char;
3526   int newoptions;
3527   int recno;
3528   int refsign;
3529   int skipbytes;
3530   int subreqchar;
3531   int subfirstchar;
3532   int terminator;
3533   int mclength;
3534   int tempbracount;
3535   pcre_uchar mcbuffer[8];
3536
3537   /* Get next character in the pattern */
3538
3539   c = *ptr;
3540
3541   /* If we are at the end of a nested substitution, revert to the outer level
3542   string. Nesting only happens one level deep. */
3543
3544   if (c == 0 && nestptr != NULL)
3545     {
3546     ptr = nestptr;
3547     nestptr = NULL;
3548     c = *ptr;
3549     }
3550
3551   /* If we are in the pre-compile phase, accumulate the length used for the
3552   previous cycle of this loop. */
3553
3554   if (lengthptr != NULL)
3555     {
3556 #ifdef PCRE_DEBUG
3557     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
3558 #endif
3559     if (code > cd->start_workspace + cd->workspace_size -
3560         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
3561       {
3562       *errorcodeptr = ERR52;
3563       goto FAILED;
3564       }
3565
3566     /* There is at least one situation where code goes backwards: this is the
3567     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3568     the class is simply eliminated. However, it is created first, so we have to
3569     allow memory for it. Therefore, don't ever reduce the length at this point.
3570     */
3571
3572     if (code < last_code) code = last_code;
3573
3574     /* Paranoid check for integer overflow */
3575
3576     if (OFLOW_MAX - *lengthptr < code - last_code)
3577       {
3578       *errorcodeptr = ERR20;
3579       goto FAILED;
3580       }
3581
3582     *lengthptr += (int)(code - last_code);
3583     DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
3584       (int)(code - last_code), c, c));
3585
3586     /* If "previous" is set and it is not at the start of the work space, move
3587     it back to there, in order to avoid filling up the work space. Otherwise,
3588     if "previous" is NULL, reset the current code pointer to the start. */
3589
3590     if (previous != NULL)
3591       {
3592       if (previous > orig_code)
3593         {
3594         memmove(orig_code, previous, IN_UCHARS(code - previous));
3595         code -= previous - orig_code;
3596         previous = orig_code;
3597         }
3598       }
3599     else code = orig_code;
3600
3601     /* Remember where this code item starts so we can pick up the length
3602     next time round. */
3603
3604     last_code = code;
3605     }
3606
3607   /* In the real compile phase, just check the workspace used by the forward
3608   reference list. */
3609
3610   else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3611            WORK_SIZE_SAFETY_MARGIN)
3612     {
3613     *errorcodeptr = ERR52;
3614     goto FAILED;
3615     }
3616
3617   /* If in \Q...\E, check for the end; if not, we have a literal */
3618
3619   if (inescq && c != 0)
3620     {
3621     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3622       {
3623       inescq = FALSE;
3624       ptr++;
3625       continue;
3626       }
3627     else
3628       {
3629       if (previous_callout != NULL)
3630         {
3631         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
3632           complete_callout(previous_callout, ptr, cd);
3633         previous_callout = NULL;
3634         }
3635       if ((options & PCRE_AUTO_CALLOUT) != 0)
3636         {
3637         previous_callout = code;
3638         code = auto_callout(code, ptr, cd);
3639         }
3640       goto NORMAL_CHAR;
3641       }
3642     }
3643
3644   /* Fill in length of a previous callout, except when the next thing is
3645   a quantifier. */
3646
3647   is_quantifier =
3648     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3649     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3650
3651   if (!is_quantifier && previous_callout != NULL &&
3652        after_manual_callout-- <= 0)
3653     {
3654     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
3655       complete_callout(previous_callout, ptr, cd);
3656     previous_callout = NULL;
3657     }
3658
3659   /* In extended mode, skip white space and comments. */
3660
3661   if ((options & PCRE_EXTENDED) != 0)
3662     {
3663     if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
3664     if (c == CHAR_NUMBER_SIGN)
3665       {
3666       ptr++;
3667       while (*ptr != 0)
3668         {
3669         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3670         ptr++;
3671 #ifdef SUPPORT_UTF
3672         if (utf) FORWARDCHAR(ptr);
3673 #endif
3674         }
3675       if (*ptr != 0) continue;
3676
3677       /* Else fall through to handle end of string */
3678       c = 0;
3679       }
3680     }
3681
3682   /* No auto callout for quantifiers. */
3683
3684   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3685     {
3686     previous_callout = code;
3687     code = auto_callout(code, ptr, cd);
3688     }
3689
3690   switch(c)
3691     {
3692     /* ===================================================================*/
3693     case 0:                        /* The branch terminates at string end */
3694     case CHAR_VERTICAL_LINE:       /* or | or ) */
3695     case CHAR_RIGHT_PARENTHESIS:
3696     *firstcharptr = firstchar;
3697     *reqcharptr = reqchar;
3698     *codeptr = code;
3699     *ptrptr = ptr;
3700     if (lengthptr != NULL)
3701       {
3702       if (OFLOW_MAX - *lengthptr < code - last_code)
3703         {
3704         *errorcodeptr = ERR20;
3705         goto FAILED;
3706         }
3707       *lengthptr += (int)(code - last_code);   /* To include callout length */
3708       DPRINTF((">> end branch\n"));
3709       }
3710     return TRUE;
3711
3712
3713     /* ===================================================================*/
3714     /* Handle single-character metacharacters. In multiline mode, ^ disables
3715     the setting of any following char as a first character. */
3716
3717     case CHAR_CIRCUMFLEX_ACCENT:
3718     previous = NULL;
3719     if ((options & PCRE_MULTILINE) != 0)
3720       {
3721       if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3722       *code++ = OP_CIRCM;
3723       }
3724     else *code++ = OP_CIRC;
3725     break;
3726
3727     case CHAR_DOLLAR_SIGN:
3728     previous = NULL;
3729     *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3730     break;
3731
3732     /* There can never be a first char if '.' is first, whatever happens about
3733     repeats. The value of reqchar doesn't change either. */
3734
3735     case CHAR_DOT:
3736     if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3737     zerofirstchar = firstchar;
3738     zeroreqchar = reqchar;
3739     previous = code;
3740     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3741     break;
3742
3743
3744     /* ===================================================================*/
3745     /* Character classes. If the included characters are all < 256, we build a
3746     32-byte bitmap of the permitted characters, except in the special case
3747     where there is only one such character. For negated classes, we build the
3748     map as usual, then invert it at the end. However, we use a different opcode
3749     so that data characters > 255 can be handled correctly.
3750
3751     If the class contains characters outside the 0-255 range, a different
3752     opcode is compiled. It may optionally have a bit map for characters < 256,
3753     but those above are explicitly listed afterwards. A flag byte tells
3754     whether the bitmap is present, and whether this is a negated class or not.
3755
3756     In JavaScript compatibility mode, an isolated ']' causes an error. In
3757     default (Perl) mode, it is treated as a data character. */
3758
3759     case CHAR_RIGHT_SQUARE_BRACKET:
3760     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3761       {
3762       *errorcodeptr = ERR64;
3763       goto FAILED;
3764       }
3765     goto NORMAL_CHAR;
3766
3767     case CHAR_LEFT_SQUARE_BRACKET:
3768     previous = code;
3769
3770     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3771     they are encountered at the top level, so we'll do that too. */
3772
3773     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3774          ptr[1] == CHAR_EQUALS_SIGN) &&
3775         check_posix_syntax(ptr, &tempptr))
3776       {
3777       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3778       goto FAILED;
3779       }
3780
3781     /* If the first character is '^', set the negation flag and skip it. Also,
3782     if the first few characters (either before or after ^) are \Q\E or \E we
3783     skip them too. This makes for compatibility with Perl. */
3784
3785     negate_class = FALSE;
3786     for (;;)
3787       {
3788       c = *(++ptr);
3789       if (c == CHAR_BACKSLASH)
3790         {
3791         if (ptr[1] == CHAR_E)
3792           ptr++;
3793         else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3794           ptr += 3;
3795         else
3796           break;
3797         }
3798       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3799         negate_class = TRUE;
3800       else break;
3801       }
3802
3803     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3804     an initial ']' is taken as a data character -- the code below handles
3805     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3806     [^] must match any character, so generate OP_ALLANY. */
3807
3808     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3809         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3810       {
3811       *code++ = negate_class? OP_ALLANY : OP_FAIL;
3812       if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3813       zerofirstchar = firstchar;
3814       break;
3815       }
3816
3817     /* If a class contains a negative special such as \S, we need to flip the
3818     negation flag at the end, so that support for characters > 255 works
3819     correctly (they are all included in the class). */
3820
3821     should_flip_negation = FALSE;
3822
3823     /* For optimization purposes, we track some properties of the class.
3824     class_has_8bitchar will be non-zero, if the class contains at least one
3825     < 256 character. class_single_char will be 1 if the class contains only
3826     a single character. */
3827
3828     class_has_8bitchar = 0;
3829     class_single_char = 0;
3830
3831     /* Initialize the 32-char bit map to all zeros. We build the map in a
3832     temporary bit of memory, in case the class contains only 1 character (less
3833     than 256), because in that case the compiled code doesn't use the bit map.
3834     */
3835
3836     memset(classbits, 0, 32 * sizeof(pcre_uint8));
3837
3838 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3839     xclass = FALSE;                           /* No chars >= 256 */
3840     class_uchardata = code + LINK_SIZE + 2;   /* For UTF-8 items */
3841     class_uchardata_base = class_uchardata;   /* For resetting in pass 1 */
3842 #endif
3843
3844     /* Process characters until ] is reached. By writing this as a "do" it
3845     means that an initial ] is taken as a data character. At the start of the
3846     loop, c contains the first byte of the character. */
3847
3848     if (c != 0) do
3849       {
3850       const pcre_uchar *oldptr;
3851
3852 #ifdef SUPPORT_UTF
3853       if (utf && HAS_EXTRALEN(c))
3854         {                           /* Braces are required because the */
3855         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3856         }
3857 #endif
3858
3859 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3860       /* In the pre-compile phase, accumulate the length of any extra
3861       data and reset the pointer. This is so that very large classes that
3862       contain a zillion > 255 characters no longer overwrite the work space
3863       (which is on the stack). */
3864
3865       if (lengthptr != NULL)
3866         {
3867         *lengthptr += class_uchardata - class_uchardata_base;
3868         class_uchardata = class_uchardata_base;
3869         }
3870 #endif
3871
3872       /* Inside \Q...\E everything is literal except \E */
3873
3874       if (inescq)
3875         {
3876         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
3877           {
3878           inescq = FALSE;                   /* Reset literal state */
3879           ptr++;                            /* Skip the 'E' */
3880           continue;                         /* Carry on with next */
3881           }
3882         goto CHECK_RANGE;                   /* Could be range if \E follows */
3883         }
3884
3885       /* Handle POSIX class names. Perl allows a negation extension of the
3886       form [:^name:]. A square bracket that doesn't match the syntax is
3887       treated as a literal. We also recognize the POSIX constructions
3888       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3889       5.6 and 5.8 do. */
3890
3891       if (c == CHAR_LEFT_SQUARE_BRACKET &&
3892           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3893            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3894         {
3895         BOOL local_negate = FALSE;
3896         int posix_class, taboffset, tabopt;
3897         const pcre_uint8 *cbits = cd->cbits;
3898         pcre_uint8 pbits[32];
3899
3900         if (ptr[1] != CHAR_COLON)
3901           {
3902           *errorcodeptr = ERR31;
3903           goto FAILED;
3904           }
3905
3906         ptr += 2;
3907         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3908           {
3909           local_negate = TRUE;
3910           should_flip_negation = TRUE;  /* Note negative special */
3911           ptr++;
3912           }
3913
3914         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3915         if (posix_class < 0)
3916           {
3917           *errorcodeptr = ERR30;
3918           goto FAILED;
3919           }
3920
3921         /* If matching is caseless, upper and lower are converted to
3922         alpha. This relies on the fact that the class table starts with
3923         alpha, lower, upper as the first 3 entries. */
3924
3925         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3926           posix_class = 0;
3927
3928         /* When PCRE_UCP is set, some of the POSIX classes are converted to
3929         different escape sequences that use Unicode properties. */
3930
3931 #ifdef SUPPORT_UCP
3932         if ((options & PCRE_UCP) != 0)
3933           {
3934           int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3935           if (posix_substitutes[pc] != NULL)
3936             {
3937             nestptr = tempptr + 1;
3938             ptr = posix_substitutes[pc] - 1;
3939             continue;
3940             }
3941           }
3942 #endif
3943         /* In the non-UCP case, we build the bit map for the POSIX class in a
3944         chunk of local store because we may be adding and subtracting from it,
3945         and we don't want to subtract bits that may be in the main map already.
3946         At the end we or the result into the bit map that is being built. */
3947
3948         posix_class *= 3;
3949
3950         /* Copy in the first table (always present) */
3951
3952         memcpy(pbits, cbits + posix_class_maps[posix_class],
3953           32 * sizeof(pcre_uint8));
3954
3955         /* If there is a second table, add or remove it as required. */
3956
3957         taboffset = posix_class_maps[posix_class + 1];
3958         tabopt = posix_class_maps[posix_class + 2];
3959
3960         if (taboffset >= 0)
3961           {
3962           if (tabopt >= 0)
3963             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3964           else
3965             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3966           }
3967
3968         /* Now see if we need to remove any special characters. An option
3969         value of 1 removes vertical space and 2 removes underscore. */
3970
3971         if (tabopt < 0) tabopt = -tabopt;
3972         if (tabopt == 1) pbits[1] &= ~0x3c;
3973           else if (tabopt == 2) pbits[11] &= 0x7f;
3974
3975         /* Add the POSIX table or its complement into the main table that is
3976         being built and we are done. */
3977
3978         if (local_negate)
3979           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3980         else
3981           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3982
3983         ptr = tempptr + 1;
3984         /* Every class contains at least one < 256 character. */
3985         class_has_8bitchar = 1;
3986         /* Every class contains at least two characters. */
3987         class_single_char = 2;
3988         continue;    /* End of POSIX syntax handling */
3989         }
3990
3991       /* Backslash may introduce a single character, or it may introduce one
3992       of the specials, which just set a flag. The sequence \b is a special
3993       case. Inside a class (and only there) it is treated as backspace. We
3994       assume that other escapes have more than one character in them, so
3995       speculatively set both class_has_8bitchar and class_single_char bigger
3996       than one. Unrecognized escapes fall through and are either treated
3997       as literal characters (by default), or are faulted if
3998       PCRE_EXTRA is set. */
3999
4000       if (c == CHAR_BACKSLASH)
4001         {
4002         c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
4003         if (*errorcodeptr != 0) goto FAILED;
4004
4005         if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
4006         else if (-c == ESC_N)            /* \N is not supported in a class */
4007           {
4008           *errorcodeptr = ERR71;
4009           goto FAILED;
4010           }
4011         else if (-c == ESC_Q)            /* Handle start of quoted string */
4012           {
4013           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4014             {
4015             ptr += 2; /* avoid empty string */
4016             }
4017           else inescq = TRUE;
4018           continue;
4019           }
4020         else if (-c == ESC_E) continue;  /* Ignore orphan \E */
4021
4022         if (c < 0)
4023           {
4024           const pcre_uint8 *cbits = cd->cbits;
4025           /* Every class contains at least two < 256 characters. */
4026           class_has_8bitchar++;
4027           /* Every class contains at least two characters. */
4028           class_single_char += 2;
4029
4030           switch (-c)
4031             {
4032 #ifdef SUPPORT_UCP
4033             case ESC_du:     /* These are the values given for \d etc */
4034             case ESC_DU:     /* when PCRE_UCP is set. We replace the */
4035             case ESC_wu:     /* escape sequence with an appropriate \p */
4036             case ESC_WU:     /* or \P to test Unicode properties instead */
4037             case ESC_su:     /* of the default ASCII testing. */
4038             case ESC_SU:
4039             nestptr = ptr;
4040             ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
4041             class_has_8bitchar--;                /* Undo! */
4042             continue;
4043 #endif
4044             case ESC_d:
4045             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
4046             continue;
4047
4048             case ESC_D:
4049             should_flip_negation = TRUE;
4050             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
4051             continue;
4052
4053             case ESC_w:
4054             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
4055             continue;
4056
4057             case ESC_W:
4058             should_flip_negation = TRUE;
4059             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4060             continue;
4061
4062             /* Perl 5.004 onwards omits VT from \s, but we must preserve it
4063             if it was previously set by something earlier in the character
4064             class. */
4065
4066             case ESC_s:
4067             classbits[0] |= cbits[cbit_space];
4068             classbits[1] |= cbits[cbit_space+1] & ~0x08;
4069             for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
4070             continue;
4071
4072             case ESC_S:
4073             should_flip_negation = TRUE;
4074             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
4075             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
4076             continue;
4077
4078             case ESC_h:
4079             SETBIT(classbits, 0x09); /* HT */
4080             SETBIT(classbits, 0x20); /* SPACE */
4081             SETBIT(classbits, 0xa0); /* NBSP */
4082 #ifndef COMPILE_PCRE8
4083             xclass = TRUE;
4084             *class_uchardata++ = XCL_SINGLE;
4085             *class_uchardata++ = 0x1680;
4086             *class_uchardata++ = XCL_SINGLE;
4087             *class_uchardata++ = 0x180e;
4088             *class_uchardata++ = XCL_RANGE;
4089             *class_uchardata++ = 0x2000;
4090             *class_uchardata++ = 0x200a;
4091             *class_uchardata++ = XCL_SINGLE;
4092             *class_uchardata++ = 0x202f;
4093             *class_uchardata++ = XCL_SINGLE;
4094             *class_uchardata++ = 0x205f;
4095             *class_uchardata++ = XCL_SINGLE;
4096             *class_uchardata++ = 0x3000;
4097 #elif defined SUPPORT_UTF
4098             if (utf)
4099               {
4100               xclass = TRUE;
4101               *class_uchardata++ = XCL_SINGLE;
4102               class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata);
4103               *class_uchardata++ = XCL_SINGLE;
4104               class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata);
4105               *class_uchardata++ = XCL_RANGE;
4106               class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata);
4107               class_uchardata += PRIV(ord2utf)(0x200a, class_uchardata);
4108               *class_uchardata++ = XCL_SINGLE;
4109               class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata);
4110               *class_uchardata++ = XCL_SINGLE;
4111               class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata);
4112               *class_uchardata++ = XCL_SINGLE;
4113               class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata);
4114               }
4115 #endif
4116             continue;
4117
4118             case ESC_H:
4119             for (c = 0; c < 32; c++)
4120               {
4121               int x = 0xff;
4122               switch (c)
4123                 {
4124                 case 0x09/8: x ^= 1 << (0x09%8); break;
4125                 case 0x20/8: x ^= 1 << (0x20%8); break;
4126                 case 0xa0/8: x ^= 1 << (0xa0%8); break;
4127                 default: break;
4128                 }
4129               classbits[c] |= x;
4130               }
4131 #ifndef COMPILE_PCRE8
4132             xclass = TRUE;
4133             *class_uchardata++ = XCL_RANGE;
4134             *class_uchardata++ = 0x0100;
4135             *class_uchardata++ = 0x167f;
4136             *class_uchardata++ = XCL_RANGE;
4137             *class_uchardata++ = 0x1681;
4138             *class_uchardata++ = 0x180d;
4139             *class_uchardata++ = XCL_RANGE;
4140             *class_uchardata++ = 0x180f;
4141             *class_uchardata++ = 0x1fff;
4142             *class_uchardata++ = XCL_RANGE;
4143             *class_uchardata++ = 0x200b;
4144             *class_uchardata++ = 0x202e;
4145             *class_uchardata++ = XCL_RANGE;
4146             *class_uchardata++ = 0x2030;
4147             *class_uchardata++ = 0x205e;
4148             *class_uchardata++ = XCL_RANGE;
4149             *class_uchardata++ = 0x2060;
4150             *class_uchardata++ = 0x2fff;
4151             *class_uchardata++ = XCL_RANGE;
4152             *class_uchardata++ = 0x3001;
4153 #ifdef SUPPORT_UTF
4154             if (utf)
4155               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4156             else
4157 #endif
4158               *class_uchardata++ = 0xffff;
4159 #elif defined SUPPORT_UTF
4160             if (utf)
4161               {
4162               xclass = TRUE;
4163               *class_uchardata++ = XCL_RANGE;
4164               class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4165               class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata);
4166               *class_uchardata++ = XCL_RANGE;
4167               class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata);
4168               class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata);
4169               *class_uchardata++ = XCL_RANGE;
4170               class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata);
4171               class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata);
4172               *class_uchardata++ = XCL_RANGE;
4173               class_uchardata += PRIV(ord2utf)(0x200b, class_uchardata);
4174               class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata);
4175               *class_uchardata++ = XCL_RANGE;
4176               class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata);
4177               class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata);
4178               *class_uchardata++ = XCL_RANGE;
4179               class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata);
4180               class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata);
4181               *class_uchardata++ = XCL_RANGE;
4182               class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata);
4183               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4184               }
4185 #endif
4186             continue;
4187
4188             case ESC_v:
4189             SETBIT(classbits, 0x0a); /* LF */
4190             SETBIT(classbits, 0x0b); /* VT */
4191             SETBIT(classbits, 0x0c); /* FF */
4192             SETBIT(classbits, 0x0d); /* CR */
4193             SETBIT(classbits, 0x85); /* NEL */
4194 #ifndef COMPILE_PCRE8
4195             xclass = TRUE;
4196             *class_uchardata++ = XCL_RANGE;
4197             *class_uchardata++ = 0x2028;
4198             *class_uchardata++ = 0x2029;
4199 #elif defined SUPPORT_UTF
4200             if (utf)
4201               {
4202               xclass = TRUE;
4203               *class_uchardata++ = XCL_RANGE;
4204               class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata);
4205               class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);
4206               }
4207 #endif
4208             continue;
4209
4210             case ESC_V:
4211             for (c = 0; c < 32; c++)
4212               {
4213               int x = 0xff;
4214               switch (c)
4215                 {
4216                 case 0x0a/8: x ^= 1 << (0x0a%8);
4217                              x ^= 1 << (0x0b%8);
4218                              x ^= 1 << (0x0c%8);
4219                              x ^= 1 << (0x0d%8);
4220                              break;
4221                 case 0x85/8: x ^= 1 << (0x85%8); break;
4222                 default: break;
4223                 }
4224               classbits[c] |= x;
4225               }
4226
4227 #ifndef COMPILE_PCRE8
4228             xclass = TRUE;
4229             *class_uchardata++ = XCL_RANGE;
4230             *class_uchardata++ = 0x0100;
4231             *class_uchardata++ = 0x2027;
4232             *class_uchardata++ = XCL_RANGE;
4233             *class_uchardata++ = 0x202a;
4234 #ifdef SUPPORT_UTF
4235             if (utf)
4236               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4237             else
4238 #endif
4239               *class_uchardata++ = 0xffff;
4240 #elif defined SUPPORT_UTF
4241             if (utf)
4242               {
4243               xclass = TRUE;
4244               *class_uchardata++ = XCL_RANGE;
4245               class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4246               class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata);
4247               *class_uchardata++ = XCL_RANGE;
4248               class_uchardata += PRIV(ord2utf)(0x202a, class_uchardata);
4249               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4250               }
4251 #endif
4252             continue;
4253
4254 #ifdef SUPPORT_UCP
4255             case ESC_p:
4256             case ESC_P:
4257               {
4258               BOOL negated;
4259               int pdata;
4260               int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4261               if (ptype < 0) goto FAILED;
4262               xclass = TRUE;
4263               *class_uchardata++ = ((-c == ESC_p) != negated)?
4264                 XCL_PROP : XCL_NOTPROP;
4265               *class_uchardata++ = ptype;
4266               *class_uchardata++ = pdata;
4267               class_has_8bitchar--;                /* Undo! */
4268               continue;
4269               }
4270 #endif
4271             /* Unrecognized escapes are faulted if PCRE is running in its
4272             strict mode. By default, for compatibility with Perl, they are
4273             treated as literals. */
4274
4275             default:
4276             if ((options & PCRE_EXTRA) != 0)
4277               {
4278               *errorcodeptr = ERR7;
4279               goto FAILED;
4280               }
4281             class_has_8bitchar--;    /* Undo the speculative increase. */
4282             class_single_char -= 2;  /* Undo the speculative increase. */
4283             c = *ptr;                /* Get the final character and fall through */
4284             break;
4285             }
4286           }
4287
4288         /* Fall through if we have a single character (c >= 0). This may be
4289         greater than 256. */
4290
4291         }   /* End of backslash handling */
4292
4293       /* A single character may be followed by '-' to form a range. However,
4294       Perl does not permit ']' to be the end of the range. A '-' character
4295       at the end is treated as a literal. Perl ignores orphaned \E sequences
4296       entirely. The code for handling \Q and \E is messy. */
4297
4298       CHECK_RANGE:
4299       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4300         {
4301         inescq = FALSE;
4302         ptr += 2;
4303         }
4304
4305       oldptr = ptr;
4306
4307       /* Remember \r or \n */
4308
4309       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4310
4311       /* Check for range */
4312
4313       if (!inescq && ptr[1] == CHAR_MINUS)
4314         {
4315         int d;
4316         ptr += 2;
4317         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
4318
4319         /* If we hit \Q (not followed by \E) at this point, go into escaped
4320         mode. */
4321
4322         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4323           {
4324           ptr += 2;
4325           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4326             { ptr += 2; continue; }
4327           inescq = TRUE;
4328           break;
4329           }
4330
4331         if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
4332           {
4333           ptr = oldptr;
4334           goto LONE_SINGLE_CHARACTER;
4335           }
4336
4337 #ifdef SUPPORT_UTF
4338         if (utf)
4339           {                           /* Braces are required because the */
4340           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
4341           }
4342         else
4343 #endif
4344         d = *ptr;  /* Not UTF-8 mode */
4345
4346         /* The second part of a range can be a single-character escape, but
4347         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
4348         in such circumstances. */
4349
4350         if (!inescq && d == CHAR_BACKSLASH)
4351           {
4352           d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
4353           if (*errorcodeptr != 0) goto FAILED;
4354
4355           /* \b is backspace; any other special means the '-' was literal */
4356
4357           if (d < 0)
4358             {
4359             if (d == -ESC_b) d = CHAR_BS; else
4360               {
4361               ptr = oldptr;
4362               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
4363               }
4364             }
4365           }
4366
4367         /* Check that the two values are in the correct order. Optimize
4368         one-character ranges */
4369
4370         if (d < c)
4371           {
4372           *errorcodeptr = ERR8;
4373           goto FAILED;
4374           }
4375
4376         if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
4377
4378         /* Remember \r or \n */
4379
4380         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4381
4382         /* Since we found a character range, single character optimizations
4383         cannot be done anymore. */
4384         class_single_char = 2;
4385
4386         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
4387         matching, we have to use an XCLASS with extra data items. Caseless
4388         matching for characters > 127 is available only if UCP support is
4389         available. */
4390
4391 #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4392         if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127)))
4393 #elif defined  SUPPORT_UTF
4394         if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4395 #elif !(defined COMPILE_PCRE8)
4396         if (d > 255)
4397 #endif
4398 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4399           {
4400           xclass = TRUE;
4401
4402           /* With UCP support, we can find the other case equivalents of
4403           the relevant characters. There may be several ranges. Optimize how
4404           they fit with the basic range. */
4405
4406 #ifdef SUPPORT_UCP
4407 #ifndef COMPILE_PCRE8
4408           if (utf && (options & PCRE_CASELESS) != 0)
4409 #else
4410           if ((options & PCRE_CASELESS) != 0)
4411 #endif
4412             {
4413             unsigned int occ, ocd;
4414             unsigned int cc = c;
4415             unsigned int origd = d;
4416             while (get_othercase_range(&cc, origd, &occ, &ocd))
4417               {
4418               if (occ >= (unsigned int)c &&
4419                   ocd <= (unsigned int)d)
4420                 continue;                          /* Skip embedded ranges */
4421
4422               if (occ < (unsigned int)c  &&
4423                   ocd >= (unsigned int)c - 1)      /* Extend the basic range */
4424                 {                                  /* if there is overlap,   */
4425                 c = occ;                           /* noting that if occ < c */
4426                 continue;                          /* we can't have ocd > d  */
4427                 }                                  /* because a subrange is  */
4428               if (ocd > (unsigned int)d &&
4429                   occ <= (unsigned int)d + 1)      /* always shorter than    */
4430                 {                                  /* the basic range.       */
4431                 d = ocd;
4432                 continue;
4433                 }
4434
4435               if (occ == ocd)
4436                 {
4437                 *class_uchardata++ = XCL_SINGLE;
4438                 }
4439               else
4440                 {
4441                 *class_uchardata++ = XCL_RANGE;
4442                 class_uchardata += PRIV(ord2utf)(occ, class_uchardata);
4443                 }
4444               class_uchardata += PRIV(ord2utf)(ocd, class_uchardata);
4445               }
4446             }
4447 #endif  /* SUPPORT_UCP */
4448
4449           /* Now record the original range, possibly modified for UCP caseless
4450           overlapping ranges. */
4451
4452           *class_uchardata++ = XCL_RANGE;
4453 #ifdef SUPPORT_UTF
4454 #ifndef COMPILE_PCRE8
4455           if (utf)
4456             {
4457             class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4458             class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4459             }
4460           else
4461             {
4462             *class_uchardata++ = c;
4463             *class_uchardata++ = d;
4464             }
4465 #else
4466           class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4467           class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4468 #endif
4469 #else /* SUPPORT_UTF */
4470           *class_uchardata++ = c;
4471           *class_uchardata++ = d;
4472 #endif /* SUPPORT_UTF */
4473
4474           /* With UCP support, we are done. Without UCP support, there is no
4475           caseless matching for UTF characters > 127; we can use the bit map
4476           for the smaller ones. As for 16 bit characters without UTF, we
4477           can still use the bit map for those with values up to 255. */
4478
4479 #ifdef SUPPORT_UCP
4480 #ifndef COMPILE_PCRE8
4481           if (utf)
4482 #endif
4483             continue;    /* With next character in the class */
4484 #endif  /* SUPPORT_UCP */
4485
4486 #if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8)
4487           if (utf)
4488             {
4489             if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4490             /* Adjust upper limit and fall through to set up the map */
4491             d = 127;
4492             }
4493           else
4494             {
4495             if (c > 255) continue;
4496             /* Adjust upper limit and fall through to set up the map */
4497             d = 255;
4498             }
4499 #elif defined SUPPORT_UTF && !defined(SUPPORT_UCP)
4500           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4501           /* Adjust upper limit and fall through to set up the map */
4502           d = 127;
4503 #else
4504           if (c > 255) continue;
4505           /* Adjust upper limit and fall through to set up the map */
4506           d = 255;
4507 #endif  /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */
4508           }
4509 #endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
4510
4511         /* We use the bit map for 8 bit mode, or when the characters fall
4512         partially or entirely to [0-255] ([0-127] for UCP) ranges. */
4513
4514         class_has_8bitchar = 1;
4515
4516         /* We can save a bit of time by skipping this in the pre-compile. */
4517
4518         if (lengthptr == NULL) for (; c <= d; c++)
4519           {
4520           classbits[c/8] |= (1 << (c&7));
4521           if ((options & PCRE_CASELESS) != 0)
4522             {
4523             int uc = cd->fcc[c]; /* flip case */
4524             classbits[uc/8] |= (1 << (uc&7));
4525             }
4526           }
4527
4528         continue;   /* Go get the next char in the class */
4529         }
4530
4531       /* Handle a lone single character - we can get here for a normal
4532       non-escape char, or after \ that introduces a single character or for an
4533       apparent range that isn't. */
4534
4535       LONE_SINGLE_CHARACTER:
4536
4537       /* Only the value of 1 matters for class_single_char. */
4538
4539       if (class_single_char < 2) class_single_char++;
4540
4541       If class_single_char is 1, we saw precisely one character. As long as
4542       there was no use of \p or \P, in other words, no use of any XCLASS
4543       features, we can optimize.
4544
4545       The optimization throws away the bit map. We turn the item into a
4546       1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4547       In the positive case, it can cause firstchar to be set. Otherwise, there
4548       can be no first char if this item is first, whatever repeat count may
4549       follow. In the case of reqchar, save the previous value for reinstating. */
4550
4551       if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4552         {
4553         ptr++;
4554         zeroreqchar = reqchar;
4555
4556         if (negate_class)
4557           {
4558           if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4559           zerofirstchar = firstchar;
4560           *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4561 #ifdef SUPPORT_UTF
4562           if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4563             code += PRIV(ord2utf)(c, code);
4564           else
4565 #endif
4566             *code++ = c;
4567           goto NOT_CHAR;
4568           }
4569
4570         /* For a single, positive character, get the value into mcbuffer, and
4571         then we can handle this with the normal one-character code. */
4572
4573 #ifdef SUPPORT_UTF
4574         if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4575           mclength = PRIV(ord2utf)(c, mcbuffer);
4576         else
4577 #endif
4578           {
4579           mcbuffer[0] = c;
4580           mclength = 1;
4581           }
4582         goto ONE_CHAR;
4583         }       /* End of 1-char optimization */
4584
4585       /* Handle a character that cannot go in the bit map. */
4586
4587 #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4588       if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))
4589 #elif defined SUPPORT_UTF
4590       if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4591 #elif !(defined COMPILE_PCRE8)
4592       if (c > 255)
4593 #endif
4594
4595 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4596         {
4597         xclass = TRUE;
4598         *class_uchardata++ = XCL_SINGLE;
4599 #ifdef SUPPORT_UTF
4600 #ifndef COMPILE_PCRE8
4601         /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4602         if (!utf)
4603           *class_uchardata++ = c;
4604         else
4605 #endif
4606           class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4607 #else /* SUPPORT_UTF */
4608         *class_uchardata++ = c;
4609 #endif /* SUPPORT_UTF */
4610
4611 #ifdef SUPPORT_UCP
4612 #ifdef COMPILE_PCRE8
4613         if ((options & PCRE_CASELESS) != 0)
4614 #else
4615         /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4616         if (utf && (options & PCRE_CASELESS) != 0)
4617 #endif
4618           {
4619           unsigned int othercase;
4620           if ((int)(othercase = UCD_OTHERCASE(c)) != c)
4621             {
4622             *class_uchardata++ = XCL_SINGLE;
4623             class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);
4624             }
4625           }
4626 #endif  /* SUPPORT_UCP */
4627
4628         }
4629       else
4630 #endif  /* SUPPORT_UTF || COMPILE_PCRE16 */
4631
4632       /* Handle a single-byte character */
4633         {
4634         class_has_8bitchar = 1;
4635         classbits[c/8] |= (1 << (c&7));
4636         if ((options & PCRE_CASELESS) != 0)
4637           {
4638           c = cd->fcc[c]; /* flip case */
4639           classbits[c/8] |= (1 << (c&7));
4640           }
4641         }
4642       }
4643
4644     /* Loop until ']' reached. This "while" is the end of the "do" far above.
4645     If we are at the end of an internal nested string, revert to the outer
4646     string. */
4647
4648     while (((c = *(++ptr)) != 0 ||
4649            (nestptr != NULL &&
4650              (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&
4651            (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
4652
4653     /* Check for missing terminating ']' */
4654
4655     if (c == 0)
4656       {
4657       *errorcodeptr = ERR6;
4658       goto FAILED;
4659       }
4660
4661     /* If this is the first thing in the branch, there can be no first char
4662     setting, whatever the repeat count. Any reqchar setting must remain
4663     unchanged after any kind of repeat. */
4664
4665     if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4666     zerofirstchar = firstchar;
4667     zeroreqchar = reqchar;
4668
4669     /* If there are characters with values > 255, we have to compile an
4670     extended class, with its own opcode, unless there was a negated special
4671     such as \S in the class, and PCRE_UCP is not set, because in that case all
4672     characters > 255 are in the class, so any that were explicitly given as
4673     well can be ignored. If (when there are explicit characters > 255 that must
4674     be listed) there are no characters < 256, we can omit the bitmap in the
4675     actual compiled code. */
4676
4677 #ifdef SUPPORT_UTF
4678     if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
4679 #elif !defined COMPILE_PCRE8
4680     if (xclass && !should_flip_negation)
4681 #endif
4682 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4683       {
4684       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
4685       *code++ = OP_XCLASS;
4686       code += LINK_SIZE;
4687       *code = negate_class? XCL_NOT:0;
4688
4689       /* If the map is required, move up the extra data to make room for it;
4690       otherwise just move the code pointer to the end of the extra data. */
4691
4692       if (class_has_8bitchar > 0)
4693         {
4694         *code++ |= XCL_MAP;
4695         memmove(code + (32 / sizeof(pcre_uchar)), code,
4696           IN_UCHARS(class_uchardata - code));
4697         memcpy(code, classbits, 32);
4698         code = class_uchardata + (32 / sizeof(pcre_uchar));
4699         }
4700       else code = class_uchardata;
4701
4702       /* Now fill in the complete length of the item */
4703
4704       PUT(previous, 1, (int)(code - previous));
4705       break;   /* End of class handling */
4706       }
4707 #endif
4708
4709     /* If there are no characters > 255, or they are all to be included or
4710     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
4711     whole class was negated and whether there were negative specials such as \S
4712     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
4713     negating it if necessary. */
4714
4715     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
4716     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
4717       {
4718       if (negate_class)
4719         for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
4720       memcpy(code, classbits, 32);
4721       }
4722     code += 32 / sizeof(pcre_uchar);
4723     NOT_CHAR:
4724     break;
4725
4726
4727     /* ===================================================================*/
4728     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
4729     has been tested above. */
4730
4731     case CHAR_LEFT_CURLY_BRACKET:
4732     if (!is_quantifier) goto NORMAL_CHAR;
4733     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
4734     if (*errorcodeptr != 0) goto FAILED;
4735     goto REPEAT;
4736
4737     case CHAR_ASTERISK:
4738     repeat_min = 0;
4739     repeat_max = -1;
4740     goto REPEAT;
4741
4742     case CHAR_PLUS:
4743     repeat_min = 1;
4744     repeat_max = -1;
4745     goto REPEAT;
4746
4747     case CHAR_QUESTION_MARK:
4748     repeat_min = 0;
4749     repeat_max = 1;
4750
4751     REPEAT:
4752     if (previous == NULL)
4753       {
4754       *errorcodeptr = ERR9;
4755       goto FAILED;
4756       }
4757
4758     if (repeat_min == 0)
4759       {
4760       firstchar = zerofirstchar;    /* Adjust for zero repeat */
4761       reqchar = zeroreqchar;        /* Ditto */
4762       }
4763
4764     /* Remember whether this is a variable length repeat */
4765
4766     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
4767
4768     op_type = 0;                    /* Default single-char op codes */
4769     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
4770
4771     /* Save start of previous item, in case we have to move it up in order to
4772     insert something before it. */
4773
4774     tempcode = previous;
4775
4776     /* If the next character is '+', we have a possessive quantifier. This
4777     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
4778     If the next character is '?' this is a minimizing repeat, by default,
4779     but if PCRE_UNGREEDY is set, it works the other way round. We change the
4780     repeat type to the non-default. */
4781
4782     if (ptr[1] == CHAR_PLUS)
4783       {
4784       repeat_type = 0;                  /* Force greedy */
4785       possessive_quantifier = TRUE;
4786       ptr++;
4787       }
4788     else if (ptr[1] == CHAR_QUESTION_MARK)
4789       {
4790       repeat_type = greedy_non_default;
4791       ptr++;
4792       }
4793     else repeat_type = greedy_default;
4794
4795     /* If previous was a recursion call, wrap it in atomic brackets so that
4796     previous becomes the atomic group. All recursions were so wrapped in the
4797     past, but it no longer happens for non-repeated recursions. In fact, the
4798     repeated ones could be re-implemented independently so as not to need this,
4799     but for the moment we rely on the code for repeating groups. */
4800
4801     if (*previous == OP_RECURSE)
4802       {
4803       memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
4804       *previous = OP_ONCE;
4805       PUT(previous, 1, 2 + 2*LINK_SIZE);
4806       previous[2 + 2*LINK_SIZE] = OP_KET;
4807       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
4808       code += 2 + 2 * LINK_SIZE;
4809       length_prevgroup = 3 + 3*LINK_SIZE;
4810
4811       /* When actually compiling, we need to check whether this was a forward
4812       reference, and if so, adjust the offset. */
4813
4814       if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
4815         {
4816         int offset = GET(cd->hwm, -LINK_SIZE);
4817         if (offset == previous + 1 - cd->start_code)
4818           PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
4819         }
4820       }
4821
4822     /* Now handle repetition for the different types of item. */
4823
4824     /* If previous was a character or negated character match, abolish the item
4825     and generate a repeat item instead. If a char item has a minimum of more
4826     than one, ensure that it is set in reqchar - it might not be if a sequence
4827     such as x{3} is the first thing in a branch because the x will have gone
4828     into firstchar instead.  */
4829
4830     if (*previous == OP_CHAR || *previous == OP_CHARI
4831         || *previous == OP_NOT || *previous == OP_NOTI)
4832       {
4833       switch (*previous)
4834         {
4835         default: /* Make compiler happy. */
4836         case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
4837         case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
4838         case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
4839         case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
4840         }
4841
4842       /* Deal with UTF characters that take up more than one character. It's
4843       easier to write this out separately than try to macrify it. Use c to
4844       hold the length of the character in bytes, plus UTF_LENGTH to flag that
4845       it's a length rather than a small character. */
4846
4847 #ifdef SUPPORT_UTF
4848       if (utf && NOT_FIRSTCHAR(code[-1]))
4849         {
4850         pcre_uchar *lastchar = code - 1;
4851         BACKCHAR(lastchar);
4852         c = (int)(code - lastchar);     /* Length of UTF-8 character */
4853         memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
4854         c |= UTF_LENGTH;                /* Flag c as a length */
4855         }
4856       else
4857 #endif /* SUPPORT_UTF */
4858
4859       /* Handle the case of a single character - either with no UTF support, or
4860       with UTF disabled, or for a single character UTF character. */
4861         {
4862         c = code[-1];
4863         if (*previous <= OP_CHARI && repeat_min > 1)
4864           reqchar = c | req_caseopt | cd->req_varyopt;
4865         }
4866
4867       /* If the repetition is unlimited, it pays to see if the next thing on
4868       the line is something that cannot possibly match this character. If so,
4869       automatically possessifying this item gains some performance in the case
4870       where the match fails. */
4871
4872       if (!possessive_quantifier &&
4873           repeat_max < 0 &&
4874           check_auto_possessive(previous, utf, ptr + 1, options, cd))
4875         {
4876         repeat_type = 0;    /* Force greedy */
4877         possessive_quantifier = TRUE;
4878         }
4879
4880       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
4881       }
4882
4883     /* If previous was a character type match (\d or similar), abolish it and
4884     create a suitable repeat item. The code is shared with single-character
4885     repeats by setting op_type to add a suitable offset into repeat_type. Note
4886     that the Unicode property types will be present only when SUPPORT_UCP is
4887     defined, but we don't wrap the little bits of code here because it just
4888     makes it horribly messy. */
4889
4890     else if (*previous < OP_EODN)
4891       {
4892       pcre_uchar *oldcode;
4893       int prop_type, prop_value;
4894       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
4895       c = *previous;
4896
4897       if (!possessive_quantifier &&
4898           repeat_max < 0 &&
4899           check_auto_possessive(previous, utf, ptr + 1, options, cd))
4900         {
4901         repeat_type = 0;    /* Force greedy */
4902         possessive_quantifier = TRUE;
4903         }
4904
4905       OUTPUT_SINGLE_REPEAT:
4906       if (*previous == OP_PROP || *previous == OP_NOTPROP)
4907         {
4908         prop_type = previous[1];
4909         prop_value = previous[2];
4910         }
4911       else prop_type = prop_value = -1;
4912
4913       oldcode = code;
4914       code = previous;                  /* Usually overwrite previous item */
4915
4916       /* If the maximum is zero then the minimum must also be zero; Perl allows
4917       this case, so we do too - by simply omitting the item altogether. */
4918
4919       if (repeat_max == 0) goto END_REPEAT;
4920
4921       /*--------------------------------------------------------------------*/
4922       /* This code is obsolete from release 8.00; the restriction was finally
4923       removed: */
4924
4925       /* All real repeats make it impossible to handle partial matching (maybe
4926       one day we will be able to remove this restriction). */
4927
4928       /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4929       /*--------------------------------------------------------------------*/
4930
4931       /* Combine the op_type with the repeat_type */
4932
4933       repeat_type += op_type;
4934
4935       /* A minimum of zero is handled either as the special case * or ?, or as
4936       an UPTO, with the maximum given. */
4937
4938       if (repeat_min == 0)
4939         {
4940         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
4941           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
4942         else
4943           {
4944           *code++ = OP_UPTO + repeat_type;
4945           PUT2INC(code, 0, repeat_max);
4946           }
4947         }
4948
4949       /* A repeat minimum of 1 is optimized into some special cases. If the
4950       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
4951       left in place and, if the maximum is greater than 1, we use OP_UPTO with
4952       one less than the maximum. */
4953
4954       else if (repeat_min == 1)
4955         {
4956         if (repeat_max == -1)
4957           *code++ = OP_PLUS + repeat_type;
4958         else
4959           {
4960           code = oldcode;                 /* leave previous item in place */
4961           if (repeat_max == 1) goto END_REPEAT;
4962           *code++ = OP_UPTO + repeat_type;
4963           PUT2INC(code, 0, repeat_max - 1);
4964           }
4965         }
4966
4967       /* The case {n,n} is just an EXACT, while the general case {n,m} is
4968       handled as an EXACT followed by an UPTO. */
4969
4970       else
4971         {
4972         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
4973         PUT2INC(code, 0, repeat_min);
4974
4975         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
4976         we have to insert the character for the previous code. For a repeated
4977         Unicode property match, there are two extra bytes that define the
4978         required property. In UTF-8 mode, long characters have their length in
4979         c, with the UTF_LENGTH bit as a flag. */
4980
4981         if (repeat_max < 0)
4982           {
4983 #ifdef SUPPORT_UTF
4984           if (utf && (c & UTF_LENGTH) != 0)
4985             {
4986             memcpy(code, utf_chars, IN_UCHARS(c & 7));
4987             code += c & 7;
4988             }
4989           else
4990 #endif
4991             {
4992             *code++ = c;
4993             if (prop_type >= 0)
4994               {
4995               *code++ = prop_type;
4996               *code++ = prop_value;
4997               }
4998             }
4999           *code++ = OP_STAR + repeat_type;
5000           }
5001
5002         /* Else insert an UPTO if the max is greater than the min, again
5003         preceded by the character, for the previously inserted code. If the
5004         UPTO is just for 1 instance, we can use QUERY instead. */
5005
5006         else if (repeat_max != repeat_min)
5007           {
5008 #ifdef SUPPORT_UTF
5009           if (utf && (c & UTF_LENGTH) != 0)
5010             {
5011             memcpy(code, utf_chars, IN_UCHARS(c & 7));
5012             code += c & 7;
5013             }
5014           else
5015 #endif
5016           *code++ = c;
5017           if (prop_type >= 0)
5018             {
5019             *code++ = prop_type;
5020             *code++ = prop_value;
5021             }
5022           repeat_max -= repeat_min;
5023
5024           if (repeat_max == 1)
5025             {
5026             *code++ = OP_QUERY + repeat_type;
5027             }
5028           else
5029             {
5030             *code++ = OP_UPTO + repeat_type;
5031             PUT2INC(code, 0, repeat_max);
5032             }
5033           }
5034         }
5035
5036       /* The character or character type itself comes last in all cases. */
5037
5038 #ifdef SUPPORT_UTF
5039       if (utf && (c & UTF_LENGTH) != 0)
5040         {
5041         memcpy(code, utf_chars, IN_UCHARS(c & 7));
5042         code += c & 7;
5043         }
5044       else
5045 #endif
5046       *code++ = c;
5047
5048       /* For a repeated Unicode property match, there are two extra bytes that
5049       define the required property. */
5050
5051 #ifdef SUPPORT_UCP
5052       if (prop_type >= 0)
5053         {
5054         *code++ = prop_type;
5055         *code++ = prop_value;
5056         }
5057 #endif
5058       }
5059
5060     /* If previous was a character class or a back reference, we put the repeat
5061     stuff after it, but just skip the item if the repeat was {0,0}. */
5062
5063     else if (*previous == OP_CLASS ||
5064              *previous == OP_NCLASS ||
5065 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5066              *previous == OP_XCLASS ||
5067 #endif
5068              *previous == OP_REF ||
5069              *previous == OP_REFI)
5070       {
5071       if (repeat_max == 0)
5072         {
5073         code = previous;
5074         goto END_REPEAT;
5075         }
5076
5077       /*--------------------------------------------------------------------*/
5078       /* This code is obsolete from release 8.00; the restriction was finally
5079       removed: */
5080
5081       /* All real repeats make it impossible to handle partial matching (maybe
5082       one day we will be able to remove this restriction). */
5083
5084       /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
5085       /*--------------------------------------------------------------------*/
5086
5087       if (repeat_min == 0 && repeat_max == -1)
5088         *code++ = OP_CRSTAR + repeat_type;
5089       else if (repeat_min == 1 && repeat_max == -1)
5090         *code++ = OP_CRPLUS + repeat_type;
5091       else if (repeat_min == 0 && repeat_max == 1)
5092         *code++ = OP_CRQUERY + repeat_type;
5093       else
5094         {
5095         *code++ = OP_CRRANGE + repeat_type;
5096         PUT2INC(code, 0, repeat_min);
5097         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
5098         PUT2INC(code, 0, repeat_max);
5099         }
5100       }
5101
5102     /* If previous was a bracket group, we may have to replicate it in certain
5103     cases. Note that at this point we can encounter only the "basic" bracket
5104     opcodes such as BRA and CBRA, as this is the place where they get converted
5105     into the more special varieties such as BRAPOS and SBRA. A test for >=
5106     OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5107     ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
5108     repetition of assertions, but now it does, for Perl compatibility. */
5109
5110     else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5111       {
5112       int i;
5113       int len = (int)(code - previous);
5114       pcre_uchar *bralink = NULL;
5115       pcre_uchar *brazeroptr = NULL;
5116
5117       /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5118       we just ignore the repeat. */
5119
5120       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5121         goto END_REPEAT;
5122
5123       /* There is no sense in actually repeating assertions. The only potential
5124       use of repetition is in cases when the assertion is optional. Therefore,
5125       if the minimum is greater than zero, just ignore the repeat. If the
5126       maximum is not zero or one, set it to 1. */
5127
5128       if (*previous < OP_ONCE)    /* Assertion */
5129         {
5130         if (repeat_min > 0) goto END_REPEAT;
5131         if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5132         }
5133
5134       /* The case of a zero minimum is special because of the need to stick
5135       OP_BRAZERO in front of it, and because the group appears once in the
5136       data, whereas in other cases it appears the minimum number of times. For
5137       this reason, it is simplest to treat this case separately, as otherwise
5138       the code gets far too messy. There are several special subcases when the
5139       minimum is zero. */
5140
5141       if (repeat_min == 0)
5142         {
5143         /* If the maximum is also zero, we used to just omit the group from the
5144         output altogether, like this:
5145
5146         ** if (repeat_max == 0)
5147         **   {
5148         **   code = previous;
5149         **   goto END_REPEAT;
5150         **   }
5151
5152         However, that fails when a group or a subgroup within it is referenced
5153         as a subroutine from elsewhere in the pattern, so now we stick in
5154         OP_SKIPZERO in front of it so that it is skipped on execution. As we
5155         don't have a list of which groups are referenced, we cannot do this
5156         selectively.
5157
5158         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
5159         and do no more at this point. However, we do need to adjust any
5160         OP_RECURSE calls inside the group that refer to the group itself or any
5161         internal or forward referenced group, because the offset is from the
5162         start of the whole regex. Temporarily terminate the pattern while doing
5163         this. */
5164
5165         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
5166           {
5167           *code = OP_END;
5168           adjust_recurse(previous, 1, utf, cd, save_hwm);
5169           memmove(previous + 1, previous, IN_UCHARS(len));
5170           code++;
5171           if (repeat_max == 0)
5172             {
5173             *previous++ = OP_SKIPZERO;
5174             goto END_REPEAT;
5175             }
5176           brazeroptr = previous;    /* Save for possessive optimizing */
5177           *previous++ = OP_BRAZERO + repeat_type;
5178           }
5179
5180         /* If the maximum is greater than 1 and limited, we have to replicate
5181         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
5182         The first one has to be handled carefully because it's the original
5183         copy, which has to be moved up. The remainder can be handled by code
5184         that is common with the non-zero minimum case below. We have to
5185           adjust the value of repeat_max, since one less copy is required. Once
5186         again, we may have to adjust any OP_RECURSE calls inside the group. */
5187
5188         else
5189           {
5190           int offset;
5191           *code = OP_END;
5192           adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
5193           memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
5194           code += 2 + LINK_SIZE;
5195           *previous++ = OP_BRAZERO + repeat_type;
5196           *previous++ = OP_BRA;
5197
5198           /* We chain together the bracket offset fields that have to be
5199           filled in later when the ends of the brackets are reached. */
5200
5201           offset = (bralink == NULL)? 0 : (int)(previous - bralink);
5202           bralink = previous;
5203           PUTINC(previous, 0, offset);
5204           }
5205
5206         repeat_max--;
5207         }
5208
5209       /* If the minimum is greater than zero, replicate the group as many
5210       times as necessary, and adjust the maximum to the number of subsequent
5211       copies that we need. If we set a first char from the group, and didn't
5212       set a required char, copy the latter from the former. If there are any
5213       forward reference subroutine calls in the group, there will be entries on
5214       the workspace list; replicate these with an appropriate increment. */
5215
5216       else
5217         {
5218         if (repeat_min > 1)
5219           {
5220           /* In the pre-compile phase, we don't actually do the replication. We
5221           just adjust the length as if we had. Do some paranoid checks for
5222           potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
5223           integer type when available, otherwise double. */
5224
5225           if (lengthptr != NULL)
5226             {
5227             int delta = (repeat_min - 1)*length_prevgroup;
5228             if ((INT64_OR_DOUBLE)(repeat_min - 1)*
5229                   (INT64_OR_DOUBLE)length_prevgroup >
5230                     (INT64_OR_DOUBLE)INT_MAX ||
5231                 OFLOW_MAX - *lengthptr < delta)
5232               {
5233               *errorcodeptr = ERR20;
5234               goto FAILED;
5235               }
5236             *lengthptr += delta;
5237             }
5238
5239           /* This is compiling for real. If there is a set first byte for
5240           the group, and we have not yet set a "required byte", set it. Make
5241           sure there is enough workspace for copying forward references before
5242           doing the copy. */
5243
5244           else
5245             {
5246             if (groupsetfirstchar && reqchar < 0) reqchar = firstchar;
5247
5248             for (i = 1; i < repeat_min; i++)
5249               {
5250               pcre_uchar *hc;
5251               pcre_uchar *this_hwm = cd->hwm;
5252               memcpy(code, previous, IN_UCHARS(len));
5253
5254               while (cd->hwm > cd->start_workspace + cd->workspace_size -
5255                      WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5256                 {
5257                 int save_offset = save_hwm - cd->start_workspace;
5258                 int this_offset = this_hwm - cd->start_workspace;
5259                 *errorcodeptr = expand_workspace(cd);
5260                 if (*errorcodeptr != 0) goto FAILED;
5261                 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5262                 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5263                 }
5264
5265               for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5266                 {
5267                 PUT(cd->hwm, 0, GET(hc, 0) + len);
5268                 cd->hwm += LINK_SIZE;
5269                 }
5270               save_hwm = this_hwm;
5271               code += len;
5272               }
5273             }
5274           }
5275
5276         if (repeat_max > 0) repeat_max -= repeat_min;
5277         }
5278
5279       /* This code is common to both the zero and non-zero minimum cases. If
5280       the maximum is limited, it replicates the group in a nested fashion,
5281       remembering the bracket starts on a stack. In the case of a zero minimum,
5282       the first one was set up above. In all cases the repeat_max now specifies
5283       the number of additional copies needed. Again, we must remember to
5284       replicate entries on the forward reference list. */
5285
5286       if (repeat_max >= 0)
5287         {
5288         /* In the pre-compile phase, we don't actually do the replication. We
5289         just adjust the length as if we had. For each repetition we must add 1
5290         to the length for BRAZERO and for all but the last repetition we must
5291         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
5292         paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
5293         a 64-bit integer type when available, otherwise double. */
5294
5295         if (lengthptr != NULL && repeat_max > 0)
5296           {
5297           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
5298                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
5299           if ((INT64_OR_DOUBLE)repeat_max *
5300                 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
5301                   > (INT64_OR_DOUBLE)INT_MAX ||
5302               OFLOW_MAX - *lengthptr < delta)
5303             {
5304             *errorcodeptr = ERR20;
5305             goto FAILED;
5306             }
5307           *lengthptr += delta;
5308           }
5309
5310         /* This is compiling for real */
5311
5312         else for (i = repeat_max - 1; i >= 0; i--)
5313           {
5314           pcre_uchar *hc;
5315           pcre_uchar *this_hwm = cd->hwm;
5316
5317           *code++ = OP_BRAZERO + repeat_type;
5318
5319           /* All but the final copy start a new nesting, maintaining the
5320           chain of brackets outstanding. */
5321
5322           if (i != 0)
5323             {
5324             int offset;
5325             *code++ = OP_BRA;
5326             offset = (bralink == NULL)? 0 : (int)(code - bralink);
5327             bralink = code;
5328             PUTINC(code, 0, offset);
5329             }
5330
5331           memcpy(code, previous, IN_UCHARS(len));
5332
5333           /* Ensure there is enough workspace for forward references before
5334           copying them. */
5335
5336           while (cd->hwm > cd->start_workspace + cd->workspace_size -
5337                  WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5338             {
5339             int save_offset = save_hwm - cd->start_workspace;
5340             int this_offset = this_hwm - cd->start_workspace;
5341             *errorcodeptr = expand_workspace(cd);
5342             if (*errorcodeptr != 0) goto FAILED;
5343             save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5344             this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5345             }
5346
5347           for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5348             {
5349             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
5350             cd->hwm += LINK_SIZE;
5351             }
5352           save_hwm = this_hwm;
5353           code += len;
5354           }
5355
5356         /* Now chain through the pending brackets, and fill in their length
5357         fields (which are holding the chain links pro tem). */
5358
5359         while (bralink != NULL)
5360           {
5361           int oldlinkoffset;
5362           int offset = (int)(code - bralink + 1);
5363           pcre_uchar *bra = code - offset;
5364           oldlinkoffset = GET(bra, 1);
5365           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5366           *code++ = OP_KET;
5367           PUTINC(code, 0, offset);
5368           PUT(bra, 1, offset);
5369           }
5370         }
5371
5372       /* If the maximum is unlimited, set a repeater in the final copy. For
5373       ONCE brackets, that's all we need to do. However, possessively repeated
5374       ONCE brackets can be converted into non-capturing brackets, as the
5375       behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5376       deal with possessive ONCEs specially.
5377
5378       Otherwise, when we are doing the actual compile phase, check to see
5379       whether this group is one that could match an empty string. If so,
5380       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5381       that runtime checking can be done. [This check is also applied to ONCE
5382       groups at runtime, but in a different way.]
5383
5384       Then, if the quantifier was possessive and the bracket is not a
5385       conditional, we convert the BRA code to the POS form, and the KET code to
5386       KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5387       subpattern at both the start and at the end.) The use of special opcodes
5388       makes it possible to reduce greatly the stack usage in pcre_exec(). If
5389       the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5390
5391       Then, if the minimum number of matches is 1 or 0, cancel the possessive
5392       flag so that the default action below, of wrapping everything inside
5393       atomic brackets, does not happen. When the minimum is greater than 1,
5394       there will be earlier copies of the group, and so we still have to wrap
5395       the whole thing. */
5396
5397       else
5398         {
5399         pcre_uchar *ketcode = code - 1 - LINK_SIZE;
5400         pcre_uchar *bracode = ketcode - GET(ketcode, 1);
5401
5402         /* Convert possessive ONCE brackets to non-capturing */
5403
5404         if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5405             possessive_quantifier) *bracode = OP_BRA;
5406
5407         /* For non-possessive ONCE brackets, all we need to do is to
5408         set the KET. */
5409
5410         if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5411           *ketcode = OP_KETRMAX + repeat_type;
5412
5413         /* Handle non-ONCE brackets and possessive ONCEs (which have been
5414         converted to non-capturing above). */
5415
5416         else
5417           {
5418           /* In the compile phase, check for empty string matching. */
5419
5420           if (lengthptr == NULL)
5421             {
5422             pcre_uchar *scode = bracode;
5423             do
5424               {
5425               if (could_be_empty_branch(scode, ketcode, utf, cd))
5426                 {
5427                 *bracode += OP_SBRA - OP_BRA;
5428                 break;
5429                 }
5430               scode += GET(scode, 1);
5431               }
5432             while (*scode == OP_ALT);
5433             }
5434
5435           /* Handle possessive quantifiers. */
5436
5437           if (possessive_quantifier)
5438             {
5439             /* For COND brackets, we wrap the whole thing in a possessively
5440             repeated non-capturing bracket, because we have not invented POS
5441             versions of the COND opcodes. Because we are moving code along, we
5442             must ensure that any pending recursive references are updated. */
5443
5444             if (*bracode == OP_COND || *bracode == OP_SCOND)
5445               {
5446               int nlen = (int)(code - bracode);
5447               *code = OP_END;
5448               adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
5449               memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
5450               code += 1 + LINK_SIZE;
5451               nlen += 1 + LINK_SIZE;
5452               *bracode = OP_BRAPOS;
5453               *code++ = OP_KETRPOS;
5454               PUTINC(code, 0, nlen);
5455               PUT(bracode, 1, nlen);
5456               }
5457
5458             /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5459
5460             else
5461               {
5462               *bracode += 1;              /* Switch to xxxPOS opcodes */
5463               *ketcode = OP_KETRPOS;
5464               }
5465
5466             /* If the minimum is zero, mark it as possessive, then unset the
5467             possessive flag when the minimum is 0 or 1. */
5468
5469             if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5470             if (repeat_min < 2) possessive_quantifier = FALSE;
5471             }
5472
5473           /* Non-possessive quantifier */
5474
5475           else *ketcode = OP_KETRMAX + repeat_type;
5476           }
5477         }
5478       }
5479
5480     /* If previous is OP_FAIL, it was generated by an empty class [] in
5481     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
5482     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
5483     error above. We can just ignore the repeat in JS case. */
5484
5485     else if (*previous == OP_FAIL) goto END_REPEAT;
5486
5487     /* Else there's some kind of shambles */
5488
5489     else
5490       {
5491       *errorcodeptr = ERR11;
5492       goto FAILED;
5493       }
5494
5495     /* If the character following a repeat is '+', or if certain optimization
5496     tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
5497     there are special alternative opcodes for this case. For anything else, we
5498     wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
5499     notation is just syntactic sugar, taken from Sun's Java package, but the
5500     special opcodes can optimize it.
5501
5502     Some (but not all) possessively repeated subpatterns have already been
5503     completely handled in the code just above. For them, possessive_quantifier
5504     is always FALSE at this stage.
5505
5506     Note that the repeated item starts at tempcode, not at previous, which
5507     might be the first part of a string whose (former) last char we repeated.
5508
5509     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
5510     an 'upto' may follow. We skip over an 'exact' item, and then test the
5511     length of what remains before proceeding. */
5512
5513     if (possessive_quantifier)
5514       {
5515       int len;
5516
5517       if (*tempcode == OP_TYPEEXACT)
5518         tempcode += PRIV(OP_lengths)[*tempcode] +
5519           ((tempcode[1 + IMM2_SIZE] == OP_PROP
5520           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
5521
5522       else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
5523         {
5524         tempcode += PRIV(OP_lengths)[*tempcode];
5525 #ifdef SUPPORT_UTF
5526         if (utf && HAS_EXTRALEN(tempcode[-1]))
5527           tempcode += GET_EXTRALEN(tempcode[-1]);
5528 #endif
5529         }
5530
5531       len = (int)(code - tempcode);
5532       if (len > 0) switch (*tempcode)
5533         {
5534         case OP_STAR:  *tempcode = OP_POSSTAR; break;
5535         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
5536         case OP_QUERY: *tempcode = OP_POSQUERY; break;
5537         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
5538
5539         case OP_STARI:  *tempcode = OP_POSSTARI; break;
5540         case OP_PLUSI:  *tempcode = OP_POSPLUSI; break;
5541         case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
5542         case OP_UPTOI:  *tempcode = OP_POSUPTOI; break;
5543
5544         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
5545         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
5546         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
5547         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
5548
5549         case OP_NOTSTARI:  *tempcode = OP_NOTPOSSTARI; break;
5550         case OP_NOTPLUSI:  *tempcode = OP_NOTPOSPLUSI; break;
5551         case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
5552         case OP_NOTUPTOI:  *tempcode = OP_NOTPOSUPTOI; break;
5553
5554         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
5555         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
5556         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
5557         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
5558
5559         /* Because we are moving code along, we must ensure that any
5560         pending recursive references are updated. */
5561
5562         default:
5563         *code = OP_END;
5564         adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
5565         memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
5566         code += 1 + LINK_SIZE;
5567         len += 1 + LINK_SIZE;
5568         tempcode[0] = OP_ONCE;
5569         *code++ = OP_KET;
5570         PUTINC(code, 0, len);
5571         PUT(tempcode, 1, len);
5572         break;
5573         }
5574       }
5575
5576     /* In all cases we no longer have a previous item. We also set the
5577     "follows varying string" flag for subsequently encountered reqchars if
5578     it isn't already set and we have just passed a varying length item. */
5579
5580     END_REPEAT:
5581     previous = NULL;
5582     cd->req_varyopt |= reqvary;
5583     break;
5584
5585
5586     /* ===================================================================*/
5587     /* Start of nested parenthesized sub-expression, or comment or lookahead or
5588     lookbehind or option setting or condition or all the other extended
5589     parenthesis forms.  */
5590
5591     case CHAR_LEFT_PARENTHESIS:
5592     newoptions = options;
5593     skipbytes = 0;
5594     bravalue = OP_CBRA;
5595     save_hwm = cd->hwm;
5596     reset_bracount = FALSE;
5597
5598     /* First deal with various "verbs" that can be introduced by '*'. */
5599
5600     ptr++;
5601     if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
5602          || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
5603       {
5604       int i, namelen;
5605       int arglen = 0;
5606       const char *vn = verbnames;
5607       const pcre_uchar *name = ptr + 1;
5608       const pcre_uchar *arg = NULL;
5609       previous = NULL;
5610       ptr++;
5611       while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
5612       namelen = (int)(ptr - name);
5613
5614       /* It appears that Perl allows any characters whatsoever, other than
5615       a closing parenthesis, to appear in arguments, so we no longer insist on
5616       letters, digits, and underscores. */
5617
5618       if (*ptr == CHAR_COLON)
5619         {
5620         arg = ++ptr;
5621         while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5622         arglen = (int)(ptr - arg);
5623         if (arglen > (int)MAX_MARK)
5624           {
5625           *errorcodeptr = ERR75;
5626           goto FAILED;
5627           }
5628         }
5629
5630       if (*ptr != CHAR_RIGHT_PARENTHESIS)
5631         {
5632         *errorcodeptr = ERR60;
5633         goto FAILED;
5634         }
5635
5636       /* Scan the table of verb names */
5637
5638       for (i = 0; i < verbcount; i++)
5639         {
5640         if (namelen == verbs[i].len &&
5641             STRNCMP_UC_C8(name, vn, namelen) == 0)
5642           {
5643           /* Check for open captures before ACCEPT and convert it to
5644           ASSERT_ACCEPT if in an assertion. */
5645
5646           if (verbs[i].op == OP_ACCEPT)
5647             {
5648             open_capitem *oc;
5649             if (arglen != 0)
5650               {
5651               *errorcodeptr = ERR59;
5652               goto FAILED;
5653               }
5654             cd->had_accept = TRUE;
5655             for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5656               {
5657               *code++ = OP_CLOSE;
5658               PUT2INC(code, 0, oc->number);
5659               }
5660             *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5661
5662             /* Do not set firstchar after *ACCEPT */
5663             if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
5664             }
5665
5666           /* Handle other cases with/without an argument */
5667
5668           else if (arglen == 0)
5669             {
5670             if (verbs[i].op < 0)   /* Argument is mandatory */
5671               {
5672               *errorcodeptr = ERR66;
5673               goto FAILED;
5674               }
5675             *code = verbs[i].op;
5676             if (*code++ == OP_THEN) cd->external_flags |= PCRE_HASTHEN;
5677             }
5678
5679           else
5680             {
5681             if (verbs[i].op_arg < 0)   /* Argument is forbidden */
5682               {
5683               *errorcodeptr = ERR59;
5684               goto FAILED;
5685               }
5686             *code = verbs[i].op_arg;
5687             if (*code++ == OP_THEN_ARG) cd->external_flags |= PCRE_HASTHEN;
5688             *code++ = arglen;
5689             memcpy(code, arg, IN_UCHARS(arglen));
5690             code += arglen;
5691             *code++ = 0;
5692             }
5693
5694           break;  /* Found verb, exit loop */
5695           }
5696
5697         vn += verbs[i].len + 1;
5698         }
5699
5700       if (i < verbcount) continue;    /* Successfully handled a verb */
5701       *errorcodeptr = ERR60;          /* Verb not recognized */
5702       goto FAILED;
5703       }
5704
5705     /* Deal with the extended parentheses; all are introduced by '?', and the
5706     appearance of any of them means that this is not a capturing group. */
5707
5708     else if (*ptr == CHAR_QUESTION_MARK)
5709       {
5710       int i, set, unset, namelen;
5711       int *optset;
5712       const pcre_uchar *name;
5713       pcre_uchar *slot;
5714
5715       switch (*(++ptr))
5716         {
5717         case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
5718         ptr++;
5719         while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5720         if (*ptr == 0)
5721           {
5722           *errorcodeptr = ERR18;
5723           goto FAILED;
5724           }
5725         continue;
5726
5727
5728         /* ------------------------------------------------------------ */
5729         case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
5730         reset_bracount = TRUE;
5731         /* Fall through */
5732
5733         /* ------------------------------------------------------------ */
5734         case CHAR_COLON:          /* Non-capturing bracket */
5735         bravalue = OP_BRA;
5736         ptr++;
5737         break;
5738
5739
5740         /* ------------------------------------------------------------ */
5741         case CHAR_LEFT_PARENTHESIS:
5742         bravalue = OP_COND;       /* Conditional group */
5743
5744         /* A condition can be an assertion, a number (referring to a numbered
5745         group), a name (referring to a named group), or 'R', referring to
5746         recursion. R<digits> and R&name are also permitted for recursion tests.
5747
5748         There are several syntaxes for testing a named group: (?(name)) is used
5749         by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
5750
5751         There are two unfortunate ambiguities, caused by history. (a) 'R' can
5752         be the recursive thing or the name 'R' (and similarly for 'R' followed
5753         by digits), and (b) a number could be a name that consists of digits.
5754         In both cases, we look for a name first; if not found, we try the other
5755         cases. */
5756
5757         /* For conditions that are assertions, check the syntax, and then exit
5758         the switch. This will take control down to where bracketed groups,
5759         including assertions, are processed. */
5760
5761         if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
5762             ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
5763           break;
5764
5765         /* Most other conditions use OP_CREF (a couple change to OP_RREF
5766         below), and all need to skip 1+IMM2_SIZE bytes at the start of the group. */
5767
5768         code[1+LINK_SIZE] = OP_CREF;
5769         skipbytes = 1+IMM2_SIZE;
5770         refsign = -1;
5771
5772         /* Check for a test for recursion in a named group. */
5773
5774         if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
5775           {
5776           terminator = -1;
5777           ptr += 2;
5778           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
5779           }
5780
5781         /* Check for a test for a named group's having been set, using the Perl
5782         syntax (?(<name>) or (?('name') */
5783
5784         else if (ptr[1] == CHAR_LESS_THAN_SIGN)
5785           {
5786           terminator = CHAR_GREATER_THAN_SIGN;
5787           ptr++;
5788           }
5789         else if (ptr[1] == CHAR_APOSTROPHE)
5790           {
5791           terminator = CHAR_APOSTROPHE;
5792           ptr++;
5793           }
5794         else
5795           {
5796           terminator = 0;
5797           if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
5798           }
5799
5800         /* We now expect to read a name; anything else is an error */
5801
5802         if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
5803           {
5804           ptr += 1;  /* To get the right offset */
5805           *errorcodeptr = ERR28;
5806           goto FAILED;
5807           }
5808
5809         /* Read the name, but also get it as a number if it's all digits */
5810
5811         recno = 0;
5812         name = ++ptr;
5813         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
5814           {
5815           if (recno >= 0)
5816             recno = (IS_DIGIT(*ptr))? recno * 10 + *ptr - CHAR_0 : -1;
5817           ptr++;
5818           }
5819         namelen = (int)(ptr - name);
5820
5821         if ((terminator > 0 && *ptr++ != terminator) ||
5822             *ptr++ != CHAR_RIGHT_PARENTHESIS)
5823           {
5824           ptr--;      /* Error offset */
5825           *errorcodeptr = ERR26;
5826           goto FAILED;
5827           }
5828
5829         /* Do no further checking in the pre-compile phase. */
5830
5831         if (lengthptr != NULL) break;
5832
5833         /* In the real compile we do the work of looking for the actual
5834         reference. If the string started with "+" or "-" we require the rest to
5835         be digits, in which case recno will be set. */
5836
5837         if (refsign > 0)
5838           {
5839           if (recno <= 0)
5840             {
5841             *errorcodeptr = ERR58;
5842             goto FAILED;
5843             }
5844           recno = (refsign == CHAR_MINUS)?
5845             cd->bracount - recno + 1 : recno +cd->bracount;
5846           if (recno <= 0 || recno > cd->final_bracount)
5847             {
5848             *errorcodeptr = ERR15;
5849             goto FAILED;
5850             }
5851           PUT2(code, 2+LINK_SIZE, recno);
5852           break;
5853           }
5854
5855         /* Otherwise (did not start with "+" or "-"), start by looking for the
5856         name. If we find a name, add one to the opcode to change OP_CREF or
5857         OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
5858         except they record that the reference was originally to a name. The
5859         information is used to check duplicate names. */
5860
5861         slot = cd->name_table;
5862         for (i = 0; i < cd->names_found; i++)
5863           {
5864           if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
5865           slot += cd->name_entry_size;
5866           }
5867
5868         /* Found a previous named subpattern */
5869
5870         if (i < cd->names_found)
5871           {
5872           recno = GET2(slot, 0);
5873           PUT2(code, 2+LINK_SIZE, recno);
5874           code[1+LINK_SIZE]++;
5875           }
5876
5877         /* Search the pattern for a forward reference */
5878
5879         else if ((i = find_parens(cd, name, namelen,
5880                         (options & PCRE_EXTENDED) != 0, utf)) > 0)
5881           {
5882           PUT2(code, 2+LINK_SIZE, i);
5883           code[1+LINK_SIZE]++;
5884           }
5885
5886         /* If terminator == 0 it means that the name followed directly after
5887         the opening parenthesis [e.g. (?(abc)...] and in this case there are
5888         some further alternatives to try. For the cases where terminator != 0
5889         [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
5890         now checked all the possibilities, so give an error. */
5891
5892         else if (terminator != 0)
5893           {
5894           *errorcodeptr = ERR15;
5895           goto FAILED;
5896           }
5897
5898         /* Check for (?(R) for recursion. Allow digits after R to specify a
5899         specific group number. */
5900
5901         else if (*name == CHAR_R)
5902           {
5903           recno = 0;
5904           for (i = 1; i < namelen; i++)
5905             {
5906             if (!IS_DIGIT(name[i]))
5907               {
5908               *errorcodeptr = ERR15;
5909               goto FAILED;
5910               }
5911             recno = recno * 10 + name[i] - CHAR_0;
5912             }
5913           if (recno == 0) recno = RREF_ANY;
5914           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
5915           PUT2(code, 2+LINK_SIZE, recno);
5916           }
5917
5918         /* Similarly, check for the (?(DEFINE) "condition", which is always
5919         false. */
5920
5921         else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
5922           {
5923           code[1+LINK_SIZE] = OP_DEF;
5924           skipbytes = 1;
5925           }
5926
5927         /* Check for the "name" actually being a subpattern number. We are
5928         in the second pass here, so final_bracount is set. */
5929
5930         else if (recno > 0 && recno <= cd->final_bracount)
5931           {
5932           PUT2(code, 2+LINK_SIZE, recno);
5933           }
5934
5935         /* Either an unidentified subpattern, or a reference to (?(0) */
5936
5937         else
5938           {
5939           *errorcodeptr = (recno == 0)? ERR35: ERR15;
5940           goto FAILED;
5941           }
5942         break;
5943
5944
5945         /* ------------------------------------------------------------ */
5946         case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
5947         bravalue = OP_ASSERT;
5948         cd->assert_depth += 1;
5949         ptr++;
5950         break;
5951
5952
5953         /* ------------------------------------------------------------ */
5954         case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
5955         ptr++;
5956         if (*ptr == CHAR_RIGHT_PARENTHESIS)    /* Optimize (?!) */
5957           {
5958           *code++ = OP_FAIL;
5959           previous = NULL;
5960           continue;
5961           }
5962         bravalue = OP_ASSERT_NOT;
5963         cd->assert_depth += 1;
5964         break;
5965
5966
5967         /* ------------------------------------------------------------ */
5968         case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
5969         switch (ptr[1])
5970           {
5971           case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
5972           bravalue = OP_ASSERTBACK;
5973           cd->assert_depth += 1;
5974           ptr += 2;
5975           break;
5976
5977           case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
5978           bravalue = OP_ASSERTBACK_NOT;
5979           cd->assert_depth += 1;
5980           ptr += 2;
5981           break;
5982
5983           default:                /* Could be name define, else bad */
5984           if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
5985             goto DEFINE_NAME;
5986           ptr++;                  /* Correct offset for error */
5987           *errorcodeptr = ERR24;
5988           goto FAILED;
5989           }
5990         break;
5991
5992
5993         /* ------------------------------------------------------------ */
5994         case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
5995         bravalue = OP_ONCE;
5996         ptr++;
5997         break;
5998
5999
6000         /* ------------------------------------------------------------ */
6001         case CHAR_C:                 /* Callout - may be followed by digits; */
6002         previous_callout = code;     /* Save for later completion */
6003         after_manual_callout = 1;    /* Skip one item before completing */
6004         *code++ = OP_CALLOUT;
6005           {
6006           int n = 0;
6007           ptr++;
6008           while(IS_DIGIT(*ptr))
6009             n = n * 10 + *ptr++ - CHAR_0;
6010           if (*ptr != CHAR_RIGHT_PARENTHESIS)
6011             {
6012             *errorcodeptr = ERR39;
6013             goto FAILED;
6014             }
6015           if (n > 255)
6016             {
6017             *errorcodeptr = ERR38;
6018             goto FAILED;
6019             }
6020           *code++ = n;
6021           PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
6022           PUT(code, LINK_SIZE, 0);                          /* Default length */
6023           code += 2 * LINK_SIZE;
6024           }
6025         previous = NULL;
6026         continue;
6027
6028
6029         /* ------------------------------------------------------------ */
6030         case CHAR_P:              /* Python-style named subpattern handling */
6031         if (*(++ptr) == CHAR_EQUALS_SIGN ||
6032             *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
6033           {
6034           is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
6035           terminator = CHAR_RIGHT_PARENTHESIS;
6036           goto NAMED_REF_OR_RECURSE;
6037           }
6038         else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
6039           {
6040           *errorcodeptr = ERR41;
6041           goto FAILED;
6042           }
6043         /* Fall through to handle (?P< as (?< is handled */
6044
6045
6046         /* ------------------------------------------------------------ */
6047         DEFINE_NAME:    /* Come here from (?< handling */
6048         case CHAR_APOSTROPHE:
6049           {
6050           terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
6051             CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6052           name = ++ptr;
6053
6054           while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6055           namelen = (int)(ptr - name);
6056
6057           /* In the pre-compile phase, just do a syntax check. */
6058
6059           if (lengthptr != NULL)
6060             {
6061             if (*ptr != terminator)
6062               {
6063               *errorcodeptr = ERR42;
6064               goto FAILED;
6065               }
6066             if (cd->names_found >= MAX_NAME_COUNT)
6067               {
6068               *errorcodeptr = ERR49;
6069               goto FAILED;
6070               }
6071             if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
6072               {
6073               cd->name_entry_size = namelen + IMM2_SIZE + 1;
6074               if (namelen > MAX_NAME_SIZE)
6075                 {
6076                 *errorcodeptr = ERR48;
6077                 goto FAILED;
6078                 }
6079               }
6080             }
6081
6082           /* In the real compile, create the entry in the table, maintaining
6083           alphabetical order. Duplicate names for different numbers are
6084           permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
6085           number are always OK. (An existing number can be re-used if (?|
6086           appears in the pattern.) In either event, a duplicate name results in
6087           a duplicate entry in the table, even if the number is the same. This
6088           is because the number of names, and hence the table size, is computed
6089           in the pre-compile, and it affects various numbers and pointers which
6090           would all have to be modified, and the compiled code moved down, if
6091           duplicates with the same number were omitted from the table. This
6092           doesn't seem worth the hassle. However, *different* names for the
6093           same number are not permitted. */
6094
6095           else
6096             {
6097             BOOL dupname = FALSE;
6098             slot = cd->name_table;
6099
6100             for (i = 0; i < cd->names_found; i++)
6101               {
6102               int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(namelen));
6103               if (crc == 0)
6104                 {
6105                 if (slot[IMM2_SIZE+namelen] == 0)
6106                   {
6107                   if (GET2(slot, 0) != cd->bracount + 1 &&
6108                       (options & PCRE_DUPNAMES) == 0)
6109                     {
6110                     *errorcodeptr = ERR43;
6111                     goto FAILED;
6112                     }
6113                   else dupname = TRUE;
6114                   }
6115                 else crc = -1;      /* Current name is a substring */
6116                 }
6117
6118               /* Make space in the table and break the loop for an earlier
6119               name. For a duplicate or later name, carry on. We do this for
6120               duplicates so that in the simple case (when (?| is not used) they
6121               are in order of their numbers. */
6122
6123               if (crc < 0)
6124                 {
6125                 memmove(slot + cd->name_entry_size, slot,
6126                   IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
6127                 break;
6128                 }
6129
6130               /* Continue the loop for a later or duplicate name */
6131
6132               slot += cd->name_entry_size;
6133               }
6134
6135             /* For non-duplicate names, check for a duplicate number before
6136             adding the new name. */
6137
6138             if (!dupname)
6139               {
6140               pcre_uchar *cslot = cd->name_table;
6141               for (i = 0; i < cd->names_found; i++)
6142                 {
6143                 if (cslot != slot)
6144                   {
6145                   if (GET2(cslot, 0) == cd->bracount + 1)
6146                     {
6147                     *errorcodeptr = ERR65;
6148                     goto FAILED;
6149                     }
6150                   }
6151                 else i--;
6152                 cslot += cd->name_entry_size;
6153                 }
6154               }
6155
6156             PUT2(slot, 0, cd->bracount + 1);
6157             memcpy(slot + IMM2_SIZE, name, IN_UCHARS(namelen));
6158             slot[IMM2_SIZE + namelen] = 0;
6159             }
6160           }
6161
6162         /* In both pre-compile and compile, count the number of names we've
6163         encountered. */
6164
6165         cd->names_found++;
6166         ptr++;                    /* Move past > or ' */
6167         goto NUMBERED_GROUP;
6168
6169
6170         /* ------------------------------------------------------------ */
6171         case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
6172         terminator = CHAR_RIGHT_PARENTHESIS;
6173         is_recurse = TRUE;
6174         /* Fall through */
6175
6176         /* We come here from the Python syntax above that handles both
6177         references (?P=name) and recursion (?P>name), as well as falling
6178         through from the Perl recursion syntax (?&name). We also come here from
6179         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
6180         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
6181
6182         NAMED_REF_OR_RECURSE:
6183         name = ++ptr;
6184         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6185         namelen = (int)(ptr - name);
6186
6187         /* In the pre-compile phase, do a syntax check. We used to just set
6188         a dummy reference number, because it was not used in the first pass.
6189         However, with the change of recursive back references to be atomic,
6190         we have to look for the number so that this state can be identified, as
6191         otherwise the incorrect length is computed. If it's not a backwards
6192         reference, the dummy number will do. */
6193
6194         if (lengthptr != NULL)
6195           {
6196           const pcre_uchar *temp;
6197
6198           if (namelen == 0)
6199             {
6200             *errorcodeptr = ERR62;
6201             goto FAILED;
6202             }
6203           if (*ptr != terminator)
6204             {
6205             *errorcodeptr = ERR42;
6206             goto FAILED;
6207             }
6208           if (namelen > MAX_NAME_SIZE)
6209             {
6210             *errorcodeptr = ERR48;
6211             goto FAILED;
6212             }
6213
6214           /* The name table does not exist in the first pass, so we cannot
6215           do a simple search as in the code below. Instead, we have to scan the
6216           pattern to find the number. It is important that we scan it only as
6217           far as we have got because the syntax of named subpatterns has not
6218           been checked for the rest of the pattern, and find_parens() assumes
6219           correct syntax. In any case, it's a waste of resources to scan
6220           further. We stop the scan at the current point by temporarily
6221           adjusting the value of cd->end_pattern. */
6222
6223           temp = cd->end_pattern;
6224           cd->end_pattern = ptr;
6225           recno = find_parens(cd, name, namelen,
6226             (options & PCRE_EXTENDED) != 0, utf);
6227           cd->end_pattern = temp;
6228           if (recno < 0) recno = 0;    /* Forward ref; set dummy number */
6229           }
6230
6231         /* In the real compile, seek the name in the table. We check the name
6232         first, and then check that we have reached the end of the name in the
6233         table. That way, if the name is longer than any in the table,
6234         the comparison will fail without reading beyond the table entry. */
6235
6236         else
6237           {
6238           slot = cd->name_table;
6239           for (i = 0; i < cd->names_found; i++)
6240             {
6241             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
6242                 slot[IMM2_SIZE+namelen] == 0)
6243               break;
6244             slot += cd->name_entry_size;
6245             }
6246
6247           if (i < cd->names_found)         /* Back reference */
6248             {
6249             recno = GET2(slot, 0);
6250             }
6251           else if ((recno =                /* Forward back reference */
6252                     find_parens(cd, name, namelen,
6253                       (options & PCRE_EXTENDED) != 0, utf)) <= 0)
6254             {
6255             *errorcodeptr = ERR15;
6256             goto FAILED;
6257             }
6258           }
6259
6260         /* In both phases, we can now go to the code that handles numerical
6261         recursion or backreferences. */
6262
6263         if (is_recurse) goto HANDLE_RECURSION;
6264           else goto HANDLE_REFERENCE;
6265
6266
6267         /* ------------------------------------------------------------ */
6268         case CHAR_R:              /* Recursion */
6269         ptr++;                    /* Same as (?0)      */
6270         /* Fall through */
6271
6272
6273         /* ------------------------------------------------------------ */
6274         case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
6275         case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
6276         case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
6277           {
6278           const pcre_uchar *called;
6279           terminator = CHAR_RIGHT_PARENTHESIS;
6280
6281           /* Come here from the \g<...> and \g'...' code (Oniguruma
6282           compatibility). However, the syntax has been checked to ensure that
6283           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
6284           be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
6285           ever be taken. */
6286
6287           HANDLE_NUMERICAL_RECURSION:
6288
6289           if ((refsign = *ptr) == CHAR_PLUS)
6290             {
6291             ptr++;
6292             if (!IS_DIGIT(*ptr))
6293               {
6294               *errorcodeptr = ERR63;
6295               goto FAILED;
6296               }
6297             }
6298           else if (refsign == CHAR_MINUS)
6299             {
6300             if (!IS_DIGIT(ptr[1]))
6301               goto OTHER_CHAR_AFTER_QUERY;
6302             ptr++;
6303             }
6304
6305           recno = 0;
6306           while(IS_DIGIT(*ptr))
6307             recno = recno * 10 + *ptr++ - CHAR_0;
6308
6309           if (*ptr != terminator)
6310             {
6311             *errorcodeptr = ERR29;
6312             goto FAILED;
6313             }
6314
6315           if (refsign == CHAR_MINUS)
6316             {
6317             if (recno == 0)
6318               {
6319               *errorcodeptr = ERR58;
6320               goto FAILED;
6321               }
6322             recno = cd->bracount - recno + 1;
6323             if (recno <= 0)
6324               {
6325               *errorcodeptr = ERR15;
6326               goto FAILED;
6327               }
6328             }
6329           else if (refsign == CHAR_PLUS)
6330             {
6331             if (recno == 0)
6332               {
6333               *errorcodeptr = ERR58;
6334               goto FAILED;
6335               }
6336             recno += cd->bracount;
6337             }
6338
6339           /* Come here from code above that handles a named recursion */
6340
6341           HANDLE_RECURSION:
6342
6343           previous = code;
6344           called = cd->start_code;
6345
6346           /* When we are actually compiling, find the bracket that is being
6347           referenced. Temporarily end the regex in case it doesn't exist before
6348           this point. If we end up with a forward reference, first check that
6349           the bracket does occur later so we can give the error (and position)
6350           now. Then remember this forward reference in the workspace so it can
6351           be filled in at the end. */
6352
6353           if (lengthptr == NULL)
6354             {
6355             *code = OP_END;
6356             if (recno != 0)
6357               called = PRIV(find_bracket)(cd->start_code, utf, recno);
6358
6359             /* Forward reference */
6360
6361             if (called == NULL)
6362               {
6363               if (find_parens(cd, NULL, recno,
6364                     (options & PCRE_EXTENDED) != 0, utf) < 0)
6365                 {
6366                 *errorcodeptr = ERR15;
6367                 goto FAILED;
6368                 }
6369
6370               /* Fudge the value of "called" so that when it is inserted as an
6371               offset below, what is actually inserted is the reference number
6372               of the group. Then remember the forward reference. */
6373
6374               called = cd->start_code + recno;
6375               if (cd->hwm >= cd->start_workspace + cd->workspace_size -
6376                   WORK_SIZE_SAFETY_MARGIN)
6377                 {
6378                 *errorcodeptr = expand_workspace(cd);
6379                 if (*errorcodeptr != 0) goto FAILED;
6380                 }
6381               PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
6382               }
6383
6384             /* If not a forward reference, and the subpattern is still open,
6385             this is a recursive call. We check to see if this is a left
6386             recursion that could loop for ever, and diagnose that case. We
6387             must not, however, do this check if we are in a conditional
6388             subpattern because the condition might be testing for recursion in
6389             a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
6390             Forever loops are also detected at runtime, so those that occur in
6391             conditional subpatterns will be picked up then. */
6392
6393             else if (GET(called, 1) == 0 && cond_depth <= 0 &&
6394                      could_be_empty(called, code, bcptr, utf, cd))
6395               {
6396               *errorcodeptr = ERR40;
6397               goto FAILED;
6398               }
6399             }
6400
6401           /* Insert the recursion/subroutine item. It does not have a set first
6402           character (relevant if it is repeated, because it will then be
6403           wrapped with ONCE brackets). */
6404
6405           *code = OP_RECURSE;
6406           PUT(code, 1, (int)(called - cd->start_code));
6407           code += 1 + LINK_SIZE;
6408           groupsetfirstchar = FALSE;
6409           }
6410
6411         /* Can't determine a first byte now */
6412
6413         if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
6414         continue;
6415
6416
6417         /* ------------------------------------------------------------ */
6418         default:              /* Other characters: check option setting */
6419         OTHER_CHAR_AFTER_QUERY:
6420         set = unset = 0;
6421         optset = &set;
6422
6423         while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
6424           {
6425           switch (*ptr++)
6426             {
6427             case CHAR_MINUS: optset = &unset; break;
6428
6429             case CHAR_J:    /* Record that it changed in the external options */
6430             *optset |= PCRE_DUPNAMES;
6431             cd->external_flags |= PCRE_JCHANGED;
6432             break;
6433
6434             case CHAR_i: *optset |= PCRE_CASELESS; break;
6435             case CHAR_m: *optset |= PCRE_MULTILINE; break;
6436             case CHAR_s: *optset |= PCRE_DOTALL; break;
6437             case CHAR_x: *optset |= PCRE_EXTENDED; break;
6438             case CHAR_U: *optset |= PCRE_UNGREEDY; break;
6439             case CHAR_X: *optset |= PCRE_EXTRA; break;
6440
6441             default:  *errorcodeptr = ERR12;
6442                       ptr--;    /* Correct the offset */
6443                       goto FAILED;
6444             }
6445           }
6446
6447         /* Set up the changed option bits, but don't change anything yet. */
6448
6449         newoptions = (options | set) & (~unset);
6450
6451         /* If the options ended with ')' this is not the start of a nested
6452         group with option changes, so the options change at this level. If this
6453         item is right at the start of the pattern, the options can be
6454         abstracted and made external in the pre-compile phase, and ignored in
6455         the compile phase. This can be helpful when matching -- for instance in
6456         caseless checking of required bytes.
6457
6458         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
6459         definitely *not* at the start of the pattern because something has been
6460         compiled. In the pre-compile phase, however, the code pointer can have
6461         that value after the start, because it gets reset as code is discarded
6462         during the pre-compile. However, this can happen only at top level - if
6463         we are within parentheses, the starting BRA will still be present. At
6464         any parenthesis level, the length value can be used to test if anything
6465         has been compiled at that level. Thus, a test for both these conditions
6466         is necessary to ensure we correctly detect the start of the pattern in
6467         both phases.
6468
6469         If we are not at the pattern start, reset the greedy defaults and the
6470         case value for firstchar and reqchar. */
6471
6472         if (*ptr == CHAR_RIGHT_PARENTHESIS)
6473           {
6474           if (code == cd->start_code + 1 + LINK_SIZE &&
6475                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
6476             {
6477             cd->external_options = newoptions;
6478             }
6479           else
6480             {
6481             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
6482             greedy_non_default = greedy_default ^ 1;
6483             req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
6484             }
6485
6486           /* Change options at this level, and pass them back for use
6487           in subsequent branches. */
6488
6489           *optionsptr = options = newoptions;
6490           previous = NULL;       /* This item can't be repeated */
6491           continue;              /* It is complete */
6492           }
6493
6494         /* If the options ended with ':' we are heading into a nested group
6495         with possible change of options. Such groups are non-capturing and are
6496         not assertions of any kind. All we need to do is skip over the ':';
6497         the newoptions value is handled below. */
6498
6499         bravalue = OP_BRA;
6500         ptr++;
6501         }     /* End of switch for character following (? */
6502       }       /* End of (? handling */
6503
6504     /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
6505     is set, all unadorned brackets become non-capturing and behave like (?:...)
6506     brackets. */
6507
6508     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
6509       {
6510       bravalue = OP_BRA;
6511       }
6512
6513     /* Else we have a capturing group. */
6514
6515     else
6516       {
6517       NUMBERED_GROUP:
6518       cd->bracount += 1;
6519       PUT2(code, 1+LINK_SIZE, cd->bracount);
6520       skipbytes = IMM2_SIZE;
6521       }
6522
6523     /* Process nested bracketed regex. Assertions used not to be repeatable,
6524     but this was changed for Perl compatibility, so all kinds can now be
6525     repeated. We copy code into a non-register variable (tempcode) in order to
6526     be able to pass its address because some compilers complain otherwise. */
6527
6528     previous = code;                      /* For handling repetition */
6529     *code = bravalue;
6530     tempcode = code;
6531     tempreqvary = cd->req_varyopt;        /* Save value before bracket */
6532     tempbracount = cd->bracount;          /* Save value before bracket */
6533     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
6534
6535     if (!compile_regex(
6536          newoptions,                      /* The complete new option state */
6537          &tempcode,                       /* Where to put code (updated) */
6538          &ptr,                            /* Input pointer (updated) */
6539          errorcodeptr,                    /* Where to put an error message */
6540          (bravalue == OP_ASSERTBACK ||
6541           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
6542          reset_bracount,                  /* True if (?| group */
6543          skipbytes,                       /* Skip over bracket number */
6544          cond_depth +
6545            ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
6546          &subfirstchar,                   /* For possible first char */
6547          &subreqchar,                     /* For possible last char */
6548          bcptr,                           /* Current branch chain */
6549          cd,                              /* Tables block */
6550          (lengthptr == NULL)? NULL :      /* Actual compile phase */
6551            &length_prevgroup              /* Pre-compile phase */
6552          ))
6553       goto FAILED;
6554
6555     /* If this was an atomic group and there are no capturing groups within it,
6556     generate OP_ONCE_NC instead of OP_ONCE. */
6557
6558     if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
6559       *code = OP_ONCE_NC;
6560
6561     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
6562       cd->assert_depth -= 1;
6563
6564     /* At the end of compiling, code is still pointing to the start of the
6565     group, while tempcode has been updated to point past the end of the group.
6566     The pattern pointer (ptr) is on the bracket.
6567
6568     If this is a conditional bracket, check that there are no more than
6569     two branches in the group, or just one if it's a DEFINE group. We do this
6570     in the real compile phase, not in the pre-pass, where the whole group may
6571     not be available. */
6572
6573     if (bravalue == OP_COND && lengthptr == NULL)
6574       {
6575       pcre_uchar *tc = code;
6576       int condcount = 0;
6577
6578       do {
6579          condcount++;
6580          tc += GET(tc,1);
6581          }
6582       while (*tc != OP_KET);
6583
6584       /* A DEFINE group is never obeyed inline (the "condition" is always
6585       false). It must have only one branch. */
6586
6587       if (code[LINK_SIZE+1] == OP_DEF)
6588         {
6589         if (condcount > 1)
6590           {
6591           *errorcodeptr = ERR54;
6592           goto FAILED;
6593           }
6594         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
6595         }
6596
6597       /* A "normal" conditional group. If there is just one branch, we must not
6598       make use of its firstchar or reqchar, because this is equivalent to an
6599       empty second branch. */
6600
6601       else
6602         {
6603         if (condcount > 2)
6604           {
6605           *errorcodeptr = ERR27;
6606           goto FAILED;
6607           }
6608         if (condcount == 1) subfirstchar = subreqchar = REQ_NONE;
6609         }
6610       }
6611
6612     /* Error if hit end of pattern */
6613
6614     if (*ptr != CHAR_RIGHT_PARENTHESIS)
6615       {
6616       *errorcodeptr = ERR14;
6617       goto FAILED;
6618       }
6619
6620     /* In the pre-compile phase, update the length by the length of the group,
6621     less the brackets at either end. Then reduce the compiled code to just a
6622     set of non-capturing brackets so that it doesn't use much memory if it is
6623     duplicated by a quantifier.*/
6624
6625     if (lengthptr != NULL)
6626       {
6627       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6628         {
6629         *errorcodeptr = ERR20;
6630         goto FAILED;
6631         }
6632       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6633       code++;   /* This already contains bravalue */
6634       PUTINC(code, 0, 1 + LINK_SIZE);
6635       *code++ = OP_KET;
6636       PUTINC(code, 0, 1 + LINK_SIZE);
6637       break;    /* No need to waste time with special character handling */
6638       }
6639
6640     /* Otherwise update the main code pointer to the end of the group. */
6641
6642     code = tempcode;
6643
6644     /* For a DEFINE group, required and first character settings are not
6645     relevant. */
6646
6647     if (bravalue == OP_DEF) break;
6648
6649     /* Handle updating of the required and first characters for other types of
6650     group. Update for normal brackets of all kinds, and conditions with two
6651     branches (see code above). If the bracket is followed by a quantifier with
6652     zero repeat, we have to back off. Hence the definition of zeroreqchar and
6653     zerofirstchar outside the main loop so that they can be accessed for the
6654     back off. */
6655
6656     zeroreqchar = reqchar;
6657     zerofirstchar = firstchar;
6658     groupsetfirstchar = FALSE;
6659
6660     if (bravalue >= OP_ONCE)
6661       {
6662       /* If we have not yet set a firstchar in this branch, take it from the
6663       subpattern, remembering that it was set here so that a repeat of more
6664       than one can replicate it as reqchar if necessary. If the subpattern has
6665       no firstchar, set "none" for the whole branch. In both cases, a zero
6666       repeat forces firstchar to "none". */
6667
6668       if (firstchar == REQ_UNSET)
6669         {
6670         if (subfirstchar >= 0)
6671           {
6672           firstchar = subfirstchar;
6673           groupsetfirstchar = TRUE;
6674           }
6675         else firstchar = REQ_NONE;
6676         zerofirstchar = REQ_NONE;
6677         }
6678
6679       /* If firstchar was previously set, convert the subpattern's firstchar
6680       into reqchar if there wasn't one, using the vary flag that was in
6681       existence beforehand. */
6682
6683       else if (subfirstchar >= 0 && subreqchar < 0)
6684         subreqchar = subfirstchar | tempreqvary;
6685
6686       /* If the subpattern set a required byte (or set a first byte that isn't
6687       really the first byte - see above), set it. */
6688
6689       if (subreqchar >= 0) reqchar = subreqchar;
6690       }
6691
6692     /* For a forward assertion, we take the reqchar, if set. This can be
6693     helpful if the pattern that follows the assertion doesn't set a different
6694     char. For example, it's useful for /(?=abcde).+/. We can't set firstchar
6695     for an assertion, however because it leads to incorrect effect for patterns
6696     such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead
6697     of a firstchar. This is overcome by a scan at the end if there's no
6698     firstchar, looking for an asserted first char. */
6699
6700     else if (bravalue == OP_ASSERT && subreqchar >= 0) reqchar = subreqchar;
6701     break;     /* End of processing '(' */
6702
6703
6704     /* ===================================================================*/
6705     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
6706     are arranged to be the negation of the corresponding OP_values in the
6707     default case when PCRE_UCP is not set. For the back references, the values
6708     are ESC_REF plus the reference number. Only back references and those types
6709     that consume a character may be repeated. We can test for values between
6710     ESC_b and ESC_Z for the latter; this may have to change if any new ones are
6711     ever created. */
6712
6713     case CHAR_BACKSLASH:
6714     tempptr = ptr;
6715     c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
6716     if (*errorcodeptr != 0) goto FAILED;
6717
6718     if (c < 0)
6719       {
6720       if (-c == ESC_Q)            /* Handle start of quoted string */
6721         {
6722         if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
6723           ptr += 2;               /* avoid empty string */
6724             else inescq = TRUE;
6725         continue;
6726         }
6727
6728       if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
6729
6730       /* For metasequences that actually match a character, we disable the
6731       setting of a first character if it hasn't already been set. */
6732
6733       if (firstchar == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
6734         firstchar = REQ_NONE;
6735
6736       /* Set values to reset to if this is followed by a zero repeat. */
6737
6738       zerofirstchar = firstchar;
6739       zeroreqchar = reqchar;
6740
6741       /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
6742       is a subroutine call by number (Oniguruma syntax). In fact, the value
6743       -ESC_g is returned only for these cases. So we don't need to check for <
6744       or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
6745       -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
6746       that is a synonym for a named back reference). */
6747
6748       if (-c == ESC_g)
6749         {
6750         const pcre_uchar *p;
6751         save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
6752         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6753           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6754
6755         /* These two statements stop the compiler from warning about possibly
6756         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
6757         fact, because we actually check for a number below, the paths that
6758         would actually be in error are never taken. */
6759
6760         skipbytes = 0;
6761         reset_bracount = FALSE;
6762
6763         /* Test for a name */
6764
6765         if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
6766           {
6767           BOOL is_a_number = TRUE;
6768           for (p = ptr + 1; *p != 0 && *p != terminator; p++)
6769             {
6770             if (!MAX_255(*p)) { is_a_number = FALSE; break; }
6771             if ((cd->ctypes[*p] & ctype_digit) == 0) is_a_number = FALSE;
6772             if ((cd->ctypes[*p] & ctype_word) == 0) break;
6773             }
6774           if (*p != terminator)
6775             {
6776             *errorcodeptr = ERR57;
6777             break;
6778             }
6779           if (is_a_number)
6780             {
6781             ptr++;
6782             goto HANDLE_NUMERICAL_RECURSION;
6783             }
6784           is_recurse = TRUE;
6785           goto NAMED_REF_OR_RECURSE;
6786           }
6787
6788         /* Test a signed number in angle brackets or quotes. */
6789
6790         p = ptr + 2;
6791         while (IS_DIGIT(*p)) p++;
6792         if (*p != terminator)
6793           {
6794           *errorcodeptr = ERR57;
6795           break;
6796           }
6797         ptr++;
6798         goto HANDLE_NUMERICAL_RECURSION;
6799         }
6800
6801       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
6802       We also support \k{name} (.NET syntax).  */
6803
6804       if (-c == ESC_k)
6805         {
6806         if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
6807           ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
6808           {
6809           *errorcodeptr = ERR69;
6810           break;
6811           }
6812         is_recurse = FALSE;
6813         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6814           CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
6815           CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
6816         goto NAMED_REF_OR_RECURSE;
6817         }
6818
6819       /* Back references are handled specially; must disable firstchar if
6820       not set to cope with cases like (?=(\w+))\1: which would otherwise set
6821       ':' later. */
6822
6823       if (-c >= ESC_REF)
6824         {
6825         open_capitem *oc;
6826         recno = -c - ESC_REF;
6827
6828         HANDLE_REFERENCE:    /* Come here from named backref handling */
6829         if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
6830         previous = code;
6831         *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
6832         PUT2INC(code, 0, recno);
6833         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
6834         if (recno > cd->top_backref) cd->top_backref = recno;
6835
6836         /* Check to see if this back reference is recursive, that is, it
6837         is inside the group that it references. A flag is set so that the
6838         group can be made atomic. */
6839
6840         for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6841           {
6842           if (oc->number == recno)
6843             {
6844             oc->flag = TRUE;
6845             break;
6846             }
6847           }
6848         }
6849
6850       /* So are Unicode property matches, if supported. */
6851
6852 #ifdef SUPPORT_UCP
6853       else if (-c == ESC_P || -c == ESC_p)
6854         {
6855         BOOL negated;
6856         int pdata;
6857         int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
6858         if (ptype < 0) goto FAILED;
6859         previous = code;
6860         *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
6861         *code++ = ptype;
6862         *code++ = pdata;
6863         }
6864 #else
6865
6866       /* If Unicode properties are not supported, \X, \P, and \p are not
6867       allowed. */
6868
6869       else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
6870         {
6871         *errorcodeptr = ERR45;
6872         goto FAILED;
6873         }
6874 #endif
6875
6876       /* For the rest (including \X when Unicode properties are supported), we
6877       can obtain the OP value by negating the escape value in the default
6878       situation when PCRE_UCP is not set. When it *is* set, we substitute
6879       Unicode property tests. Note that \b and \B do a one-character
6880       lookbehind. */
6881
6882       else
6883         {
6884         if ((-c == ESC_b || -c == ESC_B) && cd->max_lookbehind == 0)
6885           cd->max_lookbehind = 1;
6886 #ifdef SUPPORT_UCP
6887         if (-c >= ESC_DU && -c <= ESC_wu)
6888           {
6889           nestptr = ptr + 1;                   /* Where to resume */
6890           ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
6891           }
6892         else
6893 #endif
6894         /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
6895         so that it works in DFA mode and in lookbehinds. */
6896
6897           {
6898           previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
6899           *code++ = (!utf && c == -ESC_C)? OP_ALLANY : -c;
6900           }
6901         }
6902       continue;
6903       }
6904
6905     /* We have a data character whose value is in c. In UTF-8 mode it may have
6906     a value > 127. We set its representation in the length/buffer, and then
6907     handle it as a data character. */
6908
6909 #ifdef SUPPORT_UTF
6910     if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
6911       mclength = PRIV(ord2utf)(c, mcbuffer);
6912     else
6913 #endif
6914
6915      {
6916      mcbuffer[0] = c;
6917      mclength = 1;
6918      }
6919     goto ONE_CHAR;
6920
6921
6922     /* ===================================================================*/
6923     /* Handle a literal character. It is guaranteed not to be whitespace or #
6924     when the extended flag is set. If we are in UTF-8 mode, it may be a
6925     multi-byte literal character. */
6926
6927     default:
6928     NORMAL_CHAR:
6929     mclength = 1;
6930     mcbuffer[0] = c;
6931
6932 #ifdef SUPPORT_UTF
6933     if (utf && HAS_EXTRALEN(c))
6934       ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
6935 #endif
6936
6937     /* At this point we have the character's bytes in mcbuffer, and the length
6938     in mclength. When not in UTF-8 mode, the length is always 1. */
6939
6940     ONE_CHAR:
6941     previous = code;
6942     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
6943     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
6944
6945     /* Remember if \r or \n were seen */
6946
6947     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
6948       cd->external_flags |= PCRE_HASCRORLF;
6949
6950     /* Set the first and required bytes appropriately. If no previous first
6951     byte, set it from this character, but revert to none on a zero repeat.
6952     Otherwise, leave the firstchar value alone, and don't change it on a zero
6953     repeat. */
6954
6955     if (firstchar == REQ_UNSET)
6956       {
6957       zerofirstchar = REQ_NONE;
6958       zeroreqchar = reqchar;
6959
6960       /* If the character is more than one byte long, we can set firstchar
6961       only if it is not to be matched caselessly. */
6962
6963       if (mclength == 1 || req_caseopt == 0)
6964         {
6965         firstchar = mcbuffer[0] | req_caseopt;
6966         if (mclength != 1) reqchar = code[-1] | cd->req_varyopt;
6967         }
6968       else firstchar = reqchar = REQ_NONE;
6969       }
6970
6971     /* firstchar was previously set; we can set reqchar only if the length is
6972     1 or the matching is caseful. */
6973
6974     else
6975       {
6976       zerofirstchar = firstchar;
6977       zeroreqchar = reqchar;
6978       if (mclength == 1 || req_caseopt == 0)
6979         reqchar = code[-1] | req_caseopt | cd->req_varyopt;
6980       }
6981
6982     break;            /* End of literal character handling */
6983     }
6984   }                   /* end of big loop */
6985
6986
6987 /* Control never reaches here by falling through, only by a goto for all the
6988 error states. Pass back the position in the pattern so that it can be displayed
6989 to the user for diagnosing the error. */
6990
6991 FAILED:
6992 *ptrptr = ptr;
6993 return FALSE;
6994 }
6995
6996
6997
6998
6999 /*************************************************
7000 *     Compile sequence of alternatives           *
7001 *************************************************/
7002
7003 /* On entry, ptr is pointing past the bracket character, but on return it
7004 points to the closing bracket, or vertical bar, or end of string. The code
7005 variable is pointing at the byte into which the BRA operator has been stored.
7006 This function is used during the pre-compile phase when we are trying to find
7007 out the amount of memory needed, as well as during the real compile phase. The
7008 value of lengthptr distinguishes the two phases.
7009
7010 Arguments:
7011   options        option bits, including any changes for this subpattern
7012   codeptr        -> the address of the current code pointer
7013   ptrptr         -> the address of the current pattern pointer
7014   errorcodeptr   -> pointer to error code variable
7015   lookbehind     TRUE if this is a lookbehind assertion
7016   reset_bracount TRUE to reset the count for each branch
7017   skipbytes      skip this many bytes at start (for brackets and OP_COND)
7018   cond_depth     depth of nesting for conditional subpatterns
7019   firstcharptr   place to put the first required character, or a negative number
7020   reqcharptr     place to put the last required character, or a negative number
7021   bcptr          pointer to the chain of currently open branches
7022   cd             points to the data block with tables pointers etc.
7023   lengthptr      NULL during the real compile phase
7024                  points to length accumulator during pre-compile phase
7025
7026 Returns:         TRUE on success
7027 */
7028
7029 static BOOL
7030 compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
7031   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
7032   int cond_depth, pcre_int32 *firstcharptr, pcre_int32 *reqcharptr,
7033   branch_chain *bcptr, compile_data *cd, int *lengthptr)
7034 {
7035 const pcre_uchar *ptr = *ptrptr;
7036 pcre_uchar *code = *codeptr;
7037 pcre_uchar *last_branch = code;     /* start of the latest branch (BRA/ALT) */
7038 pcre_uchar *start_bracket = code;   /* the opening bracket item itself */
7039 pcre_uchar *reverse_count = NULL;   /* link field of the latest OP_REVERSE */
7040 open_capitem capitem;               /* chain entry, used only for OP_CBRA */
7041 int capnumber = 0;                  /* stays 0 unless *code is OP_CBRA */
7042 pcre_int32 firstchar, reqchar;
7043 pcre_int32 branchfirstchar, branchreqchar;
7044 int length;
7045 int orig_bracount;                  /* cd->bracount on entry */
7046 int max_bracount;                   /* highest cd->bracount after any branch */
7047 branch_chain bc;
7048 /* Link this call's branch record to those of the enclosing groups. */
7049 bc.outer = bcptr;
7050 bc.current_branch = code;
7051
7052 firstchar = reqchar = REQ_UNSET;
7053
7054 /* Accumulate the length for use in the pre-compile phase. Start with the
7055 length of the BRA and KET and any extra bytes that are required at the
7056 beginning. We accumulate in a local variable to save frequent testing of
7057 lengthptr for NULL. We cannot do this by looking at the value of code at the
7058 start and end of each alternative, because compiled items are discarded during
7059 the pre-compile phase so that the work space is not exceeded. */
7060
7061 length = 2 + 2*LINK_SIZE + skipbytes;
7062
7063 /* WARNING: If the above line is changed for any reason, you must also change
7064 the code that abstracts option settings at the start of the pattern and makes
7065 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
7066 pre-compile phase to find out whether anything has yet been compiled or not. */
7067
7068 /* If this is a capturing subpattern, add to the chain of open capturing items
7069 so that we can detect them if (*ACCEPT) is encountered. This is also used to
7070 detect groups that contain recursive back references to themselves. Note that
7071 only OP_CBRA need be tested here; changing this opcode to one of its variants,
7072 e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
7073
7074 if (*code == OP_CBRA)
7075   {
7076   capnumber = GET2(code, 1 + LINK_SIZE);
7077   capitem.number = capnumber;
7078   capitem.next = cd->open_caps;
7079   capitem.flag = FALSE;
7080   cd->open_caps = &capitem;
7081   }
7082
7083 /* Offset is set zero to mark that this bracket is still open */
7084
7085 PUT(code, 1, 0);
7086 code += 1 + LINK_SIZE + skipbytes;
7087
7088 /* Loop for each alternative branch */
7089
7090 orig_bracount = max_bracount = cd->bracount;
7091 for (;;)
7092   {
7093   /* For a (?| group, reset the capturing bracket count so that each branch
7094   uses the same numbers. */
7095
7096   if (reset_bracount) cd->bracount = orig_bracount;
7097
7098   /* Set up dummy OP_REVERSE if lookbehind assertion */
7099
7100   if (lookbehind)
7101     {
7102     *code++ = OP_REVERSE;
7103     reverse_count = code;
7104     PUTINC(code, 0, 0);
7105     length += 1 + LINK_SIZE;
7106     }
7107
7108   /* Now compile the branch; in the pre-compile phase its length gets added
7109   into the length. */
7110
7111   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
7112         &branchreqchar, &bc, cond_depth, cd,
7113         (lengthptr == NULL)? NULL : &length))
7114     {
7115     *ptrptr = ptr;     /* pass back how far the pattern was read */
7116     return FALSE;
7117     }
7118
7119   /* Keep the highest bracket count in case (?| was used and some branch
7120   has fewer than the rest. */
7121
7122   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
7123
7124   /* In the real compile phase, there is some post-processing to be done. */
7125
7126   if (lengthptr == NULL)
7127     {
7128     /* If this is the first branch, the firstchar and reqchar values for the
7129     branch become the values for the regex. */
7130
7131     if (*last_branch != OP_ALT)
7132       {
7133       firstchar = branchfirstchar;
7134       reqchar = branchreqchar;
7135       }
7136
7137     /* If this is not the first branch, the first char and reqchar have to
7138     match the values from all the previous branches, except that if the
7139     previous value for reqchar didn't have REQ_VARY set, it can still match,
7140     and we set REQ_VARY for the regex. */
7141
7142     else
7143       {
7144       /* If we previously had a firstchar, but it doesn't match the new branch,
7145       we have to abandon the firstchar for the regex, but if there was
7146       previously no reqchar, it takes on the value of the old firstchar. */
7147
7148       if (firstchar >= 0 && firstchar != branchfirstchar)
7149         {
7150         if (reqchar < 0) reqchar = firstchar;
7151         firstchar = REQ_NONE;
7152         }
7153
7154       /* If we (now or from before) have no firstchar, a firstchar from the
7155       branch becomes a reqchar if there isn't a branch reqchar. */
7156
7157       if (firstchar < 0 && branchfirstchar >= 0 && branchreqchar < 0)
7158           branchreqchar = branchfirstchar;
7159
7160       /* Now ensure that the reqchars match */
7161
7162       if ((reqchar & ~REQ_VARY) != (branchreqchar & ~REQ_VARY))
7163         reqchar = REQ_NONE;
7164       else reqchar |= branchreqchar;   /* To "or" REQ_VARY */
7165       }
7166
7167     /* If lookbehind, check that this branch matches a fixed-length string, and
7168     put the length into the OP_REVERSE item. Temporarily mark the end of the
7169     branch with OP_END. If the branch contains OP_RECURSE, the result is -3
7170     because there may be forward references that we can't check here. Set a
7171     flag to cause another lookbehind check at the end. Why not do it all at the
7172     end? Because common, erroneous checks are picked up here and the offset of
7173     the problem can be shown. */
7174
7175     if (lookbehind)
7176       {
7177       int fixed_length;
7178       *code = OP_END;
7179       fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
7180         FALSE, cd);
7181       DPRINTF(("fixed length = %d\n", fixed_length));
7182       if (fixed_length == -3)
7183         {
7184         cd->check_lookbehind = TRUE;
7185         }
7186       else if (fixed_length < 0)
7187         {
7188         *errorcodeptr = (fixed_length == -2)? ERR36 :
7189                         (fixed_length == -4)? ERR70: ERR25;
7190         *ptrptr = ptr;     /* pass back how far the pattern was read */
7191         return FALSE;
7192         }
7193       else
7194         {
7195         if (fixed_length > cd->max_lookbehind)
7196           cd->max_lookbehind = fixed_length;
7197         PUT(reverse_count, 0, fixed_length);
7198         }
7199       }
7200     }
7201
7202   /* Reached end of expression, either ')' or end of pattern. In the real
7203   compile phase, go back through the alternative branches and reverse the chain
7204   of offsets, with the field in the BRA item now becoming an offset to the
7205   first alternative. If there are no alternatives, it points to the end of the
7206   group. The length in the terminating ket is always the length of the whole
7207   bracketed item. Return leaving the pointer at the terminating char. */
7208
7209   if (*ptr != CHAR_VERTICAL_LINE)
7210     {
7211     if (lengthptr == NULL)
7212       {
7213       int branch_length = (int)(code - last_branch);
7214       do
7215         {
7216         int prev_length = GET(last_branch, 1);
7217         PUT(last_branch, 1, branch_length);
7218         branch_length = prev_length;
7219         last_branch -= branch_length;   /* back to the previous branch */
7220         }
7221       while (branch_length > 0);        /* the BRA's own offset was set to 0 */
7222       }
7223
7224     /* Fill in the ket */
7225
7226     *code = OP_KET;
7227     PUT(code, 1, (int)(code - start_bracket));
7228     code += 1 + LINK_SIZE;
7229
7230     /* If it was a capturing subpattern, check to see if it contained any
7231     recursive back references. If so, we must wrap it in atomic brackets.
7232     In any event, remove the block from the chain. */
7233
7234     if (capnumber > 0)
7235       {
7236       if (cd->open_caps->flag)
7237         {
7238         memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
7239           IN_UCHARS(code - start_bracket));
7240         *start_bracket = OP_ONCE;
7241         code += 1 + LINK_SIZE;
7242         PUT(start_bracket, 1, (int)(code - start_bracket));
7243         *code = OP_KET;
7244         PUT(code, 1, (int)(code - start_bracket));
7245         code += 1 + LINK_SIZE;
7246         length += 2 + 2*LINK_SIZE;      /* the added OP_ONCE and OP_KET */
7247         }
7248       cd->open_caps = cd->open_caps->next;
7249       }
7250
7251     /* Retain the highest bracket number, in case resetting was used. */
7252
7253     cd->bracount = max_bracount;
7254
7255     /* Set values to pass back */
7256
7257     *codeptr = code;
7258     *ptrptr = ptr;
7259     *firstcharptr = firstchar;
7260     *reqcharptr = reqchar;
7261     if (lengthptr != NULL)
7262       {
7263       if (OFLOW_MAX - *lengthptr < length)   /* the sum would pass OFLOW_MAX */
7264         {
7265         *errorcodeptr = ERR20;
7266         return FALSE;
7267         }
7268       *lengthptr += length;
7269       }
7270     return TRUE;
7271     }
7272
7273   /* Another branch follows. In the pre-compile phase, we can move the code
7274   pointer back to where it was for the start of the first branch. (That is,
7275   pretend that each branch is the only one.)
7276
7277   In the real compile phase, insert an ALT node. Its length field points back
7278   to the previous branch while the bracket remains open. At the end the chain
7279   is reversed. It's done like this so that the start of the bracket has a
7280   zero offset until it is closed, making it possible to detect recursion. */
7281
7282   if (lengthptr != NULL)
7283     {
7284     code = *codeptr + 1 + LINK_SIZE + skipbytes;
7285     length += 1 + LINK_SIZE;
7286     }
7287   else
7288     {
7289     *code = OP_ALT;
7290     PUT(code, 1, (int)(code - last_branch));
7291     bc.current_branch = last_branch = code;
7292     code += 1 + LINK_SIZE;
7293     }
7294
7295   ptr++;               /* step past the vertical bar */
7296   }
7297 /* Control never reaches here */
7298 }
7299
7300
7301
7302
7303 /*************************************************
7304 *          Check for anchored expression         *
7305 *************************************************/
7306
7307 /* Try to find out if this is an anchored regular expression. Consider each
7308 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
7309 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
7310 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
7311 be found, because ^ generates OP_CIRCM in that mode.
7312
7313 We can also consider a regex to be anchored if OP_SOM starts all its branches.
7314 This is the code for \G, which means "match at start of match position, taking
7315 into account the match offset".
7316
7317 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
7318 because that will try the rest of the pattern at all possible matching points,
7319 so there is no point trying again.... er ....
7320
7321 .... except when the .* appears inside capturing parentheses, and there is a
7322 subsequent back reference to those parentheses. We haven't enough information
7323 to catch that case precisely.
7324
7325 At first, the best we could do was to detect when .* was in capturing brackets
7326 and the highest back reference was greater than or equal to that level.
7327 However, by keeping a bitmap of the first 31 back references, we can catch some
7328 of the more common cases more precisely.
7329
7330 Arguments:
7331   code           points to start of expression (the bracket)
7332   bracket_map    a bitmap of which brackets we are inside while testing; this
7333                   handles up to substring 31; after that we just have to take
7334                   the less precise approach
7335   backref_map    the back reference bitmap
7336
7337 Returns:     TRUE or FALSE
7338 */
7339
7340 static BOOL
7341 is_anchored(const pcre_uchar *code, unsigned int bracket_map,
7342   unsigned int backref_map)
7343 {
7344 do {
7345    const pcre_uchar *scode = first_significant_code(
7346      code + PRIV(OP_lengths)[*code], FALSE);
7347    int op = *scode;
7348
7349    /* Non-capturing brackets: every alternative inside must be anchored */
7350
7351    if (op == OP_BRA  || op == OP_BRAPOS ||
7352        op == OP_SBRA || op == OP_SBRAPOS)
7353      {
7354      if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;
7355      }
7356
7357    /* Capturing brackets: groups numbered above 31 all share bit 0 of the map. */
7358    /* The shift is unsigned because 1 << 31 is undefined for a 32-bit int. */
7359    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
7360             op == OP_SCBRA || op == OP_SCBRAPOS)
7361      {
7362      int n = GET2(scode, 1+LINK_SIZE);
7363      unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
7364      if (!is_anchored(scode, new_map, backref_map)) return FALSE;
7365      }
7366
7367    /* Other brackets */
7368
7369    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC ||
7370             op == OP_COND)
7371      {
7372      if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;
7373      }
7374
7375    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
7376    it isn't in brackets that are or may be referenced. */
7377
7378    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
7379              op == OP_TYPEPOSSTAR))
7380      {
7381      if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
7382        return FALSE;
7383      }
7384
7385    /* Check for explicit anchoring */
7386
7387    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
7388    code += GET(code, 1);     /* move on to the next alternative, or the ket */
7389    }
7390 while (*code == OP_ALT);   /* Loop for each alternative */
7391 return TRUE;
7392 }
7393
7394
7395
7396 /*************************************************
7397 *         Check for starting with ^ or .*        *
7398 *************************************************/
7399
7400 /* This is called to find out if every branch starts with ^ or .* so that
7401 "first char" processing can be done to speed things up in multiline
7402 matching and for non-DOTALL patterns that start with .* (which must start at
7403 the beginning or after \n). As in the case of is_anchored() (see above), we
7404 have to take account of back references to capturing brackets that contain .*
7405 because in that case we can't make the assumption.
7406
7407 Arguments:
7408   code           points to start of expression (the bracket)
7409   bracket_map    a bitmap of which brackets we are inside while testing; this
7410                   handles up to substring 31; after that we just have to take
7411                   the less precise approach
7412   backref_map    the back reference bitmap
7413
7414 Returns:         TRUE or FALSE
7415 */
7416
7417 static BOOL
7418 is_startline(const pcre_uchar *code, unsigned int bracket_map,
7419   unsigned int backref_map)
7420 {
7421 do {
7422    const pcre_uchar *scode = first_significant_code(
7423      code + PRIV(OP_lengths)[*code], FALSE);
7424    int op = *scode;
7425
7426    /* If we are at the start of a conditional assertion group, *both* the
7427    conditional assertion *and* what follows the condition must satisfy the test
7428    for start of line. Other kinds of condition fail. Note that there may be an
7429    auto-callout at the start of a condition. */
7430
7431    if (op == OP_COND)
7432      {
7433      scode += 1 + LINK_SIZE;
7434      if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
7435      switch (*scode)
7436        {
7437        case OP_CREF:
7438        case OP_NCREF:
7439        case OP_RREF:
7440        case OP_NRREF:
7441        case OP_DEF:
7442        return FALSE;
7443
7444        default:     /* Assertion */
7445        if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
7446        do scode += GET(scode, 1); while (*scode == OP_ALT);
7447        scode += 1 + LINK_SIZE;     /* past the assertion's closing item */
7448        break;
7449        }
7450      scode = first_significant_code(scode, FALSE);
7451      op = *scode;
7452      }
7453
7454    /* Non-capturing brackets */
7455
7456    if (op == OP_BRA  || op == OP_BRAPOS ||
7457        op == OP_SBRA || op == OP_SBRAPOS)
7458      {
7459      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
7460      }
7461
7462    /* Capturing brackets: groups numbered above 31 all share bit 0 of the map. */
7463    /* The shift is unsigned because 1 << 31 is undefined for a 32-bit int. */
7464    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
7465             op == OP_SCBRA || op == OP_SCBRAPOS)
7466      {
7467      int n = GET2(scode, 1+LINK_SIZE);
7468      unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
7469      if (!is_startline(scode, new_map, backref_map)) return FALSE;
7470      }
7471
7472    /* Other brackets */
7473
7474    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC)
7475      {
7476      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
7477      }
7478
7479    /* .* means "start at start or after \n" if it isn't in brackets that
7480    may be referenced. */
7481
7482    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
7483      {
7484      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
7485      }
7486
7487    /* Check for explicit circumflex */
7488
7489    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
7490
7491    /* Move on to the next alternative */
7492
7493    code += GET(code, 1);
7494    }
7495 while (*code == OP_ALT);  /* Loop for each alternative */
7496 return TRUE;
7497 }
7498
7499
7500
7501 /*************************************************
7502 *       Check for asserted fixed first char      *
7503 *************************************************/
7504
7505 /* During compilation, the "first char" settings from forward assertions are
7506 discarded, because they can cause conflicts with actual literals that follow.
7507 However, if we end up without a first char setting for an unanchored pattern,
7508 it is worth scanning the regex to see if there is an initial asserted first
7509 char. If all branches start with the same asserted char, or with a bracket all
7510 of whose alternatives start with the same asserted char (recurse ad lib), then
7511 we return that char, otherwise -1.
7512
7513 Arguments:
7514   code       points to start of expression (the bracket)
7515   inassert   TRUE if in an assertion
7516
7517 Returns:     -1 or the fixed first char
7518 */
7519
7520 static int
7521 find_firstassertedchar(const pcre_uchar *code, BOOL inassert)
7522 {
7523 int c = -1;   /* char agreed by the branches so far; may carry REQ_CASELESS */
7524 do {
7525    int d;
7526    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
7527              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
7528    const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl,
7529      TRUE);
7530    int op = *scode;
7531
7532    switch(op)
7533      {
7534      default:
7535      return -1;
7536
7537      case OP_BRA:
7538      case OP_BRAPOS:
7539      case OP_CBRA:
7540      case OP_SCBRA:
7541      case OP_CBRAPOS:
7542      case OP_SCBRAPOS:
7543      case OP_ASSERT:
7544      case OP_ONCE:
7545      case OP_ONCE_NC:
7546      case OP_COND:
7547      if ((d = find_firstassertedchar(scode, op == OP_ASSERT)) < 0)
7548        return -1;
7549      if (c < 0) c = d; else if (c != d) return -1;
7550      break;
7551
7552      case OP_EXACT:
7553      scode += IMM2_SIZE;
7554      /* Fall through */
7555      /* Caseful items count only inside an assertion and must equal c exactly. */
7556      case OP_CHAR:
7557      case OP_PLUS:
7558      case OP_MINPLUS:
7559      case OP_POSPLUS:
7560      if (!inassert) return -1;
7561      if (c < 0) c = scode[1];
7562        else if (c != scode[1]) return -1;
7563      break;
7564
7565      case OP_EXACTI:
7566      scode += IMM2_SIZE;
7567      /* Fall through */
7568      /* Caseless items: compare with REQ_CASELESS included, since c holds the */
7569      /* flag. A caseful c must not be kept when this branch also matches the */
7570      /* other case of the same character. */
7571      case OP_CHARI:
7572      case OP_PLUSI:
7573      case OP_MINPLUSI:
7574      case OP_POSPLUSI:
7575      if (!inassert) return -1;
7576      if (c < 0) c = scode[1] | REQ_CASELESS;
7577        else if (c != (scode[1] | REQ_CASELESS)) return -1;
7578      break;
7579      }
7580
7581    code += GET(code, 1);
7582    }
7583 while (*code == OP_ALT);
7584 return c;
7585 }
7584
7585
7586
7587 /*************************************************
7588 *        Compile a Regular Expression            *
7589 *************************************************/
7590
7591 /* This function takes a string and returns a pointer to a block of store
7592 holding a compiled version of the expression. The original API for this
7593 function had no error code return variable; it is retained for backwards
7594 compatibility. The new function is given a new name.
7595
7596 Arguments:
7597   pattern       the regular expression
7598   options       various option bits
7599   errorcodeptr  pointer to error code variable (pcre_compile2() only)
7600                   can be NULL if you don't want a code value
7601   errorptr      pointer to pointer to error text
7602   erroroffset   ptr offset in pattern where error was detected
7603   tables        pointer to character tables or NULL
7604
7605 Returns:        pointer to compiled data block, or NULL on error,
7606                 with errorptr and erroroffset set
7607 */
7608
7609 #ifdef COMPILE_PCRE8
7610 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
7611 pcre_compile(const char *pattern, int options, const char **errorptr,
7612   int *erroroffset, const unsigned char *tables)
7613 #else   /* not COMPILE_PCRE8: define the pcre16_ entry point instead */
7614 PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
7615 pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
7616   int *erroroffset, const unsigned char *tables)
7617 #endif  /* COMPILE_PCRE8 */
7618 {  /* Forward to the matching ...compile2() form; the NULL is its errorcodeptr. */
7619 #ifdef COMPILE_PCRE8
7620 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
7621 #else
7622 return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
7623 #endif  /* COMPILE_PCRE8 */
7624 }
7625
7626
7627 #ifdef COMPILE_PCRE8
7628 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
7629 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
7630   const char **errorptr, int *erroroffset, const unsigned char *tables)
7631 #else
7632 PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
7633 pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
7634   const char **errorptr, int *erroroffset, const unsigned char *tables)
7635 #endif
7636 {
7637 REAL_PCRE *re;
7638 int length = 1;  /* For final END opcode */
7639 pcre_int32 firstchar, reqchar;
7640 int newline;
7641 int errorcode = 0;
7642 int skipatstart = 0;
7643 BOOL utf;
7644 size_t size;
7645 pcre_uchar *code;
7646 const pcre_uchar *codestart;
7647 const pcre_uchar *ptr;
7648 compile_data compile_block;
7649 compile_data *cd = &compile_block;
7650
7651 /* This space is used for "compiling" into during the first phase, when we are
7652 computing the amount of memory that is needed. Compiled items are thrown away
7653 as soon as possible, so that a fairly large buffer should be sufficient for
7654 this purpose. The same space is used in the second phase for remembering where
7655 to fill in forward references to subpatterns. That may overflow, in which case
7656 new memory is obtained from malloc(). */
7657
7658 pcre_uchar cworkspace[COMPILE_WORK_SIZE];
7659
7660 /* Set this early so that early errors get offset 0. */
7661
7662 ptr = (const pcre_uchar *)pattern;
7663
7664 /* We can't pass back an error message if errorptr is NULL; I guess the best we
7665 can do is just return NULL, but we can set a code value if there is a code
7666 pointer. */
7667
7668 if (errorptr == NULL)
7669   {
7670   if (errorcodeptr != NULL) *errorcodeptr = 99;
7671   return NULL;
7672   }
7673
7674 *errorptr = NULL;
7675 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
7676
7677 /* However, we can give a message for this error */
7678
7679 if (erroroffset == NULL)
7680   {
7681   errorcode = ERR16;
7682   goto PCRE_EARLY_ERROR_RETURN2;
7683   }
7684
7685 *erroroffset = 0;
7686
7687 /* Set up pointers to the individual character tables */
7688
7689 if (tables == NULL) tables = PRIV(default_tables);
7690 cd->lcc = tables + lcc_offset;
7691 cd->fcc = tables + fcc_offset;
7692 cd->cbits = tables + cbits_offset;
7693 cd->ctypes = tables + ctypes_offset;
7694
7695 /* Check that all undefined public option bits are zero */
7696
7697 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
7698   {
7699   errorcode = ERR17;
7700   goto PCRE_EARLY_ERROR_RETURN;
7701   }
7702
7703 /* Check for global one-time settings at the start of the pattern, and remember
7704 the offset for later. */
7705
7706 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
7707        ptr[skipatstart+1] == CHAR_ASTERISK)
7708   {
7709   int newnl = 0;
7710   int newbsr = 0;
7711
7712 #ifdef COMPILE_PCRE8
7713   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 5) == 0)
7714     { skipatstart += 7; options |= PCRE_UTF8; continue; }
7715 #endif
7716 #ifdef COMPILE_PCRE16
7717   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 6) == 0)
7718     { skipatstart += 8; options |= PCRE_UTF16; continue; }
7719 #endif
7720   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
7721     { skipatstart += 6; options |= PCRE_UCP; continue; }
7722   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
7723     { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
7724
7725   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0)
7726     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
7727   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3)  == 0)
7728     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
7729   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5)  == 0)
7730     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
7731   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0)
7732     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
7733   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0)
7734     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
7735
7736   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
7737     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
7738   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
7739     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
7740
7741   if (newnl != 0)
7742     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
7743   else if (newbsr != 0)
7744     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
7745   else break;
7746   }
7747
7748 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
7749 utf = (options & PCRE_UTF8) != 0;
7750
7751 /* Can't support UTF unless PCRE has been compiled to include the code. The
7752 return of an error code from PRIV(valid_utf)() is a new feature, introduced in
7753 release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
7754 not used here. */
7755
7756 #ifdef SUPPORT_UTF
7757 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
7758      (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
7759   {
7760 #ifdef COMPILE_PCRE8
7761   errorcode = ERR44;
7762 #else
7763   errorcode = ERR74;
7764 #endif
7765   goto PCRE_EARLY_ERROR_RETURN2;
7766   }
7767 #else
7768 if (utf)
7769   {
7770   errorcode = ERR32;
7771   goto PCRE_EARLY_ERROR_RETURN;
7772   }
7773 #endif
7774
7775 /* Can't support UCP unless PCRE has been compiled to include the code. */
7776
7777 #ifndef SUPPORT_UCP
7778 if ((options & PCRE_UCP) != 0)
7779   {
7780   errorcode = ERR67;
7781   goto PCRE_EARLY_ERROR_RETURN;
7782   }
7783 #endif
7784
7785 /* Check validity of \R options. */
7786
7787 if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
7788      (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
7789   {
7790   errorcode = ERR56;
7791   goto PCRE_EARLY_ERROR_RETURN;
7792   }
7793
7794 /* Handle different types of newline. The three bits give seven cases. The
7795 current code allows for fixed one- or two-byte sequences, plus "any" and
7796 "anycrlf". */
7797
7798 switch (options & PCRE_NEWLINE_BITS)
7799   {
7800   case 0: newline = NEWLINE; break;   /* Build-time default */
7801   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
7802   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
7803   case PCRE_NEWLINE_CR+
7804        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
7805   case PCRE_NEWLINE_ANY: newline = -1; break;
7806   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
7807   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
7808   }
7809
7810 if (newline == -2)
7811   {
7812   cd->nltype = NLTYPE_ANYCRLF;
7813   }
7814 else if (newline < 0)
7815   {
7816   cd->nltype = NLTYPE_ANY;
7817   }
7818 else
7819   {
7820   cd->nltype = NLTYPE_FIXED;
7821   if (newline > 255)
7822     {
7823     cd->nllen = 2;
7824     cd->nl[0] = (newline >> 8) & 255;
7825     cd->nl[1] = newline & 255;
7826     }
7827   else
7828     {
7829     cd->nllen = 1;
7830     cd->nl[0] = newline;
7831     }
7832   }
7833
7834 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
7835 references to help in deciding whether (.*) can be treated as anchored or not.
7836 */
7837
7838 cd->top_backref = 0;
7839 cd->backref_map = 0;
7840
7841 /* Reflect pattern for debugging output */
7842
7843 DPRINTF(("------------------------------------------------------------------\n"));
7844 #ifdef PCRE_DEBUG
7845 print_puchar(stdout, (PCRE_PUCHAR)pattern);
7846 #endif
7847 DPRINTF(("\n"));
7848
7849 /* Pretend to compile the pattern while actually just accumulating the length
7850 of memory required. This behaviour is triggered by passing a non-NULL final
7851 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
7852 to compile parts of the pattern into; the compiled code is discarded when it is
7853 no longer needed, so hopefully this workspace will never overflow, though there
7854 is a test for its doing so. */
7855
7856 cd->bracount = cd->final_bracount = 0;
7857 cd->names_found = 0;
7858 cd->name_entry_size = 0;
7859 cd->name_table = NULL;
7860 cd->start_code = cworkspace;
7861 cd->hwm = cworkspace;
7862 cd->start_workspace = cworkspace;
7863 cd->workspace_size = COMPILE_WORK_SIZE;
7864 cd->start_pattern = (const pcre_uchar *)pattern;
7865 cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
7866 cd->req_varyopt = 0;
7867 cd->assert_depth = 0;
7868 cd->max_lookbehind = 0;
7869 cd->external_options = options;
7870 cd->external_flags = 0;
7871 cd->open_caps = NULL;
7872
7873 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
7874 don't need to look at the result of the function here. The initial options have
7875 been put into the cd block so that they can be changed if an option setting is
7876 found within the regex right at the beginning. Bringing initial option settings
7877 outside can help speed up starting point checks. */
7878
7879 ptr += skipatstart;
7880 code = cworkspace;
7881 *code = OP_BRA;
7882 (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
7883   FALSE, 0, 0, &firstchar, &reqchar, NULL, cd, &length);
7884 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
7885
7886 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
7887   (int)(cd->hwm - cworkspace)));
7888
7889 if (length > MAX_PATTERN_SIZE)
7890   {
7891   errorcode = ERR20;
7892   goto PCRE_EARLY_ERROR_RETURN;
7893   }
7894
7895 /* Compute the size of data block needed and get it, either from malloc or
7896 externally provided function. Integer overflow should no longer be possible
7897 because nowadays we limit the maximum value of cd->names_found and
7898 cd->name_entry_size. */
7899
7900 size = sizeof(REAL_PCRE) + (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
7901 re = (REAL_PCRE *)(PUBL(malloc))(size);
7902
7903 if (re == NULL)
7904   {
7905   errorcode = ERR21;
7906   goto PCRE_EARLY_ERROR_RETURN;
7907   }
7908
7909 /* Put in the magic number, and save the sizes, initial options, internal
7910 flags, and character table pointer. NULL is used for the default character
7911 tables. The nullpad field is at the end; it's there to help in the case when a
7912 regex compiled on a system with 4-byte pointers is run on another with 8-byte
7913 pointers. */
7914
7915 re->magic_number = MAGIC_NUMBER;
7916 re->size = (int)size;
7917 re->options = cd->external_options;
7918 re->flags = cd->external_flags;
7919 re->first_char = 0;
7920 re->req_char = 0;
7921 re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
7922 re->name_entry_size = cd->name_entry_size;
7923 re->name_count = cd->names_found;
7924 re->ref_count = 0;
7925 re->tables = (tables == PRIV(default_tables))? NULL : tables;
7926 re->nullpad = NULL;
7927
7928 /* The starting points of the name/number translation table and of the code are
7929 passed around in the compile data block. The start/end pattern and initial
7930 options are already set from the pre-compile phase, as is the name_entry_size
7931 field. Reset the bracket count and the names_found field. Also reset the hwm
7932 field; this time it's used for remembering forward references to subpatterns.
7933 */
7934
7935 cd->final_bracount = cd->bracount;  /* Save for checking forward references */
7936 cd->assert_depth = 0;
7937 cd->bracount = 0;
7938 cd->max_lookbehind = 0;
7939 cd->names_found = 0;
7940 cd->name_table = (pcre_uchar *)re + re->name_table_offset;
7941 codestart = cd->name_table + re->name_entry_size * re->name_count;
7942 cd->start_code = codestart;
7943 cd->hwm = (pcre_uchar *)(cd->start_workspace);
7944 cd->req_varyopt = 0;
7945 cd->had_accept = FALSE;
7946 cd->check_lookbehind = FALSE;
7947 cd->open_caps = NULL;
7948
7949 /* Set up a starting, non-extracting bracket, then compile the expression. On
7950 error, errorcode will be set non-zero, so we don't need to look at the result
7951 of the function here. */
7952
7953 ptr = (const pcre_uchar *)pattern + skipatstart;
7954 code = (pcre_uchar *)codestart;
7955 *code = OP_BRA;
7956 (void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
7957   &firstchar, &reqchar, NULL, cd, NULL);
7958 re->top_bracket = cd->bracount;
7959 re->top_backref = cd->top_backref;
7960 re->max_lookbehind = cd->max_lookbehind;
7961 re->flags = cd->external_flags | PCRE_MODE;
7962
7963 if (cd->had_accept) reqchar = REQ_NONE;   /* Must disable after (*ACCEPT) */
7964
7965 /* If not reached end of pattern on success, there's an excess bracket. */
7966
7967 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
7968
7969 /* Fill in the terminating state and check for disastrous overflow, but
7970 if debugging, leave the test till after things are printed out. */
7971
7972 *code++ = OP_END;
7973
7974 #ifndef PCRE_DEBUG
7975 if (code - codestart > length) errorcode = ERR23;
7976 #endif
7977
7978 /* Fill in any forward references that are required. There may be repeated
7979 references; optimize for them, as searching a large regex takes time. */
7980
7981 if (cd->hwm > cd->start_workspace)
7982   {
7983   int prev_recno = -1;
7984   const pcre_uchar *groupptr = NULL;
7985   while (errorcode == 0 && cd->hwm > cd->start_workspace)
7986     {
7987     int offset, recno;
7988     cd->hwm -= LINK_SIZE;
7989     offset = GET(cd->hwm, 0);
7990     recno = GET(codestart, offset);
7991     if (recno != prev_recno)
7992       {
7993       groupptr = PRIV(find_bracket)(codestart, utf, recno);
7994       prev_recno = recno;
7995       }
7996     if (groupptr == NULL) errorcode = ERR53;
7997       else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
7998     }
7999   }
8000
8001 /* If the workspace had to be expanded, free the new memory. */
8002
8003 if (cd->workspace_size > COMPILE_WORK_SIZE)
8004   (PUBL(free))((void *)cd->start_workspace);
8005
8006 /* Give an error if there's back reference to a non-existent capturing
8007 subpattern. */
8008
8009 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
8010
8011 /* If there were any lookbehind assertions that contained OP_RECURSE
8012 (recursions or subroutine calls), a flag is set for them to be checked here,
8013 because they may contain forward references. Actual recursions can't be fixed
8014 length, but subroutine calls can. It is done like this so that those without
8015 OP_RECURSE that are not fixed length get a diagnosic with a useful offset. The
8016 exceptional ones forgo this. We scan the pattern to check that they are fixed
8017 length, and set their lengths. */
8018
8019 if (cd->check_lookbehind)
8020   {
8021   pcre_uchar *cc = (pcre_uchar *)codestart;
8022
8023   /* Loop, searching for OP_REVERSE items, and process those that do not have
8024   their length set. (Actually, it will also re-process any that have a length
8025   of zero, but that is a pathological case, and it does no harm.) When we find
8026   one, we temporarily terminate the branch it is in while we scan it. */
8027
8028   for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
8029        cc != NULL;
8030        cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
8031     {
8032     if (GET(cc, 1) == 0)
8033       {
8034       int fixed_length;
8035       pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
8036       int end_op = *be;
8037       *be = OP_END;
8038       fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
8039         cd);
8040       *be = end_op;
8041       DPRINTF(("fixed length = %d\n", fixed_length));
8042       if (fixed_length < 0)
8043         {
8044         errorcode = (fixed_length == -2)? ERR36 :
8045                     (fixed_length == -4)? ERR70 : ERR25;
8046         break;
8047         }
8048       if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
8049       PUT(cc, 1, fixed_length);
8050       }
8051     cc += 1 + LINK_SIZE;
8052     }
8053   }
8054
8055 /* Failed to compile, or error while post-processing */
8056
8057 if (errorcode != 0)
8058   {
8059   (PUBL(free))(re);
8060   PCRE_EARLY_ERROR_RETURN:
8061   *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
8062   PCRE_EARLY_ERROR_RETURN2:
8063   *errorptr = find_error_text(errorcode);
8064   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
8065   return NULL;
8066   }
8067
8068 /* If the anchored option was not passed, set the flag if we can determine that
8069 the pattern is anchored by virtue of ^ characters or \A or anything else (such
8070 as starting with .* when DOTALL is set).
8071
8072 Otherwise, if we know what the first byte has to be, save it, because that
8073 speeds up unanchored matches no end. If not, see if we can set the
8074 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
8075 start with ^. and also when all branches start with .* for non-DOTALL matches.
8076 */
8077
8078 if ((re->options & PCRE_ANCHORED) == 0)
8079   {
8080   if (is_anchored(codestart, 0, cd->backref_map))
8081     re->options |= PCRE_ANCHORED;
8082   else
8083     {
8084     if (firstchar < 0)
8085       firstchar = find_firstassertedchar(codestart, FALSE);
8086     if (firstchar >= 0)   /* Remove caseless flag for non-caseable chars */
8087       {
8088 #ifdef COMPILE_PCRE8
8089       re->first_char = firstchar & 0xff;
8090 #else
8091 #ifdef COMPILE_PCRE16
8092       re->first_char = firstchar & 0xffff;
8093 #endif
8094 #endif
8095       if ((firstchar & REQ_CASELESS) != 0)
8096         {
8097 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
8098         /* We ignore non-ASCII first chars in 8 bit mode. */
8099         if (utf)
8100           {
8101           if (re->first_char < 128)
8102             {
8103             if (cd->fcc[re->first_char] != re->first_char)
8104               re->flags |= PCRE_FCH_CASELESS;
8105             }
8106           else if (UCD_OTHERCASE(re->first_char) != re->first_char)
8107             re->flags |= PCRE_FCH_CASELESS;
8108           }
8109         else
8110 #endif
8111         if (MAX_255(re->first_char)
8112             && cd->fcc[re->first_char] != re->first_char)
8113           re->flags |= PCRE_FCH_CASELESS;
8114         }
8115
8116       re->flags |= PCRE_FIRSTSET;
8117       }
8118     else if (is_startline(codestart, 0, cd->backref_map))
8119       re->flags |= PCRE_STARTLINE;
8120     }
8121   }
8122
8123 /* For an anchored pattern, we use the "required byte" only if it follows a
8124 variable length item in the regex. Remove the caseless flag for non-caseable
8125 bytes. */
8126
8127 if (reqchar >= 0 &&
8128      ((re->options & PCRE_ANCHORED) == 0 || (reqchar & REQ_VARY) != 0))
8129   {
8130 #ifdef COMPILE_PCRE8
8131   re->req_char = reqchar & 0xff;
8132 #else
8133 #ifdef COMPILE_PCRE16
8134   re->req_char = reqchar & 0xffff;
8135 #endif
8136 #endif
8137   if ((reqchar & REQ_CASELESS) != 0)
8138     {
8139 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
8140     /* We ignore non-ASCII first chars in 8 bit mode. */
8141     if (utf)
8142       {
8143       if (re->req_char < 128)
8144         {
8145         if (cd->fcc[re->req_char] != re->req_char)
8146           re->flags |= PCRE_RCH_CASELESS;
8147         }
8148       else if (UCD_OTHERCASE(re->req_char) != re->req_char)
8149         re->flags |= PCRE_RCH_CASELESS;
8150       }
8151     else
8152 #endif
8153     if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
8154       re->flags |= PCRE_RCH_CASELESS;
8155     }
8156
8157   re->flags |= PCRE_REQCHSET;
8158   }
8159
8160 /* Print out the compiled data if debugging is enabled. This is never the
8161 case when building a production library. */
8162
8163 #ifdef PCRE_DEBUG
8164 printf("Length = %d top_bracket = %d top_backref = %d\n",
8165   length, re->top_bracket, re->top_backref);
8166
8167 printf("Options=%08x\n", re->options);
8168
8169 if ((re->flags & PCRE_FIRSTSET) != 0)
8170   {
8171   pcre_uchar ch = re->first_char;
8172   const char *caseless =
8173     ((re->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)";
8174   if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless);
8175     else printf("First char = \\x%02x%s\n", ch, caseless);
8176   }
8177
8178 if ((re->flags & PCRE_REQCHSET) != 0)
8179   {
8180   pcre_uchar ch = re->req_char;
8181   const char *caseless =
8182     ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)";
8183   if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless);
8184     else printf("Req char = \\x%02x%s\n", ch, caseless);
8185   }
8186
8187 #ifdef COMPILE_PCRE8
8188 pcre_printint((pcre *)re, stdout, TRUE);
8189 #else
8190 pcre16_printint((pcre *)re, stdout, TRUE);
8191 #endif
8192
8193 /* This check is done here in the debugging case so that the code that
8194 was compiled can be seen. */
8195
8196 if (code - codestart > length)
8197   {
8198   (PUBL(free))(re);
8199   *errorptr = find_error_text(ERR23);
8200   *erroroffset = ptr - (pcre_uchar *)pattern;
8201   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
8202   return NULL;
8203   }
8204 #endif   /* PCRE_DEBUG */
8205
8206 #ifdef COMPILE_PCRE8
8207 return (pcre *)re;
8208 #else
8209 return (pcre16 *)re;
8210 #endif
8211 }
8212
8213 /* End of pcre_compile.c */