pcre/pcre_compile.cpp

   1 #include "avmplus.h"
   2
   3 /*************************************************
   4 *      Perl-Compatible Regular Expressions       *
   5 *************************************************/
   6
   7 /* PCRE is a library of functions to support regular expressions whose syntax
   8 and semantics are as close as possible to those of the Perl 5 language.
   9
  10                        Written by Philip Hazel
  11            Copyright (c) 1997-2007 University of Cambridge
  12
  13 -----------------------------------------------------------------------------
  14 Redistribution and use in source and binary forms, with or without
  15 modification, are permitted provided that the following conditions are met:
  16
  17     * Redistributions of source code must retain the above copyright notice,
  18       this list of conditions and the following disclaimer.
  19
  20     * Redistributions in binary form must reproduce the above copyright
  21       notice, this list of conditions and the following disclaimer in the
  22       documentation and/or other materials provided with the distribution.
  23
  24     * Neither the name of the University of Cambridge nor the names of its
  25       contributors may be used to endorse or promote products derived from
  26       this software without specific prior written permission.
  27
  28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 POSSIBILITY OF SUCH DAMAGE.
  39 -----------------------------------------------------------------------------
  40 */
  41
  42
  43 /* This module contains the external function pcre_compile(), along with
  44 supporting internal functions that are not used by other modules. */
  45
  46
  47 #include "config.h"
  48
  49 #define NLBLOCK cd             /* Block containing newline information */
  50 #define PSSTART start_pattern  /* Field containing processed string start */
  51 #define PSEND   end_pattern    /* Field containing processed string end */
  52
  53 #include "pcre_internal.h"
  54
  55
  56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
  57 used by pcretest. DEBUG is not defined when building a production library. */
  58
  59 #ifdef PCRE_DEBUG
  60 #include "pcre_printint.src"
  61 #endif
  62
  63
  64 /* Macro for setting individual bits in class bitmaps. */
  65
  66 #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
  67
  68 /* Maximum length value to check against when making sure that the integer that
  69 holds the compiled pattern length does not overflow. We make it a bit less than
  70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
  71 to check them every time. */
  72
  73 #define OFLOW_MAX (INT_MAX - 20)
  74
  75
  76 /*************************************************
  77 *      Code parameters and static tables         *
  78 *************************************************/
  79
  80 /* This value specifies the size of stack workspace that is used during the
  81 first pre-compile phase that determines how much memory is required. The regex
  82 is partly compiled into this space, but the compiled parts are discarded as
  83 soon as they can be, so that hopefully there will never be an overrun. The code
  84 does, however, check for an overrun. The largest amount I've seen used is 218,
  85 so this number is very generous.
  86
  87 The same workspace is used during the second, actual compile phase for
  88 remembering forward references to groups so that they can be filled in at the
  89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
  90 is 4 there is plenty of room. */
  91
  92 //#define COMPILE_WORK_SIZE (4096)
  93 #define COMPILE_WORK_SIZE (2048) //tierney:  smaller size to avoid stack overflow when compiling patterns with many nested ()'s
  94
  95
  96 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
  97 are simple data values; negative values are for special things like \d and so
  98 on. Zero means further processing is needed (for things like \x), or the escape
  99 is invalid. */
 100
 101 #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
 102 static const short int escapes[] = {
 103      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
 104      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
 105    '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
 106 -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
 107 -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
 108 -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
 109    '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
 110 -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
 111 -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
 112      0,      0, -ESC_z                                            /* x - z */
 113 };
 114
 115 #else           /* This is the "abnormal" table for EBCDIC systems */
 116 static const short int escapes[] = {
 117 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
 118 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
 119 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
 120 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
 121 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
 122 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
 123 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
 124 /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
 125 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
 126 /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
 127 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
 128 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
 129 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
 130 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
 131 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
 132 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
 133 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
 134 /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
 135 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
 136 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
 137 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
 138 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
 139 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
 140 };
 141 #endif
 142
 143
 144 /* Table of special "verbs" like (*PRUNE) */
 145
 146 typedef struct verbitem {
 147   const char *name;
 148   int   len;
 149   int   op;
 150 } verbitem;
 151
 152 static verbitem verbs[] = {
 153   { "ACCEPT", 6, OP_ACCEPT },
 154   { "COMMIT", 6, OP_COMMIT },
 155   { "F",      1, OP_FAIL },
 156   { "FAIL",   4, OP_FAIL },
 157   { "PRUNE",  5, OP_PRUNE },
 158   { "SKIP",   4, OP_SKIP  },
 159   { "THEN",   4, OP_THEN  }
 160 };
 161
 162 static int verbcount = sizeof(verbs)/sizeof(verbitem);
 163
 164
 165 /* Tables of names of POSIX character classes and their lengths. The list is
 166 terminated by a zero length entry. The first three must be alpha, lower, upper,
 167 as this is assumed for handling case independence. */
 168
 169 static const char *const posix_names[] = {
 170   "alpha", "lower", "upper",
 171   "alnum", "ascii", "blank", "cntrl", "digit", "graph",
 172   "print", "punct", "space", "word",  "xdigit" };
 173
 174 static const uschar posix_name_lengths[] = {
 175   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
 176
 177 /* Table of class bit maps for each POSIX class. Each class is formed from a
 178 base map, with an optional addition or removal of another map. Then, for some
 179 classes, there is some additional tweaking: for [:blank:] the vertical space
 180 characters are removed, and for [:alpha:] and [:alnum:] the underscore
 181 character is removed. The triples in the table consist of the base map offset,
 182 second map offset or -1 if no second map, and a non-negative value for map
 183 addition or a negative value for map subtraction (if there are two maps). The
 184 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
 185 remove vertical space characters, 2 => remove underscore. */
 186
 187 static const int posix_class_maps[] = {
 188   cbit_word,  cbit_digit, -2,             /* alpha */
 189   cbit_lower, -1,          0,             /* lower */
 190   cbit_upper, -1,          0,             /* upper */
 191   cbit_word,  -1,          2,             /* alnum - word without underscore */
 192   cbit_print, cbit_cntrl,  0,             /* ascii */
 193   cbit_space, -1,          1,             /* blank - a GNU extension */
 194   cbit_cntrl, -1,          0,             /* cntrl */
 195   cbit_digit, -1,          0,             /* digit */
 196   cbit_graph, -1,          0,             /* graph */
 197   cbit_print, -1,          0,             /* print */
 198   cbit_punct, -1,          0,             /* punct */
 199   cbit_space, -1,          0,             /* space */
 200   cbit_word,  -1,          0,             /* word - a Perl extension */
 201   cbit_xdigit,-1,          0              /* xdigit */
 202 };
 203
 204
 205 #define STRING(a)  # a
 206 #define XSTRING(s) STRING(s)
 207
 208 /* The texts of compile-time error messages. These are "char *" because they
 209 are passed to the outside world. Do not ever re-use any error number, because
 210 they are documented. Always add a new error instead. Messages marked DEAD below
 211 are no longer used. */
 212
 213 static const char *error_texts[] = {
 214 #ifdef AVMPLUS_PCRE
 215   "",
 216   "",
 217   "",
 218   "",
 219   "",
 220   /* 5 */
 221   "",
 222   "",
 223   "",
 224   "",
 225   "",
 226   /* 10 */
 227   "",
 228   "",
 229   "",
 230   "",
 231   "",
 232   /* 15 */
 233   "",
 234   "",
 235   "",
 236   "",
 237   "",
 238   /* 20 */
 239   "",
 240   "",
 241   "",
 242   "",
 243   "",
 244   /* 25 */
 245   "",
 246   "",
 247   "",
 248   "",
 249   "",
 250   /* 30 */
 251   "",
 252   "",
 253   "",
 254   "",
 255   "",
 256   /* 35 */
 257   "",
 258   "",
 259   "",
 260   "",
 261   "",
 262   /* 40 */
 263   "",
 264   "",
 265   "",
 266   "",
 267   "",
 268   /* 45 */
 269   "",
 270   "",
 271   "",
 272   "",
 273   "",
 274   /* 50 */
 275   "",    /** DEAD **/
 276   "",
 277   "",
 278   "",
 279   "",
 280   /* 55 */
 281   "",
 282   "",
 283   "",
 284   "",
 285   "",
 286   /* 60 */
 287   "",
 288   ""
 289 #else // AVMPLUS_PCRE
 290   "no error",
 291   "\\ at end of pattern",
 292   "\\c at end of pattern",
 293   "unrecognized character follows \\",
 294   "numbers out of order in {} quantifier",
 295   /* 5 */
 296   "number too big in {} quantifier",
 297   "missing terminating ] for character class",
 298   "invalid escape sequence in character class",
 299   "range out of order in character class",
 300   "nothing to repeat",
 301   /* 10 */
 302   "operand of unlimited repeat could match the empty string",  /** DEAD **/
 303   "internal error: unexpected repeat",
 304   "unrecognized character after (?",
 305   "POSIX named classes are supported only within a class",
 306   "missing )",
 307   /* 15 */
 308   "reference to non-existent subpattern",
 309   "erroffset passed as NULL",
 310   "unknown option bit(s) set",
 311   "missing ) after comment",
 312   "parentheses nested too deeply",  /** DEAD **/
 313   /* 20 */
 314   "regular expression is too large",
 315   "failed to get memory",
 316   "unmatched parentheses",
 317   "internal error: code overflow",
 318   "unrecognized character after (?<",
 319   /* 25 */
 320   "lookbehind assertion is not fixed length",
 321   "malformed number or name after (?(",
 322   "conditional group contains more than two branches",
 323   "assertion expected after (?(",
 324   "(?R or (?[+-]digits must be followed by )",
 325   /* 30 */
 326   "unknown POSIX class name",
 327   "POSIX collating elements are not supported",
 328   "this version of PCRE is not compiled with PCRE_UTF8 support",
 329   "spare error",  /** DEAD **/
 330   "character value in \\x{...} sequence is too large",
 331   /* 35 */
 332   "invalid condition (?(0)",
 333   "\\C not allowed in lookbehind assertion",
 334   "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
 335   "number after (?C is > 255",
 336   "closing ) for (?C expected",
 337   /* 40 */
 338   "recursive call could loop indefinitely",
 339   "unrecognized character after (?P",
 340   "syntax error in subpattern name (missing terminator)",
 341   "two named subpatterns have the same name",
 342   "invalid UTF-8 string",
 343   /* 45 */
 344   "support for \\P, \\p, and \\X has not been compiled",
 345   "malformed \\P or \\p sequence",
 346   "unknown property name after \\P or \\p",
 347   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
 348   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
 349   /* 50 */
 350   "repeated subpattern is too long",    /** DEAD **/
 351   "octal value is greater than \\377 (not in UTF-8 mode)",
 352   "internal error: overran compiling workspace",
 353   "internal error: previously-checked referenced subpattern not found",
 354   "DEFINE group contains more than one branch",
 355   /* 55 */
 356   "repeating a DEFINE group is not allowed",
 357   "inconsistent NEWLINE options",
 358   "\\g is not followed by a braced name or an optionally braced non-zero number",
 359   "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
 360   "(*VERB) with an argument is not supported",
 361   /* 60 */
 362   "(*VERB) not recognized",
 363   "number is too big"
 364 #endif // AVMPLUS_PCRE
 365 };
 366
 367
 368 /* Table to identify digits and hex digits. This is used when compiling
 369 patterns. Note that the tables in chartables are dependent on the locale, and
 370 may mark arbitrary characters as digits - but the PCRE compiling code expects
 371 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
 372 a private table here. It costs 256 bytes, but it is a lot faster than doing
 373 character value tests (at least in some simple cases I timed), and in some
 374 applications one wants PCRE to compile efficiently as well as match
 375 efficiently.
 376
 377 For convenience, we use the same bit definitions as in chartables:
 378
 379   0x04   decimal digit
 380   0x08   hexadecimal digit
 381
 382 Then we can use ctype_digit and ctype_xdigit in the code. */
 383
 384 #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
 385 static const unsigned char digitab[] =
 386   {
 387   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
 388   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
 389   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
 390   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
 391   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
 392   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
 393   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
 394   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
 395   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
 396   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
 397   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
 398   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
 399   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
 400   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
 401   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
 402   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
 403   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
 404   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
 405   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
 406   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
 407   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
 408   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
 409   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
 410   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
 411   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
 412   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
 413   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
 414   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
 415   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
 416   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
 417   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
 418   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
 419
 420 #else           /* This is the "abnormal" case, for EBCDIC systems */
 421 static const unsigned char digitab[] =
 422   {
 423   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
 424   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
 425   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
 426   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
 427   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
 428   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
 429   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
 430   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
 431   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
 432   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
 433   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
 434   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
 435   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
 436   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
 437   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
 438   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
 439   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
 440   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
 441   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
 442   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
 443   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
 444   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
 445   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
 446   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
 447   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
 448   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
 449   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
 450   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
 451   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
 452   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
 453   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
 454   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
 455
 456 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
 457   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
 458   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
 459   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
 460   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
 461   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
 462   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
 463   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
 464   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
 465   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
 466   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
 467   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
 468   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
 469   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
 470   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
 471   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
 472   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
 473   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
 474   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
 475   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
 476   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
 477   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
 478   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
 479   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
 480   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
 481   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
 482   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
 483   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
 484   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
 485   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
 486   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
 487   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
 488   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
 489 #endif
 490
 491
 492 /* Definition to allow mutual recursion */
 493
 494 static BOOL
 495   compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
 496     int *, int *, branch_chain *, compile_data *, int *);
 497
 498
 499
 500 /*************************************************
 501 *            Handle escapes                      *
 502 *************************************************/
 503
 504 /* This function is called when a \ has been encountered. It either returns a
 505 positive value for a simple escape such as \n, or a negative value which
 506 encodes one of the more complicated things such as \d. A backreference to group
 507 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
 508 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
 509 ptr is pointing at the \. On exit, it is on the final character of the escape
 510 sequence.
 511
 512 Arguments:
 513   ptrptr         points to the pattern position pointer
 514   errorcodeptr   points to the errorcode variable
 515   bracount       number of previous extracting brackets
 516   options        the options bits
 517   isclass        TRUE if inside a character class
 518
 519 Returns:         zero or positive => a data character
 520                  negative => a special escape sequence
 521                  on error, errorcodeptr is set
 522 */
 523
 524 static int
 525 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
 526   int options, BOOL isclass)
 527 {
 528 BOOL utf8 = (options & PCRE_UTF8) != 0;
 529 const uschar *ptr = *ptrptr + 1;
 530 int c, i;
 531
 532 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
 533 ptr--;                            /* Set pointer back to the last byte */
 534
 535 /* If backslash is at the end of the pattern, it's an error. */
 536
 537 if (c == 0) *errorcodeptr = ERR1;
 538
 539 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
 540 a table. A non-zero result is something that can be returned immediately.
 541 Otherwise further processing may be required. */
 542
 543 #ifndef EBCDIC  /* ASCII coding */
 544 else if (c < '0' || c > 'z') {}                           /* Not alphameric */
 545 else if ((i = escapes[c - '0']) != 0) c = i;
 546
 547 #else           /* EBCDIC coding */
 548 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
 549 else if ((i = escapes[c - 0x48]) != 0)  c = i;
 550 #endif
 551
 552 /* Escapes that need further processing, or are illegal. */
 553
 554 else
 555   {
 556   const uschar *oldptr;
 557   BOOL braced, negated;
 558
 559   switch (c)
 560     {
 561     /* A number of Perl escapes are not handled by PCRE. We give an explicit
 562     error. */
 563
 564     case 'l':
 565     case 'L':
 566     case 'N':
 567     case 'u':
 568     case 'U':
 569     *errorcodeptr = ERR37;
 570     break;
 571
 572     /* \g must be followed by a number, either plain or braced. If positive, it
 573     is an absolute backreference. If negative, it is a relative backreference.
 574     This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
 575     reference to a named group. This is part of Perl's movement towards a
 576     unified syntax for back references. As this is synonymous with \k{name}, we
 577     fudge it up by pretending it really was \k. */
 578
 579     case 'g':
 580     if (ptr[1] == '{')
 581       {
 582       const uschar *p;
 583       for (p = ptr+2; *p != 0 && *p != '}'; p++)
 584         if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
 585       if (*p != 0 && *p != '}')
 586         {
 587         c = -ESC_k;
 588         break;
 589         }
 590       braced = TRUE;
 591       ptr++;
 592       }
 593     else braced = FALSE;
 594
 595     if (ptr[1] == '-')
 596       {
 597       negated = TRUE;
 598       ptr++;
 599       }
 600     else negated = FALSE;
 601
 602     c = 0;
 603     while ((digitab[ptr[1]] & ctype_digit) != 0)
 604       c = c * 10 + *(++ptr) - '0';
 605
 606     if (c < 0)
 607       {
 608       *errorcodeptr = ERR61;
 609       break;
 610       }
 611
 612     if (c == 0 || (braced && *(++ptr) != '}'))
 613       {
 614       *errorcodeptr = ERR57;
 615       break;
 616       }
 617
 618     if (negated)
 619       {
 620       if (c > bracount)
 621         {
 622         *errorcodeptr = ERR15;
 623         break;
 624         }
 625       c = bracount - (c - 1);
 626       }
 627
 628     c = -(ESC_REF + c);
 629     break;
 630
 631     /* The handling of escape sequences consisting of a string of digits
 632     starting with one that is not zero is not straightforward. By experiment,
 633     the way Perl works seems to be as follows:
 634
 635     Outside a character class, the digits are read as a decimal number. If the
 636     number is less than 10, or if there are that many previous extracting
 637     left brackets, then it is a back reference. Otherwise, up to three octal
 638     digits are read to form an escaped byte. Thus \123 is likely to be octal
 639     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
 640     value is greater than 377, the least significant 8 bits are taken. Inside a
 641     character class, \ followed by a digit is always an octal number. */
 642
 643     case '1': case '2': case '3': case '4': case '5':
 644     case '6': case '7': case '8': case '9':
 645
 646     if (!isclass)
 647       {
 648       oldptr = ptr;
 649       c -= '0';
 650       while ((digitab[ptr[1]] & ctype_digit) != 0)
 651         c = c * 10 + *(++ptr) - '0';
 652       if (c < 0)
 653         {
 654         *errorcodeptr = ERR61;
 655         break;
 656         }
 657       if (c < 10 || c <= bracount)
 658         {
 659         c = -(ESC_REF + c);
 660         break;
 661         }
 662       ptr = oldptr;      /* Put the pointer back and fall through */
 663       }
 664
 665     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
 666     generates a binary zero byte and treats the digit as a following literal.
 667     Thus we have to pull back the pointer by one. */
 668
 669     if ((c = *ptr) >= '8')
 670       {
 671       ptr--;
 672       c = 0;
 673       break;
 674       }
 675
 676     /* \0 always starts an octal number, but we may drop through to here with a
 677     larger first octal digit. The original code used just to take the least
 678     significant 8 bits of octal numbers (I think this is what early Perls used
 679     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
 680     than 3 octal digits. */
 681
 682     case '0':
 683     c -= '0';
 684     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
 685         c = c * 8 + *(++ptr) - '0';
 686     if (!utf8 && c > 255) *errorcodeptr = ERR51;
 687     break;
 688
 689     /* \x is complicated. \x{ddd} is a character number which can be greater
 690     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
 691     treated as a data character. */
 692
 693     case 'x':
 694     if (ptr[1] == '{')
 695       {
 696       const uschar *pt = ptr + 2;
 697       int count = 0;
 698
 699       c = 0;
 700       while ((digitab[*pt] & ctype_xdigit) != 0)
 701         {
 702         register int cc = *pt++;
 703         if (c == 0 && cc == '0') continue;     /* Leading zeroes */
 704         count++;
 705
 706 #ifndef EBCDIC  /* ASCII coding */
 707         if (cc >= 'a') cc -= 32;               /* Convert to upper case */
 708         c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
 709 #else           /* EBCDIC coding */
 710         if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
 711         c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
 712 #endif
 713         }
 714
 715       if (*pt == '}')
 716         {
 717         if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
 718         ptr = pt;
 719         break;
 720         }
 721
 722       /* If the sequence of hex digits does not end with '}', then we don't
 723       recognize this construct; fall through to the normal \x handling. */
 724       }
 725
 726     /* Read just a single-byte hex-defined char */
 727
 728     c = 0;
 729     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
 730       {
 731       int cc;                               /* Some compilers don't like ++ */
 732       cc = *(++ptr);                        /* in initializers */
 733 #ifndef EBCDIC  /* ASCII coding */
 734       if (cc >= 'a') cc -= 32;              /* Convert to upper case */
 735       c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
 736 #else           /* EBCDIC coding */
 737       if (cc <= 'z') cc += 64;              /* Convert to upper case */
 738       c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
 739 #endif
 740       }
 741     break;
 742
 743     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
 744     This coding is ASCII-specific, but then the whole concept of \cx is
 745     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
 746
 747     case 'c':
 748     c = *(++ptr);
 749     if (c == 0)
 750       {
 751       *errorcodeptr = ERR2;
 752       break;
 753       }
 754
 755 #ifndef EBCDIC  /* ASCII coding */
 756     if (c >= 'a' && c <= 'z') c -= 32;
 757     c ^= 0x40;
 758 #else           /* EBCDIC coding */
 759     if (c >= 'a' && c <= 'z') c += 64;
 760     c ^= 0xC0;
 761 #endif
 762     break;
 763
 764     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
 765     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
 766     for Perl compatibility, it is a literal. This code looks a bit odd, but
 767     there used to be some cases other than the default, and there may be again
 768     in future, so I haven't "optimized" it. */
 769
 770     default:
 771 #ifdef AVMPLUS_PCRE
 772           if ((options & PCRE_EXTRA) != 0)
 773           {
 774                 *errorcodeptr = ERR3;
 775           }
 776 #else
 777       if ((options & PCRE_EXTRA) != 0) switch(c)
 778       {
 779       default:
 780       *errorcodeptr = ERR3;
 781       break;
 782       }
 783 #endif
 784     break;
 785     }
 786   }
 787
 788 *ptrptr = ptr;
 789 return c;
 790 }
 791
 792
 793
 794 #ifdef SUPPORT_UCP
 795 /*************************************************
 796 *               Handle \P and \p                 *
 797 *************************************************/
 798
 799 /* This function is called after \P or \p has been encountered, provided that
 800 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
 801 pointing at the P or p. On exit, it is pointing at the final character of the
 802 escape sequence.
 803
 804 Argument:
 805   ptrptr         points to the pattern position pointer
 806   negptr         points to a boolean that is set TRUE for negation else FALSE
 807   dptr           points to an int that is set to the detailed property value
 808   errorcodeptr   points to the error code variable
 809
 810 Returns:         type value from ucp_type_table, or -1 for an invalid type
 811 */
 812
 813 static int
 814 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
 815 {
 816 int c, i, bot, top;
 817 const uschar *ptr = *ptrptr;
 818 char name[32];
 819
 820 c = *(++ptr);
 821 if (c == 0) goto ERROR_RETURN;
 822
 823 *negptr = FALSE;
 824
 825 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
 826 negation. */
 827
 828 if (c == '{')
 829   {
 830   if (ptr[1] == '^')
 831     {
 832     *negptr = TRUE;
 833     ptr++;
 834     }
 835   for (i = 0; i < (int)sizeof(name) - 1; i++)
 836     {
 837     c = *(++ptr);
 838     if (c == 0) goto ERROR_RETURN;
 839     if (c == '}') break;
 840     name[i] = c;
 841     }
 842   if (c !='}') goto ERROR_RETURN;
 843   name[i] = 0;
 844   }
 845
 846 /* Otherwise there is just one following character */
 847
 848 else
 849   {
 850   name[0] = c;
 851   name[1] = 0;
 852   }
 853
 854 *ptrptr = ptr;
 855
 856 /* Search for a recognized property name using binary chop */
 857
 858 bot = 0;
 859 top = _pcre_utt_size;
 860
 861 while (bot < top)
 862   {
 863   i = (bot + top) >> 1;
 864   c = VMPI_strcmp(name, _pcre_utt[i].name);
 865   if (c == 0)
 866     {
 867     *dptr = _pcre_utt[i].value;
 868     return _pcre_utt[i].type;
 869     }
 870   if (c > 0) bot = i + 1; else top = i;
 871   }
 872
 873 *errorcodeptr = ERR47;
 874 *ptrptr = ptr;
 875 return -1;
 876
 877 ERROR_RETURN:
 878 *errorcodeptr = ERR46;
 879 *ptrptr = ptr;
 880 return -1;
 881 }
 882 #endif
 883
 884
 885
 886
 887 /*************************************************
 888 *            Check for counted repeat            *
 889 *************************************************/
 890
 891 /* This function is called when a '{' is encountered in a place where it might
 892 start a quantifier. It looks ahead to see if it really is a quantifier or not.
 893 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
 894 where the ddds are digits.
 895
 896 Arguments:
 897   p         pointer to the first char after '{'
 898
 899 Returns:    TRUE or FALSE
 900 */
 901
 902 static BOOL
 903 is_counted_repeat(const uschar *p)
 904 {
 905 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
 906 while ((digitab[*p] & ctype_digit) != 0) p++;
 907 if (*p == '}') return TRUE;
 908
 909 if (*p++ != ',') return FALSE;
 910 if (*p == '}') return TRUE;
 911
 912 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
 913 while ((digitab[*p] & ctype_digit) != 0) p++;
 914
 915 return (*p == '}');
 916 }
 917
 918
 919
 920 /*************************************************
 921 *         Read repeat counts                     *
 922 *************************************************/
 923
 924 /* Read an item of the form {n,m} and return the values. This is called only
 925 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
 926 so the syntax is guaranteed to be correct, but we need to check the values.
 927
 928 Arguments:
 929   p              pointer to first char after '{'
 930   minp           pointer to int for min
 931   maxp           pointer to int for max
 932                  returned as -1 if no max
 933   errorcodeptr   points to error code variable
 934
 935 Returns:         pointer to '}' on success;
 936                  current ptr on error, with errorcodeptr set non-zero
 937 */
 938
 939 static const uschar *
 940 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
 941 {
 942 int min = 0;
 943 int max = -1;
 944
 945 /* Read the minimum value and do a paranoid check: a negative value indicates
 946 an integer overflow. */
 947
 948 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
 949 if (min < 0 || min > 65535)
 950   {
 951   *errorcodeptr = ERR5;
 952   return p;
 953   }
 954
 955 /* Read the maximum value if there is one, and again do a paranoid on its size.
 956 Also, max must not be less than min. */
 957
 958 if (*p == '}') max = min; else
 959   {
 960   if (*(++p) != '}')
 961     {
 962     max = 0;
 963     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
 964     if (max < 0 || max > 65535)
 965       {
 966       *errorcodeptr = ERR5;
 967       return p;
 968       }
 969     if (max < min)
 970       {
 971       *errorcodeptr = ERR4;
 972       return p;
 973       }
 974     }
 975   }
 976
 977 /* Fill in the required variables, and pass back the pointer to the terminating
 978 '}'. */
 979
 980 *minp = min;
 981 *maxp = max;
 982 return p;
 983 }
 984
 985
 986
 987 /*************************************************
 988 *       Find forward referenced subpattern       *
 989 *************************************************/
 990
 991 /* This function scans along a pattern's text looking for capturing
 992 subpatterns, and counting them. If it finds a named pattern that matches the
 993 name it is given, it returns its number. Alternatively, if the name is NULL, it
 994 returns when it reaches a given numbered subpattern. This is used for forward
 995 references to subpatterns. We know that if (?P< is encountered, the name will
 996 be terminated by '>' because that is checked in the first pass.
 997
 998 Arguments:
 999   ptr          current position in the pattern
1000   count        current count of capturing parens so far encountered
1001   name         name to seek, or NULL if seeking a numbered subpattern
1002   lorn         name length, or subpattern number if name is NULL
1003   xmode        TRUE if we are in /x mode
1004
1005 Returns:       the number of the named subpattern, or -1 if not found
1006 */
1007
1008 static int
1009 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
1010   BOOL xmode)
1011 {
1012 const uschar *thisname;
1013
1014 for (; *ptr != 0; ptr++)
1015   {
1016   int term;
1017
1018   /* Skip over backslashed characters and also entire \Q...\E */
1019
1020   if (*ptr == '\\')
1021     {
1022     if (*(++ptr) == 0) return -1;
1023     if (*ptr == 'Q') for (;;)
1024       {
1025       while (*(++ptr) != 0 && *ptr != '\\') {}
1026       if (*ptr == 0) return -1;
1027       if (*(++ptr) == 'E') break;
1028       }
1029     continue;
1030     }
1031
1032   /* Skip over character classes */
1033
1034   if (*ptr == '[')
1035     {
1036     while (*(++ptr) != ']')
1037       {
1038       if (*ptr == 0) return -1;
1039       if (*ptr == '\\')
1040         {
1041         if (*(++ptr) == 0) return -1;
1042         if (*ptr == 'Q') for (;;)
1043           {
1044           while (*(++ptr) != 0 && *ptr != '\\'){}
1045           if (*ptr == 0) return -1;
1046           if (*(++ptr) == 'E') break;
1047           }
1048         continue;
1049         }
1050       }
1051     continue;
1052     }
1053
1054   /* Skip comments in /x mode */
1055
1056   if (xmode && *ptr == '#')
1057     {
1058     while (*(++ptr) != 0 && *ptr != '\n'){}
1059     if (*ptr == 0) return -1;
1060     continue;
1061     }
1062
1063   /* An opening parens must now be a real metacharacter */
1064
1065   if (*ptr != '(') continue;
1066   if (ptr[1] != '?' && ptr[1] != '*')
1067     {
1068     count++;
1069     if (name == NULL && count == lorn) return count;
1070     continue;
1071     }
1072
1073   ptr += 2;
1074   if (*ptr == 'P') ptr++;                      /* Allow optional P */
1075
1076   /* We have to disambiguate (?<! and (?<= from (?<name> */
1077
1078   if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1079        *ptr != '\'')
1080     continue;
1081
1082   count++;
1083
1084   if (name == NULL && count == lorn) return count;
1085   term = *ptr++;
1086   if (term == '<') term = '>';
1087   thisname = ptr;
1088   while (*ptr != term) ptr++;
1089   if (name != NULL && lorn == ptr - thisname &&
1090       VMPI_strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1091     return count;
1092   }
1093
1094 return -1;
1095 }
1096
1097
1098
1099 /*************************************************
1100 *      Find first significant op code            *
1101 *************************************************/
1102
1103 /* This is called by several functions that scan a compiled expression looking
1104 for a fixed first character, or an anchoring op code etc. It skips over things
1105 that do not influence this. For some calls, a change of option is important.
1106 For some calls, it makes sense to skip negative forward and all backward
1107 assertions, and also the \b assertion; for others it does not.
1108
1109 Arguments:
1110   code         pointer to the start of the group
1111   options      pointer to external options
1112   optbit       the option bit whose changing is significant, or
1113                  zero if none are
1114   skipassert   TRUE if certain assertions are to be skipped
1115
1116 Returns:       pointer to the first significant opcode
1117 */
1118
1119 static const uschar*
1120 first_significant_code(const uschar *code, int *options, int optbit,
1121   BOOL skipassert)
1122 {
1123 for (;;)
1124   {
1125   switch ((int)*code)
1126     {
1127     case OP_OPT:
1128     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1129       *options = (int)code[1];
1130     code += 2;
1131     break;
1132
1133     case OP_ASSERT_NOT:
1134     case OP_ASSERTBACK:
1135     case OP_ASSERTBACK_NOT:
1136     if (!skipassert) return code;
1137     do code += GET(code, 1); while (*code == OP_ALT);
1138     code += _pcre_OP_lengths[*code];
1139     break;
1140
1141     case OP_WORD_BOUNDARY:
1142     case OP_NOT_WORD_BOUNDARY:
1143     if (!skipassert) return code;
1144     /* Fall through */
1145
1146     case OP_CALLOUT:
1147     case OP_CREF:
1148     case OP_RREF:
1149     case OP_DEF:
1150     code += _pcre_OP_lengths[*code];
1151     break;
1152
1153     default:
1154     return code;
1155     }
1156   }
1157 /* Control never reaches here */
1158 }
1159
1160
1161 /*****************************************************************************
1162  *        Generic stack used by find_fixedlength, is_anchored, is_startline  *
1163  *        to avoid unbounded recursion                                       *
1164  *****************************************************************************/
1165
1166 #define TYPED_SEGMENT_SIZE 20
1167
1168 template <class T>
1169 class TypedStack {
1170 public:
1171         TypedStack();
1172         ~TypedStack();
1173
1174         bool isEmpty();                                         // true iff the stack is empty
1175         void push(T& value);                            // push value onto the stack (by copy)
1176         void pop(T& value);                                     // pop value off the stack (by copy)
1177         T& stackTop();                                          // reference to top element
1178
1179 private:
1180         struct TypedSegment {
1181                 T memory[TYPED_SEGMENT_SIZE];   // stack memory
1182                 T *top;                                                 // for non-top segments, the saved 'top' pointer
1183                 TypedSegment *next;                             // next segment on the stack
1184         };
1185
1186         void pushSegment();
1187         void popSegment();
1188         void invariants();
1189
1190         TypedSegment* topSegment;                       // segment at the top of stack
1191         T *top;                                                         // next free address in that segment
1192         T *limit;                                                       // limit address in that segment
1193 #ifdef DEBUG
1194         uint32_t depth;                                         // current stack depth
1195         uint32_t maxdepth;                                      // peak stack depth
1196 #endif
1197 };
1198
1199 template <class T>
1200 void TypedStack<T>::invariants()
1201 {
1202         AvmAssert(topSegment == NULL || top > topSegment->memory);
1203         AvmAssert(topSegment == NULL || limit == topSegment->memory + TYPED_SEGMENT_SIZE);
1204         AvmAssert(top <= limit);
1205 }
1206
1207 template <class T>
1208 TypedStack<T>::TypedStack()
1209         : topSegment(NULL)
1210         , top(NULL)
1211         , limit(NULL)
1212 #ifdef DEBUG
1213         , depth(0)
1214         , maxdepth(0)
1215 #endif
1216 {
1217         invariants();
1218 }
1219
1220 template <class T>
1221 TypedStack<T>::~TypedStack()
1222 {
1223 #if 0 && defined DEBUG
1224         // For the morbidly curious only.  See bugzilla 502589 for some data.
1225         FILE *fp = fopen("typedstack.txt", "a");
1226         fprintf(fp, "PCRE TypedStack depth = %u\n", unsigned(maxdepth));
1227         fclose(fp);
1228 #endif
1229         invariants();
1230         while (topSegment != NULL)
1231                 popSegment();
1232 }
1233
1234 template <class T>
1235 bool TypedStack<T>::isEmpty()
1236 {
1237         return topSegment == NULL;
1238 }
1239
1240 template <class T>
1241 T& TypedStack<T>::stackTop()
1242 {
1243         invariants();
1244         AvmAssert(!isEmpty());
1245         return top[-1];
1246 }
1247
1248 template <class T>
1249 void TypedStack<T>::push(T& value)
1250 {
1251         invariants();
1252         if (top == limit)
1253                 pushSegment();
1254         *top++ = value;
1255 #ifdef DEBUG
1256         depth++;
1257         if (depth > maxdepth)
1258                 maxdepth = depth;
1259 #endif
1260         invariants();
1261 }
1262
1263 template <class T>
1264 void TypedStack<T>::pop(T& value)
1265 {
1266         invariants();
1267 #ifdef DEBUG
1268         depth--;
1269 #endif
1270         top--;
1271         value = *top;
1272         if (top == topSegment->memory)
1273                 popSegment();
1274         invariants();
1275 }
1276
1277 template <class T>
1278 void TypedStack<T>::pushSegment()
1279 {
1280         TypedSegment *s = (TypedSegment*)pcre_malloc(sizeof(TypedSegment));
1281         if (topSegment != NULL)
1282                 topSegment->top = top;
1283         s->next = topSegment;
1284         topSegment = s;
1285         top = s->memory;
1286         limit = s->memory + TYPED_SEGMENT_SIZE;
1287 }
1288
1289 template <class T>
1290 void TypedStack<T>::popSegment()
1291 {
1292         TypedSegment *s = topSegment;
1293         topSegment = topSegment->next;
1294         pcre_free(s);
1295         if (topSegment != NULL)
1296         {
1297                 top = topSegment->top;
1298                 limit = topSegment->memory + TYPED_SEGMENT_SIZE;
1299         }
1300         else
1301         {
1302                 top = NULL;
1303                 limit = NULL;
1304         }
1305 }
1306
1307 /*************************************************
1308 *        Find the fixed length of a pattern      *
1309 *************************************************/
1310
1311 /* Scan a pattern and compute the fixed length of subject that will match it,
1312 if the length is fixed. This is needed for dealing with backward assertions.
1313 In UTF8 mode, the result is in characters rather than bytes.
1314
1315 Arguments:
1316   code     points to the start of the pattern (the bracket)
1317   options  the compiling options
1318
1319 Returns:   the fixed length, or -1 if there is no fixed length,
1320              or -2 if \C was encountered
1321 */
1322
1323 // processing information for find_fixedlength
1324 typedef struct pi_ffl {
1325   uschar* code;
1326   int length;
1327   int branchlength;
1328   /* the work item from which this work item was derived */
1329   pi_ffl* parent;
1330 } pi_ffl;
1331
1332 #define FFL_NEWITEM(ITEM) \
1333   ITEM.code = 0; \
1334   ITEM.length = -1; \
1335   ITEM.branchlength = 0; \
1336   ITEM.parent = 0
1337
1338 static int
1339 find_fixedlength(uschar * const code, const int options)
1340 {
1341 (void)options;
1342
1343 TypedStack<pi_ffl> work_stack;
1344 pi_ffl work_item;
1345
1346 /* set up the first work item */
1347 FFL_NEWITEM(work_item);
1348 work_item.code = code;
1349 work_stack.push(work_item);
1350
1351 /* start the processing loop */
1352 while (!work_stack.isEmpty()) {
1353   work_stack.pop(work_item);
1354   int length = work_item.length;
1355   register int branchlength = work_item.branchlength;
1356   register uschar *cc = work_item.code + 1 + LINK_SIZE;
1357   BOOL skip_to_next_item = FALSE;
1358
1359   for (;!skip_to_next_item;)
1360     {
1361     register int op = *cc;
1362     switch (op)
1363       {
1364       case OP_CBRA:
1365       case OP_BRA:
1366       case OP_ONCE:
1367       case OP_COND:
1368           {
1369       /*
1370        - create a new workitem for the remainder of this item (or alter the
1371          current one)
1372        - create a new workitem with the bracketed code and put it on the stack
1373        - this results in the following stack state: new_item -> work_item -> ...
1374          this way we keep the processing order as if recursion would have been
1375          used
1376        - move the code pointer of work_item after the bracketed code
1377        - start the next iteration (with new_item)
1378       * */
1379       work_item.length = length;
1380       work_item.branchlength = branchlength;
1381           work_stack.push(work_item);
1382
1383       pi_ffl new_item;
1384       FFL_NEWITEM(new_item);
1385       new_item.code = cc + ((op == OP_CBRA)? 2:0);
1386           pi_ffl* parent_item = &work_stack.stackTop();
1387           new_item.parent = parent_item;
1388           work_stack.push(new_item);
1389
1390       do cc += GET(cc, 1); while (*cc == OP_ALT);
1391       // original code added 1 + LINK_SIZE to the pointer
1392       // we don't need this as the processing of an item start with that
1393       parent_item->code = cc;
1394
1395       skip_to_next_item = TRUE;
1396       continue;
1397           }
1398
1399       /* Reached end of a branch; if it's a ket it is the end of a nested
1400       call. If it's ALT it is an alternation in a nested call. If it is
1401       END it's the end of the outer call. All can be handled by the same code.*/
1402
1403       case OP_ALT:
1404       case OP_KET:
1405       case OP_KETRMAX:
1406       case OP_KETRMIN:
1407       case OP_END:
1408       if (length < 0) length = branchlength;
1409       /* clean up the stack and current item and return -1 */
1410       else if (length != branchlength)
1411         { return -1; }
1412       if (*cc != OP_ALT)
1413         {
1414         /* end of the pattern, clean up and return */
1415         if (*cc == OP_END)
1416           { return length; }
1417         else
1418         /* KET is the end of in internal group, propagate the brenchlength back
1419            to the parent group's branchlength */
1420           {
1421             work_item.parent->branchlength += branchlength;
1422             skip_to_next_item = TRUE;
1423             continue;
1424           }
1425         }
1426       cc += 1 + LINK_SIZE;
1427       branchlength = 0;
1428       break;
1429
1430     /* Skip over assertive subpatterns */
1431
1432       case OP_ASSERT:
1433       case OP_ASSERT_NOT:
1434       case OP_ASSERTBACK:
1435       case OP_ASSERTBACK_NOT:
1436       do cc += GET(cc, 1); while (*cc == OP_ALT);
1437       /* Fall through */
1438
1439       /* Skip over things that don't match chars */
1440
1441       case OP_REVERSE:
1442       case OP_CREF:
1443       case OP_RREF:
1444       case OP_DEF:
1445       case OP_OPT:
1446       case OP_CALLOUT:
1447       case OP_SOD:
1448       case OP_SOM:
1449       case OP_EOD:
1450       case OP_EODN:
1451       case OP_CIRC:
1452       case OP_DOLL:
1453       case OP_NOT_WORD_BOUNDARY:
1454       case OP_WORD_BOUNDARY:
1455       cc += _pcre_OP_lengths[*cc];
1456       break;
1457
1458       /* Handle literal characters */
1459
1460       case OP_CHAR:
1461       case OP_CHARNC:
1462       case OP_NOT:
1463       branchlength++;
1464       cc += 2;
1465 #ifdef SUPPORT_UTF8
1466       if ((options & PCRE_UTF8) != 0)
1467         {
1468         while ((*cc & 0xc0) == 0x80) cc++;
1469         }
1470 #endif
1471       break;
1472
1473       /* Handle exact repetitions. The count is already in characters, but we
1474       need to skip over a multibyte character in UTF8 mode.  */
1475
1476       case OP_EXACT:
1477       branchlength += GET2(cc,1);
1478       cc += 4;
1479 #ifdef SUPPORT_UTF8
1480       if ((options & PCRE_UTF8) != 0)
1481         {
1482         while((*cc & 0x80) == 0x80) cc++;
1483         }
1484 #endif
1485       break;
1486
1487       case OP_TYPEEXACT:
1488       branchlength += GET2(cc,1);
1489       if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1490       cc += 4;
1491       break;
1492
1493       /* Handle single-char matchers */
1494
1495       case OP_PROP:
1496       case OP_NOTPROP:
1497       cc += 2;
1498       /* Fall through */
1499
1500       case OP_NOT_DIGIT:
1501       case OP_DIGIT:
1502       case OP_NOT_WHITESPACE:
1503       case OP_WHITESPACE:
1504       case OP_NOT_WORDCHAR:
1505       case OP_WORDCHAR:
1506       case OP_ANY:
1507       branchlength++;
1508       cc++;
1509       break;
1510
1511       /* The single-byte matcher isn't allowed */
1512
1513       case OP_ANYBYTE:
1514       /* free the proc stack and the current item*/
1515       return -2;
1516
1517       /* Check a class for variable quantification */
1518
1519 #ifdef SUPPORT_UTF8
1520       case OP_XCLASS:
1521       cc += GET(cc, 1) - 33;
1522       /* Fall through */
1523 #endif
1524
1525       case OP_CLASS:
1526       case OP_NCLASS:
1527       cc += 33;
1528
1529       switch (*cc)
1530         {
1531         case OP_CRSTAR:
1532         case OP_CRMINSTAR:
1533         case OP_CRQUERY:
1534         case OP_CRMINQUERY:
1535         /* free the proc stack and the current item*/
1536         return -1;
1537
1538         case OP_CRRANGE:
1539         case OP_CRMINRANGE:
1540         if (GET2(cc,1) != GET2(cc,3)) return -1;
1541         branchlength += GET2(cc,1);
1542         cc += 5;
1543         break;
1544
1545         default:
1546         branchlength++;
1547         }
1548       break;
1549
1550       /* Anything else is variable length */
1551
1552       default:
1553       /* free the proc stack and the current item*/
1554       return -1;
1555       }
1556     }
1557 }
1558 /* Control never gets here */
1559 AvmAssert(0);
1560 return -1;
1561 }
1562
1563 #undef FFL_NEWITEM
1564
1565
1566
1567 /*************************************************
1568 *    Scan compiled regex for numbered bracket    *
1569 *************************************************/
1570
1571 /* This little function scans through a compiled pattern until it finds a
1572 capturing bracket with the given number.
1573
1574 Arguments:
1575   code        points to start of expression
1576   utf8        TRUE in UTF-8 mode
1577   number      the required bracket number
1578
1579 Returns:      pointer to the opcode for the bracket, or NULL if not found
1580 */
1581
1582 static const uschar *
1583 find_bracket(const uschar *code, BOOL utf8, int number)
1584 {
1585 for (;;)
1586   {
1587   register int c = *code;
1588   if (c == OP_END) return NULL;
1589
1590   /* XCLASS is used for classes that cannot be represented just by a bit
1591   map. This includes negated single high-valued characters. The length in
1592   the table is zero; the actual length is stored in the compiled code. */
1593
1594   if (c == OP_XCLASS) code += GET(code, 1);
1595
1596   /* Handle capturing bracket */
1597
1598   else if (c == OP_CBRA)
1599     {
1600     int n = GET2(code, 1+LINK_SIZE);
1601     if (n == number) return (uschar *)code;
1602     code += _pcre_OP_lengths[c];
1603     }
1604
1605   /* Otherwise, we can get the item's length from the table, except that for
1606   repeated character types, we have to test for \p and \P, which have an extra
1607   two bytes of parameters. */
1608
1609   else
1610     {
1611     switch(c)
1612       {
1613       case OP_TYPESTAR:
1614       case OP_TYPEMINSTAR:
1615       case OP_TYPEPLUS:
1616       case OP_TYPEMINPLUS:
1617       case OP_TYPEQUERY:
1618       case OP_TYPEMINQUERY:
1619       case OP_TYPEPOSSTAR:
1620       case OP_TYPEPOSPLUS:
1621       case OP_TYPEPOSQUERY:
1622       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1623       break;
1624
1625       case OP_TYPEUPTO:
1626       case OP_TYPEMINUPTO:
1627       case OP_TYPEEXACT:
1628       case OP_TYPEPOSUPTO:
1629       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1630       break;
1631       }
1632
1633     /* Add in the fixed length from the table */
1634
1635     code += _pcre_OP_lengths[c];
1636
1637   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1638   a multi-byte character. The length in the table is a minimum, so we have to
1639   arrange to skip the extra bytes. */
1640
1641 #ifdef SUPPORT_UTF8
1642     if (utf8) switch(c)
1643       {
1644       case OP_CHAR:
1645       case OP_CHARNC:
1646       case OP_EXACT:
1647       case OP_UPTO:
1648       case OP_MINUPTO:
1649       case OP_POSUPTO:
1650       case OP_STAR:
1651       case OP_MINSTAR:
1652       case OP_POSSTAR:
1653       case OP_PLUS:
1654       case OP_MINPLUS:
1655       case OP_POSPLUS:
1656       case OP_QUERY:
1657       case OP_MINQUERY:
1658       case OP_POSQUERY:
1659       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1660       break;
1661       }
1662 #endif
1663     }
1664   }
1665 }
1666
1667
1668
1669 /*************************************************
1670 *   Scan compiled regex for recursion reference  *
1671 *************************************************/
1672
1673 /* This little function scans through a compiled pattern until it finds an
1674 instance of OP_RECURSE.
1675
1676 Arguments:
1677   code        points to start of expression
1678   utf8        TRUE in UTF-8 mode
1679
1680 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
1681 */
1682
1683 static const uschar *
1684 find_recurse(const uschar *code, BOOL utf8)
1685 {
1686 for (;;)
1687   {
1688   register int c = *code;
1689   if (c == OP_END) return NULL;
1690   if (c == OP_RECURSE) return code;
1691
1692   /* XCLASS is used for classes that cannot be represented just by a bit
1693   map. This includes negated single high-valued characters. The length in
1694   the table is zero; the actual length is stored in the compiled code. */
1695
1696   if (c == OP_XCLASS) code += GET(code, 1);
1697
1698   /* Otherwise, we can get the item's length from the table, except that for
1699   repeated character types, we have to test for \p and \P, which have an extra
1700   two bytes of parameters. */
1701
1702   else
1703     {
1704     switch(c)
1705       {
1706       case OP_TYPESTAR:
1707       case OP_TYPEMINSTAR:
1708       case OP_TYPEPLUS:
1709       case OP_TYPEMINPLUS:
1710       case OP_TYPEQUERY:
1711       case OP_TYPEMINQUERY:
1712       case OP_TYPEPOSSTAR:
1713       case OP_TYPEPOSPLUS:
1714       case OP_TYPEPOSQUERY:
1715       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1716       break;
1717
1718       case OP_TYPEPOSUPTO:
1719       case OP_TYPEUPTO:
1720       case OP_TYPEMINUPTO:
1721       case OP_TYPEEXACT:
1722       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1723       break;
1724       }
1725
1726     /* Add in the fixed length from the table */
1727
1728     code += _pcre_OP_lengths[c];
1729
1730     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1731     by a multi-byte character. The length in the table is a minimum, so we have
1732     to arrange to skip the extra bytes. */
1733
1734 #ifdef SUPPORT_UTF8
1735     if (utf8) switch(c)
1736       {
1737       case OP_CHAR:
1738       case OP_CHARNC:
1739       case OP_EXACT:
1740       case OP_UPTO:
1741       case OP_MINUPTO:
1742       case OP_POSUPTO:
1743       case OP_STAR:
1744       case OP_MINSTAR:
1745       case OP_POSSTAR:
1746       case OP_PLUS:
1747       case OP_MINPLUS:
1748       case OP_POSPLUS:
1749       case OP_QUERY:
1750       case OP_MINQUERY:
1751       case OP_POSQUERY:
1752       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1753       break;
1754       }
1755 #endif
1756     }
1757   }
1758 }
1759
1760
1761
1762 /*************************************************
1763 *    Scan compiled branch for non-emptiness      *
1764 *************************************************/
1765
1766 /* This function scans through a branch of a compiled pattern to see whether it
1767 can match the empty string or not. It is called from could_be_empty()
1768 below and from compile_branch() when checking for an unlimited repeat of a
1769 group that can match nothing. Note that first_significant_code() skips over
1770 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1771 struck an inner bracket whose current branch will already have been scanned.
1772
1773 Arguments:
1774   code        points to start of search
1775   endcode     points to where to stop
1776   utf8        TRUE if in UTF8 mode
1777
1778 Returns:      TRUE if what is matched could be empty
1779 */
1780
1781 static BOOL
1782 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1783 {
1784 register int c;
1785 avmplus::AvmCore::checkPCREStackOverflow();
1786 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1787      code < endcode;
1788      code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1789   {
1790   const uschar *ccode;
1791
1792   c = *code;
1793
1794   /* Groups with zero repeats can of course be empty; skip them. */
1795
1796   if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1797     {
1798     code += _pcre_OP_lengths[c];
1799     do code += GET(code, 1); while (*code == OP_ALT);
1800     c = *code;
1801     continue;
1802     }
1803
1804   /* For other groups, scan the branches. */
1805
1806   if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1807     {
1808     BOOL empty_branch;
1809     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1810
1811     /* Scan a closed bracket */
1812
1813     empty_branch = FALSE;
1814     do
1815       {
1816       if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1817         empty_branch = TRUE;
1818       code += GET(code, 1);
1819       }
1820     while (*code == OP_ALT);
1821     if (!empty_branch) return FALSE;   /* All branches are non-empty */
1822     c = *code;
1823     continue;
1824     }
1825
1826   /* Handle the other opcodes */
1827
1828   switch (c)
1829     {
1830     /* Check for quantifiers after a class. XCLASS is used for classes that
1831     cannot be represented just by a bit map. This includes negated single
1832     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1833     actual length is stored in the compiled code, so we must update "code"
1834     here. */
1835
1836 #ifdef SUPPORT_UTF8
1837     case OP_XCLASS:
1838     ccode = code += GET(code, 1);
1839     goto CHECK_CLASS_REPEAT;
1840 #endif
1841
1842     case OP_CLASS:
1843     case OP_NCLASS:
1844     ccode = code + 33;
1845
1846 #ifdef SUPPORT_UTF8
1847     CHECK_CLASS_REPEAT:
1848 #endif
1849
1850     switch (*ccode)
1851       {
1852       case OP_CRSTAR:            /* These could be empty; continue */
1853       case OP_CRMINSTAR:
1854       case OP_CRQUERY:
1855       case OP_CRMINQUERY:
1856       break;
1857
1858       default:                   /* Non-repeat => class must match */
1859       case OP_CRPLUS:            /* These repeats aren't empty */
1860       case OP_CRMINPLUS:
1861       return FALSE;
1862
1863       case OP_CRRANGE:
1864       case OP_CRMINRANGE:
1865       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
1866       break;
1867       }
1868     break;
1869
1870     /* Opcodes that must match a character */
1871
1872     case OP_PROP:
1873     case OP_NOTPROP:
1874     case OP_EXTUNI:
1875     case OP_NOT_DIGIT:
1876     case OP_DIGIT:
1877     case OP_NOT_WHITESPACE:
1878     case OP_WHITESPACE:
1879     case OP_NOT_WORDCHAR:
1880     case OP_WORDCHAR:
1881     case OP_ANY:
1882     case OP_ANYBYTE:
1883     case OP_CHAR:
1884     case OP_CHARNC:
1885     case OP_NOT:
1886     case OP_PLUS:
1887     case OP_MINPLUS:
1888     case OP_POSPLUS:
1889     case OP_EXACT:
1890     case OP_NOTPLUS:
1891     case OP_NOTMINPLUS:
1892     case OP_NOTPOSPLUS:
1893     case OP_NOTEXACT:
1894     case OP_TYPEPLUS:
1895     case OP_TYPEMINPLUS:
1896     case OP_TYPEPOSPLUS:
1897     case OP_TYPEEXACT:
1898     return FALSE;
1899
1900     /* These are going to continue, as they may be empty, but we have to
1901     fudge the length for the \p and \P cases. */
1902
1903     case OP_TYPESTAR:
1904     case OP_TYPEMINSTAR:
1905     case OP_TYPEPOSSTAR:
1906     case OP_TYPEQUERY:
1907     case OP_TYPEMINQUERY:
1908     case OP_TYPEPOSQUERY:
1909     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1910     break;
1911
1912     /* Same for these */
1913
1914     case OP_TYPEUPTO:
1915     case OP_TYPEMINUPTO:
1916     case OP_TYPEPOSUPTO:
1917     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1918     break;
1919
1920     /* End of branch */
1921
1922     case OP_KET:
1923     case OP_KETRMAX:
1924     case OP_KETRMIN:
1925     case OP_ALT:
1926     return TRUE;
1927
1928     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1929     MINUPTO, and POSUPTO may be followed by a multibyte character */
1930
1931 #ifdef SUPPORT_UTF8
1932     case OP_STAR:
1933     case OP_MINSTAR:
1934     case OP_POSSTAR:
1935     case OP_QUERY:
1936     case OP_MINQUERY:
1937     case OP_POSQUERY:
1938     case OP_UPTO:
1939     case OP_MINUPTO:
1940     case OP_POSUPTO:
1941     if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1942     break;
1943 #endif
1944     }
1945   }
1946
1947 return TRUE;
1948 }
1949
1950
1951
1952 /*************************************************
1953 *    Scan compiled regex for non-emptiness       *
1954 *************************************************/
1955
1956 /* This function is called to check for left recursive calls. We want to check
1957 the current branch of the current pattern to see if it could match the empty
1958 string. If it could, we must look outwards for branches at other levels,
1959 stopping when we pass beyond the bracket which is the subject of the recursion.
1960
1961 Arguments:
1962   code        points to start of the recursion
1963   endcode     points to where to stop (current RECURSE item)
1964   bcptr       points to the chain of current (unclosed) branch starts
1965   utf8        TRUE if in UTF-8 mode
1966
1967 Returns:      TRUE if what is matched could be empty
1968 */
1969
1970 static BOOL
1971 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1972   BOOL utf8)
1973 {
1974 while (bcptr != NULL && bcptr->current >= code)
1975   {
1976   if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1977   bcptr = bcptr->outer;
1978   }
1979 return TRUE;
1980 }
1981
1982
1983
1984 /*************************************************
1985 *           Check for POSIX class syntax         *
1986 *************************************************/
1987
1988 /* This function is called when the sequence "[:" or "[." or "[=" is
1989 encountered in a character class. It checks whether this is followed by an
1990 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1991 ".]" or "=]".
1992
1993 Argument:
1994   ptr      pointer to the initial [
1995   endptr   where to return the end pointer
1996   cd       pointer to compile data
1997
1998 Returns:   TRUE or FALSE
1999 */
2000
2001 static BOOL
2002 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
2003 {
2004 int terminator;          /* Don't combine these lines; the Solaris cc */
2005 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
2006 if (*(++ptr) == '^') ptr++;
2007 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
2008 if (*ptr == terminator && ptr[1] == ']')
2009   {
2010   *endptr = ptr;
2011   return TRUE;
2012   }
2013 return FALSE;
2014 }
2015
2016
2017
2018
2019 /*************************************************
2020 *          Check POSIX class name                *
2021 *************************************************/
2022
2023 /* This function is called to check the name given in a POSIX-style class entry
2024 such as [:alnum:].
2025
2026 Arguments:
2027   ptr        points to the first letter
2028   len        the length of the name
2029
2030 Returns:     a value representing the name, or -1 if unknown
2031 */
2032
2033 static int
2034 check_posix_name(const uschar *ptr, int len)
2035 {
2036 register int yield = 0;
2037 while (posix_name_lengths[yield] != 0)
2038   {
2039   if (len == posix_name_lengths[yield] &&
2040     VMPI_strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
2041   yield++;
2042   }
2043 return -1;
2044 }
2045
2046
2047 /*************************************************
2048 *    Adjust OP_RECURSE items in repeated group   *
2049 *************************************************/
2050
2051 /* OP_RECURSE items contain an offset from the start of the regex to the group
2052 that is referenced. This means that groups can be replicated for fixed
2053 repetition simply by copying (because the recursion is allowed to refer to
2054 earlier groups that are outside the current group). However, when a group is
2055 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
2056 it, after it has been compiled. This means that any OP_RECURSE items within it
2057 that refer to the group itself or any contained groups have to have their
2058 offsets adjusted. That one of the jobs of this function. Before it is called,
2059 the partially compiled regex must be temporarily terminated with OP_END.
2060
2061 This function has been extended with the possibility of forward references for
2062 recursions and subroutine calls. It must also check the list of such references
2063 for the group we are dealing with. If it finds that one of the recursions in
2064 the current group is on this list, it adjusts the offset in the list, not the
2065 value in the reference (which is a group number).
2066
2067 Arguments:
2068   group      points to the start of the group
2069   adjust     the amount by which the group is to be moved
2070   utf8       TRUE in UTF-8 mode
2071   cd         contains pointers to tables etc.
2072   save_hwm   the hwm forward reference pointer at the start of the group
2073
2074 Returns:     nothing
2075 */
2076
2077 static void
2078 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2079   uschar *save_hwm)
2080 {
2081 uschar *ptr = group;
2082
2083 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2084   {
2085   int offset;
2086   uschar *hc;
2087
2088   /* See if this recursion is on the forward reference list. If so, adjust the
2089   reference. */
2090
2091   for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2092     {
2093     offset = GET(hc, 0);
2094     if (cd->start_code + offset == ptr + 1)
2095       {
2096       PUT(hc, 0, offset + adjust);
2097       break;
2098       }
2099     }
2100
2101   /* Otherwise, adjust the recursion offset if it's after the start of this
2102   group. */
2103
2104   if (hc >= cd->hwm)
2105     {
2106     offset = GET(ptr, 1);
2107     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2108     }
2109
2110   ptr += 1 + LINK_SIZE;
2111   }
2112 }
2113
2114
2115
2116 /*************************************************
2117 *        Insert an automatic callout point       *
2118 *************************************************/
2119
2120 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2121 callout points before each pattern item.
2122
2123 Arguments:
2124   code           current code pointer
2125   ptr            current pattern pointer
2126   cd             pointers to tables etc
2127
2128 Returns:         new code pointer
2129 */
2130
2131 static uschar *
2132 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2133 {
2134 *code++ = OP_CALLOUT;
2135 *code++ = 255;
2136 PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
2137 PUT(code, LINK_SIZE, 0);                /* Default length */
2138 return code + 2*LINK_SIZE;
2139 }
2140
2141
2142
2143 /*************************************************
2144 *         Complete a callout item                *
2145 *************************************************/
2146
2147 /* A callout item contains the length of the next item in the pattern, which
2148 we can't fill in till after we have reached the relevant point. This is used
2149 for both automatic and manual callouts.
2150
2151 Arguments:
2152   previous_callout   points to previous callout item
2153   ptr                current pattern pointer
2154   cd                 pointers to tables etc
2155
2156 Returns:             nothing
2157 */
2158
2159 static void
2160 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2161 {
2162 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2163 PUT(previous_callout, 2 + LINK_SIZE, length);
2164 }
2165
2166
2167
2168 #ifdef SUPPORT_UCP
2169 /*************************************************
2170 *           Get othercase range                  *
2171 *************************************************/
2172
2173 /* This function is passed the start and end of a class range, in UTF-8 mode
2174 with UCP support. It searches up the characters, looking for internal ranges of
2175 characters in the "other" case. Each call returns the next one, updating the
2176 start address.
2177
2178 Arguments:
2179   cptr        points to starting character value; updated
2180   d           end value
2181   ocptr       where to put start of othercase range
2182   odptr       where to put end of othercase range
2183
2184 Yield:        TRUE when range returned; FALSE when no more
2185 */
2186
2187 static BOOL
2188 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2189   unsigned int *odptr)
2190 {
2191 unsigned int c, othercase, next;
2192
2193 for (c = *cptr; c <= d; c++)
2194   { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
2195
2196 if (c > d) return FALSE;
2197
2198 *ocptr = othercase;
2199 next = othercase + 1;
2200
2201 for (++c; c <= d; c++)
2202   {
2203   if (_pcre_ucp_othercase(c) != next) break;
2204   next++;
2205   }
2206
2207 *odptr = next - 1;
2208 *cptr = c;
2209
2210 return TRUE;
2211 }
2212 #endif  /* SUPPORT_UCP */
2213
2214
2215
2216 /*************************************************
2217 *     Check if auto-possessifying is possible    *
2218 *************************************************/
2219
2220 /* This function is called for unlimited repeats of certain items, to see
2221 whether the next thing could possibly match the repeated item. If not, it makes
2222 sense to automatically possessify the repeated item.
2223
2224 Arguments:
2225   op_code       the repeated op code
2226   this          data for this item, depends on the opcode
2227   utf8          TRUE in UTF-8 mode
2228   utf8_char     used for utf8 character bytes, NULL if not relevant
2229   ptr           next character in pattern
2230   options       options bits
2231   cd            contains pointers to tables etc.
2232
2233 Returns:        TRUE if possessifying is wanted
2234 */
2235
2236 static BOOL
2237 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2238   const uschar *ptr, int options, compile_data *cd)
2239 {
2240 int next;
2241
2242 /* Skip whitespace and comments in extended mode */
2243
2244 if ((options & PCRE_EXTENDED) != 0)
2245   {
2246   for (;;)
2247     {
2248     while ( true ) //(*ptr < 256 && (cd->ctypes[*ptr] & ctype_space) != 0) || (*ptr >= 256 && isUnicodeWhiteSpace(*ptr)) )
2249         {
2250                 if( (cd->ctypes[*ptr] & ctype_space) != 0 )
2251                 {
2252                         ptr++;
2253                         continue;
2254                 }
2255                 break;
2256         }
2257     if (*ptr == '#')
2258       {
2259       while (*(++ptr) != 0)
2260         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2261       }
2262     else break;
2263     }
2264   }
2265
2266 /* If the next item is one that we can handle, get its value. A non-negative
2267 value is a character, a negative value is an escape value. */
2268
2269 if (*ptr == '\\')
2270   {
2271   int temperrorcode = 0;
2272   next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2273   if (temperrorcode != 0) return FALSE;
2274   ptr++;    /* Point after the escape sequence */
2275   }
2276
2277 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2278   {
2279 #ifdef SUPPORT_UTF8
2280   if (utf8) { GETCHARINC(next, ptr); } else
2281 #endif
2282   next = *ptr++;
2283   }
2284
2285 else return FALSE;
2286
2287 /* Skip whitespace and comments in extended mode */
2288
2289 if ((options & PCRE_EXTENDED) != 0)
2290   {
2291   for (;;)
2292     {
2293     while ( true ) //(*ptr < 256 && (cd->ctypes[*ptr] & ctype_space) != 0) || (*ptr >= 256 && isUnicodeWhiteSpace(*ptr)) )
2294         {
2295                 if( (cd->ctypes[*ptr] & ctype_space) != 0 )
2296                 {
2297                         ptr++;
2298                         continue;
2299                 }
2300                 break;
2301         }
2302     if (*ptr == '#')
2303       {
2304       while (*(++ptr) != 0)
2305         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2306       }
2307     else break;
2308     }
2309   }
2310
2311 /* If the next thing is itself optional, we have to give up. */
2312
2313 if (*ptr == '*' || *ptr == '?' || VMPI_strncmp((char *)ptr, "{0,", 3) == 0)
2314   return FALSE;
2315
2316 /* Now compare the next item with the previous opcode. If the previous is a
2317 positive single character match, "item" either contains the character or, if
2318 "item" is greater than 127 in utf8 mode, the character's bytes are in
2319 utf8_char. */
2320
2321
2322 /* Handle cases when the next item is a character. */
2323
2324 if (next >= 0) switch(op_code)
2325   {
2326   case OP_CHAR:
2327 #ifdef SUPPORT_UTF8
2328   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2329 #endif
2330   return item != next;
2331
2332   /* For CHARNC (caseless character) we must check the other case. If we have
2333   Unicode property support, we can use it to test the other case of
2334   high-valued characters. */
2335
2336   case OP_CHARNC:
2337 #ifdef SUPPORT_UTF8
2338   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2339 #endif
2340   if (item == next) return FALSE;
2341 #ifdef SUPPORT_UTF8
2342   if (utf8)
2343     {
2344     unsigned int othercase;
2345     if (next < 128) othercase = cd->fcc[next]; else
2346 #ifdef SUPPORT_UCP
2347     othercase = _pcre_ucp_othercase((unsigned int)next);
2348 #else
2349     othercase = NOTACHAR;
2350 #endif
2351     return (unsigned int)item != othercase;
2352     }
2353   else
2354 #endif  /* SUPPORT_UTF8 */
2355   return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
2356
2357   /* For OP_NOT, "item" must be a single-byte character. */
2358
2359   case OP_NOT:
2360   if (next < 0) return FALSE;  /* Not a character */
2361   if (item == next) return TRUE;
2362   if ((options & PCRE_CASELESS) == 0) return FALSE;
2363 #ifdef SUPPORT_UTF8
2364   if (utf8)
2365     {
2366     unsigned int othercase;
2367     if (next < 128) othercase = cd->fcc[next]; else
2368 #ifdef SUPPORT_UCP
2369     othercase = _pcre_ucp_othercase(next);
2370 #else
2371     othercase = NOTACHAR;
2372 #endif
2373     return (unsigned int)item == othercase;
2374     }
2375   else
2376 #endif  /* SUPPORT_UTF8 */
2377   return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
2378
2379   case OP_DIGIT:
2380   return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2381
2382   case OP_NOT_DIGIT:
2383   return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2384
2385   case OP_WHITESPACE:
2386   return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2387
2388   case OP_NOT_WHITESPACE:
2389   return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2390
2391   case OP_WORDCHAR:
2392   return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2393
2394   case OP_NOT_WORDCHAR:
2395   return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2396
2397   case OP_HSPACE:
2398   case OP_NOT_HSPACE:
2399   switch(next)
2400     {
2401     case 0x09:
2402     case 0x20:
2403     case 0xa0:
2404     case 0x1680:
2405     case 0x180e:
2406     case 0x2000:
2407     case 0x2001:
2408     case 0x2002:
2409     case 0x2003:
2410     case 0x2004:
2411     case 0x2005:
2412     case 0x2006:
2413     case 0x2007:
2414     case 0x2008:
2415     case 0x2009:
2416     case 0x200A:
2417     case 0x202f:
2418     case 0x205f:
2419     case 0x3000:
2420     return op_code != OP_HSPACE;
2421     default:
2422     return op_code == OP_HSPACE;
2423     }
2424
2425   case OP_VSPACE:
2426   case OP_NOT_VSPACE:
2427   switch(next)
2428     {
2429     case 0x0a:
2430     case 0x0b:
2431     case 0x0c:
2432     case 0x0d:
2433     case 0x85:
2434     case 0x2028:
2435     case 0x2029:
2436     return op_code != OP_VSPACE;
2437     default:
2438     return op_code == OP_VSPACE;
2439     }
2440
2441   default:
2442   return FALSE;
2443   }
2444
2445
2446 /* Handle the case when the next item is \d, \s, etc. */
2447
2448 switch(op_code)
2449   {
2450   case OP_CHAR:
2451   case OP_CHARNC:
2452 #ifdef SUPPORT_UTF8
2453   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2454 #endif
2455   switch(-next)
2456     {
2457     case ESC_d:
2458     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2459
2460     case ESC_D:
2461     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2462
2463     case ESC_s:
2464     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2465
2466     case ESC_S:
2467     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2468
2469     case ESC_w:
2470     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2471
2472     case ESC_W:
2473     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2474
2475     case ESC_h:
2476     case ESC_H:
2477     switch(item)
2478       {
2479       case 0x09:
2480       case 0x20:
2481       case 0xa0:
2482       case 0x1680:
2483       case 0x180e:
2484       case 0x2000:
2485       case 0x2001:
2486       case 0x2002:
2487       case 0x2003:
2488       case 0x2004:
2489       case 0x2005:
2490       case 0x2006:
2491       case 0x2007:
2492       case 0x2008:
2493       case 0x2009:
2494       case 0x200A:
2495       case 0x202f:
2496       case 0x205f:
2497       case 0x3000:
2498       return -next != ESC_h;
2499       default:
2500       return -next == ESC_h;
2501       }
2502
2503     case ESC_v:
2504     case ESC_V:
2505     switch(item)
2506       {
2507       case 0x0a:
2508       case 0x0b:
2509       case 0x0c:
2510       case 0x0d:
2511       case 0x85:
2512       case 0x2028:
2513       case 0x2029:
2514       return -next != ESC_v;
2515       default:
2516       return -next == ESC_v;
2517       }
2518
2519     default:
2520     return FALSE;
2521     }
2522
2523   case OP_DIGIT:
2524   return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2525          next == -ESC_h || next == -ESC_v;
2526
2527   case OP_NOT_DIGIT:
2528   return next == -ESC_d;
2529
2530   case OP_WHITESPACE:
2531   return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2532
2533   case OP_NOT_WHITESPACE:
2534   return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2535
2536   case OP_HSPACE:
2537   return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2538
2539   case OP_NOT_HSPACE:
2540   return next == -ESC_h;
2541
2542   /* Can't have \S in here because VT matches \S (Perl anomaly) */
2543   case OP_VSPACE:
2544   return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2545
2546   case OP_NOT_VSPACE:
2547   return next == -ESC_v;
2548
2549   case OP_WORDCHAR:
2550   return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2551
2552   case OP_NOT_WORDCHAR:
2553   return next == -ESC_w || next == -ESC_d;
2554
2555   default:
2556   return FALSE;
2557   }
2558
2559 /* Control does not reach here */
2560 }
2561
2562
2563
2564 /*************************************************
2565 *           Compile one branch                   *
2566 *************************************************/
2567
2568 /* Scan the pattern, compiling it into the a vector. If the options are
2569 changed during the branch, the pointer is used to change the external options
2570 bits. This function is used during the pre-compile phase when we are trying
2571 to find out the amount of memory needed, as well as during the real compile
2572 phase. The value of lengthptr distinguishes the two phases.
2573
2574 Arguments:
2575   optionsptr     pointer to the option bits
2576   codeptr        points to the pointer to the current code point
2577   ptrptr         points to the current pattern pointer
2578   errorcodeptr   points to error code variable
2579   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2580   reqbyteptr     set to the last literal character required, else < 0
2581   bcptr          points to current branch chain
2582   cd             contains pointers to tables etc.
2583   lengthptr      NULL during the real compile phase
2584                  points to length accumulator during pre-compile phase
2585
2586 Returns:         TRUE on success
2587                  FALSE, with *errorcodeptr set non-zero on error
2588 */
2589
2590 static BOOL
2591 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2592   int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2593   compile_data *cd, int *lengthptr)
2594 {
2595 int repeat_type, op_type;
2596 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2597 int bravalue = 0;
2598 int greedy_default, greedy_non_default;
2599 int firstbyte, reqbyte;
2600 int zeroreqbyte, zerofirstbyte;
2601 int req_caseopt, reqvary, tempreqvary;
2602 int options = *optionsptr;
2603 int after_manual_callout = 0;
2604 int length_prevgroup = 0;
2605 register int c;
2606 register uschar *code = *codeptr;
2607 uschar *last_code = code;
2608 uschar *orig_code = code;
2609 uschar *tempcode;
2610 BOOL inescq = FALSE;
2611 BOOL groupsetfirstbyte = FALSE;
2612 const uschar *ptr = *ptrptr;
2613 const uschar *tempptr;
2614 uschar *previous = NULL;
2615 uschar *previous_callout = NULL;
2616 uschar *save_hwm = NULL;
2617 uschar classbits[32];
2618
2619 #ifdef SUPPORT_UTF8
2620 BOOL class_utf8;
2621 BOOL utf8 = (options & PCRE_UTF8) != 0;
2622 uschar *class_utf8data;
2623 uschar *class_utf8data_base;
2624 uschar utf8_char[6];
2625 #else
2626 BOOL utf8 = FALSE;
2627 uschar *utf8_char = NULL;
2628 #endif
2629
2630 #ifdef PCRE_DEBUG
2631 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2632 #endif
2633
2634 /* Set up the default and non-default settings for greediness */
2635
2636 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2637 greedy_non_default = greedy_default ^ 1;
2638
2639 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2640 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2641 matches a non-fixed char first char; reqbyte just remains unset if we never
2642 find one.
2643
2644 When we hit a repeat whose minimum is zero, we may have to adjust these values
2645 to take the zero repeat into account. This is implemented by setting them to
2646 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2647 item types that can be repeated set these backoff variables appropriately. */
2648
2649 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2650
2651 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2652 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2653 value > 255. It is added into the firstbyte or reqbyte variables to record the
2654 case status of the value. This is used only for ASCII characters. */
2655
2656 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2657
2658 /* Switch on next character until the end of the branch */
2659
2660 for (;; ptr++)
2661   {
2662   BOOL negate_class;
2663   BOOL inverted_class = FALSE;
2664   BOOL possessive_quantifier;
2665   BOOL is_quantifier;
2666   BOOL is_recurse;
2667   BOOL reset_bracount;
2668   int class_charcount;
2669   int class_lastchar;
2670   int newoptions;
2671   int recno;
2672   int refsign;
2673   int skipbytes;
2674   int subreqbyte;
2675   int subfirstbyte;
2676   int terminator;
2677   int mclength;
2678   uschar mcbuffer[8];
2679
2680   /* Get next byte in the pattern */
2681
2682   c = *ptr;
2683
2684   /* If we are in the pre-compile phase, accumulate the length used for the
2685   previous cycle of this loop. */
2686
2687   if (lengthptr != NULL)
2688     {
2689 #ifdef PCRE_DEBUG
2690     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2691 #endif
2692     if (code > cd->start_workspace + (COMPILE_WORK_SIZE-10)) /* tierney: added -10, since actual overwriting corrupts the stack Check for overrun */
2693       {
2694       *errorcodeptr = ERR52;
2695       goto FAILED;
2696       }
2697
2698     /* There is at least one situation where code goes backwards: this is the
2699     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2700     the class is simply eliminated. However, it is created first, so we have to
2701     allow memory for it. Therefore, don't ever reduce the length at this point.
2702     */
2703
2704     if (code < last_code) code = last_code;
2705
2706     /* Paranoid check for integer overflow */
2707
2708     if (OFLOW_MAX - *lengthptr < code - last_code)
2709       {
2710       *errorcodeptr = ERR20;
2711       goto FAILED;
2712       }
2713
2714     *lengthptr += code - last_code;
2715     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2716
2717     /* If "previous" is set and it is not at the start of the work space, move
2718     it back to there, in order to avoid filling up the work space. Otherwise,
2719     if "previous" is NULL, reset the current code pointer to the start. */
2720
2721     if (previous != NULL)
2722       {
2723       if (previous > orig_code)
2724         {
2725         VMPI_memmove(orig_code, previous, code - previous);
2726         code -= previous - orig_code;
2727         previous = orig_code;
2728         }
2729       }
2730     else code = orig_code;
2731
2732     /* Remember where this code item starts so we can pick up the length
2733     next time round. */
2734
2735     last_code = code;
2736     }
2737
2738   /* In the real compile phase, just check the workspace used by the forward
2739   reference list. */
2740
2741   else if (cd->hwm > cd->start_workspace + (COMPILE_WORK_SIZE-10)) // tierney: added -10, since actual overwriting corrupts the stack
2742     {
2743     *errorcodeptr = ERR52;
2744     goto FAILED;
2745     }
2746
2747   /* If in \Q...\E, check for the end; if not, we have a literal */
2748
2749   if (inescq && c != 0)
2750     {
2751     if (c == '\\' && ptr[1] == 'E')
2752       {
2753       inescq = FALSE;
2754       ptr++;
2755       continue;
2756       }
2757     else
2758       {
2759       if (previous_callout != NULL)
2760         {
2761         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2762           complete_callout(previous_callout, ptr, cd);
2763         previous_callout = NULL;
2764         }
2765       if ((options & PCRE_AUTO_CALLOUT) != 0)
2766         {
2767         previous_callout = code;
2768         code = auto_callout(code, ptr, cd);
2769         }
2770       goto NORMAL_CHAR;
2771       }
2772     }
2773
2774   /* Fill in length of a previous callout, except when the next thing is
2775   a quantifier. */
2776
2777   is_quantifier = c == '*' || c == '+' || c == '?' ||
2778     (c == '{' && is_counted_repeat(ptr+1));
2779
2780   if (!is_quantifier && previous_callout != NULL &&
2781        after_manual_callout-- <= 0)
2782     {
2783     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2784       complete_callout(previous_callout, ptr, cd);
2785     previous_callout = NULL;
2786     }
2787
2788   /* In extended mode, skip white space and comments */
2789
2790   if ((options & PCRE_EXTENDED) != 0)
2791     {
2792     if (c < 256)
2793         {
2794                 if ((cd->ctypes[c] & ctype_space) != 0)
2795                         continue;
2796         }
2797         else if (isUnicodeWhiteSpace(c) == true)
2798         {
2799                 continue;
2800         }
2801     if (c == '#')
2802       {
2803       while (*(++ptr) != 0)
2804         {
2805         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2806         }
2807       if (*ptr != 0) continue;
2808
2809       /* Else fall through to handle end of string */
2810       c = 0;
2811       }
2812     }
2813
2814   /* No auto callout for quantifiers. */
2815
2816   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2817     {
2818     previous_callout = code;
2819     code = auto_callout(code, ptr, cd);
2820     }
2821
2822   switch(c)
2823     {
2824     /* ===================================================================*/
2825     case 0:                        /* The branch terminates at string end */
2826     case '|':                      /* or | or ) */
2827     case ')':
2828     *firstbyteptr = firstbyte;
2829     *reqbyteptr = reqbyte;
2830     *codeptr = code;
2831     *ptrptr = ptr;
2832     if (lengthptr != NULL)
2833       {
2834       if (OFLOW_MAX - *lengthptr < code - last_code)
2835         {
2836         *errorcodeptr = ERR20;
2837         goto FAILED;
2838         }
2839       *lengthptr += code - last_code;   /* To include callout length */
2840       DPRINTF((">> end branch\n"));
2841       }
2842     return TRUE;
2843
2844
2845     /* ===================================================================*/
2846     /* Handle single-character metacharacters. In multiline mode, ^ disables
2847     the setting of any following char as a first character. */
2848
2849     case '^':
2850     if ((options & PCRE_MULTILINE) != 0)
2851       {
2852       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2853       }
2854     previous = NULL;
2855     *code++ = OP_CIRC;
2856     break;
2857
2858     case '$':
2859     previous = NULL;
2860     *code++ = OP_DOLL;
2861     break;
2862
2863     /* There can never be a first char if '.' is first, whatever happens about
2864     repeats. The value of reqbyte doesn't change either. */
2865
2866     case '.':
2867     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2868     zerofirstbyte = firstbyte;
2869     zeroreqbyte = reqbyte;
2870     previous = code;
2871     *code++ = OP_ANY;
2872     break;
2873
2874
2875     /* ===================================================================*/
2876     /* Character classes. If the included characters are all < 256, we build a
2877     32-byte bitmap of the permitted characters, except in the special case
2878     where there is only one such character. For negated classes, we build the
2879     map as usual, then invert it at the end. However, we use a different opcode
2880     so that data characters > 255 can be handled correctly.
2881
2882     If the class contains characters outside the 0-255 range, a different
2883     opcode is compiled. It may optionally have a bit map for characters < 256,
2884     but those above are are explicitly listed afterwards. A flag byte tells
2885     whether the bitmap is present, and whether this is a negated class or not.
2886     */
2887
2888     case '[':
2889     previous = code;
2890
2891     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2892     they are encountered at the top level, so we'll do that too. */
2893
2894     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2895         check_posix_syntax(ptr, &tempptr, cd))
2896       {
2897       *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2898       goto FAILED;
2899       }
2900
2901     /* If the first character is '^', set the negation flag and skip it. Also,
2902     if the first few characters (either before or after ^) are \Q\E or \E we
2903     skip them too. This makes for compatibility with Perl. */
2904
2905     negate_class = FALSE;
2906     for (;;)
2907       {
2908       c = *(++ptr);
2909       if (c == '\\')
2910         {
2911         if (ptr[1] == 'E') ptr++;
2912           else if (VMPI_strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2913             else break;
2914         }
2915       else if (!negate_class && c == '^')
2916         negate_class = TRUE;
2917       else break;
2918       }
2919
2920     /* Keep a count of chars with values < 256 so that we can optimize the case
2921     of just a single character (as long as it's < 256). However, For higher
2922     valued UTF-8 characters, we don't yet do any optimization. */
2923
2924     class_charcount = 0;
2925     class_lastchar = -1;
2926
2927     /* Initialize the 32-char bit map to all zeros. We build the map in a
2928     temporary bit of memory, in case the class contains only 1 character (less
2929     than 256), because in that case the compiled code doesn't use the bit map.
2930     */
2931
2932     VMPI_memset(classbits, 0, 32 * sizeof(uschar));
2933
2934 #ifdef SUPPORT_UTF8
2935     class_utf8 = FALSE;                       /* No chars >= 256 */
2936     class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2937     class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
2938 #endif
2939
2940     /* Process characters until ] is reached. By writing this as a "do" it
2941     means that an initial ] is taken as a data character. At the start of the
2942     loop, c contains the first byte of the character. */
2943
2944         /* cn:  Due to bug http://flashqa.macromedia.com/bugapp/detail.asp?ID=110290
2945            we need to handle [^] as matching any character (its not not a character...)
2946            The ^ is already stripped off, so we just need to catch a leading ']'
2947     */
2948           if (c == ']') // [] is still invalid, so we know we must have [^].  [^] means any character other than null
2949           {
2950                 class_lastchar = 0;
2951                 class_charcount = 1;
2952           }
2953           else if (c != 0) do
2954       {
2955       const uschar *oldptr;
2956
2957 #ifdef SUPPORT_UTF8
2958       if (utf8 && c > 127)
2959         {                           /* Braces are required because the */
2960         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2961         }
2962
2963       /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2964       data and reset the pointer, to ensure that large classes that cannot
2965       overwrite the work space. */
2966
2967       if (lengthptr != NULL)
2968         {
2969         *lengthptr += class_utf8data - class_utf8data_base;
2970         class_utf8data = class_utf8data_base;
2971         }
2972
2973 #endif
2974
2975       /* Inside \Q...\E everything is literal except \E */
2976
2977       if (inescq)
2978         {
2979         if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2980           {
2981           inescq = FALSE;                   /* Reset literal state */
2982           ptr++;                            /* Skip the 'E' */
2983           continue;                         /* Carry on with next */
2984           }
2985         goto CHECK_RANGE;                   /* Could be range if \E follows */
2986         }
2987
2988       /* Handle POSIX class names. Perl allows a negation extension of the
2989       form [:^name:]. A square bracket that doesn't match the syntax is
2990       treated as a literal. We also recognize the POSIX constructions
2991       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2992       5.6 and 5.8 do. */
2993
2994       if (c == '[' &&
2995           (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2996           check_posix_syntax(ptr, &tempptr, cd))
2997         {
2998         BOOL local_negate = FALSE;
2999         int posix_class, taboffset, tabopt;
3000         register const uschar *cbits = cd->cbits;
3001         uschar pbits[32];
3002
3003         if (ptr[1] != ':')
3004           {
3005           *errorcodeptr = ERR31;
3006           goto FAILED;
3007           }
3008
3009         ptr += 2;
3010         if (*ptr == '^')
3011           {
3012           local_negate = TRUE;
3013           ptr++;
3014           }
3015
3016         posix_class = check_posix_name(ptr, tempptr - ptr);
3017         if (posix_class < 0)
3018           {
3019           *errorcodeptr = ERR30;
3020           goto FAILED;
3021           }
3022
3023         /* If matching is caseless, upper and lower are converted to
3024         alpha. This relies on the fact that the class table starts with
3025         alpha, lower, upper as the first 3 entries. */
3026
3027         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3028           posix_class = 0;
3029
3030         /* We build the bit map for the POSIX class in a chunk of local store
3031         because we may be adding and subtracting from it, and we don't want to
3032         subtract bits that may be in the main map already. At the end we or the
3033         result into the bit map that is being built. */
3034
3035         posix_class *= 3;
3036
3037         /* Copy in the first table (always present) */
3038
3039         VMPI_memcpy(pbits, cbits + posix_class_maps[posix_class],
3040           32 * sizeof(uschar));
3041
3042         /* If there is a second table, add or remove it as required. */
3043
3044         taboffset = posix_class_maps[posix_class + 1];
3045         tabopt = posix_class_maps[posix_class + 2];
3046
3047         if (taboffset >= 0)
3048           {
3049           if (tabopt >= 0)
3050             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3051           else
3052             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3053           }
3054
3055         /* Not see if we need to remove any special characters. An option
3056         value of 1 removes vertical space and 2 removes underscore. */
3057
3058         if (tabopt < 0) tabopt = -tabopt;
3059         if (tabopt == 1) pbits[1] &= ~0x3c;
3060           else if (tabopt == 2) pbits[11] &= 0x7f;
3061
3062         /* Add the POSIX table or its complement into the main table that is
3063         being built and we are done. */
3064
3065         if (local_negate)
3066           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3067         else
3068           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3069
3070         ptr = tempptr + 1;
3071         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
3072         continue;    /* End of POSIX syntax handling */
3073         }
3074
3075       /* Backslash may introduce a single character, or it may introduce one
3076       of the specials, which just set a flag. The sequence \b is a special
3077       case. Inside a class (and only there) it is treated as backspace.
3078       Elsewhere it marks a word boundary. Other escapes have preset maps ready
3079       to 'or' into the one we are building. We assume they have more than one
3080       character in them, so set class_charcount bigger than one. */
3081
3082       if (c == '\\')
3083         {
3084         c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3085         if (*errorcodeptr != 0) goto FAILED;
3086
3087         if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */
3088         else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
3089         else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
3090         else if (-c == ESC_Q)            /* Handle start of quoted string */
3091           {
3092           if (ptr[1] == '\\' && ptr[2] == 'E')
3093             {
3094             ptr += 2; /* avoid empty string */
3095             }
3096           else inescq = TRUE;
3097           continue;
3098           }
3099         else if (-c == ESC_E) continue;  /* Ignore orphan \E */
3100
3101         if (c < 0)
3102           {
3103           register const uschar *cbits = cd->cbits;
3104           class_charcount += 2;     /* Greater than 1 is what matters */
3105
3106           /* Save time by not doing this in the pre-compile phase. */
3107
3108           if (lengthptr == NULL) switch (-c)
3109             {
3110             case ESC_d:
3111             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3112             continue;
3113
3114             case ESC_D:
3115             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3116             inverted_class = TRUE;
3117             continue;
3118
3119             case ESC_w:
3120             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3121             continue;
3122
3123             case ESC_W:
3124             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3125             inverted_class = TRUE;
3126             continue;
3127
3128             case ESC_s:
3129             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3130             classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
3131             continue;
3132
3133             case ESC_S:
3134             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3135             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
3136             inverted_class = TRUE;
3137             continue;
3138
3139             case ESC_E: /* Perl ignores an orphan \E */
3140             continue;
3141
3142             default:    /* Not recognized; fall through */
3143             break;      /* Need "default" setting to stop compiler warning. */
3144             }
3145
3146           /* In the pre-compile phase, just do the recognition. */
3147
3148           else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3149                    c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3150
3151           /* We need to deal with \H, \h, \V, and \v in both phases because
3152           they use extra memory. */
3153
3154           if (-c == ESC_h)
3155             {
3156             SETBIT(classbits, 0x09); /* VT */
3157             SETBIT(classbits, 0x20); /* SPACE */
3158             SETBIT(classbits, 0xa0); /* NSBP */
3159 #ifdef SUPPORT_UTF8
3160             if (utf8)
3161               {
3162               class_utf8 = TRUE;
3163               *class_utf8data++ = XCL_SINGLE;
3164               class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3165               *class_utf8data++ = XCL_SINGLE;
3166               class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3167               *class_utf8data++ = XCL_RANGE;
3168               class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3169               class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3170               *class_utf8data++ = XCL_SINGLE;
3171               class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3172               *class_utf8data++ = XCL_SINGLE;
3173               class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3174               *class_utf8data++ = XCL_SINGLE;
3175               class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3176               }
3177 #endif
3178             continue;
3179             }
3180
3181           if (-c == ESC_H)
3182             {
3183             for (c = 0; c < 32; c++)
3184               {
3185               int x = 0xff;
3186               switch (c)
3187                 {
3188                 case 0x09/8: x ^= 1 << (0x09%8); break;
3189                 case 0x20/8: x ^= 1 << (0x20%8); break;
3190                 case 0xa0/8: x ^= 1 << (0xa0%8); break;
3191                 default: break;
3192                 }
3193               classbits[c] |= x;
3194               }
3195
3196 #ifdef SUPPORT_UTF8
3197             if (utf8)
3198               {
3199               class_utf8 = TRUE;
3200               *class_utf8data++ = XCL_RANGE;
3201               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3202               class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3203               *class_utf8data++ = XCL_RANGE;
3204               class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3205               class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3206               *class_utf8data++ = XCL_RANGE;
3207               class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3208               class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3209               *class_utf8data++ = XCL_RANGE;
3210               class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3211               class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3212               *class_utf8data++ = XCL_RANGE;
3213               class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3214               class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3215               *class_utf8data++ = XCL_RANGE;
3216               class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3217               class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3218               *class_utf8data++ = XCL_RANGE;
3219               class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3220               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3221               }
3222 #endif
3223             continue;
3224             }
3225
3226           if (-c == ESC_v)
3227             {
3228             SETBIT(classbits, 0x0a); /* LF */
3229             SETBIT(classbits, 0x0b); /* VT */
3230             SETBIT(classbits, 0x0c); /* FF */
3231             SETBIT(classbits, 0x0d); /* CR */
3232             SETBIT(classbits, 0x85); /* NEL */
3233 #ifdef SUPPORT_UTF8
3234             if (utf8)
3235               {
3236               class_utf8 = TRUE;
3237               *class_utf8data++ = XCL_RANGE;
3238               class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3239               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3240               }
3241 #endif
3242             continue;
3243             }
3244
3245           if (-c == ESC_V)
3246             {
3247             for (c = 0; c < 32; c++)
3248               {
3249               int x = 0xff;
3250               switch (c)
3251                 {
3252                 case 0x0a/8: x ^= 1 << (0x0a%8);
3253                              x ^= 1 << (0x0b%8);
3254                              x ^= 1 << (0x0c%8);
3255                              x ^= 1 << (0x0d%8);
3256                              break;
3257                 case 0x85/8: x ^= 1 << (0x85%8); break;
3258                 default: break;
3259                 }
3260               classbits[c] |= x;
3261               }
3262
3263 #ifdef SUPPORT_UTF8
3264             if (utf8)
3265               {
3266               class_utf8 = TRUE;
3267               *class_utf8data++ = XCL_RANGE;
3268               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3269               class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3270               *class_utf8data++ = XCL_RANGE;
3271               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3272               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3273               }
3274 #endif
3275             continue;
3276             }
3277
3278           /* We need to deal with \P and \p in both phases. */
3279
3280 #ifdef SUPPORT_UCP
3281           if (-c == ESC_p || -c == ESC_P)
3282             {
3283             BOOL negated;
3284             int pdata;
3285             int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3286             if (ptype < 0) goto FAILED;
3287             class_utf8 = TRUE;
3288             *class_utf8data++ = ((-c == ESC_p) != negated)?
3289               XCL_PROP : XCL_NOTPROP;
3290             *class_utf8data++ = ptype;
3291             *class_utf8data++ = pdata;
3292             class_charcount -= 2;   /* Not a < 256 character */
3293             continue;
3294             }
3295 #endif
3296           /* Unrecognized escapes are faulted if PCRE is running in its
3297           strict mode. By default, for compatibility with Perl, they are
3298           treated as literals. */
3299
3300           if ((options & PCRE_EXTRA) != 0)
3301             {
3302             *errorcodeptr = ERR7;
3303             goto FAILED;
3304             }
3305
3306           class_charcount -= 2;  /* Undo the default count from above */
3307           c = *ptr;              /* Get the final character and fall through */
3308           }
3309
3310         /* Fall through if we have a single character (c >= 0). This may be
3311         greater than 256 in UTF-8 mode. */
3312
3313         }   /* End of backslash handling */
3314
3315       /* A single character may be followed by '-' to form a range. However,
3316       Perl does not permit ']' to be the end of the range. A '-' character
3317       at the end is treated as a literal. Perl ignores orphaned \E sequences
3318       entirely. The code for handling \Q and \E is messy. */
3319
3320       CHECK_RANGE:
3321       while (ptr[1] == '\\' && ptr[2] == 'E')
3322         {
3323         inescq = FALSE;
3324         ptr += 2;
3325         }
3326
3327       oldptr = ptr;
3328
3329       if (!inescq && ptr[1] == '-')
3330         {
3331         int d;
3332         ptr += 2;
3333         while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3334
3335         /* If we hit \Q (not followed by \E) at this point, go into escaped
3336         mode. */
3337
3338         while (*ptr == '\\' && ptr[1] == 'Q')
3339           {
3340           ptr += 2;
3341           if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3342           inescq = TRUE;
3343           break;
3344           }
3345
3346         if (*ptr == 0 || (!inescq && *ptr == ']'))
3347           {
3348           ptr = oldptr;
3349           goto LONE_SINGLE_CHARACTER;
3350           }
3351
3352 #ifdef SUPPORT_UTF8
3353         if (utf8)
3354           {                           /* Braces are required because the */
3355           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
3356           }
3357         else
3358 #endif
3359         d = *ptr;  /* Not UTF-8 mode */
3360
3361         /* The second part of a range can be a single-character escape, but
3362         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3363         in such circumstances. */
3364
3365         if (!inescq && d == '\\')
3366           {
3367           d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3368           if (*errorcodeptr != 0) goto FAILED;
3369
3370           /* \b is backslash; \X is literal X; \R is literal R; any other
3371           special means the '-' was literal */
3372
3373           if (d < 0)
3374             {
3375             if (d == -ESC_b) d = '\b';
3376             else if (d == -ESC_X) d = 'X';
3377             else if (d == -ESC_R) d = 'R'; else
3378               {
3379               ptr = oldptr;
3380               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3381               }
3382             }
3383           }
3384
3385         /* Check that the two values are in the correct order. Optimize
3386         one-character ranges */
3387
3388         if (d < c)
3389           {
3390           *errorcodeptr = ERR8;
3391           goto FAILED;
3392           }
3393
3394         if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3395
3396         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3397         matching, we have to use an XCLASS with extra data items. Caseless
3398         matching for characters > 127 is available only if UCP support is
3399         available. */
3400
3401 #ifdef SUPPORT_UTF8
3402         if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3403           {
3404           class_utf8 = TRUE;
3405
3406           /* With UCP support, we can find the other case equivalents of
3407           the relevant characters. There may be several ranges. Optimize how
3408           they fit with the basic range. */
3409
3410 #ifdef SUPPORT_UCP
3411           if ((options & PCRE_CASELESS) != 0)
3412             {
3413             unsigned int occ, ocd;
3414             unsigned int cc = c;
3415             unsigned int origd = d;
3416             while (get_othercase_range(&cc, origd, &occ, &ocd))
3417               {
3418               if (occ >= (unsigned int)c &&
3419                   ocd <= (unsigned int)d)
3420                 continue;                          /* Skip embedded ranges */
3421
3422               if (occ < (unsigned int)c  &&
3423                   ocd >= (unsigned int)c - 1)      /* Extend the basic range */
3424                 {                                  /* if there is overlap,   */
3425                 c = occ;                           /* noting that if occ < c */
3426                 continue;                          /* we can't have ocd > d  */
3427                 }                                  /* because a subrange is  */
3428               if (ocd > (unsigned int)d &&
3429                   occ <= (unsigned int)d + 1)      /* always shorter than    */
3430                 {                                  /* the basic range.       */
3431                 d = ocd;
3432                 continue;
3433                 }
3434
3435               if (occ == ocd)
3436                 {
3437                 *class_utf8data++ = XCL_SINGLE;
3438                 }
3439               else
3440                 {
3441                 *class_utf8data++ = XCL_RANGE;
3442                 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3443                 }
3444               class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3445               }
3446             }
3447 #endif  /* SUPPORT_UCP */
3448
3449           /* Now record the original range, possibly modified for UCP caseless
3450           overlapping ranges. */
3451
3452           *class_utf8data++ = XCL_RANGE;
3453           class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3454           class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3455
3456           /* With UCP support, we are done. Without UCP support, there is no
3457           caseless matching for UTF-8 characters > 127; we can use the bit map
3458           for the smaller ones. */
3459
3460 #ifdef SUPPORT_UCP
3461           continue;    /* With next character in the class */
3462 #else
3463           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3464
3465           /* Adjust upper limit and fall through to set up the map */
3466
3467           d = 127;
3468
3469 #endif  /* SUPPORT_UCP */
3470           }
3471 #endif  /* SUPPORT_UTF8 */
3472
3473         /* We use the bit map for all cases when not in UTF-8 mode; else
3474         ranges that lie entirely within 0-127 when there is UCP support; else
3475         for partial ranges without UCP support. */
3476
3477         class_charcount += d - c + 1;
3478         class_lastchar = d;
3479
3480         /* We can save a bit of time by skipping this in the pre-compile. */
3481
3482         if (lengthptr == NULL) for (; c <= d; c++)
3483           {
3484           classbits[c/8] |= (1 << (c&7));
3485           if ((options & PCRE_CASELESS) != 0)
3486             {
3487             int uc = cd->fcc[c];           /* flip case */
3488             classbits[uc/8] |= (1 << (uc&7));
3489             }
3490           }
3491
3492         continue;   /* Go get the next char in the class */
3493         }
3494
3495       /* Handle a lone single character - we can get here for a normal
3496       non-escape char, or after \ that introduces a single character or for an
3497       apparent range that isn't. */
3498
3499       LONE_SINGLE_CHARACTER:
3500
3501       /* Handle a character that cannot go in the bit map */
3502
3503 #ifdef SUPPORT_UTF8
3504       if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3505         {
3506         class_utf8 = TRUE;
3507         *class_utf8data++ = XCL_SINGLE;
3508         class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3509
3510 #ifdef SUPPORT_UCP
3511         if ((options & PCRE_CASELESS) != 0)
3512           {
3513           unsigned int othercase;
3514           if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3515             {
3516             *class_utf8data++ = XCL_SINGLE;
3517             class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3518             }
3519           }
3520 #endif  /* SUPPORT_UCP */
3521
3522         }
3523       else
3524 #endif  /* SUPPORT_UTF8 */
3525
3526       /* Handle a single-byte character */
3527         {
3528         classbits[c/8] |= (1 << (c&7));
3529         if ((options & PCRE_CASELESS) != 0)
3530           {
3531           c = cd->fcc[c];   /* flip case */
3532           classbits[c/8] |= (1 << (c&7));
3533           }
3534         class_charcount++;
3535         class_lastchar = c;
3536         }
3537       }
3538
3539     /* Loop until ']' reached. This "while" is the end of the "do" above. */
3540
3541     while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3542
3543     if (c == 0)                          /* Missing terminating ']' */
3544       {
3545       *errorcodeptr = ERR6;
3546       goto FAILED;
3547       }
3548
3549     /* Remember whether \r or \n are in this class */
3550
3551     if (negate_class)
3552       {
3553       if ((classbits[1] & 0x24) != 0x24) cd->external_options |= PCRE_HASCRORLF;
3554       }
3555     else
3556       {
3557       if ((classbits[1] & 0x24) != 0) cd->external_options |= PCRE_HASCRORLF;
3558       }
3559     /* If class_charcount is 1, we saw precisely one character whose value is
3560     less than 256. As long as there were no characters >= 128 and there was no
3561     use of \p or \P, in other words, no use of any XCLASS features, we can
3562     optimize.
3563
3564     In UTF-8 mode, we can optimize the negative case only if there were no
3565     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3566     operate on single-bytes only. This is an historical hangover. Maybe one day
3567     we can tidy these opcodes to handle multi-byte characters.
3568
3569     The optimization throws away the bit map. We turn the item into a
3570     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3571     that OP_NOT does not support multibyte characters. In the positive case, it
3572     can cause firstbyte to be set. Otherwise, there can be no first char if
3573     this item is first, whatever repeat count may follow. In the case of
3574     reqbyte, save the previous value for reinstating. */
3575
3576 #ifdef SUPPORT_UTF8
3577     if (class_charcount == 1 && !class_utf8 &&
3578       (!utf8 || !negate_class || class_lastchar < 128))
3579 #else
3580     if (class_charcount == 1)
3581 #endif
3582       {
3583       zeroreqbyte = reqbyte;
3584
3585       /* The OP_NOT opcode works on one-byte characters only. */
3586
3587       if (negate_class)
3588         {
3589         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3590         zerofirstbyte = firstbyte;
3591         *code++ = OP_NOT;
3592         *code++ = class_lastchar;
3593         break;
3594         }
3595
3596       /* For a single, positive character, get the value into mcbuffer, and
3597       then we can handle this with the normal one-character code. */
3598
3599 #ifdef SUPPORT_UTF8
3600       if (utf8 && class_lastchar > 127)
3601         mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3602       else
3603 #endif
3604         {
3605         mcbuffer[0] = class_lastchar;
3606         mclength = 1;
3607         }
3608       goto ONE_CHAR;
3609       }       /* End of 1-char optimization */
3610
3611     /* The general case - not the one-char optimization. If this is the first
3612     thing in the branch, there can be no first char setting, whatever the
3613     repeat count. Any reqbyte setting must remain unchanged after any kind of
3614     repeat. */
3615
3616     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3617     zerofirstbyte = firstbyte;
3618     zeroreqbyte = reqbyte;
3619
3620     /* If there are characters with values > 255, we have to compile an
3621     extended class, with its own opcode. If there are no characters < 256,
3622     we can omit the bitmap in the actual compiled code. */
3623
3624 #ifdef SUPPORT_UTF8
3625     if (class_utf8)
3626       {
3627       *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
3628       *code++ = OP_XCLASS;
3629       code += LINK_SIZE;
3630       *code = negate_class? XCL_NOT : 0;
3631
3632       /* If the map is required, move up the extra data to make room for it;
3633       otherwise just move the code pointer to the end of the extra data. */
3634
3635       if (class_charcount > 0)
3636         {
3637         *code++ |= XCL_MAP;
3638         VMPI_memmove(code + 32, code, class_utf8data - code);
3639         VMPI_memcpy(code, classbits, 32);
3640         code = class_utf8data + 32;
3641         }
3642       else code = class_utf8data;
3643
3644       /* Now fill in the complete length of the item */
3645
3646       PUT(previous, 1, code - previous);
3647       break;   /* End of class handling */
3648       }
3649 #endif
3650
3651     /* If there are no characters > 255, negate the 32-byte map if necessary,
3652     and copy it into the code vector. If this is the first thing in the branch,
3653     there can be no first char setting, whatever the repeat count. Any reqbyte
3654     setting must remain unchanged after any kind of repeat. */
3655
3656     if (negate_class)
3657       {
3658       *code++ = OP_NCLASS;
3659       if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3660         for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3661       }
3662     else
3663       {
3664           /* cn: code added here for proper UTF-8 handling of \S (not whitespace char) \D (not a digit char) and \W (not a word char) */
3665           // The class maps for \S \D \W are the inverted maps for \s \d and \w.  The class maps only handle comparisions with
3666           //  chars < 255, however, and we need successful matches for chars > 255 when comparing to \S \D and \W.  The
3667           //  interpreter code for OP_CLASS will fail the match for any character > 255, (which is correct for \s \d \w).
3668           //  The interpreter code for OP_NCLASS will succeed for any character > 255 (which is correct of ^\s, ^\d, ^\w).
3669           //  Since \S is equivalent to ^\s, we need to use OP_NCLASS for \S \D and \W (inverted_class == true in that case).
3670           // Another option would be to create an OP_XCLASS custom bitmap just to handle all chars > 255 for \S \D and \W.
3671           //  though that would execute slower.
3672
3673       // original code: *code++ = OP_CLASS;
3674           *code++ = (inverted_class ? OP_NCLASS : OP_CLASS);
3675
3676           /* cn: end modified code */
3677       VMPI_memcpy(code, classbits, 32);
3678       }
3679     code += 32;
3680     break;
3681
3682
3683     /* ===================================================================*/
3684     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3685     has been tested above. */
3686
3687     case '{':
3688     if (!is_quantifier) goto NORMAL_CHAR;
3689     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3690     if (*errorcodeptr != 0) goto FAILED;
3691     goto REPEAT;
3692
3693     case '*':
3694     repeat_min = 0;
3695     repeat_max = -1;
3696     goto REPEAT;
3697
3698     case '+':
3699     repeat_min = 1;
3700     repeat_max = -1;
3701     goto REPEAT;
3702
3703     case '?':
3704     repeat_min = 0;
3705     repeat_max = 1;
3706
3707     REPEAT:
3708     if (previous == NULL)
3709       {
3710       *errorcodeptr = ERR9;
3711       goto FAILED;
3712       }
3713
3714     if (repeat_min == 0)
3715       {
3716       firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
3717       reqbyte = zeroreqbyte;        /* Ditto */
3718       }
3719
3720     /* Remember whether this is a variable length repeat */
3721
3722     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3723
3724     op_type = 0;                    /* Default single-char op codes */
3725     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
3726
3727     /* Save start of previous item, in case we have to move it up to make space
3728     for an inserted OP_ONCE for the additional '+' extension. */
3729
3730     tempcode = previous;
3731
3732     /* If the next character is '+', we have a possessive quantifier. This
3733     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3734     If the next character is '?' this is a minimizing repeat, by default,
3735     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3736     repeat type to the non-default. */
3737
3738     if (ptr[1] == '+')
3739       {
3740       repeat_type = 0;                  /* Force greedy */
3741       possessive_quantifier = TRUE;
3742       ptr++;
3743       }
3744     else if (ptr[1] == '?')
3745       {
3746       repeat_type = greedy_non_default;
3747       ptr++;
3748       }
3749     else repeat_type = greedy_default;
3750
3751     /* If previous was a character match, abolish the item and generate a
3752     repeat item instead. If a char item has a minumum of more than one, ensure
3753     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3754     the first thing in a branch because the x will have gone into firstbyte
3755     instead.  */
3756
3757     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3758       {
3759       /* Deal with UTF-8 characters that take up more than one byte. It's
3760       easier to write this out separately than try to macrify it. Use c to
3761       hold the length of the character in bytes, plus 0x80 to flag that it's a
3762       length rather than a small character. */
3763
3764 #ifdef SUPPORT_UTF8
3765       if (utf8 && (code[-1] & 0x80) != 0)
3766         {
3767         uschar *lastchar = code - 1;
3768         while((*lastchar & 0xc0) == 0x80) lastchar--;
3769         c = code - lastchar;            /* Length of UTF-8 character */
3770         VMPI_memcpy(utf8_char, lastchar, c); /* Save the char */
3771         c |= 0x80;                      /* Flag c as a length */
3772         }
3773       else
3774 #endif
3775
3776       /* Handle the case of a single byte - either with no UTF8 support, or
3777       with UTF-8 disabled, or for a UTF-8 character < 128. */
3778
3779         {
3780         c = code[-1];
3781         if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3782         }
3783
3784       /* If the repetition is unlimited, it pays to see if the next thing on
3785       the line is something that cannot possibly match this character. If so,
3786       automatically possessifying this item gains some performance in the case
3787       where the match fails. */
3788
3789       if (!possessive_quantifier &&
3790           repeat_max < 0 &&
3791           check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3792             options, cd))
3793         {
3794         repeat_type = 0;    /* Force greedy */
3795         possessive_quantifier = TRUE;
3796         }
3797
3798       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3799       }
3800
3801     /* If previous was a single negated character ([^a] or similar), we use
3802     one of the special opcodes, replacing it. The code is shared with single-
3803     character repeats by setting opt_type to add a suitable offset into
3804     repeat_type. We can also test for auto-possessification. OP_NOT is
3805     currently used only for single-byte chars. */
3806
3807     else if (*previous == OP_NOT)
3808       {
3809       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3810       c = previous[1];
3811       if (!possessive_quantifier &&
3812           repeat_max < 0 &&
3813           check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3814         {
3815         repeat_type = 0;    /* Force greedy */
3816         possessive_quantifier = TRUE;
3817         }
3818       goto OUTPUT_SINGLE_REPEAT;
3819       }
3820
3821     /* If previous was a character type match (\d or similar), abolish it and
3822     create a suitable repeat item. The code is shared with single-character
3823     repeats by setting op_type to add a suitable offset into repeat_type. Note
3824     the the Unicode property types will be present only when SUPPORT_UCP is
3825     defined, but we don't wrap the little bits of code here because it just
3826     makes it horribly messy. */
3827
3828     else if (*previous < OP_EODN)
3829       {
3830       uschar *oldcode;
3831       int prop_type, prop_value;
3832       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3833       c = *previous;
3834
3835       if (!possessive_quantifier &&
3836           repeat_max < 0 &&
3837           check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3838         {
3839         repeat_type = 0;    /* Force greedy */
3840         possessive_quantifier = TRUE;
3841         }
3842
3843       OUTPUT_SINGLE_REPEAT:
3844       if (*previous == OP_PROP || *previous == OP_NOTPROP)
3845         {
3846         prop_type = previous[1];
3847         prop_value = previous[2];
3848         }
3849       else prop_type = prop_value = -1;
3850
3851       oldcode = code;
3852       code = previous;                  /* Usually overwrite previous item */
3853
3854       /* If the maximum is zero then the minimum must also be zero; Perl allows
3855       this case, so we do too - by simply omitting the item altogether. */
3856
3857       if (repeat_max == 0) goto END_REPEAT;
3858
3859       /* All real repeats make it impossible to handle partial matching (maybe
3860       one day we will be able to remove this restriction). */
3861
3862       if (repeat_max != 1) cd->nopartial = TRUE;
3863
3864       /* Combine the op_type with the repeat_type */
3865
3866       repeat_type += op_type;
3867
3868       /* A minimum of zero is handled either as the special case * or ?, or as
3869       an UPTO, with the maximum given. */
3870
3871       if (repeat_min == 0)
3872         {
3873         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3874           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3875         else
3876           {
3877           *code++ = OP_UPTO + repeat_type;
3878           PUT2INC(code, 0, repeat_max);
3879           }
3880         }
3881
3882       /* A repeat minimum of 1 is optimized into some special cases. If the
3883       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3884       left in place and, if the maximum is greater than 1, we use OP_UPTO with
3885       one less than the maximum. */
3886
3887       else if (repeat_min == 1)
3888         {
3889         if (repeat_max == -1)
3890           *code++ = OP_PLUS + repeat_type;
3891         else
3892           {
3893           code = oldcode;                 /* leave previous item in place */
3894           if (repeat_max == 1) goto END_REPEAT;
3895           *code++ = OP_UPTO + repeat_type;
3896           PUT2INC(code, 0, repeat_max - 1);
3897           }
3898         }
3899
3900       /* The case {n,n} is just an EXACT, while the general case {n,m} is
3901       handled as an EXACT followed by an UPTO. */
3902
3903       else
3904         {
3905         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
3906         PUT2INC(code, 0, repeat_min);
3907
3908         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3909         we have to insert the character for the previous code. For a repeated
3910         Unicode property match, there are two extra bytes that define the
3911         required property. In UTF-8 mode, long characters have their length in
3912         c, with the 0x80 bit as a flag. */
3913
3914         if (repeat_max < 0)
3915           {
3916 #ifdef SUPPORT_UTF8
3917           if (utf8 && c >= 128)
3918             {
3919             VMPI_memcpy(code, utf8_char, c & 7);
3920             code += c & 7;
3921             }
3922           else
3923 #endif
3924             {
3925             *code++ = c;
3926             if (prop_type >= 0)
3927               {
3928               *code++ = prop_type;
3929               *code++ = prop_value;
3930               }
3931             }
3932           *code++ = OP_STAR + repeat_type;
3933           }
3934
3935         /* Else insert an UPTO if the max is greater than the min, again
3936         preceded by the character, for the previously inserted code. If the
3937         UPTO is just for 1 instance, we can use QUERY instead. */
3938
3939         else if (repeat_max != repeat_min)
3940           {
3941 #ifdef SUPPORT_UTF8
3942           if (utf8 && c >= 128)
3943             {
3944             VMPI_memcpy(code, utf8_char, c & 7);
3945             code += c & 7;
3946             }
3947           else
3948 #endif
3949           *code++ = c;
3950           if (prop_type >= 0)
3951             {
3952             *code++ = prop_type;
3953             *code++ = prop_value;
3954             }
3955           repeat_max -= repeat_min;
3956
3957           if (repeat_max == 1)
3958             {
3959             *code++ = OP_QUERY + repeat_type;
3960             }
3961           else
3962             {
3963             *code++ = OP_UPTO + repeat_type;
3964             PUT2INC(code, 0, repeat_max);
3965             }
3966           }
3967         }
3968
3969       /* The character or character type itself comes last in all cases. */
3970
3971 #ifdef SUPPORT_UTF8
3972       if (utf8 && c >= 128)
3973         {
3974         VMPI_memcpy(code, utf8_char, c & 7);
3975         code += c & 7;
3976         }
3977       else
3978 #endif
3979       *code++ = c;
3980
3981       /* For a repeated Unicode property match, there are two extra bytes that
3982       define the required property. */
3983
3984 #ifdef SUPPORT_UCP
3985       if (prop_type >= 0)
3986         {
3987         *code++ = prop_type;
3988         *code++ = prop_value;
3989         }
3990 #endif
3991       }
3992
3993     /* If previous was a character class or a back reference, we put the repeat
3994     stuff after it, but just skip the item if the repeat was {0,0}. */
3995
3996     else if (*previous == OP_CLASS ||
3997              *previous == OP_NCLASS ||
3998 #ifdef SUPPORT_UTF8
3999              *previous == OP_XCLASS ||
4000 #endif
4001              *previous == OP_REF)
4002       {
4003       if (repeat_max == 0)
4004         {
4005         code = previous;
4006         goto END_REPEAT;
4007         }
4008
4009       /* All real repeats make it impossible to handle partial matching (maybe
4010       one day we will be able to remove this restriction). */
4011
4012       if (repeat_max != 1) cd->nopartial = TRUE;
4013
4014       if (repeat_min == 0 && repeat_max == -1)
4015         *code++ = OP_CRSTAR + repeat_type;
4016       else if (repeat_min == 1 && repeat_max == -1)
4017         *code++ = OP_CRPLUS + repeat_type;
4018       else if (repeat_min == 0 && repeat_max == 1)
4019         *code++ = OP_CRQUERY + repeat_type;
4020       else
4021         {
4022         *code++ = OP_CRRANGE + repeat_type;
4023         PUT2INC(code, 0, repeat_min);
4024         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
4025         PUT2INC(code, 0, repeat_max);
4026         }
4027       }
4028
4029     /* If previous was a bracket group, we may have to replicate it in certain
4030     cases. */
4031
4032     else if (*previous == OP_BRA  || *previous == OP_CBRA ||
4033              *previous == OP_ONCE || *previous == OP_COND)
4034       {
4035       register int i;
4036       int ketoffset = 0;
4037       int len = code - previous;
4038       uschar *bralink = NULL;
4039
4040       /* Repeating a DEFINE group is pointless */
4041
4042       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4043         {
4044         *errorcodeptr = ERR55;
4045         goto FAILED;
4046         }
4047
4048       /* If the maximum repeat count is unlimited, find the end of the bracket
4049       by scanning through from the start, and compute the offset back to it
4050       from the current code pointer. There may be an OP_OPT setting following
4051       the final KET, so we can't find the end just by going back from the code
4052       pointer. */
4053
4054       if (repeat_max == -1)
4055         {
4056         register uschar *ket = previous;
4057         do ket += GET(ket, 1); while (*ket != OP_KET);
4058         ketoffset = code - ket;
4059         }
4060
4061       /* The case of a zero minimum is special because of the need to stick
4062       OP_BRAZERO in front of it, and because the group appears once in the
4063       data, whereas in other cases it appears the minimum number of times. For
4064       this reason, it is simplest to treat this case separately, as otherwise
4065       the code gets far too messy. There are several special subcases when the
4066       minimum is zero. */
4067
4068       if (repeat_min == 0)
4069         {
4070         /* If the maximum is also zero, we just omit the group from the output
4071         altogether. */
4072
4073         if (repeat_max == 0)
4074           {
4075           code = previous;
4076           goto END_REPEAT;
4077           }
4078
4079         /* If the maximum is 1 or unlimited, we just have to stick in the
4080         BRAZERO and do no more at this point. However, we do need to adjust
4081         any OP_RECURSE calls inside the group that refer to the group itself or
4082         any internal or forward referenced group, because the offset is from
4083         the start of the whole regex. Temporarily terminate the pattern while
4084         doing this. */
4085
4086         if (repeat_max <= 1)
4087           {
4088           *code = OP_END;
4089           adjust_recurse(previous, 1, utf8, cd, save_hwm);
4090           VMPI_memmove(previous+1, previous, len);
4091           code++;
4092           *previous++ = OP_BRAZERO + repeat_type;
4093           }
4094
4095         /* If the maximum is greater than 1 and limited, we have to replicate
4096         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
4097         The first one has to be handled carefully because it's the original
4098         copy, which has to be moved up. The remainder can be handled by code
4099         that is common with the non-zero minimum case below. We have to
4100         adjust the value or repeat_max, since one less copy is required. Once
4101         again, we may have to adjust any OP_RECURSE calls inside the group. */
4102
4103         else
4104           {
4105           int offset;
4106           *code = OP_END;
4107           adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4108           VMPI_memmove(previous + 2 + LINK_SIZE, previous, len);
4109           code += 2 + LINK_SIZE;
4110           *previous++ = OP_BRAZERO + repeat_type;
4111           *previous++ = OP_BRA;
4112
4113           /* We chain together the bracket offset fields that have to be
4114           filled in later when the ends of the brackets are reached. */
4115
4116           offset = (bralink == NULL)? 0 : previous - bralink;
4117           bralink = previous;
4118           PUTINC(previous, 0, offset);
4119           }
4120
4121         repeat_max--;
4122         }
4123
4124       /* If the minimum is greater than zero, replicate the group as many
4125       times as necessary, and adjust the maximum to the number of subsequent
4126       copies that we need. If we set a first char from the group, and didn't
4127       set a required char, copy the latter from the former. If there are any
4128       forward reference subroutine calls in the group, there will be entries on
4129       the workspace list; replicate these with an appropriate increment. */
4130
4131       else
4132         {
4133         if (repeat_min > 1)
4134           {
4135           /* In the pre-compile phase, we don't actually do the replication. We
4136           just adjust the length as if we had. Do some paranoid checks for
4137           potential integer overflow. */
4138
4139           if (lengthptr != NULL)
4140             {
4141             int delta = (repeat_min - 1)*length_prevgroup;
4142             if ((double)(repeat_min - 1)*(double)length_prevgroup >
4143                                                             (double)INT_MAX ||
4144                 OFLOW_MAX - *lengthptr < delta)
4145               {
4146               *errorcodeptr = ERR20;
4147               goto FAILED;
4148               }
4149             *lengthptr += delta;
4150             }
4151
4152           /* This is compiling for real */
4153
4154           else
4155             {
4156             if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4157             for (i = 1; i < repeat_min; i++)
4158               {
4159               uschar *hc;
4160               uschar *this_hwm = cd->hwm;
4161               VMPI_memcpy(code, previous, len);
4162               for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4163                 {
4164                 PUT(cd->hwm, 0, GET(hc, 0) + len);
4165                 cd->hwm += LINK_SIZE;
4166                 }
4167               save_hwm = this_hwm;
4168               code += len;
4169               }
4170             }
4171           }
4172
4173         if (repeat_max > 0) repeat_max -= repeat_min;
4174         }
4175
4176       /* This code is common to both the zero and non-zero minimum cases. If
4177       the maximum is limited, it replicates the group in a nested fashion,
4178       remembering the bracket starts on a stack. In the case of a zero minimum,
4179       the first one was set up above. In all cases the repeat_max now specifies
4180       the number of additional copies needed. Again, we must remember to
4181       replicate entries on the forward reference list. */
4182
4183       if (repeat_max >= 0)
4184         {
4185         /* In the pre-compile phase, we don't actually do the replication. We
4186         just adjust the length as if we had. For each repetition we must add 1
4187         to the length for BRAZERO and for all but the last repetition we must
4188         add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
4189         paranoid checks to avoid integer overflow. */
4190
4191         if (lengthptr != NULL && repeat_max > 0)
4192           {
4193           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4194                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
4195           if ((double)repeat_max *
4196                 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4197                   > (double)INT_MAX ||
4198               OFLOW_MAX - *lengthptr < delta)
4199             {
4200             *errorcodeptr = ERR20;
4201             goto FAILED;
4202             }
4203           *lengthptr += delta;
4204           }
4205
4206         /* This is compiling for real */
4207
4208         else for (i = repeat_max - 1; i >= 0; i--)
4209           {
4210           uschar *hc;
4211           uschar *this_hwm = cd->hwm;
4212
4213           *code++ = OP_BRAZERO + repeat_type;
4214
4215           /* All but the final copy start a new nesting, maintaining the
4216           chain of brackets outstanding. */
4217
4218           if (i != 0)
4219             {
4220             int offset;
4221             *code++ = OP_BRA;
4222             offset = (bralink == NULL)? 0 : code - bralink;
4223             bralink = code;
4224             PUTINC(code, 0, offset);
4225             }
4226
4227           VMPI_memcpy(code, previous, len);
4228           for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4229             {
4230             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4231             cd->hwm += LINK_SIZE;
4232             }
4233           save_hwm = this_hwm;
4234           code += len;
4235           }
4236
4237         /* Now chain through the pending brackets, and fill in their length
4238         fields (which are holding the chain links pro tem). */
4239
4240         while (bralink != NULL)
4241           {
4242           int oldlinkoffset;
4243           int offset = code - bralink + 1;
4244           uschar *bra = code - offset;
4245           oldlinkoffset = GET(bra, 1);
4246           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4247           *code++ = OP_KET;
4248           PUTINC(code, 0, offset);
4249           PUT(bra, 1, offset);
4250           }
4251         }
4252
4253       /* If the maximum is unlimited, set a repeater in the final copy. We
4254       can't just offset backwards from the current code point, because we
4255       don't know if there's been an options resetting after the ket. The
4256       correct offset was computed above.
4257
4258       Then, when we are doing the actual compile phase, check to see whether
4259       this group is a non-atomic one that could match an empty string. If so,
4260       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4261       that runtime checking can be done. [This check is also applied to
4262       atomic groups at runtime, but in a different way.] */
4263
4264       else
4265         {
4266         uschar *ketcode = code - ketoffset;
4267         uschar *bracode = ketcode - GET(ketcode, 1);
4268         *ketcode = OP_KETRMAX + repeat_type;
4269         if (lengthptr == NULL && *bracode != OP_ONCE)
4270           {
4271           uschar *scode = bracode;
4272           do
4273             {
4274             if (could_be_empty_branch(scode, ketcode, utf8))
4275               {
4276               *bracode += OP_SBRA - OP_BRA;
4277               break;
4278               }
4279             scode += GET(scode, 1);
4280             }
4281           while (*scode == OP_ALT);
4282           }
4283         }
4284       }
4285
4286     /* Else there's some kind of shambles */
4287
4288     else
4289       {
4290       *errorcodeptr = ERR11;
4291       goto FAILED;
4292       }
4293
4294     /* If the character following a repeat is '+', or if certain optimization
4295     tests above succeeded, possessive_quantifier is TRUE. For some of the
4296     simpler opcodes, there is an special alternative opcode for this. For
4297     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4298     The '+' notation is just syntactic sugar, taken from Sun's Java package,
4299     but the special opcodes can optimize it a bit. The repeated item starts at
4300     tempcode, not at previous, which might be the first part of a string whose
4301     (former) last char we repeated.
4302
4303     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4304     an 'upto' may follow. We skip over an 'exact' item, and then test the
4305     length of what remains before proceeding. */
4306
4307     if (possessive_quantifier)
4308       {
4309       int len;
4310       if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4311           *tempcode == OP_NOTEXACT)
4312         tempcode += _pcre_OP_lengths[*tempcode];
4313       len = code - tempcode;
4314       if (len > 0) switch (*tempcode)
4315         {
4316         case OP_STAR:  *tempcode = OP_POSSTAR; break;
4317         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
4318         case OP_QUERY: *tempcode = OP_POSQUERY; break;
4319         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
4320
4321         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
4322         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
4323         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4324         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
4325
4326         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
4327         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
4328         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4329         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
4330
4331         default:
4332         VMPI_memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4333         code += 1 + LINK_SIZE;
4334         len += 1 + LINK_SIZE;
4335         tempcode[0] = OP_ONCE;
4336         *code++ = OP_KET;
4337         PUTINC(code, 0, len);
4338         PUT(tempcode, 1, len);
4339         break;
4340         }
4341       }
4342
4343     /* In all case we no longer have a previous item. We also set the
4344     "follows varying string" flag for subsequently encountered reqbytes if
4345     it isn't already set and we have just passed a varying length item. */
4346
4347     END_REPEAT:
4348     previous = NULL;
4349     cd->req_varyopt |= reqvary;
4350     break;
4351
4352
4353     /* ===================================================================*/
4354     /* Start of nested parenthesized sub-expression, or comment or lookahead or
4355     lookbehind or option setting or condition or all the other extended
4356     parenthesis forms.  */
4357
4358     case '(':
4359     newoptions = options;
4360     skipbytes = 0;
4361     bravalue = OP_CBRA;
4362     save_hwm = cd->hwm;
4363     reset_bracount = FALSE;
4364
4365     /* First deal with various "verbs" that can be introduced by '*'. */
4366
4367     if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4368       {
4369       int i, namelen;
4370       const uschar *name = ++ptr;
4371       previous = NULL;
4372       while ((cd->ctypes[*++ptr] & ctype_letter) != 0){}
4373       if (*ptr == ':')
4374         {
4375         *errorcodeptr = ERR59;   /* Not supported */
4376         goto FAILED;
4377         }
4378       if (*ptr != ')')
4379         {
4380         *errorcodeptr = ERR60;
4381         goto FAILED;
4382         }
4383       namelen = ptr - name;
4384       for (i = 0; i < verbcount; i++)
4385         {
4386         if (namelen == verbs[i].len &&
4387             VMPI_strncmp((char *)name, verbs[i].name, namelen) == 0)
4388           {
4389           *code = verbs[i].op;
4390           if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4391           break;
4392           }
4393         }
4394       if (i < verbcount) continue;
4395       *errorcodeptr = ERR60;
4396       goto FAILED;
4397       }
4398
4399     /* Deal with the extended parentheses; all are introduced by '?', and the
4400     appearance of any of them means that this is not a capturing group. */
4401
4402     else if (*ptr == '?')
4403       {
4404       int i, set, unset, namelen;
4405       int *optset;
4406       const uschar *name;
4407       uschar *slot;
4408
4409       switch (*(++ptr))
4410         {
4411         case '#':                 /* Comment; skip to ket */
4412         ptr++;
4413         while (*ptr != 0 && *ptr != ')') ptr++;
4414         if (*ptr == 0)
4415           {
4416           *errorcodeptr = ERR18;
4417           goto FAILED;
4418           }
4419         continue;
4420
4421
4422         /* ------------------------------------------------------------ */
4423         case '|':                 /* Reset capture count for each branch */
4424         reset_bracount = TRUE;
4425         /* Fall through */
4426
4427         /* ------------------------------------------------------------ */
4428         case ':':                 /* Non-capturing bracket */
4429         bravalue = OP_BRA;
4430         ptr++;
4431         break;
4432
4433
4434         /* ------------------------------------------------------------ */
4435         case '(':
4436         bravalue = OP_COND;       /* Conditional group */
4437
4438         /* A condition can be an assertion, a number (referring to a numbered
4439         group), a name (referring to a named group), or 'R', referring to
4440         recursion. R<digits> and R&name are also permitted for recursion tests.
4441
4442         There are several syntaxes for testing a named group: (?(name)) is used
4443         by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4444
4445         There are two unfortunate ambiguities, caused by history. (a) 'R' can
4446         be the recursive thing or the name 'R' (and similarly for 'R' followed
4447         by digits), and (b) a number could be a name that consists of digits.
4448         In both cases, we look for a name first; if not found, we try the other
4449         cases. */
4450
4451         /* For conditions that are assertions, check the syntax, and then exit
4452         the switch. This will take control down to where bracketed groups,
4453         including assertions, are processed. */
4454
4455         if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4456           break;
4457
4458         /* Most other conditions use OP_CREF (a couple change to OP_RREF
4459         below), and all need to skip 3 bytes at the start of the group. */
4460
4461         code[1+LINK_SIZE] = OP_CREF;
4462         skipbytes = 3;
4463         refsign = -1;
4464
4465         /* Check for a test for recursion in a named group. */
4466
4467         if (ptr[1] == 'R' && ptr[2] == '&')
4468           {
4469           terminator = -1;
4470           ptr += 2;
4471           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
4472           }
4473
4474         /* Check for a test for a named group's having been set, using the Perl
4475         syntax (?(<name>) or (?('name') */
4476
4477         else if (ptr[1] == '<')
4478           {
4479           terminator = '>';
4480           ptr++;
4481           }
4482         else if (ptr[1] == '\'')
4483           {
4484           terminator = '\'';
4485           ptr++;
4486           }
4487         else
4488           {
4489           terminator = 0;
4490           if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4491           }
4492
4493         /* We now expect to read a name; any thing else is an error */
4494
4495         if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4496           {
4497           ptr += 1;  /* To get the right offset */
4498           *errorcodeptr = ERR28;
4499           goto FAILED;
4500           }
4501
4502         /* Read the name, but also get it as a number if it's all digits */
4503
4504         recno = 0;
4505         name = ++ptr;
4506         while ((cd->ctypes[*ptr] & ctype_word) != 0)
4507           {
4508           if (recno >= 0)
4509             recno = ((digitab[*ptr] & ctype_digit) != 0)?
4510               recno * 10 + *ptr - '0' : -1;
4511           ptr++;
4512           }
4513         namelen = ptr - name;
4514
4515         if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4516           {
4517           ptr--;      /* Error offset */
4518           *errorcodeptr = ERR26;
4519           goto FAILED;
4520           }
4521
4522         /* Do no further checking in the pre-compile phase. */
4523
4524         if (lengthptr != NULL) break;
4525
4526         /* In the real compile we do the work of looking for the actual
4527         reference. If the string started with "+" or "-" we require the rest to
4528         be digits, in which case recno will be set. */
4529
4530         if (refsign > 0)
4531           {
4532           if (recno <= 0)
4533             {
4534             *errorcodeptr = ERR58;
4535             goto FAILED;
4536             }
4537           if (refsign == '-')
4538             {
4539             recno = cd->bracount - recno + 1;
4540             if (recno <= 0)
4541               {
4542               *errorcodeptr = ERR15;
4543               goto FAILED;
4544               }
4545             }
4546           else recno += cd->bracount;
4547           PUT2(code, 2+LINK_SIZE, recno);
4548           break;
4549           }
4550
4551         /* Otherwise (did not start with "+" or "-"), start by looking for the
4552         name. */
4553
4554         slot = cd->name_table;
4555         for (i = 0; i < cd->names_found; i++)
4556           {
4557           if (VMPI_strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4558           slot += cd->name_entry_size;
4559           }
4560
4561         /* Found a previous named subpattern */
4562
4563         if (i < cd->names_found)
4564           {
4565           recno = GET2(slot, 0);
4566           PUT2(code, 2+LINK_SIZE, recno);
4567           }
4568
4569         /* Search the pattern for a forward reference */
4570
4571         else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4572                         (options & PCRE_EXTENDED) != 0)) > 0)
4573           {
4574           PUT2(code, 2+LINK_SIZE, i);
4575           }
4576
4577         /* If terminator == 0 it means that the name followed directly after
4578         the opening parenthesis [e.g. (?(abc)...] and in this case there are
4579         some further alternatives to try. For the cases where terminator != 0
4580         [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4581         now checked all the possibilities, so give an error. */
4582
4583         else if (terminator != 0)
4584           {
4585           *errorcodeptr = ERR15;
4586           goto FAILED;
4587           }
4588
4589         /* Check for (?(R) for recursion. Allow digits after R to specify a
4590         specific group number. */
4591
4592         else if (*name == 'R')
4593           {
4594           recno = 0;
4595           for (i = 1; i < namelen; i++)
4596             {
4597             if ((digitab[name[i]] & ctype_digit) == 0)
4598               {
4599               *errorcodeptr = ERR15;
4600               goto FAILED;
4601               }
4602             recno = recno * 10 + name[i] - '0';
4603             }
4604           if (recno == 0) recno = RREF_ANY;
4605           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4606           PUT2(code, 2+LINK_SIZE, recno);
4607           }
4608
4609         /* Similarly, check for the (?(DEFINE) "condition", which is always
4610         false. */
4611
4612         else if (namelen == 6 && VMPI_strncmp((char *)name, "DEFINE", 6) == 0)
4613           {
4614           code[1+LINK_SIZE] = OP_DEF;
4615           skipbytes = 1;
4616           }
4617
4618         /* Check for the "name" actually being a subpattern number. */
4619
4620         else if (recno > 0)
4621           {
4622           PUT2(code, 2+LINK_SIZE, recno);
4623           }
4624
4625         /* Either an unidentified subpattern, or a reference to (?(0) */
4626
4627         else
4628           {
4629           *errorcodeptr = (recno == 0)? ERR35: ERR15;
4630           goto FAILED;
4631           }
4632         break;
4633
4634
4635         /* ------------------------------------------------------------ */
4636         case '=':                 /* Positive lookahead */
4637         bravalue = OP_ASSERT;
4638         ptr++;
4639         break;
4640
4641
4642         /* ------------------------------------------------------------ */
4643         case '!':                 /* Negative lookahead */
4644         ptr++;
4645         if (*ptr == ')')          /* Optimize (?!) */
4646           {
4647           *code++ = OP_FAIL;
4648           previous = NULL;
4649           continue;
4650           }
4651         bravalue = OP_ASSERT_NOT;
4652         break;
4653
4654
4655         /* ------------------------------------------------------------ */
4656         case '<':                 /* Lookbehind or named define */
4657         switch (ptr[1])
4658           {
4659           case '=':               /* Positive lookbehind */
4660           bravalue = OP_ASSERTBACK;
4661           ptr += 2;
4662           break;
4663
4664           case '!':               /* Negative lookbehind */
4665           bravalue = OP_ASSERTBACK_NOT;
4666           ptr += 2;
4667           break;
4668
4669           default:                /* Could be name define, else bad */
4670           if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4671           ptr++;                  /* Correct offset for error */
4672           *errorcodeptr = ERR24;
4673           goto FAILED;
4674           }
4675         break;
4676
4677
4678         /* ------------------------------------------------------------ */
4679         case '>':                 /* One-time brackets */
4680         bravalue = OP_ONCE;
4681         ptr++;
4682         break;
4683
4684
4685         /* ------------------------------------------------------------ */
4686         case 'C':                 /* Callout - may be followed by digits; */
4687         previous_callout = code;  /* Save for later completion */
4688         after_manual_callout = 1; /* Skip one item before completing */
4689         *code++ = OP_CALLOUT;
4690           {
4691           int n = 0;
4692           while ((digitab[*(++ptr)] & ctype_digit) != 0)
4693             n = n * 10 + *ptr - '0';
4694           if (*ptr != ')')
4695             {
4696             *errorcodeptr = ERR39;
4697             goto FAILED;
4698             }
4699           if (n > 255)
4700             {
4701             *errorcodeptr = ERR38;
4702             goto FAILED;
4703             }
4704           *code++ = n;
4705           PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
4706           PUT(code, LINK_SIZE, 0);                    /* Default length */
4707           code += 2 * LINK_SIZE;
4708           }
4709         previous = NULL;
4710         continue;
4711
4712
4713         /* ------------------------------------------------------------ */
4714         case 'P':                 /* Python-style named subpattern handling */
4715         if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4716           {
4717           is_recurse = *ptr == '>';
4718           terminator = ')';
4719           goto NAMED_REF_OR_RECURSE;
4720           }
4721         else if (*ptr != '<')    /* Test for Python-style definition */
4722           {
4723           *errorcodeptr = ERR41;
4724           goto FAILED;
4725           }
4726         /* Fall through to handle (?P< as (?< is handled */
4727
4728
4729         /* ------------------------------------------------------------ */
4730         DEFINE_NAME:    /* Come here from (?< handling */
4731         case '\'':
4732           {
4733           terminator = (*ptr == '<')? '>' : '\'';
4734           name = ++ptr;
4735
4736           while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4737           namelen = ptr - name;
4738
4739           /* In the pre-compile phase, just do a syntax check. */
4740
4741           if (lengthptr != NULL)
4742             {
4743             if (*ptr != terminator)
4744               {
4745               *errorcodeptr = ERR42;
4746               goto FAILED;
4747               }
4748             if (cd->names_found >= MAX_NAME_COUNT)
4749               {
4750               *errorcodeptr = ERR49;
4751               goto FAILED;
4752               }
4753             if (namelen + 3 > cd->name_entry_size)
4754               {
4755               cd->name_entry_size = namelen + 3;
4756               if (namelen > MAX_NAME_SIZE)
4757                 {
4758                 *errorcodeptr = ERR48;
4759                 goto FAILED;
4760                 }
4761               }
4762             }
4763
4764           /* In the real compile, create the entry in the table */
4765
4766           else
4767             {
4768             slot = cd->name_table;
4769             for (i = 0; i < cd->names_found; i++)
4770               {
4771               int crc = VMPI_memcmp(name, slot+2, namelen);
4772               if (crc == 0)
4773                 {
4774                 if (slot[2+namelen] == 0)
4775                   {
4776                   if ((options & PCRE_DUPNAMES) == 0)
4777                     {
4778                     *errorcodeptr = ERR43;
4779                     goto FAILED;
4780                     }
4781                   }
4782                 else crc = -1;      /* Current name is substring */
4783                 }
4784               if (crc < 0)
4785                 {
4786                 VMPI_memmove(slot + cd->name_entry_size, slot,
4787                   (cd->names_found - i) * cd->name_entry_size);
4788                 break;
4789                 }
4790               slot += cd->name_entry_size;
4791               }
4792
4793             PUT2(slot, 0, cd->bracount + 1);
4794             VMPI_memcpy(slot + 2, name, namelen);
4795             slot[2+namelen] = 0;
4796             }
4797           }
4798
4799         /* In both cases, count the number of names we've encountered. */
4800
4801         ptr++;                    /* Move past > or ' */
4802         cd->names_found++;
4803         goto NUMBERED_GROUP;
4804
4805
4806         /* ------------------------------------------------------------ */
4807         case '&':                 /* Perl recursion/subroutine syntax */
4808         terminator = ')';
4809         is_recurse = TRUE;
4810         /* Fall through */
4811
4812         /* We come here from the Python syntax above that handles both
4813         references (?P=name) and recursion (?P>name), as well as falling
4814         through from the Perl recursion syntax (?&name). */
4815
4816         NAMED_REF_OR_RECURSE:
4817         name = ++ptr;
4818         while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4819         namelen = ptr - name;
4820
4821         /* In the pre-compile phase, do a syntax check and set a dummy
4822         reference number. */
4823
4824         if (lengthptr != NULL)
4825           {
4826           if (*ptr != terminator)
4827             {
4828             *errorcodeptr = ERR42;
4829             goto FAILED;
4830             }
4831           if (namelen > MAX_NAME_SIZE)
4832             {
4833             *errorcodeptr = ERR48;
4834             goto FAILED;
4835             }
4836           recno = 0;
4837           }
4838
4839         /* In the real compile, seek the name in the table */
4840
4841         else
4842           {
4843           slot = cd->name_table;
4844           for (i = 0; i < cd->names_found; i++)
4845             {
4846             if (VMPI_strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4847             slot += cd->name_entry_size;
4848             }
4849
4850           if (i < cd->names_found)         /* Back reference */
4851             {
4852             recno = GET2(slot, 0);
4853             }
4854           else if ((recno =                /* Forward back reference */
4855                     find_parens(ptr, cd->bracount, name, namelen,
4856                       (options & PCRE_EXTENDED) != 0)) <= 0)
4857             {
4858             *errorcodeptr = ERR15;
4859             goto FAILED;
4860             }
4861           }
4862
4863         /* In both phases, we can now go to the code than handles numerical
4864         recursion or backreferences. */
4865
4866         if (is_recurse) goto HANDLE_RECURSION;
4867           else goto HANDLE_REFERENCE;
4868
4869
4870         /* ------------------------------------------------------------ */
4871         case 'R':                 /* Recursion */
4872         ptr++;                    /* Same as (?0)      */
4873         /* Fall through */
4874
4875
4876         /* ------------------------------------------------------------ */
4877         case '-': case '+':
4878         case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4879         case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4880           {
4881           const uschar *called;
4882
4883           if ((refsign = *ptr) == '+') ptr++;
4884           else if (refsign == '-')
4885             {
4886             if ((digitab[ptr[1]] & ctype_digit) == 0)
4887               goto OTHER_CHAR_AFTER_QUERY;
4888             ptr++;
4889             }
4890
4891           recno = 0;
4892           while((digitab[*ptr] & ctype_digit) != 0)
4893             recno = recno * 10 + *ptr++ - '0';
4894
4895           if (*ptr != ')')
4896             {
4897             *errorcodeptr = ERR29;
4898             goto FAILED;
4899             }
4900
4901           if (refsign == '-')
4902             {
4903             if (recno == 0)
4904               {
4905               *errorcodeptr = ERR58;
4906               goto FAILED;
4907               }
4908             recno = cd->bracount - recno + 1;
4909             if (recno <= 0)
4910               {
4911               *errorcodeptr = ERR15;
4912               goto FAILED;
4913               }
4914             }
4915           else if (refsign == '+')
4916             {
4917             if (recno == 0)
4918               {
4919               *errorcodeptr = ERR58;
4920               goto FAILED;
4921               }
4922             recno += cd->bracount;
4923             }
4924
4925           /* Come here from code above that handles a named recursion */
4926
4927           HANDLE_RECURSION:
4928
4929           previous = code;
4930           called = cd->start_code;
4931
4932           /* When we are actually compiling, find the bracket that is being
4933           referenced. Temporarily end the regex in case it doesn't exist before
4934           this point. If we end up with a forward reference, first check that
4935           the bracket does occur later so we can give the error (and position)
4936           now. Then remember this forward reference in the workspace so it can
4937           be filled in at the end. */
4938
4939           if (lengthptr == NULL)
4940             {
4941             *code = OP_END;
4942             if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4943
4944             /* Forward reference */
4945
4946             if (called == NULL)
4947               {
4948               if (find_parens(ptr, cd->bracount, NULL, recno,
4949                    (options & PCRE_EXTENDED) != 0) < 0)
4950                 {
4951                 *errorcodeptr = ERR15;
4952                 goto FAILED;
4953                 }
4954               called = cd->start_code + recno;
4955               PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4956               }
4957
4958             /* If not a forward reference, and the subpattern is still open,
4959             this is a recursive call. We check to see if this is a left
4960             recursion that could loop for ever, and diagnose that case. */
4961
4962             else if (GET(called, 1) == 0 &&
4963                      could_be_empty(called, code, bcptr, utf8))
4964               {
4965               *errorcodeptr = ERR40;
4966               goto FAILED;
4967               }
4968             }
4969
4970           /* Insert the recursion/subroutine item, automatically wrapped inside
4971           "once" brackets. Set up a "previous group" length so that a
4972           subsequent quantifier will work. */
4973
4974           *code = OP_ONCE;
4975           PUT(code, 1, 2 + 2*LINK_SIZE);
4976           code += 1 + LINK_SIZE;
4977
4978           *code = OP_RECURSE;
4979           PUT(code, 1, called - cd->start_code);
4980           code += 1 + LINK_SIZE;
4981
4982           *code = OP_KET;
4983           PUT(code, 1, 2 + 2*LINK_SIZE);
4984           code += 1 + LINK_SIZE;
4985
4986           length_prevgroup = 3 + 3*LINK_SIZE;
4987           }
4988
4989         /* Can't determine a first byte now */
4990
4991         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4992         continue;
4993
4994
4995         /* ------------------------------------------------------------ */
4996         default:              /* Other characters: check option setting */
4997         OTHER_CHAR_AFTER_QUERY:
4998         set = unset = 0;
4999         optset = &set;
5000
5001         while (*ptr != ')' && *ptr != ':')
5002           {
5003           switch (*ptr++)
5004             {
5005             case '-': optset = &unset; break;
5006
5007             case 'J':    /* Record that it changed in the external options */
5008             *optset |= PCRE_DUPNAMES;
5009             cd->external_options |= PCRE_JCHANGED;
5010             break;
5011
5012             case 'i': *optset |= PCRE_CASELESS; break;
5013             case 'm': *optset |= PCRE_MULTILINE; break;
5014             case 's': *optset |= PCRE_DOTALL; break;
5015             case 'x': *optset |= PCRE_EXTENDED; break;
5016             case 'U': *optset |= PCRE_UNGREEDY; break;
5017             case 'X': *optset |= PCRE_EXTRA; break;
5018
5019             default:  *errorcodeptr = ERR12;
5020                       ptr--;    /* Correct the offset */
5021                       goto FAILED;
5022             }
5023           }
5024
5025         /* Set up the changed option bits, but don't change anything yet. */
5026
5027         newoptions = (options | set) & (~unset);
5028
5029         /* If the options ended with ')' this is not the start of a nested
5030         group with option changes, so the options change at this level. If this
5031         item is right at the start of the pattern, the options can be
5032         abstracted and made external in the pre-compile phase, and ignored in
5033         the compile phase. This can be helpful when matching -- for instance in
5034         caseless checking of required bytes.
5035
5036         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
5037         definitely *not* at the start of the pattern because something has been
5038         compiled. In the pre-compile phase, however, the code pointer can have
5039         that value after the start, because it gets reset as code is discarded
5040         during the pre-compile. However, this can happen only at top level - if
5041         we are within parentheses, the starting BRA will still be present. At
5042         any parenthesis level, the length value can be used to test if anything
5043         has been compiled at that level. Thus, a test for both these conditions
5044         is necessary to ensure we correctly detect the start of the pattern in
5045         both phases.
5046
5047         If we are not at the pattern start, compile code to change the ims
5048         options if this setting actually changes any of them, and reset the
5049         greedy defaults and the case value for firstbyte and reqbyte. */
5050
5051         if (*ptr == ')')
5052           {
5053           if (code == cd->start_code + 1 + LINK_SIZE &&
5054                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
5055             {
5056             cd->external_options = newoptions;
5057             }
5058          else
5059             {
5060             if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
5061               {
5062               *code++ = OP_OPT;
5063               *code++ = newoptions & PCRE_IMS;
5064               }
5065             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
5066             greedy_non_default = greedy_default ^ 1;
5067             req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
5068             }
5069
5070           /* Change options at this level, and pass them back for use
5071           in subsequent branches. When not at the start of the pattern, this
5072           information is also necessary so that a resetting item can be
5073           compiled at the end of a group (if we are in a group). */
5074
5075           *optionsptr = options = newoptions;
5076           previous = NULL;       /* This item can't be repeated */
5077           continue;              /* It is complete */
5078           }
5079
5080         /* If the options ended with ':' we are heading into a nested group
5081         with possible change of options. Such groups are non-capturing and are
5082         not assertions of any kind. All we need to do is skip over the ':';
5083         the newoptions value is handled below. */
5084
5085         bravalue = OP_BRA;
5086         ptr++;
5087         }     /* End of switch for character following (? */
5088       }       /* End of (? handling */
5089
5090     /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
5091     all unadorned brackets become non-capturing and behave like (?:...)
5092     brackets. */
5093
5094     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
5095       {
5096       bravalue = OP_BRA;
5097       }
5098
5099     /* Else we have a capturing group. */
5100
5101     else
5102       {
5103       NUMBERED_GROUP:
5104       cd->bracount += 1;
5105       PUT2(code, 1+LINK_SIZE, cd->bracount);
5106       skipbytes = 2;
5107       }
5108
5109     /* Process nested bracketed regex. Assertions may not be repeated, but
5110     other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
5111     non-register variable in order to be able to pass its address because some
5112     compilers complain otherwise. Pass in a new setting for the ims options if
5113     they have changed. */
5114
5115     previous = (bravalue >= OP_ONCE)? code : NULL;
5116     *code = bravalue;
5117     tempcode = code;
5118     tempreqvary = cd->req_varyopt;     /* Save value before bracket */
5119     length_prevgroup = 0;              /* Initialize for pre-compile phase */
5120
5121     if (!compile_regex(
5122          newoptions,                   /* The complete new option state */
5123          options & PCRE_IMS,           /* The previous ims option state */
5124          &tempcode,                    /* Where to put code (updated) */
5125          &ptr,                         /* Input pointer (updated) */
5126          errorcodeptr,                 /* Where to put an error message */
5127          (bravalue == OP_ASSERTBACK ||
5128           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5129          reset_bracount,               /* True if (?| group */
5130          skipbytes,                    /* Skip over bracket number */
5131          &subfirstbyte,                /* For possible first char */
5132          &subreqbyte,                  /* For possible last char */
5133          bcptr,                        /* Current branch chain */
5134          cd,                           /* Tables block */
5135          (lengthptr == NULL)? NULL :   /* Actual compile phase */
5136            &length_prevgroup           /* Pre-compile phase */
5137          ))
5138       goto FAILED;
5139
5140     /* At the end of compiling, code is still pointing to the start of the
5141     group, while tempcode has been updated to point past the end of the group
5142     and any option resetting that may follow it. The pattern pointer (ptr)
5143     is on the bracket. */
5144
5145     /* If this is a conditional bracket, check that there are no more than
5146     two branches in the group, or just one if it's a DEFINE group. We do this
5147     in the real compile phase, not in the pre-pass, where the whole group may
5148     not be available. */
5149
5150     if (bravalue == OP_COND && lengthptr == NULL)
5151       {
5152       uschar *tc = code;
5153       int condcount = 0;
5154
5155       do {
5156          condcount++;
5157          tc += GET(tc,1);
5158          }
5159       while (*tc != OP_KET);
5160
5161       /* A DEFINE group is never obeyed inline (the "condition" is always
5162       false). It must have only one branch. */
5163
5164       if (code[LINK_SIZE+1] == OP_DEF)
5165         {
5166         if (condcount > 1)
5167           {
5168           *errorcodeptr = ERR54;
5169           goto FAILED;
5170           }
5171         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
5172         }
5173
5174       /* A "normal" conditional group. If there is just one branch, we must not
5175       make use of its firstbyte or reqbyte, because this is equivalent to an
5176       empty second branch. */
5177
5178       else
5179         {
5180         if (condcount > 2)
5181           {
5182           *errorcodeptr = ERR27;
5183           goto FAILED;
5184           }
5185         if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5186         }
5187       }
5188
5189     /* Error if hit end of pattern */
5190
5191     if (*ptr != ')')
5192       {
5193       *errorcodeptr = ERR14;
5194       goto FAILED;
5195       }
5196
5197     /* In the pre-compile phase, update the length by the length of the group,
5198     less the brackets at either end. Then reduce the compiled code to just a
5199     set of non-capturing brackets so that it doesn't use much memory if it is
5200     duplicated by a quantifier.*/
5201
5202     if (lengthptr != NULL)
5203       {
5204       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5205         {
5206         *errorcodeptr = ERR20;
5207         goto FAILED;
5208         }
5209       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5210       *code++ = OP_BRA;
5211       PUTINC(code, 0, 1 + LINK_SIZE);
5212       *code++ = OP_KET;
5213       PUTINC(code, 0, 1 + LINK_SIZE);
5214       break;    /* No need to waste time with special character handling */
5215       }
5216
5217     /* Otherwise update the main code pointer to the end of the group. */
5218
5219     code = tempcode;
5220
5221     /* For a DEFINE group, required and first character settings are not
5222     relevant. */
5223
5224     if (bravalue == OP_DEF) break;
5225
5226     /* Handle updating of the required and first characters for other types of
5227     group. Update for normal brackets of all kinds, and conditions with two
5228     branches (see code above). If the bracket is followed by a quantifier with
5229     zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5230     zerofirstbyte outside the main loop so that they can be accessed for the
5231     back off. */
5232
5233     zeroreqbyte = reqbyte;
5234     zerofirstbyte = firstbyte;
5235     groupsetfirstbyte = FALSE;
5236
5237     if (bravalue >= OP_ONCE)
5238       {
5239       /* If we have not yet set a firstbyte in this branch, take it from the
5240       subpattern, remembering that it was set here so that a repeat of more
5241       than one can replicate it as reqbyte if necessary. If the subpattern has
5242       no firstbyte, set "none" for the whole branch. In both cases, a zero
5243       repeat forces firstbyte to "none". */
5244
5245       if (firstbyte == REQ_UNSET)
5246         {
5247         if (subfirstbyte >= 0)
5248           {
5249           firstbyte = subfirstbyte;
5250           groupsetfirstbyte = TRUE;
5251           }
5252         else firstbyte = REQ_NONE;
5253         zerofirstbyte = REQ_NONE;
5254         }
5255
5256       /* If firstbyte was previously set, convert the subpattern's firstbyte
5257       into reqbyte if there wasn't one, using the vary flag that was in
5258       existence beforehand. */
5259
5260       else if (subfirstbyte >= 0 && subreqbyte < 0)
5261         subreqbyte = subfirstbyte | tempreqvary;
5262
5263       /* If the subpattern set a required byte (or set a first byte that isn't
5264       really the first byte - see above), set it. */
5265
5266       if (subreqbyte >= 0) reqbyte = subreqbyte;
5267       }
5268
5269     /* For a forward assertion, we take the reqbyte, if set. This can be
5270     helpful if the pattern that follows the assertion doesn't set a different
5271     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5272     for an assertion, however because it leads to incorrect effect for patterns
5273     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5274     of a firstbyte. This is overcome by a scan at the end if there's no
5275     firstbyte, looking for an asserted first char. */
5276
5277     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5278     break;     /* End of processing '(' */
5279
5280
5281     /* ===================================================================*/
5282     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5283     are arranged to be the negation of the corresponding OP_values. For the
5284     back references, the values are ESC_REF plus the reference number. Only
5285     back references and those types that consume a character may be repeated.
5286     We can test for values between ESC_b and ESC_Z for the latter; this may
5287     have to change if any new ones are ever created. */
5288
5289     case '\\':
5290     tempptr = ptr;
5291     c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5292     if (*errorcodeptr != 0) goto FAILED;
5293
5294     if (c < 0)
5295       {
5296       if (-c == ESC_Q)            /* Handle start of quoted string */
5297         {
5298         if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5299           else inescq = TRUE;
5300         continue;
5301         }
5302
5303       if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
5304
5305       /* For metasequences that actually match a character, we disable the
5306       setting of a first character if it hasn't already been set. */
5307
5308       if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5309         firstbyte = REQ_NONE;
5310
5311       /* Set values to reset to if this is followed by a zero repeat. */
5312
5313       zerofirstbyte = firstbyte;
5314       zeroreqbyte = reqbyte;
5315
5316       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5317       We also support \k{name} (.NET syntax) */
5318
5319       if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
5320         {
5321         is_recurse = FALSE;
5322         terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
5323         goto NAMED_REF_OR_RECURSE;
5324         }
5325
5326       /* Back references are handled specially; must disable firstbyte if
5327       not set to cope with cases like (?=(\w+))\1: which would otherwise set
5328       ':' later. */
5329
5330       if (-c >= ESC_REF)
5331         {
5332         recno = -c - ESC_REF;
5333
5334         HANDLE_REFERENCE:    /* Come here from named backref handling */
5335         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5336         previous = code;
5337         *code++ = OP_REF;
5338         PUT2INC(code, 0, recno);
5339         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5340         if (recno > cd->top_backref) cd->top_backref = recno;
5341         }
5342
5343       /* So are Unicode property matches, if supported. */
5344
5345 #ifdef SUPPORT_UCP
5346       else if (-c == ESC_P || -c == ESC_p)
5347         {
5348         BOOL negated;
5349         int pdata;
5350         int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5351         if (ptype < 0) goto FAILED;
5352         previous = code;
5353         *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5354         *code++ = ptype;
5355         *code++ = pdata;
5356         }
5357 #else
5358
5359       /* If Unicode properties are not supported, \X, \P, and \p are not
5360       allowed. */
5361
5362       else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5363         {
5364         *errorcodeptr = ERR45;
5365         goto FAILED;
5366         }
5367 #endif
5368
5369       /* For the rest (including \X when Unicode properties are supported), we
5370       can obtain the OP value by negating the escape value. */
5371
5372       else
5373         {
5374         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5375         *code++ = -c;
5376         }
5377       continue;
5378       }
5379
5380     /* We have a data character whose value is in c. In UTF-8 mode it may have
5381     a value > 127. We set its representation in the length/buffer, and then
5382     handle it as a data character. */
5383
5384 #ifdef SUPPORT_UTF8
5385     if (utf8 && c > 127)
5386       mclength = _pcre_ord2utf8(c, mcbuffer);
5387     else
5388 #endif
5389
5390      {
5391      mcbuffer[0] = c;
5392      mclength = 1;
5393      }
5394     goto ONE_CHAR;
5395
5396
5397     /* ===================================================================*/
5398     /* Handle a literal character. It is guaranteed not to be whitespace or #
5399     when the extended flag is set. If we are in UTF-8 mode, it may be a
5400     multi-byte literal character. */
5401
5402     default:
5403     NORMAL_CHAR:
5404     mclength = 1;
5405     mcbuffer[0] = c;
5406
5407 #ifdef SUPPORT_UTF8
5408     if (utf8 && c >= 0xc0)
5409       {
5410       while ((ptr[1] & 0xc0) == 0x80)
5411         mcbuffer[mclength++] = *(++ptr);
5412       }
5413 #endif
5414
5415     /* At this point we have the character's bytes in mcbuffer, and the length
5416     in mclength. When not in UTF-8 mode, the length is always 1. */
5417
5418     ONE_CHAR:
5419     previous = code;
5420     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5421     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5422
5423     /* Remember if \r or \n were seen */
5424
5425     if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5426       cd->external_options |= PCRE_HASCRORLF;
5427
5428     /* Set the first and required bytes appropriately. If no previous first
5429     byte, set it from this character, but revert to none on a zero repeat.
5430     Otherwise, leave the firstbyte value alone, and don't change it on a zero
5431     repeat. */
5432
5433     if (firstbyte == REQ_UNSET)
5434       {
5435       zerofirstbyte = REQ_NONE;
5436       zeroreqbyte = reqbyte;
5437
5438       /* If the character is more than one byte long, we can set firstbyte
5439       only if it is not to be matched caselessly. */
5440
5441       if (mclength == 1 || req_caseopt == 0)
5442         {
5443         firstbyte = mcbuffer[0] | req_caseopt;
5444         if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5445         }
5446       else firstbyte = reqbyte = REQ_NONE;
5447       }
5448
5449     /* firstbyte was previously set; we can set reqbyte only the length is
5450     1 or the matching is caseful. */
5451
5452     else
5453       {
5454       zerofirstbyte = firstbyte;
5455       zeroreqbyte = reqbyte;
5456       if (mclength == 1 || req_caseopt == 0)
5457         reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5458       }
5459
5460     break;            /* End of literal character handling */
5461     }
5462   }                   /* end of big loop */
5463
5464
5465 /* Control never reaches here by falling through, only by a goto for all the
5466 error states. Pass back the position in the pattern so that it can be displayed
5467 to the user for diagnosing the error. */
5468
5469 FAILED:
5470 *ptrptr = ptr;
5471 return FALSE;
5472 }
5473
5474
5475
5476
5477 /*************************************************
5478 *     Compile sequence of alternatives           *
5479 *************************************************/
5480
5481 /* On entry, ptr is pointing past the bracket character, but on return it
5482 points to the closing bracket, or vertical bar, or end of string. The code
5483 variable is pointing at the byte into which the BRA operator has been stored.
5484 If the ims options are changed at the start (for a (?ims: group) or during any
5485 branch, we need to insert an OP_OPT item at the start of every following branch
5486 to ensure they get set correctly at run time, and also pass the new options
5487 into every subsequent branch compile.
5488
5489 This function is used during the pre-compile phase when we are trying to find
5490 out the amount of memory needed, as well as during the real compile phase. The
5491 value of lengthptr distinguishes the two phases.
5492
5493 Arguments:
5494   options        option bits, including any changes for this subpattern
5495   oldims         previous settings of ims option bits
5496   codeptr        -> the address of the current code pointer
5497   ptrptr         -> the address of the current pattern pointer
5498   errorcodeptr   -> pointer to error code variable
5499   lookbehind     TRUE if this is a lookbehind assertion
5500   reset_bracount TRUE to reset the count for each branch
5501   skipbytes      skip this many bytes at start (for brackets and OP_COND)
5502   firstbyteptr   place to put the first required character, or a negative number
5503   reqbyteptr     place to put the last required character, or a negative number
5504   bcptr          pointer to the chain of currently open branches
5505   cd             points to the data block with tables pointers etc.
5506   lengthptr      NULL during the real compile phase
5507                  points to length accumulator during pre-compile phase
5508
5509 Returns:         TRUE on success
5510 */
5511
5512 static BOOL
5513 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5514   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5515   int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5516   int *lengthptr)
5517 {
5518 const uschar *ptr = *ptrptr;
5519 uschar *code = *codeptr;
5520 uschar *last_branch = code;
5521 uschar *start_bracket = code;
5522 uschar *reverse_count = NULL;
5523 int firstbyte, reqbyte;
5524 int branchfirstbyte, branchreqbyte;
5525 int length;
5526 int orig_bracount;
5527 int max_bracount;
5528 branch_chain bc;
5529
5530 avmplus::AvmCore::checkPCREStackOverflow();
5531
5532 bc.outer = bcptr;
5533 bc.current = code;
5534
5535 firstbyte = reqbyte = REQ_UNSET;
5536
5537 /* Accumulate the length for use in the pre-compile phase. Start with the
5538 length of the BRA and KET and any extra bytes that are required at the
5539 beginning. We accumulate in a local variable to save frequent testing of
5540 lenthptr for NULL. We cannot do this by looking at the value of code at the
5541 start and end of each alternative, because compiled items are discarded during
5542 the pre-compile phase so that the work space is not exceeded. */
5543
5544 length = 2 + 2*LINK_SIZE + skipbytes;
5545
5546 /* WARNING: If the above line is changed for any reason, you must also change
5547 the code that abstracts option settings at the start of the pattern and makes
5548 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5549 pre-compile phase to find out whether anything has yet been compiled or not. */
5550
5551 /* Offset is set zero to mark that this bracket is still open */
5552
5553 PUT(code, 1, 0);
5554 code += 1 + LINK_SIZE + skipbytes;
5555
5556 /* Loop for each alternative branch */
5557
5558 orig_bracount = max_bracount = cd->bracount;
5559 for (;;)
5560   {
5561   /* For a (?| group, reset the capturing bracket count so that each branch
5562   uses the same numbers. */
5563
5564   if (reset_bracount) cd->bracount = orig_bracount;
5565
5566   /* Handle a change of ims options at the start of the branch */
5567
5568   if ((options & PCRE_IMS) != oldims)
5569     {
5570     *code++ = OP_OPT;
5571     *code++ = options & PCRE_IMS;
5572     length += 2;
5573     }
5574
5575   /* Set up dummy OP_REVERSE if lookbehind assertion */
5576
5577   if (lookbehind)
5578     {
5579     *code++ = OP_REVERSE;
5580     reverse_count = code;
5581     PUTINC(code, 0, 0);
5582     length += 1 + LINK_SIZE;
5583     }
5584
5585   /* Now compile the branch; in the pre-compile phase its length gets added
5586   into the length. */
5587
5588   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5589         &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5590     {
5591     *ptrptr = ptr;
5592     return FALSE;
5593     }
5594
5595   /* Keep the highest bracket count in case (?| was used and some branch
5596   has fewer than the rest. */
5597
5598   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5599
5600   /* In the real compile phase, there is some post-processing to be done. */
5601
5602   if (lengthptr == NULL)
5603     {
5604     /* If this is the first branch, the firstbyte and reqbyte values for the
5605     branch become the values for the regex. */
5606
5607     if (*last_branch != OP_ALT)
5608       {
5609       firstbyte = branchfirstbyte;
5610       reqbyte = branchreqbyte;
5611       }
5612
5613     /* If this is not the first branch, the first char and reqbyte have to
5614     match the values from all the previous branches, except that if the
5615     previous value for reqbyte didn't have REQ_VARY set, it can still match,
5616     and we set REQ_VARY for the regex. */
5617
5618     else
5619       {
5620       /* If we previously had a firstbyte, but it doesn't match the new branch,
5621       we have to abandon the firstbyte for the regex, but if there was
5622       previously no reqbyte, it takes on the value of the old firstbyte. */
5623
5624       if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5625         {
5626         if (reqbyte < 0) reqbyte = firstbyte;
5627         firstbyte = REQ_NONE;
5628         }
5629
5630       /* If we (now or from before) have no firstbyte, a firstbyte from the
5631       branch becomes a reqbyte if there isn't a branch reqbyte. */
5632
5633       if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5634           branchreqbyte = branchfirstbyte;
5635
5636       /* Now ensure that the reqbytes match */
5637
5638       if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5639         reqbyte = REQ_NONE;
5640       else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
5641       }
5642
5643     /* If lookbehind, check that this branch matches a fixed-length string, and
5644     put the length into the OP_REVERSE item. Temporarily mark the end of the
5645     branch with OP_END. */
5646
5647     if (lookbehind)
5648       {
5649       int fixed_length;
5650       *code = OP_END;
5651       fixed_length = find_fixedlength(last_branch, options);
5652       DPRINTF(("fixed length = %d\n", fixed_length));
5653       if (fixed_length < 0)
5654         {
5655         *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5656         *ptrptr = ptr;
5657         return FALSE;
5658         }
5659       PUT(reverse_count, 0, fixed_length);
5660       }
5661     }
5662
5663   /* Reached end of expression, either ')' or end of pattern. In the real
5664   compile phase, go back through the alternative branches and reverse the chain
5665   of offsets, with the field in the BRA item now becoming an offset to the
5666   first alternative. If there are no alternatives, it points to the end of the
5667   group. The length in the terminating ket is always the length of the whole
5668   bracketed item. If any of the ims options were changed inside the group,
5669   compile a resetting op-code following, except at the very end of the pattern.
5670   Return leaving the pointer at the terminating char. */
5671
5672   if (*ptr != '|')
5673     {
5674     if (lengthptr == NULL)
5675       {
5676       int branch_length = code - last_branch;
5677       do
5678         {
5679         int prev_length = GET(last_branch, 1);
5680         PUT(last_branch, 1, branch_length);
5681         branch_length = prev_length;
5682         last_branch -= branch_length;
5683         }
5684       while (branch_length > 0);
5685       }
5686
5687     /* Fill in the ket */
5688
5689     *code = OP_KET;
5690     PUT(code, 1, code - start_bracket);
5691     code += 1 + LINK_SIZE;
5692
5693     /* Resetting option if needed */
5694
5695     if ((options & PCRE_IMS) != oldims && *ptr == ')')
5696       {
5697       *code++ = OP_OPT;
5698       *code++ = oldims;
5699       length += 2;
5700       }
5701
5702     /* Retain the highest bracket number, in case resetting was used. */
5703
5704     cd->bracount = max_bracount;
5705
5706     /* Set values to pass back */
5707
5708     *codeptr = code;
5709     *ptrptr = ptr;
5710     *firstbyteptr = firstbyte;
5711     *reqbyteptr = reqbyte;
5712     if (lengthptr != NULL)
5713       {
5714       if (OFLOW_MAX - *lengthptr < length)
5715         {
5716         *errorcodeptr = ERR20;
5717         return FALSE;
5718         }
5719       *lengthptr += length;
5720       }
5721     return TRUE;
5722     }
5723
5724   /* Another branch follows. In the pre-compile phase, we can move the code
5725   pointer back to where it was for the start of the first branch. (That is,
5726   pretend that each branch is the only one.)
5727
5728   In the real compile phase, insert an ALT node. Its length field points back
5729   to the previous branch while the bracket remains open. At the end the chain
5730   is reversed. It's done like this so that the start of the bracket has a
5731   zero offset until it is closed, making it possible to detect recursion. */
5732
5733   if (lengthptr != NULL)
5734     {
5735     code = *codeptr + 1 + LINK_SIZE + skipbytes;
5736     length += 1 + LINK_SIZE;
5737     }
5738   else
5739     {
5740     *code = OP_ALT;
5741     PUT(code, 1, code - last_branch);
5742     bc.current = last_branch = code;
5743     code += 1 + LINK_SIZE;
5744     }
5745
5746   ptr++;
5747   }
5748 /* Control never reaches here */
5749 }
5750
5751
5752
5753 /************************************************************************
5754  * Stack implementation used in is_anchored and is_startline functions. *
5755  ************************************************************************/
5756
5757 typedef struct pi_ia_is {
5758         const uschar *code;
5759         unsigned int bracket_map;
5760 } pi_ia_is;
5761
5762
5763 /*************************************************
5764 *          Check for anchored expression         *
5765 *************************************************/
5766
5767 /* Try to find out if this is an anchored regular expression. Consider each
5768 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5769 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5770 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5771 counts, since OP_CIRC can match in the middle.
5772
5773 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5774 This is the code for \G, which means "match at start of match position, taking
5775 into account the match offset".
5776
5777 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5778 because that will try the rest of the pattern at all possible matching points,
5779 so there is no point trying again.... er ....
5780
5781 .... except when the .* appears inside capturing parentheses, and there is a
5782 subsequent back reference to those parentheses. We haven't enough information
5783 to catch that case precisely.
5784
5785 At first, the best we could do was to detect when .* was in capturing brackets
5786 and the highest back reference was greater than or equal to that level.
5787 However, by keeping a bitmap of the first 31 back references, we can catch some
5788 of the more common cases more precisely.
5789
5790 Arguments:
5791   code           points to start of expression (the bracket)
5792   options        points to the options setting
5793   bracket_map    a bitmap of which brackets we are inside while testing; this
5794                   handles up to substring 31; after that we just have to take
5795                   the less precise approach
5796   backref_map    the back reference bitmap
5797
5798 Returns:     TRUE or FALSE
5799 */
5800
5801 static BOOL
5802 is_anchored(register const uschar * const code, int * const options, const unsigned int bracket_map,
5803   const unsigned int backref_map)
5804 {
5805 TypedStack<pi_ia_is> work_stack;
5806 pi_ia_is work_item;
5807
5808 /* set up the first work item */
5809 work_item.code = code;
5810 work_item.bracket_map = bracket_map;
5811 work_stack.push(work_item);
5812
5813 /* start the processing loop */
5814 while (!work_stack.isEmpty())
5815   {
5816   work_stack.pop(work_item);
5817   const uschar *wi_code = work_item.code;
5818   unsigned int wi_bracket_map = work_item.bracket_map;
5819   do {
5820      const uschar *scode = first_significant_code(
5821        wi_code + _pcre_OP_lengths[*wi_code], options, PCRE_MULTILINE, FALSE);
5822      register int op = *scode;
5823
5824      /* Non-capturing brackets */
5825
5826      if (op == OP_BRA)
5827        {
5828        pi_ia_is new_item;
5829        new_item.code = scode;
5830        new_item.bracket_map = wi_bracket_map;
5831            work_stack.push(new_item);
5832        }
5833
5834      /* Capturing brackets */
5835
5836      else if (op == OP_CBRA)
5837        {
5838        pi_ia_is new_item;
5839        new_item.code = scode;
5840        int n = GET2(scode, 1+LINK_SIZE);
5841        int new_map = wi_bracket_map | ((n < 32)? (1 << n) : 1);
5842        new_item.bracket_map = new_map;
5843        work_stack.push(new_item);
5844        }
5845
5846      /* Other brackets */
5847
5848      else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5849        {
5850        pi_ia_is new_item;
5851        new_item.code = scode;
5852        new_item.bracket_map = wi_bracket_map;
5853        work_stack.push(new_item);
5854        }
5855
5856      /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5857      are or may be referenced. */
5858
5859      else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5860                op == OP_TYPEPOSSTAR) && (*options & PCRE_DOTALL) != 0)
5861        {
5862        if (scode[1] != OP_ANY || (wi_bracket_map & backref_map) != 0)
5863          {
5864          return FALSE;
5865          }
5866        }
5867
5868      /* Check for explicit anchoring */
5869
5870      else if (op != OP_SOD && op != OP_SOM &&
5871              ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5872        {
5873        return FALSE;
5874        }
5875
5876      wi_code += GET(wi_code, 1);
5877   } while (*wi_code == OP_ALT);   /* Loop for each alternative */
5878 } //while
5879 AvmAssert(work_stack.isEmpty());
5880 return TRUE;
5881 }
5882
5883
5884
5885
5886 /*************************************************
5887 *         Check for starting with ^ or .*        *
5888 *************************************************/
5889
5890 /* This is called to find out if every branch starts with ^ or .* so that
5891 "first char" processing can be done to speed things up in multiline
5892 matching and for non-DOTALL patterns that start with .* (which must start at
5893 the beginning or after \n). As in the case of is_anchored() (see above), we
5894 have to take account of back references to capturing brackets that contain .*
5895 because in that case we can't make the assumption.
5896
5897 Arguments:
5898   code           points to start of expression (the bracket)
5899   bracket_map    a bitmap of which brackets we are inside while testing; this
5900                   handles up to substring 31; after that we just have to take
5901                   the less precise approach
5902   backref_map    the back reference bitmap
5903
5904 Returns:         TRUE or FALSE
5905 */
5906
5907 static BOOL
5908 is_startline(const uschar * const code, const unsigned int bracket_map,
5909   const unsigned int backref_map)
5910 {
5911 TypedStack<pi_ia_is> work_stack;
5912 pi_ia_is work_item;
5913
5914 /* set up the first work item */
5915 work_item.code = code;
5916 work_item.bracket_map = bracket_map;
5917 work_stack.push(work_item);
5918
5919 /*  start the processing loop */
5920 while (!work_stack.isEmpty())
5921   {
5922   work_stack.pop(work_item);
5923   const uschar *wi_code = work_item.code;
5924   unsigned int wi_bracket_map = work_item.bracket_map;
5925   do {
5926      const uschar *scode = first_significant_code(
5927        wi_code + _pcre_OP_lengths[*wi_code], NULL, 0, FALSE);
5928      register int op = *scode;
5929
5930      /* Non-capturing brackets */
5931
5932      if (op == OP_BRA)
5933        {
5934        pi_ia_is new_item;
5935        new_item.code = scode;
5936        new_item.bracket_map = wi_bracket_map;
5937        work_stack.push(new_item);
5938        }
5939
5940      /* Capturing brackets */
5941
5942      else if (op == OP_CBRA)
5943        {
5944        pi_ia_is new_item;
5945        new_item.code = scode;
5946        int n = GET2(scode, 1+LINK_SIZE);
5947        int new_map = wi_bracket_map | ((n < 32)? (1 << n) : 1);
5948        new_item.bracket_map = new_map;
5949        work_stack.push(new_item);
5950        }
5951
5952      /* Other brackets */
5953
5954      else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5955        {
5956        pi_ia_is new_item;
5957        new_item.code = scode;
5958        new_item.bracket_map = wi_bracket_map;
5959        work_stack.push(new_item);
5960        }
5961
5962      /* .* means "start at start or after \n" if it isn't in brackets that
5963      may be referenced. */
5964
5965      else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5966      {
5967      if (scode[1] != OP_ANY || (wi_bracket_map & backref_map) != 0)
5968        {
5969        return FALSE;
5970        }
5971      }
5972
5973      /* Check for explicit circumflex */
5974
5975      else if (op != OP_CIRC)
5976        {
5977        return FALSE;
5978        }
5979
5980      /* Move on to the next alternative */
5981
5982      wi_code += GET(wi_code, 1);
5983      } while (*wi_code == OP_ALT);  /* Loop for each alternative */
5984   } //while
5985   AvmAssert(work_stack.isEmpty());
5986   return TRUE;
5987 }
5988
5989
5990
5991 /*************************************************
5992 *       Check for asserted fixed first char      *
5993 *************************************************/
5994
5995 /* During compilation, the "first char" settings from forward assertions are
5996 discarded, because they can cause conflicts with actual literals that follow.
5997 However, if we end up without a first char setting for an unanchored pattern,
5998 it is worth scanning the regex to see if there is an initial asserted first
5999 char. If all branches start with the same asserted char, or with a bracket all
6000 of whose alternatives start with the same asserted char (recurse ad lib), then
6001 we return that char, otherwise -1.
6002
6003 Arguments:
6004   code       points to start of expression (the bracket)
6005   options    pointer to the options (used to check casing changes)
6006   inassert   TRUE if in an assertion
6007
6008 Returns:     -1 or the fixed first char
6009 */
6010
6011 typedef struct pi_ffac {
6012   const uschar* code;
6013 } pi_ffac;
6014
6015 static int
6016 find_firstassertedchar(const uschar * const code, int * const options)
6017 {
6018 register int c = -1;
6019 BOOL inassert = FALSE;
6020 BOOL caseless = (*options & PCRE_CASELESS) != 0;
6021 TypedStack<pi_ffac> work_stack;
6022 pi_ffac work_item;
6023
6024 /* set up the first work item */
6025 work_item.code = code;
6026 work_stack.push(work_item);
6027
6028 /*  start the processing loop */
6029 while (!work_stack.isEmpty())
6030   {
6031   BOOL exit = FALSE;
6032   work_stack.pop(work_item);
6033   const uschar *wi_code = work_item.code;
6034   do {
6035      const uschar *scode =
6036        first_significant_code(wi_code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
6037      register int op = *scode;
6038
6039      switch(op)
6040        {
6041        default:
6042        return -1;
6043
6044        case OP_BRA:
6045        case OP_CBRA:
6046        case OP_ASSERT:
6047        case OP_ONCE:
6048        case OP_COND:
6049        inassert = (op == OP_ASSERT);
6050        pi_ffac new_item;
6051        new_item.code = scode;
6052        work_stack.push(new_item);
6053        exit = TRUE;
6054        break;
6055
6056        case OP_EXACT:       /* Fall through */
6057        scode += 2;
6058
6059        case OP_CHAR:
6060        case OP_CHARNC:
6061        case OP_PLUS:
6062        case OP_MINPLUS:
6063        case OP_POSPLUS:
6064        if (!inassert)
6065          {
6066          return -1;
6067          }
6068        if (c < 0)
6069          {
6070          c = scode[1];
6071          if (caseless) c |= REQ_CASELESS;
6072          }
6073        else if ((!caseless && c != scode[1]) ||
6074                (caseless && c != (scode[1] | REQ_CASELESS)))
6075          {
6076          return -1;
6077          }
6078          break;
6079        }
6080        if(exit) break;
6081        wi_code += GET(wi_code, 1);
6082        if(*wi_code == OP_KET) wi_code += GET(work_item.code, 1);
6083      }
6084   while (*wi_code == OP_ALT);
6085   } //while
6086 return c;
6087 }
6088
6089
6090 /*************************************************
6091 *        Compile a Regular Expression            *
6092 *************************************************/
6093
6094 /* This function takes a string and returns a pointer to a block of store
6095 holding a compiled version of the expression. The original API for this
6096 function had no error code return variable; it is retained for backwards
6097 compatibility. The new function is given a new name.
6098
6099 Arguments:
6100   pattern       the regular expression
6101   options       various option bits
6102   errorcodeptr  pointer to error code variable (pcre_compile2() only)
6103                   can be NULL if you don't want a code value
6104   errorptr      pointer to pointer to error text
6105   erroroffset   ptr offset in pattern where error was detected
6106   tables        pointer to character tables or NULL
6107
6108 Returns:        pointer to compiled data block, or NULL on error,
6109                 with errorptr and erroroffset set
6110 */
6111
6112 PCRE_EXP_DEFN pcre *
6113 pcre_compile(const char *pattern, int options, const char **errorptr,
6114   int *erroroffset, const unsigned char *tables)
6115 {
6116 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
6117 }
6118
6119
6120 PCRE_EXP_DEFN pcre *
6121 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
6122   const char **errorptr, int *erroroffset, const unsigned char *tables)
6123 {
6124 real_pcre *re;
6125 int length = 1;  /* For final END opcode */
6126 int firstbyte, reqbyte, newline;
6127 int errorcode = 0;
6128 int skipatstart = 0;
6129 #ifdef SUPPORT_UTF8
6130 BOOL utf8;
6131 #endif
6132 size_t size;
6133 uschar *code;
6134 const uschar *codestart;
6135 const uschar *ptr;
6136 compile_data compile_block;
6137 compile_data *cd = &compile_block;
6138
6139 /* This space is used for "compiling" into during the first phase, when we are
6140 computing the amount of memory that is needed. Compiled items are thrown away
6141 as soon as possible, so that a fairly large buffer should be sufficient for
6142 this purpose. The same space is used in the second phase for remembering where
6143 to fill in forward references to subpatterns. */
6144
6145 uschar cworkspace[COMPILE_WORK_SIZE];
6146
6147
6148 /* Set this early so that early errors get offset 0. */
6149
6150 ptr = (const uschar *)pattern;
6151
6152 /* We can't pass back an error message if errorptr is NULL; I guess the best we
6153 can do is just return NULL, but we can set a code value if there is a code
6154 pointer. */
6155
6156 if (errorptr == NULL)
6157   {
6158   if (errorcodeptr != NULL) *errorcodeptr = 99;
6159   return NULL;
6160   }
6161
6162 *errorptr = NULL;
6163 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
6164
6165 /* However, we can give a message for this error */
6166
6167 if (erroroffset == NULL)
6168   {
6169   errorcode = ERR16;
6170   goto PCRE_EARLY_ERROR_RETURN2;
6171   }
6172
6173 *erroroffset = 0;
6174
6175 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
6176
6177 #ifdef SUPPORT_UTF8
6178 utf8 = (options & PCRE_UTF8) != 0;
6179 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6180      (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
6181   {
6182   errorcode = ERR44;
6183   goto PCRE_EARLY_ERROR_RETURN2;
6184   }
6185 #else
6186 if ((options & PCRE_UTF8) != 0)
6187   {
6188   errorcode = ERR32;
6189   goto PCRE_EARLY_ERROR_RETURN;
6190   }
6191 #endif
6192
6193 if ((options & ~PUBLIC_OPTIONS) != 0)
6194   {
6195   errorcode = ERR17;
6196   goto PCRE_EARLY_ERROR_RETURN;
6197   }
6198
6199 /* Set up pointers to the individual character tables */
6200
6201 if (tables == NULL) tables = _pcre_default_tables;
6202 cd->lcc = tables + lcc_offset;
6203 cd->fcc = tables + fcc_offset;
6204 cd->cbits = tables + cbits_offset;
6205 cd->ctypes = tables + ctypes_offset;
6206
6207 /* Check for newline settings at the start of the pattern, and remember the
6208 offset for later. */
6209
6210 if (ptr[0] == '(' && ptr[1] == '*')
6211   {
6212   int newnl = 0;
6213   if (VMPI_strncmp((char *)(ptr+2), "CR)", 3) == 0)
6214     { skipatstart = 5; newnl = PCRE_NEWLINE_CR; }
6215   else if (VMPI_strncmp((char *)(ptr+2), "LF)", 3)  == 0)
6216     { skipatstart = 5; newnl = PCRE_NEWLINE_LF; }
6217   else if (VMPI_strncmp((char *)(ptr+2), "CRLF)", 5)  == 0)
6218     { skipatstart = 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6219   else if (VMPI_strncmp((char *)(ptr+2), "ANY)", 4) == 0)
6220     { skipatstart = 6; newnl = PCRE_NEWLINE_ANY; }
6221   else if (VMPI_strncmp((char *)(ptr+2), "ANYCRLF)", 8)  == 0)
6222     { skipatstart = 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6223   if (skipatstart > 0)
6224     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
6225   }
6226
6227 /* Handle different types of newline. The three bits give seven cases. The
6228 current code allows for fixed one- or two-byte sequences, plus "any" and
6229 "anycrlf". */
6230
6231 switch (options & PCRE_NEWLINE_BITS)
6232   {
6233   case 0: newline = NEWLINE; break;   /* Build-time default */
6234   case PCRE_NEWLINE_CR: newline = '\r'; break;
6235   case PCRE_NEWLINE_LF: newline = '\n'; break;
6236   case PCRE_NEWLINE_CR+
6237        PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
6238   case PCRE_NEWLINE_ANY: newline = -1; break;
6239   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6240   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6241   }
6242
6243 if (newline == -2)
6244   {
6245   cd->nltype = NLTYPE_ANYCRLF;
6246   }
6247 else if (newline < 0)
6248   {
6249   cd->nltype = NLTYPE_ANY;
6250   }
6251 else
6252   {
6253   cd->nltype = NLTYPE_FIXED;
6254   if (newline > 255)
6255     {
6256     cd->nllen = 2;
6257     cd->nl[0] = (newline >> 8) & 255;
6258     cd->nl[1] = newline & 255;
6259     }
6260   else
6261     {
6262     cd->nllen = 1;
6263     cd->nl[0] = newline;
6264     }
6265   }
6266
6267 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
6268 references to help in deciding whether (.*) can be treated as anchored or not.
6269 */
6270
6271 cd->top_backref = 0;
6272 cd->backref_map = 0;
6273
6274 /* Reflect pattern for debugging output */
6275
6276 DPRINTF(("------------------------------------------------------------------\n"));
6277 DPRINTF(("%s\n", pattern));
6278
6279 /* Pretend to compile the pattern while actually just accumulating the length
6280 of memory required. This behaviour is triggered by passing a non-NULL final
6281 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
6282 to compile parts of the pattern into; the compiled code is discarded when it is
6283 no longer needed, so hopefully this workspace will never overflow, though there
6284 is a test for its doing so. */
6285
6286 cd->bracount = 0;
6287 cd->names_found = 0;
6288 cd->name_entry_size = 0;
6289 cd->name_table = NULL;
6290 cd->start_workspace = cworkspace;
6291 cd->start_code = cworkspace;
6292 cd->hwm = cworkspace;
6293 cd->start_pattern = (const uschar *)pattern;
6294 cd->end_pattern = (const uschar *)(pattern + VMPI_strlen(pattern));
6295 cd->req_varyopt = 0;
6296 cd->nopartial = FALSE;
6297 cd->external_options = options;
6298
6299 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6300 don't need to look at the result of the function here. The initial options have
6301 been put into the cd block so that they can be changed if an option setting is
6302 found within the regex right at the beginning. Bringing initial option settings
6303 outside can help speed up starting point checks. */
6304
6305 ptr += skipatstart;
6306 code = cworkspace;
6307 *code = OP_BRA;
6308 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6309   &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6310   &length);
6311 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6312
6313 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6314   cd->hwm - cworkspace));
6315
6316 if (length > MAX_PATTERN_SIZE)
6317   {
6318   errorcode = ERR20;
6319   goto PCRE_EARLY_ERROR_RETURN;
6320   }
6321
6322 /* Compute the size of data block needed and get it, either from malloc or
6323 externally provided function. Integer overflow should no longer be possible
6324 because nowadays we limit the maximum value of cd->names_found and
6325 cd->name_entry_size. */
6326
6327 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6328 re = (real_pcre *)(pcre_malloc)(size);
6329
6330 if (re == NULL)
6331   {
6332   errorcode = ERR21;
6333   goto PCRE_EARLY_ERROR_RETURN;
6334   }
6335
6336 /* Put in the magic number, and save the sizes, initial options, and character
6337 table pointer. NULL is used for the default character tables. The nullpad field
6338 is at the end; it's there to help in the case when a regex compiled on a system
6339 with 4-byte pointers is run on another with 8-byte pointers. */
6340
6341 re->magic_number = MAGIC_NUMBER;
6342 re->size = (pcre_uint32)size;
6343 re->options = cd->external_options;
6344 re->dummy1 = 0;
6345 re->first_byte = 0;
6346 re->req_byte = 0;
6347 re->name_table_offset = sizeof(real_pcre);
6348 re->name_entry_size = cd->name_entry_size;
6349 re->name_count = cd->names_found;
6350 re->ref_count = 0;
6351 re->tables = (tables == _pcre_default_tables)? NULL : tables;
6352 re->nullpad = NULL;
6353
6354 /* The starting points of the name/number translation table and of the code are
6355 passed around in the compile data block. The start/end pattern and initial
6356 options are already set from the pre-compile phase, as is the name_entry_size
6357 field. Reset the bracket count and the names_found field. Also reset the hwm
6358 field; this time it's used for remembering forward references to subpatterns.
6359 */
6360
6361 cd->bracount = 0;
6362 cd->names_found = 0;
6363 cd->name_table = (uschar *)re + re->name_table_offset;
6364 codestart = cd->name_table + re->name_entry_size * re->name_count;
6365 cd->start_code = codestart;
6366 cd->hwm = cworkspace;
6367 cd->req_varyopt = 0;
6368 cd->nopartial = FALSE;
6369 cd->had_accept = FALSE;
6370
6371 /* Set up a starting, non-extracting bracket, then compile the expression. On
6372 error, errorcode will be set non-zero, so we don't need to look at the result
6373 of the function here. */
6374
6375 ptr = (const uschar *)pattern + skipatstart;
6376 code = (uschar *)codestart;
6377 *code = OP_BRA;
6378 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6379   &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6380 re->top_bracket = cd->bracount;
6381 re->top_backref = cd->top_backref;
6382
6383 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
6384 if (cd->had_accept) reqbyte = -1;   /* Must disable after (*ACCEPT) */
6385
6386 /* If not reached end of pattern on success, there's an excess bracket. */
6387
6388 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6389
6390 /* Fill in the terminating state and check for disastrous overflow, but
6391 if debugging, leave the test till after things are printed out. */
6392
6393 *code++ = OP_END;
6394
6395 #ifndef PCRE_DEBUG
6396 if (code - codestart > length) errorcode = ERR23;
6397 #endif
6398
6399 /* Fill in any forward references that are required. */
6400
6401 while (errorcode == 0 && cd->hwm > cworkspace)
6402   {
6403   int offset, recno;
6404   const uschar *groupptr;
6405   cd->hwm -= LINK_SIZE;
6406   offset = GET(cd->hwm, 0);
6407   recno = GET(codestart, offset);
6408   groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6409   if (groupptr == NULL) errorcode = ERR53;
6410     else PUT(((uschar *)codestart), offset, groupptr - codestart);
6411   }
6412
6413 /* Give an error if there's back reference to a non-existent capturing
6414 subpattern. */
6415
6416 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6417
6418 /* Failed to compile, or error while post-processing */
6419
6420 if (errorcode != 0)
6421   {
6422   (pcre_free)(re);
6423   PCRE_EARLY_ERROR_RETURN:
6424   *erroroffset = ptr - (const uschar *)pattern;
6425   PCRE_EARLY_ERROR_RETURN2:
6426   *errorptr = error_texts[errorcode];
6427   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6428   return NULL;
6429   }
6430
6431 /* If the anchored option was not passed, set the flag if we can determine that
6432 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6433 as starting with .* when DOTALL is set).
6434
6435 Otherwise, if we know what the first byte has to be, save it, because that
6436 speeds up unanchored matches no end. If not, see if we can set the
6437 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6438 start with ^. and also when all branches start with .* for non-DOTALL matches.
6439 */
6440
6441 if ((re->options & PCRE_ANCHORED) == 0)
6442   {
6443   int temp_options = re->options;   /* May get changed during these scans */
6444   if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6445     re->options |= PCRE_ANCHORED;
6446   else
6447     {
6448     if (firstbyte < 0)
6449       firstbyte = find_firstassertedchar(codestart, &temp_options);
6450     if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
6451       {
6452       int ch = firstbyte & 255;
6453       re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6454          cd->fcc[ch] == ch)? ch : firstbyte;
6455       re->options |= PCRE_FIRSTSET;
6456       }
6457     else if (is_startline(codestart, 0, cd->backref_map))
6458       re->options |= PCRE_STARTLINE;
6459     }
6460   }
6461
6462 /* For an anchored pattern, we use the "required byte" only if it follows a
6463 variable length item in the regex. Remove the caseless flag for non-caseable
6464 bytes. */
6465
6466 if (reqbyte >= 0 &&
6467      ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6468   {
6469   int ch = reqbyte & 255;
6470   re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6471     cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6472   re->options |= PCRE_REQCHSET;
6473   }
6474
6475 /* Print out the compiled data if debugging is enabled. This is never the
6476 case when building a production library. */
6477
6478 #ifdef PCRE_DEBUG
6479
6480 avmplus::AvmLog("Length = %d top_bracket = %d top_backref = %d\n",
6481   length, re->top_bracket, re->top_backref);
6482
6483 avmplus::AvmLog("Options=%08x\n", re->options);
6484
6485 if ((re->options & PCRE_FIRSTSET) != 0)
6486   {
6487   int ch = re->first_byte & 255;
6488   const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6489     "" : " (caseless)";
6490   if (VMPI_isprint(ch)) avmplus::AvmLog("First char = %c%s\n", ch, caseless);
6491     else avmplus::AvmLog("First char = \\x%02x%s\n", ch, caseless);
6492   }
6493
6494 if ((re->options & PCRE_REQCHSET) != 0)
6495   {
6496   int ch = re->req_byte & 255;
6497   const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6498     "" : " (caseless)";
6499   if (VMPI_isprint(ch)) avmplus::AvmLog("Req char = %c%s\n", ch, caseless);
6500     else avmplus::AvmLog("Req char = \\x%02x%s\n", ch, caseless);
6501   }
6502
6503 pcre_printint(re, stdout, TRUE);
6504
6505 /* This check is done here in the debugging case so that the code that
6506 was compiled can be seen. */
6507
6508 if (code - codestart > length)
6509   {
6510   (pcre_free)(re);
6511   *errorptr = error_texts[ERR23];
6512   *erroroffset = ptr - (uschar *)pattern;
6513   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6514   return NULL;
6515   }
6516 #endif   /* DEBUG */
6517
6518 return (pcre *)re;
6519 }
6520
6521 /* End of pcre_compile.c */