pcre.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /*
   6 This is a library of functions to support regular expressions whose syntax
   7 and semantics are as close as possible to those of the Perl 5 language. See
   8 the file Tech.Notes for some information on the internals.
   9
  10 Written by: Philip Hazel <ph10@cam.ac.uk>
  11
  12            Copyright (c) 1997-2004 University of Cambridge
  13
  14 -----------------------------------------------------------------------------
  15 Redistribution and use in source and binary forms, with or without
  16 modification, are permitted provided that the following conditions are met:
  17
  18     * Redistributions of source code must retain the above copyright notice,
  19       this list of conditions and the following disclaimer.
  20
  21     * Redistributions in binary form must reproduce the above copyright
  22       notice, this list of conditions and the following disclaimer in the
  23       documentation and/or other materials provided with the distribution.
  24
  25     * Neither the name of the University of Cambridge nor the names of its
  26       contributors may be used to endorse or promote products derived from
  27       this software without specific prior written permission.
  28
  29 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  30 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  31 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  32 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  33 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  34 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  35 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  36 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  37 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  38 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  39 POSSIBILITY OF SUCH DAMAGE.
  40 -----------------------------------------------------------------------------
  41 */
  42
  43
  44 /* Define DEBUG to get debugging output on stdout. */
  45 /* #define DEBUG */
  46
  47 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
  48 inline, and there are *still* stupid compilers about that don't like indented
  49 pre-processor statements. I suppose it's only been 10 years... */
  50
  51 #ifdef DEBUG
  52 #define DPRINTF(p) printf p
  53 #else
  54 #define DPRINTF(p) /*nothing*/
  55 #endif
  56
  57 /* Include the internals header, which itself includes "config.h", the Standard
  58 C headers, and the external pcre header. */
  59
  60 #include "pcre-internal.h"
  61
  62 /* If Unicode Property support is wanted, include a private copy of the
  63 function that does it, and the table that translates names to numbers. */
  64
  65 #ifdef SUPPORT_UCP
  66 #include "ucp.c"
  67 #include "ucptypetable.c"
  68 #endif
  69
  70 /* Maximum number of items on the nested bracket stacks at compile time. This
  71 applies to the nesting of all kinds of parentheses. It does not limit
  72 un-nested, non-capturing parentheses. This number can be made bigger if
  73 necessary - it is used to dimension one int and one unsigned char vector at
  74 compile time. */
  75
  76 #define BRASTACK_SIZE 200
  77
  78
  79 /* Maximum number of ints of offset to save on the stack for recursive calls.
  80 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
  81 because the offset vector is always a multiple of 3 long. */
  82
  83 #define REC_STACK_SAVE_MAX 30
  84
  85
  86 /* The maximum remaining length of subject we are prepared to search for a
  87 req_byte match. */
  88
  89 #define REQ_BYTE_MAX 1000
  90
  91
  92 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
  93 the definition is next to the definition of the opcodes in internal.h. */
  94
  95 static const uschar OP_lengths[] = { OP_LENGTHS };
  96
  97 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
  98
  99 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
 100 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
 101
 102 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
 103 are simple data values; negative values are for special things like \d and so
 104 on. Zero means further processing is needed (for things like \x), or the escape
 105 is invalid. */
 106
 107 #if !EBCDIC   /* This is the "normal" table for ASCII systems */
 108 static const short int escapes[] = {
 109      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
 110      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
 111    '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
 112      0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
 113 -ESC_P, -ESC_Q,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
 114 -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
 115    '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
 116      0,      0,      0,      0,      0,      0,  ESC_n,      0,   /* h - o */
 117 -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */
 118      0,      0, -ESC_z                                            /* x - z */
 119 };
 120
 121 #else         /* This is the "abnormal" table for EBCDIC systems */
 122 static const short int escapes[] = {
 123 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
 124 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
 125 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
 126 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
 127 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
 128 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
 129 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
 130 /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
 131 /*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,
 132 /*  90 */     0,     0,      0,     'l',      0, ESC_n,      0, -ESC_p,
 133 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
 134 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,
 135 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
 136 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
 137 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
 138 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
 139 /*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,
 140 /*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,
 141 /*  D8 */-ESC_Q,     0,      0,       0,      0,     0,      0,      0,
 142 /*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,
 143 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
 144 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
 145 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
 146 };
 147 #endif
 148
 149
 150 /* Tables of names of POSIX character classes and their lengths. The list is
 151 terminated by a zero length entry. The first three must be alpha, upper, lower,
 152 as this is assumed for handling case independence. */
 153
 154 static const char *const posix_names[] = {
 155   "alpha", "lower", "upper",
 156   "alnum", "ascii", "blank", "cntrl", "digit", "graph",
 157   "print", "punct", "space", "word",  "xdigit" };
 158
 159 static const uschar posix_name_lengths[] = {
 160   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
 161
 162 /* Table of class bit maps for each POSIX class; up to three may be combined
 163 to form the class. The table for [:blank:] is dynamically modified to remove
 164 the vertical space characters. */
 165
 166 static const int posix_class_maps[] = {
 167   cbit_lower, cbit_upper, -1,             /* alpha */
 168   cbit_lower, -1,         -1,             /* lower */
 169   cbit_upper, -1,         -1,             /* upper */
 170   cbit_digit, cbit_lower, cbit_upper,     /* alnum */
 171   cbit_print, cbit_cntrl, -1,             /* ascii */
 172   cbit_space, -1,         -1,             /* blank - a GNU extension */
 173   cbit_cntrl, -1,         -1,             /* cntrl */
 174   cbit_digit, -1,         -1,             /* digit */
 175   cbit_graph, -1,         -1,             /* graph */
 176   cbit_print, -1,         -1,             /* print */
 177   cbit_punct, -1,         -1,             /* punct */
 178   cbit_space, -1,         -1,             /* space */
 179   cbit_word,  -1,         -1,             /* word - a Perl extension */
 180   cbit_xdigit,-1,         -1              /* xdigit */
 181 };
 182
 183 /* Table to identify digits and hex digits. This is used when compiling
 184 patterns. Note that the tables in chartables are dependent on the locale, and
 185 may mark arbitrary characters as digits - but the PCRE compiling code expects
 186 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
 187 a private table here. It costs 256 bytes, but it is a lot faster than doing
 188 character value tests (at least in some simple cases I timed), and in some
 189 applications one wants PCRE to compile efficiently as well as match
 190 efficiently.
 191
 192 For convenience, we use the same bit definitions as in chartables:
 193
 194   0x04   decimal digit
 195   0x08   hexadecimal digit
 196
 197 Then we can use ctype_digit and ctype_xdigit in the code. */
 198
 199 #if !EBCDIC    /* This is the "normal" case, for ASCII systems */
 200 static const unsigned char digitab[] =
 201   {
 202   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
 203   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
 204   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
 205   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
 206   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
 207   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
 208   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
 209   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
 210   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
 211   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
 212   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
 213   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
 214   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
 215   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
 216   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
 217   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
 218   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
 219   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
 220   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
 221   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
 222   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
 223   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
 224   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
 225   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
 226   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
 227   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
 228   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
 229   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
 230   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
 231   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
 232   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
 233   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
 234
 235 #else          /* This is the "abnormal" case, for EBCDIC systems */
 236 static const unsigned char digitab[] =
 237   {
 238   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
 239   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
 240   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
 241   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
 242   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
 243   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
 244   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
 245   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
 246   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
 247   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
 248   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
 249   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- ¬     */
 250   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
 251   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
 252   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
 253   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
 254   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
 255   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
 256   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
 257   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
 258   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
 259   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
 260   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
 261   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
 262   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
 263   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
 264   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
 265   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
 266   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
 267   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
 268   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
 269   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
 270
 271 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
 272   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
 273   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
 274   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
 275   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
 276   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
 277   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
 278   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
 279   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
 280   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
 281   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
 282   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
 283   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- ¬  */
 284   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
 285   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
 286   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
 287   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
 288   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
 289   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
 290   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
 291   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
 292   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
 293   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
 294   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
 295   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
 296   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
 297   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
 298   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
 299   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
 300   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
 301   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
 302   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
 303   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
 304 #endif
 305
 306
 307 /* Definition to allow mutual recursion */
 308
 309 static BOOL
 310   compile_regex(int, int, int *, uschar **, const uschar **, const char **,
 311     BOOL, int, int *, int *, branch_chain *, compile_data *);
 312
 313 /* Structure for building a chain of data that actually lives on the
 314 stack, for holding the values of the subject pointer at the start of each
 315 subpattern, so as to detect when an empty string has been matched by a
 316 subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
 317 are on the heap, not on the stack. */
 318
 319 typedef struct eptrblock {
 320   struct eptrblock *epb_prev;
 321   const uschar *epb_saved_eptr;
 322 } eptrblock;
 323
 324 /* Flag bits for the match() function */
 325
 326 #define match_condassert   0x01    /* Called to check a condition assertion */
 327 #define match_isgroup      0x02    /* Set if start of bracketed group */
 328
 329 /* Non-error returns from the match() function. Error returns are externally
 330 defined PCRE_ERROR_xxx codes, which are all negative. */
 331
 332 #define MATCH_MATCH        1
 333 #define MATCH_NOMATCH      0
 334
 335
 336
 337 /*************************************************
 338 *               Global variables                 *
 339 *************************************************/
 340
 341 /* PCRE is thread-clean and doesn't use any global variables in the normal
 342 sense. However, it calls memory allocation and free functions via the four
 343 indirections below, and it can optionally do callouts. These values can be
 344 changed by the caller, but are shared between all threads. However, when
 345 compiling for Virtual Pascal, things are done differently (see pcre.in). */
 346
 347 #ifndef VPCOMPAT
 348 #ifdef __cplusplus
 349 extern "C" void *(*pcre_malloc)(size_t) = malloc;
 350 extern "C" void  (*pcre_free)(void *) = free;
 351 extern "C" void *(*pcre_stack_malloc)(size_t) = malloc;
 352 extern "C" void  (*pcre_stack_free)(void *) = free;
 353 extern "C" int   (*pcre_callout)(pcre_callout_block *) = NULL;
 354 #else
 355 void *(*pcre_malloc)(size_t) = malloc;
 356 void  (*pcre_free)(void *) = free;
 357 void *(*pcre_stack_malloc)(size_t) = malloc;
 358 void  (*pcre_stack_free)(void *) = free;
 359 int   (*pcre_callout)(pcre_callout_block *) = NULL;
 360 #endif
 361 #endif
 362
 363
 364 /*************************************************
 365 *    Macros and tables for character handling    *
 366 *************************************************/
 367
 368 /* When UTF-8 encoding is being used, a character is no longer just a single
 369 byte. The macros for character handling generate simple sequences when used in
 370 byte-mode, and more complicated ones for UTF-8 characters. */
 371
 372 #ifndef SUPPORT_UTF8
 373 #define GETCHAR(c, eptr) c = *eptr;
 374 #define GETCHARINC(c, eptr) c = *eptr++;
 375 #define GETCHARINCTEST(c, eptr) c = *eptr++;
 376 #define GETCHARLEN(c, eptr, len) c = *eptr;
 377 #define BACKCHAR(eptr)
 378
 379 #else   /* SUPPORT_UTF8 */
 380
 381 /* Get the next UTF-8 character, not advancing the pointer. This is called when
 382 we know we are in UTF-8 mode. */
 383
 384 #define GETCHAR(c, eptr) \
 385   c = *eptr; \
 386   if ((c & 0xc0) == 0xc0) \
 387     { \
 388     int gcii; \
 389     int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
 390     int gcss = 6*gcaa; \
 391     c = (c & utf8_table3[gcaa]) << gcss; \
 392     for (gcii = 1; gcii <= gcaa; gcii++) \
 393       { \
 394       gcss -= 6; \
 395       c |= (eptr[gcii] & 0x3f) << gcss; \
 396       } \
 397     }
 398
 399 /* Get the next UTF-8 character, advancing the pointer. This is called when we
 400 know we are in UTF-8 mode. */
 401
 402 #define GETCHARINC(c, eptr) \
 403   c = *eptr++; \
 404   if ((c & 0xc0) == 0xc0) \
 405     { \
 406     int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
 407     int gcss = 6*gcaa; \
 408     c = (c & utf8_table3[gcaa]) << gcss; \
 409     while (gcaa-- > 0) \
 410       { \
 411       gcss -= 6; \
 412       c |= (*eptr++ & 0x3f) << gcss; \
 413       } \
 414     }
 415
 416 /* Get the next character, testing for UTF-8 mode, and advancing the pointer */
 417
 418 #define GETCHARINCTEST(c, eptr) \
 419   c = *eptr++; \
 420   if (md->utf8 && (c & 0xc0) == 0xc0) \
 421     { \
 422     int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
 423     int gcss = 6*gcaa; \
 424     c = (c & utf8_table3[gcaa]) << gcss; \
 425     while (gcaa-- > 0) \
 426       { \
 427       gcss -= 6; \
 428       c |= (*eptr++ & 0x3f) << gcss; \
 429       } \
 430     }
 431
 432 /* Get the next UTF-8 character, not advancing the pointer, incrementing length
 433 if there are extra bytes. This is called when we know we are in UTF-8 mode. */
 434
 435 #define GETCHARLEN(c, eptr, len) \
 436   c = *eptr; \
 437   if ((c & 0xc0) == 0xc0) \
 438     { \
 439     int gcii; \
 440     int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
 441     int gcss = 6*gcaa; \
 442     c = (c & utf8_table3[gcaa]) << gcss; \
 443     for (gcii = 1; gcii <= gcaa; gcii++) \
 444       { \
 445       gcss -= 6; \
 446       c |= (eptr[gcii] & 0x3f) << gcss; \
 447       } \
 448     len += gcaa; \
 449     }
 450
 451 /* If the pointer is not at the start of a character, move it back until
 452 it is. Called only in UTF-8 mode. */
 453
 454 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
 455
 456 #endif
 457
 458
 459
 460 /*************************************************
 461 *             Default character tables           *
 462 *************************************************/
 463
 464 /* A default set of character tables is included in the PCRE binary. Its source
 465 is built by the maketables auxiliary program, which uses the default C ctypes
 466 functions, and put in the file chartables.c. These tables are used by PCRE
 467 whenever the caller of pcre_compile() does not provide an alternate set of
 468 tables. */
 469
 470 #include "pcre-chartables.c"
 471
 472
 473
 474 #ifdef SUPPORT_UTF8
 475 /*************************************************
 476 *           Tables for UTF-8 support             *
 477 *************************************************/
 478
 479 /* These are the breakpoints for different numbers of bytes in a UTF-8
 480 character. */
 481
 482 static const int utf8_table1[] =
 483   { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
 484
 485 /* These are the indicator bits and the mask for the data bits to set in the
 486 first byte of a character, indexed by the number of additional bytes. */
 487
 488 static const int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
 489 static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
 490
 491 /* Table of the number of extra characters, indexed by the first character
 492 masked with 0x3f. The highest number for a valid UTF-8 character is in fact
 493 0x3d. */
 494
 495 static const uschar utf8_table4[] = {
 496   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 497   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 498   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
 499   3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
 500
 501
 502 /*************************************************
 503 *       Convert character value to UTF-8         *
 504 *************************************************/
 505
 506 /* This function takes an integer value in the range 0 - 0x7fffffff
 507 and encodes it as a UTF-8 character in 0 to 6 bytes.
 508
 509 Arguments:
 510   cvalue     the character value
 511   buffer     pointer to buffer for result - at least 6 bytes long
 512
 513 Returns:     number of characters placed in the buffer
 514 */
 515
 516 static int
 517 ord2utf8(int cvalue, uschar *buffer)
 518 {
 519 register int i, j;
 520 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
 521   if (cvalue <= utf8_table1[i]) break;
 522 buffer += i;
 523 for (j = i; j > 0; j--)
 524  {
 525  *buffer-- = 0x80 | (cvalue & 0x3f);
 526  cvalue >>= 6;
 527  }
 528 *buffer = utf8_table2[i] | cvalue;
 529 return i + 1;
 530 }
 531 #endif
 532
 533
 534
 535 /*************************************************
 536 *         Print compiled regex                   *
 537 *************************************************/
 538
 539 /* The code for doing this is held in a separate file that is also included in
 540 pcretest.c. It defines a function called print_internals(). */
 541
 542 #ifdef DEBUG
 543 #include "printint.c"
 544 #endif
 545
 546
 547
 548 /*************************************************
 549 *          Return version string                 *
 550 *************************************************/
 551
 552 #define STRING(a)  # a
 553 #define XSTRING(s) STRING(s)
 554
 555 EXPORT const char *
 556 pcre_version(void)
 557 {
 558 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
 559 }
 560
 561
 562
 563
 564 /*************************************************
 565 *         Flip bytes in an integer               *
 566 *************************************************/
 567
 568 /* This function is called when the magic number in a regex doesn't match in
 569 order to flip its bytes to see if we are dealing with a pattern that was
 570 compiled on a host of different endianness. If so, this function is used to
 571 flip other byte values.
 572
 573 Arguments:
 574   value        the number to flip
 575   n            the number of bytes to flip (assumed to be 2 or 4)
 576
 577 Returns:       the flipped value
 578 */
 579
 580 static long int
 581 byteflip(long int value, int n)
 582 {
 583 if (n == 2) return ((value & 0x00ff) << 8) | ((value & 0xff00) >> 8);
 584 return ((value & 0x000000ff) << 24) |
 585        ((value & 0x0000ff00) <<  8) |
 586        ((value & 0x00ff0000) >>  8) |
 587        ((value & 0xff000000) >> 24);
 588 }
 589
 590
 591
 592 /*************************************************
 593 *       Test for a byte-flipped compiled regex   *
 594 *************************************************/
 595
 596 /* This function is called from pce_exec() and also from pcre_fullinfo(). Its
 597 job is to test whether the regex is byte-flipped - that is, it was compiled on
 598 a system of opposite endianness. The function is called only when the native
 599 MAGIC_NUMBER test fails. If the regex is indeed flipped, we flip all the
 600 relevant values into a different data block, and return it.
 601
 602 Arguments:
 603   re               points to the regex
 604   study            points to study data, or NULL
 605   internal_re      points to a new regex block
 606   internal_study   points to a new study block
 607
 608 Returns:           the new block if is is indeed a byte-flipped regex
 609                    NULL if it is not
 610 */
 611
 612 static real_pcre *
 613 try_flipped(const real_pcre *re, real_pcre *internal_re,
 614   const pcre_study_data *study, pcre_study_data *internal_study)
 615 {
 616 if (byteflip(re->magic_number, sizeof(re->magic_number)) != MAGIC_NUMBER)
 617   return NULL;
 618
 619 *internal_re = *re;           /* To copy other fields */
 620 internal_re->size = byteflip(re->size, sizeof(re->size));
 621 internal_re->options = byteflip(re->options, sizeof(re->options));
 622 internal_re->top_bracket = byteflip(re->top_bracket, sizeof(re->top_bracket));
 623 internal_re->top_backref = byteflip(re->top_backref, sizeof(re->top_backref));
 624 internal_re->first_byte = byteflip(re->first_byte, sizeof(re->first_byte));
 625 internal_re->req_byte = byteflip(re->req_byte, sizeof(re->req_byte));
 626 internal_re->name_table_offset = byteflip(re->name_table_offset,
 627   sizeof(re->name_table_offset));
 628 internal_re->name_entry_size = byteflip(re->name_entry_size,
 629   sizeof(re->name_entry_size));
 630 internal_re->name_count = byteflip(re->name_count, sizeof(re->name_count));
 631
 632 if (study != NULL)
 633   {
 634   *internal_study = *study;   /* To copy other fields */
 635   internal_study->size = byteflip(study->size, sizeof(study->size));
 636   internal_study->options = byteflip(study->options, sizeof(study->options));
 637   }
 638
 639 return internal_re;
 640 }
 641
 642
 643
 644 /*************************************************
 645 * (Obsolete) Return info about compiled pattern  *
 646 *************************************************/
 647
 648 /* This is the original "info" function. It picks potentially useful data out
 649 of the private structure, but its interface was too rigid. It remains for
 650 backwards compatibility. The public options are passed back in an int - though
 651 the re->options field has been expanded to a long int, all the public options
 652 at the low end of it, and so even on 16-bit systems this will still be OK.
 653 Therefore, I haven't changed the API for pcre_info().
 654
 655 Arguments:
 656   argument_re   points to compiled code
 657   optptr        where to pass back the options
 658   first_byte    where to pass back the first character,
 659                 or -1 if multiline and all branches start ^,
 660                 or -2 otherwise
 661
 662 Returns:        number of capturing subpatterns
 663                 or negative values on error
 664 */
 665
 666 EXPORT int
 667 pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
 668 {
 669 real_pcre internal_re;
 670 const real_pcre *re = (const real_pcre *)argument_re;
 671 if (re == NULL) return PCRE_ERROR_NULL;
 672 if (re->magic_number != MAGIC_NUMBER)
 673   {
 674   re = try_flipped(re, &internal_re, NULL, NULL);
 675   if (re == NULL) return PCRE_ERROR_BADMAGIC;
 676   }
 677 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
 678 if (first_byte != NULL)
 679   *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
 680      ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
 681 return re->top_bracket;
 682 }
 683
 684
 685
 686 /*************************************************
 687 *        Return info about compiled pattern      *
 688 *************************************************/
 689
 690 /* This is a newer "info" function which has an extensible interface so
 691 that additional items can be added compatibly.
 692
 693 Arguments:
 694   argument_re      points to compiled code
 695   extra_data       points extra data, or NULL
 696   what             what information is required
 697   where            where to put the information
 698
 699 Returns:           0 if data returned, negative on error
 700 */
 701
 702 EXPORT int
 703 pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
 704   void *where)
 705 {
 706 real_pcre internal_re;
 707 pcre_study_data internal_study;
 708 const real_pcre *re = (const real_pcre *)argument_re;
 709 const pcre_study_data *study = NULL;
 710
 711 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
 712
 713 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
 714   study = (const pcre_study_data *)extra_data->study_data;
 715
 716 if (re->magic_number != MAGIC_NUMBER)
 717   {
 718   re = try_flipped(re, &internal_re, study, &internal_study);
 719   if (re == NULL) return PCRE_ERROR_BADMAGIC;
 720   if (study != NULL) study = &internal_study;
 721   }
 722
 723 switch (what)
 724   {
 725   case PCRE_INFO_OPTIONS:
 726   *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
 727   break;
 728
 729   case PCRE_INFO_SIZE:
 730   *((size_t *)where) = re->size;
 731   break;
 732
 733   case PCRE_INFO_STUDYSIZE:
 734   *((size_t *)where) = (study == NULL)? 0 : study->size;
 735   break;
 736
 737   case PCRE_INFO_CAPTURECOUNT:
 738   *((int *)where) = re->top_bracket;
 739   break;
 740
 741   case PCRE_INFO_BACKREFMAX:
 742   *((int *)where) = re->top_backref;
 743   break;
 744
 745   case PCRE_INFO_FIRSTBYTE:
 746   *((int *)where) =
 747     ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
 748     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
 749   break;
 750
 751   /* Make sure we pass back the pointer to the bit vector in the external
 752   block, not the internal copy (with flipped integer fields). */
 753
 754   case PCRE_INFO_FIRSTTABLE:
 755   *((const uschar **)where) =
 756     (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
 757       ((const pcre_study_data *)extra_data->study_data)->start_bits : NULL;
 758   break;
 759
 760   case PCRE_INFO_LASTLITERAL:
 761   *((int *)where) =
 762     ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
 763   break;
 764
 765   case PCRE_INFO_NAMEENTRYSIZE:
 766   *((int *)where) = re->name_entry_size;
 767   break;
 768
 769   case PCRE_INFO_NAMECOUNT:
 770   *((int *)where) = re->name_count;
 771   break;
 772
 773   case PCRE_INFO_NAMETABLE:
 774   *((const uschar **)where) = (const uschar *)re + re->name_table_offset;
 775   break;
 776
 777   case PCRE_INFO_DEFAULT_TABLES:
 778   *((const uschar **)where) = (const uschar *)pcre_default_tables;
 779   break;
 780
 781   default: return PCRE_ERROR_BADOPTION;
 782   }
 783
 784 return 0;
 785 }
 786
 787
 788
 789 /*************************************************
 790 * Return info about what features are configured *
 791 *************************************************/
 792
 793 /* This is function which has an extensible interface so that additional items
 794 can be added compatibly.
 795
 796 Arguments:
 797   what             what information is required
 798   where            where to put the information
 799
 800 Returns:           0 if data returned, negative on error
 801 */
 802
 803 EXPORT int
 804 pcre_config(int what, void *where)
 805 {
 806 switch (what)
 807   {
 808   case PCRE_CONFIG_UTF8:
 809 #ifdef SUPPORT_UTF8
 810   *((int *)where) = 1;
 811 #else
 812   *((int *)where) = 0;
 813 #endif
 814   break;
 815
 816   case PCRE_CONFIG_UNICODE_PROPERTIES:
 817 #ifdef SUPPORT_UCP
 818   *((int *)where) = 1;
 819 #else
 820   *((int *)where) = 0;
 821 #endif
 822   break;
 823
 824   case PCRE_CONFIG_NEWLINE:
 825   *((int *)where) = NEWLINE;
 826   break;
 827
 828   case PCRE_CONFIG_LINK_SIZE:
 829   *((int *)where) = LINK_SIZE;
 830   break;
 831
 832   case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
 833   *((int *)where) = POSIX_MALLOC_THRESHOLD;
 834   break;
 835
 836   case PCRE_CONFIG_MATCH_LIMIT:
 837   *((unsigned int *)where) = MATCH_LIMIT;
 838   break;
 839
 840   case PCRE_CONFIG_STACKRECURSE:
 841 #ifdef NO_RECURSE
 842   *((int *)where) = 0;
 843 #else
 844   *((int *)where) = 1;
 845 #endif
 846   break;
 847
 848   default: return PCRE_ERROR_BADOPTION;
 849   }
 850
 851 return 0;
 852 }
 853
 854
 855
 856 #ifdef DEBUG
 857 /*************************************************
 858 *        Debugging function to print chars       *
 859 *************************************************/
 860
 861 /* Print a sequence of chars in printable format, stopping at the end of the
 862 subject if the requested.
 863
 864 Arguments:
 865   p           points to characters
 866   length      number to print
 867   is_subject  TRUE if printing from within md->start_subject
 868   md          pointer to matching data block, if is_subject is TRUE
 869
 870 Returns:     nothing
 871 */
 872
 873 static void
 874 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
 875 {
 876 int c;
 877 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
 878 while (length-- > 0)
 879   if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
 880 }
 881 #endif
 882
 883
 884
 885
 886 /*************************************************
 887 *            Handle escapes                      *
 888 *************************************************/
 889
 890 /* This function is called when a \ has been encountered. It either returns a
 891 positive value for a simple escape such as \n, or a negative value which
 892 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
 893 a positive value greater than 255 may be returned. On entry, ptr is pointing at
 894 the \. On exit, it is on the final character of the escape sequence.
 895
 896 Arguments:
 897   ptrptr     points to the pattern position pointer
 898   errorptr   points to the pointer to the error message
 899   bracount   number of previous extracting brackets
 900   options    the options bits
 901   isclass    TRUE if inside a character class
 902
 903 Returns:     zero or positive => a data character
 904              negative => a special escape sequence
 905              on error, errorptr is set
 906 */
 907
 908 static int
 909 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
 910   int options, BOOL isclass)
 911 {
 912 const uschar *ptr = *ptrptr;
 913 int c, i;
 914
 915 /* If backslash is at the end of the pattern, it's an error. */
 916
 917 c = *(++ptr);
 918 if (c == 0) *errorptr = ERR1;
 919
 920 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
 921 a table. A non-zero result is something that can be returned immediately.
 922 Otherwise further processing may be required. */
 923
 924 #if !EBCDIC    /* ASCII coding */
 925 else if (c < '0' || c > 'z') {}                           /* Not alphameric */
 926 else if ((i = escapes[c - '0']) != 0) c = i;
 927
 928 #else          /* EBCDIC coding */
 929 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
 930 else if ((i = escapes[c - 0x48]) != 0)  c = i;
 931 #endif
 932
 933 /* Escapes that need further processing, or are illegal. */
 934
 935 else
 936   {
 937   const uschar *oldptr;
 938   switch (c)
 939     {
 940     /* A number of Perl escapes are not handled by PCRE. We give an explicit
 941     error. */
 942
 943     case 'l':
 944     case 'L':
 945     case 'N':
 946     case 'u':
 947     case 'U':
 948     *errorptr = ERR37;
 949     break;
 950
 951     /* The handling of escape sequences consisting of a string of digits
 952     starting with one that is not zero is not straightforward. By experiment,
 953     the way Perl works seems to be as follows:
 954
 955     Outside a character class, the digits are read as a decimal number. If the
 956     number is less than 10, or if there are that many previous extracting
 957     left brackets, then it is a back reference. Otherwise, up to three octal
 958     digits are read to form an escaped byte. Thus \123 is likely to be octal
 959     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
 960     value is greater than 377, the least significant 8 bits are taken. Inside a
 961     character class, \ followed by a digit is always an octal number. */
 962
 963     case '1': case '2': case '3': case '4': case '5':
 964     case '6': case '7': case '8': case '9':
 965
 966     if (!isclass)
 967       {
 968       oldptr = ptr;
 969       c -= '0';
 970       while ((digitab[ptr[1]] & ctype_digit) != 0)
 971         c = c * 10 + *(++ptr) - '0';
 972       if (c < 10 || c <= bracount)
 973         {
 974         c = -(ESC_REF + c);
 975         break;
 976         }
 977       ptr = oldptr;      /* Put the pointer back and fall through */
 978       }
 979
 980     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
 981     generates a binary zero byte and treats the digit as a following literal.
 982     Thus we have to pull back the pointer by one. */
 983
 984     if ((c = *ptr) >= '8')
 985       {
 986       ptr--;
 987       c = 0;
 988       break;
 989       }
 990
 991     /* \0 always starts an octal number, but we may drop through to here with a
 992     larger first octal digit. */
 993
 994     case '0':
 995     c -= '0';
 996     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
 997         c = c * 8 + *(++ptr) - '0';
 998     c &= 255;     /* Take least significant 8 bits */
 999     break;
1000
1001     /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
1002     which can be greater than 0xff, but only if the ddd are hex digits. */
1003
1004     case 'x':
1005 #ifdef SUPPORT_UTF8
1006     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
1007       {
1008       const uschar *pt = ptr + 2;
1009       register int count = 0;
1010       c = 0;
1011       while ((digitab[*pt] & ctype_xdigit) != 0)
1012         {
1013         int cc = *pt++;
1014         count++;
1015 #if !EBCDIC    /* ASCII coding */
1016         if (cc >= 'a') cc -= 32;               /* Convert to upper case */
1017         c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
1018 #else          /* EBCDIC coding */
1019         if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
1020         c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
1021 #endif
1022         }
1023       if (*pt == '}')
1024         {
1025         if (c < 0 || count > 8) *errorptr = ERR34;
1026         ptr = pt;
1027         break;
1028         }
1029       /* If the sequence of hex digits does not end with '}', then we don't
1030       recognize this construct; fall through to the normal \x handling. */
1031       }
1032 #endif
1033
1034     /* Read just a single hex char */
1035
1036     c = 0;
1037     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
1038       {
1039       int cc;                               /* Some compilers don't like ++ */
1040       cc = *(++ptr);                        /* in initializers */
1041 #if !EBCDIC    /* ASCII coding */
1042       if (cc >= 'a') cc -= 32;              /* Convert to upper case */
1043       c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
1044 #else          /* EBCDIC coding */
1045       if (cc <= 'z') cc += 64;              /* Convert to upper case */
1046       c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
1047 #endif
1048       }
1049     break;
1050
1051     /* Other special escapes not starting with a digit are straightforward */
1052
1053     case 'c':
1054     c = *(++ptr);
1055     if (c == 0)
1056       {
1057       *errorptr = ERR2;
1058       return 0;
1059       }
1060
1061     /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
1062     is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
1063     (However, an EBCDIC equivalent has now been added.) */
1064
1065 #if !EBCDIC    /* ASCII coding */
1066     if (c >= 'a' && c <= 'z') c -= 32;
1067     c ^= 0x40;
1068 #else          /* EBCDIC coding */
1069     if (c >= 'a' && c <= 'z') c += 64;
1070     c ^= 0xC0;
1071 #endif
1072     break;
1073
1074     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1075     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1076     for Perl compatibility, it is a literal. This code looks a bit odd, but
1077     there used to be some cases other than the default, and there may be again
1078     in future, so I haven't "optimized" it. */
1079
1080     default:
1081     if ((options & PCRE_EXTRA) != 0) switch(c)
1082       {
1083       default:
1084       *errorptr = ERR3;
1085       break;
1086       }
1087     break;
1088     }
1089   }
1090
1091 *ptrptr = ptr;
1092 return c;
1093 }
1094
1095
1096
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence. The property name is either the single character that follows,
or one or two characters enclosed in {}, where a ^ straight after the { asks
for negation. The name is looked up in the utt table.

Argument:
  ptrptr     points to the pattern position pointer
  negptr     points to a boolean that is set TRUE for negation else FALSE
  errorptr   points to the pointer to the error message

Returns:     value from ucp_type_table, or -1 for an invalid type
               (errorptr is then set to ERR46 for a malformed sequence, or to
               ERR47 for a name that is not in the table)
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, const char **errorptr)
{
const uschar *ptr = *ptrptr;
char name[4];
int ch, len, lo, hi;

ch = *(++ptr);
if (ch == 0) goto MALFORMED;

*negptr = FALSE;

/* The short form: just one following character */

if (ch != '{')
  {
  name[0] = ch;
  name[1] = 0;
  }

/* The braced form: a one- or two-character name, optionally preceded by ^ for
negation. */

else
  {
  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  /* Collect characters until the closing brace. The third character looked at
  has to be that brace if the name is to be acceptable. */

  len = 0;
  while (len <= 2)
    {
    ch = *(++ptr);
    if (ch == 0) goto MALFORMED;
    if (ch == '}') break;
    name[len++] = ch;
    }

  /* The name is too long. Try to distinguish the error cases: if a closing
  brace turns up later the name is merely unknown; if not, the sequence is
  malformed. */

  if (ch != '}')
    {
    do ptr++; while (*ptr != 0 && *ptr != '}');
    if (*ptr == '}') goto UNRECOGNIZED;
    goto MALFORMED;
    }

  name[len] = 0;
  }

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

lo = 0;
hi = sizeof(utt)/sizeof(ucp_type_table);

while (lo < hi)
  {
  int mid = (lo + hi)/2;
  int cmp = strcmp(name, utt[mid].name);
  if (cmp == 0) return utt[mid].value;
  if (cmp > 0) lo = mid + 1; else hi = mid;
  }

/* Not in the table: drop into the unknown-name exit */

UNRECOGNIZED:
*errorptr = ERR47;
*ptrptr = ptr;
return -1;

MALFORMED:
*errorptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
1186
1187
1188
1189
1190 /*************************************************
1191 *            Check for counted repeat            *
1192 *************************************************/
1193
1194 /* This function is called when a '{' is encountered in a place where it might
1195 start a quantifier. It looks ahead to see if it really is a quantifier or not.
1196 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1197 where the ddds are digits.
1198
1199 Arguments:
1200   p         pointer to the first char after '{'
1201
1202 Returns:    TRUE or FALSE
1203 */
1204
1205 static BOOL
1206 is_counted_repeat(const uschar *p)
1207 {
1208 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1209 while ((digitab[*p] & ctype_digit) != 0) p++;
1210 if (*p == '}') return TRUE;
1211
1212 if (*p++ != ',') return FALSE;
1213 if (*p == '}') return TRUE;
1214
1215 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1216 while ((digitab[*p] & ctype_digit) != 0) p++;
1217
1218 return (*p == '}');
1219 }
1220
1221
1222
1223 /*************************************************
1224 *         Read repeat counts                     *
1225 *************************************************/
1226
1227 /* Read an item of the form {n,m} and return the values. This is called only
1228 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1229 so the syntax is guaranteed to be correct, but we need to check the values.
1230
1231 Arguments:
1232   p          pointer to first char after '{'
1233   minp       pointer to int for min
1234   maxp       pointer to int for max
1235              returned as -1 if no max
1236   errorptr   points to pointer to error message
1237
1238 Returns:     pointer to '}' on success;
1239              current ptr on error, with errorptr set
1240 */
1241
1242 static const uschar *
1243 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
1244 {
1245 int min = 0;
1246 int max = -1;
1247
1248 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1249
1250 if (*p == '}') max = min; else
1251   {
1252   if (*(++p) != '}')
1253     {
1254     max = 0;
1255     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
1256     if (max < min)
1257       {
1258       *errorptr = ERR4;
1259       return p;
1260       }
1261     }
1262   }
1263
1264 /* Do paranoid checks, then fill in the required variables, and pass back the
1265 pointer to the terminating '}'. */
1266
1267 if (min > 65535 || max > 65535)
1268   *errorptr = ERR5;
1269 else
1270   {
1271   *minp = min;
1272   *maxp = max;
1273   }
1274 return p;
1275 }
1276
1277
1278
1279 /*************************************************
1280 *      Find first significant op code            *
1281 *************************************************/
1282
1283 /* This is called by several functions that scan a compiled expression looking
1284 for a fixed first character, or an anchoring op code etc. It skips over things
1285 that do not influence this. For some calls, a change of option is important.
1286 For some calls, it makes sense to skip negative forward and all backward
1287 assertions, and also the \b assertion; for others it does not.
1288
1289 Arguments:
1290   code         pointer to the start of the group
1291   options      pointer to external options
1292   optbit       the option bit whose changing is significant, or
1293                  zero if none are
1294   skipassert   TRUE if certain assertions are to be skipped
1295
1296 Returns:       pointer to the first significant opcode
1297 */
1298
1299 static const uschar*
1300 first_significant_code(const uschar *code, int *options, int optbit,
1301   BOOL skipassert)
1302 {
1303 for (;;)
1304   {
1305   switch ((int)*code)
1306     {
1307     case OP_OPT:
1308     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1309       *options = (int)code[1];
1310     code += 2;
1311     break;
1312
1313     case OP_ASSERT_NOT:
1314     case OP_ASSERTBACK:
1315     case OP_ASSERTBACK_NOT:
1316     if (!skipassert) return code;
1317     do code += GET(code, 1); while (*code == OP_ALT);
1318     code += OP_lengths[*code];
1319     break;
1320
1321     case OP_WORD_BOUNDARY:
1322     case OP_NOT_WORD_BOUNDARY:
1323     if (!skipassert) return code;
1324     /* Fall through */
1325
1326     case OP_CALLOUT:
1327     case OP_CREF:
1328     case OP_BRANUMBER:
1329     code += OP_lengths[*code];
1330     break;
1331
1332     default:
1333     return code;
1334     }
1335   }
1336 /* Control never reaches here */
1337 }
1338
1339
1340
1341
1342 /*************************************************
1343 *        Find the fixed length of a pattern      *
1344 *************************************************/
1345
1346 /* Scan a pattern and compute the fixed length of subject that will match it,
1347 if the length is fixed. This is needed for dealing with backward assertions.
1348 In UTF8 mode, the result is in characters rather than bytes.
1349
1350 Arguments:
1351   code     points to the start of the pattern (the bracket)
1352   options  the compiling options
1353
1354 Returns:   the fixed length, or -1 if there is no fixed length,
1355              or -2 if \C was encountered
1356 */
1357
1358 static int
1359 find_fixedlength(uschar *code, int options)
1360 {
1361 int length = -1;
1362
1363 register int branchlength = 0;
1364 register uschar *cc = code + 1 + LINK_SIZE;
1365
1366 /* Scan along the opcodes for this branch. If we get to the end of the
1367 branch, check the length against that of the other branches. */
1368
1369 for (;;)
1370   {
1371   int d;
1372   register int op = *cc;
1373   if (op >= OP_BRA) op = OP_BRA;
1374
1375   switch (op)
1376     {
1377     case OP_BRA:
1378     case OP_ONCE:
1379     case OP_COND:
1380     d = find_fixedlength(cc, options);
1381     if (d < 0) return d;
1382     branchlength += d;
1383     do cc += GET(cc, 1); while (*cc == OP_ALT);
1384     cc += 1 + LINK_SIZE;
1385     break;
1386
1387     /* Reached end of a branch; if it's a ket it is the end of a nested
1388     call. If it's ALT it is an alternation in a nested call. If it is
1389     END it's the end of the outer call. All can be handled by the same code. */
1390
1391     case OP_ALT:
1392     case OP_KET:
1393     case OP_KETRMAX:
1394     case OP_KETRMIN:
1395     case OP_END:
1396     if (length < 0) length = branchlength;
1397       else if (length != branchlength) return -1;
1398     if (*cc != OP_ALT) return length;
1399     cc += 1 + LINK_SIZE;
1400     branchlength = 0;
1401     break;
1402
1403     /* Skip over assertive subpatterns */
1404
1405     case OP_ASSERT:
1406     case OP_ASSERT_NOT:
1407     case OP_ASSERTBACK:
1408     case OP_ASSERTBACK_NOT:
1409     do cc += GET(cc, 1); while (*cc == OP_ALT);
1410     /* Fall through */
1411
1412     /* Skip over things that don't match chars */
1413
1414     case OP_REVERSE:
1415     case OP_BRANUMBER:
1416     case OP_CREF:
1417     case OP_OPT:
1418     case OP_CALLOUT:
1419     case OP_SOD:
1420     case OP_SOM:
1421     case OP_EOD:
1422     case OP_EODN:
1423     case OP_CIRC:
1424     case OP_DOLL:
1425     case OP_NOT_WORD_BOUNDARY:
1426     case OP_WORD_BOUNDARY:
1427     cc += OP_lengths[*cc];
1428     break;
1429
1430     /* Handle literal characters */
1431
1432     case OP_CHAR:
1433     case OP_CHARNC:
1434     branchlength++;
1435     cc += 2;
1436 #ifdef SUPPORT_UTF8
1437     if ((options & PCRE_UTF8) != 0)
1438       {
1439       while ((*cc & 0xc0) == 0x80) cc++;
1440       }
1441 #endif
1442     break;
1443
1444     /* Handle exact repetitions. The count is already in characters, but we
1445     need to skip over a multibyte character in UTF8 mode.  */
1446
1447     case OP_EXACT:
1448     branchlength += GET2(cc,1);
1449     cc += 4;
1450 #ifdef SUPPORT_UTF8
1451     if ((options & PCRE_UTF8) != 0)
1452       {
1453       while((*cc & 0x80) == 0x80) cc++;
1454       }
1455 #endif
1456     break;
1457
1458     case OP_TYPEEXACT:
1459     branchlength += GET2(cc,1);
1460     cc += 4;
1461     break;
1462
1463     /* Handle single-char matchers */
1464
1465     case OP_PROP:
1466     case OP_NOTPROP:
1467     cc++;
1468     /* Fall through */
1469
1470     case OP_NOT_DIGIT:
1471     case OP_DIGIT:
1472     case OP_NOT_WHITESPACE:
1473     case OP_WHITESPACE:
1474     case OP_NOT_WORDCHAR:
1475     case OP_WORDCHAR:
1476     case OP_ANY:
1477     branchlength++;
1478     cc++;
1479     break;
1480
1481     /* The single-byte matcher isn't allowed */
1482
1483     case OP_ANYBYTE:
1484     return -2;
1485
1486     /* Check a class for variable quantification */
1487
1488 #ifdef SUPPORT_UTF8
1489     case OP_XCLASS:
1490     cc += GET(cc, 1) - 33;
1491     /* Fall through */
1492 #endif
1493
1494     case OP_CLASS:
1495     case OP_NCLASS:
1496     cc += 33;
1497
1498     switch (*cc)
1499       {
1500       case OP_CRSTAR:
1501       case OP_CRMINSTAR:
1502       case OP_CRQUERY:
1503       case OP_CRMINQUERY:
1504       return -1;
1505
1506       case OP_CRRANGE:
1507       case OP_CRMINRANGE:
1508       if (GET2(cc,1) != GET2(cc,3)) return -1;
1509       branchlength += GET2(cc,1);
1510       cc += 5;
1511       break;
1512
1513       default:
1514       branchlength++;
1515       }
1516     break;
1517
1518     /* Anything else is variable length */
1519
1520     default:
1521     return -1;
1522     }
1523   }
1524 /* Control never gets here */
1525 }
1526
1527
1528
1529
1530 /*************************************************
1531 *    Scan compiled regex for numbered bracket    *
1532 *************************************************/
1533
1534 /* This little function scans through a compiled pattern until it finds a
1535 capturing bracket with the given number.
1536
1537 Arguments:
1538   code        points to start of expression
1539   utf8        TRUE in UTF-8 mode
1540   number      the required bracket number
1541
1542 Returns:      pointer to the opcode for the bracket, or NULL if not found
1543 */
1544
1545 static const uschar *
1546 find_bracket(const uschar *code, BOOL utf8, int number)
1547 {
1548 #ifndef SUPPORT_UTF8
1549 utf8 = utf8;               /* Stop pedantic compilers complaining */
1550 #endif
1551
1552 for (;;)
1553   {
1554   register int c = *code;
1555   if (c == OP_END) return NULL;
1556   else if (c > OP_BRA)
1557     {
1558     int n = c - OP_BRA;
1559     if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1560     if (n == number) return (uschar *)code;
1561     code += OP_lengths[OP_BRA];
1562     }
1563   else
1564     {
1565     code += OP_lengths[c];
1566
1567 #ifdef SUPPORT_UTF8
1568
1569     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1570     by a multi-byte character. The length in the table is a minimum, so we have
1571     to scan along to skip the extra bytes. All opcodes are less than 128, so we
1572     can use relatively efficient code. */
1573
1574     if (utf8) switch(c)
1575       {
1576       case OP_CHAR:
1577       case OP_CHARNC:
1578       case OP_EXACT:
1579       case OP_UPTO:
1580       case OP_MINUPTO:
1581       case OP_STAR:
1582       case OP_MINSTAR:
1583       case OP_PLUS:
1584       case OP_MINPLUS:
1585       case OP_QUERY:
1586       case OP_MINQUERY:
1587       while ((*code & 0xc0) == 0x80) code++;
1588       break;
1589
1590       /* XCLASS is used for classes that cannot be represented just by a bit
1591       map. This includes negated single high-valued characters. The length in
1592       the table is zero; the actual length is stored in the compiled code. */
1593
1594       case OP_XCLASS:
1595       code += GET(code, 1) + 1;
1596       break;
1597       }
1598 #endif
1599     }
1600   }
1601 }
1602
1603
1604
1605 /*************************************************
1606 *   Scan compiled regex for recursion reference  *
1607 *************************************************/
1608
1609 /* This little function scans through a compiled pattern until it finds an
1610 instance of OP_RECURSE.
1611
1612 Arguments:
1613   code        points to start of expression
1614   utf8        TRUE in UTF-8 mode
1615
1616 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
1617 */
1618
1619 static const uschar *
1620 find_recurse(const uschar *code, BOOL utf8)
1621 {
1622 #ifndef SUPPORT_UTF8
1623 utf8 = utf8;               /* Stop pedantic compilers complaining */
1624 #endif
1625
1626 for (;;)
1627   {
1628   register int c = *code;
1629   if (c == OP_END) return NULL;
1630   else if (c == OP_RECURSE) return code;
1631   else if (c > OP_BRA)
1632     {
1633     code += OP_lengths[OP_BRA];
1634     }
1635   else
1636     {
1637     code += OP_lengths[c];
1638
1639 #ifdef SUPPORT_UTF8
1640
1641     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1642     by a multi-byte character. The length in the table is a minimum, so we have
1643     to scan along to skip the extra bytes. All opcodes are less than 128, so we
1644     can use relatively efficient code. */
1645
1646     if (utf8) switch(c)
1647       {
1648       case OP_CHAR:
1649       case OP_CHARNC:
1650       case OP_EXACT:
1651       case OP_UPTO:
1652       case OP_MINUPTO:
1653       case OP_STAR:
1654       case OP_MINSTAR:
1655       case OP_PLUS:
1656       case OP_MINPLUS:
1657       case OP_QUERY:
1658       case OP_MINQUERY:
1659       while ((*code & 0xc0) == 0x80) code++;
1660       break;
1661
1662       /* XCLASS is used for classes that cannot be represented just by a bit
1663       map. This includes negated single high-valued characters. The length in
1664       the table is zero; the actual length is stored in the compiled code. */
1665
1666       case OP_XCLASS:
1667       code += GET(code, 1) + 1;
1668       break;
1669       }
1670 #endif
1671     }
1672   }
1673 }
1674
1675
1676
1677 /*************************************************
1678 *    Scan compiled branch for non-emptiness      *
1679 *************************************************/
1680
1681 /* This function scans through a branch of a compiled pattern to see whether it
1682 can match the empty string or not. It is called only from could_be_empty()
1683 below. Note that first_significant_code() skips over assertions. If we hit an
1684 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1685 whose current branch will already have been scanned.
1686
1687 Arguments:
1688   code        points to start of search
1689   endcode     points to where to stop
1690   utf8        TRUE if in UTF8 mode
1691
1692 Returns:      TRUE if what is matched could be empty
1693 */
1694
1695 static BOOL
1696 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1697 {
1698 register int c;
1699 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1700      code < endcode;
1701      code = first_significant_code(code + OP_lengths[c], NULL, 0, TRUE))
1702   {
1703   const uschar *ccode;
1704
1705   c = *code;
1706
1707   if (c >= OP_BRA)
1708     {
1709     BOOL empty_branch;
1710     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1711
1712     /* Scan a closed bracket */
1713
1714     empty_branch = FALSE;
1715     do
1716       {
1717       if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1718         empty_branch = TRUE;
1719       code += GET(code, 1);
1720       }
1721     while (*code == OP_ALT);
1722     if (!empty_branch) return FALSE;   /* All branches are non-empty */
1723     code += 1 + LINK_SIZE;
1724     c = *code;
1725     }
1726
1727   else switch (c)
1728     {
1729     /* Check for quantifiers after a class */
1730
1731 #ifdef SUPPORT_UTF8
1732     case OP_XCLASS:
1733     ccode = code + GET(code, 1);
1734     goto CHECK_CLASS_REPEAT;
1735 #endif
1736
1737     case OP_CLASS:
1738     case OP_NCLASS:
1739     ccode = code + 33;
1740
1741 #ifdef SUPPORT_UTF8
1742     CHECK_CLASS_REPEAT:
1743 #endif
1744
1745     switch (*ccode)
1746       {
1747       case OP_CRSTAR:            /* These could be empty; continue */
1748       case OP_CRMINSTAR:
1749       case OP_CRQUERY:
1750       case OP_CRMINQUERY:
1751       break;
1752
1753       default:                   /* Non-repeat => class must match */
1754       case OP_CRPLUS:            /* These repeats aren't empty */
1755       case OP_CRMINPLUS:
1756       return FALSE;
1757
1758       case OP_CRRANGE:
1759       case OP_CRMINRANGE:
1760       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
1761       break;
1762       }
1763     break;
1764
1765     /* Opcodes that must match a character */
1766
1767     case OP_PROP:
1768     case OP_NOTPROP:
1769     case OP_EXTUNI:
1770     case OP_NOT_DIGIT:
1771     case OP_DIGIT:
1772     case OP_NOT_WHITESPACE:
1773     case OP_WHITESPACE:
1774     case OP_NOT_WORDCHAR:
1775     case OP_WORDCHAR:
1776     case OP_ANY:
1777     case OP_ANYBYTE:
1778     case OP_CHAR:
1779     case OP_CHARNC:
1780     case OP_NOT:
1781     case OP_PLUS:
1782     case OP_MINPLUS:
1783     case OP_EXACT:
1784     case OP_NOTPLUS:
1785     case OP_NOTMINPLUS:
1786     case OP_NOTEXACT:
1787     case OP_TYPEPLUS:
1788     case OP_TYPEMINPLUS:
1789     case OP_TYPEEXACT:
1790     return FALSE;
1791
1792     /* End of branch */
1793
1794     case OP_KET:
1795     case OP_KETRMAX:
1796     case OP_KETRMIN:
1797     case OP_ALT:
1798     return TRUE;
1799
1800     /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO  may be
1801     followed by a multibyte character */
1802
1803 #ifdef SUPPORT_UTF8
1804     case OP_STAR:
1805     case OP_MINSTAR:
1806     case OP_QUERY:
1807     case OP_MINQUERY:
1808     case OP_UPTO:
1809     case OP_MINUPTO:
1810     if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1811     break;
1812 #endif
1813     }
1814   }
1815
1816 return TRUE;
1817 }
1818
1819
1820
1821 /*************************************************
1822 *    Scan compiled regex for non-emptiness       *
1823 *************************************************/
1824
1825 /* This function is called to check for left recursive calls. We want to check
1826 the current branch of the current pattern to see if it could match the empty
1827 string. If it could, we must look outwards for branches at other levels,
1828 stopping when we pass beyond the bracket which is the subject of the recursion.
1829
1830 Arguments:
1831   code        points to start of the recursion
1832   endcode     points to where to stop (current RECURSE item)
1833   bcptr       points to the chain of current (unclosed) branch starts
1834   utf8        TRUE if in UTF-8 mode
1835
1836 Returns:      TRUE if what is matched could be empty
1837 */
1838
1839 static BOOL
1840 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1841   BOOL utf8)
1842 {
1843 while (bcptr != NULL && bcptr->current >= code)
1844   {
1845   if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1846   bcptr = bcptr->outer;
1847   }
1848 return TRUE;
1849 }
1850
1851
1852
1853 /*************************************************
1854 *           Check for POSIX class syntax         *
1855 *************************************************/
1856
1857 /* This function is called when the sequence "[:" or "[." or "[=" is
1858 encountered in a character class. It checks whether this is followed by an
1859 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1860 ".]" or "=]".
1861
1862 Argument:
1863   ptr      pointer to the initial [
1864   endptr   where to return the end pointer
1865   cd       pointer to compile data
1866
1867 Returns:   TRUE or FALSE
1868 */
1869
1870 static BOOL
1871 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1872 {
1873 int terminator;          /* Don't combine these lines; the Solaris cc */
1874 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1875 if (*(++ptr) == '^') ptr++;
1876 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1877 if (*ptr == terminator && ptr[1] == ']')
1878   {
1879   *endptr = ptr;
1880   return TRUE;
1881   }
1882 return FALSE;
1883 }
1884
1885
1886
1887
1888 /*************************************************
1889 *          Check POSIX class name                *
1890 *************************************************/
1891
1892 /* This function is called to check the name given in a POSIX-style class entry
1893 such as [:alnum:].
1894
1895 Arguments:
1896   ptr        points to the first letter
1897   len        the length of the name
1898
1899 Returns:     a value representing the name, or -1 if unknown
1900 */
1901
1902 static int
1903 check_posix_name(const uschar *ptr, int len)
1904 {
1905 register int yield = 0;
1906 while (posix_name_lengths[yield] != 0)
1907   {
1908   if (len == posix_name_lengths[yield] &&
1909     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1910   yield++;
1911   }
1912 return -1;
1913 }
1914
1915
1916 /*************************************************
1917 *    Adjust OP_RECURSE items in repeated group   *
1918 *************************************************/
1919
1920 /* OP_RECURSE items contain an offset from the start of the regex to the group
1921 that is referenced. This means that groups can be replicated for fixed
1922 repetition simply by copying (because the recursion is allowed to refer to
1923 earlier groups that are outside the current group). However, when a group is
1924 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1925 it, after it has been compiled. This means that any OP_RECURSE items within it
1926 that refer to the group itself or any contained groups have to have their
1927 offsets adjusted. That is the job of this function. Before it is called, the
1928 partially compiled regex must be temporarily terminated with OP_END.
1929
1930 Arguments:
1931   group      points to the start of the group
1932   adjust     the amount by which the group is to be moved
1933   utf8       TRUE in UTF-8 mode
1934   cd         contains pointers to tables etc.
1935
1936 Returns:     nothing
1937 */
1938
1939 static void
1940 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1941 {
1942 uschar *ptr = group;
1943 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1944   {
1945   int offset = GET(ptr, 1);
1946   if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1947   ptr += 1 + LINK_SIZE;
1948   }
1949 }
1950
1951
1952
1953 /*************************************************
1954 *        Insert an automatic callout point       *
1955 *************************************************/
1956
1957 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1958 callout points before each pattern item.
1959
1960 Arguments:
1961   code           current code pointer
1962   ptr            current pattern pointer
1963   cd             pointers to tables etc
1964
1965 Returns:         new code pointer
1966 */
1967
1968 static uschar *
1969 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1970 {
1971 *code++ = OP_CALLOUT;
1972 *code++ = 255;
1973 PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
1974 PUT(code, LINK_SIZE, 0);                /* Default length */
1975 return code + 2*LINK_SIZE;
1976 }
1977
1978
1979
1980 /*************************************************
1981 *         Complete a callout item                *
1982 *************************************************/
1983
1984 /* A callout item contains the length of the next item in the pattern, which
1985 we can't fill in till after we have reached the relevant point. This is used
1986 for both automatic and manual callouts.
1987
1988 Arguments:
1989   previous_callout   points to previous callout item
1990   ptr                current pattern pointer
1991   cd                 pointers to tables etc
1992
1993 Returns:             nothing
1994 */
1995
1996 static void
1997 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1998 {
1999 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2000 PUT(previous_callout, 2 + LINK_SIZE, length);
2001 }
2002
2003
2004
2005 #ifdef SUPPORT_UCP
2006 /*************************************************
2007 *           Get othercase range                  *
2008 *************************************************/
2009
2010 /* This function is passed the start and end of a class range, in UTF-8 mode
2011 with UCP support. It searches up the characters, looking for internal ranges of
2012 characters in the "other" case. Each call returns the next one, updating the
2013 start address.
2014
2015 Arguments:
2016   cptr        points to starting character value; updated
2017   d           end value
2018   ocptr       where to put start of othercase range
2019   odptr       where to put end of othercase range
2020
2021 Yield:        TRUE when range returned; FALSE when no more
2022 */
2023
2024 static BOOL
2025 get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
2026 {
2027 int c, chartype, othercase, next;
2028
2029 for (c = *cptr; c <= d; c++)
2030   {
2031   if (ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0) break;
2032   }
2033
2034 if (c > d) return FALSE;
2035
2036 *ocptr = othercase;
2037 next = othercase + 1;
2038
2039 for (++c; c <= d; c++)
2040   {
2041   if (ucp_findchar(c, &chartype, &othercase) != ucp_L || othercase != next)
2042     break;
2043   next++;
2044   }
2045
2046 *odptr = next - 1;
2047 *cptr = c;
2048
2049 return TRUE;
2050 }
2051 #endif  /* SUPPORT_UCP */
2052
2053
2054 /*************************************************
2055 *           Compile one branch                   *
2056 *************************************************/
2057
2058 /* Scan the pattern, compiling it into the code vector. If the options are
2059 changed during the branch, the pointer is used to change the external options
2060 bits.
2061
2062 Arguments:
2063   optionsptr     pointer to the option bits
2064   brackets       points to number of extracting brackets used
2065   codeptr        points to the pointer to the current code point
2066   ptrptr         points to the current pattern pointer
2067   errorptr       points to pointer to error message
2068   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2069   reqbyteptr     set to the last literal character required, else < 0
2070   bcptr          points to current branch chain
2071   cd             contains pointers to tables etc.
2072
2073 Returns:         TRUE on success
2074                  FALSE, with *errorptr set on error
2075 */
2076
2077 static BOOL
2078 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
2079   const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
2080   int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
2081 {
2082 int repeat_type, op_type;
2083 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2084 int bravalue = 0;
2085 int greedy_default, greedy_non_default;
2086 int firstbyte, reqbyte;
2087 int zeroreqbyte, zerofirstbyte;
2088 int req_caseopt, reqvary, tempreqvary;
2089 int condcount = 0;
2090 int options = *optionsptr;
2091 int after_manual_callout = 0;
2092 register int c;
2093 register uschar *code = *codeptr;
2094 uschar *tempcode;
2095 BOOL inescq = FALSE;
2096 BOOL groupsetfirstbyte = FALSE;
2097 const uschar *ptr = *ptrptr;
2098 const uschar *tempptr;
2099 uschar *previous = NULL;
2100 uschar *previous_callout = NULL;
2101 uschar classbits[32];
2102
2103 #ifdef SUPPORT_UTF8
2104 BOOL class_utf8;
2105 BOOL utf8 = (options & PCRE_UTF8) != 0;
2106 uschar *class_utf8data;
2107 uschar utf8_char[6];
2108 #else
2109 BOOL utf8 = FALSE;
2110 #endif
2111
2112 /* Set up the default and non-default settings for greediness */
2113
2114 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2115 greedy_non_default = greedy_default ^ 1;
2116
2117 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2118 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2119 matches a non-fixed char first char; reqbyte just remains unset if we never
2120 find one.
2121
2122 When we hit a repeat whose minimum is zero, we may have to adjust these values
2123 to take the zero repeat into account. This is implemented by setting them to
2124 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2125 item types that can be repeated set these backoff variables appropriately. */
2126
2127 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2128
2129 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2130 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2131 value > 255. It is added into the firstbyte or reqbyte variables to record the
2132 case status of the value. This is used only for ASCII characters. */
2133
2134 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2135
2136 /* Switch on next character until the end of the branch */
2137
2138 for (;; ptr++)
2139   {
2140   BOOL negate_class;
2141   BOOL possessive_quantifier;
2142   BOOL is_quantifier;
2143   int class_charcount;
2144   int class_lastchar;
2145   int newoptions;
2146   int recno;
2147   int skipbytes;
2148   int subreqbyte;
2149   int subfirstbyte;
2150   int mclength;
2151   uschar mcbuffer[8];
2152
2153   /* Next byte in the pattern */
2154
2155   c = *ptr;
2156
2157   /* If in \Q...\E, check for the end; if not, we have a literal */
2158
2159   if (inescq && c != 0)
2160     {
2161     if (c == '\\' && ptr[1] == 'E')
2162       {
2163       inescq = FALSE;
2164       ptr++;
2165       continue;
2166       }
2167     else
2168       {
2169       if (previous_callout != NULL)
2170         {
2171         complete_callout(previous_callout, ptr, cd);
2172         previous_callout = NULL;
2173         }
2174       if ((options & PCRE_AUTO_CALLOUT) != 0)
2175         {
2176         previous_callout = code;
2177         code = auto_callout(code, ptr, cd);
2178         }
2179       goto NORMAL_CHAR;
2180       }
2181     }
2182
2183   /* Fill in length of a previous callout, except when the next thing is
2184   a quantifier. */
2185
2186   is_quantifier = c == '*' || c == '+' || c == '?' ||
2187     (c == '{' && is_counted_repeat(ptr+1));
2188
2189   if (!is_quantifier && previous_callout != NULL &&
2190        after_manual_callout-- <= 0)
2191     {
2192     complete_callout(previous_callout, ptr, cd);
2193     previous_callout = NULL;
2194     }
2195
2196   /* In extended mode, skip white space and comments */
2197
2198   if ((options & PCRE_EXTENDED) != 0)
2199     {
2200     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2201     if (c == '#')
2202       {
2203       /* The space before the ; is to avoid a warning on a silly compiler
2204       on the Macintosh. */
2205       while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2206       if (c != 0) continue;   /* Else fall through to handle end of string */
2207       }
2208     }
2209
2210   /* No auto callout for quantifiers. */
2211
2212   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2213     {
2214     previous_callout = code;
2215     code = auto_callout(code, ptr, cd);
2216     }
2217
2218   switch(c)
2219     {
2220     /* The branch terminates at end of string, |, or ). */
2221
2222     case 0:
2223     case '|':
2224     case ')':
2225     *firstbyteptr = firstbyte;
2226     *reqbyteptr = reqbyte;
2227     *codeptr = code;
2228     *ptrptr = ptr;
2229     return TRUE;
2230
2231     /* Handle single-character metacharacters. In multiline mode, ^ disables
2232     the setting of any following char as a first character. */
2233
2234     case '^':
2235     if ((options & PCRE_MULTILINE) != 0)
2236       {
2237       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2238       }
2239     previous = NULL;
2240     *code++ = OP_CIRC;
2241     break;
2242
2243     case '$':
2244     previous = NULL;
2245     *code++ = OP_DOLL;
2246     break;
2247
2248     /* There can never be a first char if '.' is first, whatever happens about
2249     repeats. The value of reqbyte doesn't change either. */
2250
2251     case '.':
2252     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2253     zerofirstbyte = firstbyte;
2254     zeroreqbyte = reqbyte;
2255     previous = code;
2256     *code++ = OP_ANY;
2257     break;
2258
2259     /* Character classes. If the included characters are all < 255 in value, we
2260     build a 32-byte bitmap of the permitted characters, except in the special
2261     case where there is only one such character. For negated classes, we build
2262     the map as usual, then invert it at the end. However, we use a different
2263     opcode so that data characters > 255 can be handled correctly.
2264
2265     If the class contains characters outside the 0-255 range, a different
2266     opcode is compiled. It may optionally have a bit map for characters < 256,
2267     but those above are explicitly listed afterwards. A flag byte tells
2268     whether the bitmap is present, and whether this is a negated class or not.
2269     */
2270
2271     case '[':
2272     previous = code;
2273
2274     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2275     they are encountered at the top level, so we'll do that too. */
2276
2277     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2278         check_posix_syntax(ptr, &tempptr, cd))
2279       {
2280       *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
2281       goto FAILED;
2282       }
2283
2284     /* If the first character is '^', set the negation flag and skip it. */
2285
2286     if ((c = *(++ptr)) == '^')
2287       {
2288       negate_class = TRUE;
2289       c = *(++ptr);
2290       }
2291     else
2292       {
2293       negate_class = FALSE;
2294       }
2295
2296     /* Keep a count of chars with values < 256 so that we can optimize the case
2297     of just a single character (as long as it's < 256). For higher valued UTF-8
2298     characters, we don't yet do any optimization. */
2299
2300     class_charcount = 0;
2301     class_lastchar = -1;
2302
2303 #ifdef SUPPORT_UTF8
2304     class_utf8 = FALSE;                       /* No chars >= 256 */
2305     class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */
2306 #endif
2307
2308     /* Initialize the 32-char bit map to all zeros. We have to build the
2309     map in a temporary bit of store, in case the class contains only 1
2310     character (< 256), because in that case the compiled code doesn't use the
2311     bit map. */
2312
2313     memset(classbits, 0, 32 * sizeof(uschar));
2314
2315     /* Process characters until ] is reached. By writing this as a "do" it
2316     means that an initial ] is taken as a data character. The first pass
2317     through the regex checked the overall syntax, so we don't need to be very
2318     strict here. At the start of the loop, c contains the first byte of the
2319     character. */
2320
2321     do
2322       {
2323 #ifdef SUPPORT_UTF8
2324       if (utf8 && c > 127)
2325         {                           /* Braces are required because the */
2326         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2327         }
2328 #endif
2329
2330       /* Inside \Q...\E everything is literal except \E */
2331
2332       if (inescq)
2333         {
2334         if (c == '\\' && ptr[1] == 'E')
2335           {
2336           inescq = FALSE;
2337           ptr++;
2338           continue;
2339           }
2340         else goto LONE_SINGLE_CHARACTER;
2341         }
2342
2343       /* Handle POSIX class names. Perl allows a negation extension of the
2344       form [:^name:]. A square bracket that doesn't match the syntax is
2345       treated as a literal. We also recognize the POSIX constructions
2346       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2347       5.6 and 5.8 do. */
2348
2349       if (c == '[' &&
2350           (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2351           check_posix_syntax(ptr, &tempptr, cd))
2352         {
2353         BOOL local_negate = FALSE;
2354         int posix_class, i;
2355         register const uschar *cbits = cd->cbits;
2356
2357         if (ptr[1] != ':')
2358           {
2359           *errorptr = ERR31;
2360           goto FAILED;
2361           }
2362
2363         ptr += 2;
2364         if (*ptr == '^')
2365           {
2366           local_negate = TRUE;
2367           ptr++;
2368           }
2369
2370         posix_class = check_posix_name(ptr, tempptr - ptr);
2371         if (posix_class < 0)
2372           {
2373           *errorptr = ERR30;
2374           goto FAILED;
2375           }
2376
2377         /* If matching is caseless, upper and lower are converted to
2378         alpha. This relies on the fact that the class table starts with
2379         alpha, lower, upper as the first 3 entries. */
2380
2381         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2382           posix_class = 0;
2383
2384         /* Or into the map we are building up to 3 of the static class
2385         tables, or their negations. The [:blank:] class sets up the same
2386         chars as the [:space:] class (all white space). We remove the vertical
2387         white space chars afterwards. */
2388
2389         posix_class *= 3;
2390         for (i = 0; i < 3; i++)
2391           {
2392           BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
2393           int taboffset = posix_class_maps[posix_class + i];
2394           if (taboffset < 0) break;
2395           if (local_negate)
2396             {
2397             if (i == 0)
2398               for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
2399             else
2400               for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
2401             if (blankclass) classbits[1] |= 0x3c;
2402             }
2403           else
2404             {
2405             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
2406             if (blankclass) classbits[1] &= ~0x3c;
2407             }
2408           }
2409
2410         ptr = tempptr + 1;
2411         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2412         continue;    /* End of POSIX syntax handling */
2413         }
2414
2415       /* Backslash may introduce a single character, or it may introduce one
2416       of the specials, which just set a flag. Escaped items are checked for
2417       validity in the pre-compiling pass. The sequence \b is a special case.
2418       Inside a class (and only there) it is treated as backspace. Elsewhere
2419       it marks a word boundary. Other escapes have preset maps ready to
2420       or into the one we are building. We assume they have more than one
2421       character in them, so set class_charcount bigger than one. */
2422
2423       if (c == '\\')
2424         {
2425         c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2426
2427         if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */
2428         else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2429         else if (-c == ESC_Q)            /* Handle start of quoted string */
2430           {
2431           if (ptr[1] == '\\' && ptr[2] == 'E')
2432             {
2433             ptr += 2; /* avoid empty string */
2434             }
2435           else inescq = TRUE;
2436           continue;
2437           }
2438
2439         if (c < 0)
2440           {
2441           register const uschar *cbits = cd->cbits;
2442           class_charcount += 2;     /* Greater than 1 is what matters */
2443           switch (-c)
2444             {
2445             case ESC_d:
2446             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2447             continue;
2448
2449             case ESC_D:
2450             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2451             continue;
2452
2453             case ESC_w:
2454             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2455             continue;
2456
2457             case ESC_W:
2458             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2459             continue;
2460
2461             case ESC_s:
2462             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2463             classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
2464             continue;
2465
2466             case ESC_S:
2467             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2468             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2469             continue;
2470
2471 #ifdef SUPPORT_UCP
2472             case ESC_p:
2473             case ESC_P:
2474               {
2475               BOOL negated;
2476               int property = get_ucp(&ptr, &negated, errorptr);
2477               if (property < 0) goto FAILED;
2478               class_utf8 = TRUE;
2479               *class_utf8data++ = ((-c == ESC_p) != negated)?
2480                 XCL_PROP : XCL_NOTPROP;
2481               *class_utf8data++ = property;
2482               class_charcount -= 2;   /* Not a < 256 character */
2483               }
2484             continue;
2485 #endif
2486
2487             /* Unrecognized escapes are faulted if PCRE is running in its
2488             strict mode. By default, for compatibility with Perl, they are
2489             treated as literals. */
2490
2491             default:
2492             if ((options & PCRE_EXTRA) != 0)
2493               {
2494               *errorptr = ERR7;
2495               goto FAILED;
2496               }
2497             c = *ptr;              /* The final character */
2498             class_charcount -= 2;  /* Undo the default count from above */
2499             }
2500           }
2501
2502         /* Fall through if we have a single character (c >= 0). This may be
2503         > 256 in UTF-8 mode. */
2504
2505         }   /* End of backslash handling */
2506
2507       /* A single character may be followed by '-' to form a range. However,
2508       Perl does not permit ']' to be the end of the range. A '-' character
2509       here is treated as a literal. */
2510
2511       if (ptr[1] == '-' && ptr[2] != ']')
2512         {
2513         int d;
2514         ptr += 2;
2515
2516 #ifdef SUPPORT_UTF8
2517         if (utf8)
2518           {                           /* Braces are required because the */
2519           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
2520           }
2521         else
2522 #endif
2523         d = *ptr;  /* Not UTF-8 mode */
2524
2525         /* The second part of a range can be a single-character escape, but
2526         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2527         in such circumstances. */
2528
2529         if (d == '\\')
2530           {
2531           const uschar *oldptr = ptr;
2532           d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2533
2534           /* \b is backspace; \X is literal X; any other special means the '-'
2535           was literal */
2536
2537           if (d < 0)
2538             {
2539             if (d == -ESC_b) d = '\b';
2540             else if (d == -ESC_X) d = 'X'; else
2541               {
2542               ptr = oldptr - 2;
2543               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2544               }
2545             }
2546           }
2547
2548         /* The check that the two values are in the correct order happens in
2549         the pre-pass. Optimize one-character ranges */
2550
2551         if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2552
2553         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2554         matching, we have to use an XCLASS with extra data items. Caseless
2555         matching for characters > 127 is available only if UCP support is
2556         available. */
2557
2558 #ifdef SUPPORT_UTF8
2559         if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2560           {
2561           class_utf8 = TRUE;
2562
2563           /* With UCP support, we can find the other case equivalents of
2564           the relevant characters. There may be several ranges. Optimize how
2565           they fit with the basic range. */
2566
2567 #ifdef SUPPORT_UCP
2568           if ((options & PCRE_CASELESS) != 0)
2569             {
2570             int occ, ocd;
2571             int cc = c;
2572             int origd = d;
2573             while (get_othercase_range(&cc, origd, &occ, &ocd))
2574               {
2575               if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */
2576
2577               if (occ < c  && ocd >= c - 1)        /* Extend the basic range */
2578                 {                                  /* if there is overlap,   */
2579                 c = occ;                           /* noting that if occ < c */
2580                 continue;                          /* we can't have ocd > d  */
2581                 }                                  /* because a subrange is  */
2582               if (ocd > d && occ <= d + 1)         /* always shorter than    */
2583                 {                                  /* the basic range.       */
2584                 d = ocd;
2585                 continue;
2586                 }
2587
2588               if (occ == ocd)
2589                 {
2590                 *class_utf8data++ = XCL_SINGLE;
2591                 }
2592               else
2593                 {
2594                 *class_utf8data++ = XCL_RANGE;
2595                 class_utf8data += ord2utf8(occ, class_utf8data);
2596                 }
2597               class_utf8data += ord2utf8(ocd, class_utf8data);
2598               }
2599             }
2600 #endif  /* SUPPORT_UCP */
2601
2602           /* Now record the original range, possibly modified for UCP caseless
2603           overlapping ranges. */
2604
2605           *class_utf8data++ = XCL_RANGE;
2606           class_utf8data += ord2utf8(c, class_utf8data);
2607           class_utf8data += ord2utf8(d, class_utf8data);
2608
2609           /* With UCP support, we are done. Without UCP support, there is no
2610           caseless matching for UTF-8 characters > 127; we can use the bit map
2611           for the smaller ones. */
2612
2613 #ifdef SUPPORT_UCP
2614           continue;    /* With next character in the class */
2615 #else
2616           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2617
2618           /* Adjust upper limit and fall through to set up the map */
2619
2620           d = 127;
2621
2622 #endif  /* SUPPORT_UCP */
2623           }
2624 #endif  /* SUPPORT_UTF8 */
2625
2626         /* We use the bit map for all cases when not in UTF-8 mode; else
2627         ranges that lie entirely within 0-127 when there is UCP support; else
2628         for partial ranges without UCP support. */
2629
2630         for (; c <= d; c++)
2631           {
2632           classbits[c/8] |= (1 << (c&7));
2633           if ((options & PCRE_CASELESS) != 0)
2634             {
2635             int uc = cd->fcc[c];           /* flip case */
2636             classbits[uc/8] |= (1 << (uc&7));
2637             }
2638           class_charcount++;                /* in case a one-char range */
2639           class_lastchar = c;
2640           }
2641
2642         continue;   /* Go get the next char in the class */
2643         }
2644
2645       /* Handle a lone single character - we can get here for a normal
2646       non-escape char, or after \ that introduces a single character or for an
2647       apparent range that isn't. */
2648
2649       LONE_SINGLE_CHARACTER:
2650
2651       /* Handle a character that cannot go in the bit map */
2652
2653 #ifdef SUPPORT_UTF8
2654       if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2655         {
2656         class_utf8 = TRUE;
2657         *class_utf8data++ = XCL_SINGLE;
2658         class_utf8data += ord2utf8(c, class_utf8data);
2659
2660 #ifdef SUPPORT_UCP
2661         if ((options & PCRE_CASELESS) != 0)
2662           {
2663           int chartype;
2664           int othercase;
2665           if (ucp_findchar(c, &chartype, &othercase) >= 0 && othercase > 0)
2666             {
2667             *class_utf8data++ = XCL_SINGLE;
2668             class_utf8data += ord2utf8(othercase, class_utf8data);
2669             }
2670           }
2671 #endif  /* SUPPORT_UCP */
2672
2673         }
2674       else
2675 #endif  /* SUPPORT_UTF8 */
2676
2677       /* Handle a single-byte character */
2678         {
2679         classbits[c/8] |= (1 << (c&7));
2680         if ((options & PCRE_CASELESS) != 0)
2681           {
2682           c = cd->fcc[c];   /* flip case */
2683           classbits[c/8] |= (1 << (c&7));
2684           }
2685         class_charcount++;
2686         class_lastchar = c;
2687         }
2688       }
2689
2690     /* Loop until ']' reached; the check for end of string happens inside the
2691     loop. This "while" is the end of the "do" above. */
2692
2693     while ((c = *(++ptr)) != ']' || inescq);
2694
2695     /* If class_charcount is 1, we saw precisely one character whose value is
2696     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2697     can optimize the negative case only if there were no characters >= 128
2698     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2699     single-bytes only. This is an historical hangover. Maybe one day we can
2700     tidy these opcodes to handle multi-byte characters.
2701
2702     The optimization throws away the bit map. We turn the item into a
2703     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2704     that OP_NOT does not support multibyte characters. In the positive case, it
2705     can cause firstbyte to be set. Otherwise, there can be no first char if
2706     this item is first, whatever repeat count may follow. In the case of
2707     reqbyte, save the previous value for reinstating. */
2708
2709 #ifdef SUPPORT_UTF8
2710     if (class_charcount == 1 &&
2711           (!utf8 ||
2712           (!class_utf8 && (!negate_class || class_lastchar < 128))))
2713
2714 #else
2715     if (class_charcount == 1)
2716 #endif
2717       {
2718       zeroreqbyte = reqbyte;
2719
2720       /* The OP_NOT opcode works on one-byte characters only. */
2721
2722       if (negate_class)
2723         {
2724         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2725         zerofirstbyte = firstbyte;
2726         *code++ = OP_NOT;
2727         *code++ = class_lastchar;
2728         break;
2729         }
2730
2731       /* For a single, positive character, get the value into mcbuffer, and
2732       then we can handle this with the normal one-character code. */
2733
2734 #ifdef SUPPORT_UTF8
2735       if (utf8 && class_lastchar > 127)
2736         mclength = ord2utf8(class_lastchar, mcbuffer);
2737       else
2738 #endif
2739         {
2740         mcbuffer[0] = class_lastchar;
2741         mclength = 1;
2742         }
2743       goto ONE_CHAR;
2744       }       /* End of 1-char optimization */
2745
2746     /* The general case - not the one-char optimization. If this is the first
2747     thing in the branch, there can be no first char setting, whatever the
2748     repeat count. Any reqbyte setting must remain unchanged after any kind of
2749     repeat. */
2750
2751     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2752     zerofirstbyte = firstbyte;
2753     zeroreqbyte = reqbyte;
2754
2755     /* If there are characters with values > 255, we have to compile an
2756     extended class, with its own opcode. If there are no characters < 256,
2757     we can omit the bitmap. */
2758
2759 #ifdef SUPPORT_UTF8
2760     if (class_utf8)
2761       {
2762       *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
2763       *code++ = OP_XCLASS;
2764       code += LINK_SIZE;
2765       *code = negate_class? XCL_NOT : 0;
2766
2767       /* If the map is required, install it, and move on to the end of
2768       the extra data */
2769
2770       if (class_charcount > 0)
2771         {
2772         *code++ |= XCL_MAP;
2773         memcpy(code, classbits, 32);
2774         code = class_utf8data;
2775         }
2776
2777       /* If the map is not required, slide down the extra data. */
2778
2779       else
2780         {
2781         int len = class_utf8data - (code + 33);
2782         memmove(code + 1, code + 33, len);
2783         code += len + 1;
2784         }
2785
2786       /* Now fill in the complete length of the item */
2787
2788       PUT(previous, 1, code - previous);
2789       break;   /* End of class handling */
2790       }
2791 #endif
2792
2793     /* If there are no characters > 255, negate the 32-byte map if necessary,
2794     and copy it into the code vector. If this is the first thing in the branch,
2795     there can be no first char setting, whatever the repeat count. Any reqbyte
2796     setting must remain unchanged after any kind of repeat. */
2797
2798     if (negate_class)
2799       {
2800       *code++ = OP_NCLASS;
2801       for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2802       }
2803     else
2804       {
2805       *code++ = OP_CLASS;
2806       memcpy(code, classbits, 32);
2807       }
2808     code += 32;
2809     break;
2810
2811     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2812     has been tested above. */
2813
2814     case '{':
2815     if (!is_quantifier) goto NORMAL_CHAR;
2816     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
2817     if (*errorptr != NULL) goto FAILED;
2818     goto REPEAT;
2819
2820     case '*':
2821     repeat_min = 0;
2822     repeat_max = -1;
2823     goto REPEAT;
2824
2825     case '+':
2826     repeat_min = 1;
2827     repeat_max = -1;
2828     goto REPEAT;
2829
2830     case '?':
2831     repeat_min = 0;
2832     repeat_max = 1;
2833
2834     REPEAT:
2835     if (previous == NULL)
2836       {
2837       *errorptr = ERR9;
2838       goto FAILED;
2839       }
2840
2841     if (repeat_min == 0)
2842       {
2843       firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
2844       reqbyte = zeroreqbyte;        /* Ditto */
2845       }
2846
2847     /* Remember whether this is a variable length repeat */
2848
2849     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2850
2851     op_type = 0;                    /* Default single-char op codes */
2852     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
2853
2854     /* Save start of previous item, in case we have to move it up to make space
2855     for an inserted OP_ONCE for the additional '+' extension. */
2856
2857     tempcode = previous;
2858
2859     /* If the next character is '+', we have a possessive quantifier. This
2860     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2861     If the next character is '?' this is a minimizing repeat, by default,
2862     but if PCRE_UNGREEDY is set, it works the other way round. We change the
2863     repeat type to the non-default. */
2864
2865     if (ptr[1] == '+')
2866       {
2867       repeat_type = 0;                  /* Force greedy */
2868       possessive_quantifier = TRUE;
2869       ptr++;
2870       }
2871     else if (ptr[1] == '?')
2872       {
2873       repeat_type = greedy_non_default;
2874       ptr++;
2875       }
2876     else repeat_type = greedy_default;
2877
2878     /* If previous was a recursion, we need to wrap it inside brackets so that
2879     it can be replicated if necessary. */
2880
2881     if (*previous == OP_RECURSE)
2882       {
2883       memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2884       code += 1 + LINK_SIZE;
2885       *previous = OP_BRA;
2886       PUT(previous, 1, code - previous);
2887       *code = OP_KET;
2888       PUT(code, 1, code - previous);
2889       code += 1 + LINK_SIZE;
2890       }
2891
2892     /* If previous was a character match, abolish the item and generate a
2893     repeat item instead. If a char item has a minimum of more than one, ensure
2894     that it is set in reqbyte - it might not be if a sequence such as x{3} is
2895     the first thing in a branch because the x will have gone into firstbyte
2896     instead.  */
2897
2898     if (*previous == OP_CHAR || *previous == OP_CHARNC)
2899       {
2900       /* Deal with UTF-8 characters that take up more than one byte. It's
2901       easier to write this out separately than try to macrify it. Use c to
2902       hold the length of the character in bytes, plus 0x80 to flag that it's a
2903       length rather than a small character. */
2904
2905 #ifdef SUPPORT_UTF8
2906       if (utf8 && (code[-1] & 0x80) != 0)
2907         {
2908         uschar *lastchar = code - 1;
2909         while((*lastchar & 0xc0) == 0x80) lastchar--;
2910         c = code - lastchar;            /* Length of UTF-8 character */
2911         memcpy(utf8_char, lastchar, c); /* Save the char */
2912         c |= 0x80;                      /* Flag c as a length */
2913         }
2914       else
2915 #endif
2916
2917       /* Handle the case of a single byte - either with no UTF8 support, or
2918       with UTF-8 disabled, or for a UTF-8 character < 128. */
2919
2920         {
2921         c = code[-1];
2922         if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2923         }
2924
2925       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
2926       }
2927
2928     /* If previous was a single negated character ([^a] or similar), we use
2929     one of the special opcodes, replacing it. The code is shared with single-
2930     character repeats by setting op_type to add a suitable offset into
2931     repeat_type. OP_NOT is currently used only for single-byte chars. */
2932
2933     else if (*previous == OP_NOT)
2934       {
2935       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
2936       c = previous[1];
2937       goto OUTPUT_SINGLE_REPEAT;
2938       }
2939
2940     /* If previous was a character type match (\d or similar), abolish it and
2941     create a suitable repeat item. The code is shared with single-character
2942     repeats by setting op_type to add a suitable offset into repeat_type. Note
2943     that the Unicode property types will be present only when SUPPORT_UCP is
2944     defined, but we don't wrap the little bits of code here because it just
2945     makes it horribly messy. */
2946
2947     else if (*previous < OP_EODN)
2948       {
2949       uschar *oldcode;
2950       int prop_type;
2951       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
2952       c = *previous;
2953
2954       OUTPUT_SINGLE_REPEAT:
2955       prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2956         previous[1] : -1;
2957
2958       oldcode = code;
2959       code = previous;                  /* Usually overwrite previous item */
2960
2961       /* If the maximum is zero then the minimum must also be zero; Perl allows
2962       this case, so we do too - by simply omitting the item altogether. */
2963
2964       if (repeat_max == 0) goto END_REPEAT;
2965
2966       /* All real repeats make it impossible to handle partial matching (maybe
2967       one day we will be able to remove this restriction). */
2968
2969       if (repeat_max != 1) cd->nopartial = TRUE;
2970
2971       /* Combine the op_type with the repeat_type */
2972
2973       repeat_type += op_type;
2974
2975       /* A minimum of zero is handled either as the special case * or ?, or as
2976       an UPTO, with the maximum given. */
2977
2978       if (repeat_min == 0)
2979         {
2980         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2981           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2982         else
2983           {
2984           *code++ = OP_UPTO + repeat_type;
2985           PUT2INC(code, 0, repeat_max);
2986           }
2987         }
2988
2989       /* A repeat minimum of 1 is optimized into some special cases. If the
2990       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2991       left in place and, if the maximum is greater than 1, we use OP_UPTO with
2992       one less than the maximum. */
2993
2994       else if (repeat_min == 1)
2995         {
2996         if (repeat_max == -1)
2997           *code++ = OP_PLUS + repeat_type;
2998         else
2999           {
3000           code = oldcode;                 /* leave previous item in place */
3001           if (repeat_max == 1) goto END_REPEAT;
3002           *code++ = OP_UPTO + repeat_type;
3003           PUT2INC(code, 0, repeat_max - 1);
3004           }
3005         }
3006
3007       /* The case {n,n} is just an EXACT, while the general case {n,m} is
3008       handled as an EXACT followed by an UPTO. */
3009
3010       else
3011         {
3012         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
3013         PUT2INC(code, 0, repeat_min);
3014
3015         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3016         we have to insert the character for the previous code. For a repeated
3017         Unicode property match, there is an extra byte that defines the
3018         required property. In UTF-8 mode, long characters have their length in
3019         c, with the 0x80 bit as a flag. */
3020
3021         if (repeat_max < 0)
3022           {
3023 #ifdef SUPPORT_UTF8
3024           if (utf8 && c >= 128)
3025             {
3026             memcpy(code, utf8_char, c & 7);
3027             code += c & 7;
3028             }
3029           else
3030 #endif
3031             {
3032             *code++ = c;
3033             if (prop_type >= 0) *code++ = prop_type;
3034             }
3035           *code++ = OP_STAR + repeat_type;
3036           }
3037
3038         /* Else insert an UPTO if the max is greater than the min, again
3039         preceded by the character, for the previously inserted code. */
3040
3041         else if (repeat_max != repeat_min)
3042           {
3043 #ifdef SUPPORT_UTF8
3044           if (utf8 && c >= 128)
3045             {
3046             memcpy(code, utf8_char, c & 7);
3047             code += c & 7;
3048             }
3049           else
3050 #endif
3051           *code++ = c;
3052           if (prop_type >= 0) *code++ = prop_type;
3053           repeat_max -= repeat_min;
3054           *code++ = OP_UPTO + repeat_type;
3055           PUT2INC(code, 0, repeat_max);
3056           }
3057         }
3058
3059       /* The character or character type itself comes last in all cases. */
3060
3061 #ifdef SUPPORT_UTF8
3062       if (utf8 && c >= 128)
3063         {
3064         memcpy(code, utf8_char, c & 7);
3065         code += c & 7;
3066         }
3067       else
3068 #endif
3069       *code++ = c;
3070
3071       /* For a repeated Unicode property match, there is an extra byte that
3072       defines the required property. */
3073
3074 #ifdef SUPPORT_UCP
3075       if (prop_type >= 0) *code++ = prop_type;
3076 #endif
3077       }
3078
3079     /* If previous was a character class or a back reference, we put the repeat
3080     stuff after it, but just skip the item if the repeat was {0,0}. */
3081
3082     else if (*previous == OP_CLASS ||
3083              *previous == OP_NCLASS ||
3084 #ifdef SUPPORT_UTF8
3085              *previous == OP_XCLASS ||
3086 #endif
3087              *previous == OP_REF)
3088       {
3089       if (repeat_max == 0)
3090         {
3091         code = previous;
3092         goto END_REPEAT;
3093         }
3094
3095       /* All real repeats make it impossible to handle partial matching (maybe
3096       one day we will be able to remove this restriction). */
3097
3098       if (repeat_max != 1) cd->nopartial = TRUE;
3099
3100       if (repeat_min == 0 && repeat_max == -1)
3101         *code++ = OP_CRSTAR + repeat_type;
3102       else if (repeat_min == 1 && repeat_max == -1)
3103         *code++ = OP_CRPLUS + repeat_type;
3104       else if (repeat_min == 0 && repeat_max == 1)
3105         *code++ = OP_CRQUERY + repeat_type;
3106       else
3107         {
3108         *code++ = OP_CRRANGE + repeat_type;
3109         PUT2INC(code, 0, repeat_min);
3110         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
3111         PUT2INC(code, 0, repeat_max);
3112         }
3113       }
3114
3115     /* If previous was a bracket group, we may have to replicate it in certain
3116     cases. */
3117
3118     else if (*previous >= OP_BRA || *previous == OP_ONCE ||
3119              *previous == OP_COND)
3120       {
3121       register int i;
3122       int ketoffset = 0;
3123       int len = code - previous;
3124       uschar *bralink = NULL;
3125
3126       /* If the maximum repeat count is unlimited, find the end of the bracket
3127       by scanning through from the start, and compute the offset back to it
3128       from the current code pointer. There may be an OP_OPT setting following
3129       the final KET, so we can't find the end just by going back from the code
3130       pointer. */
3131
3132       if (repeat_max == -1)
3133         {
3134         register uschar *ket = previous;
3135         do ket += GET(ket, 1); while (*ket != OP_KET);
3136         ketoffset = code - ket;
3137         }
3138
3139       /* The case of a zero minimum is special because of the need to stick
3140       OP_BRAZERO in front of it, and because the group appears once in the
3141       data, whereas in other cases it appears the minimum number of times. For
3142       this reason, it is simplest to treat this case separately, as otherwise
3143       the code gets far too messy. There are several special subcases when the
3144       minimum is zero. */
3145
3146       if (repeat_min == 0)
3147         {
3148         /* If the maximum is also zero, we just omit the group from the output
3149         altogether. */
3150
3151         if (repeat_max == 0)
3152           {
3153           code = previous;
3154           goto END_REPEAT;
3155           }
3156
3157         /* If the maximum is 1 or unlimited, we just have to stick in the
3158         BRAZERO and do no more at this point. However, we do need to adjust
3159         any OP_RECURSE calls inside the group that refer to the group itself or
3160         any internal group, because the offset is from the start of the whole
3161         regex. Temporarily terminate the pattern while doing this. */
3162
3163         if (repeat_max <= 1)
3164           {
3165           *code = OP_END;
3166           adjust_recurse(previous, 1, utf8, cd);
3167           memmove(previous+1, previous, len);
3168           code++;
3169           *previous++ = OP_BRAZERO + repeat_type;
3170           }
3171
3172         /* If the maximum is greater than 1 and limited, we have to replicate
3173         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3174         The first one has to be handled carefully because it's the original
3175         copy, which has to be moved up. The remainder can be handled by code
3176         that is common with the non-zero minimum case below. We have to
3177         adjust the value of repeat_max, since one less copy is required. Once
3178         again, we may have to adjust any OP_RECURSE calls inside the group. */
3179
3180         else
3181           {
3182           int offset;
3183           *code = OP_END;
3184           adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
3185           memmove(previous + 2 + LINK_SIZE, previous, len);
3186           code += 2 + LINK_SIZE;
3187           *previous++ = OP_BRAZERO + repeat_type;
3188           *previous++ = OP_BRA;
3189
3190           /* We chain together the bracket offset fields that have to be
3191           filled in later when the ends of the brackets are reached. */
3192
3193           offset = (bralink == NULL)? 0 : previous - bralink;
3194           bralink = previous;
3195           PUTINC(previous, 0, offset);
3196           }
3197
3198         repeat_max--;
3199         }
3200
3201       /* If the minimum is greater than zero, replicate the group as many
3202       times as necessary, and adjust the maximum to the number of subsequent
3203       copies that we need. If we set a first char from the group, and didn't
3204       set a required char, copy the latter from the former. */
3205
3206       else
3207         {
3208         if (repeat_min > 1)
3209           {
3210           if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3211           for (i = 1; i < repeat_min; i++)
3212             {
3213             memcpy(code, previous, len);
3214             code += len;
3215             }
3216           }
3217         if (repeat_max > 0) repeat_max -= repeat_min;
3218         }
3219
3220       /* This code is common to both the zero and non-zero minimum cases. If
3221       the maximum is limited, it replicates the group in a nested fashion,
3222       remembering the bracket starts on a stack. In the case of a zero minimum,
3223       the first one was set up above. In all cases the repeat_max now specifies
3224       the number of additional copies needed. */
3225
3226       if (repeat_max >= 0)
3227         {
3228         for (i = repeat_max - 1; i >= 0; i--)
3229           {
3230           *code++ = OP_BRAZERO + repeat_type;
3231
3232           /* All but the final copy start a new nesting, maintaining the
3233           chain of brackets outstanding. */
3234
3235           if (i != 0)
3236             {
3237             int offset;
3238             *code++ = OP_BRA;
3239             offset = (bralink == NULL)? 0 : code - bralink;
3240             bralink = code;
3241             PUTINC(code, 0, offset);
3242             }
3243
3244           memcpy(code, previous, len);
3245           code += len;
3246           }
3247
3248         /* Now chain through the pending brackets, and fill in their length
3249         fields (which are holding the chain links pro tem). */
3250
3251         while (bralink != NULL)
3252           {
3253           int oldlinkoffset;
3254           int offset = code - bralink + 1;
3255           uschar *bra = code - offset;
3256           oldlinkoffset = GET(bra, 1);
3257           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3258           *code++ = OP_KET;
3259           PUTINC(code, 0, offset);
3260           PUT(bra, 1, offset);
3261           }
3262         }
3263
3264       /* If the maximum is unlimited, set a repeater in the final copy. We
3265       can't just offset backwards from the current code point, because we
3266       don't know if there's been an options resetting after the ket. The
3267       correct offset was computed above. */
3268
3269       else code[-ketoffset] = OP_KETRMAX + repeat_type;
3270       }
3271
3272     /* Else there's some kind of shambles */
3273
3274     else
3275       {
3276       *errorptr = ERR11;
3277       goto FAILED;
3278       }
3279
3280     /* If the character following a repeat is '+', we wrap the entire repeated
3281     item inside OP_ONCE brackets. This is just syntactic sugar, taken from
3282     Sun's Java package. The repeated item starts at tempcode, not at previous,
3283     which might be the first part of a string whose (former) last char we
3284     repeated. However, we don't support '+' after a greediness '?'. */
3285
3286     if (possessive_quantifier)
3287       {
3288       int len = code - tempcode;
3289       memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3290       code += 1 + LINK_SIZE;
3291       len += 1 + LINK_SIZE;
3292       tempcode[0] = OP_ONCE;
3293       *code++ = OP_KET;
3294       PUTINC(code, 0, len);
3295       PUT(tempcode, 1, len);
3296       }
3297
3298     /* In all cases we no longer have a previous item. We also set the
3299     "follows varying string" flag for subsequently encountered reqbytes if
3300     it isn't already set and we have just passed a varying length item. */
3301
3302     END_REPEAT:
3303     previous = NULL;
3304     cd->req_varyopt |= reqvary;
3305     break;
3306
3307
3308     /* Start of nested bracket sub-expression, or comment or lookahead or
3309     lookbehind or option setting or condition. First deal with special things
3310     that can come after a bracket; all are introduced by ?, and the appearance
3311     of any of them means that this is not a referencing group. They were
3312     checked for validity in the first pass over the string, so we don't have to
3313     check for syntax errors here.  */
3314
3315     case '(':
3316     newoptions = options;
3317     skipbytes = 0;
3318
3319     if (*(++ptr) == '?')
3320       {
3321       int set, unset;
3322       int *optset;
3323
3324       switch (*(++ptr))
3325         {
3326         case '#':                 /* Comment; skip to ket */
3327         ptr++;
3328         while (*ptr != ')') ptr++;
3329         continue;
3330
3331         case ':':                 /* Non-extracting bracket */
3332         bravalue = OP_BRA;
3333         ptr++;
3334         break;
3335
3336         case '(':
3337         bravalue = OP_COND;       /* Conditional group */
3338
3339         /* Condition to test for recursion */
3340
3341         if (ptr[1] == 'R')
3342           {
3343           code[1+LINK_SIZE] = OP_CREF;
3344           PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
3345           skipbytes = 3;
3346           ptr += 3;
3347           }
3348
3349         /* Condition to test for a numbered subpattern match. We know that
3350         if a digit follows ( then there will just be digits until ) because
3351         the syntax was checked in the first pass. */
3352
3353         else if ((digitab[ptr[1]] && ctype_digit) != 0)
3354           {
3355           int condref;                 /* Don't amalgamate; some compilers */
3356           condref = *(++ptr) - '0';    /* grumble at autoincrement in declaration */
3357           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
3358           if (condref == 0)
3359             {
3360             *errorptr = ERR35;
3361             goto FAILED;
3362             }
3363           ptr++;
3364           code[1+LINK_SIZE] = OP_CREF;
3365           PUT2(code, 2+LINK_SIZE, condref);
3366           skipbytes = 3;
3367           }
3368         /* For conditions that are assertions, we just fall through, having
3369         set bravalue above. */
3370         break;
3371
3372         case '=':                 /* Positive lookahead */
3373         bravalue = OP_ASSERT;
3374         ptr++;
3375         break;
3376
3377         case '!':                 /* Negative lookahead */
3378         bravalue = OP_ASSERT_NOT;
3379         ptr++;
3380         break;
3381
3382         case '<':                 /* Lookbehinds */
3383         switch (*(++ptr))
3384           {
3385           case '=':               /* Positive lookbehind */
3386           bravalue = OP_ASSERTBACK;
3387           ptr++;
3388           break;
3389
3390           case '!':               /* Negative lookbehind */
3391           bravalue = OP_ASSERTBACK_NOT;
3392           ptr++;
3393           break;
3394           }
3395         break;
3396
3397         case '>':                 /* One-time brackets */
3398         bravalue = OP_ONCE;
3399         ptr++;
3400         break;
3401
3402         case 'C':                 /* Callout - may be followed by digits; */
3403         previous_callout = code;  /* Save for later completion */
3404         after_manual_callout = 1; /* Skip one item before completing */
3405         *code++ = OP_CALLOUT;     /* Already checked that the terminating */
3406           {                       /* closing parenthesis is present. */
3407           int n = 0;
3408           while ((digitab[*(++ptr)] & ctype_digit) != 0)
3409             n = n * 10 + *ptr - '0';
3410           if (n > 255)
3411             {
3412             *errorptr = ERR38;
3413             goto FAILED;
3414             }
3415           *code++ = n;
3416           PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
3417           PUT(code, LINK_SIZE, 0);                    /* Default length */
3418           code += 2 * LINK_SIZE;
3419           }
3420         previous = NULL;
3421         continue;
3422
3423         case 'P':                 /* Named subpattern handling */
3424         if (*(++ptr) == '<')      /* Definition */
3425           {
3426           int i, namelen;
3427           uschar *slot = cd->name_table;
3428           const uschar *name;     /* Don't amalgamate; some compilers */
3429           name = ++ptr;           /* grumble at autoincrement in declaration */
3430
3431           while (*ptr++ != '>');
3432           namelen = ptr - name - 1;
3433
3434           for (i = 0; i < cd->names_found; i++)
3435             {
3436             int crc = memcmp(name, slot+2, namelen);
3437             if (crc == 0)
3438               {
3439               if (slot[2+namelen] == 0)
3440                 {
3441                 *errorptr = ERR43;
3442                 goto FAILED;
3443                 }
3444               crc = -1;             /* Current name is substring */
3445               }
3446             if (crc < 0)
3447               {
3448               memmove(slot + cd->name_entry_size, slot,
3449                 (cd->names_found - i) * cd->name_entry_size);
3450               break;
3451               }
3452             slot += cd->name_entry_size;
3453             }
3454
3455           PUT2(slot, 0, *brackets + 1);
3456           memcpy(slot + 2, name, namelen);
3457           slot[2+namelen] = 0;
3458           cd->names_found++;
3459           goto NUMBERED_GROUP;
3460           }
3461
3462         if (*ptr == '=' || *ptr == '>')  /* Reference or recursion */
3463           {
3464           int i, namelen;
3465           int type = *ptr++;
3466           const uschar *name = ptr;
3467           uschar *slot = cd->name_table;
3468
3469           while (*ptr != ')') ptr++;
3470           namelen = ptr - name;
3471
3472           for (i = 0; i < cd->names_found; i++)
3473             {
3474             if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3475             slot += cd->name_entry_size;
3476             }
3477           if (i >= cd->names_found)
3478             {
3479             *errorptr = ERR15;
3480             goto FAILED;
3481             }
3482
3483           recno = GET2(slot, 0);
3484
3485           if (type == '>') goto HANDLE_RECURSION;  /* A few lines below */
3486
3487           /* Back reference */
3488
3489           previous = code;
3490           *code++ = OP_REF;
3491           PUT2INC(code, 0, recno);
3492           cd->backref_map |= (recno < 32)? (1 << recno) : 1;
3493           if (recno > cd->top_backref) cd->top_backref = recno;
3494           continue;
3495           }
3496
3497         /* Should never happen */
3498         break;
3499
3500         case 'R':                 /* Pattern recursion */
3501         ptr++;                    /* Same as (?0)      */
3502         /* Fall through */
3503
3504         /* Recursion or "subroutine" call */
3505
3506         case '0': case '1': case '2': case '3': case '4':
3507         case '5': case '6': case '7': case '8': case '9':
3508           {
3509           const uschar *called;
3510           recno = 0;
3511           while((digitab[*ptr] & ctype_digit) != 0)
3512             recno = recno * 10 + *ptr++ - '0';
3513
3514           /* Come here from code above that handles a named recursion */
3515
3516           HANDLE_RECURSION:
3517
3518           previous = code;
3519
3520           /* Find the bracket that is being referenced. Temporarily end the
3521           regex in case it doesn't exist. */
3522
3523           *code = OP_END;
3524           called = (recno == 0)?
3525             cd->start_code : find_bracket(cd->start_code, utf8, recno);
3526
3527           if (called == NULL)
3528             {
3529             *errorptr = ERR15;
3530             goto FAILED;
3531             }
3532
3533           /* If the subpattern is still open, this is a recursive call. We
3534           check to see if this is a left recursion that could loop for ever,
3535           and diagnose that case. */
3536
3537           if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3538             {
3539             *errorptr = ERR40;
3540             goto FAILED;
3541             }
3542
3543           /* Insert the recursion/subroutine item */
3544
3545           *code = OP_RECURSE;
3546           PUT(code, 1, called - cd->start_code);
3547           code += 1 + LINK_SIZE;
3548           }
3549         continue;
3550
3551         /* Character after (? not specially recognized */
3552
3553         default:                  /* Option setting */
3554         set = unset = 0;
3555         optset = &set;
3556
3557         while (*ptr != ')' && *ptr != ':')
3558           {
3559           switch (*ptr++)
3560             {
3561             case '-': optset = &unset; break;
3562
3563             case 'i': *optset |= PCRE_CASELESS; break;
3564             case 'm': *optset |= PCRE_MULTILINE; break;
3565             case 's': *optset |= PCRE_DOTALL; break;
3566             case 'x': *optset |= PCRE_EXTENDED; break;
3567             case 'U': *optset |= PCRE_UNGREEDY; break;
3568             case 'X': *optset |= PCRE_EXTRA; break;
3569             }
3570           }
3571
3572         /* Set up the changed option bits, but don't change anything yet. */
3573
3574         newoptions = (options | set) & (~unset);
3575
3576         /* If the options ended with ')' this is not the start of a nested
3577         group with option changes, so the options change at this level. Compile
3578         code to change the ims options if this setting actually changes any of
3579         them. We also pass the new setting back so that it can be put at the
3580         start of any following branches, and when this group ends (if we are in
3581         a group), a resetting item can be compiled.
3582
3583         Note that if this item is right at the start of the pattern, the
3584         options will have been abstracted and made global, so there will be no
3585         change to compile. */
3586
3587         if (*ptr == ')')
3588           {
3589           if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3590             {
3591             *code++ = OP_OPT;
3592             *code++ = newoptions & PCRE_IMS;
3593             }
3594
3595           /* Change options at this level, and pass them back for use
3596           in subsequent branches. Reset the greedy defaults and the case
3597           value for firstbyte and reqbyte. */
3598
3599           *optionsptr = options = newoptions;
3600           greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3601           greedy_non_default = greedy_default ^ 1;
3602           req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3603
3604           previous = NULL;       /* This item can't be repeated */
3605           continue;              /* It is complete */
3606           }
3607
3608         /* If the options ended with ':' we are heading into a nested group
3609         with possible change of options. Such groups are non-capturing and are
3610         not assertions of any kind. All we need to do is skip over the ':';
3611         the newoptions value is handled below. */
3612
3613         bravalue = OP_BRA;
3614         ptr++;
3615         }
3616       }
3617
3618     /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3619     non-capturing and behave like (?:...) brackets */
3620
3621     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3622       {
3623       bravalue = OP_BRA;
3624       }
3625
3626     /* Else we have a referencing group; adjust the opcode. If the bracket
3627     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3628     arrange for the true number to follow later, in an OP_BRANUMBER item. */
3629
3630     else
3631       {
3632       NUMBERED_GROUP:
3633       if (++(*brackets) > EXTRACT_BASIC_MAX)
3634         {
3635         bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3636         code[1+LINK_SIZE] = OP_BRANUMBER;
3637         PUT2(code, 2+LINK_SIZE, *brackets);
3638         skipbytes = 3;
3639         }
3640       else bravalue = OP_BRA + *brackets;
3641       }
3642
3643     /* Process nested bracketed re. Assertions may not be repeated, but other
3644     kinds can be. We copy code into a non-register variable in order to be able
3645     to pass its address because some compilers complain otherwise. Pass in a
3646     new setting for the ims options if they have changed. */
3647
3648     previous = (bravalue >= OP_ONCE)? code : NULL;
3649     *code = bravalue;
3650     tempcode = code;
3651     tempreqvary = cd->req_varyopt;     /* Save value before bracket */
3652
3653     if (!compile_regex(
3654          newoptions,                   /* The complete new option state */
3655          options & PCRE_IMS,           /* The previous ims option state */
3656          brackets,                     /* Extracting bracket count */
3657          &tempcode,                    /* Where to put code (updated) */
3658          &ptr,                         /* Input pointer (updated) */
3659          errorptr,                     /* Where to put an error message */
3660          (bravalue == OP_ASSERTBACK ||
3661           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3662          skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */
3663          &subfirstbyte,                /* For possible first char */
3664          &subreqbyte,                  /* For possible last char */
3665          bcptr,                        /* Current branch chain */
3666          cd))                          /* Tables block */
3667       goto FAILED;
3668
3669     /* At the end of compiling, code is still pointing to the start of the
3670     group, while tempcode has been updated to point past the end of the group
3671     and any option resetting that may follow it. The pattern pointer (ptr)
3672     is on the bracket. */
3673
3674     /* If this is a conditional bracket, check that there are no more than
3675     two branches in the group. */
3676
3677     else if (bravalue == OP_COND)
3678       {
3679       uschar *tc = code;
3680       condcount = 0;
3681
3682       do {
3683          condcount++;
3684          tc += GET(tc,1);
3685          }
3686       while (*tc != OP_KET);
3687
3688       if (condcount > 2)
3689         {
3690         *errorptr = ERR27;
3691         goto FAILED;
3692         }
3693
3694       /* If there is just one branch, we must not make use of its firstbyte or
3695       reqbyte, because this is equivalent to an empty second branch. */
3696
3697       if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3698       }
3699
3700     /* Handle updating of the required and first characters. Update for normal
3701     brackets of all kinds, and conditions with two branches (see code above).
3702     If the bracket is followed by a quantifier with zero repeat, we have to
3703     back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3704     main loop so that they can be accessed for the back off. */
3705
3706     zeroreqbyte = reqbyte;
3707     zerofirstbyte = firstbyte;
3708     groupsetfirstbyte = FALSE;
3709
3710     if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3711       {
3712       /* If we have not yet set a firstbyte in this branch, take it from the
3713       subpattern, remembering that it was set here so that a repeat of more
3714       than one can replicate it as reqbyte if necessary. If the subpattern has
3715       no firstbyte, set "none" for the whole branch. In both cases, a zero
3716       repeat forces firstbyte to "none". */
3717
3718       if (firstbyte == REQ_UNSET)
3719         {
3720         if (subfirstbyte >= 0)
3721           {
3722           firstbyte = subfirstbyte;
3723           groupsetfirstbyte = TRUE;
3724           }
3725         else firstbyte = REQ_NONE;
3726         zerofirstbyte = REQ_NONE;
3727         }
3728
3729       /* If firstbyte was previously set, convert the subpattern's firstbyte
3730       into reqbyte if there wasn't one, using the vary flag that was in
3731       existence beforehand. */
3732
3733       else if (subfirstbyte >= 0 && subreqbyte < 0)
3734         subreqbyte = subfirstbyte | tempreqvary;
3735
3736       /* If the subpattern set a required byte (or set a first byte that isn't
3737       really the first byte - see above), set it. */
3738
3739       if (subreqbyte >= 0) reqbyte = subreqbyte;
3740       }
3741
3742     /* For a forward assertion, we take the reqbyte, if set. This can be
3743     helpful if the pattern that follows the assertion doesn't set a different
3744     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3745     for an assertion, however because it leads to incorrect effect for patterns
3746     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3747     of a firstbyte. This is overcome by a scan at the end if there's no
3748     firstbyte, looking for an asserted first char. */
3749
3750     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3751
3752     /* Now update the main code pointer to the end of the group. */
3753
3754     code = tempcode;
3755
3756     /* Error if hit end of pattern */
3757
3758     if (*ptr != ')')
3759       {
3760       *errorptr = ERR14;
3761       goto FAILED;
3762       }
3763     break;
3764
3765     /* Check \ for being a real metacharacter; if not, fall through and handle
3766     it as a data character at the start of a string. Escape items are checked
3767     for validity in the pre-compiling pass. */
3768
3769     case '\\':
3770     tempptr = ptr;
3771     c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
3772
3773     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3774     are arranged to be the negation of the corresponding OP_values. For the
3775     back references, the values are ESC_REF plus the reference number. Only
3776     back references and those types that consume a character may be repeated.
3777     We can test for values between ESC_b and ESC_Z for the latter; this may
3778     have to change if any new ones are ever created. */
3779
3780     if (c < 0)
3781       {
3782       if (-c == ESC_Q)            /* Handle start of quoted string */
3783         {
3784         if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3785           else inescq = TRUE;
3786         continue;
3787         }
3788
3789       /* For metasequences that actually match a character, we disable the
3790       setting of a first character if it hasn't already been set. */
3791
3792       if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3793         firstbyte = REQ_NONE;
3794
3795       /* Set values to reset to if this is followed by a zero repeat. */
3796
3797       zerofirstbyte = firstbyte;
3798       zeroreqbyte = reqbyte;
3799
3800       /* Back references are handled specially */
3801
3802       if (-c >= ESC_REF)
3803         {
3804         int number = -c - ESC_REF;
3805         previous = code;
3806         *code++ = OP_REF;
3807         PUT2INC(code, 0, number);
3808         }
3809
3810       /* So are Unicode property matches, if supported. We know that get_ucp
3811       won't fail because it was tested in the pre-pass. */
3812
3813 #ifdef SUPPORT_UCP
3814       else if (-c == ESC_P || -c == ESC_p)
3815         {
3816         BOOL negated;
3817         int value = get_ucp(&ptr, &negated, errorptr);
3818         previous = code;
3819         *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3820         *code++ = value;
3821         }
3822 #endif
3823
3824       /* For the rest, we can obtain the OP value by negating the escape
3825       value */
3826
3827       else
3828         {
3829         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3830         *code++ = -c;
3831         }
3832       continue;
3833       }
3834
3835     /* We have a data character whose value is in c. In UTF-8 mode it may have
3836     a value > 127. We set its representation in the length/buffer, and then
3837     handle it as a data character. */
3838
3839 #ifdef SUPPORT_UTF8
3840     if (utf8 && c > 127)
3841       mclength = ord2utf8(c, mcbuffer);
3842     else
3843 #endif
3844
3845      {
3846      mcbuffer[0] = c;
3847      mclength = 1;
3848      }
3849
3850     goto ONE_CHAR;
3851
3852     /* Handle a literal character. It is guaranteed not to be whitespace or #
3853     when the extended flag is set. If we are in UTF-8 mode, it may be a
3854     multi-byte literal character. */
3855
3856     default:
3857     NORMAL_CHAR:
3858     mclength = 1;
3859     mcbuffer[0] = c;
3860
3861 #ifdef SUPPORT_UTF8
3862     if (utf8 && (c & 0xc0) == 0xc0)
3863       {
3864       while ((ptr[1] & 0xc0) == 0x80)
3865         mcbuffer[mclength++] = *(++ptr);
3866       }
3867 #endif
3868
3869     /* At this point we have the character's bytes in mcbuffer, and the length
3870     in mclength. When not in UTF-8 mode, the length is always 1. */
3871
3872     ONE_CHAR:
3873     previous = code;
3874     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3875     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3876
3877     /* Set the first and required bytes appropriately. If no previous first
3878     byte, set it from this character, but revert to none on a zero repeat.
3879     Otherwise, leave the firstbyte value alone, and don't change it on a zero
3880     repeat. */
3881
3882     if (firstbyte == REQ_UNSET)
3883       {
3884       zerofirstbyte = REQ_NONE;
3885       zeroreqbyte = reqbyte;
3886
3887       /* If the character is more than one byte long, we can set firstbyte
3888       only if it is not to be matched caselessly. */
3889
3890       if (mclength == 1 || req_caseopt == 0)
3891         {
3892         firstbyte = mcbuffer[0] | req_caseopt;
3893         if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3894         }
3895       else firstbyte = reqbyte = REQ_NONE;
3896       }
3897
3898     /* firstbyte was previously set; we can set reqbyte only if the length is
3899     1 or the matching is caseful. */
3900
3901     else
3902       {
3903       zerofirstbyte = firstbyte;
3904       zeroreqbyte = reqbyte;
3905       if (mclength == 1 || req_caseopt == 0)
3906         reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3907       }
3908
3909     break;            /* End of literal character handling */
3910     }
3911   }                   /* end of big loop */
3912
3913 /* Control never reaches here by falling through, only by a goto for all the
3914 error states. Pass back the position in the pattern so that it can be displayed
3915 to the user for diagnosing the error. */
3916
3917 FAILED:
3918 *ptrptr = ptr;
3919 return FALSE;
3920 }
3921
3922
3923
3924
3925 /*************************************************
3926 *     Compile sequence of alternatives           *
3927 *************************************************/
3928
3929 /* On entry, ptr is pointing past the bracket character, but on return
3930 it points to the closing bracket, or vertical bar, or end of string.
3931 The code variable is pointing at the byte into which the BRA operator has been
3932 stored. If the ims options are changed at the start (for a (?ims: group) or
3933 during any branch, we need to insert an OP_OPT item at the start of every
3934 following branch to ensure they get set correctly at run time, and also pass
3935 the new options into every subsequent branch compile.
3936
3937 Argument:
3938   options        option bits, including any changes for this subpattern
3939   oldims         previous settings of ims option bits
3940   brackets       -> int containing the number of extracting brackets used
3941   codeptr        -> the address of the current code pointer
3942   ptrptr         -> the address of the current pattern pointer
3943   errorptr       -> pointer to error message
3944   lookbehind     TRUE if this is a lookbehind assertion
3945   skipbytes      skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3946   firstbyteptr   place to put the first required character, or a negative number
3947   reqbyteptr     place to put the last required character, or a negative number
3948   bcptr          pointer to the chain of currently open branches
3949   cd             points to the data block with tables pointers etc.
3950
3951 Returns:      TRUE on success
3952 */
3953
3954 static BOOL
3955 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3956   const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3957   int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3958 {
3959 const uschar *ptr = *ptrptr;
3960 uschar *code = *codeptr;
3961 uschar *last_branch = code;
3962 uschar *start_bracket = code;
3963 uschar *reverse_count = NULL;
3964 int firstbyte, reqbyte;
3965 int branchfirstbyte, branchreqbyte;
3966 branch_chain bc;
3967
3968 bc.outer = bcptr;
3969 bc.current = code;
3970
3971 firstbyte = reqbyte = REQ_UNSET;
3972
3973 /* Offset is set zero to mark that this bracket is still open */
3974
3975 PUT(code, 1, 0);
3976 code += 1 + LINK_SIZE + skipbytes;
3977
3978 /* Loop for each alternative branch */
3979
3980 for (;;)
3981   {
3982   /* Handle a change of ims options at the start of the branch */
3983
3984   if ((options & PCRE_IMS) != oldims)
3985     {
3986     *code++ = OP_OPT;
3987     *code++ = options & PCRE_IMS;
3988     }
3989
3990   /* Set up dummy OP_REVERSE if lookbehind assertion */
3991
3992   if (lookbehind)
3993     {
3994     *code++ = OP_REVERSE;
3995     reverse_count = code;
3996     PUTINC(code, 0, 0);
3997     }
3998
3999   /* Now compile the branch */
4000
4001   if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
4002         &branchfirstbyte, &branchreqbyte, &bc, cd))
4003     {
4004     *ptrptr = ptr;
4005     return FALSE;
4006     }
4007
4008   /* If this is the first branch, the firstbyte and reqbyte values for the
4009   branch become the values for the regex. */
4010
4011   if (*last_branch != OP_ALT)
4012     {
4013     firstbyte = branchfirstbyte;
4014     reqbyte = branchreqbyte;
4015     }
4016
4017   /* If this is not the first branch, the first char and reqbyte have to
4018   match the values from all the previous branches, except that if the previous
4019   value for reqbyte didn't have REQ_VARY set, it can still match, and we set
4020   REQ_VARY for the regex. */
4021
4022   else
4023     {
4024     /* If we previously had a firstbyte, but it doesn't match the new branch,
4025     we have to abandon the firstbyte for the regex, but if there was previously
4026     no reqbyte, it takes on the value of the old firstbyte. */
4027
4028     if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4029       {
4030       if (reqbyte < 0) reqbyte = firstbyte;
4031       firstbyte = REQ_NONE;
4032       }
4033
4034     /* If we (now or from before) have no firstbyte, a firstbyte from the
4035     branch becomes a reqbyte if there isn't a branch reqbyte. */
4036
4037     if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4038         branchreqbyte = branchfirstbyte;
4039
4040     /* Now ensure that the reqbytes match */
4041
4042     if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4043       reqbyte = REQ_NONE;
4044     else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
4045     }
4046
4047   /* If lookbehind, check that this branch matches a fixed-length string,
4048   and put the length into the OP_REVERSE item. Temporarily mark the end of
4049   the branch with OP_END. */
4050
4051   if (lookbehind)
4052     {
4053     int length;
4054     *code = OP_END;
4055     length = find_fixedlength(last_branch, options);
4056     DPRINTF(("fixed length = %d\n", length));
4057     if (length < 0)
4058       {
4059       *errorptr = (length == -2)? ERR36 : ERR25;
4060       *ptrptr = ptr;
4061       return FALSE;
4062       }
4063     PUT(reverse_count, 0, length);
4064     }
4065
4066   /* Reached end of expression, either ')' or end of pattern. Go back through
4067   the alternative branches and reverse the chain of offsets, with the field in
4068   the BRA item now becoming an offset to the first alternative. If there are
4069   no alternatives, it points to the end of the group. The length in the
4070   terminating ket is always the length of the whole bracketed item. If any of
4071   the ims options were changed inside the group, compile a resetting op-code
4072   following, except at the very end of the pattern. Return leaving the pointer
4073   at the terminating char. */
4074
4075   if (*ptr != '|')
4076     {
4077     int length = code - last_branch;
4078     do
4079       {
4080       int prev_length = GET(last_branch, 1);
4081       PUT(last_branch, 1, length);
4082       length = prev_length;
4083       last_branch -= length;
4084       }
4085     while (length > 0);
4086
4087     /* Fill in the ket */
4088
4089     *code = OP_KET;
4090     PUT(code, 1, code - start_bracket);
4091     code += 1 + LINK_SIZE;
4092
4093     /* Resetting option if needed */
4094
4095     if ((options & PCRE_IMS) != oldims && *ptr == ')')
4096       {
4097       *code++ = OP_OPT;
4098       *code++ = oldims;
4099       }
4100
4101     /* Set values to pass back */
4102
4103     *codeptr = code;
4104     *ptrptr = ptr;
4105     *firstbyteptr = firstbyte;
4106     *reqbyteptr = reqbyte;
4107     return TRUE;
4108     }
4109
4110   /* Another branch follows; insert an "or" node. Its length field points back
4111   to the previous branch while the bracket remains open. At the end the chain
4112   is reversed. It's done like this so that the start of the bracket has a
4113   zero offset until it is closed, making it possible to detect recursion. */
4114
4115   *code = OP_ALT;
4116   PUT(code, 1, code - last_branch);
4117   bc.current = last_branch = code;
4118   code += 1 + LINK_SIZE;
4119   ptr++;
4120   }
4121 /* Control never reaches here */
4122 }
4123
4124
4125
4126
4127 /*************************************************
4128 *          Check for anchored expression         *
4129 *************************************************/
4130
4131 /* Try to find out if this is an anchored regular expression. Consider each
4132 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
4133 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
4134 it's anchored. However, if this is a multiline pattern, then only OP_SOD
4135 counts, since OP_CIRC can match in the middle.
4136
4137 We can also consider a regex to be anchored if OP_SOM starts all its branches.
4138 This is the code for \G, which means "match at start of match position, taking
4139 into account the match offset".
4140
4141 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
4142 because that will try the rest of the pattern at all possible matching points,
4143 so there is no point trying again.... er ....
4144
4145 .... except when the .* appears inside capturing parentheses, and there is a
4146 subsequent back reference to those parentheses. We haven't enough information
4147 to catch that case precisely.
4148
4149 At first, the best we could do was to detect when .* was in capturing brackets
4150 and the highest back reference was greater than or equal to that level.
4151 However, by keeping a bitmap of the first 31 back references, we can catch some
4152 of the more common cases more precisely.
4153
4154 Arguments:
4155   code           points to start of expression (the bracket)
4156   options        points to the options setting
4157   bracket_map    a bitmap of which brackets we are inside while testing; this
4158                   handles up to substring 31; after that we just have to take
4159                   the less precise approach
4160   backref_map    the back reference bitmap
4161
4162 Returns:     TRUE or FALSE
4163 */
4164
4165 static BOOL
4166 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
4167   unsigned int backref_map)
4168 {
4169 do {
4170    const uschar *scode =
4171      first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
4172    register int op = *scode;
4173
4174    /* Capturing brackets */
4175
4176    if (op > OP_BRA)
4177      {
4178      int new_map;
4179      op -= OP_BRA;
4180      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
4181      new_map = bracket_map | ((op < 32)? (1 << op) : 1);
4182      if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4183      }
4184
4185    /* Other brackets */
4186
4187    else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4188      {
4189      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4190      }
4191
4192    /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4193    are or may be referenced. */
4194
4195    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
4196             (*options & PCRE_DOTALL) != 0)
4197      {
4198      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4199      }
4200
4201    /* Check for explicit anchoring */
4202
4203    else if (op != OP_SOD && op != OP_SOM &&
4204            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
4205      return FALSE;
4206    code += GET(code, 1);
4207    }
4208 while (*code == OP_ALT);   /* Loop for each alternative */
4209 return TRUE;
4210 }
4211
4212
4213
4214 /*************************************************
4215 *         Check for starting with ^ or .*        *
4216 *************************************************/
4217
4218 /* This is called to find out if every branch starts with ^ or .* so that
4219 "first char" processing can be done to speed things up in multiline
4220 matching and for non-DOTALL patterns that start with .* (which must start at
4221 the beginning or after \n). As in the case of is_anchored() (see above), we
4222 have to take account of back references to capturing brackets that contain .*
4223 because in that case we can't make the assumption.
4224
4225 Arguments:
4226   code           points to start of expression (the bracket)
4227   bracket_map    a bitmap of which brackets we are inside while testing; this
4228                   handles up to substring 31; after that we just have to take
4229                   the less precise approach
4230   backref_map    the back reference bitmap
4231
4232 Returns:         TRUE or FALSE
4233 */
4234
4235 static BOOL
4236 is_startline(const uschar *code, unsigned int bracket_map,
4237   unsigned int backref_map)
4238 {
4239 do {
4240    const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
4241      FALSE);
4242    register int op = *scode;
4243
4244    /* Capturing brackets */
4245
4246    if (op > OP_BRA)
4247      {
4248      int new_map;
4249      op -= OP_BRA;
4250      if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
4251      new_map = bracket_map | ((op < 32)? (1 << op) : 1);
4252      if (!is_startline(scode, new_map, backref_map)) return FALSE;
4253      }
4254
4255    /* Other brackets */
4256
4257    else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4258      { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
4259
4260    /* .* means "start at start or after \n" if it isn't in brackets that
4261    may be referenced. */
4262
4263    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
4264      {
4265      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4266      }
4267
4268    /* Check for explicit circumflex */
4269
4270    else if (op != OP_CIRC) return FALSE;
4271
4272    /* Move on to the next alternative */
4273
4274    code += GET(code, 1);
4275    }
4276 while (*code == OP_ALT);  /* Loop for each alternative */
4277 return TRUE;
4278 }
4279
4280
4281
4282 /*************************************************
4283 *       Check for asserted fixed first char      *
4284 *************************************************/
4285
4286 /* During compilation, the "first char" settings from forward assertions are
4287 discarded, because they can cause conflicts with actual literals that follow.
4288 However, if we end up without a first char setting for an unanchored pattern,
4289 it is worth scanning the regex to see if there is an initial asserted first
4290 char. If all branches start with the same asserted char, or with a bracket all
4291 of whose alternatives start with the same asserted char (recurse ad lib), then
4292 we return that char, otherwise -1.
4293
4294 Arguments:
4295   code       points to start of expression (the bracket)
4296   options    pointer to the options (used to check casing changes)
4297   inassert   TRUE if in an assertion
4298
4299 Returns:     -1 or the fixed first char
4300 */
4301
4302 static int
4303 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
4304 {
4305 register int c = -1;
4306 do {
4307    int d;
4308    const uschar *scode =
4309      first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
4310    register int op = *scode;
4311
4312    if (op >= OP_BRA) op = OP_BRA;
4313
4314    switch(op)
4315      {
4316      default:
4317      return -1;
4318
4319      case OP_BRA:
4320      case OP_ASSERT:
4321      case OP_ONCE:
4322      case OP_COND:
4323      if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
4324        return -1;
4325      if (c < 0) c = d; else if (c != d) return -1;
4326      break;
4327
4328      case OP_EXACT:       /* Fall through */
4329      scode += 2;
4330
4331      case OP_CHAR:
4332      case OP_CHARNC:
4333      case OP_PLUS:
4334      case OP_MINPLUS:
4335      if (!inassert) return -1;
4336      if (c < 0)
4337        {
4338        c = scode[1];
4339        if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
4340        }
4341      else if (c != scode[1]) return -1;
4342      break;
4343      }
4344
4345    code += GET(code, 1);
4346    }
4347 while (*code == OP_ALT);
4348 return c;
4349 }
4350
4351
4352
4353
#ifdef SUPPORT_UTF8
/*************************************************
*         Validate a UTF-8 string                *
*************************************************/

/* This function is called (optionally) at the start of compile or match, to
validate that a supposed UTF-8 string is actually valid. The early check means
that subsequent code can assume it is dealing with a valid string. The check
can be turned off for maximum performance, but the consequences of supplying
an invalid string are then undefined.

The checks made are: a byte of 128 or more that starts a character must have
its top two bits set; the string must be long enough to hold all the
additional bytes that the leading byte calls for; every additional byte must
have the form 10xx xxxx; the character must not be coded in more bytes than it
needs; and the leading byte must not be 0xfe or 0xff.

Arguments:
  string       points to the string
  length       length of string, or -1 if the string is zero-terminated

Returns:       < 0    if the string is a valid UTF-8 string
               >= 0   otherwise; the value is the offset of the bad byte
*/

static int
valid_utf8(const uschar *string, int length)
{
register const uschar *p;

/* Find the length of a zero-terminated string. */

if (length < 0)
  {
  p = string;
  while (*p != 0) p++;
  length = p - string;
  }

/* Each pass of the loop handles one character. The count is reduced by one
for the leading byte here, and by the number of additional bytes below. */

for (p = string; length-- > 0; p++)
  {
  register int extra;
  register int lead = *p;

  if (lead < 128) continue;                          /* One-byte character */
  if ((lead & 0xc0) != 0xc0) return p - string;      /* Stray 10xx xxxx byte */

  extra = utf8_table4[lead & 0x3f];  /* Number of additional bytes */
  if (extra > length) return p - string;             /* String is too short */
  length -= extra;

  /* The second byte must always be present and have 10 as its top bits. */

  p++;
  if ((*p & 0xc0) != 0x80) return p - string;

  /* Check for overlong sequences for each different length. For a two-byte
  character (lead byte xx00 000x) there are no more bytes to look at. */

  if (extra == 1)
    {
    if ((lead & 0x3e) == 0) return p - string;
    continue;
    }

  /* Lead byte 1110 0000, second byte xx0x xxxx */

  if (extra == 2)
    {
    if (lead == 0xe0 && (*p & 0x20) == 0) return p - string;
    }

  /* Lead byte 1111 0000, second byte xx00 xxxx */

  else if (extra == 3)
    {
    if (lead == 0xf0 && (*p & 0x30) == 0) return p - string;
    }

  /* Lead byte 1111 1000, second byte xx00 0xxx */

  else if (extra == 4)
    {
    if (lead == 0xf8 && (*p & 0x38) == 0) return p - string;
    }

  /* Lead byte 0xfe or 0xff is never valid; otherwise check for lead byte
  1111 1100, second byte xx00 00xx */

  else if (extra == 5)
    {
    if (lead == 0xfe || lead == 0xff) return p - string;
    if (lead == 0xfc && (*p & 0x3c) == 0) return p - string;
    }

  /* The third and any later bytes must all start with 10. */

  for (extra--; extra > 0; extra--)
    {
    p++;
    if ((*p & 0xc0) != 0x80) return p - string;
    }
  }

return -1;
}
#endif
4437
4438
4439
4440 /*************************************************
4441 *        Compile a Regular Expression            *
4442 *************************************************/
4443
4444 /* This function takes a string and returns a pointer to a block of store
4445 holding a compiled version of the expression.
4446
4447 Arguments:
4448   pattern      the regular expression
4449   options      various option bits
4450   errorptr     pointer to pointer to error text
4451   erroroffset  pointer to offset in pattern where error was detected
4452   tables       pointer to character tables or NULL
4453
4454 Returns:       pointer to compiled data block, or NULL on error,
4455                with errorptr and erroroffset set
4456 */
4457
4458 EXPORT pcre *
4459 pcre_compile(const char *pattern, int options, const char **errorptr,
4460   int *erroroffset, const unsigned char *tables)
4461 {
4462 real_pcre *re;
4463 int length = 1 + LINK_SIZE;      /* For initial BRA plus length */
4464 /* int runlength; not used L.M. 2004-09-14 */
4465 int c, firstbyte, reqbyte;
4466 int bracount = 0;
4467 int branch_extra = 0;
4468 int branch_newextra;
4469 int item_count = -1;
4470 int name_count = 0;
4471 int max_name_size = 0;
4472 int lastitemlength = 0;
4473 #ifdef SUPPORT_UTF8
4474 BOOL utf8;
4475 BOOL class_utf8;
4476 #endif
4477 BOOL inescq = FALSE;
4478 unsigned int brastackptr = 0;
4479 size_t size;
4480 uschar *code;
4481 const uschar *codestart;
4482 const uschar *ptr;
4483 compile_data compile_block;
4484 int brastack[BRASTACK_SIZE];
4485 uschar bralenstack[BRASTACK_SIZE];
4486
4487 /* We can't pass back an error message if errorptr is NULL; I guess the best we
4488 can do is just return NULL. */
4489
4490 if (errorptr == NULL) return NULL;
4491 *errorptr = NULL;
4492
4493 /* However, we can give a message for this error */
4494
4495 if (erroroffset == NULL)
4496   {
4497   *errorptr = ERR16;
4498   return NULL;
4499   }
4500 *erroroffset = 0;
4501
4502 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
4503
4504 #ifdef SUPPORT_UTF8
4505 utf8 = (options & PCRE_UTF8) != 0;
4506 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
4507      (*erroroffset = valid_utf8((uschar *)pattern, -1)) >= 0)
4508   {
4509   *errorptr = ERR44;
4510   return NULL;
4511   }
4512 #else
4513 if ((options & PCRE_UTF8) != 0)
4514   {
4515   *errorptr = ERR32;
4516   return NULL;
4517   }
4518 #endif
4519
4520 if ((options & ~PUBLIC_OPTIONS) != 0)
4521   {
4522   *errorptr = ERR17;
4523   return NULL;
4524   }
4525
4526 /* Set up pointers to the individual character tables */
4527
4528 if (tables == NULL) tables = pcre_default_tables;
4529 compile_block.lcc = tables + lcc_offset;
4530 compile_block.fcc = tables + fcc_offset;
4531 compile_block.cbits = tables + cbits_offset;
4532 compile_block.ctypes = tables + ctypes_offset;
4533
4534 /* Maximum back reference and backref bitmap. This is updated for numeric
4535 references during the first pass, but for named references during the actual
4536 compile pass. The bitmap records up to 31 back references to help in deciding
4537 whether (.*) can be treated as anchored or not. */
4538
4539 compile_block.top_backref = 0;
4540 compile_block.backref_map = 0;
4541
4542 /* Reflect pattern for debugging output */
4543
4544 DPRINTF(("------------------------------------------------------------------\n"));
4545 DPRINTF(("%s\n", pattern));
4546
4547 /* The first thing to do is to make a pass over the pattern to compute the
4548 amount of store required to hold the compiled code. This does not have to be
4549 perfect as long as errors are overestimates. At the same time we can detect any
4550 flag settings right at the start, and extract them. Make an attempt to correct
4551 for any counted white space if an "extended" flag setting appears late in the
4552 pattern. We can't be so clever for #-comments. */
4553
4554 ptr = (const uschar *)(pattern - 1);
4555 while ((c = *(++ptr)) != 0)
4556   {
4557   int min, max;
4558   int class_optcount;
4559   int bracket_length;
4560   int duplength;
4561
4562   /* If we are inside a \Q...\E sequence, all chars are literal */
4563
4564   if (inescq)
4565     {
4566     if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
4567     goto NORMAL_CHAR;
4568     }
4569
4570   /* Otherwise, first check for ignored whitespace and comments */
4571
4572   if ((options & PCRE_EXTENDED) != 0)
4573     {
4574     if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
4575     if (c == '#')
4576       {
4577       /* The space before the ; is to avoid a warning on a silly compiler
4578       on the Macintosh. */
4579       while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
4580       if (c == 0) break;
4581       continue;
4582       }
4583     }
4584
4585   item_count++;    /* Is zero for the first non-comment item */
4586
4587   /* Allow space for auto callout before every item except quantifiers. */
4588
4589   if ((options & PCRE_AUTO_CALLOUT) != 0 &&
4590        c != '*' && c != '+' && c != '?' &&
4591        (c != '{' || !is_counted_repeat(ptr + 1)))
4592     length += 2 + 2*LINK_SIZE;
4593
4594   switch(c)
4595     {
4596     /* A backslashed item may be an escaped data character or it may be a
4597     character type. */
4598
4599     case '\\':
4600     c = check_escape(&ptr, errorptr, bracount, options, FALSE);
4601     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4602
4603     lastitemlength = 1;     /* Default length of last item for repeats */
4604
4605     if (c >= 0)             /* Data character */
4606       {
4607       length += 2;          /* For a one-byte character */
4608
4609 #ifdef SUPPORT_UTF8
4610       if (utf8 && c > 127)
4611         {
4612         int i;
4613         for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
4614           if (c <= utf8_table1[i]) break;
4615         length += i;
4616         lastitemlength += i;
4617         }
4618 #endif
4619
4620       continue;
4621       }
4622
4623     /* If \Q, enter "literal" mode */
4624
4625     if (-c == ESC_Q)
4626       {
4627       inescq = TRUE;
4628       continue;
4629       }
4630
4631     /* \X is supported only if Unicode property support is compiled */
4632
4633 #ifndef SUPPORT_UCP
4634     if (-c == ESC_X)
4635       {
4636       *errorptr = ERR45;
4637       goto PCRE_ERROR_RETURN;
4638       }
4639 #endif
4640
4641     /* \P and \p are for Unicode properties, but only when the support has
4642     been compiled. Each item needs 2 bytes. */
4643
4644     else if (-c == ESC_P || -c == ESC_p)
4645       {
4646 #ifdef SUPPORT_UCP
4647       BOOL negated;
4648       length += 2;
4649       lastitemlength = 2;
4650       if (get_ucp(&ptr, &negated, errorptr) < 0) goto PCRE_ERROR_RETURN;
4651       continue;
4652 #else
4653       *errorptr = ERR45;
4654       goto PCRE_ERROR_RETURN;
4655 #endif
4656       }
4657
4658     /* Other escapes need one byte */
4659
4660     length++;
4661
4662     /* A back reference needs an additional 2 bytes, plus either one or 5
4663     bytes for a repeat. We also need to keep the value of the highest
4664     back reference. */
4665
4666     if (c <= -ESC_REF)
4667       {
4668       int refnum = -c - ESC_REF;
4669       compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4670       if (refnum > compile_block.top_backref)
4671         compile_block.top_backref = refnum;
4672       length += 2;   /* For single back reference */
4673       if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4674         {
4675         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
4676         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4677         if ((min == 0 && (max == 1 || max == -1)) ||
4678           (min == 1 && max == -1))
4679             length++;
4680         else length += 5;
4681         if (ptr[1] == '?') ptr++;
4682         }
4683       }
4684     continue;
4685
4686     case '^':     /* Single-byte metacharacters */
4687     case '.':
4688     case '$':
4689     length++;
4690     lastitemlength = 1;
4691     continue;
4692
4693     case '*':            /* These repeats won't be after brackets; */
4694     case '+':            /* those are handled separately */
4695     case '?':
4696     length++;
4697     goto POSESSIVE;      /* A few lines below */
4698
4699     /* This covers the cases of braced repeats after a single char, metachar,
4700     class, or back reference. */
4701
4702     case '{':
4703     if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4704     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
4705     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4706
4707     /* These special cases just insert one extra opcode */
4708
4709     if ((min == 0 && (max == 1 || max == -1)) ||
4710       (min == 1 && max == -1))
4711         length++;
4712
4713     /* These cases might insert additional copies of a preceding character. */
4714
4715     else
4716       {
4717       if (min != 1)
4718         {
4719         length -= lastitemlength;   /* Uncount the original char or metachar */
4720         if (min > 0) length += 3 + lastitemlength;
4721         }
4722       length += lastitemlength + ((max > 0)? 3 : 1);
4723       }
4724
4725     if (ptr[1] == '?') ptr++;      /* Needs no extra length */
4726
4727     POSESSIVE:                     /* Test for possessive quantifier */
4728     if (ptr[1] == '+')
4729       {
4730       ptr++;
4731       length += 2 + 2*LINK_SIZE;   /* Allow for atomic brackets */
4732       }
4733     continue;
4734
4735     /* An alternation contains an offset to the next branch or ket. If any ims
4736     options changed in the previous branch(es), and/or if we are in a
4737     lookbehind assertion, extra space will be needed at the start of the
4738     branch. This is handled by branch_extra. */
4739
4740     case '|':
4741     length += 1 + LINK_SIZE + branch_extra;
4742     continue;
4743
4744     /* A character class uses 33 characters provided that all the character
4745     values are less than 256. Otherwise, it uses a bit map for low valued
4746     characters, and individual items for others. Don't worry about character
4747     types that aren't allowed in classes - they'll get picked up during the
4748     compile. A character class that contains only one single-byte character
4749     uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4750     where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4751
4752     case '[':
4753     if (*(++ptr) == '^')
4754       {
4755       class_optcount = 10;  /* Greater than one */
4756       ptr++;
4757       }
4758     else class_optcount = 0;
4759
4760 #ifdef SUPPORT_UTF8
4761     class_utf8 = FALSE;
4762 #endif
4763
4764     /* Written as a "do" so that an initial ']' is taken as data */
4765
4766     if (*ptr != 0) do
4767       {
4768       /* Inside \Q...\E everything is literal except \E */
4769
4770       if (inescq)
4771         {
4772         if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4773         inescq = FALSE;
4774         ptr += 1;
4775         continue;
4776         }
4777
4778       /* Outside \Q...\E, check for escapes */
4779
4780       if (*ptr == '\\')
4781         {
4782         c = check_escape(&ptr, errorptr, bracount, options, TRUE);
4783         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4784
4785         /* \b is backspace inside a class; \X is literal */
4786
4787         if (-c == ESC_b) c = '\b';
4788         else if (-c == ESC_X) c = 'X';
4789
4790         /* \Q enters quoting mode */
4791
4792         else if (-c == ESC_Q)
4793           {
4794           inescq = TRUE;
4795           continue;
4796           }
4797
4798         /* Handle escapes that turn into characters */
4799
4800         if (c >= 0) goto NON_SPECIAL_CHARACTER;
4801
4802         /* Escapes that are meta-things. The normal ones just affect the
4803         bit map, but Unicode properties require an XCLASS extended item. */
4804
4805         else
4806           {
4807           class_optcount = 10;         /* \d, \s etc; make sure > 1 */
4808 #ifdef SUPPORT_UTF8
4809           if (-c == ESC_p || -c == ESC_P)
4810             {
4811             if (!class_utf8)
4812               {
4813               class_utf8 = TRUE;
4814               length += LINK_SIZE + 2;
4815               }
4816             length += 2;
4817             }
4818 #endif
4819           }
4820         }
4821
4822       /* Check the syntax for POSIX stuff. The bits we actually handle are
4823       checked during the real compile phase. */
4824
4825       else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4826         {
4827         ptr++;
4828         class_optcount = 10;    /* Make sure > 1 */
4829         }
4830
4831       /* Anything else increments the possible optimization count. We have to
4832       detect ranges here so that we can compute the number of extra ranges for
4833       caseless wide characters when UCP support is available. If there are wide
4834       characters, we are going to have to use an XCLASS, even for single
4835       characters. */
4836
4837       else
4838         {
4839         int d;
4840
4841         GET_ONE_CHARACTER:
4842
4843 #ifdef SUPPORT_UTF8
4844         if (utf8)
4845           {
4846           int extra = 0;
4847           GETCHARLEN(c, ptr, extra);
4848           ptr += extra;
4849           }
4850         else c = *ptr;
4851 #else
4852         c = *ptr;
4853 #endif
4854
4855         /* Come here from handling \ above when it escapes to a char value */
4856
4857         NON_SPECIAL_CHARACTER:
4858         class_optcount++;
4859
4860         d = -1;
4861         if (ptr[1] == '-')
4862           {
4863           uschar const *hyptr = ptr++;
4864           if (ptr[1] == '\\')
4865             {
4866             ptr++;
4867             d = check_escape(&ptr, errorptr, bracount, options, TRUE);
4868             if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4869             if (-d == ESC_b) d = '\b';        /* backspace */
4870             else if (-d == ESC_X) d = 'X';    /* literal X in a class */
4871             }
4872           else if (ptr[1] != 0 && ptr[1] != ']')
4873             {
4874             ptr++;
4875 #ifdef SUPPORT_UTF8
4876             if (utf8)
4877               {
4878               int extra = 0;
4879               GETCHARLEN(d, ptr, extra);
4880               ptr += extra;
4881               }
4882             else
4883 #endif
4884             d = *ptr;
4885             }
4886           if (d < 0) ptr = hyptr;      /* go back to hyphen as data */
4887           }
4888
4889         /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4890         127 for caseless matching, we will need to use an XCLASS. */
4891
4892         if (d >= 0)
4893           {
4894           class_optcount = 10;     /* Ensure > 1 */
4895           if (d < c)
4896             {
4897             *errorptr = ERR8;
4898             goto PCRE_ERROR_RETURN;
4899             }
4900
4901 #ifdef SUPPORT_UTF8
4902           if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4903             {
4904             uschar buffer[6];
4905             if (!class_utf8)         /* Allow for XCLASS overhead */
4906               {
4907               class_utf8 = TRUE;
4908               length += LINK_SIZE + 2;
4909               }
4910
4911 #ifdef SUPPORT_UCP
4912             /* If we have UCP support, find out how many extra ranges are
4913             needed to map the other case of characters within this range. We
4914             have to mimic the range optimization here, because extending the
4915             range upwards might push d over a boundary that makes it use
4916             another byte in the UTF-8 representation. */
4917
4918             if ((options & PCRE_CASELESS) != 0)
4919               {
4920               int occ, ocd;
4921               int cc = c;
4922               int origd = d;
4923               while (get_othercase_range(&cc, origd, &occ, &ocd))
4924                 {
4925                 if (occ >= c && ocd <= d) continue;   /* Skip embedded */
4926
4927                 if (occ < c  && ocd >= c - 1)  /* Extend the basic range */
4928                   {                            /* if there is overlap,   */
4929                   c = occ;                     /* noting that if occ < c */
4930                   continue;                    /* we can't have ocd > d  */
4931                   }                            /* because a subrange is  */
4932                 if (ocd > d && occ <= d + 1)   /* always shorter than    */
4933                   {                            /* the basic range.       */
4934                   d = ocd;
4935                   continue;
4936                   }
4937
4938                 /* An extra item is needed */
4939
4940                 length += 1 + ord2utf8(occ, buffer) +
4941                   ((occ == ocd)? 0 : ord2utf8(ocd, buffer));
4942                 }
4943               }
4944 #endif  /* SUPPORT_UCP */
4945
4946             /* The length of the (possibly extended) range */
4947
4948             length += 1 + ord2utf8(c, buffer) + ord2utf8(d, buffer);
4949             }
4950 #endif  /* SUPPORT_UTF8 */
4951
4952           }
4953
4954         /* We have a single character. There is nothing to be done unless we
4955         are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
4956         allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4957         support. */
4958
4959         else
4960           {
4961 #ifdef SUPPORT_UTF8
4962           if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4963             {
4964             uschar buffer[6];
4965             class_optcount = 10;     /* Ensure > 1 */
4966             if (!class_utf8)         /* Allow for XCLASS overhead */
4967               {
4968               class_utf8 = TRUE;
4969               length += LINK_SIZE + 2;
4970               }
4971 #ifdef SUPPORT_UCP
4972             length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4973               (1 + ord2utf8(c, buffer));
4974 #else   /* SUPPORT_UCP */
4975             length += 1 + ord2utf8(c, buffer);
4976 #endif  /* SUPPORT_UCP */
4977             }
4978 #endif  /* SUPPORT_UTF8 */
4979           }
4980         }
4981       }
4982     while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4983
4984     if (*ptr == 0)                          /* Missing terminating ']' */
4985       {
4986       *errorptr = ERR6;
4987       goto PCRE_ERROR_RETURN;
4988       }
4989
4990     /* We can optimize when there was only one optimizable character. Repeats
4991     for positive and negated single one-byte chars are handled by the general
4992     code. Here, we handle repeats for the class opcodes. */
4993
4994     if (class_optcount == 1) length += 3; else
4995       {
4996       length += 33;
4997
4998       /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
4999       we also need extra for wrapping the whole thing in a sub-pattern. */
5000
5001       if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
5002         {
5003         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
5004         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
5005         if ((min == 0 && (max == 1 || max == -1)) ||
5006           (min == 1 && max == -1))
5007             length++;
5008         else length += 5;
5009         if (ptr[1] == '+')
5010           {
5011           ptr++;
5012           length += 2 + 2*LINK_SIZE;
5013           }
5014         else if (ptr[1] == '?') ptr++;
5015         }
5016       }
5017     continue;
5018
5019     /* Brackets may be genuine groups or special things */
5020
5021     case '(':
5022     branch_newextra = 0;
5023     bracket_length = 1 + LINK_SIZE;
5024
5025     /* Handle special forms of bracket, which all start (? */
5026
5027     if (ptr[1] == '?')
5028       {
5029       int set, unset;
5030       int *optset;
5031
5032       switch (c = ptr[2])
5033         {
5034         /* Skip over comments entirely */
5035         case '#':
5036         ptr += 3;
5037         while (*ptr != 0 && *ptr != ')') ptr++;
5038         if (*ptr == 0)
5039           {
5040           *errorptr = ERR18;
5041           goto PCRE_ERROR_RETURN;
5042           }
5043         continue;
5044
5045         /* Non-referencing groups and lookaheads just move the pointer on, and
5046         then behave like a non-special bracket, except that they don't increment
5047         the count of extracting brackets. Ditto for the "once only" bracket,
5048         which is in Perl from version 5.005. */
5049
5050         case ':':
5051         case '=':
5052         case '!':
5053         case '>':
5054         ptr += 2;
5055         break;
5056
5057         /* (?R) specifies a recursive call to the regex, which is an extension
5058         to provide the facility which can be obtained by (?p{perl-code}) in
5059         Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
5060
5061         From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
5062         the appropriate numbered brackets. This includes both recursive and
5063         non-recursive calls. (?R) is now synonymous with (?0). */
5064
5065         case 'R':
5066         ptr++;
5067
5068         case '0': case '1': case '2': case '3': case '4':
5069         case '5': case '6': case '7': case '8': case '9':
5070         ptr += 2;
5071         if (c != 'R')
5072           while ((digitab[*(++ptr)] & ctype_digit) != 0);
5073         if (*ptr != ')')
5074           {
5075           *errorptr = ERR29;
5076           goto PCRE_ERROR_RETURN;
5077           }
5078         length += 1 + LINK_SIZE;
5079
5080         /* If this item is quantified, it will get wrapped inside brackets so
5081         as to use the code for quantified brackets. We jump down and use the
5082         code that handles this for real brackets. */
5083
5084         if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
5085           {
5086           length += 2 + 2 * LINK_SIZE;       /* to make bracketed */
5087           duplength = 5 + 3 * LINK_SIZE;
5088           goto HANDLE_QUANTIFIED_BRACKETS;
5089           }
5090         continue;
5091
5092         /* (?C) is an extension which provides "callout" - to provide a bit of
5093         the functionality of the Perl (?{...}) feature. An optional number may
5094         follow (default is zero). */
5095
5096         case 'C':
5097         ptr += 2;
5098         while ((digitab[*(++ptr)] & ctype_digit) != 0);
5099         if (*ptr != ')')
5100           {
5101           *errorptr = ERR39;
5102           goto PCRE_ERROR_RETURN;
5103           }
5104         length += 2 + 2*LINK_SIZE;
5105         continue;
5106
5107         /* Named subpatterns are an extension copied from Python */
5108
5109         case 'P':
5110         ptr += 3;
5111         if (*ptr == '<')
5112           {
5113           const uschar *p;    /* Don't amalgamate; some compilers */
5114           p = ++ptr;          /* grumble at autoincrement in declaration */
5115           while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
5116           if (*ptr != '>')
5117             {
5118             *errorptr = ERR42;
5119             goto PCRE_ERROR_RETURN;
5120             }
5121           name_count++;
5122           if (ptr - p > max_name_size) max_name_size = (ptr - p);
5123           break;
5124           }
5125
5126         if (*ptr == '=' || *ptr == '>')
5127           {
5128           while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
5129           if (*ptr != ')')
5130             {
5131             *errorptr = ERR42;
5132             goto PCRE_ERROR_RETURN;
5133             }
5134           break;
5135           }
5136
5137         /* Unknown character after (?P */
5138
5139         *errorptr = ERR41;
5140         goto PCRE_ERROR_RETURN;
5141
5142         /* Lookbehinds are in Perl from version 5.005 */
5143
5144         case '<':
5145         ptr += 3;
5146         if (*ptr == '=' || *ptr == '!')
5147           {
5148           branch_newextra = 1 + LINK_SIZE;
5149           length += 1 + LINK_SIZE;         /* For the first branch */
5150           break;
5151           }
5152         *errorptr = ERR24;
5153         goto PCRE_ERROR_RETURN;
5154
5155         /* Conditionals are in Perl from version 5.005. The bracket must either
5156         be followed by a number (for bracket reference) or by an assertion
5157         group, or (a PCRE extension) by 'R' for a recursion test. */
5158
5159         case '(':
5160         if (ptr[3] == 'R' && ptr[4] == ')')
5161           {
5162           ptr += 4;
5163           length += 3;
5164           }
5165         else if ((digitab[ptr[3]] & ctype_digit) != 0)
5166           {
5167           ptr += 4;
5168           length += 3;
5169           while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
5170           if (*ptr != ')')
5171             {
5172             *errorptr = ERR26;
5173             goto PCRE_ERROR_RETURN;
5174             }
5175           }
5176         else   /* An assertion must follow */
5177           {
5178           ptr++;   /* Can treat like ':' as far as spacing is concerned */
5179           if (ptr[2] != '?' ||
5180              (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
5181             {
5182             ptr += 2;    /* To get right offset in message */
5183             *errorptr = ERR28;
5184             goto PCRE_ERROR_RETURN;
5185             }
5186           }
5187         break;
5188
5189         /* Else loop checking valid options until ) is met. Anything else is an
5190         error. If we are without any brackets, i.e. at top level, the settings
5191         act as if specified in the options, so massage the options immediately.
5192         This is for backward compatibility with Perl 5.004. */
5193
5194         default:
5195         set = unset = 0;
5196         optset = &set;
5197         ptr += 2;
5198
5199         for (;; ptr++)
5200           {
5201           c = *ptr;
5202           switch (c)
5203             {
5204             case 'i':
5205             *optset |= PCRE_CASELESS;
5206             continue;
5207
5208             case 'm':
5209             *optset |= PCRE_MULTILINE;
5210             continue;
5211
5212             case 's':
5213             *optset |= PCRE_DOTALL;
5214             continue;
5215
5216             case 'x':
5217             *optset |= PCRE_EXTENDED;
5218             continue;
5219
5220             case 'X':
5221             *optset |= PCRE_EXTRA;
5222             continue;
5223
5224             case 'U':
5225             *optset |= PCRE_UNGREEDY;
5226             continue;
5227
5228             case '-':
5229             optset = &unset;
5230             continue;
5231
5232             /* A termination by ')' indicates an options-setting-only item; if
5233             this is at the very start of the pattern (indicated by item_count
5234             being zero), we use it to set the global options. This is helpful
5235             when analyzing the pattern for first characters, etc. Otherwise
5236             nothing is done here and it is handled during the compiling
5237             process.
5238
5239             [Historical note: Up to Perl 5.8, options settings at top level
5240             were always global settings, wherever they appeared in the pattern.
5241             That is, they were equivalent to an external setting. From 5.8
5242             onwards, they apply only to what follows (which is what you might
5243             expect).] */
5244
5245             case ')':
5246             if (item_count == 0)
5247               {
5248               options = (options | set) & (~unset);
5249               set = unset = 0;     /* To save length */
5250               item_count--;        /* To allow for several */
5251               }
5252
5253             /* Fall through */
5254
5255             /* A termination by ':' indicates the start of a nested group with
5256             the given options set. This is again handled at compile time, but
5257             we must allow for compiled space if any of the ims options are
5258             set. We also have to allow for resetting space at the end of
5259             the group, which is why 4 is added to the length and not just 2.
5260             If there are several changes of options within the same group, this
5261             will lead to an over-estimate on the length, but this shouldn't
5262             matter very much. We also have to allow for resetting options at
5263             the start of any alternations, which we do by setting
5264             branch_newextra to 2. Finally, we record whether the case-dependent
5265             flag ever changes within the regex. This is used by the "required
5266             character" code. */
5267
5268             case ':':
5269             if (((set|unset) & PCRE_IMS) != 0)
5270               {
5271               length += 4;
5272               branch_newextra = 2;
5273               if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
5274               }
5275             goto END_OPTIONS;
5276
5277             /* Unrecognized option character */
5278
5279             default:
5280             *errorptr = ERR12;
5281             goto PCRE_ERROR_RETURN;
5282             }
5283           }
5284
5285         /* If we hit a closing bracket, that's it - this is a freestanding
5286         option-setting. We need to ensure that branch_extra is updated if
5287         necessary. The only values branch_newextra can have here are 0 or 2.
5288         If the value is 2, then branch_extra must either be 2 or 5, depending
5289         on whether this is a lookbehind group or not. */
5290
5291         END_OPTIONS:
5292         if (c == ')')
5293           {
5294           if (branch_newextra == 2 &&
5295               (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
5296             branch_extra += branch_newextra;
5297           continue;
5298           }
5299
5300         /* If options were terminated by ':' control comes here. Fall through
5301         to handle the group below. */
5302         }
5303       }
5304
5305     /* Extracting brackets must be counted so we can process escapes in a
5306     Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
5307     need an additional 3 bytes of store per extracting bracket. However, if
5308     PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
5309     must leave the count alone (it will always be zero). */
5310
5311     else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
5312       {
5313       bracount++;
5314       if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
5315       }
5316
5317     /* Save length for computing whole length at end if there's a repeat that
5318     requires duplication of the group. Also save the current value of
5319     branch_extra, and start the new group with the new value. If non-zero, this
5320     will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
5321
5322     if (brastackptr >= sizeof(brastack)/sizeof(int))
5323       {
5324       *errorptr = ERR19;
5325       goto PCRE_ERROR_RETURN;
5326       }
5327
5328     bralenstack[brastackptr] = branch_extra;
5329     branch_extra = branch_newextra;
5330
5331     brastack[brastackptr++] = length;
5332     length += bracket_length;
5333     continue;
5334
5335     /* Handle ket. Look for subsequent max/min; for certain sets of values we
5336     have to replicate this bracket up to that many times. If brastackptr is
5337     0 this is an unmatched bracket which will generate an error, but take care
5338     not to try to access brastack[-1] when computing the length and restoring
5339     the branch_extra value. */
5340
5341     case ')':
5342     length += 1 + LINK_SIZE;
5343     if (brastackptr > 0)
5344       {
5345       duplength = length - brastack[--brastackptr];
5346       branch_extra = bralenstack[brastackptr];
5347       }
5348     else duplength = 0;
5349
5350     /* The following code is also used when a recursion such as (?3) is
5351     followed by a quantifier, because in that case, it has to be wrapped inside
5352     brackets so that the quantifier works. The value of duplength must be
5353     set before arrival. */
5354
5355     HANDLE_QUANTIFIED_BRACKETS:
5356
5357     /* Leave ptr at the final char; for read_repeat_counts this happens
5358     automatically; for the others we need an increment. */
5359
5360     if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
5361       {
5362       ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
5363       if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
5364       }
5365     else if (c == '*') { min = 0; max = -1; ptr++; }
5366     else if (c == '+') { min = 1; max = -1; ptr++; }
5367     else if (c == '?') { min = 0; max = 1;  ptr++; }
5368     else { min = 1; max = 1; }
5369
5370     /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
5371     group, and if the maximum is greater than zero, we have to replicate
5372     maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
5373     bracket set. */
5374
5375     if (min == 0)
5376       {
5377       length++;
5378       if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
5379       }
5380
5381     /* When the minimum is greater than zero, we have to replicate up to
5382     minval-1 times, with no additions required in the copies. Then, if there
5383     is a limited maximum we have to replicate up to maxval-1 times allowing
5384     for a BRAZERO item before each optional copy and nesting brackets for all
5385     but one of the optional copies. */
5386
5387     else
5388       {
5389       length += (min - 1) * duplength;
5390       if (max > min)   /* Need this test as max=-1 means no limit */
5391         length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
5392           - (2 + 2*LINK_SIZE);
5393       }
5394
5395     /* Allow space for once brackets for "possessive quantifier" */
5396
5397     if (ptr[1] == '+')
5398       {
5399       ptr++;
5400       length += 2 + 2*LINK_SIZE;
5401       }
5402     continue;
5403
5404     /* Non-special character. It won't be space or # in extended mode, so it is
5405     always a genuine character. If we are in a \Q...\E sequence, check for the
5406     end; if not, we have a literal. */
5407
5408     default:
5409     NORMAL_CHAR:
5410
5411     if (inescq && c == '\\' && ptr[1] == 'E')
5412       {
5413       inescq = FALSE;
5414       ptr++;
5415       continue;
5416       }
5417
5418     length += 2;          /* For a one-byte character */
5419     lastitemlength = 1;   /* Default length of last item for repeats */
5420
5421     /* In UTF-8 mode, check for additional bytes. */
5422
5423 #ifdef SUPPORT_UTF8
5424     if (utf8 && (c & 0xc0) == 0xc0)
5425       {
5426       while ((ptr[1] & 0xc0) == 0x80)         /* Can't flow over the end */
5427         {                                     /* because the end is marked */
5428         lastitemlength++;                     /* by a zero byte. */
5429         length++;
5430         ptr++;
5431         }
5432       }
5433 #endif
5434
5435     continue;
5436     }
5437   }
5438
5439 length += 2 + LINK_SIZE;    /* For final KET and END */
5440
5441 if ((options & PCRE_AUTO_CALLOUT) != 0)
5442   length += 2 + 2*LINK_SIZE;  /* For final callout */
5443
5444 if (length > MAX_PATTERN_SIZE)
5445   {
5446   *errorptr = ERR20;
5447   return NULL;
5448   }
5449
5450 /* Compute the size of data block needed and get it, either from malloc or
5451 externally provided function. */
5452
5453 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
5454 re = (real_pcre *)(pcre_malloc)(size);
5455
5456 if (re == NULL)
5457   {
5458   *errorptr = ERR21;
5459   return NULL;
5460   }
5461
5462 /* Put in the magic number, and save the sizes, options, and character table
5463 pointer. NULL is used for the default character tables. The nullpad field is at
5464 the end; it's there to help in the case when a regex compiled on a system with
5465 4-byte pointers is run on another with 8-byte pointers. */
5466
5467 re->magic_number = MAGIC_NUMBER;
5468 re->size = size;
5469 re->options = options;
5470 re->dummy1 = re->dummy2 = 0;
5471 re->name_table_offset = sizeof(real_pcre);
5472 re->name_entry_size = max_name_size + 3;
5473 re->name_count = name_count;
5474 re->tables = (tables == pcre_default_tables)? NULL : tables;
5475 re->nullpad = NULL;
5476
5477 /* The starting points of the name/number translation table and of the code are
5478 passed around in the compile data block. */
5479
5480 compile_block.names_found = 0;
5481 compile_block.name_entry_size = max_name_size + 3;
5482 compile_block.name_table = (uschar *)re + re->name_table_offset;
5483 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
5484 compile_block.start_code = codestart;
5485 compile_block.start_pattern = (const uschar *)pattern;
5486 compile_block.req_varyopt = 0;
5487 compile_block.nopartial = FALSE;
5488
5489 /* Set up a starting, non-extracting bracket, then compile the expression. On
5490 error, *errorptr will be set non-NULL, so we don't need to look at the result
5491 of the function here. */
5492
5493 ptr = (const uschar *)pattern;
5494 code = (uschar *)codestart;
5495 *code = OP_BRA;
5496 bracount = 0;
5497 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
5498   errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
5499 re->top_bracket = bracount;
5500 re->top_backref = compile_block.top_backref;
5501
5502 if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
5503
5504 /* If not reached end of pattern on success, there's an excess bracket. */
5505
5506 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
5507
5508 /* Fill in the terminating state and check for disastrous overflow, but
5509 if debugging, leave the test till after things are printed out. */
5510
5511 *code++ = OP_END;
5512
5513 #ifndef DEBUG
5514 if (code - codestart > length) *errorptr = ERR23;
5515 #endif
5516
5517 /* Give an error if there's back reference to a non-existent capturing
5518 subpattern. */
5519
5520 if (re->top_backref > re->top_bracket) *errorptr = ERR15;
5521
5522 /* Failed to compile, or error while post-processing */
5523
5524 if (*errorptr != NULL)
5525   {
5526   (pcre_free)(re);
5527   PCRE_ERROR_RETURN:
5528   *erroroffset = ptr - (const uschar *)pattern;
5529   return NULL;
5530   }
5531
5532 /* If the anchored option was not passed, set the flag if we can determine that
5533 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5534 as starting with .* when DOTALL is set).
5535
5536 Otherwise, if we know what the first character has to be, save it, because that
5537 speeds up unanchored matches no end. If not, see if we can set the
5538 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5539 start with ^, and also when all branches start with .* for non-DOTALL matches.
5540 */
5541
5542 if ((options & PCRE_ANCHORED) == 0)
5543   {
5544   int temp_options = options;
5545   if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
5546     re->options |= PCRE_ANCHORED;
5547   else
5548     {
5549     if (firstbyte < 0)
5550       firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5551     if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
5552       {
5553       int ch = firstbyte & 255;
5554       re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5555          compile_block.fcc[ch] == ch)? ch : firstbyte;
5556       re->options |= PCRE_FIRSTSET;
5557       }
5558     else if (is_startline(codestart, 0, compile_block.backref_map))
5559       re->options |= PCRE_STARTLINE;
5560     }
5561   }
5562
5563 /* For an anchored pattern, we use the "required byte" only if it follows a
5564 variable length item in the regex. Remove the caseless flag for non-caseable
5565 bytes. */
5566
5567 if (reqbyte >= 0 &&
5568      ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5569   {
5570   int ch = reqbyte & 255;
5571   re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5572     compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5573   re->options |= PCRE_REQCHSET;
5574   }
5575
5576 /* Print out the compiled data for debugging */
5577
5578 #ifdef DEBUG
5579
5580 printf("Length = %d top_bracket = %d top_backref = %d\n",
5581   length, re->top_bracket, re->top_backref);
5582
5583 if (re->options != 0)
5584   {
5585   printf("%s%s%s%s%s%s%s%s%s%s\n",
5586     ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5587     ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5588     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5589     ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
5590     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5591     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5592     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5593     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5594     ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5595     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5596   }
5597
5598 if ((re->options & PCRE_FIRSTSET) != 0)
5599   {
5600   int ch = re->first_byte & 255;
5601   const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5602   if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5603     else printf("First char = \\x%02x%s\n", ch, caseless);
5604   }
5605
5606 if ((re->options & PCRE_REQCHSET) != 0)
5607   {
5608   int ch = re->req_byte & 255;
5609   const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5610   if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5611     else printf("Req char = \\x%02x%s\n", ch, caseless);
5612   }
5613
5614 print_internals(re, stdout);
5615
5616 /* This check is done here in the debugging case so that the code that
5617 was compiled can be seen. */
5618
5619 if (code - codestart > length)
5620   {
5621   *errorptr = ERR23;
5622   (pcre_free)(re);
5623   *erroroffset = ptr - (uschar *)pattern;
5624   return NULL;
5625   }
5626 #endif
5627
5628 return (pcre *)re;
5629 }
5630
5631
5632
5633 /*************************************************
5634 *          Match a back-reference                *
5635 *************************************************/
5636
5637 /* If a back reference hasn't been set, the length that is passed is greater
5638 than the number of characters left in the string, so the match fails.
5639
5640 Arguments:
5641   offset      index into the offset vector
5642   eptr        points into the subject
5643   length      length to be matched
5644   md          points to match data block
5645   ims         the ims flags
5646
5647 Returns:      TRUE if matched
5648 */
5649
5650 static BOOL
5651 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
5652   unsigned long int ims)
5653 {
5654 const uschar *ref = md->start_subject + md->offset_vector[offset];
5655 int i;
5656
5657 #ifdef DEBUG
5658 if (eptr < md->end_subject)
5659   {
5660   printf("matching subject ");
5661   pchars(eptr, length, TRUE, md);
5662   }
5663 else printf("matching subject <null>");
5664 printf(" against backref ");
5665 pchars(ref, length, FALSE, md);
5666 printf("\n");
5667 #endif
5668
5669 /* Too few subject characters remain, so the reference cannot match */
5670
5671 if (md->end_subject - eptr < length) return FALSE;
5672
5673 /* Caseful comparison is byte for byte; the caseless one maps both bytes
5674 through md->lcc first. The two loops are kept apart for speed. */
5675
5676 if ((ims & PCRE_CASELESS) == 0)
5677   {
5678   for (i = 0; i < length; i++) if (ref[i] != eptr[i]) return FALSE;
5679   return TRUE;
5680   }
5681 for (i = 0; i < length; i++)
5682   if (md->lcc[ref[i]] != md->lcc[eptr[i]]) return FALSE;
5683 return TRUE;
5684 }
5685
5686
5687 #ifdef SUPPORT_UTF8
5688 /*************************************************
5689 *       Match character against an XCLASS        *
5690 *************************************************/
5691
5692 /* This function is called from within the XCLASS code below, to match a
5693 character against an extended class which might match values > 255.
5694
5695 Arguments:
5696   c           the character
5697   data        points to the flag byte of the XCLASS data
5698
5699 Returns:      TRUE if character matches, else FALSE
5700 */
5701
5702 static BOOL
5703 match_xclass(int c, const uschar *data)
5704 {
5705 int item;
5706 int flagbyte = *data++;                    /* data now points past the flag byte */
5707 BOOL listed = (flagbyte & XCL_NOT) == 0;   /* value to return when c is listed */
5708
5709 /* When a 32-byte bitmap is present it directly follows the flag byte and has
5710 one bit for each character value below 256. A clear bit does not end the
5711 search, because the additional data may hold ranges that start below 256. */
5712
5713 if ((flagbyte & XCL_MAP) != 0)
5714   {
5715   if (c < 256 && (data[c/8] & (1 << (c&7))) != 0) return listed;
5716   data += 32;                              /* step over the bitmap */
5717   }
5718
5719 /* Work through the additional items until the XCL_END marker. Each item is a
5720 type byte followed by a single character, a pair of characters that bound a
5721 range, or (only when UCP support is compiled) a Unicode property code. */
5722
5723 for (item = *data++; item != XCL_END; item = *data++)
5724   {
5725   int lo, hi;
5726
5727   if (item == XCL_SINGLE)
5728     {
5729     GETCHARINC(lo, data);
5730     if (lo == c) return listed;
5731     }
5732   else if (item == XCL_RANGE)
5733     {
5734     GETCHARINC(lo, data);
5735     GETCHARINC(hi, data);
5736     if (lo <= c && c <= hi) return listed;
5737     }
5738
5739 #ifdef SUPPORT_UCP
5740   else  /* XCL_PROP & XCL_NOTPROP */
5741     {
5742     int chartype, othercase;
5743     int wanted = *data++;
5744     int category = ucp_findchar(c, &chartype, &othercase);
5745     BOOL same = (wanted >= 128)?
5746       (wanted - 128 == category) : (wanted == chartype);
5747
5748     /* A required value of 128 or more is tested, less 128, against the
5749     category; a smaller one against the character type. XCL_PROP wants
5750     agreement and XCL_NOTPROP wants disagreement. */
5751
5752     if (same == (item == XCL_PROP)) return listed;
5753     }
5754 #endif  /* SUPPORT_UCP */
5755   }
5756
5757 return !listed;   /* c was not found in the class */
5758 }
5759 #endif
5760
5761
5762 /***************************************************************************
5763 ****************************************************************************
5764                    RECURSION IN THE match() FUNCTION
5765
5766 The match() function is highly recursive. Some regular expressions can cause
5767 it to recurse thousands of times. I was writing for Unix, so I just let it
5768 call itself recursively. This uses the stack for saving everything that has
5769 to be saved for a recursive call. On Unix, the stack can be large, and this
5770 works fine.
5771
5772 It turns out that on non-Unix systems there are problems with programs that
5773 use a lot of stack. (This despite the fact that every last chip has oodles
5774 of memory these days, and techniques for extending the stack have been known
5775 for decades.) So....
5776
5777 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
5778 calls by keeping local variables that need to be preserved in blocks of memory
5779 obtained from malloc instead of on the stack. Macros are used to
5780 achieve this so that the actual code doesn't look very different to what it
5781 always used to.
5782 ****************************************************************************
5783 ***************************************************************************/
5784
5785
5786 /* These versions of the macros use the stack, as normal */
5787
5788 #ifndef NO_RECURSE
5789 #define REGISTER register
5790 #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) rx = match(ra,rb,rc,rd,re,rf,rg)
5791 #define RRETURN(ra) return ra
5792 #else
5793
5794 /* These versions of the macros manage a private stack on the heap. Note
5795 that the rd argument of RMATCH isn't actually used. It's the md argument of
5796 match(), which never changes. If a new frame cannot be obtained, RMATCH
5797 passes PCRE_ERROR_NOMEMORY to RRETURN rather than storing through NULL. */
5798
5799 #define REGISTER
5800
5801 #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
5802   {\
5803   heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
5804   if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
5805   if (setjmp(frame->Xwhere) == 0)\
5806     {\
5807     newframe->Xeptr = ra;\
5808     newframe->Xecode = rb;\
5809     newframe->Xoffset_top = rc;\
5810     newframe->Xims = re;\
5811     newframe->Xeptrb = rf;\
5812     newframe->Xflags = rg;\
5813     newframe->Xprevframe = frame;\
5814     frame = newframe;\
5815     DPRINTF(("restarting from line %d\n", __LINE__));\
5816     goto HEAP_RECURSE;\
5817     }\
5818   else\
5819     {\
5820     DPRINTF(("longjumped back to line %d\n", __LINE__));\
5821     frame = md->thisframe;\
5822     rx = frame->Xresult;\
5823     }\
5824   }
5825
5826 #define RRETURN(ra)\
5827   {\
5828   heapframe *newframe = frame;\
5829   frame = newframe->Xprevframe;\
5830   (pcre_stack_free)(newframe);\
5831   if (frame != NULL)\
5832     {\
5833     frame->Xresult = ra;\
5834     md->thisframe = frame;\
5835     longjmp(frame->Xwhere, 1);\
5836     }\
5837   return ra;\
5838   }
5839
5840 /* Structure for remembering the local variables in a private frame */
5841
5842 typedef struct heapframe {
5843   struct heapframe *Xprevframe;   /* Calling frame; NULL at the top level */
5844
5845   /* Function arguments that may change */
5846
5847   const uschar *Xeptr;
5848   const uschar *Xecode;
5849   int Xoffset_top;
5850   long int Xims;
5851   eptrblock *Xeptrb;
5852   int Xflags;
5853
5854   /* Function local variables */
5855
5856   const uschar *Xcallpat;
5857   const uschar *Xcharptr;
5858   const uschar *Xdata;
5859   const uschar *Xnext;
5860   const uschar *Xpp;
5861   const uschar *Xprev;
5862   const uschar *Xsaved_eptr;
5863
5864   recursion_info Xnew_recursive;
5865
5866   BOOL Xcur_is_word;
5867   BOOL Xcondition;
5868   BOOL Xminimize;
5869   BOOL Xprev_is_word;
5870
5871   unsigned long int Xoriginal_ims;
5872
5873 #ifdef SUPPORT_UCP
5874   int Xprop_type;
5875   int Xprop_fail_result;
5876   int Xprop_category;
5877   int Xprop_chartype;
5878   int Xprop_othercase;
5879   int Xprop_test_against;
5880   int *Xprop_test_variable;
5881 #endif
5882
5883   int Xctype;
5884   int Xfc;
5885   int Xfi;
5886   int Xlength;
5887   int Xmax;
5888   int Xmin;
5889   int Xnumber;
5890   int Xoffset;
5891   int Xop;
5892   int Xsave_capture_last;
5893   int Xsave_offset1, Xsave_offset2, Xsave_offset3;
5894   int Xstacksave[REC_STACK_SAVE_MAX];
5895
5896   eptrblock Xnewptrb;
5897
5898   /* Place to pass back result, and where to jump back to */
5899
5900   int  Xresult;
5901   jmp_buf Xwhere;
5902
5903 } heapframe;
5904
5905 #endif
5906
5907
5908 /***************************************************************************
5909 ***************************************************************************/
5910
5911
5912
5913 /*************************************************
5914 *         Match from current position            *
5915 *************************************************/
5916
5917 /* On entry ecode points to the first opcode, and eptr to the first character
5918 in the subject string, while eptrb holds the value of eptr at the start of the
5919 last bracketed group - used for breaking infinite loops matching zero-length
5920 strings. This function is called recursively in many circumstances. Whenever it
5921 returns a negative (error) response, the outer incarnation must also return the
5922 same response.
5923
5924 Performance note: It might be tempting to extract commonly used fields from the
5925 md structure (e.g. utf8, end_subject) into individual variables to improve
5926 performance. Tests using gcc on a SPARC disproved this; in the first case, it
5927 made performance worse.
5928
5929 Arguments:
5930    eptr        pointer in subject
5931    ecode       position in code
5932    offset_top  current top pointer
5933    md          pointer to "static" info for the match
5934    ims         current /i, /m, and /s options
5935    eptrb       pointer to chain of blocks containing eptr at start of
5936                  brackets - for testing for empty matches
5937    flags       can contain
5938                  match_condassert - this is an assertion condition
5939                  match_isgroup - this is the start of a bracketed group
5940
5941 Returns:       MATCH_MATCH if matched            )  these values are >= 0
5942                MATCH_NOMATCH if failed to match  )
5943                a negative PCRE_ERROR_xxx value if aborted by an error condition
5944                  (e.g. stopped by recursion limit)
5945 */
5946
5947 static int
5948 match(REGISTER const uschar *eptr, REGISTER const uschar *ecode,
5949   int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
5950   int flags)
5951 {
5952 /* These variables do not need to be preserved over recursion in this function,
5953 so they can be ordinary variables in all cases. Mark them with "register"
5954 because they are used a lot in loops. */
5955
5956 register int rrc;    /* Returns from recursive calls */
5957 register int i;      /* Used for loops not involving calls to RMATCH() */
5958 register int c;      /* Character values not kept over RMATCH() calls */
5959
5960 /* When recursion is not being used, all "local" variables that have to be
5961 preserved over calls to RMATCH() are part of a "frame" which is obtained from
5962 heap storage. Set up the top-level frame here; others are obtained from the
5963 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
5964
5965 #ifdef NO_RECURSE
5966 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
5967 frame->Xprevframe = NULL;            /* Marks the top level */
5968
5969 /* Copy in the original argument variables */
5970
5971 frame->Xeptr = eptr;
5972 frame->Xecode = ecode;
5973 frame->Xoffset_top = offset_top;
5974 frame->Xims = ims;
5975 frame->Xeptrb = eptrb;
5976 frame->Xflags = flags;
5977
5978 /* This is where control jumps back to to effect "recursion" */
5979
5980 HEAP_RECURSE:
5981
5982 /* Macros make the argument variables come from the current frame */
5983
5984 #define eptr               frame->Xeptr
5985 #define ecode              frame->Xecode
5986 #define offset_top         frame->Xoffset_top
5987 #define ims                frame->Xims
5988 #define eptrb              frame->Xeptrb
5989 #define flags              frame->Xflags
5990
5991 /* Ditto for the local variables */
5992
5993 #ifdef SUPPORT_UTF8
5994 #define charptr            frame->Xcharptr
5995 #endif
5996 #define callpat            frame->Xcallpat
5997 #define data               frame->Xdata
5998 #define next               frame->Xnext
5999 #define pp                 frame->Xpp
6000 #define prev               frame->Xprev
6001 #define saved_eptr         frame->Xsaved_eptr
6002
6003 #define new_recursive      frame->Xnew_recursive
6004
6005 #define cur_is_word        frame->Xcur_is_word
6006 #define condition          frame->Xcondition
6007 #define minimize           frame->Xminimize
6008 #define prev_is_word       frame->Xprev_is_word
6009
6010 #define original_ims       frame->Xoriginal_ims
6011
6012 #ifdef SUPPORT_UCP
6013 #define prop_type          frame->Xprop_type
6014 #define prop_fail_result   frame->Xprop_fail_result
6015 #define prop_category      frame->Xprop_category
6016 #define prop_chartype      frame->Xprop_chartype
6017 #define prop_othercase     frame->Xprop_othercase
6018 #define prop_test_against  frame->Xprop_test_against
6019 #define prop_test_variable frame->Xprop_test_variable
6020 #endif
6021
6022 #define ctype              frame->Xctype
6023 #define fc                 frame->Xfc
6024 #define fi                 frame->Xfi
6025 #define length             frame->Xlength
6026 #define max                frame->Xmax
6027 #define min                frame->Xmin
6028 #define number             frame->Xnumber
6029 #define offset             frame->Xoffset
6030 #define op                 frame->Xop
6031 #define save_capture_last  frame->Xsave_capture_last
6032 #define save_offset1       frame->Xsave_offset1
6033 #define save_offset2       frame->Xsave_offset2
6034 #define save_offset3       frame->Xsave_offset3
6035 #define stacksave          frame->Xstacksave
6036
6037 #define newptrb            frame->Xnewptrb
6038
6039 /* When recursion is being used, local variables are allocated on the stack and
6040 get preserved during recursion in the normal way. In this environment, fi and
6041 i, and fc and c, can be the same variables. */
6042
6043 #else
6044 #define fi i
6045 #define fc c
6046
6047
6048 #ifdef SUPPORT_UTF8                /* Many of these variables are used only */
6049 const uschar *charptr;             /* in small blocks of code. My normal   */
6050 #endif                             /* style of coding would have declared  */
6051 const uschar *callpat;             /* them within each of those blocks.    */
6052 const uschar *data;                /* However, in order to accommodate the */
6053 const uschar *next;                /* version of this code that uses an    */
6054 const uschar *pp;                  /* external "stack" implemented on the  */
6055 const uschar *prev;                /* heap, it is easier to declare them   */
6056 const uschar *saved_eptr;          /* all here, so the declarations can    */
6057                                    /* be cut out in a block. The only      */
6058 recursion_info new_recursive;      /* declarations within blocks below are */
6059                                    /* for variables that do not have to    */
6060 BOOL cur_is_word;                  /* be preserved over a recursive call   */
6061 BOOL condition;                    /* to RMATCH().                         */
6062 BOOL minimize;
6063 BOOL prev_is_word;
6064
6065 unsigned long int original_ims;
6066
6067 #ifdef SUPPORT_UCP
6068 int prop_type;
6069 int prop_fail_result;
6070 int prop_category;
6071 int prop_chartype;
6072 int prop_othercase;
6073 int prop_test_against;
6074 int *prop_test_variable;
6075 #endif
6076
6077 int ctype;
6078 int length;
6079 int max;
6080 int min;
6081 int number;
6082 int offset;
6083 int op;
6084 int save_capture_last;
6085 int save_offset1, save_offset2, save_offset3;
6086 int stacksave[REC_STACK_SAVE_MAX];
6087
6088 eptrblock newptrb;
6089 #endif
6090
6091 /* These statements are here to stop the compiler complaining about uninitialized
6092 variables. */
6093
6094 #ifdef SUPPORT_UCP
6095 prop_fail_result = 0;
6096 prop_test_against = 0;
6097 prop_test_variable = NULL;
6098 #endif
6099
6100 /* OK, now we can get on with the real code of the function. Recursion is
6101 specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
6102 these just turn into a recursive call to match() and a "return", respectively.
6103 However, RMATCH isn't like a function call because it's quite a complicated
6104 macro. It has to be used in one particular way. This shouldn't, however, impact
6105 performance when true recursion is being used. */
6106
6107 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
6108
6109 original_ims = ims;    /* Save for resetting on ')' */
6110
6111 /* At the start of a bracketed group, add the current subject pointer to the
6112 stack of such pointers, to be re-instated at the end of the group when we hit
6113 the closing ket. When match() is called in other circumstances, we don't add to
6114 this stack. */
6115
6116 if ((flags & match_isgroup) != 0)
6117   {
6118   newptrb.epb_prev = eptrb;
6119   newptrb.epb_saved_eptr = eptr;
6120   eptrb = &newptrb;
6121   }
6122
6123 /* Now start processing the operations. */
6124
6125 for (;;)
6126   {
6127   op = *ecode;
6128   minimize = FALSE;
6129
6130   /* For partial matching, remember if we ever hit the end of the subject after
6131   matching at least one subject character. */
6132
6133   if (md->partial &&
6134       eptr >= md->end_subject &&
6135       eptr > md->start_match)
6136     md->hitend = TRUE;
6137
6138   /* Opening capturing bracket. If there is space in the offset vector, save
6139   the current subject position in the working slot at the top of the vector. We
6140   mustn't change the current values of the data slot, because they may be set
6141   from a previous iteration of this group, and be referred to by a reference
6142   inside the group.
6143
6144   If the bracket fails to match, we need to restore this value and also the
6145   values of the final offsets, in case they were set by a previous iteration of
6146   the same bracket.
6147
6148   If there isn't enough space in the offset vector, treat this as if it were a
6149   non-capturing bracket. Don't worry about setting the flag for the error case
6150   here; that is handled in the code for KET. */
6151
6152   if (op > OP_BRA)
6153     {
6154     number = op - OP_BRA;
6155
6156     /* For extended extraction brackets (large number), we have to fish out the
6157     number from a dummy opcode at the start. */
6158
6159     if (number > EXTRACT_BASIC_MAX)
6160       number = GET2(ecode, 2+LINK_SIZE);
6161     offset = number << 1;
6162
6163 #ifdef DEBUG
6164     printf("start bracket %d subject=", number);
6165     pchars(eptr, 16, TRUE, md);
6166     printf("\n");
6167 #endif
6168
6169     if (offset < md->offset_max)
6170       {
6171       save_offset1 = md->offset_vector[offset];
6172       save_offset2 = md->offset_vector[offset+1];
6173       save_offset3 = md->offset_vector[md->offset_end - number];
6174       save_capture_last = md->capture_last;
6175
6176       DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
6177       md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
6178
6179       do
6180         {
6181         RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6182           match_isgroup);
6183         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6184         md->capture_last = save_capture_last;
6185         ecode += GET(ecode, 1);
6186         }
6187       while (*ecode == OP_ALT);
6188
6189       DPRINTF(("bracket %d failed\n", number));
6190
6191       md->offset_vector[offset] = save_offset1;
6192       md->offset_vector[offset+1] = save_offset2;
6193       md->offset_vector[md->offset_end - number] = save_offset3;
6194
6195       RRETURN(MATCH_NOMATCH);
6196       }
6197
6198     /* Insufficient room for saving captured contents */
6199
6200     else op = OP_BRA;
6201     }
6202
6203   /* Other types of node can be handled by a switch */
6204
6205   switch(op)
6206     {
6207     case OP_BRA:     /* Non-capturing bracket: optimized */
6208     DPRINTF(("start bracket 0\n"));
6209     do
6210       {
6211       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6212         match_isgroup);
6213       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6214       ecode += GET(ecode, 1);
6215       }
6216     while (*ecode == OP_ALT);
6217     DPRINTF(("bracket 0 failed\n"));
6218     RRETURN(MATCH_NOMATCH);
6219
6220     /* Conditional group: compilation checked that there are no more than
6221     two branches. If the condition is false, skipping the first branch takes us
6222     past the end if there is only one branch, but that's OK because that is
6223     exactly what going to the ket would do. */
6224
6225     case OP_COND:
6226     if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
6227       {
6228       offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */
6229       condition = (offset == CREF_RECURSE * 2)?
6230         (md->recursive != NULL) :
6231         (offset < offset_top && md->offset_vector[offset] >= 0);
6232       RMATCH(rrc, eptr, ecode + (condition?
6233         (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
6234         offset_top, md, ims, eptrb, match_isgroup);
6235       RRETURN(rrc);
6236       }
6237
6238     /* The condition is an assertion. Call match() to evaluate it - setting
6239     the final argument TRUE causes it to stop at the end of an assertion. */
6240
6241     else
6242       {
6243       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6244           match_condassert | match_isgroup);
6245       if (rrc == MATCH_MATCH)
6246         {
6247         ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
6248         while (*ecode == OP_ALT) ecode += GET(ecode, 1);
6249         }
6250       else if (rrc != MATCH_NOMATCH)
6251         {
6252         RRETURN(rrc);         /* Need braces because of following else */
6253         }
6254       else ecode += GET(ecode, 1);
6255       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6256         match_isgroup);
6257       RRETURN(rrc);
6258       }
6259     /* Control never reaches here */
6260
6261     /* Skip over conditional reference or large extraction number data if
6262     encountered. */
6263
6264     case OP_CREF:
6265     case OP_BRANUMBER:
6266     ecode += 3;
6267     break;
6268
6269     /* End of the pattern. If we are in a recursion, we should restore the
6270     offsets appropriately and continue from after the call. */
6271
6272     case OP_END:
6273     if (md->recursive != NULL && md->recursive->group_num == 0)
6274       {
6275       recursion_info *rec = md->recursive;
6276       DPRINTF(("Hit the end in a (?0) recursion\n"));
6277       md->recursive = rec->prevrec;
6278       memmove(md->offset_vector, rec->offset_save,
6279         rec->saved_max * sizeof(int));
6280       md->start_match = rec->save_start;
6281       ims = original_ims;
6282       ecode = rec->after_call;
6283       break;
6284       }
6285
6286     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
6287     string - backtracking will then try other alternatives, if any. */
6288
6289     if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
6290     md->end_match_ptr = eptr;          /* Record where we ended */
6291     md->end_offset_top = offset_top;   /* and how many extracts were taken */
6292     RRETURN(MATCH_MATCH);
6293
6294     /* Change option settings */
6295
6296     case OP_OPT:
6297     ims = ecode[1];
6298     ecode += 2;
6299     DPRINTF(("ims set to %02lx\n", ims));
6300     break;
6301
6302     /* Assertion brackets. Check the alternative branches in turn - the
6303     matching won't pass the KET for an assertion. If any one branch matches,
6304     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
6305     start of each branch to move the current point backwards, so the code at
6306     this level is identical to the lookahead case. */
6307
6308     case OP_ASSERT:
6309     case OP_ASSERTBACK:
6310     do
6311       {
6312       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6313         match_isgroup);
6314       if (rrc == MATCH_MATCH) break;
6315       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6316       ecode += GET(ecode, 1);
6317       }
6318     while (*ecode == OP_ALT);
6319     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
6320
6321     /* If checking an assertion for a condition, return MATCH_MATCH. */
6322
6323     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
6324
6325     /* Continue from after the assertion, updating the offsets high water
6326     mark, since extracts may have been taken during the assertion. */
6327
6328     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6329     ecode += 1 + LINK_SIZE;
6330     offset_top = md->end_offset_top;
6331     continue;
6332
6333     /* Negative assertion: all branches must fail to match */
6334
6335     case OP_ASSERT_NOT:
6336     case OP_ASSERTBACK_NOT:
6337     do
6338       {
6339       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6340         match_isgroup);
6341       if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
6342       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6343       ecode += GET(ecode,1);
6344       }
6345     while (*ecode == OP_ALT);
6346
6347     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
6348
6349     ecode += 1 + LINK_SIZE;
6350     continue;
6351
6352     /* Move the subject pointer back. This occurs only at the start of
6353     each branch of a lookbehind assertion. If we are too close to the start to
6354     move back, this match function fails. When working with UTF-8 we move
6355     back a number of characters, not bytes. */
6356
6357     case OP_REVERSE:
6358 #ifdef SUPPORT_UTF8
6359     if (md->utf8)
6360       {
6361       c = GET(ecode,1);
6362       for (i = 0; i < c; i++)
6363         {
6364         eptr--;
6365         if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
6366         BACKCHAR(eptr)
6367         }
6368       }
6369     else
6370 #endif
6371
6372     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
6373
6374       {
6375       eptr -= GET(ecode,1);
6376       if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
6377       }
6378
6379     /* Skip to next op code */
6380
6381     ecode += 1 + LINK_SIZE;
6382     break;
6383
6384     /* The callout item calls an external function, if one is provided, passing
6385     details of the match so far. This is mainly for debugging, though the
6386     function is able to force a failure. */
6387
6388     case OP_CALLOUT:
6389     if (pcre_callout != NULL)
6390       {
6391       pcre_callout_block cb;
6392       cb.version          = 1;   /* Version 1 of the callout block */
6393       cb.callout_number   = ecode[1];
6394       cb.offset_vector    = md->offset_vector;
6395       cb.subject          = (const char *)md->start_subject;
6396       cb.subject_length   = md->end_subject - md->start_subject;
6397       cb.start_match      = md->start_match - md->start_subject;
6398       cb.current_position = eptr - md->start_subject;
6399       cb.pattern_position = GET(ecode, 2);
6400       cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
6401       cb.capture_top      = offset_top/2;
6402       cb.capture_last     = md->capture_last;
6403       cb.callout_data     = md->callout_data;
6404       if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
6405       if (rrc < 0) RRETURN(rrc);
6406       }
6407     ecode += 2 + 2*LINK_SIZE;
6408     break;
6409
6410     /* Recursion either matches the current regex, or some subexpression. The
6411     offset data is the offset to the starting bracket from the start of the
6412     whole pattern. (This is so that it works from duplicated subpatterns.)
6413
6414     If there are any capturing brackets started but not finished, we have to
6415     save their starting points and reinstate them after the recursion. However,
6416     we don't know how many such there are (offset_top records the completed
6417     total) so we just have to save all the potential data. There may be up to
6418     65535 such values, which is too large to put on the stack, but using malloc
6419     for small numbers seems expensive. As a compromise, the stack is used when
6420     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
6421     is used. If the malloc fails, the offsets cannot be saved, so matching is
6422     abandoned at that point and PCRE_ERROR_NOMEMORY is returned instead of
6423     continuing with incomplete data.
6424
6425     There are also other values that have to be saved. We use a chained
6426     sequence of blocks that actually live on the stack. Thanks to Robin Houston
6427     for the original version of this logic. */
6428
6429     case OP_RECURSE:
6430       {
6431       callpat = md->start_code + GET(ecode, 1);
6432       new_recursive.group_num = *callpat - OP_BRA;
6433
6434       /* For extended extraction brackets (large number), we have to fish out
6435       the number from a dummy opcode at the start. */
6436
6437       if (new_recursive.group_num > EXTRACT_BASIC_MAX)
6438         new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
6439
6440       /* Add to "recursing stack" */
6441
6442       new_recursive.prevrec = md->recursive;
6443       md->recursive = &new_recursive;
6444
6445       /* Find where to continue from afterwards */
6446
6447       ecode += 1 + LINK_SIZE;
6448       new_recursive.after_call = ecode;
6449
6450       /* Now save the offset data. */
6451
6452       new_recursive.saved_max = md->offset_end;
6453       if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
6454         new_recursive.offset_save = stacksave;
6455       else
6456         {
6457         new_recursive.offset_save =
6458           (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
6459         if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
6460         }
6461
6462       memcpy(new_recursive.offset_save, md->offset_vector,
6463             new_recursive.saved_max * sizeof(int));
6464       new_recursive.save_start = md->start_match;
6465       md->start_match = eptr;
6466
6467       /* OK, now we can do the recursion. For each top-level alternative we
6468       restore the offset and recursion data. */
6469
6470       DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
6471       do
6472         {
6473         RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
6474             eptrb, match_isgroup);
6475         if (rrc == MATCH_MATCH)
6476           {
6477           md->recursive = new_recursive.prevrec;
6478           if (new_recursive.offset_save != stacksave)
6479             (pcre_free)(new_recursive.offset_save);
6480           RRETURN(MATCH_MATCH);
6481           }
6482         else if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6483
6484         md->recursive = &new_recursive;
6485         memcpy(md->offset_vector, new_recursive.offset_save,
6486             new_recursive.saved_max * sizeof(int));
6487         callpat += GET(callpat, 1);
6488         }
6489       while (*callpat == OP_ALT);
6490
6491       DPRINTF(("Recursion didn't match\n"));
6492       md->recursive = new_recursive.prevrec;
6493       if (new_recursive.offset_save != stacksave)
6494         (pcre_free)(new_recursive.offset_save);
6495       RRETURN(MATCH_NOMATCH);
6496       }
6497     /* Control never reaches here */
6498
6499     /* "Once" brackets are like assertion brackets except that after a match,
6500     the point in the subject string is not moved back. Thus there can never be
6501     a move back into the brackets. Friedl calls these "atomic" subpatterns.
6502     Check the alternative branches in turn - the matching won't pass the KET
6503     for this kind of subpattern. If any one branch matches, we carry on as at
6504     the end of a normal bracket, leaving the subject pointer. */
6505
6506     case OP_ONCE:
6507       {
6508       prev = ecode;
6509       saved_eptr = eptr;
6510
6511       do
6512         {
6513         RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
6514           eptrb, match_isgroup);
6515         if (rrc == MATCH_MATCH) break;
6516         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6517         ecode += GET(ecode,1);
6518         }
6519       while (*ecode == OP_ALT);
6520
6521       /* If we hit the end of the group (which could be repeated), fail */
6522
6523       if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
6524
6525       /* Continue as from after the assertion, updating the offsets high water
6526       mark, since extracts may have been taken. */
6527
6528       do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6529
6530       offset_top = md->end_offset_top;
6531       eptr = md->end_match_ptr;
6532
6533       /* For a non-repeating ket, just continue at this level. This also
6534       happens for a repeating ket if no characters were matched in the group.
6535       This is the forcible breaking of infinite loops as implemented in Perl
6536       5.005. If there is an options reset, it will get obeyed in the normal
6537       course of events. */
6538
6539       if (*ecode == OP_KET || eptr == saved_eptr)
6540         {
6541         ecode += 1+LINK_SIZE;
6542         break;
6543         }
6544
6545       /* The repeating kets try the rest of the pattern or restart from the
6546       preceding bracket, in the appropriate order. We need to reset any options
6547       that changed within the bracket before re-running it, so check the next
6548       opcode. */
6549
6550       if (ecode[1+LINK_SIZE] == OP_OPT)
6551         {
6552         ims = (ims & ~PCRE_IMS) | ecode[4];
6553         DPRINTF(("ims set to %02lx at group repeat\n", ims));
6554         }
6555
6556       if (*ecode == OP_KETRMIN)
6557         {
6558         RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
6559         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6560         RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6561         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6562         }
6563       else  /* OP_KETRMAX */
6564         {
6565         RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6566         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6567         RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6568         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6569         }
6570       }
6571     RRETURN(MATCH_NOMATCH);
6572
6573     /* An alternation is the end of a branch; scan along to find the end of the
6574     bracketed group and go to there. */
6575
6576     case OP_ALT:
6577     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6578     break;
6579
6580     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
6581     that it may occur zero times. It may repeat infinitely, or not at all -
6582     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
6583     repeat limits are compiled as a number of copies, with the optional ones
6584     preceded by BRAZERO or BRAMINZERO. */
6585
6586     case OP_BRAZERO:
6587       {
6588       next = ecode+1;
6589       RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
6590       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6591       do next += GET(next,1); while (*next == OP_ALT);
6592       ecode = next + 1+LINK_SIZE;
6593       }
6594     break;
6595
6596     case OP_BRAMINZERO:
6597       {
6598       next = ecode+1;
6599       do next += GET(next,1); while (*next == OP_ALT);
6600       RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
6601         match_isgroup);
6602       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6603       ecode++;
6604       }
6605     break;
6606
6607     /* End of a group, repeated or non-repeating. If we are at the end of
6608     an assertion "group", stop matching and return MATCH_MATCH, but record the
6609     current high water mark for use by positive assertions. Do this also
6610     for the "once" (non-backing-up) groups. */
6611
6612     case OP_KET:
6613     case OP_KETRMIN:
6614     case OP_KETRMAX:
6615       {
6616       prev = ecode - GET(ecode, 1);
6617       saved_eptr = eptrb->epb_saved_eptr;
6618
6619       /* Back up the stack of bracket start pointers. */
6620
6621       eptrb = eptrb->epb_prev;
6622
6623       if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
6624           *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
6625           *prev == OP_ONCE)
6626         {
6627         md->end_match_ptr = eptr;      /* For ONCE */
6628         md->end_offset_top = offset_top;
6629         RRETURN(MATCH_MATCH);
6630         }
6631
6632       /* In all other cases except a conditional group we have to check the
6633       group number back at the start and if necessary complete handling an
6634       extraction by setting the offsets and bumping the high water mark. */
6635
6636       if (*prev != OP_COND)
6637         {
6638         number = *prev - OP_BRA;
6639
6640         /* For extended extraction brackets (large number), we have to fish out
6641         the number from a dummy opcode at the start. */
6642
6643         if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
6644         offset = number << 1;
6645
6646 #ifdef DEBUG
6647         printf("end bracket %d", number);
6648         printf("\n");
6649 #endif
6650
6651         /* Test for a numbered group. This includes groups called as a result
6652         of recursion. Note that whole-pattern recursion is coded as a recurse
6653         into group 0, so it won't be picked up here. Instead, we catch it when
6654         the OP_END is reached. */
6655
6656         if (number > 0)
6657           {
6658           md->capture_last = number;
6659           if (offset >= md->offset_max) md->offset_overflow = TRUE; else
6660             {
6661             md->offset_vector[offset] =
6662               md->offset_vector[md->offset_end - number];
6663             md->offset_vector[offset+1] = eptr - md->start_subject;
6664             if (offset_top <= offset) offset_top = offset + 2;
6665             }
6666
6667           /* Handle a recursively called group. Restore the offsets
6668           appropriately and continue from after the call. */
6669
6670           if (md->recursive != NULL && md->recursive->group_num == number)
6671             {
6672             recursion_info *rec = md->recursive;
6673             DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
6674             md->recursive = rec->prevrec;
6675             md->start_match = rec->save_start;
6676             memcpy(md->offset_vector, rec->offset_save,
6677               rec->saved_max * sizeof(int));
6678             ecode = rec->after_call;
6679             ims = original_ims;
6680             break;
6681             }
6682           }
6683         }
6684
6685       /* Reset the value of the ims flags, in case they got changed during
6686       the group. */
6687
6688       ims = original_ims;
6689       DPRINTF(("ims reset to %02lx\n", ims));
6690
6691       /* For a non-repeating ket, just continue at this level. This also
6692       happens for a repeating ket if no characters were matched in the group.
6693       This is the forcible breaking of infinite loops as implemented in Perl
6694       5.005. If there is an options reset, it will get obeyed in the normal
6695       course of events. */
6696
6697       if (*ecode == OP_KET || eptr == saved_eptr)
6698         {
6699         ecode += 1 + LINK_SIZE;
6700         break;
6701         }
6702
6703       /* The repeating kets try the rest of the pattern or restart from the
6704       preceding bracket, in the appropriate order. */
6705
6706       if (*ecode == OP_KETRMIN)
6707         {
6708         RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6709         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6710         RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6711         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6712         }
6713       else  /* OP_KETRMAX */
6714         {
6715         RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6716         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6717         RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6718         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6719         }
6720       }
6721
6722     RRETURN(MATCH_NOMATCH);
6723
6724     /* Start of subject unless notbol, or after internal newline if multiline */
6725
6726     case OP_CIRC:
6727     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
6728     if ((ims & PCRE_MULTILINE) != 0)
6729       {
6730       if (eptr != md->start_subject && eptr[-1] != NEWLINE)
6731         RRETURN(MATCH_NOMATCH);
6732       ecode++;
6733       break;
6734       }
6735     /* ... else fall through */
6736
6737     /* Start of subject assertion */
6738
6739     case OP_SOD:
6740     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
6741     ecode++;
6742     break;
6743
6744     /* Start of match assertion */
6745
6746     case OP_SOM:
6747     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
6748     ecode++;
6749     break;
6750
6751     /* Assert before internal newline if multiline, or before a terminating
6752     newline unless endonly is set, else end of subject unless noteol is set. */
6753
6754     case OP_DOLL:
6755     if ((ims & PCRE_MULTILINE) != 0)
6756       {
6757       if (eptr < md->end_subject)
6758         { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
6759       else
6760         { if (md->noteol) RRETURN(MATCH_NOMATCH); }
6761       ecode++;
6762       break;
6763       }
6764     else
6765       {
6766       if (md->noteol) RRETURN(MATCH_NOMATCH);
6767       if (!md->endonly)
6768         {
6769         if (eptr < md->end_subject - 1 ||
6770            (eptr == md->end_subject - 1 && *eptr != NEWLINE))
6771           RRETURN(MATCH_NOMATCH);
6772         ecode++;
6773         break;
6774         }
6775       }
6776     /* ... else fall through */
6777
6778     /* End of subject assertion (\z) */
6779
6780     case OP_EOD:
6781     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
6782     ecode++;
6783     break;
6784
6785     /* End of subject or ending \n assertion (\Z) */
6786
6787     case OP_EODN:
6788     if (eptr < md->end_subject - 1 ||
6789        (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
6790     ecode++;
6791     break;
6792
6793     /* Word boundary assertions */
6794
6795     case OP_NOT_WORD_BOUNDARY:
6796     case OP_WORD_BOUNDARY:
6797       {
6798
6799       /* Find out if the previous and current characters are "word" characters.
6800       It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
6801       be "non-word" characters. */
6802
6803 #ifdef SUPPORT_UTF8
6804       if (md->utf8)
6805         {
6806         if (eptr == md->start_subject) prev_is_word = FALSE; else
6807           {
6808           const uschar *lastptr = eptr - 1;
6809           while((*lastptr & 0xc0) == 0x80) lastptr--;
6810           GETCHAR(c, lastptr);
6811           prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
6812           }
6813         if (eptr >= md->end_subject) cur_is_word = FALSE; else
6814           {
6815           GETCHAR(c, eptr);
6816           cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
6817           }
6818         }
6819       else
6820 #endif
6821
6822       /* More streamlined when not in UTF-8 mode */
6823
6824         {
6825         prev_is_word = (eptr != md->start_subject) &&
6826           ((md->ctypes[eptr[-1]] & ctype_word) != 0);
6827         cur_is_word = (eptr < md->end_subject) &&
6828           ((md->ctypes[*eptr] & ctype_word) != 0);
6829         }
6830
6831       /* Now see if the situation is what we want */
6832
6833       if ((*ecode++ == OP_WORD_BOUNDARY)?
6834            cur_is_word == prev_is_word : cur_is_word != prev_is_word)
6835         RRETURN(MATCH_NOMATCH);
6836       }
6837     break;
6838
6839     /* Match a single character type; inline for speed */
6840
6841     case OP_ANY:
6842     if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
6843       RRETURN(MATCH_NOMATCH);
6844     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
6845 #ifdef SUPPORT_UTF8
6846     if (md->utf8)
6847       while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6848 #endif
6849     ecode++;
6850     break;
6851
6852     /* Match a single byte, even in UTF-8 mode. This opcode really does match
6853     any byte, even newline, independent of the setting of PCRE_DOTALL. */
6854
6855     case OP_ANYBYTE:
6856     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
6857     ecode++;
6858     break;
6859
6860     case OP_NOT_DIGIT:
6861     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6862     GETCHARINCTEST(c, eptr);
6863     if (
6864 #ifdef SUPPORT_UTF8
6865        c < 256 &&
6866 #endif
6867        (md->ctypes[c] & ctype_digit) != 0
6868        )
6869       RRETURN(MATCH_NOMATCH);
6870     ecode++;
6871     break;
6872
6873     case OP_DIGIT:
6874     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6875     GETCHARINCTEST(c, eptr);
6876     if (
6877 #ifdef SUPPORT_UTF8
6878        c >= 256 ||
6879 #endif
6880        (md->ctypes[c] & ctype_digit) == 0
6881        )
6882       RRETURN(MATCH_NOMATCH);
6883     ecode++;
6884     break;
6885
6886     case OP_NOT_WHITESPACE:
6887     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6888     GETCHARINCTEST(c, eptr);
6889     if (
6890 #ifdef SUPPORT_UTF8
6891        c < 256 &&
6892 #endif
6893        (md->ctypes[c] & ctype_space) != 0
6894        )
6895       RRETURN(MATCH_NOMATCH);
6896     ecode++;
6897     break;
6898
6899     case OP_WHITESPACE:
6900     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6901     GETCHARINCTEST(c, eptr);
6902     if (
6903 #ifdef SUPPORT_UTF8
6904        c >= 256 ||
6905 #endif
6906        (md->ctypes[c] & ctype_space) == 0
6907        )
6908       RRETURN(MATCH_NOMATCH);
6909     ecode++;
6910     break;
6911
6912     case OP_NOT_WORDCHAR:
6913     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6914     GETCHARINCTEST(c, eptr);
6915     if (
6916 #ifdef SUPPORT_UTF8
6917        c < 256 &&
6918 #endif
6919        (md->ctypes[c] & ctype_word) != 0
6920        )
6921       RRETURN(MATCH_NOMATCH);
6922     ecode++;
6923     break;
6924
6925     case OP_WORDCHAR:
6926     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6927     GETCHARINCTEST(c, eptr);
6928     if (
6929 #ifdef SUPPORT_UTF8
6930        c >= 256 ||
6931 #endif
6932        (md->ctypes[c] & ctype_word) == 0
6933        )
6934       RRETURN(MATCH_NOMATCH);
6935     ecode++;
6936     break;
6937
6938 #ifdef SUPPORT_UCP
6939     /* Check the next character by Unicode property. We will get here only
6940     if the support is in the binary; otherwise a compile-time error occurs. */
6941
6942     case OP_PROP:
6943     case OP_NOTPROP:
6944     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6945     GETCHARINCTEST(c, eptr);
6946       {
6947       int chartype, rqdtype;
6948       int othercase;
6949       int category = ucp_findchar(c, &chartype, &othercase);
6950
6951       rqdtype = *(++ecode);
6952       ecode++;
6953
6954       if (rqdtype >= 128)
6955         {
6956         if ((rqdtype - 128 != category) == (op == OP_PROP))
6957           RRETURN(MATCH_NOMATCH);
6958         }
6959       else
6960         {
6961         if ((rqdtype != chartype) == (op == OP_PROP))
6962           RRETURN(MATCH_NOMATCH);
6963         }
6964       }
6965     break;
6966
6967     /* Match an extended Unicode sequence. We will get here only if the support
6968     is in the binary; otherwise a compile-time error occurs. */
6969
6970     case OP_EXTUNI:
6971     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6972     GETCHARINCTEST(c, eptr);
6973       {
6974       int chartype;
6975       int othercase;
6976       int category = ucp_findchar(c, &chartype, &othercase);
6977       if (category == ucp_M) RRETURN(MATCH_NOMATCH);
6978       while (eptr < md->end_subject)
6979         {
6980         int len = 1;
6981         if (!md->utf8) c = *eptr; else
6982           {
6983           GETCHARLEN(c, eptr, len);
6984           }
6985         category = ucp_findchar(c, &chartype, &othercase);
6986         if (category != ucp_M) break;
6987         eptr += len;
6988         }
6989       }
6990     ecode++;
6991     break;
6992 #endif
6993
6994
6995     /* Match a back reference, possibly repeatedly. Look past the end of the
6996     item to see if there is repeat information following. The code is similar
6997     to that for character classes, but repeated for efficiency. Then obey
6998     similar code to character type repeats - written out again for speed.
6999     However, if the referenced string is the empty string, always treat
7000     it as matched, any number of times (otherwise there could be infinite
7001     loops). */
7002
7003     case OP_REF:
7004       {
7005       offset = GET2(ecode, 1) << 1;               /* Doubled ref number */
7006       ecode += 3;                                 /* Advance past item */
7007
7008       /* If the reference is unset, set the length to be longer than the amount
7009       of subject left; this ensures that every attempt at a match fails. We
7010       can't just fail here, because of the possibility of quantifiers with zero
7011       minima. */
7012
7013       length = (offset >= offset_top || md->offset_vector[offset] < 0)?
7014         md->end_subject - eptr + 1 :
7015         md->offset_vector[offset+1] - md->offset_vector[offset];
7016
7017       /* Set up for repetition, or handle the non-repeated case */
7018
7019       switch (*ecode)
7020         {
7021         case OP_CRSTAR:
7022         case OP_CRMINSTAR:
7023         case OP_CRPLUS:
7024         case OP_CRMINPLUS:
7025         case OP_CRQUERY:
7026         case OP_CRMINQUERY:
7027         c = *ecode++ - OP_CRSTAR;
7028         minimize = (c & 1) != 0;
7029         min = rep_min[c];                 /* Pick up values from tables; */
7030         max = rep_max[c];                 /* zero for max => infinity */
7031         if (max == 0) max = INT_MAX;
7032         break;
7033
7034         case OP_CRRANGE:
7035         case OP_CRMINRANGE:
7036         minimize = (*ecode == OP_CRMINRANGE);
7037         min = GET2(ecode, 1);
7038         max = GET2(ecode, 3);
7039         if (max == 0) max = INT_MAX;
7040         ecode += 5;
7041         break;
7042
7043         default:               /* No repeat follows */
7044         if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
7045         eptr += length;
7046         continue;              /* With the main loop */
7047         }
7048
7049       /* If the length of the reference is zero, just continue with the
7050       main loop. */
7051
7052       if (length == 0) continue;
7053
7054       /* First, ensure the minimum number of matches are present. We get back
7055       the length of the reference string explicitly rather than passing the
7056       address of eptr, so that eptr can be a register variable. */
7057
7058       for (i = 1; i <= min; i++)
7059         {
7060         if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
7061         eptr += length;
7062         }
7063
7064       /* If min = max, continue at the same level without recursion.
7065       They are not both allowed to be zero. */
7066
7067       if (min == max) continue;
7068
7069       /* If minimizing, keep trying and advancing the pointer */
7070
7071       if (minimize)
7072         {
7073         for (fi = min;; fi++)
7074           {
7075           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7076           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7077           if (fi >= max || !match_ref(offset, eptr, length, md, ims))
7078             RRETURN(MATCH_NOMATCH);
7079           eptr += length;
7080           }
7081         /* Control never gets here */
7082         }
7083
7084       /* If maximizing, find the longest string and work backwards */
7085
7086       else
7087         {
7088         pp = eptr;
7089         for (i = min; i < max; i++)
7090           {
7091           if (!match_ref(offset, eptr, length, md, ims)) break;
7092           eptr += length;
7093           }
7094         while (eptr >= pp)
7095           {
7096           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7097           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7098           eptr -= length;
7099           }
7100         RRETURN(MATCH_NOMATCH);
7101         }
7102       }
7103     /* Control never gets here */
7104
7105
7106
7107     /* Match a bit-mapped character class, possibly repeatedly. This op code is
7108     used when all the characters in the class have values in the range 0-255,
7109     and either the matching is caseful, or the characters are in the range
7110     0-127 when UTF-8 processing is enabled. The only difference between
7111     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
7112     encountered: above 255, OP_CLASS fails whereas OP_NCLASS accepts it.
7113
7114     First, look past the end of the item to see if there is repeat information
7115     following. Then obey similar code to character type repeats - written out
7116     again for speed. */
7117
7118     case OP_NCLASS:
7119     case OP_CLASS:
7120       {
7121       data = ecode + 1;                /* Save for matching */
7122       ecode += 33;                     /* Advance past opcode + 32-byte bit map */
7123
7124       switch (*ecode)
7125         {
7126         case OP_CRSTAR:
7127         case OP_CRMINSTAR:
7128         case OP_CRPLUS:
7129         case OP_CRMINPLUS:
7130         case OP_CRQUERY:
7131         case OP_CRMINQUERY:
7132         c = *ecode++ - OP_CRSTAR;         /* Offset used to index the tables */
7133         minimize = (c & 1) != 0;          /* Odd offset => minimizing form */
7134         min = rep_min[c];                 /* Pick up values from tables; */
7135         max = rep_max[c];                 /* zero for max => infinity */
7136         if (max == 0) max = INT_MAX;
7137         break;
7138
7139         case OP_CRRANGE:
7140         case OP_CRMINRANGE:
7141         minimize = (*ecode == OP_CRMINRANGE);
7142         min = GET2(ecode, 1);             /* Explicit limits follow the opcode; */
7143         max = GET2(ecode, 3);             /* zero for max => infinity */
7144         if (max == 0) max = INT_MAX;
7145         ecode += 5;                       /* Opcode plus two 2-byte values */
7146         break;
7147
7148         default:               /* No repeat follows */
7149         min = max = 1;
7150         break;
7151         }
7152
7153       /* First, ensure the minimum number of matches are present. */
7154
7155 #ifdef SUPPORT_UTF8
7156       /* UTF-8 mode */
7157       if (md->utf8)
7158         {
7159         for (i = 1; i <= min; i++)
7160           {
7161           if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7162           GETCHARINC(c, eptr);
7163           if (c > 255)         /* Not representable in the bit map */
7164             {
7165             if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
7166             }
7167           else                 /* Test bit c of the map: byte c/8, bit c&7 */
7168             {
7169             if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7170             }
7171           }
7172         }
7173       else
7174 #endif
7175       /* Not UTF-8 mode */
7176         {
7177         for (i = 1; i <= min; i++)
7178           {
7179           if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7180           c = *eptr++;
7181           if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7182           }
7183         }
7184
7185       /* If max == min we can continue with the main loop without the
7186       need to recurse. */
7187
7188       if (min == max) continue;
7189
7190       /* If minimizing, keep testing the rest of the expression and advancing
7191       the pointer while it matches the class. */
7192
7193       if (minimize)
7194         {
7195 #ifdef SUPPORT_UTF8
7196         /* UTF-8 mode */
7197         if (md->utf8)
7198           {
7199           for (fi = min;; fi++)
7200             {
7201             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7202             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7203             if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7204             GETCHARINC(c, eptr);     /* Limits were tested before this read */
7205             if (c > 255)
7206               {
7207               if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
7208               }
7209             else
7210               {
7211               if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7212               }
7213             }
7214           }
7215         else
7216 #endif
7217         /* Not UTF-8 mode */
7218           {
7219           for (fi = min;; fi++)
7220             {
7221             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7222             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7223             if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7224             c = *eptr++;
7225             if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7226             }
7227           }
7228         /* Control never gets here */
7229         }
7230
7231       /* If maximizing, find the longest possible run, then work backwards. */
7232
7233       else
7234         {
7235         pp = eptr;             /* Remember where the optional part starts */
7236
7237 #ifdef SUPPORT_UTF8
7238         /* UTF-8 mode */
7239         if (md->utf8)
7240           {
7241           for (i = min; i < max; i++)
7242             {
7243             int len = 1;
7244             if (eptr >= md->end_subject) break;
7245             GETCHARLEN(c, eptr, len);   /* eptr is advanced by len below */
7246             if (c > 255)
7247               {
7248               if (op == OP_CLASS) break;
7249               }
7250             else
7251               {
7252               if ((data[c/8] & (1 << (c&7))) == 0) break;
7253               }
7254             eptr += len;
7255             }
7256           for (;;)
7257             {
7258             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7259             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7260             if (eptr-- == pp) break;        /* Stop if tried at original pos */
7261             BACKCHAR(eptr);   /* presumably backs up to the character start - confirm */
7262             }
7263           }
7264         else
7265 #endif
7266           /* Not UTF-8 mode */
7267           {
7268           for (i = min; i < max; i++)
7269             {
7270             if (eptr >= md->end_subject) break;
7271             c = *eptr;
7272             if ((data[c/8] & (1 << (c&7))) == 0) break;
7273             eptr++;
7274             }
7275           while (eptr >= pp)      /* Try every length from longest down to min */
7276             {
7277             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7278             eptr--;               /* Stepped back before the result is tested */
7279             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7280             }
7281           }
7282
7283         RRETURN(MATCH_NOMATCH);
7284         }
7285       }
7286     /* Control never gets here */
7287
7288
7289     /* Match an extended character class. This opcode is encountered only
7290     in UTF-8 mode, because that's the only time it is compiled. */
7291
7292 #ifdef SUPPORT_UTF8
7293     case OP_XCLASS:
7294       {
7295       data = ecode + 1 + LINK_SIZE;                /* Save for matching */
7296       ecode += GET(ecode, 1);                      /* Advance past the item */
7297
7298       switch (*ecode)
7299         {
7300         case OP_CRSTAR:
7301         case OP_CRMINSTAR:
7302         case OP_CRPLUS:
7303         case OP_CRMINPLUS:
7304         case OP_CRQUERY:
7305         case OP_CRMINQUERY:
7306         c = *ecode++ - OP_CRSTAR;         /* Offset used to index the tables */
7307         minimize = (c & 1) != 0;          /* Odd offset => minimizing form */
7308         min = rep_min[c];                 /* Pick up values from tables; */
7309         max = rep_max[c];                 /* zero for max => infinity */
7310         if (max == 0) max = INT_MAX;
7311         break;
7312
7313         case OP_CRRANGE:
7314         case OP_CRMINRANGE:
7315         minimize = (*ecode == OP_CRMINRANGE);
7316         min = GET2(ecode, 1);             /* Explicit limits follow the opcode; */
7317         max = GET2(ecode, 3);             /* zero for max => infinity */
7318         if (max == 0) max = INT_MAX;
7319         ecode += 5;                       /* Opcode plus two 2-byte values */
7320         break;
7321
7322         default:               /* No repeat follows */
7323         min = max = 1;
7324         break;
7325         }
7326
7327       /* First, ensure the minimum number of matches are present. */
7328
7329       for (i = 1; i <= min; i++)
7330         {
7331         if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7332         GETCHARINC(c, eptr);
7333         if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH);
7334         }
7335
7336       /* If max == min we can continue with the main loop without the
7337       need to recurse. */
7338
7339       if (min == max) continue;
7340
7341       /* If minimizing, keep testing the rest of the expression and advancing
7342       the pointer while it matches the class. */
7343
7344       if (minimize)
7345         {
7346         for (fi = min;; fi++)
7347           {
7348           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7349           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7350           if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7351           GETCHARINC(c, eptr);       /* Limits were tested before this read */
7352           if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH);
7353           }
7354         /* Control never gets here */
7355         }
7356
7357       /* If maximizing, find the longest possible run, then work backwards. */
7358
7359       else
7360         {
7361         pp = eptr;             /* Remember where the optional part starts */
7362         for (i = min; i < max; i++)
7363           {
7364           int len = 1;
7365           if (eptr >= md->end_subject) break;
7366           GETCHARLEN(c, eptr, len);     /* eptr is advanced by len below */
7367           if (!match_xclass(c, data)) break;
7368           eptr += len;
7369           }
7370         for(;;)
7371           {
7372           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7373           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7374           if (eptr-- == pp) break;        /* Stop if tried at original pos */
7375           BACKCHAR(eptr);   /* Terminated explicitly, as at the other call sites */
7376           }
7377         RRETURN(MATCH_NOMATCH);
7378         }
7379
7380       /* Control never gets here */
7381       }
7382 #endif    /* End of XCLASS */
7383
7384     /* Match a single character, casefully. In UTF-8 mode the pattern character
7385     may occupy several bytes, which are compared with the subject bytewise. */
7386     case OP_CHAR:
7387 #ifdef SUPPORT_UTF8
7388     if (md->utf8)
7389       {
7390       length = 1;
7391       ecode++;                           /* Step past the opcode byte */
7392       GETCHARLEN(fc, ecode, length);     /* presumably leaves ecode unmoved - confirm */
7393       if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7394       while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
7395       }
7396     else
7397 #endif
7398
7399     /* Non-UTF-8 mode: one opcode byte followed by one data byte */
7400       {
7401       if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
7402       if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
7403       ecode += 2;
7404       }
7405     break;
7406
7407     /* Match a single character, caselessly. Bytes are compared through the
7408     md->lcc table; larger UTF-8 characters need Unicode property support. */
7409     case OP_CHARNC:
7410 #ifdef SUPPORT_UTF8
7411     if (md->utf8)
7412       {
7413       length = 1;
7414       ecode++;                           /* Step past the opcode byte */
7415       GETCHARLEN(fc, ecode, length);     /* presumably leaves ecode unmoved - confirm */
7416
7417       if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7418
7419       /* If the pattern character's value is < 128, we have only one byte, and
7420       can use the fast lookup table. */
7421
7422       if (fc < 128)
7423         {
7424         if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7425         }
7426
7427       /* Otherwise we must pick up the subject character */
7428
7429       else
7430         {
7431         int dc;
7432         GETCHARINC(dc, eptr);
7433         ecode += length;                 /* Step past the pattern character */
7434
7435         /* If we have Unicode property support, we can use it to test the other
7436         case of the character, if there is one. The result of ucp_findchar() is
7437         < 0 if the char isn't found, and othercase is returned as zero if there
7438         isn't one. Without SUPPORT_UCP, any difference is simply a mismatch. */
7439
7440         if (fc != dc)
7441           {
7442 #ifdef SUPPORT_UCP
7443           int chartype;
7444           int othercase;
7445           if (ucp_findchar(fc, &chartype, &othercase) < 0 || dc != othercase)
7446 #endif
7447             RRETURN(MATCH_NOMATCH);
7448           }
7449         }
7450       }
7451     else
7452 #endif   /* SUPPORT_UTF8 */
7453
7454     /* Non-UTF-8 mode: one opcode byte followed by one data byte */
7455       {
7456       if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
7457       if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7458       ecode += 2;
7459       }
7460     break;
7461
7462     /* Match a single character repeatedly; different opcodes share code. Each
7463     group below only sets min, max and minimize, then joins REPEATCHAR. */
7464     case OP_EXACT:
7465     min = max = GET2(ecode, 1);       /* minimize is not read when min == max */
7466     ecode += 3;                       /* Opcode plus 2-byte count */
7467     goto REPEATCHAR;
7468
7469     case OP_UPTO:
7470     case OP_MINUPTO:
7471     min = 0;
7472     max = GET2(ecode, 1);
7473     minimize = *ecode == OP_MINUPTO;
7474     ecode += 3;                       /* Opcode plus 2-byte count */
7475     goto REPEATCHAR;
7476
7477     case OP_STAR:
7478     case OP_MINSTAR:
7479     case OP_PLUS:
7480     case OP_MINPLUS:
7481     case OP_QUERY:
7482     case OP_MINQUERY:
7483     c = *ecode++ - OP_STAR;           /* Offset used to index the tables */
7484     minimize = (c & 1) != 0;          /* Odd offset => minimizing form */
7485     min = rep_min[c];                 /* Pick up values from tables; */
7486     max = rep_max[c];                 /* zero for max => infinity */
7487     if (max == 0) max = INT_MAX;
7488     /* Fall through into REPEATCHAR */
7489     /* Common code for all repeated single-character matches. For a one-byte
7490     character we can give up quickly if fewer than the minimum number of bytes
7491     are left; multibyte characters are bounds-checked one at a time, because
7492     the other-case form may have a different UTF-8 length. */
7493     REPEATCHAR:
7494 #ifdef SUPPORT_UTF8
7495     if (md->utf8)
7496       {
7497       length = 1;
7498       charptr = ecode;                 /* Start of the pattern character */
7499       GETCHARLEN(fc, ecode, length);
7500       if (length == 1 && min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7501       ecode += length;
7502
7503       /* Handle multibyte character matching specially here. There is
7504       support for caseless matching if UCP support is present. */
7505
7506       if (length > 1)
7507         {
7508         int oclength = 0;              /* Stays 0 when there is no other case */
7509         uschar occhars[8];
7510
7511 #ifdef SUPPORT_UCP
7512         int othercase;
7513         int chartype;
7514         if ((ims & PCRE_CASELESS) != 0 &&
7515              ucp_findchar(fc, &chartype, &othercase) >= 0 &&
7516              othercase > 0)
7517           oclength = ord2utf8(othercase, occhars);
7518 #endif  /* SUPPORT_UCP */
7519         /* Never hand memcmp() more bytes than remain in the subject. */
7520         for (i = 1; i <= min; i++)
7521           {
7522           if (length <= md->end_subject - eptr &&
7523               memcmp(eptr, charptr, length) == 0)
7524             eptr += length;
7525           else if (oclength > 0 && oclength <= md->end_subject - eptr &&
7526                    memcmp(eptr, occhars, oclength) == 0)
7527             eptr += oclength;
7528           else
7529             RRETURN(MATCH_NOMATCH);
7530           }
7531
7532         if (min == max) continue;
7533
7534         if (minimize)
7535           {
7536           for (fi = min;; fi++)
7537             {
7538             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7539             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7540             if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7541             if (length <= md->end_subject - eptr &&
7542                 memcmp(eptr, charptr, length) == 0)
7543               eptr += length;
7544             else if (oclength > 0 && oclength <= md->end_subject - eptr &&
7545                      memcmp(eptr, occhars, oclength) == 0)
7546               eptr += oclength;
7547             else
7548               RRETURN(MATCH_NOMATCH);
7549             }
7550           /* Control never gets here */
7551           }
7552         else
7553           {
7554           pp = eptr;           /* Remember where the optional part starts */
7555           for (i = min; i < max; i++)
7556             {
7557             if (length <= md->end_subject - eptr &&
7558                 memcmp(eptr, charptr, length) == 0)
7559               eptr += length;
7560             else if (oclength > 0 && oclength <= md->end_subject - eptr &&
7561                      memcmp(eptr, occhars, oclength) == 0)
7562               eptr += oclength;
7563             else
7564               break;
7565             }
7566           for (;;)             /* Back up by whole characters, not by length */
7567             {
7568             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7569             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7570             if (eptr-- == pp) RRETURN(MATCH_NOMATCH);  /* Tried at original pos */
7571             BACKCHAR(eptr);
7572             }
7573           }
7574         /* Control never gets here */
7575         }
7576
7577       /* If the length of a UTF-8 character is 1, we fall through here, and
7578       obey the code as for non-UTF-8 characters below, though in this case the
7579       value of fc will always be < 128. */
7580       }
7581     else
7582 #endif  /* SUPPORT_UTF8 */
7583
7584     /* When not in UTF-8 mode, load a single-byte character. */
7585       {
7586       if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7587       fc = *ecode++;
7588       }
7589
7590     /* The value of fc at this point is always less than 256, though we may or
7591     may not be in UTF-8 mode. The code is duplicated for the caseless and
7592     caseful cases, for speed, since matching characters is likely to be quite
7593     common. First, ensure the minimum number of matches are present. If min =
7594     max, continue at the same level without recursing. Otherwise, if
7595     minimizing, keep trying the rest of the expression and advancing one
7596     matching character if failing, up to the maximum. Alternatively, if
7597     maximizing, find the maximum number of characters and work backwards. */
7598
7599     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
7600       max, eptr));
7601
7602     if ((ims & PCRE_CASELESS) != 0)
7603       {
7604       fc = md->lcc[fc];        /* Map once; subject bytes are mapped as read */
7605       for (i = 1; i <= min; i++)   /* Entry code checked min bytes remain */
7606         if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7607       if (min == max) continue;
7608       if (minimize)
7609         {
7610         for (fi = min;; fi++)
7611           {
7612           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7613           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7614           if (fi >= max || eptr >= md->end_subject ||
7615               fc != md->lcc[*eptr++])    /* Read only after the limit tests */
7616             RRETURN(MATCH_NOMATCH);
7617           }
7618         /* Control never gets here */
7619         }
7620       else
7621         {
7622         pp = eptr;             /* Remember where the optional part starts */
7623         for (i = min; i < max; i++)
7624           {
7625           if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
7626           eptr++;
7627           }
7628         while (eptr >= pp)     /* Try every length from longest down to min */
7629           {
7630           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7631           eptr--;              /* Stepped back before the result is tested */
7632           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7633           }
7634         RRETURN(MATCH_NOMATCH);
7635         }
7636       /* Control never gets here */
7637       }
7638
7639     /* Caseful comparisons (includes all multi-byte characters) */
7640
7641     else
7642       {
7643       for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
7644       if (min == max) continue;
7645       if (minimize)
7646         {
7647         for (fi = min;; fi++)
7648           {
7649           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7650           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7651           if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
7652             RRETURN(MATCH_NOMATCH);
7653           }
7654         /* Control never gets here */
7655         }
7656       else
7657         {
7658         pp = eptr;             /* Remember where the optional part starts */
7659         for (i = min; i < max; i++)
7660           {
7661           if (eptr >= md->end_subject || fc != *eptr) break;
7662           eptr++;
7663           }
7664         while (eptr >= pp)     /* Try every length from longest down to min */
7665           {
7666           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7667           eptr--;              /* Stepped back before the result is tested */
7668           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7669           }
7670         RRETURN(MATCH_NOMATCH);
7671         }
7672       }
7673     /* Control never gets here */
7674
7675     /* Match a negated single one-byte character. The pattern character is one
7676     byte, but the subject character we are checking can be multibyte. */
7677
7678     case OP_NOT:
7679     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7680     ecode++;                           /* Step past the opcode byte */
7681     GETCHARINCTEST(c, eptr);
7682     if ((ims & PCRE_CASELESS) != 0)
7683       {
7684 #ifdef SUPPORT_UTF8
7685       if (c < 256)                     /* md->lcc is indexed only below 256 */
7686 #endif
7687       c = md->lcc[c];
7688       if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
7689       }
7690     else
7691       {
7692       if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
7693       }
7694     break;
7695
7696     /* Match a negated single one-byte character repeatedly. This is almost a
7697     repeat of the code for a repeated single character, but I haven't found a
7698     nice way of commoning these up that doesn't require a test of the
7699     positive/negative option for each character match. Maybe that wouldn't add
7700     very much to the time taken, but character matching *is* what this is all
7701     about... */
7702
7703     case OP_NOTEXACT:
7704     min = max = GET2(ecode, 1);       /* minimize is not read when min == max */
7705     ecode += 3;                       /* Opcode plus 2-byte count */
7706     goto REPEATNOTCHAR;
7707
7708     case OP_NOTUPTO:
7709     case OP_NOTMINUPTO:
7710     min = 0;
7711     max = GET2(ecode, 1);
7712     minimize = *ecode == OP_NOTMINUPTO;
7713     ecode += 3;                       /* Opcode plus 2-byte count */
7714     goto REPEATNOTCHAR;
7715
7716     case OP_NOTSTAR:
7717     case OP_NOTMINSTAR:
7718     case OP_NOTPLUS:
7719     case OP_NOTMINPLUS:
7720     case OP_NOTQUERY:
7721     case OP_NOTMINQUERY:
7722     c = *ecode++ - OP_NOTSTAR;        /* Offset used to index the tables */
7723     minimize = (c & 1) != 0;          /* Odd offset => minimizing form */
7724     min = rep_min[c];                 /* Pick up values from tables; */
7725     max = rep_max[c];                 /* zero for max => infinity */
7726     if (max == 0) max = INT_MAX;
7727     /* Fall through into REPEATNOTCHAR */
7728     /* Common code for all repeated negated single-byte matches. We can give up
7729     quickly if there are fewer than the minimum number of bytes left in the
7730     subject; in UTF-8 mode each character read is also checked individually. */
7731
7732     REPEATNOTCHAR:
7733     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7734     fc = *ecode++;
7735
7736     /* The code is duplicated for the caseless and caseful cases, for speed,
7737     since matching characters is likely to be quite common. First, ensure the
7738     minimum number of matches are present. If min = max, continue at the same
7739     level without recursing. Otherwise, if minimizing, keep trying the rest of
7740     the expression and advancing one matching character if failing, up to the
7741     maximum. Alternatively, if maximizing, find the maximum number of
7742     characters and work backwards. */
7743
7744     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
7745       max, eptr));
7746
7747     if ((ims & PCRE_CASELESS) != 0)
7748       {
7749       fc = md->lcc[fc];
7750
7751 #ifdef SUPPORT_UTF8
7752       if (md->utf8)                    /* UTF-8 mode */
7753         {
7754         register int d;
7755         for (i = 1; i <= min; i++)
7756           {
7757           if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7758           GETCHARINC(d, eptr);
7759           if (d < 256) d = md->lcc[d];
7760           if (fc == d) RRETURN(MATCH_NOMATCH);
7761           }
7762         }
7763       else
7764 #endif
7765
7766       /* Not UTF-8 mode */
7767         {
7768         for (i = 1; i <= min; i++)
7769           if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7770         }
7771
7772       if (min == max) continue;
7773
7774       if (minimize)
7775         {
7776 #ifdef SUPPORT_UTF8
7777         /* UTF-8 mode: test the limits before reading the next character */
7778         if (md->utf8)
7779           {
7780           register int d;
7781           for (fi = min;; fi++)
7782             {
7783             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7784             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7785             if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7786             GETCHARINC(d, eptr);
7787             if (d < 256) d = md->lcc[d];
7788             if (fc == d) RRETURN(MATCH_NOMATCH);
7789             }
7790           }
7791         else
7792 #endif
7793         /* Not UTF-8 mode */
7794           {
7795           for (fi = min;; fi++)
7796             {
7797             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7798             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7799             if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
7800               RRETURN(MATCH_NOMATCH);
7801             }
7802           }
7803         /* Control never gets here */
7804         }
7805
7806       /* Maximize case */
7807
7808       else
7809         {
7810         pp = eptr;
7811
7812 #ifdef SUPPORT_UTF8
7813         /* UTF-8 mode */
7814         if (md->utf8)
7815           {
7816           register int d;
7817           for (i = min; i < max; i++)
7818             {
7819             int len = 1;
7820             if (eptr >= md->end_subject) break;
7821             GETCHARLEN(d, eptr, len);
7822             if (d < 256) d = md->lcc[d];
7823             if (fc == d) break;
7824             eptr += len;
7825             }
7826           for(;;)
7827             {
7828             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7829             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7830             if (eptr-- == pp) break;        /* Stop if tried at original pos */
7831             BACKCHAR(eptr);
7832             }
7833           }
7834         else
7835 #endif
7836         /* Not UTF-8 mode */
7837           {
7838           for (i = min; i < max; i++)
7839             {
7840             if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
7841             eptr++;
7842             }
7843           while (eptr >= pp)
7844             {
7845             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7846             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7847             eptr--;
7848             }
7849           }
7850
7851         RRETURN(MATCH_NOMATCH);
7852         }
7853       /* Control never gets here */
7854       }
7855
7856     /* Caseful comparisons */
7857
7858     else
7859       {
7860 #ifdef SUPPORT_UTF8
7861       if (md->utf8)                    /* UTF-8 mode */
7862         {
7863         register int d;
7864         for (i = 1; i <= min; i++)
7865           {
7866           if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7867           GETCHARINC(d, eptr);
7868           if (fc == d) RRETURN(MATCH_NOMATCH);
7869           }
7870         }
7871       else
7872 #endif
7873       /* Not UTF-8 mode */
7874         {
7875         for (i = 1; i <= min; i++)
7876           if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
7877         }
7878
7879       if (min == max) continue;
7880
7881       if (minimize)
7882         {
7883 #ifdef SUPPORT_UTF8
7884         /* UTF-8 mode: test the limits before reading the next character */
7885         if (md->utf8)
7886           {
7887           register int d;
7888           for (fi = min;; fi++)
7889             {
7890             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7891             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7892             if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7893             GETCHARINC(d, eptr);
7894             if (fc == d) RRETURN(MATCH_NOMATCH);
7895             }
7896           }
7897         else
7898 #endif
7899         /* Not UTF-8 mode */
7900           {
7901           for (fi = min;; fi++)
7902             {
7903             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7904             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7905             if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
7906               RRETURN(MATCH_NOMATCH);
7907             }
7908           }
7909         /* Control never gets here */
7910         }
7911
7912       /* Maximize case */
7913
7914       else
7915         {
7916         pp = eptr;
7917
7918 #ifdef SUPPORT_UTF8
7919         /* UTF-8 mode */
7920         if (md->utf8)
7921           {
7922           register int d;
7923           for (i = min; i < max; i++)
7924             {
7925             int len = 1;
7926             if (eptr >= md->end_subject) break;
7927             GETCHARLEN(d, eptr, len);
7928             if (fc == d) break;
7929             eptr += len;
7930             }
7931           for(;;)
7932             {
7933             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7934             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7935             if (eptr-- == pp) break;        /* Stop if tried at original pos */
7936             BACKCHAR(eptr);
7937             }
7938           }
7939         else
7940 #endif
7941         /* Not UTF-8 mode */
7942           {
7943           for (i = min; i < max; i++)
7944             {
7945             if (eptr >= md->end_subject || fc == *eptr) break;
7946             eptr++;
7947             }
7948           while (eptr >= pp)
7949             {
7950             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7951             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7952             eptr--;
7953             }
7954           }
7955
7956         RRETURN(MATCH_NOMATCH);
7957         }
7958       }
7959     /* Control never gets here */
7960
7961     /* Match a single character type repeatedly; several different opcodes
7962     share code. This is very similar to the code for single characters, but we
7963     repeat it in the interests of efficiency. */
7964
7965     case OP_TYPEEXACT:
7966     min = max = GET2(ecode, 1);
7967     minimize = TRUE;
7968     ecode += 3;
7969     goto REPEATTYPE;
7970
7971     case OP_TYPEUPTO:
7972     case OP_TYPEMINUPTO:
7973     min = 0;
7974     max = GET2(ecode, 1);
7975     minimize = *ecode == OP_TYPEMINUPTO;
7976     ecode += 3;
7977     goto REPEATTYPE;
7978
7979     case OP_TYPESTAR:
7980     case OP_TYPEMINSTAR:
7981     case OP_TYPEPLUS:
7982     case OP_TYPEMINPLUS:
7983     case OP_TYPEQUERY:
7984     case OP_TYPEMINQUERY:
7985     c = *ecode++ - OP_TYPESTAR;
7986     minimize = (c & 1) != 0;
7987     min = rep_min[c];                 /* Pick up values from tables; */
7988     max = rep_max[c];                 /* zero for max => infinity */
7989     if (max == 0) max = INT_MAX;
7990
7991     /* Common code for all repeated single character type matches. Note that
7992     in UTF-8 mode, '.' matches a character of any length, but for the other
7993     character types, the valid characters are all one-byte long. */
7994
7995     REPEATTYPE:
7996     ctype = *ecode++;      /* Code for the character type */
7997
7998 #ifdef SUPPORT_UCP
7999     if (ctype == OP_PROP || ctype == OP_NOTPROP)
8000       {
8001       prop_fail_result = ctype == OP_NOTPROP;
8002       prop_type = *ecode++;
8003       if (prop_type >= 128)
8004         {
8005         prop_test_against = prop_type - 128;
8006         prop_test_variable = &prop_category;
8007         }
8008       else
8009         {
8010         prop_test_against = prop_type;
8011         prop_test_variable = &prop_chartype;
8012         }
8013       }
8014     else prop_type = -1;
8015 #endif
8016
8017     /* First, ensure the minimum number of matches are present. Use inline
8018     code for maximizing the speed, and do the type test once at the start
8019     (i.e. keep it out of the loop). Also we can test that there are at least
8020     the minimum number of bytes before we start. This isn't as effective in
8021     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
8022     is tidier. Also separate the UCP code, which can be the same for both UTF-8
8023     and single-bytes. */
8024
8025     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
8026     if (min > 0)
8027       {
8028 #ifdef SUPPORT_UCP
8029       if (prop_type > 0)
8030         {
8031         for (i = 1; i <= min; i++)
8032           {
8033           GETCHARINC(c, eptr);
8034           prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8035           if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8036             RRETURN(MATCH_NOMATCH);
8037           }
8038         }
8039
8040       /* Match extended Unicode sequences. We will get here only if the
8041       support is in the binary; otherwise a compile-time error occurs. */
8042
8043       else if (ctype == OP_EXTUNI)
8044         {
8045         for (i = 1; i <= min; i++)
8046           {
8047           GETCHARINCTEST(c, eptr);
8048           prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8049           if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
8050           while (eptr < md->end_subject)
8051             {
8052             int len = 1;
8053             if (!md->utf8) c = *eptr; else
8054               {
8055               GETCHARLEN(c, eptr, len);
8056               }
8057             prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8058             if (prop_category != ucp_M) break;
8059             eptr += len;
8060             }
8061           }
8062         }
8063
8064       else
8065 #endif     /* SUPPORT_UCP */
8066
8067 /* Handle all other cases when the coding is UTF-8 */
8068
8069 #ifdef SUPPORT_UTF8
8070       if (md->utf8) switch(ctype)
8071         {
8072         case OP_ANY:
8073         for (i = 1; i <= min; i++)
8074           {
8075           if (eptr >= md->end_subject ||
8076              (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
8077             RRETURN(MATCH_NOMATCH);
8078           while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8079           }
8080         break;
8081
8082         case OP_ANYBYTE:
8083         eptr += min;
8084         break;
8085
8086         case OP_NOT_DIGIT:
8087         for (i = 1; i <= min; i++)
8088           {
8089           if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8090           GETCHARINC(c, eptr);
8091           if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
8092             RRETURN(MATCH_NOMATCH);
8093           }
8094         break;
8095
8096         case OP_DIGIT:
8097         for (i = 1; i <= min; i++)
8098           {
8099           if (eptr >= md->end_subject ||
8100              *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
8101             RRETURN(MATCH_NOMATCH);
8102           /* No need to skip more bytes - we know it's a 1-byte character */
8103           }
8104         break;
8105
8106         case OP_NOT_WHITESPACE:
8107         for (i = 1; i <= min; i++)
8108           {   /* Test the first byte in place; the while below consumes the char */
8109           if (eptr >= md->end_subject ||
8110              (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
8111             RRETURN(MATCH_NOMATCH);
8112           while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
8113           }
8114         break;
8115
8116         case OP_WHITESPACE:
8117         for (i = 1; i <= min; i++)
8118           {
8119           if (eptr >= md->end_subject ||
8120              *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
8121             RRETURN(MATCH_NOMATCH);
8122           /* No need to skip more bytes - we know it's a 1-byte character */
8123           }
8124         break;
8125
8126         case OP_NOT_WORDCHAR:
8127         for (i = 1; i <= min; i++)
8128           {   /* Test the first byte in place; the while below consumes the char */
8129           if (eptr >= md->end_subject ||
8130              (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
8131             RRETURN(MATCH_NOMATCH);
8132           while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
8133           }
8134         break;
8135
8136         case OP_WORDCHAR:
8137         for (i = 1; i <= min; i++)
8138           {
8139           if (eptr >= md->end_subject ||
8140              *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
8141             RRETURN(MATCH_NOMATCH);
8142           /* No need to skip more bytes - we know it's a 1-byte character */
8143           }
8144         break;
8145
8146         default:
8147         RRETURN(PCRE_ERROR_INTERNAL);
8148         }  /* End switch(ctype) */
8149
8150       else
8151 #endif     /* SUPPORT_UTF8 */
8152
8153       /* Code for the non-UTF-8 case for minimum matching of operators other
8154       than OP_PROP and OP_NOTPROP. */
8155
8156       switch(ctype)
8157         {
8158         case OP_ANY:
8159         if ((ims & PCRE_DOTALL) == 0)
8160           {
8161           for (i = 1; i <= min; i++)
8162             if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
8163           }
8164         else eptr += min;
8165         break;
8166
8167         case OP_ANYBYTE:
8168         eptr += min;
8169         break;
8170
8171         case OP_NOT_DIGIT:
8172         for (i = 1; i <= min; i++)
8173           if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
8174         break;
8175
8176         case OP_DIGIT:
8177         for (i = 1; i <= min; i++)
8178           if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
8179         break;
8180
8181         case OP_NOT_WHITESPACE:
8182         for (i = 1; i <= min; i++)
8183           if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
8184         break;
8185
8186         case OP_WHITESPACE:
8187         for (i = 1; i <= min; i++)
8188           if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
8189         break;
8190
8191         case OP_NOT_WORDCHAR:
8192         for (i = 1; i <= min; i++)
8193           if ((md->ctypes[*eptr++] & ctype_word) != 0)
8194             RRETURN(MATCH_NOMATCH);
8195         break;
8196
8197         case OP_WORDCHAR:
8198         for (i = 1; i <= min; i++)
8199           if ((md->ctypes[*eptr++] & ctype_word) == 0)
8200             RRETURN(MATCH_NOMATCH);
8201         break;
8202
8203         default:
8204         RRETURN(PCRE_ERROR_INTERNAL);
8205         }
8206       }
8207
8208     /* If min = max, continue at the same level without recursing */
8209
8210     if (min == max) continue;
8211
8212     /* If minimizing, we have to test the rest of the pattern before each
8213     subsequent match. Again, separate the UTF-8 case for speed, and also
8214     separate the UCP cases. */
8215
8216     if (minimize)
8217       {
8218 #ifdef SUPPORT_UCP
8219       if (prop_type > 0)
8220         {
8221         for (fi = min;; fi++)
8222           {
8223           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8224           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8225           if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8226           GETCHARINC(c, eptr);
8227           prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8228           if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8229             RRETURN(MATCH_NOMATCH);
8230           }
8231         }
8232
8233       /* Match extended Unicode sequences. We will get here only if the
8234       support is in the binary; otherwise a compile-time error occurs. */
8235
8236       else if (ctype == OP_EXTUNI)
8237         {
8238         for (fi = min;; fi++)
8239           {
8240           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8241           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8242           if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8243           GETCHARINCTEST(c, eptr);
8244           prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8245           if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
8246           while (eptr < md->end_subject)
8247             {
8248             int len = 1;
8249             if (!md->utf8) c = *eptr; else
8250               {
8251               GETCHARLEN(c, eptr, len);
8252               }
8253             prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8254             if (prop_category != ucp_M) break;
8255             eptr += len;
8256             }
8257           }
8258         }
8259
8260       else
8261 #endif     /* SUPPORT_UCP */
8262
8263 #ifdef SUPPORT_UTF8
8264       /* UTF-8 mode */
8265       if (md->utf8)
8266         {
8267         for (fi = min;; fi++)
8268           {
8269           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8270           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8271           if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8272
8273           GETCHARINC(c, eptr);
8274           switch(ctype)
8275             {
8276             case OP_ANY:
8277             if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
8278             break;
8279
8280             case OP_ANYBYTE:
8281             break;
8282
8283             case OP_NOT_DIGIT:
8284             if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
8285               RRETURN(MATCH_NOMATCH);
8286             break;
8287
8288             case OP_DIGIT:
8289             if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
8290               RRETURN(MATCH_NOMATCH);
8291             break;
8292
8293             case OP_NOT_WHITESPACE:
8294             if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
8295               RRETURN(MATCH_NOMATCH);
8296             break;
8297
8298             case OP_WHITESPACE:
8299             if  (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
8300               RRETURN(MATCH_NOMATCH);
8301             break;
8302
8303             case OP_NOT_WORDCHAR:
8304             if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
8305               RRETURN(MATCH_NOMATCH);
8306             break;
8307
8308             case OP_WORDCHAR:  /* || short-circuits: ctypes[c] is read only if c < 256 */
8309             if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
8310               RRETURN(MATCH_NOMATCH);
8311             break;
8312
8313             default:
8314             RRETURN(PCRE_ERROR_INTERNAL);
8315             }
8316           }
8317         }
8318       else
8319 #endif
8320       /* Not UTF-8 mode */
8321         {
8322         for (fi = min;; fi++)
8323           {
8324           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8325           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8326           if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8327           c = *eptr++;
8328           switch(ctype)
8329             {
8330             case OP_ANY:
8331             if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
8332             break;
8333
8334             case OP_ANYBYTE:
8335             break;
8336
8337             case OP_NOT_DIGIT:
8338             if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
8339             break;
8340
8341             case OP_DIGIT:
8342             if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
8343             break;
8344
8345             case OP_NOT_WHITESPACE:
8346             if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
8347             break;
8348
8349             case OP_WHITESPACE:
8350             if  ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
8351             break;
8352
8353             case OP_NOT_WORDCHAR:
8354             if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
8355             break;
8356
8357             case OP_WORDCHAR:
8358             if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
8359             break;
8360
8361             default:
8362             RRETURN(PCRE_ERROR_INTERNAL);
8363             }
8364           }
8365         }
8366       /* Control never gets here */
8367       }
8368
8369     /* If maximizing it is worth using inline code for speed, doing the type
8370     test once at the start (i.e. keep it out of the loop). Again, keep the
8371     UTF-8 and UCP stuff separate. */
8372
8373     else
8374       {
8375       pp = eptr;  /* Remember where we started */
8376
8377 #ifdef SUPPORT_UCP
8378       if (prop_type > 0)
8379         {
8380         for (i = min; i < max; i++)
8381           {
8382           int len = 1;
8383           if (eptr >= md->end_subject) break;
8384           GETCHARLEN(c, eptr, len);
8385           prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8386           if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8387             break;
8388           eptr+= len;
8389           }
8390
8391         /* eptr is now past the end of the maximum run */
8392
8393         for(;;)
8394           {
8395           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8396           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8397           if (eptr-- == pp) break;        /* Stop if tried at original pos */
8398           BACKCHAR(eptr);
8399           }
8400         }
8401
8402       /* Match extended Unicode sequences. We will get here only if the
8403       support is in the binary; otherwise a compile-time error occurs. */
8404
8405       else if (ctype == OP_EXTUNI)
8406         {
8407         for (i = min; i < max; i++)
8408           {
8409           if (eptr >= md->end_subject) break;
8410           GETCHARINCTEST(c, eptr);
8411           prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8412           if (prop_category == ucp_M) break;
8413           while (eptr < md->end_subject)
8414             {
8415             int len = 1;
8416             if (!md->utf8) c = *eptr; else
8417               {
8418               GETCHARLEN(c, eptr, len);
8419               }
8420             prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8421             if (prop_category != ucp_M) break;
8422             eptr += len;
8423             }
8424           }
8425
8426         /* eptr is now past the end of the maximum run */
8427
8428         for(;;)
8429           {
8430           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8431           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8432           if (eptr-- == pp) break;        /* Stop if tried at original pos */
8433           for (;;)                        /* Move back over one extended */
8434             {
8435             int len = 1;
8436             BACKCHAR(eptr);
8437             if (!md->utf8) c = *eptr; else
8438               {
8439               GETCHARLEN(c, eptr, len);
8440               }
8441             prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8442             if (prop_category != ucp_M) break;
8443             eptr--;
8444             }
8445           }
8446         }
8447
8448       else
8449 #endif   /* SUPPORT_UCP */
8450
8451 #ifdef SUPPORT_UTF8
8452       /* UTF-8 mode */
8453
8454       if (md->utf8)
8455         {
8456         switch(ctype)
8457           {
8458           case OP_ANY:
8459
8460           /* Special code is required for UTF8, but when the maximum is unlimited
8461           we don't need it, so we repeat the non-UTF8 code. This is probably
8462           worth it, because .* is quite a common idiom. */
8463
8464           if (max < INT_MAX)
8465             {
8466             if ((ims & PCRE_DOTALL) == 0)
8467               {
8468               for (i = min; i < max; i++)
8469                 {
8470                 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8471                 eptr++;
8472                 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8473                 }
8474               }
8475             else
8476               {
8477               for (i = min; i < max && eptr < md->end_subject; i++)
8478                 {   /* Bounded by the subject end, like the non-DOTALL loop above */
8479                 eptr++;
8480                 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8481                 }
8482               }
8483             }
8484
8485           /* Handle unlimited UTF-8 repeat */
8486
8487           else
8488             {
8489             if ((ims & PCRE_DOTALL) == 0)
8490               {
8491               for (i = min; i < max; i++)
8492                 {
8493                 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8494                 eptr++;
8495                 }
8496               break;
8497               }
8498             else
8499               {
8500               c = max - min;
8501               if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8502               eptr += c;
8503               }
8504             }
8505           break;
8506
8507           /* The byte case is the same as non-UTF8 */
8508
8509           case OP_ANYBYTE:
8510           c = max - min;
8511           if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8512           eptr += c;
8513           break;
8514
8515           case OP_NOT_DIGIT:
8516           for (i = min; i < max; i++)
8517             {
8518             int len = 1;
8519             if (eptr >= md->end_subject) break;
8520             GETCHARLEN(c, eptr, len);
8521             if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
8522             eptr+= len;
8523             }
8524           break;
8525
8526           case OP_DIGIT:
8527           for (i = min; i < max; i++)
8528             {
8529             int len = 1;
8530             if (eptr >= md->end_subject) break;
8531             GETCHARLEN(c, eptr, len);
8532             if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
8533             eptr+= len;
8534             }
8535           break;
8536
8537           case OP_NOT_WHITESPACE:
8538           for (i = min; i < max; i++)
8539             {
8540             int len = 1;
8541             if (eptr >= md->end_subject) break;
8542             GETCHARLEN(c, eptr, len);
8543             if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
8544             eptr+= len;
8545             }
8546           break;
8547
8548           case OP_WHITESPACE:
8549           for (i = min; i < max; i++)
8550             {
8551             int len = 1;
8552             if (eptr >= md->end_subject) break;
8553             GETCHARLEN(c, eptr, len);
8554             if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
8555             eptr+= len;
8556             }
8557           break;
8558
8559           case OP_NOT_WORDCHAR:
8560           for (i = min; i < max; i++)
8561             {
8562             int len = 1;
8563             if (eptr >= md->end_subject) break;
8564             GETCHARLEN(c, eptr, len);
8565             if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
8566             eptr+= len;
8567             }
8568           break;
8569
8570           case OP_WORDCHAR:
8571           for (i = min; i < max; i++)
8572             {
8573             int len = 1;
8574             if (eptr >= md->end_subject) break;
8575             GETCHARLEN(c, eptr, len);
8576             if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
8577             eptr+= len;
8578             }
8579           break;
8580
8581           default:
8582           RRETURN(PCRE_ERROR_INTERNAL);
8583           }
8584
8585         /* eptr is now past the end of the maximum run */
8586
8587         for(;;)
8588           {
8589           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8590           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8591           if (eptr-- == pp) break;        /* Stop if tried at original pos */
8592           BACKCHAR(eptr);
8593           }
8594         }
8595       else
8596 #endif
8597
8598       /* Not UTF-8 mode */
8599         {
8600         switch(ctype)
8601           {
8602           case OP_ANY:
8603           if ((ims & PCRE_DOTALL) == 0)
8604             {
8605             for (i = min; i < max; i++)
8606               {
8607               if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8608               eptr++;
8609               }
8610             break;
8611             }
8612           /* For DOTALL case, fall through and treat as \C */
8613
8614           case OP_ANYBYTE:
8615           c = max - min;
8616           if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8617           eptr += c;
8618           break;
8619
8620           case OP_NOT_DIGIT:
8621           for (i = min; i < max; i++)
8622             {
8623             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
8624               break;
8625             eptr++;
8626             }
8627           break;
8628
8629           case OP_DIGIT:
8630           for (i = min; i < max; i++)
8631             {
8632             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
8633               break;
8634             eptr++;
8635             }
8636           break;
8637
8638           case OP_NOT_WHITESPACE:
8639           for (i = min; i < max; i++)
8640             {
8641             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
8642               break;
8643             eptr++;
8644             }
8645           break;
8646
8647           case OP_WHITESPACE:
8648           for (i = min; i < max; i++)
8649             {
8650             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
8651               break;
8652             eptr++;
8653             }
8654           break;
8655
8656           case OP_NOT_WORDCHAR:
8657           for (i = min; i < max; i++)
8658             {
8659             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
8660               break;
8661             eptr++;
8662             }
8663           break;
8664
8665           case OP_WORDCHAR:
8666           for (i = min; i < max; i++)
8667             {
8668             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
8669               break;
8670             eptr++;
8671             }
8672           break;
8673
8674           default:
8675           RRETURN(PCRE_ERROR_INTERNAL);
8676           }
8677
8678         /* eptr is now past the end of the maximum run */
8679
8680         while (eptr >= pp)
8681           {
8682           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8683           eptr--;
8684           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8685           }
8686         }
8687
8688       /* Get here if we can't make it match with any permitted repetitions */
8689
8690       RRETURN(MATCH_NOMATCH);
8691       }
8692     /* Control never gets here */
8693
8694     /* There's been some horrible disaster. Since all codes > OP_BRA are
8695     for capturing brackets, and there shouldn't be any gaps between 0 and
8696     OP_BRA, arrival here can only mean there is something seriously wrong
8697     in the code above or the OP_xxx definitions. */
8698
8699     default:
8700     DPRINTF(("Unknown opcode %d\n", *ecode));
8701     RRETURN(PCRE_ERROR_UNKNOWN_NODE);
8702     }
8703
8704   /* Do not stick any code in here without much thought; it is assumed
8705   that "continue" in the code above comes out to here to repeat the main
8706   loop. */
8707
8708   }             /* End of main loop */
8709 /* Control never reaches here */
8710 }
8711
8712
8713 /***************************************************************************
8714 ****************************************************************************
8715                    RECURSION IN THE match() FUNCTION
8716
8717 Undefine all the macros that were defined above to handle this. */
8718
8719 #ifdef NO_RECURSE  /* Drop the recursion-handling macro names (see heading above) */
8720 #undef eptr
8721 #undef ecode
8722 #undef offset_top
8723 #undef ims
8724 #undef eptrb
8725 #undef flags
8726
8727 #undef callpat
8728 #undef charptr
8729 #undef data
8730 #undef next
8731 #undef pp
8732 #undef prev
8733 #undef saved_eptr
8734
8735 #undef new_recursive
8736
8737 #undef cur_is_word
8738 #undef condition
8739 #undef minimize
8740 #undef prev_is_word
8741
8742 #undef original_ims
8743
8744 #undef ctype
8745 #undef length
8746 #undef max
8747 #undef min
8748 #undef number
8749 #undef offset
8750 #undef op
8751 #undef save_capture_last
8752 #undef save_offset1
8753 #undef save_offset2
8754 #undef save_offset3
8755 #undef stacksave
8756
8757 #undef newptrb
8758
8759 #endif  /* NO_RECURSE */
8760
8761 /* These two are defined as macros in both cases, so undefine them unconditionally */
8762
8763 #undef fc
8764 #undef fi
8765
8766 /***************************************************************************
8767 ***************************************************************************/
8768
8769
8770
8771 /*************************************************
8772 *         Execute a Regular Expression           *
8773 *************************************************/
8774
8775 /* This function applies a compiled re to a subject string and picks out
8776 portions of the string if it matches. Two elements in the vector are set for
8777 each substring: the offsets to the start and end of the substring.
8778
8779 Arguments:
8780   argument_re     points to the compiled expression
8781   extra_data      points to extra data or is NULL
8782   subject         points to the subject string
8783   length          length of subject string (may contain binary zeros)
8784   start_offset    where to start in the subject string
8785   options         option bits
8786   offsets         points to a vector of ints to be filled in with offsets
8787   offsetcount     the number of elements in the vector
8788
8789 Returns:          > 0 => success; value is the number of elements filled in
8790                   = 0 => success, but offsets is not big enough
8791                    -1 => failed to match
8792                  < -1 => some kind of unexpected problem
8793 */
8794
8795 EXPORT int
8796 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
8797   const char *subject, int length, int start_offset, int options, int *offsets,
8798   int offsetcount)
8799 {
8800 int rc, resetcount, ocount;
8801 int first_byte = -1;
8802 int req_byte = -1;
8803 int req_byte2 = -1;
8804 unsigned long int ims = 0;
8805 BOOL using_temporary_offsets = FALSE;
8806 BOOL anchored;
8807 BOOL startline;
8808 BOOL first_byte_caseless = FALSE;
8809 BOOL req_byte_caseless = FALSE;
8810 match_data match_block;
8811 const uschar *tables;
8812 const uschar *start_bits = NULL;
8813 const uschar *start_match = (const uschar *)subject + start_offset;
8814 const uschar *end_subject;
8815 const uschar *req_byte_ptr = start_match - 1;
8816
8817 pcre_study_data internal_study;
8818 const pcre_study_data *study;
8819
8820 real_pcre internal_re;
8821 const real_pcre *external_re = (const real_pcre *)argument_re;
8822 const real_pcre *re = external_re;
8823
8824 /* Plausibility checks */
8825
8826 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
8827 if (re == NULL || subject == NULL ||
8828    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
8829 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
8830
8831 /* Fish out the optional data from the extra_data structure, first setting
8832 the default values. */
8833
8834 study = NULL;
8835 match_block.match_limit = MATCH_LIMIT;
8836 match_block.callout_data = NULL;
8837
8838 /* The table pointer is always in native byte order. */
8839
8840 tables = external_re->tables;
8841
8842 if (extra_data != NULL)
8843   {
8844   register unsigned int flags = extra_data->flags;
8845   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
8846     study = (const pcre_study_data *)extra_data->study_data;
8847   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
8848     match_block.match_limit = extra_data->match_limit;
8849   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
8850     match_block.callout_data = extra_data->callout_data;
8851   if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
8852   }
8853
8854 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
8855 is a feature that makes it possible to save compiled regex and re-use them
8856 in other programs later. */
8857
8858 if (tables == NULL) tables = pcre_default_tables;
8859
8860 /* Check that the first field in the block is the magic number. If it is not,
8861 test for a regex that was compiled on a host of opposite endianness. If this is
8862 the case, flipped values are put in internal_re and internal_study if there was
8863 study data too. */
8864
8865 if (re->magic_number != MAGIC_NUMBER)
8866   {
8867   re = try_flipped(re, &internal_re, study, &internal_study);
8868   if (re == NULL) return PCRE_ERROR_BADMAGIC;
8869   if (study != NULL) study = &internal_study;
8870   }
8871
8872 /* Set up other data */
8873
8874 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
8875 startline = (re->options & PCRE_STARTLINE) != 0;
8876
8877 /* The code starts after the real_pcre block and the capture name table. */
8878
8879 match_block.start_code = (const uschar *)external_re + re->name_table_offset +
8880   re->name_count * re->name_entry_size;
8881
8882 match_block.start_subject = (const uschar *)subject;
8883 match_block.start_offset = start_offset;
8884 match_block.end_subject = match_block.start_subject + length;
8885 end_subject = match_block.end_subject;
8886
8887 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
8888 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
8889
8890 match_block.notbol = (options & PCRE_NOTBOL) != 0;
8891 match_block.noteol = (options & PCRE_NOTEOL) != 0;
8892 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
8893 match_block.partial = (options & PCRE_PARTIAL) != 0;
8894 match_block.hitend = FALSE;
8895
8896 match_block.recursive = NULL;                   /* No recursion at top level */
8897
8898 match_block.lcc = tables + lcc_offset;
8899 match_block.ctypes = tables + ctypes_offset;
8900
8901 /* Partial matching is supported only for a restricted set of regexes at the
8902 moment. */
8903
8904 if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
8905   return PCRE_ERROR_BADPARTIAL;
8906
8907 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
8908 back the character offset. */
8909
8910 #ifdef SUPPORT_UTF8
8911 if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
8912   {
8913   if (valid_utf8((uschar *)subject, length) >= 0)
8914     return PCRE_ERROR_BADUTF8;
8915   if (start_offset > 0 && start_offset < length)
8916     {
8917     int tb = ((uschar *)subject)[start_offset];
8918     if (tb > 127)
8919       {
8920       tb &= 0xc0;
8921       if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
8922       }
8923     }
8924   }
8925 #endif
8926
8927 /* The ims options can vary during the matching as a result of the presence
8928 of (?ims) items in the pattern. They are kept in a local variable so that
8929 restoring at the exit of a group is easy. */
8930
8931 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
8932
8933 /* If the expression has got more back references than the offsets supplied can
8934 hold, we get a temporary chunk of working store to use during the matching.
8935 Otherwise, we can use the vector supplied, rounding down its size to a multiple
8936 of 3. */
8937
8938 ocount = offsetcount - (offsetcount % 3);
8939
8940 if (re->top_backref > 0 && re->top_backref >= ocount/3)
8941   {
8942   ocount = re->top_backref * 3 + 3;
8943   match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
8944   if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
8945   using_temporary_offsets = TRUE;
8946   DPRINTF(("Got memory to hold back references\n"));
8947   }
8948 else match_block.offset_vector = offsets;
8949
8950 match_block.offset_end = ocount;
8951 match_block.offset_max = (2*ocount)/3;
8952 match_block.offset_overflow = FALSE;
8953 match_block.capture_last = -1;
8954
8955 /* Compute the minimum number of offsets that we need to reset each time. Doing
8956 this makes a huge difference to execution time when there aren't many brackets
8957 in the pattern. */
8958
8959 resetcount = 2 + re->top_bracket * 2;
8960 if (resetcount > offsetcount) resetcount = ocount;
8961
8962 /* Reset the working variable associated with each extraction. These should
8963 never be used unless previously set, but they get saved and restored, and so we
8964 initialize them to avoid reading uninitialized locations. */
8965
8966 if (match_block.offset_vector != NULL)
8967   {
8968   register int *iptr = match_block.offset_vector + ocount;
8969   register int *iend = iptr - resetcount/2 + 1;
8970   while (--iptr >= iend) *iptr = -1;
8971   }
8972
8973 /* Set up the first character to match, if available. The first_byte value is
8974 never set for an anchored regular expression, but the anchoring may be forced
8975 at run time, so we have to test for anchoring. The first char may be unset for
8976 an unanchored pattern, of course. If there's no first char and the pattern was
8977 studied, there may be a bitmap of possible first characters. */
8978
8979 if (!anchored)
8980   {
8981   if ((re->options & PCRE_FIRSTSET) != 0)
8982     {
8983     first_byte = re->first_byte & 255;
8984     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
8985       first_byte = match_block.lcc[first_byte];
8986     }
8987   else
8988     if (!startline && study != NULL &&
8989       (study->options & PCRE_STUDY_MAPPED) != 0)
8990         start_bits = study->start_bits;
8991   }
8992
8993 /* For anchored or unanchored matches, there may be a "last known required
8994 character" set. */
8995
8996 if ((re->options & PCRE_REQCHSET) != 0)
8997   {
8998   req_byte = re->req_byte & 255;
8999   req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
9000   req_byte2 = (tables + fcc_offset)[req_byte];  /* case flipped */
9001   }
9002
9003 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
9004 the loop runs just once. */
9005
9006 do
9007   {
9008   /* Reset the maximum number of extractions we might see. */
9009
9010   if (match_block.offset_vector != NULL)
9011     {
9012     register int *iptr = match_block.offset_vector;
9013     register int *iend = iptr + resetcount;
9014     while (iptr < iend) *iptr++ = -1;
9015     }
9016
9017   /* Advance to a unique first char if possible */
9018
9019   if (first_byte >= 0)
9020     {
9021     if (first_byte_caseless)
9022       while (start_match < end_subject &&
9023              match_block.lcc[*start_match] != first_byte)
9024         start_match++;
9025     else
9026       while (start_match < end_subject && *start_match != first_byte)
9027         start_match++;
9028     }
9029
9030   /* Or to just after \n for a multiline match if possible */
9031
9032   else if (startline)
9033     {
9034     if (start_match > match_block.start_subject + start_offset)
9035       {
9036       while (start_match < end_subject && start_match[-1] != NEWLINE)
9037         start_match++;
9038       }
9039     }
9040
9041   /* Or to a non-unique first char after study */
9042
9043   else if (start_bits != NULL)
9044     {
9045     while (start_match < end_subject)
9046       {
9047       register unsigned int c = *start_match;
9048       if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
9049       }
9050     }
9051
9052 #ifdef DEBUG  /* Sigh. Some compilers never learn. */
9053   printf(">>>> Match against: ");
9054   pchars(start_match, end_subject - start_match, TRUE, &match_block);
9055   printf("\n");
9056 #endif
9057
9058   /* If req_byte is set, we know that that character must appear in the subject
9059   for the match to succeed. If the first character is set, req_byte must be
9060   later in the subject; otherwise the test starts at the match point. This
9061   optimization can save a huge amount of backtracking in patterns with nested
9062   unlimited repeats that aren't going to match. Writing separate code for
9063   cased/caseless versions makes it go faster, as does using an autoincrement
9064   and backing off on a match.
9065
9066   HOWEVER: when the subject string is very, very long, searching to its end can
9067   take a long time, and give bad performance on quite ordinary patterns. This
9068   showed up when somebody was matching /^C/ on a 32-megabyte string... so we
9069   don't do this when the string is sufficiently long.
9070
9071   ALSO: this processing is disabled when partial matching is requested.
9072   */
9073
9074   if (req_byte >= 0 &&
9075       end_subject - start_match < REQ_BYTE_MAX &&
9076       !match_block.partial)
9077     {
9078     register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
9079
9080     /* We don't need to repeat the search if we haven't yet reached the
9081     place we found it at last time. */
9082
9083     if (p > req_byte_ptr)
9084       {
9085       if (req_byte_caseless)
9086         {
9087         while (p < end_subject)
9088           {
9089           register int pp = *p++;
9090           if (pp == req_byte || pp == req_byte2) { p--; break; }
9091           }
9092         }
9093       else
9094         {
9095         while (p < end_subject)
9096           {
9097           if (*p++ == req_byte) { p--; break; }
9098           }
9099         }
9100
9101       /* If we can't find the required character, break the matching loop */
9102
9103       if (p >= end_subject) break;
9104
9105       /* If we have found the required character, save the point where we
9106       found it, so that we don't search again next time round the loop if
9107       the start hasn't passed this character yet. */
9108
9109       req_byte_ptr = p;
9110       }
9111     }
9112
9113   /* When a match occurs, substrings will be set for all internal extractions;
9114   we just need to set up the whole thing as substring 0 before returning. If
9115   there were too many extractions, set the return code to zero. In the case
9116   where we had to get some local store to hold offsets for backreferences, copy
9117   those back references that we can. In this case there need not be overflow
9118   if certain parts of the pattern were not used. */
9119
9120   match_block.start_match = start_match;
9121   match_block.match_call_count = 0;
9122
9123   rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
9124     match_isgroup);
9125
9126   if (rc == MATCH_NOMATCH)
9127     {
9128     start_match++;
9129 #ifdef SUPPORT_UTF8
9130     if (match_block.utf8)
9131       while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
9132         start_match++;
9133 #endif
9134     continue;
9135     }
9136
9137   if (rc != MATCH_MATCH)
9138     {
9139     DPRINTF((">>>> error: returning %d\n", rc));
9140     return rc;
9141     }
9142
9143   /* We have a match! Copy the offset information from temporary store if
9144   necessary */
9145
9146   if (using_temporary_offsets)
9147     {
9148     if (offsetcount >= 4)
9149       {
9150       memcpy(offsets + 2, match_block.offset_vector + 2,
9151         (offsetcount - 2) * sizeof(int));
9152       DPRINTF(("Copied offsets from temporary memory\n"));
9153       }
9154     if (match_block.end_offset_top > offsetcount)
9155       match_block.offset_overflow = TRUE;
9156
9157     DPRINTF(("Freeing temporary memory\n"));
9158     (pcre_free)(match_block.offset_vector);
9159     }
9160
9161   rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
9162
9163   if (offsetcount < 2) rc = 0; else
9164     {
9165     offsets[0] = start_match - match_block.start_subject;
9166     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
9167     }
9168
9169   DPRINTF((">>>> returning %d\n", rc));
9170   return rc;
9171   }
9172
9173 /* This "while" is the end of the "do" above */
9174
9175 while (!anchored && start_match <= end_subject);
9176
9177 if (using_temporary_offsets)
9178   {
9179   DPRINTF(("Freeing temporary memory\n"));
9180   (pcre_free)(match_block.offset_vector);
9181   }
9182
9183 if (match_block.partial && match_block.hitend)
9184   {
9185   DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
9186   return PCRE_ERROR_PARTIAL;
9187   }
9188 else
9189   {
9190   DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
9191   return PCRE_ERROR_NOMATCH;
9192   }
9193 }
9194
9195 /* End of pcre.c */