/* Origin: pcre.c from the newlisp.git repository ("Initial commit of
newLISP."), blob f70386e586c085822fa082e11dc1646a18999e34. */

/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/*
This is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language. See
the file Tech.Notes for some information on the internals.

Written by: Philip Hazel <ph10@cam.ac.uk>

           Copyright (c) 1997-2004 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/

/* Define DEBUG to get debugging output on stdout. */

/* #define DEBUG */

/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
inline, and there are *still* stupid compilers about that don't like indented
pre-processor statements. I suppose it's only been 10 years...

The macro takes a single argument that is itself a parenthesized printf
argument list, e.g. DPRINTF(("value=%d\n", n)); when DEBUG is not defined the
whole call expands to nothing. */

#ifdef DEBUG
#define DPRINTF(p) printf p
#else
#define DPRINTF(p) /*nothing*/
#endif

57 /* Include the internals header, which itself includes "config.h", the Standard
58 C headers, and the external pcre header. */
60 #include "pcre-internal.h"
62 /* If Unicode Property support is wanted, include a private copy of the
63 function that does it, and the table that translates names to numbers. */
65 #ifdef SUPPORT_UCP
66 #include "ucp.c"
67 #include "ucptypetable.c"
68 #endif
/* Maximum number of items on the nested bracket stacks at compile time. This
applies to the nesting of all kinds of parentheses. It does not limit
un-nested, non-capturing parentheses. This number can be made bigger if
necessary - it is used to dimension one int and one unsigned char vector at
compile time. */

#define BRASTACK_SIZE 200

/* Maximum number of ints of offset to save on the stack for recursive calls.
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
because the offset vector is always a multiple of 3 long. */

#define REC_STACK_SAVE_MAX 30

/* The maximum remaining length of subject we are prepared to search for a
req_byte match. */

#define REQ_BYTE_MAX 1000

92 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
93 the definition is next to the definition of the opcodes in internal.h. */
95 static const uschar OP_lengths[] = { OP_LENGTHS };
/* Min and max values for the common repeats; for the maxima, 0 => infinity.
The two tables are parallel: entry i of each describes the same repeat. */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };

102 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
103 are simple data values; negative values are for special things like \d and so
104 on. Zero means further processing is needed (for things like \x), or the escape
105 is invalid. */
107 #if !EBCDIC /* This is the "normal" table for ASCII systems */
108 static const short int escapes[] = {
109 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
110 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
111 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
112 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
113 -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
114 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
115 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
116 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
117 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
118 0, 0, -ESC_z /* x - z */
121 #else /* This is the "abnormal" table for EBCDIC systems */
122 static const short int escapes[] = {
123 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
124 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
125 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
126 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
127 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
128 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
129 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
130 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
131 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
132 /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
133 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
134 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
135 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
136 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
137 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
138 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
139 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
140 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
141 /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
142 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
143 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
144 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
145 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
147 #endif
150 /* Tables of names of POSIX character classes and their lengths. The list is
151 terminated by a zero length entry. The first three must be alpha, upper, lower,
152 as this is assumed for handling case independence. */
154 static const char *const posix_names[] = {
155 "alpha", "lower", "upper",
156 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
157 "print", "punct", "space", "word", "xdigit" };
159 static const uschar posix_name_lengths[] = {
160 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
162 /* Table of class bit maps for each POSIX class; up to three may be combined
163 to form the class. The table for [:blank:] is dynamically modified to remove
164 the vertical space characters. */
166 static const int posix_class_maps[] = {
167 cbit_lower, cbit_upper, -1, /* alpha */
168 cbit_lower, -1, -1, /* lower */
169 cbit_upper, -1, -1, /* upper */
170 cbit_digit, cbit_lower, cbit_upper, /* alnum */
171 cbit_print, cbit_cntrl, -1, /* ascii */
172 cbit_space, -1, -1, /* blank - a GNU extension */
173 cbit_cntrl, -1, -1, /* cntrl */
174 cbit_digit, -1, -1, /* digit */
175 cbit_graph, -1, -1, /* graph */
176 cbit_print, -1, -1, /* print */
177 cbit_punct, -1, -1, /* punct */
178 cbit_space, -1, -1, /* space */
179 cbit_word, -1, -1, /* word - a Perl extension */
180 cbit_xdigit,-1, -1 /* xdigit */
/* Table to identify digits and hex digits. This is used when compiling
patterns. Note that the tables in chartables are dependent on the locale, and
may mark arbitrary characters as digits - but the PCRE compiling code expects
to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
a private table here. It costs 256 bytes, but it is a lot faster than doing
character value tests (at least in some simple cases I timed), and in some
applications one wants PCRE to compile efficiently as well as match
efficiently.

For convenience, we use the same bit definitions as in chartables:

  0x04   decimal digit
  0x08   hexadecimal digit

Then we can use ctype_digit and ctype_xdigit in the code. */

#if !EBCDIC    /* This is the "normal" case, for ASCII systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */

#else          /* This is the "abnormal" case, for EBCDIC systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- ¬     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */

static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- ¬  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
#endif

307 /* Definition to allow mutual recursion */
309 static BOOL
310 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
311 BOOL, int, int *, int *, branch_chain *, compile_data *);
313 /* Structure for building a chain of data that actually lives on the
314 stack, for holding the values of the subject pointer at the start of each
315 subpattern, so as to detect when an empty string has been matched by a
316 subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
317 are on the heap, not on the stack. */
319 typedef struct eptrblock {
320 struct eptrblock *epb_prev;
321 const uschar *epb_saved_eptr;
322 } eptrblock;
/* Flag bits for the match() function */

#define match_condassert   0x01    /* Called to check a condition assertion */
#define match_isgroup      0x02    /* Set if start of bracketed group */

/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

337 /*************************************************
338 * Global variables *
339 *************************************************/
341 /* PCRE is thread-clean and doesn't use any global variables in the normal
342 sense. However, it calls memory allocation and free functions via the four
343 indirections below, and it can optionally do callouts. These values can be
344 changed by the caller, but are shared between all threads. However, when
345 compiling for Virtual Pascal, things are done differently (see pcre.in). */
347 #ifndef VPCOMPAT
348 #ifdef __cplusplus
349 extern "C" void *(*pcre_malloc)(size_t) = malloc;
350 extern "C" void (*pcre_free)(void *) = free;
351 extern "C" void *(*pcre_stack_malloc)(size_t) = malloc;
352 extern "C" void (*pcre_stack_free)(void *) = free;
353 extern "C" int (*pcre_callout)(pcre_callout_block *) = NULL;
354 #else
355 void *(*pcre_malloc)(size_t) = malloc;
356 void (*pcre_free)(void *) = free;
357 void *(*pcre_stack_malloc)(size_t) = malloc;
358 void (*pcre_stack_free)(void *) = free;
359 int (*pcre_callout)(pcre_callout_block *) = NULL;
360 #endif
361 #endif
/*************************************************
*    Macros and tables for character handling    *
*************************************************/

/* When UTF-8 encoding is being used, a character is no longer just a single
byte. The macros for character handling generate simple sequences when used in
byte-mode, and more complicated ones for UTF-8 characters.

Each macro expands to one or more complete statements (the simple forms already
end in a semicolon), and each evaluates its eptr argument more than once in the
UTF-8 forms, so arguments must be free of side effects. */

#ifndef SUPPORT_UTF8
#define GETCHAR(c, eptr) c = *eptr;
#define GETCHARINC(c, eptr) c = *eptr++;
#define GETCHARINCTEST(c, eptr) c = *eptr++;
#define GETCHARLEN(c, eptr, len) c = *eptr;
#define BACKCHAR(eptr)

#else   /* SUPPORT_UTF8 */

/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. */

#define GETCHAR(c, eptr) \
  c = *eptr; \
  if ((c & 0xc0) == 0xc0) \
    { \
    int gcii; \
    int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & utf8_table3[gcaa]) << gcss; \
    for (gcii = 1; gcii <= gcaa; gcii++) \
      { \
      gcss -= 6; \
      c |= (eptr[gcii] & 0x3f) << gcss; \
      } \
    }

/* Get the next UTF-8 character, advancing the pointer. This is called when we
know we are in UTF-8 mode. */

#define GETCHARINC(c, eptr) \
  c = *eptr++; \
  if ((c & 0xc0) == 0xc0) \
    { \
    int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & utf8_table3[gcaa]) << gcss; \
    while (gcaa-- > 0) \
      { \
      gcss -= 6; \
      c |= (*eptr++ & 0x3f) << gcss; \
      } \
    }

/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
The expansion refers to a variable named md, which must be in scope at the
point of use. */

#define GETCHARINCTEST(c, eptr) \
  c = *eptr++; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & utf8_table3[gcaa]) << gcss; \
    while (gcaa-- > 0) \
      { \
      gcss -= 6; \
      c |= (*eptr++ & 0x3f) << gcss; \
      } \
    }

/* Get the next UTF-8 character, not advancing the pointer, incrementing length
if there are extra bytes. This is called when we know we are in UTF-8 mode. */

#define GETCHARLEN(c, eptr, len) \
  c = *eptr; \
  if ((c & 0xc0) == 0xc0) \
    { \
    int gcii; \
    int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & utf8_table3[gcaa]) << gcss; \
    for (gcii = 1; gcii <= gcaa; gcii++) \
      { \
      gcss -= 6; \
      c |= (eptr[gcii] & 0x3f) << gcss; \
      } \
    len += gcaa; \
    }

/* If the pointer is not at the start of a character, move it back until
it is. Called only in UTF-8 mode. */

#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;

#endif

460 /*************************************************
461 * Default character tables *
462 *************************************************/
464 /* A default set of character tables is included in the PCRE binary. Its source
465 is built by the maketables auxiliary program, which uses the default C ctypes
466 functions, and put in the file chartables.c. These tables are used by PCRE
467 whenever the caller of pcre_compile() does not provide an alternate set of
468 tables. */
470 #include "pcre-chartables.c"
474 #ifdef SUPPORT_UTF8
475 /*************************************************
476 * Tables for UTF-8 support *
477 *************************************************/
479 /* These are the breakpoints for different numbers of bytes in a UTF-8
480 character. */
482 static const int utf8_table1[] =
483 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
485 /* These are the indicator bits and the mask for the data bits to set in the
486 first byte of a character, indexed by the number of additional bytes. */
488 static const int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
489 static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
491 /* Table of the number of extra characters, indexed by the first character
492 masked with 0x3f. The highest number for a valid UTF-8 character is in fact
493 0x3d. */
495 static const uschar utf8_table4[] = {
496 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
497 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
498 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
499 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
502 /*************************************************
503 * Convert character value to UTF-8 *
504 *************************************************/
506 /* This function takes an integer value in the range 0 - 0x7fffffff
507 and encodes it as a UTF-8 character in 0 to 6 bytes.
509 Arguments:
510 cvalue the character value
511 buffer pointer to buffer for result - at least 6 bytes long
513 Returns: number of characters placed in the buffer
516 static int
517 ord2utf8(int cvalue, uschar *buffer)
519 register int i, j;
520 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
521 if (cvalue <= utf8_table1[i]) break;
522 buffer += i;
523 for (j = i; j > 0; j--)
525 *buffer-- = 0x80 | (cvalue & 0x3f);
526 cvalue >>= 6;
528 *buffer = utf8_table2[i] | cvalue;
529 return i + 1;
531 #endif
/*************************************************
*         Print compiled regex                   *
*************************************************/

/* The code for doing this is held in a separate file that is also included in
pcretest.c. It defines a function called print_internals(). */

#ifdef DEBUG
#include "printint.c"
#endif

548 /*************************************************
549 * Return version string *
550 *************************************************/
552 #define STRING(a) # a
553 #define XSTRING(s) STRING(s)
555 EXPORT const char *
556 pcre_version(void)
558 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
/*************************************************
*         Flip bytes in an integer               *
*************************************************/

/* This function is called when the magic number in a regex doesn't match in
order to flip its bytes to see if we are dealing with a pattern that was
compiled on a host of different endianness. If so, this function is used to
flip other byte values.

Arguments:
  value        the number to flip
  n            the number of bytes to flip (assumed to be 2 or 4)

Returns:       the flipped value
*/

static long int
byteflip(long int value, int n)
{
/* Do the masking and shifting on an unsigned copy: shifting a byte into the
top bits of a signed long overflows when long is 32 bits wide. */
unsigned long int v = (unsigned long int)value;

if (n == 2)
  return (long int)(((v & 0x00ffUL) << 8) | ((v & 0xff00UL) >> 8));

return (long int)(((v & 0x000000ffUL) << 24) |
                  ((v & 0x0000ff00UL) <<  8) |
                  ((v & 0x00ff0000UL) >>  8) |
                  ((v & 0xff000000UL) >> 24));
}

592 /*************************************************
593 * Test for a byte-flipped compiled regex *
594 *************************************************/
596 /* This function is called from pce_exec() and also from pcre_fullinfo(). Its
597 job is to test whether the regex is byte-flipped - that is, it was compiled on
598 a system of opposite endianness. The function is called only when the native
599 MAGIC_NUMBER test fails. If the regex is indeed flipped, we flip all the
600 relevant values into a different data block, and return it.
602 Arguments:
603 re points to the regex
604 study points to study data, or NULL
605 internal_re points to a new regex block
606 internal_study points to a new study block
608 Returns: the new block if is is indeed a byte-flipped regex
609 NULL if it is not
612 static real_pcre *
613 try_flipped(const real_pcre *re, real_pcre *internal_re,
614 const pcre_study_data *study, pcre_study_data *internal_study)
616 if (byteflip(re->magic_number, sizeof(re->magic_number)) != MAGIC_NUMBER)
617 return NULL;
619 *internal_re = *re; /* To copy other fields */
620 internal_re->size = byteflip(re->size, sizeof(re->size));
621 internal_re->options = byteflip(re->options, sizeof(re->options));
622 internal_re->top_bracket = byteflip(re->top_bracket, sizeof(re->top_bracket));
623 internal_re->top_backref = byteflip(re->top_backref, sizeof(re->top_backref));
624 internal_re->first_byte = byteflip(re->first_byte, sizeof(re->first_byte));
625 internal_re->req_byte = byteflip(re->req_byte, sizeof(re->req_byte));
626 internal_re->name_table_offset = byteflip(re->name_table_offset,
627 sizeof(re->name_table_offset));
628 internal_re->name_entry_size = byteflip(re->name_entry_size,
629 sizeof(re->name_entry_size));
630 internal_re->name_count = byteflip(re->name_count, sizeof(re->name_count));
632 if (study != NULL)
634 *internal_study = *study; /* To copy other fields */
635 internal_study->size = byteflip(study->size, sizeof(study->size));
636 internal_study->options = byteflip(study->options, sizeof(study->options));
639 return internal_re;
644 /*************************************************
645 * (Obsolete) Return info about compiled pattern *
646 *************************************************/
648 /* This is the original "info" function. It picks potentially useful data out
649 of the private structure, but its interface was too rigid. It remains for
650 backwards compatibility. The public options are passed back in an int - though
651 the re->options field has been expanded to a long int, all the public options
652 at the low end of it, and so even on 16-bit systems this will still be OK.
653 Therefore, I haven't changed the API for pcre_info().
655 Arguments:
656 argument_re points to compiled code
657 optptr where to pass back the options
658 first_byte where to pass back the first character,
659 or -1 if multiline and all branches start ^,
660 or -2 otherwise
662 Returns: number of capturing subpatterns
663 or negative values on error
666 EXPORT int
667 pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
669 real_pcre internal_re;
670 const real_pcre *re = (const real_pcre *)argument_re;
671 if (re == NULL) return PCRE_ERROR_NULL;
672 if (re->magic_number != MAGIC_NUMBER)
674 re = try_flipped(re, &internal_re, NULL, NULL);
675 if (re == NULL) return PCRE_ERROR_BADMAGIC;
677 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
678 if (first_byte != NULL)
679 *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
680 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
681 return re->top_bracket;
686 /*************************************************
687 * Return info about compiled pattern *
688 *************************************************/
690 /* This is a newer "info" function which has an extensible interface so
691 that additional items can be added compatibly.
693 Arguments:
694 argument_re points to compiled code
695 extra_data points extra data, or NULL
696 what what information is required
697 where where to put the information
699 Returns: 0 if data returned, negative on error
702 EXPORT int
703 pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
704 void *where)
706 real_pcre internal_re;
707 pcre_study_data internal_study;
708 const real_pcre *re = (const real_pcre *)argument_re;
709 const pcre_study_data *study = NULL;
711 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
713 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
714 study = (const pcre_study_data *)extra_data->study_data;
716 if (re->magic_number != MAGIC_NUMBER)
718 re = try_flipped(re, &internal_re, study, &internal_study);
719 if (re == NULL) return PCRE_ERROR_BADMAGIC;
720 if (study != NULL) study = &internal_study;
723 switch (what)
725 case PCRE_INFO_OPTIONS:
726 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
727 break;
729 case PCRE_INFO_SIZE:
730 *((size_t *)where) = re->size;
731 break;
733 case PCRE_INFO_STUDYSIZE:
734 *((size_t *)where) = (study == NULL)? 0 : study->size;
735 break;
737 case PCRE_INFO_CAPTURECOUNT:
738 *((int *)where) = re->top_bracket;
739 break;
741 case PCRE_INFO_BACKREFMAX:
742 *((int *)where) = re->top_backref;
743 break;
745 case PCRE_INFO_FIRSTBYTE:
746 *((int *)where) =
747 ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
748 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
749 break;
751 /* Make sure we pass back the pointer to the bit vector in the external
752 block, not the internal copy (with flipped integer fields). */
754 case PCRE_INFO_FIRSTTABLE:
755 *((const uschar **)where) =
756 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
757 ((const pcre_study_data *)extra_data->study_data)->start_bits : NULL;
758 break;
760 case PCRE_INFO_LASTLITERAL:
761 *((int *)where) =
762 ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
763 break;
765 case PCRE_INFO_NAMEENTRYSIZE:
766 *((int *)where) = re->name_entry_size;
767 break;
769 case PCRE_INFO_NAMECOUNT:
770 *((int *)where) = re->name_count;
771 break;
773 case PCRE_INFO_NAMETABLE:
774 *((const uschar **)where) = (const uschar *)re + re->name_table_offset;
775 break;
777 case PCRE_INFO_DEFAULT_TABLES:
778 *((const uschar **)where) = (const uschar *)pcre_default_tables;
779 break;
781 default: return PCRE_ERROR_BADOPTION;
784 return 0;
789 /*************************************************
790 * Return info about what features are configured *
791 *************************************************/
793 /* This is function which has an extensible interface so that additional items
794 can be added compatibly.
796 Arguments:
797 what what information is required
798 where where to put the information
800 Returns: 0 if data returned, negative on error
803 EXPORT int
804 pcre_config(int what, void *where)
806 switch (what)
808 case PCRE_CONFIG_UTF8:
809 #ifdef SUPPORT_UTF8
810 *((int *)where) = 1;
811 #else
812 *((int *)where) = 0;
813 #endif
814 break;
816 case PCRE_CONFIG_UNICODE_PROPERTIES:
817 #ifdef SUPPORT_UCP
818 *((int *)where) = 1;
819 #else
820 *((int *)where) = 0;
821 #endif
822 break;
824 case PCRE_CONFIG_NEWLINE:
825 *((int *)where) = NEWLINE;
826 break;
828 case PCRE_CONFIG_LINK_SIZE:
829 *((int *)where) = LINK_SIZE;
830 break;
832 case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
833 *((int *)where) = POSIX_MALLOC_THRESHOLD;
834 break;
836 case PCRE_CONFIG_MATCH_LIMIT:
837 *((unsigned int *)where) = MATCH_LIMIT;
838 break;
840 case PCRE_CONFIG_STACKRECURSE:
841 #ifdef NO_RECURSE
842 *((int *)where) = 0;
843 #else
844 *((int *)where) = 1;
845 #endif
846 break;
848 default: return PCRE_ERROR_BADOPTION;
851 return 0;
#ifdef DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Print a sequence of chars in printable format, stopping at the end of the
subject if requested. Non-printing bytes are shown as \xhh.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE

Returns:     nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
int c;
/* Never run past the end of the subject string. */
if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
while (length-- > 0)
  if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
}
#endif

886 /*************************************************
887 * Handle escapes *
888 *************************************************/
890 /* This function is called when a \ has been encountered. It either returns a
891 positive value for a simple escape such as \n, or a negative value which
892 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
893 a positive value greater than 255 may be returned. On entry, ptr is pointing at
894 the \. On exit, it is on the final character of the escape sequence.
896 Arguments:
897 ptrptr points to the pattern position pointer
898 errorptr points to the pointer to the error message
899 bracount number of previous extracting brackets
900 options the options bits
901 isclass TRUE if inside a character class
903 Returns: zero or positive => a data character
904 negative => a special escape sequence
905 on error, errorptr is set
908 static int
909 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
910 int options, BOOL isclass)
912 const uschar *ptr = *ptrptr;
913 int c, i;
915 /* If backslash is at the end of the pattern, it's an error. */
917 c = *(++ptr);
918 if (c == 0) *errorptr = ERR1;
920 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
921 a table. A non-zero result is something that can be returned immediately.
922 Otherwise further processing may be required. */
924 #if !EBCDIC /* ASCII coding */
925 else if (c < '0' || c > 'z') {} /* Not alphameric */
926 else if ((i = escapes[c - '0']) != 0) c = i;
928 #else /* EBCDIC coding */
929 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
930 else if ((i = escapes[c - 0x48]) != 0) c = i;
931 #endif
933 /* Escapes that need further processing, or are illegal. */
935 else
937 const uschar *oldptr;
938 switch (c)
940 /* A number of Perl escapes are not handled by PCRE. We give an explicit
941 error. */
943 case 'l':
944 case 'L':
945 case 'N':
946 case 'u':
947 case 'U':
948 *errorptr = ERR37;
949 break;
951 /* The handling of escape sequences consisting of a string of digits
952 starting with one that is not zero is not straightforward. By experiment,
953 the way Perl works seems to be as follows:
955 Outside a character class, the digits are read as a decimal number. If the
956 number is less than 10, or if there are that many previous extracting
957 left brackets, then it is a back reference. Otherwise, up to three octal
958 digits are read to form an escaped byte. Thus \123 is likely to be octal
959 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
960 value is greater than 377, the least significant 8 bits are taken. Inside a
961 character class, \ followed by a digit is always an octal number. */
963 case '1': case '2': case '3': case '4': case '5':
964 case '6': case '7': case '8': case '9':
966 if (!isclass)
968 oldptr = ptr;
969 c -= '0';
970 while ((digitab[ptr[1]] & ctype_digit) != 0)
971 c = c * 10 + *(++ptr) - '0';
972 if (c < 10 || c <= bracount)
974 c = -(ESC_REF + c);
975 break;
977 ptr = oldptr; /* Put the pointer back and fall through */
980 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
981 generates a binary zero byte and treats the digit as a following literal.
982 Thus we have to pull back the pointer by one. */
984 if ((c = *ptr) >= '8')
986 ptr--;
987 c = 0;
988 break;
991 /* \0 always starts an octal number, but we may drop through to here with a
992 larger first octal digit. */
994 case '0':
995 c -= '0';
996 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
997 c = c * 8 + *(++ptr) - '0';
998 c &= 255; /* Take least significant 8 bits */
999 break;
1001 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
1002 which can be greater than 0xff, but only if the ddd are hex digits. */
1004 case 'x':
1005 #ifdef SUPPORT_UTF8
1006 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
1008 const uschar *pt = ptr + 2;
1009 register int count = 0;
1010 c = 0;
1011 while ((digitab[*pt] & ctype_xdigit) != 0)
1013 int cc = *pt++;
1014 count++;
1015 #if !EBCDIC /* ASCII coding */
1016 if (cc >= 'a') cc -= 32; /* Convert to upper case */
1017 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
1018 #else /* EBCDIC coding */
1019 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
1020 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
1021 #endif
1023 if (*pt == '}')
1025 if (c < 0 || count > 8) *errorptr = ERR34;
1026 ptr = pt;
1027 break;
1029 /* If the sequence of hex digits does not end with '}', then we don't
1030 recognize this construct; fall through to the normal \x handling. */
1032 #endif
1034 /* Read just a single hex char */
1036 c = 0;
1037 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
1039 int cc; /* Some compilers don't like ++ */
1040 cc = *(++ptr); /* in initializers */
1041 #if !EBCDIC /* ASCII coding */
1042 if (cc >= 'a') cc -= 32; /* Convert to upper case */
1043 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
1044 #else /* EBCDIC coding */
1045 if (cc <= 'z') cc += 64; /* Convert to upper case */
1046 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
1047 #endif
1049 break;
1051 /* Other special escapes not starting with a digit are straightforward */
1053 case 'c':
1054 c = *(++ptr);
1055 if (c == 0)
1057 *errorptr = ERR2;
1058 return 0;
1061 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
1062 is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
1063 (However, an EBCDIC equivalent has now been added.) */
1065 #if !EBCDIC /* ASCII coding */
1066 if (c >= 'a' && c <= 'z') c -= 32;
1067 c ^= 0x40;
1068 #else /* EBCDIC coding */
1069 if (c >= 'a' && c <= 'z') c += 64;
1070 c ^= 0xC0;
1071 #endif
1072 break;
1074 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1075 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1076 for Perl compatibility, it is a literal. This code looks a bit odd, but
1077 there used to be some cases other than the default, and there may be again
1078 in future, so I haven't "optimized" it. */
1080 default:
1081 if ((options & PCRE_EXTRA) != 0) switch(c)
1083 default:
1084 *errorptr = ERR3;
1085 break;
1087 break;
1091 *ptrptr = ptr;
1092 return c;
1097 #ifdef SUPPORT_UCP
1098 /*************************************************
1099 * Handle \P and \p *
1100 *************************************************/
1102 /* This function is called after \P or \p has been encountered, provided that
1103 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1104 pointing at the P or p. On exit, it is pointing at the final character of the
1105 escape sequence.
1107 Argument:
1108 ptrptr points to the pattern position pointer
1109 negptr points to a boolean that is set TRUE for negation else FALSE
1110 errorptr points to the pointer to the error message
1112 Returns: value from ucp_type_table, or -1 for an invalid type
1115 static int
1116 get_ucp(const uschar **ptrptr, BOOL *negptr, const char **errorptr)
1118 int c, i, bot, top;
1119 const uschar *ptr = *ptrptr;
1120 char name[4];
1122 c = *(++ptr);
1123 if (c == 0) goto ERROR_RETURN;
1125 *negptr = FALSE;
1127 /* \P or \p can be followed by a one- or two-character name in {}, optionally
1128 preceded by ^ for negation. */
1130 if (c == '{')
1132 if (ptr[1] == '^')
1134 *negptr = TRUE;
1135 ptr++;
1137 for (i = 0; i <= 2; i++)
1139 c = *(++ptr);
1140 if (c == 0) goto ERROR_RETURN;
1141 if (c == '}') break;
1142 name[i] = c;
1144 if (c !='}') /* Try to distinguish error cases */
1146 while (*(++ptr) != 0 && *ptr != '}');
1147 if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
1149 name[i] = 0;
1152 /* Otherwise there is just one following character */
1154 else
1156 name[0] = c;
1157 name[1] = 0;
1160 *ptrptr = ptr;
1162 /* Search for a recognized property name using binary chop */
1164 bot = 0;
1165 top = sizeof(utt)/sizeof(ucp_type_table);
1167 while (bot < top)
1169 i = (bot + top)/2;
1170 c = strcmp(name, utt[i].name);
1171 if (c == 0) return utt[i].value;
1172 if (c > 0) bot = i + 1; else top = i;
1175 UNKNOWN_RETURN:
1176 *errorptr = ERR47;
1177 *ptrptr = ptr;
1178 return -1;
1180 ERROR_RETURN:
1181 *errorptr = ERR46;
1182 *ptrptr = ptr;
1183 return -1;
1185 #endif
1190 /*************************************************
1191 * Check for counted repeat *
1192 *************************************************/
1194 /* This function is called when a '{' is encountered in a place where it might
1195 start a quantifier. It looks ahead to see if it really is a quantifier or not.
1196 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1197 where the ddds are digits.
1199 Arguments:
1200 p pointer to the first char after '{'
1202 Returns: TRUE or FALSE
1205 static BOOL
1206 is_counted_repeat(const uschar *p)
1208 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1209 while ((digitab[*p] & ctype_digit) != 0) p++;
1210 if (*p == '}') return TRUE;
1212 if (*p++ != ',') return FALSE;
1213 if (*p == '}') return TRUE;
1215 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1216 while ((digitab[*p] & ctype_digit) != 0) p++;
1218 return (*p == '}');
1223 /*************************************************
1224 * Read repeat counts *
1225 *************************************************/
1227 /* Read an item of the form {n,m} and return the values. This is called only
1228 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1229 so the syntax is guaranteed to be correct, but we need to check the values.
1231 Arguments:
1232 p pointer to first char after '{'
1233 minp pointer to int for min
1234 maxp pointer to int for max
1235 returned as -1 if no max
1236 errorptr points to pointer to error message
1238 Returns: pointer to '}' on success;
1239 current ptr on error, with errorptr set
1242 static const uschar *
1243 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
1245 int min = 0;
1246 int max = -1;
1248 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1250 if (*p == '}') max = min; else
1252 if (*(++p) != '}')
1254 max = 0;
1255 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
1256 if (max < min)
1258 *errorptr = ERR4;
1259 return p;
1264 /* Do paranoid checks, then fill in the required variables, and pass back the
1265 pointer to the terminating '}'. */
1267 if (min > 65535 || max > 65535)
1268 *errorptr = ERR5;
1269 else
1271 *minp = min;
1272 *maxp = max;
1274 return p;
1279 /*************************************************
1280 * Find first significant op code *
1281 *************************************************/
1283 /* This is called by several functions that scan a compiled expression looking
1284 for a fixed first character, or an anchoring op code etc. It skips over things
1285 that do not influence this. For some calls, a change of option is important.
1286 For some calls, it makes sense to skip negative forward and all backward
1287 assertions, and also the \b assertion; for others it does not.
1289 Arguments:
1290 code pointer to the start of the group
1291 options pointer to external options
1292 optbit the option bit whose changing is significant, or
1293 zero if none are
1294 skipassert TRUE if certain assertions are to be skipped
1296 Returns: pointer to the first significant opcode
1299 static const uschar*
1300 first_significant_code(const uschar *code, int *options, int optbit,
1301 BOOL skipassert)
1303 for (;;)
1305 switch ((int)*code)
1307 case OP_OPT:
1308 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1309 *options = (int)code[1];
1310 code += 2;
1311 break;
1313 case OP_ASSERT_NOT:
1314 case OP_ASSERTBACK:
1315 case OP_ASSERTBACK_NOT:
1316 if (!skipassert) return code;
1317 do code += GET(code, 1); while (*code == OP_ALT);
1318 code += OP_lengths[*code];
1319 break;
1321 case OP_WORD_BOUNDARY:
1322 case OP_NOT_WORD_BOUNDARY:
1323 if (!skipassert) return code;
1324 /* Fall through */
1326 case OP_CALLOUT:
1327 case OP_CREF:
1328 case OP_BRANUMBER:
1329 code += OP_lengths[*code];
1330 break;
1332 default:
1333 return code;
1336 /* Control never reaches here */
1342 /*************************************************
1343 * Find the fixed length of a pattern *
1344 *************************************************/
1346 /* Scan a pattern and compute the fixed length of subject that will match it,
1347 if the length is fixed. This is needed for dealing with backward assertions.
1348 In UTF8 mode, the result is in characters rather than bytes.
1350 Arguments:
1351 code points to the start of the pattern (the bracket)
1352 options the compiling options
1354 Returns: the fixed length, or -1 if there is no fixed length,
1355 or -2 if \C was encountered
1358 static int
1359 find_fixedlength(uschar *code, int options)
1361 int length = -1;
1363 register int branchlength = 0;
1364 register uschar *cc = code + 1 + LINK_SIZE;
1366 /* Scan along the opcodes for this branch. If we get to the end of the
1367 branch, check the length against that of the other branches. */
1369 for (;;)
1371 int d;
1372 register int op = *cc;
1373 if (op >= OP_BRA) op = OP_BRA;
1375 switch (op)
1377 case OP_BRA:
1378 case OP_ONCE:
1379 case OP_COND:
1380 d = find_fixedlength(cc, options);
1381 if (d < 0) return d;
1382 branchlength += d;
1383 do cc += GET(cc, 1); while (*cc == OP_ALT);
1384 cc += 1 + LINK_SIZE;
1385 break;
1387 /* Reached end of a branch; if it's a ket it is the end of a nested
1388 call. If it's ALT it is an alternation in a nested call. If it is
1389 END it's the end of the outer call. All can be handled by the same code. */
1391 case OP_ALT:
1392 case OP_KET:
1393 case OP_KETRMAX:
1394 case OP_KETRMIN:
1395 case OP_END:
1396 if (length < 0) length = branchlength;
1397 else if (length != branchlength) return -1;
1398 if (*cc != OP_ALT) return length;
1399 cc += 1 + LINK_SIZE;
1400 branchlength = 0;
1401 break;
1403 /* Skip over assertive subpatterns */
1405 case OP_ASSERT:
1406 case OP_ASSERT_NOT:
1407 case OP_ASSERTBACK:
1408 case OP_ASSERTBACK_NOT:
1409 do cc += GET(cc, 1); while (*cc == OP_ALT);
1410 /* Fall through */
1412 /* Skip over things that don't match chars */
1414 case OP_REVERSE:
1415 case OP_BRANUMBER:
1416 case OP_CREF:
1417 case OP_OPT:
1418 case OP_CALLOUT:
1419 case OP_SOD:
1420 case OP_SOM:
1421 case OP_EOD:
1422 case OP_EODN:
1423 case OP_CIRC:
1424 case OP_DOLL:
1425 case OP_NOT_WORD_BOUNDARY:
1426 case OP_WORD_BOUNDARY:
1427 cc += OP_lengths[*cc];
1428 break;
1430 /* Handle literal characters */
1432 case OP_CHAR:
1433 case OP_CHARNC:
1434 branchlength++;
1435 cc += 2;
1436 #ifdef SUPPORT_UTF8
1437 if ((options & PCRE_UTF8) != 0)
1439 while ((*cc & 0xc0) == 0x80) cc++;
1441 #endif
1442 break;
1444 /* Handle exact repetitions. The count is already in characters, but we
1445 need to skip over a multibyte character in UTF8 mode. */
1447 case OP_EXACT:
1448 branchlength += GET2(cc,1);
1449 cc += 4;
1450 #ifdef SUPPORT_UTF8
1451 if ((options & PCRE_UTF8) != 0)
1453 while((*cc & 0x80) == 0x80) cc++;
1455 #endif
1456 break;
1458 case OP_TYPEEXACT:
1459 branchlength += GET2(cc,1);
1460 cc += 4;
1461 break;
1463 /* Handle single-char matchers */
1465 case OP_PROP:
1466 case OP_NOTPROP:
1467 cc++;
1468 /* Fall through */
1470 case OP_NOT_DIGIT:
1471 case OP_DIGIT:
1472 case OP_NOT_WHITESPACE:
1473 case OP_WHITESPACE:
1474 case OP_NOT_WORDCHAR:
1475 case OP_WORDCHAR:
1476 case OP_ANY:
1477 branchlength++;
1478 cc++;
1479 break;
1481 /* The single-byte matcher isn't allowed */
1483 case OP_ANYBYTE:
1484 return -2;
1486 /* Check a class for variable quantification */
1488 #ifdef SUPPORT_UTF8
1489 case OP_XCLASS:
1490 cc += GET(cc, 1) - 33;
1491 /* Fall through */
1492 #endif
1494 case OP_CLASS:
1495 case OP_NCLASS:
1496 cc += 33;
1498 switch (*cc)
1500 case OP_CRSTAR:
1501 case OP_CRMINSTAR:
1502 case OP_CRQUERY:
1503 case OP_CRMINQUERY:
1504 return -1;
1506 case OP_CRRANGE:
1507 case OP_CRMINRANGE:
1508 if (GET2(cc,1) != GET2(cc,3)) return -1;
1509 branchlength += GET2(cc,1);
1510 cc += 5;
1511 break;
1513 default:
1514 branchlength++;
1516 break;
1518 /* Anything else is variable length */
1520 default:
1521 return -1;
1524 /* Control never gets here */
1530 /*************************************************
1531 * Scan compiled regex for numbered bracket *
1532 *************************************************/
1534 /* This little function scans through a compiled pattern until it finds a
1535 capturing bracket with the given number.
1537 Arguments:
1538 code points to start of expression
1539 utf8 TRUE in UTF-8 mode
1540 number the required bracket number
1542 Returns: pointer to the opcode for the bracket, or NULL if not found
1545 static const uschar *
1546 find_bracket(const uschar *code, BOOL utf8, int number)
1548 #ifndef SUPPORT_UTF8
1549 utf8 = utf8; /* Stop pedantic compilers complaining */
1550 #endif
1552 for (;;)
1554 register int c = *code;
1555 if (c == OP_END) return NULL;
1556 else if (c > OP_BRA)
1558 int n = c - OP_BRA;
1559 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1560 if (n == number) return (uschar *)code;
1561 code += OP_lengths[OP_BRA];
1563 else
1565 code += OP_lengths[c];
1567 #ifdef SUPPORT_UTF8
1569 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1570 by a multi-byte character. The length in the table is a minimum, so we have
1571 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1572 can use relatively efficient code. */
1574 if (utf8) switch(c)
1576 case OP_CHAR:
1577 case OP_CHARNC:
1578 case OP_EXACT:
1579 case OP_UPTO:
1580 case OP_MINUPTO:
1581 case OP_STAR:
1582 case OP_MINSTAR:
1583 case OP_PLUS:
1584 case OP_MINPLUS:
1585 case OP_QUERY:
1586 case OP_MINQUERY:
1587 while ((*code & 0xc0) == 0x80) code++;
1588 break;
1590 /* XCLASS is used for classes that cannot be represented just by a bit
1591 map. This includes negated single high-valued characters. The length in
1592 the table is zero; the actual length is stored in the compiled code. */
1594 case OP_XCLASS:
1595 code += GET(code, 1) + 1;
1596 break;
1598 #endif
1605 /*************************************************
1606 * Scan compiled regex for recursion reference *
1607 *************************************************/
1609 /* This little function scans through a compiled pattern until it finds an
1610 instance of OP_RECURSE.
1612 Arguments:
1613 code points to start of expression
1614 utf8 TRUE in UTF-8 mode
1616 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1619 static const uschar *
1620 find_recurse(const uschar *code, BOOL utf8)
1622 #ifndef SUPPORT_UTF8
1623 utf8 = utf8; /* Stop pedantic compilers complaining */
1624 #endif
1626 for (;;)
1628 register int c = *code;
1629 if (c == OP_END) return NULL;
1630 else if (c == OP_RECURSE) return code;
1631 else if (c > OP_BRA)
1633 code += OP_lengths[OP_BRA];
1635 else
1637 code += OP_lengths[c];
1639 #ifdef SUPPORT_UTF8
1641 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1642 by a multi-byte character. The length in the table is a minimum, so we have
1643 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1644 can use relatively efficient code. */
1646 if (utf8) switch(c)
1648 case OP_CHAR:
1649 case OP_CHARNC:
1650 case OP_EXACT:
1651 case OP_UPTO:
1652 case OP_MINUPTO:
1653 case OP_STAR:
1654 case OP_MINSTAR:
1655 case OP_PLUS:
1656 case OP_MINPLUS:
1657 case OP_QUERY:
1658 case OP_MINQUERY:
1659 while ((*code & 0xc0) == 0x80) code++;
1660 break;
1662 /* XCLASS is used for classes that cannot be represented just by a bit
1663 map. This includes negated single high-valued characters. The length in
1664 the table is zero; the actual length is stored in the compiled code. */
1666 case OP_XCLASS:
1667 code += GET(code, 1) + 1;
1668 break;
1670 #endif
1677 /*************************************************
1678 * Scan compiled branch for non-emptiness *
1679 *************************************************/
1681 /* This function scans through a branch of a compiled pattern to see whether it
1682 can match the empty string or not. It is called only from could_be_empty()
1683 below. Note that first_significant_code() skips over assertions. If we hit an
1684 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1685 whose current branch will already have been scanned.
1687 Arguments:
1688 code points to start of search
1689 endcode points to where to stop
1690 utf8 TRUE if in UTF8 mode
1692 Returns: TRUE if what is matched could be empty
1695 static BOOL
1696 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1698 register int c;
1699 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1700 code < endcode;
1701 code = first_significant_code(code + OP_lengths[c], NULL, 0, TRUE))
1703 const uschar *ccode;
1705 c = *code;
1707 if (c >= OP_BRA)
1709 BOOL empty_branch;
1710 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1712 /* Scan a closed bracket */
1714 empty_branch = FALSE;
1717 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1718 empty_branch = TRUE;
1719 code += GET(code, 1);
1721 while (*code == OP_ALT);
1722 if (!empty_branch) return FALSE; /* All branches are non-empty */
1723 code += 1 + LINK_SIZE;
1724 c = *code;
1727 else switch (c)
1729 /* Check for quantifiers after a class */
1731 #ifdef SUPPORT_UTF8
1732 case OP_XCLASS:
1733 ccode = code + GET(code, 1);
1734 goto CHECK_CLASS_REPEAT;
1735 #endif
1737 case OP_CLASS:
1738 case OP_NCLASS:
1739 ccode = code + 33;
1741 #ifdef SUPPORT_UTF8
1742 CHECK_CLASS_REPEAT:
1743 #endif
1745 switch (*ccode)
1747 case OP_CRSTAR: /* These could be empty; continue */
1748 case OP_CRMINSTAR:
1749 case OP_CRQUERY:
1750 case OP_CRMINQUERY:
1751 break;
1753 default: /* Non-repeat => class must match */
1754 case OP_CRPLUS: /* These repeats aren't empty */
1755 case OP_CRMINPLUS:
1756 return FALSE;
1758 case OP_CRRANGE:
1759 case OP_CRMINRANGE:
1760 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1761 break;
1763 break;
1765 /* Opcodes that must match a character */
1767 case OP_PROP:
1768 case OP_NOTPROP:
1769 case OP_EXTUNI:
1770 case OP_NOT_DIGIT:
1771 case OP_DIGIT:
1772 case OP_NOT_WHITESPACE:
1773 case OP_WHITESPACE:
1774 case OP_NOT_WORDCHAR:
1775 case OP_WORDCHAR:
1776 case OP_ANY:
1777 case OP_ANYBYTE:
1778 case OP_CHAR:
1779 case OP_CHARNC:
1780 case OP_NOT:
1781 case OP_PLUS:
1782 case OP_MINPLUS:
1783 case OP_EXACT:
1784 case OP_NOTPLUS:
1785 case OP_NOTMINPLUS:
1786 case OP_NOTEXACT:
1787 case OP_TYPEPLUS:
1788 case OP_TYPEMINPLUS:
1789 case OP_TYPEEXACT:
1790 return FALSE;
1792 /* End of branch */
1794 case OP_KET:
1795 case OP_KETRMAX:
1796 case OP_KETRMIN:
1797 case OP_ALT:
1798 return TRUE;
1800 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1801 followed by a multibyte character */
1803 #ifdef SUPPORT_UTF8
1804 case OP_STAR:
1805 case OP_MINSTAR:
1806 case OP_QUERY:
1807 case OP_MINQUERY:
1808 case OP_UPTO:
1809 case OP_MINUPTO:
1810 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1811 break;
1812 #endif
1816 return TRUE;
1821 /*************************************************
1822 * Scan compiled regex for non-emptiness *
1823 *************************************************/
1825 /* This function is called to check for left recursive calls. We want to check
1826 the current branch of the current pattern to see if it could match the empty
1827 string. If it could, we must look outwards for branches at other levels,
1828 stopping when we pass beyond the bracket which is the subject of the recursion.
1830 Arguments:
1831 code points to start of the recursion
1832 endcode points to where to stop (current RECURSE item)
1833 bcptr points to the chain of current (unclosed) branch starts
1834 utf8 TRUE if in UTF-8 mode
1836 Returns: TRUE if what is matched could be empty
1839 static BOOL
1840 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1841 BOOL utf8)
1843 while (bcptr != NULL && bcptr->current >= code)
1845 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1846 bcptr = bcptr->outer;
1848 return TRUE;
1853 /*************************************************
1854 * Check for POSIX class syntax *
1855 *************************************************/
1857 /* This function is called when the sequence "[:" or "[." or "[=" is
1858 encountered in a character class. It checks whether this is followed by an
1859 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1860 ".]" or "=]".
1862 Argument:
1863 ptr pointer to the initial [
1864 endptr where to return the end pointer
1865 cd pointer to compile data
1867 Returns: TRUE or FALSE
1870 static BOOL
1871 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1873 int terminator; /* Don't combine these lines; the Solaris cc */
1874 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1875 if (*(++ptr) == '^') ptr++;
1876 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1877 if (*ptr == terminator && ptr[1] == ']')
1879 *endptr = ptr;
1880 return TRUE;
1882 return FALSE;
1888 /*************************************************
1889 * Check POSIX class name *
1890 *************************************************/
1892 /* This function is called to check the name given in a POSIX-style class entry
1893 such as [:alnum:].
1895 Arguments:
1896 ptr points to the first letter
1897 len the length of the name
1899 Returns: a value representing the name, or -1 if unknown
1902 static int
1903 check_posix_name(const uschar *ptr, int len)
1905 register int yield = 0;
1906 while (posix_name_lengths[yield] != 0)
1908 if (len == posix_name_lengths[yield] &&
1909 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1910 yield++;
1912 return -1;
1916 /*************************************************
1917 * Adjust OP_RECURSE items in repeated group *
1918 *************************************************/
1920 /* OP_RECURSE items contain an offset from the start of the regex to the group
1921 that is referenced. This means that groups can be replicated for fixed
1922 repetition simply by copying (because the recursion is allowed to refer to
1923 earlier groups that are outside the current group). However, when a group is
1924 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1925 it, after it has been compiled. This means that any OP_RECURSE items within it
1926 that refer to the group itself or any contained groups have to have their
1927 offsets adjusted. That is the job of this function. Before it is called, the
1928 partially compiled regex must be temporarily terminated with OP_END.
1930 Arguments:
1931 group points to the start of the group
1932 adjust the amount by which the group is to be moved
1933 utf8 TRUE in UTF-8 mode
1934 cd contains pointers to tables etc.
1936 Returns: nothing
1939 static void
1940 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1942 uschar *ptr = group;
1943 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1945 int offset = GET(ptr, 1);
1946 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1947 ptr += 1 + LINK_SIZE;
1953 /*************************************************
1954 * Insert an automatic callout point *
1955 *************************************************/
1957 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1958 callout points before each pattern item.
1960 Arguments:
1961 code current code pointer
1962 ptr current pattern pointer
1963 cd pointers to tables etc
1965 Returns: new code pointer
1968 static uschar *
1969 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1971 *code++ = OP_CALLOUT;
1972 *code++ = 255;
1973 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1974 PUT(code, LINK_SIZE, 0); /* Default length */
1975 return code + 2*LINK_SIZE;
1980 /*************************************************
1981 * Complete a callout item *
1982 *************************************************/
1984 /* A callout item contains the length of the next item in the pattern, which
1985 we can't fill in till after we have reached the relevant point. This is used
1986 for both automatic and manual callouts.
1988 Arguments:
1989 previous_callout points to previous callout item
1990 ptr current pattern pointer
1991 cd pointers to tables etc
1993 Returns: nothing
1996 static void
1997 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1999 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2000 PUT(previous_callout, 2 + LINK_SIZE, length);
2005 #ifdef SUPPORT_UCP
2006 /*************************************************
2007 * Get othercase range *
2008 *************************************************/
2010 /* This function is passed the start and end of a class range, in UTF-8 mode
2011 with UCP support. It searches up the characters, looking for internal ranges of
2012 characters in the "other" case. Each call returns the next one, updating the
2013 start address.
2015 Arguments:
2016 cptr points to starting character value; updated
2017 d end value
2018 ocptr where to put start of othercase range
2019 odptr where to put end of othercase range
2021 Yield: TRUE when range returned; FALSE when no more
2024 static BOOL
2025 get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
2027 int c, chartype, othercase, next;
2029 for (c = *cptr; c <= d; c++)
2031 if (ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0) break;
2034 if (c > d) return FALSE;
2036 *ocptr = othercase;
2037 next = othercase + 1;
2039 for (++c; c <= d; c++)
2041 if (ucp_findchar(c, &chartype, &othercase) != ucp_L || othercase != next)
2042 break;
2043 next++;
2046 *odptr = next - 1;
2047 *cptr = c;
2049 return TRUE;
2051 #endif /* SUPPORT_UCP */
2054 /*************************************************
2055 * Compile one branch *
2056 *************************************************/
2058 /* Scan the pattern, compiling it into the code vector. If the options are
2059 changed during the branch, the pointer is used to change the external options
2060 bits.
2062 Arguments:
2063 optionsptr pointer to the option bits
2064 brackets points to number of extracting brackets used
2065 codeptr points to the pointer to the current code point
2066 ptrptr points to the current pattern pointer
2067 errorptr points to pointer to error message
2068 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2069 reqbyteptr set to the last literal character required, else < 0
2070 bcptr points to current branch chain
2071 cd contains pointers to tables etc.
2073 Returns: TRUE on success
2074 FALSE, with *errorptr set on error
2077 static BOOL
2078 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
2079 const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
2080 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
2082 int repeat_type, op_type;
2083 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2084 int bravalue = 0;
2085 int greedy_default, greedy_non_default;
2086 int firstbyte, reqbyte;
2087 int zeroreqbyte, zerofirstbyte;
2088 int req_caseopt, reqvary, tempreqvary;
2089 int condcount = 0;
2090 int options = *optionsptr;
2091 int after_manual_callout = 0;
2092 register int c;
2093 register uschar *code = *codeptr;
2094 uschar *tempcode;
2095 BOOL inescq = FALSE;
2096 BOOL groupsetfirstbyte = FALSE;
2097 const uschar *ptr = *ptrptr;
2098 const uschar *tempptr;
2099 uschar *previous = NULL;
2100 uschar *previous_callout = NULL;
2101 uschar classbits[32];
2103 #ifdef SUPPORT_UTF8
2104 BOOL class_utf8;
2105 BOOL utf8 = (options & PCRE_UTF8) != 0;
2106 uschar *class_utf8data;
2107 uschar utf8_char[6];
2108 #else
2109 BOOL utf8 = FALSE;
2110 #endif
2112 /* Set up the default and non-default settings for greediness */
2114 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2115 greedy_non_default = greedy_default ^ 1;
2117 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2118 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2119 matches a non-fixed char first char; reqbyte just remains unset if we never
2120 find one.
2122 When we hit a repeat whose minimum is zero, we may have to adjust these values
2123 to take the zero repeat into account. This is implemented by setting them to
2124 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2125 item types that can be repeated set these backoff variables appropriately. */
2127 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2129 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2130 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2131 value > 255. It is added into the firstbyte or reqbyte variables to record the
2132 case status of the value. This is used only for ASCII characters. */
2134 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2136 /* Switch on next character until the end of the branch */
2138 for (;; ptr++)
2140 BOOL negate_class;
2141 BOOL possessive_quantifier;
2142 BOOL is_quantifier;
2143 int class_charcount;
2144 int class_lastchar;
2145 int newoptions;
2146 int recno;
2147 int skipbytes;
2148 int subreqbyte;
2149 int subfirstbyte;
2150 int mclength;
2151 uschar mcbuffer[8];
2153 /* Next byte in the pattern */
2155 c = *ptr;
2157 /* If in \Q...\E, check for the end; if not, we have a literal */
2159 if (inescq && c != 0)
2161 if (c == '\\' && ptr[1] == 'E')
2163 inescq = FALSE;
2164 ptr++;
2165 continue;
2167 else
2169 if (previous_callout != NULL)
2171 complete_callout(previous_callout, ptr, cd);
2172 previous_callout = NULL;
2174 if ((options & PCRE_AUTO_CALLOUT) != 0)
2176 previous_callout = code;
2177 code = auto_callout(code, ptr, cd);
2179 goto NORMAL_CHAR;
2183 /* Fill in length of a previous callout, except when the next thing is
2184 a quantifier. */
2186 is_quantifier = c == '*' || c == '+' || c == '?' ||
2187 (c == '{' && is_counted_repeat(ptr+1));
2189 if (!is_quantifier && previous_callout != NULL &&
2190 after_manual_callout-- <= 0)
2192 complete_callout(previous_callout, ptr, cd);
2193 previous_callout = NULL;
2196 /* In extended mode, skip white space and comments */
2198 if ((options & PCRE_EXTENDED) != 0)
2200 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2201 if (c == '#')
2203 /* The space before the ; is to avoid a warning on a silly compiler
2204 on the Macintosh. */
2205 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2206 if (c != 0) continue; /* Else fall through to handle end of string */
2210 /* No auto callout for quantifiers. */
2212 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2214 previous_callout = code;
2215 code = auto_callout(code, ptr, cd);
2218 switch(c)
2220 /* The branch terminates at end of string, |, or ). */
2222 case 0:
2223 case '|':
2224 case ')':
2225 *firstbyteptr = firstbyte;
2226 *reqbyteptr = reqbyte;
2227 *codeptr = code;
2228 *ptrptr = ptr;
2229 return TRUE;
2231 /* Handle single-character metacharacters. In multiline mode, ^ disables
2232 the setting of any following char as a first character. */
2234 case '^':
2235 if ((options & PCRE_MULTILINE) != 0)
2237 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2239 previous = NULL;
2240 *code++ = OP_CIRC;
2241 break;
2243 case '$':
2244 previous = NULL;
2245 *code++ = OP_DOLL;
2246 break;
2248 /* There can never be a first char if '.' is first, whatever happens about
2249 repeats. The value of reqbyte doesn't change either. */
2251 case '.':
2252 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2253 zerofirstbyte = firstbyte;
2254 zeroreqbyte = reqbyte;
2255 previous = code;
2256 *code++ = OP_ANY;
2257 break;
2259 /* Character classes. If the included characters are all < 255 in value, we
2260 build a 32-byte bitmap of the permitted characters, except in the special
2261 case where there is only one such character. For negated classes, we build
2262 the map as usual, then invert it at the end. However, we use a different
2263 opcode so that data characters > 255 can be handled correctly.
2265 If the class contains characters outside the 0-255 range, a different
2266 opcode is compiled. It may optionally have a bit map for characters < 256,
2267 but those above are explicitly listed afterwards. A flag byte tells
2268 whether the bitmap is present, and whether this is a negated class or not.
2271 case '[':
2272 previous = code;
2274 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2275 they are encountered at the top level, so we'll do that too. */
2277 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2278 check_posix_syntax(ptr, &tempptr, cd))
2280 *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
2281 goto FAILED;
2284 /* If the first character is '^', set the negation flag and skip it. */
2286 if ((c = *(++ptr)) == '^')
2288 negate_class = TRUE;
2289 c = *(++ptr);
2291 else
2293 negate_class = FALSE;
2296 /* Keep a count of chars with values < 256 so that we can optimize the case
2297 of just a single character (as long as it's < 256). For higher valued UTF-8
2298 characters, we don't yet do any optimization. */
2300 class_charcount = 0;
2301 class_lastchar = -1;
2303 #ifdef SUPPORT_UTF8
2304 class_utf8 = FALSE; /* No chars >= 256 */
2305 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
2306 #endif
2308 /* Initialize the 32-char bit map to all zeros. We have to build the
2309 map in a temporary bit of store, in case the class contains only 1
2310 character (< 256), because in that case the compiled code doesn't use the
2311 bit map. */
2313 memset(classbits, 0, 32 * sizeof(uschar));
2315 /* Process characters until ] is reached. By writing this as a "do" it
2316 means that an initial ] is taken as a data character. The first pass
2317 through the regex checked the overall syntax, so we don't need to be very
2318 strict here. At the start of the loop, c contains the first byte of the
2319 character. */
2323 #ifdef SUPPORT_UTF8
2324 if (utf8 && c > 127)
2325 { /* Braces are required because the */
2326 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2328 #endif
2330 /* Inside \Q...\E everything is literal except \E */
2332 if (inescq)
2334 if (c == '\\' && ptr[1] == 'E')
2336 inescq = FALSE;
2337 ptr++;
2338 continue;
2340 else goto LONE_SINGLE_CHARACTER;
2343 /* Handle POSIX class names. Perl allows a negation extension of the
2344 form [:^name:]. A square bracket that doesn't match the syntax is
2345 treated as a literal. We also recognize the POSIX constructions
2346 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2347 5.6 and 5.8 do. */
2349 if (c == '[' &&
2350 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2351 check_posix_syntax(ptr, &tempptr, cd))
2353 BOOL local_negate = FALSE;
2354 int posix_class, i;
2355 register const uschar *cbits = cd->cbits;
2357 if (ptr[1] != ':')
2359 *errorptr = ERR31;
2360 goto FAILED;
2363 ptr += 2;
2364 if (*ptr == '^')
2366 local_negate = TRUE;
2367 ptr++;
2370 posix_class = check_posix_name(ptr, tempptr - ptr);
2371 if (posix_class < 0)
2373 *errorptr = ERR30;
2374 goto FAILED;
2377 /* If matching is caseless, upper and lower are converted to
2378 alpha. This relies on the fact that the class table starts with
2379 alpha, lower, upper as the first 3 entries. */
2381 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2382 posix_class = 0;
2384 /* Or into the map we are building up to 3 of the static class
2385 tables, or their negations. The [:blank:] class sets up the same
2386 chars as the [:space:] class (all white space). We remove the vertical
2387 white space chars afterwards. */
2389 posix_class *= 3;
2390 for (i = 0; i < 3; i++)
2392 BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
2393 int taboffset = posix_class_maps[posix_class + i];
2394 if (taboffset < 0) break;
2395 if (local_negate)
2397 if (i == 0)
2398 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
2399 else
2400 for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
2401 if (blankclass) classbits[1] |= 0x3c;
2403 else
2405 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
2406 if (blankclass) classbits[1] &= ~0x3c;
2410 ptr = tempptr + 1;
2411 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2412 continue; /* End of POSIX syntax handling */
2415 /* Backslash may introduce a single character, or it may introduce one
2416 of the specials, which just set a flag. Escaped items are checked for
2417 validity in the pre-compiling pass. The sequence \b is a special case.
2418 Inside a class (and only there) it is treated as backspace. Elsewhere
2419 it marks a word boundary. Other escapes have preset maps ready to
2420 or into the one we are building. We assume they have more than one
2421 character in them, so set class_charcount bigger than one. */
2423 if (c == '\\')
2425 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2427 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2428 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2429 else if (-c == ESC_Q) /* Handle start of quoted string */
2431 if (ptr[1] == '\\' && ptr[2] == 'E')
2433 ptr += 2; /* avoid empty string */
2435 else inescq = TRUE;
2436 continue;
2439 if (c < 0)
2441 register const uschar *cbits = cd->cbits;
2442 class_charcount += 2; /* Greater than 1 is what matters */
2443 switch (-c)
2445 case ESC_d:
2446 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2447 continue;
2449 case ESC_D:
2450 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2451 continue;
2453 case ESC_w:
2454 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2455 continue;
2457 case ESC_W:
2458 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2459 continue;
2461 case ESC_s:
2462 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2463 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2464 continue;
2466 case ESC_S:
2467 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2468 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2469 continue;
2471 #ifdef SUPPORT_UCP
2472 case ESC_p:
2473 case ESC_P:
2475 BOOL negated;
2476 int property = get_ucp(&ptr, &negated, errorptr);
2477 if (property < 0) goto FAILED;
2478 class_utf8 = TRUE;
2479 *class_utf8data++ = ((-c == ESC_p) != negated)?
2480 XCL_PROP : XCL_NOTPROP;
2481 *class_utf8data++ = property;
2482 class_charcount -= 2; /* Not a < 256 character */
2484 continue;
2485 #endif
2487 /* Unrecognized escapes are faulted if PCRE is running in its
2488 strict mode. By default, for compatibility with Perl, they are
2489 treated as literals. */
2491 default:
2492 if ((options & PCRE_EXTRA) != 0)
2494 *errorptr = ERR7;
2495 goto FAILED;
2497 c = *ptr; /* The final character */
2498 class_charcount -= 2; /* Undo the default count from above */
2502 /* Fall through if we have a single character (c >= 0). This may be
2503 > 256 in UTF-8 mode. */
2505 } /* End of backslash handling */
2507 /* A single character may be followed by '-' to form a range. However,
2508 Perl does not permit ']' to be the end of the range. A '-' character
2509 here is treated as a literal. */
2511 if (ptr[1] == '-' && ptr[2] != ']')
2513 int d;
2514 ptr += 2;
2516 #ifdef SUPPORT_UTF8
2517 if (utf8)
2518 { /* Braces are required because the */
2519 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2521 else
2522 #endif
2523 d = *ptr; /* Not UTF-8 mode */
2525 /* The second part of a range can be a single-character escape, but
2526 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2527 in such circumstances. */
2529 if (d == '\\')
2531 const uschar *oldptr = ptr;
2532 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2534 /* \b is backspace; \X is literal X; any other special means the '-'
2535 was literal */
2537 if (d < 0)
2539 if (d == -ESC_b) d = '\b';
2540 else if (d == -ESC_X) d = 'X'; else
2542 ptr = oldptr - 2;
2543 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2548 /* The check that the two values are in the correct order happens in
2549 the pre-pass. Optimize one-character ranges */
2551 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2553 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2554 matching, we have to use an XCLASS with extra data items. Caseless
2555 matching for characters > 127 is available only if UCP support is
2556 available. */
2558 #ifdef SUPPORT_UTF8
2559 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2561 class_utf8 = TRUE;
2563 /* With UCP support, we can find the other case equivalents of
2564 the relevant characters. There may be several ranges. Optimize how
2565 they fit with the basic range. */
2567 #ifdef SUPPORT_UCP
2568 if ((options & PCRE_CASELESS) != 0)
2570 int occ, ocd;
2571 int cc = c;
2572 int origd = d;
2573 while (get_othercase_range(&cc, origd, &occ, &ocd))
2575 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2577 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2578 { /* if there is overlap, */
2579 c = occ; /* noting that if occ < c */
2580 continue; /* we can't have ocd > d */
2581 } /* because a subrange is */
2582 if (ocd > d && occ <= d + 1) /* always shorter than */
2583 { /* the basic range. */
2584 d = ocd;
2585 continue;
2588 if (occ == ocd)
2590 *class_utf8data++ = XCL_SINGLE;
2592 else
2594 *class_utf8data++ = XCL_RANGE;
2595 class_utf8data += ord2utf8(occ, class_utf8data);
2597 class_utf8data += ord2utf8(ocd, class_utf8data);
2600 #endif /* SUPPORT_UCP */
2602 /* Now record the original range, possibly modified for UCP caseless
2603 overlapping ranges. */
2605 *class_utf8data++ = XCL_RANGE;
2606 class_utf8data += ord2utf8(c, class_utf8data);
2607 class_utf8data += ord2utf8(d, class_utf8data);
2609 /* With UCP support, we are done. Without UCP support, there is no
2610 caseless matching for UTF-8 characters > 127; we can use the bit map
2611 for the smaller ones. */
2613 #ifdef SUPPORT_UCP
2614 continue; /* With next character in the class */
2615 #else
2616 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2618 /* Adjust upper limit and fall through to set up the map */
2620 d = 127;
2622 #endif /* SUPPORT_UCP */
2624 #endif /* SUPPORT_UTF8 */
2626 /* We use the bit map for all cases when not in UTF-8 mode; else
2627 ranges that lie entirely within 0-127 when there is UCP support; else
2628 for partial ranges without UCP support. */
2630 for (; c <= d; c++)
2632 classbits[c/8] |= (1 << (c&7));
2633 if ((options & PCRE_CASELESS) != 0)
2635 int uc = cd->fcc[c]; /* flip case */
2636 classbits[uc/8] |= (1 << (uc&7));
2638 class_charcount++; /* in case a one-char range */
2639 class_lastchar = c;
2642 continue; /* Go get the next char in the class */
2645 /* Handle a lone single character - we can get here for a normal
2646 non-escape char, or after \ that introduces a single character or for an
2647 apparent range that isn't. */
2649 LONE_SINGLE_CHARACTER:
2651 /* Handle a character that cannot go in the bit map */
2653 #ifdef SUPPORT_UTF8
2654 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2656 class_utf8 = TRUE;
2657 *class_utf8data++ = XCL_SINGLE;
2658 class_utf8data += ord2utf8(c, class_utf8data);
2660 #ifdef SUPPORT_UCP
2661 if ((options & PCRE_CASELESS) != 0)
2663 int chartype;
2664 int othercase;
2665 if (ucp_findchar(c, &chartype, &othercase) >= 0 && othercase > 0)
2667 *class_utf8data++ = XCL_SINGLE;
2668 class_utf8data += ord2utf8(othercase, class_utf8data);
2671 #endif /* SUPPORT_UCP */
2674 else
2675 #endif /* SUPPORT_UTF8 */
2677 /* Handle a single-byte character */
2679 classbits[c/8] |= (1 << (c&7));
2680 if ((options & PCRE_CASELESS) != 0)
2682 c = cd->fcc[c]; /* flip case */
2683 classbits[c/8] |= (1 << (c&7));
2685 class_charcount++;
2686 class_lastchar = c;
2690 /* Loop until ']' reached; the check for end of string happens inside the
2691 loop. This "while" is the end of the "do" above. */
2693 while ((c = *(++ptr)) != ']' || inescq);
2695 /* If class_charcount is 1, we saw precisely one character whose value is
2696 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2697 can optimize the negative case only if there were no characters >= 128
2698 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2699 single-bytes only. This is an historical hangover. Maybe one day we can
2700 tidy these opcodes to handle multi-byte characters.
2702 The optimization throws away the bit map. We turn the item into a
2703 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2704 that OP_NOT does not support multibyte characters. In the positive case, it
2705 can cause firstbyte to be set. Otherwise, there can be no first char if
2706 this item is first, whatever repeat count may follow. In the case of
2707 reqbyte, save the previous value for reinstating. */
2709 #ifdef SUPPORT_UTF8
2710 if (class_charcount == 1 &&
2711 (!utf8 ||
2712 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2714 #else
2715 if (class_charcount == 1)
2716 #endif
2718 zeroreqbyte = reqbyte;
2720 /* The OP_NOT opcode works on one-byte characters only. */
2722 if (negate_class)
2724 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2725 zerofirstbyte = firstbyte;
2726 *code++ = OP_NOT;
2727 *code++ = class_lastchar;
2728 break;
2731 /* For a single, positive character, get the value into mcbuffer, and
2732 then we can handle this with the normal one-character code. */
2734 #ifdef SUPPORT_UTF8
2735 if (utf8 && class_lastchar > 127)
2736 mclength = ord2utf8(class_lastchar, mcbuffer);
2737 else
2738 #endif
2740 mcbuffer[0] = class_lastchar;
2741 mclength = 1;
2743 goto ONE_CHAR;
2744 } /* End of 1-char optimization */
2746 /* The general case - not the one-char optimization. If this is the first
2747 thing in the branch, there can be no first char setting, whatever the
2748 repeat count. Any reqbyte setting must remain unchanged after any kind of
2749 repeat. */
2751 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2752 zerofirstbyte = firstbyte;
2753 zeroreqbyte = reqbyte;
2755 /* If there are characters with values > 255, we have to compile an
2756 extended class, with its own opcode. If there are no characters < 256,
2757 we can omit the bitmap. */
2759 #ifdef SUPPORT_UTF8
2760 if (class_utf8)
2762 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2763 *code++ = OP_XCLASS;
2764 code += LINK_SIZE;
2765 *code = negate_class? XCL_NOT : 0;
2767 /* If the map is required, install it, and move on to the end of
2768 the extra data */
2770 if (class_charcount > 0)
2772 *code++ |= XCL_MAP;
2773 memcpy(code, classbits, 32);
2774 code = class_utf8data;
2777 /* If the map is not required, slide down the extra data. */
2779 else
2781 int len = class_utf8data - (code + 33);
2782 memmove(code + 1, code + 33, len);
2783 code += len + 1;
2786 /* Now fill in the complete length of the item */
2788 PUT(previous, 1, code - previous);
2789 break; /* End of class handling */
2791 #endif
2793 /* If there are no characters > 255, negate the 32-byte map if necessary,
2794 and copy it into the code vector. If this is the first thing in the branch,
2795 there can be no first char setting, whatever the repeat count. Any reqbyte
2796 setting must remain unchanged after any kind of repeat. */
2798 if (negate_class)
2800 *code++ = OP_NCLASS;
2801 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2803 else
2805 *code++ = OP_CLASS;
2806 memcpy(code, classbits, 32);
2808 code += 32;
2809 break;
2811 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2812 has been tested above. */
2814 case '{':
2815 if (!is_quantifier) goto NORMAL_CHAR;
2816 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
2817 if (*errorptr != NULL) goto FAILED;
2818 goto REPEAT;
2820 case '*':
2821 repeat_min = 0;
2822 repeat_max = -1;
2823 goto REPEAT;
2825 case '+':
2826 repeat_min = 1;
2827 repeat_max = -1;
2828 goto REPEAT;
2830 case '?':
2831 repeat_min = 0;
2832 repeat_max = 1;
2834 REPEAT:
2835 if (previous == NULL)
2837 *errorptr = ERR9;
2838 goto FAILED;
2841 if (repeat_min == 0)
2843 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2844 reqbyte = zeroreqbyte; /* Ditto */
2847 /* Remember whether this is a variable length repeat */
2849 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2851 op_type = 0; /* Default single-char op codes */
2852 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2854 /* Save start of previous item, in case we have to move it up to make space
2855 for an inserted OP_ONCE for the additional '+' extension. */
2857 tempcode = previous;
2859 /* If the next character is '+', we have a possessive quantifier. This
2860 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2861 If the next character is '?' this is a minimizing repeat, by default,
2862 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2863 repeat type to the non-default. */
2865 if (ptr[1] == '+')
2867 repeat_type = 0; /* Force greedy */
2868 possessive_quantifier = TRUE;
2869 ptr++;
2871 else if (ptr[1] == '?')
2873 repeat_type = greedy_non_default;
2874 ptr++;
2876 else repeat_type = greedy_default;
2878 /* If previous was a recursion, we need to wrap it inside brackets so that
2879 it can be replicated if necessary. */
2881 if (*previous == OP_RECURSE)
2883 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2884 code += 1 + LINK_SIZE;
2885 *previous = OP_BRA;
2886 PUT(previous, 1, code - previous);
2887 *code = OP_KET;
2888 PUT(code, 1, code - previous);
2889 code += 1 + LINK_SIZE;
2892 /* If previous was a character match, abolish the item and generate a
2893 repeat item instead. If a char item has a minimum of more than one, ensure
2894 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2895 the first thing in a branch because the x will have gone into firstbyte
2896 instead. */
2898 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2900 /* Deal with UTF-8 characters that take up more than one byte. It's
2901 easier to write this out separately than try to macrify it. Use c to
2902 hold the length of the character in bytes, plus 0x80 to flag that it's a
2903 length rather than a small character. */
2905 #ifdef SUPPORT_UTF8
2906 if (utf8 && (code[-1] & 0x80) != 0)
2908 uschar *lastchar = code - 1;
2909 while((*lastchar & 0xc0) == 0x80) lastchar--;
2910 c = code - lastchar; /* Length of UTF-8 character */
2911 memcpy(utf8_char, lastchar, c); /* Save the char */
2912 c |= 0x80; /* Flag c as a length */
2914 else
2915 #endif
2917 /* Handle the case of a single byte - either with no UTF8 support, or
2918 with UTF-8 disabled, or for a UTF-8 character < 128. */
2921 c = code[-1];
2922 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2925 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2928 /* If previous was a single negated character ([^a] or similar), we use
2929 one of the special opcodes, replacing it. The code is shared with single-
2930 character repeats by setting op_type to add a suitable offset into
2931 repeat_type. OP_NOT is currently used only for single-byte chars. */
2933 else if (*previous == OP_NOT)
2935 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2936 c = previous[1];
2937 goto OUTPUT_SINGLE_REPEAT;
2940 /* If previous was a character type match (\d or similar), abolish it and
2941 create a suitable repeat item. The code is shared with single-character
2942 repeats by setting op_type to add a suitable offset into repeat_type. Note
2943 that the Unicode property types will be present only when SUPPORT_UCP is
2944 defined, but we don't wrap the little bits of code here because it just
2945 makes it horribly messy. */
2947 else if (*previous < OP_EODN)
2949 uschar *oldcode;
2950 int prop_type;
2951 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2952 c = *previous;
2954 OUTPUT_SINGLE_REPEAT:
2955 prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2956 previous[1] : -1;
2958 oldcode = code;
2959 code = previous; /* Usually overwrite previous item */
2961 /* If the maximum is zero then the minimum must also be zero; Perl allows
2962 this case, so we do too - by simply omitting the item altogether. */
2964 if (repeat_max == 0) goto END_REPEAT;
2966 /* All real repeats make it impossible to handle partial matching (maybe
2967 one day we will be able to remove this restriction). */
2969 if (repeat_max != 1) cd->nopartial = TRUE;
2971 /* Combine the op_type with the repeat_type */
2973 repeat_type += op_type;
2975 /* A minimum of zero is handled either as the special case * or ?, or as
2976 an UPTO, with the maximum given. */
2978 if (repeat_min == 0)
2980 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2981 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2982 else
2984 *code++ = OP_UPTO + repeat_type;
2985 PUT2INC(code, 0, repeat_max);
2989 /* A repeat minimum of 1 is optimized into some special cases. If the
2990 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2991 left in place and, if the maximum is greater than 1, we use OP_UPTO with
2992 one less than the maximum. */
2994 else if (repeat_min == 1)
2996 if (repeat_max == -1)
2997 *code++ = OP_PLUS + repeat_type;
2998 else
3000 code = oldcode; /* leave previous item in place */
3001 if (repeat_max == 1) goto END_REPEAT;
3002 *code++ = OP_UPTO + repeat_type;
3003 PUT2INC(code, 0, repeat_max - 1);
3007 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3008 handled as an EXACT followed by an UPTO. */
3010 else
3012 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3013 PUT2INC(code, 0, repeat_min);
3015 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3016 we have to insert the character for the previous code. For a repeated
3017 Unicode property match, there is an extra byte that defines the
3018 required property. In UTF-8 mode, long characters have their length in
3019 c, with the 0x80 bit as a flag. */
3021 if (repeat_max < 0)
3023 #ifdef SUPPORT_UTF8
3024 if (utf8 && c >= 128)
3026 memcpy(code, utf8_char, c & 7);
3027 code += c & 7;
3029 else
3030 #endif
3032 *code++ = c;
3033 if (prop_type >= 0) *code++ = prop_type;
3035 *code++ = OP_STAR + repeat_type;
3038 /* Else insert an UPTO if the max is greater than the min, again
3039 preceded by the character, for the previously inserted code. */
3041 else if (repeat_max != repeat_min)
3043 #ifdef SUPPORT_UTF8
3044 if (utf8 && c >= 128)
3046 memcpy(code, utf8_char, c & 7);
3047 code += c & 7;
3049 else
3050 #endif
3051 *code++ = c;
3052 if (prop_type >= 0) *code++ = prop_type;
3053 repeat_max -= repeat_min;
3054 *code++ = OP_UPTO + repeat_type;
3055 PUT2INC(code, 0, repeat_max);
3059 /* The character or character type itself comes last in all cases. */
3061 #ifdef SUPPORT_UTF8
3062 if (utf8 && c >= 128)
3064 memcpy(code, utf8_char, c & 7);
3065 code += c & 7;
3067 else
3068 #endif
3069 *code++ = c;
3071 /* For a repeated Unicode property match, there is an extra byte that
3072 defines the required property. */
3074 #ifdef SUPPORT_UCP
3075 if (prop_type >= 0) *code++ = prop_type;
3076 #endif
3079 /* If previous was a character class or a back reference, we put the repeat
3080 stuff after it, but just skip the item if the repeat was {0,0}. */
3082 else if (*previous == OP_CLASS ||
3083 *previous == OP_NCLASS ||
3084 #ifdef SUPPORT_UTF8
3085 *previous == OP_XCLASS ||
3086 #endif
3087 *previous == OP_REF)
3089 if (repeat_max == 0)
3091 code = previous;
3092 goto END_REPEAT;
3095 /* All real repeats make it impossible to handle partial matching (maybe
3096 one day we will be able to remove this restriction). */
3098 if (repeat_max != 1) cd->nopartial = TRUE;
3100 if (repeat_min == 0 && repeat_max == -1)
3101 *code++ = OP_CRSTAR + repeat_type;
3102 else if (repeat_min == 1 && repeat_max == -1)
3103 *code++ = OP_CRPLUS + repeat_type;
3104 else if (repeat_min == 0 && repeat_max == 1)
3105 *code++ = OP_CRQUERY + repeat_type;
3106 else
3108 *code++ = OP_CRRANGE + repeat_type;
3109 PUT2INC(code, 0, repeat_min);
3110 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3111 PUT2INC(code, 0, repeat_max);
3115 /* If previous was a bracket group, we may have to replicate it in certain
3116 cases. */
3118 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
3119 *previous == OP_COND)
3121 register int i;
3122 int ketoffset = 0;
3123 int len = code - previous;
3124 uschar *bralink = NULL;
3126 /* If the maximum repeat count is unlimited, find the end of the bracket
3127 by scanning through from the start, and compute the offset back to it
3128 from the current code pointer. There may be an OP_OPT setting following
3129 the final KET, so we can't find the end just by going back from the code
3130 pointer. */
3132 if (repeat_max == -1)
3134 register uschar *ket = previous;
3135 do ket += GET(ket, 1); while (*ket != OP_KET);
3136 ketoffset = code - ket;
3139 /* The case of a zero minimum is special because of the need to stick
3140 OP_BRAZERO in front of it, and because the group appears once in the
3141 data, whereas in other cases it appears the minimum number of times. For
3142 this reason, it is simplest to treat this case separately, as otherwise
3143 the code gets far too messy. There are several special subcases when the
3144 minimum is zero. */
3146 if (repeat_min == 0)
3148 /* If the maximum is also zero, we just omit the group from the output
3149 altogether. */
3151 if (repeat_max == 0)
3153 code = previous;
3154 goto END_REPEAT;
3157 /* If the maximum is 1 or unlimited, we just have to stick in the
3158 BRAZERO and do no more at this point. However, we do need to adjust
3159 any OP_RECURSE calls inside the group that refer to the group itself or
3160 any internal group, because the offset is from the start of the whole
3161 regex. Temporarily terminate the pattern while doing this. */
3163 if (repeat_max <= 1)
3165 *code = OP_END;
3166 adjust_recurse(previous, 1, utf8, cd);
3167 memmove(previous+1, previous, len);
3168 code++;
3169 *previous++ = OP_BRAZERO + repeat_type;
3172 /* If the maximum is greater than 1 and limited, we have to replicate
3173 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3174 The first one has to be handled carefully because it's the original
3175 copy, which has to be moved up. The remainder can be handled by code
3176 that is common with the non-zero minimum case below. We have to
2177 adjust the value of repeat_max, since one less copy is required. Once
3178 again, we may have to adjust any OP_RECURSE calls inside the group. */
3180 else
3182 int offset;
3183 *code = OP_END;
3184 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
3185 memmove(previous + 2 + LINK_SIZE, previous, len);
3186 code += 2 + LINK_SIZE;
3187 *previous++ = OP_BRAZERO + repeat_type;
3188 *previous++ = OP_BRA;
3190 /* We chain together the bracket offset fields that have to be
3191 filled in later when the ends of the brackets are reached. */
3193 offset = (bralink == NULL)? 0 : previous - bralink;
3194 bralink = previous;
3195 PUTINC(previous, 0, offset);
3198 repeat_max--;
3201 /* If the minimum is greater than zero, replicate the group as many
3202 times as necessary, and adjust the maximum to the number of subsequent
3203 copies that we need. If we set a first char from the group, and didn't
3204 set a required char, copy the latter from the former. */
3206 else
3208 if (repeat_min > 1)
3210 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3211 for (i = 1; i < repeat_min; i++)
3213 memcpy(code, previous, len);
3214 code += len;
3217 if (repeat_max > 0) repeat_max -= repeat_min;
3220 /* This code is common to both the zero and non-zero minimum cases. If
3221 the maximum is limited, it replicates the group in a nested fashion,
3222 remembering the bracket starts on a stack. In the case of a zero minimum,
3223 the first one was set up above. In all cases the repeat_max now specifies
3224 the number of additional copies needed. */
3226 if (repeat_max >= 0)
3228 for (i = repeat_max - 1; i >= 0; i--)
3230 *code++ = OP_BRAZERO + repeat_type;
3232 /* All but the final copy start a new nesting, maintaining the
3233 chain of brackets outstanding. */
3235 if (i != 0)
3237 int offset;
3238 *code++ = OP_BRA;
3239 offset = (bralink == NULL)? 0 : code - bralink;
3240 bralink = code;
3241 PUTINC(code, 0, offset);
3244 memcpy(code, previous, len);
3245 code += len;
3248 /* Now chain through the pending brackets, and fill in their length
3249 fields (which are holding the chain links pro tem). */
3251 while (bralink != NULL)
3253 int oldlinkoffset;
3254 int offset = code - bralink + 1;
3255 uschar *bra = code - offset;
3256 oldlinkoffset = GET(bra, 1);
3257 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3258 *code++ = OP_KET;
3259 PUTINC(code, 0, offset);
3260 PUT(bra, 1, offset);
3264 /* If the maximum is unlimited, set a repeater in the final copy. We
3265 can't just offset backwards from the current code point, because we
3266 don't know if there's been an options resetting after the ket. The
3267 correct offset was computed above. */
3269 else code[-ketoffset] = OP_KETRMAX + repeat_type;
3272 /* Else there's some kind of shambles */
3274 else
3276 *errorptr = ERR11;
3277 goto FAILED;
3280 /* If the character following a repeat is '+', we wrap the entire repeated
3281 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
3282 Sun's Java package. The repeated item starts at tempcode, not at previous,
3283 which might be the first part of a string whose (former) last char we
3284 repeated. However, we don't support '+' after a greediness '?'. */
3286 if (possessive_quantifier)
3288 int len = code - tempcode;
3289 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3290 code += 1 + LINK_SIZE;
3291 len += 1 + LINK_SIZE;
3292 tempcode[0] = OP_ONCE;
3293 *code++ = OP_KET;
3294 PUTINC(code, 0, len);
3295 PUT(tempcode, 1, len);
3298 /* In all case we no longer have a previous item. We also set the
3299 "follows varying string" flag for subsequently encountered reqbytes if
3300 it isn't already set and we have just passed a varying length item. */
3302 END_REPEAT:
3303 previous = NULL;
3304 cd->req_varyopt |= reqvary;
3305 break;
3308 /* Start of nested bracket sub-expression, or comment or lookahead or
3309 lookbehind or option setting or condition. First deal with special things
3310 that can come after a bracket; all are introduced by ?, and the appearance
3311 of any of them means that this is not a referencing group. They were
3312 checked for validity in the first pass over the string, so we don't have to
3313 check for syntax errors here. */
3315 case '(':
3316 newoptions = options;
3317 skipbytes = 0;
3319 if (*(++ptr) == '?')
3321 int set, unset;
3322 int *optset;
3324 switch (*(++ptr))
3326 case '#': /* Comment; skip to ket */
3327 ptr++;
3328 while (*ptr != ')') ptr++;
3329 continue;
3331 case ':': /* Non-extracting bracket */
3332 bravalue = OP_BRA;
3333 ptr++;
3334 break;
3336 case '(':
3337 bravalue = OP_COND; /* Conditional group */
3339 /* Condition to test for recursion */
3341 if (ptr[1] == 'R')
3343 code[1+LINK_SIZE] = OP_CREF;
3344 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
3345 skipbytes = 3;
3346 ptr += 3;
3349 /* Condition to test for a numbered subpattern match. We know that
3350 if a digit follows ( then there will just be digits until ) because
3351 the syntax was checked in the first pass. */
3353 else if ((digitab[ptr[1]] && ctype_digit) != 0)
3355 int condref; /* Don't amalgamate; some compilers */
3356 condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
3357 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
3358 if (condref == 0)
3360 *errorptr = ERR35;
3361 goto FAILED;
3363 ptr++;
3364 code[1+LINK_SIZE] = OP_CREF;
3365 PUT2(code, 2+LINK_SIZE, condref);
3366 skipbytes = 3;
3368 /* For conditions that are assertions, we just fall through, having
3369 set bravalue above. */
3370 break;
3372 case '=': /* Positive lookahead */
3373 bravalue = OP_ASSERT;
3374 ptr++;
3375 break;
3377 case '!': /* Negative lookahead */
3378 bravalue = OP_ASSERT_NOT;
3379 ptr++;
3380 break;
3382 case '<': /* Lookbehinds */
3383 switch (*(++ptr))
3385 case '=': /* Positive lookbehind */
3386 bravalue = OP_ASSERTBACK;
3387 ptr++;
3388 break;
3390 case '!': /* Negative lookbehind */
3391 bravalue = OP_ASSERTBACK_NOT;
3392 ptr++;
3393 break;
3395 break;
3397 case '>': /* One-time brackets */
3398 bravalue = OP_ONCE;
3399 ptr++;
3400 break;
3402 case 'C': /* Callout - may be followed by digits; */
3403 previous_callout = code; /* Save for later completion */
3404 after_manual_callout = 1; /* Skip one item before completing */
3405 *code++ = OP_CALLOUT; /* Already checked that the terminating */
3406 { /* closing parenthesis is present. */
3407 int n = 0;
3408 while ((digitab[*(++ptr)] & ctype_digit) != 0)
3409 n = n * 10 + *ptr - '0';
3410 if (n > 255)
3412 *errorptr = ERR38;
3413 goto FAILED;
3415 *code++ = n;
3416 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3417 PUT(code, LINK_SIZE, 0); /* Default length */
3418 code += 2 * LINK_SIZE;
3420 previous = NULL;
3421 continue;
3423 case 'P': /* Named subpattern handling */
3424 if (*(++ptr) == '<') /* Definition */
3426 int i, namelen;
3427 uschar *slot = cd->name_table;
3428 const uschar *name; /* Don't amalgamate; some compilers */
3429 name = ++ptr; /* grumble at autoincrement in declaration */
3431 while (*ptr++ != '>');
3432 namelen = ptr - name - 1;
3434 for (i = 0; i < cd->names_found; i++)
3436 int crc = memcmp(name, slot+2, namelen);
3437 if (crc == 0)
3439 if (slot[2+namelen] == 0)
3441 *errorptr = ERR43;
3442 goto FAILED;
3444 crc = -1; /* Current name is substring */
3446 if (crc < 0)
3448 memmove(slot + cd->name_entry_size, slot,
3449 (cd->names_found - i) * cd->name_entry_size);
3450 break;
3452 slot += cd->name_entry_size;
3455 PUT2(slot, 0, *brackets + 1);
3456 memcpy(slot + 2, name, namelen);
3457 slot[2+namelen] = 0;
3458 cd->names_found++;
3459 goto NUMBERED_GROUP;
3462 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
3464 int i, namelen;
3465 int type = *ptr++;
3466 const uschar *name = ptr;
3467 uschar *slot = cd->name_table;
3469 while (*ptr != ')') ptr++;
3470 namelen = ptr - name;
3472 for (i = 0; i < cd->names_found; i++)
3474 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3475 slot += cd->name_entry_size;
3477 if (i >= cd->names_found)
3479 *errorptr = ERR15;
3480 goto FAILED;
3483 recno = GET2(slot, 0);
3485 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
3487 /* Back reference */
3489 previous = code;
3490 *code++ = OP_REF;
3491 PUT2INC(code, 0, recno);
3492 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
3493 if (recno > cd->top_backref) cd->top_backref = recno;
3494 continue;
3497 /* Should never happen */
3498 break;
3500 case 'R': /* Pattern recursion */
3501 ptr++; /* Same as (?0) */
3502 /* Fall through */
3504 /* Recursion or "subroutine" call */
3506 case '0': case '1': case '2': case '3': case '4':
3507 case '5': case '6': case '7': case '8': case '9':
3509 const uschar *called;
3510 recno = 0;
3511 while((digitab[*ptr] & ctype_digit) != 0)
3512 recno = recno * 10 + *ptr++ - '0';
3514 /* Come here from code above that handles a named recursion */
3516 HANDLE_RECURSION:
3518 previous = code;
3520 /* Find the bracket that is being referenced. Temporarily end the
3521 regex in case it doesn't exist. */
3523 *code = OP_END;
3524 called = (recno == 0)?
3525 cd->start_code : find_bracket(cd->start_code, utf8, recno);
3527 if (called == NULL)
3529 *errorptr = ERR15;
3530 goto FAILED;
3533 /* If the subpattern is still open, this is a recursive call. We
3534 check to see if this is a left recursion that could loop for ever,
3535 and diagnose that case. */
3537 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3539 *errorptr = ERR40;
3540 goto FAILED;
3543 /* Insert the recursion/subroutine item */
3545 *code = OP_RECURSE;
3546 PUT(code, 1, called - cd->start_code);
3547 code += 1 + LINK_SIZE;
3549 continue;
3551 /* Character after (? not specially recognized */
3553 default: /* Option setting */
3554 set = unset = 0;
3555 optset = &set;
3557 while (*ptr != ')' && *ptr != ':')
3559 switch (*ptr++)
3561 case '-': optset = &unset; break;
3563 case 'i': *optset |= PCRE_CASELESS; break;
3564 case 'm': *optset |= PCRE_MULTILINE; break;
3565 case 's': *optset |= PCRE_DOTALL; break;
3566 case 'x': *optset |= PCRE_EXTENDED; break;
3567 case 'U': *optset |= PCRE_UNGREEDY; break;
3568 case 'X': *optset |= PCRE_EXTRA; break;
3572 /* Set up the changed option bits, but don't change anything yet. */
3574 newoptions = (options | set) & (~unset);
3576 /* If the options ended with ')' this is not the start of a nested
3577 group with option changes, so the options change at this level. Compile
3578 code to change the ims options if this setting actually changes any of
3579 them. We also pass the new setting back so that it can be put at the
3580 start of any following branches, and when this group ends (if we are in
3581 a group), a resetting item can be compiled.
3583 Note that if this item is right at the start of the pattern, the
3584 options will have been abstracted and made global, so there will be no
3585 change to compile. */
3587 if (*ptr == ')')
3589 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3591 *code++ = OP_OPT;
3592 *code++ = newoptions & PCRE_IMS;
3595 /* Change options at this level, and pass them back for use
3596 in subsequent branches. Reset the greedy defaults and the case
3597 value for firstbyte and reqbyte. */
3599 *optionsptr = options = newoptions;
3600 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3601 greedy_non_default = greedy_default ^ 1;
3602 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3604 previous = NULL; /* This item can't be repeated */
3605 continue; /* It is complete */
3608 /* If the options ended with ':' we are heading into a nested group
3609 with possible change of options. Such groups are non-capturing and are
3610 not assertions of any kind. All we need to do is skip over the ':';
3611 the newoptions value is handled below. */
3613 bravalue = OP_BRA;
3614 ptr++;
3618 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3619 non-capturing and behave like (?:...) brackets */
3621 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3623 bravalue = OP_BRA;
3626 /* Else we have a referencing group; adjust the opcode. If the bracket
3627 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3628 arrange for the true number to follow later, in an OP_BRANUMBER item. */
3630 else
3632 NUMBERED_GROUP:
3633 if (++(*brackets) > EXTRACT_BASIC_MAX)
3635 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3636 code[1+LINK_SIZE] = OP_BRANUMBER;
3637 PUT2(code, 2+LINK_SIZE, *brackets);
3638 skipbytes = 3;
3640 else bravalue = OP_BRA + *brackets;
3643 /* Process nested bracketed re. Assertions may not be repeated, but other
3644 kinds can be. We copy code into a non-register variable in order to be able
3645 to pass its address because some compilers complain otherwise. Pass in a
3646 new setting for the ims options if they have changed. */
3648 previous = (bravalue >= OP_ONCE)? code : NULL;
3649 *code = bravalue;
3650 tempcode = code;
3651 tempreqvary = cd->req_varyopt; /* Save value before bracket */
3653 if (!compile_regex(
3654 newoptions, /* The complete new option state */
3655 options & PCRE_IMS, /* The previous ims option state */
3656 brackets, /* Extracting bracket count */
3657 &tempcode, /* Where to put code (updated) */
3658 &ptr, /* Input pointer (updated) */
3659 errorptr, /* Where to put an error message */
3660 (bravalue == OP_ASSERTBACK ||
3661 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3662 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3663 &subfirstbyte, /* For possible first char */
3664 &subreqbyte, /* For possible last char */
3665 bcptr, /* Current branch chain */
3666 cd)) /* Tables block */
3667 goto FAILED;
3669 /* At the end of compiling, code is still pointing to the start of the
3670 group, while tempcode has been updated to point past the end of the group
3671 and any option resetting that may follow it. The pattern pointer (ptr)
3672 is on the bracket. */
3674 /* If this is a conditional bracket, check that there are no more than
3675 two branches in the group. */
3677 else if (bravalue == OP_COND)
3679 uschar *tc = code;
3680 condcount = 0;
3682 do {
3683 condcount++;
3684 tc += GET(tc,1);
3686 while (*tc != OP_KET);
3688 if (condcount > 2)
3690 *errorptr = ERR27;
3691 goto FAILED;
3694 /* If there is just one branch, we must not make use of its firstbyte or
3695 reqbyte, because this is equivalent to an empty second branch. */
3697 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3700 /* Handle updating of the required and first characters. Update for normal
3701 brackets of all kinds, and conditions with two branches (see code above).
3702 If the bracket is followed by a quantifier with zero repeat, we have to
3703 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3704 main loop so that they can be accessed for the back off. */
3706 zeroreqbyte = reqbyte;
3707 zerofirstbyte = firstbyte;
3708 groupsetfirstbyte = FALSE;
3710 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3712 /* If we have not yet set a firstbyte in this branch, take it from the
3713 subpattern, remembering that it was set here so that a repeat of more
3714 than one can replicate it as reqbyte if necessary. If the subpattern has
3715 no firstbyte, set "none" for the whole branch. In both cases, a zero
3716 repeat forces firstbyte to "none". */
3718 if (firstbyte == REQ_UNSET)
3720 if (subfirstbyte >= 0)
3722 firstbyte = subfirstbyte;
3723 groupsetfirstbyte = TRUE;
3725 else firstbyte = REQ_NONE;
3726 zerofirstbyte = REQ_NONE;
3729 /* If firstbyte was previously set, convert the subpattern's firstbyte
3730 into reqbyte if there wasn't one, using the vary flag that was in
3731 existence beforehand. */
3733 else if (subfirstbyte >= 0 && subreqbyte < 0)
3734 subreqbyte = subfirstbyte | tempreqvary;
3736 /* If the subpattern set a required byte (or set a first byte that isn't
3737 really the first byte - see above), set it. */
3739 if (subreqbyte >= 0) reqbyte = subreqbyte;
3742 /* For a forward assertion, we take the reqbyte, if set. This can be
3743 helpful if the pattern that follows the assertion doesn't set a different
3744 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3745 for an assertion, however because it leads to incorrect effect for patterns
3746 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3747 of a firstbyte. This is overcome by a scan at the end if there's no
3748 firstbyte, looking for an asserted first char. */
3750 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3752 /* Now update the main code pointer to the end of the group. */
3754 code = tempcode;
3756 /* Error if hit end of pattern */
3758 if (*ptr != ')')
3760 *errorptr = ERR14;
3761 goto FAILED;
3763 break;
3765 /* Check \ for being a real metacharacter; if not, fall through and handle
3766 it as a data character at the start of a string. Escape items are checked
3767 for validity in the pre-compiling pass. */
3769 case '\\':
3770 tempptr = ptr;
3771 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
3773 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3774 are arranged to be the negation of the corresponding OP_values. For the
3775 back references, the values are ESC_REF plus the reference number. Only
3776 back references and those types that consume a character may be repeated.
3777 We can test for values between ESC_b and ESC_Z for the latter; this may
3778 have to change if any new ones are ever created. */
3780 if (c < 0)
3782 if (-c == ESC_Q) /* Handle start of quoted string */
3784 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3785 else inescq = TRUE;
3786 continue;
3789 /* For metasequences that actually match a character, we disable the
3790 setting of a first character if it hasn't already been set. */
3792 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3793 firstbyte = REQ_NONE;
3795 /* Set values to reset to if this is followed by a zero repeat. */
3797 zerofirstbyte = firstbyte;
3798 zeroreqbyte = reqbyte;
3800 /* Back references are handled specially */
3802 if (-c >= ESC_REF)
3804 int number = -c - ESC_REF;
3805 previous = code;
3806 *code++ = OP_REF;
3807 PUT2INC(code, 0, number);
3810 /* So are Unicode property matches, if supported. We know that get_ucp
3811 won't fail because it was tested in the pre-pass. */
3813 #ifdef SUPPORT_UCP
3814 else if (-c == ESC_P || -c == ESC_p)
3816 BOOL negated;
3817 int value = get_ucp(&ptr, &negated, errorptr);
3818 previous = code;
3819 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3820 *code++ = value;
3822 #endif
3824 /* For the rest, we can obtain the OP value by negating the escape
3825 value */
3827 else
3829 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3830 *code++ = -c;
3832 continue;
3835 /* We have a data character whose value is in c. In UTF-8 mode it may have
3836 a value > 127. We set its representation in the length/buffer, and then
3837 handle it as a data character. */
3839 #ifdef SUPPORT_UTF8
3840 if (utf8 && c > 127)
3841 mclength = ord2utf8(c, mcbuffer);
3842 else
3843 #endif
3846 mcbuffer[0] = c;
3847 mclength = 1;
3850 goto ONE_CHAR;
3852 /* Handle a literal character. It is guaranteed not to be whitespace or #
3853 when the extended flag is set. If we are in UTF-8 mode, it may be a
3854 multi-byte literal character. */
3856 default:
3857 NORMAL_CHAR:
3858 mclength = 1;
3859 mcbuffer[0] = c;
3861 #ifdef SUPPORT_UTF8
3862 if (utf8 && (c & 0xc0) == 0xc0)
3864 while ((ptr[1] & 0xc0) == 0x80)
3865 mcbuffer[mclength++] = *(++ptr);
3867 #endif
3869 /* At this point we have the character's bytes in mcbuffer, and the length
3870 in mclength. When not in UTF-8 mode, the length is always 1. */
3872 ONE_CHAR:
3873 previous = code;
3874 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3875 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3877 /* Set the first and required bytes appropriately. If no previous first
3878 byte, set it from this character, but revert to none on a zero repeat.
3879 Otherwise, leave the firstbyte value alone, and don't change it on a zero
3880 repeat. */
3882 if (firstbyte == REQ_UNSET)
3884 zerofirstbyte = REQ_NONE;
3885 zeroreqbyte = reqbyte;
3887 /* If the character is more than one byte long, we can set firstbyte
3888 only if it is not to be matched caselessly. */
3890 if (mclength == 1 || req_caseopt == 0)
3892 firstbyte = mcbuffer[0] | req_caseopt;
3893 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3895 else firstbyte = reqbyte = REQ_NONE;
3898 /* firstbyte was previously set; we can set reqbyte only the length is
3899 1 or the matching is caseful. */
3901 else
3903 zerofirstbyte = firstbyte;
3904 zeroreqbyte = reqbyte;
3905 if (mclength == 1 || req_caseopt == 0)
3906 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3909 break; /* End of literal character handling */
3911 } /* end of big loop */
3913 /* Control never reaches here by falling through, only by a goto for all the
3914 error states. Pass back the position in the pattern so that it can be displayed
3915 to the user for diagnosing the error. */
3917 FAILED:
3918 *ptrptr = ptr;
3919 return FALSE;
3925 /*************************************************
3926 * Compile sequence of alternatives *
3927 *************************************************/
3929 /* On entry, ptr is pointing past the bracket character, but on return
3930 it points to the closing bracket, or vertical bar, or end of string.
3931 The code variable is pointing at the byte into which the BRA operator has been
3932 stored. If the ims options are changed at the start (for a (?ims: group) or
3933 during any branch, we need to insert an OP_OPT item at the start of every
3934 following branch to ensure they get set correctly at run time, and also pass
3935 the new options into every subsequent branch compile.
3937 Arguments:
3938 options option bits, including any changes for this subpattern
3939 oldims previous settings of ims option bits
3940 brackets -> int containing the number of extracting brackets used
3941 codeptr -> the address of the current code pointer
3942 ptrptr -> the address of the current pattern pointer
3943 errorptr -> pointer to error message
3944 lookbehind TRUE if this is a lookbehind assertion
3945 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3946 firstbyteptr place to put the first required character, or a negative number
3947 reqbyteptr place to put the last required character, or a negative number
3948 bcptr pointer to the chain of currently open branches
3949 cd points to the data block with tables pointers etc.
3951 Returns: TRUE on success
3954 static BOOL
3955 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3956 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3957 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3959 const uschar *ptr = *ptrptr;
3960 uschar *code = *codeptr;
3961 uschar *last_branch = code;
3962 uschar *start_bracket = code;
3963 uschar *reverse_count = NULL;
3964 int firstbyte, reqbyte;
3965 int branchfirstbyte, branchreqbyte;
3966 branch_chain bc;
3968 bc.outer = bcptr;
3969 bc.current = code;
3971 firstbyte = reqbyte = REQ_UNSET;
3973 /* Offset is set zero to mark that this bracket is still open */
3975 PUT(code, 1, 0);
3976 code += 1 + LINK_SIZE + skipbytes;
3978 /* Loop for each alternative branch */
3980 for (;;)
3982 /* Handle a change of ims options at the start of the branch */
3984 if ((options & PCRE_IMS) != oldims)
3986 *code++ = OP_OPT;
3987 *code++ = options & PCRE_IMS;
3990 /* Set up dummy OP_REVERSE if lookbehind assertion */
3992 if (lookbehind)
3994 *code++ = OP_REVERSE;
3995 reverse_count = code;
3996 PUTINC(code, 0, 0);
3999 /* Now compile the branch */
4001 if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
4002 &branchfirstbyte, &branchreqbyte, &bc, cd))
4004 *ptrptr = ptr;
4005 return FALSE;
4008 /* If this is the first branch, the firstbyte and reqbyte values for the
4009 branch become the values for the regex. */
4011 if (*last_branch != OP_ALT)
4013 firstbyte = branchfirstbyte;
4014 reqbyte = branchreqbyte;
4017 /* If this is not the first branch, the first char and reqbyte have to
4018 match the values from all the previous branches, except that if the previous
4019 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
4020 REQ_VARY for the regex. */
4022 else
4024 /* If we previously had a firstbyte, but it doesn't match the new branch,
4025 we have to abandon the firstbyte for the regex, but if there was previously
4026 no reqbyte, it takes on the value of the old firstbyte. */
4028 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4030 if (reqbyte < 0) reqbyte = firstbyte;
4031 firstbyte = REQ_NONE;
4034 /* If we (now or from before) have no firstbyte, a firstbyte from the
4035 branch becomes a reqbyte if there isn't a branch reqbyte. */
4037 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4038 branchreqbyte = branchfirstbyte;
4040 /* Now ensure that the reqbytes match */
4042 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4043 reqbyte = REQ_NONE;
4044 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
4047 /* If lookbehind, check that this branch matches a fixed-length string,
4048 and put the length into the OP_REVERSE item. Temporarily mark the end of
4049 the branch with OP_END. */
4051 if (lookbehind)
4053 int length;
4054 *code = OP_END;
4055 length = find_fixedlength(last_branch, options);
4056 DPRINTF(("fixed length = %d\n", length));
4057 if (length < 0)
4059 *errorptr = (length == -2)? ERR36 : ERR25;
4060 *ptrptr = ptr;
4061 return FALSE;
4063 PUT(reverse_count, 0, length);
4066 /* Reached end of expression, either ')' or end of pattern. Go back through
4067 the alternative branches and reverse the chain of offsets, with the field in
4068 the BRA item now becoming an offset to the first alternative. If there are
4069 no alternatives, it points to the end of the group. The length in the
4070 terminating ket is always the length of the whole bracketed item. If any of
4071 the ims options were changed inside the group, compile a resetting op-code
4072 following, except at the very end of the pattern. Return leaving the pointer
4073 at the terminating char. */
4075 if (*ptr != '|')
4077 int length = code - last_branch;
4080 int prev_length = GET(last_branch, 1);
4081 PUT(last_branch, 1, length);
4082 length = prev_length;
4083 last_branch -= length;
4085 while (length > 0);
4087 /* Fill in the ket */
4089 *code = OP_KET;
4090 PUT(code, 1, code - start_bracket);
4091 code += 1 + LINK_SIZE;
4093 /* Resetting option if needed */
4095 if ((options & PCRE_IMS) != oldims && *ptr == ')')
4097 *code++ = OP_OPT;
4098 *code++ = oldims;
4101 /* Set values to pass back */
4103 *codeptr = code;
4104 *ptrptr = ptr;
4105 *firstbyteptr = firstbyte;
4106 *reqbyteptr = reqbyte;
4107 return TRUE;
4110 /* Another branch follows; insert an "or" node. Its length field points back
4111 to the previous branch while the bracket remains open. At the end the chain
4112 is reversed. It's done like this so that the start of the bracket has a
4113 zero offset until it is closed, making it possible to detect recursion. */
4115 *code = OP_ALT;
4116 PUT(code, 1, code - last_branch);
4117 bc.current = last_branch = code;
4118 code += 1 + LINK_SIZE;
4119 ptr++;
4121 /* Control never reaches here */
4127 /*************************************************
4128 * Check for anchored expression *
4129 *************************************************/
4131 /* Try to find out if this is an anchored regular expression. Consider each
4132 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
4133 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
4134 it's anchored. However, if this is a multiline pattern, then only OP_SOD
4135 counts, since OP_CIRC can match in the middle.
4137 We can also consider a regex to be anchored if OP_SOM starts all its branches.
4138 This is the code for \G, which means "match at start of match position, taking
4139 into account the match offset".
4141 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
4142 because that will try the rest of the pattern at all possible matching points,
4143 so there is no point trying again.... er ....
4145 .... except when the .* appears inside capturing parentheses, and there is a
4146 subsequent back reference to those parentheses. We haven't enough information
4147 to catch that case precisely.
4149 At first, the best we could do was to detect when .* was in capturing brackets
4150 and the highest back reference was greater than or equal to that level.
4151 However, by keeping a bitmap of the first 31 back references, we can catch some
4152 of the more common cases more precisely.
4154 Arguments:
4155 code points to start of expression (the bracket)
4156 options points to the options setting
4157 bracket_map a bitmap of which brackets we are inside while testing; this
4158 handles up to substring 31; after that we just have to take
4159 the less precise approach
4160 backref_map the back reference bitmap
4162 Returns: TRUE or FALSE
4165 static BOOL
4166 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
4167 unsigned int backref_map)
4169 do {
4170 const uschar *scode =
4171 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
4172 register int op = *scode;
4174 /* Capturing brackets */
4176 if (op > OP_BRA)
4178 int new_map;
4179 op -= OP_BRA;
4180 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
4181 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
4182 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4185 /* Other brackets */
4187 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4189 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4192 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4193 are or may be referenced. */
4195 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
4196 (*options & PCRE_DOTALL) != 0)
4198 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4201 /* Check for explicit anchoring */
4203 else if (op != OP_SOD && op != OP_SOM &&
4204 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
4205 return FALSE;
4206 code += GET(code, 1);
4208 while (*code == OP_ALT); /* Loop for each alternative */
4209 return TRUE;
4214 /*************************************************
4215 * Check for starting with ^ or .* *
4216 *************************************************/
4218 /* This is called to find out if every branch starts with ^ or .* so that
4219 "first char" processing can be done to speed things up in multiline
4220 matching and for non-DOTALL patterns that start with .* (which must start at
4221 the beginning or after \n). As in the case of is_anchored() (see above), we
4222 have to take account of back references to capturing brackets that contain .*
4223 because in that case we can't make the assumption.
4225 Arguments:
4226 code points to start of expression (the bracket)
4227 bracket_map a bitmap of which brackets we are inside while testing; this
4228 handles up to substring 31; after that we just have to take
4229 the less precise approach
4230 backref_map the back reference bitmap
4232 Returns: TRUE or FALSE
4235 static BOOL
4236 is_startline(const uschar *code, unsigned int bracket_map,
4237 unsigned int backref_map)
4239 do {
4240 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
4241 FALSE);
4242 register int op = *scode;
4244 /* Capturing brackets */
4246 if (op > OP_BRA)
4248 int new_map;
4249 op -= OP_BRA;
4250 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
4251 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
4252 if (!is_startline(scode, new_map, backref_map)) return FALSE;
4255 /* Other brackets */
4257 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4258 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
4260 /* .* means "start at start or after \n" if it isn't in brackets that
4261 may be referenced. */
4263 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
4265 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4268 /* Check for explicit circumflex */
4270 else if (op != OP_CIRC) return FALSE;
4272 /* Move on to the next alternative */
4274 code += GET(code, 1);
4276 while (*code == OP_ALT); /* Loop for each alternative */
4277 return TRUE;
4282 /*************************************************
4283 * Check for asserted fixed first char *
4284 *************************************************/
4286 /* During compilation, the "first char" settings from forward assertions are
4287 discarded, because they can cause conflicts with actual literals that follow.
4288 However, if we end up without a first char setting for an unanchored pattern,
4289 it is worth scanning the regex to see if there is an initial asserted first
4290 char. If all branches start with the same asserted char, or with a bracket all
4291 of whose alternatives start with the same asserted char (recurse ad lib), then
4292 we return that char, otherwise -1.
4294 Arguments:
4295 code points to start of expression (the bracket)
4296 options pointer to the options (used to check casing changes)
4297 inassert TRUE if in an assertion
4299 Returns: -1 or the fixed first char
4302 static int
4303 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
4305 register int c = -1;
4306 do {
4307 int d;
4308 const uschar *scode =
4309 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
4310 register int op = *scode;
4312 if (op >= OP_BRA) op = OP_BRA;
4314 switch(op)
4316 default:
4317 return -1;
4319 case OP_BRA:
4320 case OP_ASSERT:
4321 case OP_ONCE:
4322 case OP_COND:
4323 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
4324 return -1;
4325 if (c < 0) c = d; else if (c != d) return -1;
4326 break;
4328 case OP_EXACT: /* Fall through */
4329 scode += 2;
4331 case OP_CHAR:
4332 case OP_CHARNC:
4333 case OP_PLUS:
4334 case OP_MINPLUS:
4335 if (!inassert) return -1;
4336 if (c < 0)
4338 c = scode[1];
4339 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
4341 else if (c != scode[1]) return -1;
4342 break;
4345 code += GET(code, 1);
4347 while (*code == OP_ALT);
4348 return c;
#ifdef SUPPORT_UTF8
/*************************************************
*           Validate a UTF-8 string              *
*************************************************/

/* This function is called (optionally) at the start of compile or match, to
confirm that a string claimed to be UTF-8 really is. Checking early lets the
rest of the code assume a well-formed string. The check can be switched off for
maximum performance, in which case the consequences of supplying an invalid
string are undefined.

Each multi-byte character is checked for a legal lead byte, for enough
remaining bytes, for 10xxxxxx continuation bytes, and for overlong encodings.

Arguments:
  string       points to the string
  length       length of string, or -1 if the string is zero-terminated

Returns:       < 0    if the string is a valid UTF-8 string
               >= 0   otherwise; the value is the offset of the bad byte
*/

static int
valid_utf8(const uschar *string, int length)
{
const uschar *cur;

/* A negative length means the string is zero-terminated: measure it. */

if (length < 0)
  {
  cur = string;
  while (*cur != 0) cur++;
  length = cur - string;
  }

for (cur = string; length-- > 0; cur++)
  {
  int lead = *cur;
  int extra;              /* Number of additional bytes after the lead */
  int second;             /* Value of the second byte of the character */

  if (lead < 128) continue;                         /* Plain ASCII */
  if ((lead & 0xc0) != 0xc0) return cur - string;   /* Stray continuation */

  extra = utf8_table4[lead & 0x3f];
  if (length < extra) return cur - string;          /* Truncated character */
  length -= extra;

  /* The second byte must have 10 as its top two bits. */

  cur++;
  second = *cur;
  if ((second & 0xc0) != 0x80) return cur - string;

  /* Reject overlong encodings; the test depends on the character length. */

  if (extra == 1)
    {
    /* Lead byte xx00 000x */
    if ((lead & 0x3e) == 0) return cur - string;
    continue;             /* Two-byte character: nothing more to check */
    }
  else if (extra == 2)
    {
    /* 1110 0000, xx0x xxxx */
    if (lead == 0xe0 && (second & 0x20) == 0) return cur - string;
    }
  else if (extra == 3)
    {
    /* 1111 0000, xx00 xxxx */
    if (lead == 0xf0 && (second & 0x30) == 0) return cur - string;
    }
  else if (extra == 4)
    {
    /* 1111 1000, xx00 0xxx */
    if (lead == 0xf8 && (second & 0x38) == 0) return cur - string;
    }
  else if (extra == 5)
    {
    /* Leading 0xfe or 0xff is never valid; then 1111 1100, xx00 00xx */
    if (lead == 0xfe || lead == 0xff ||
       (lead == 0xfc && (second & 0x3c) == 0)) return cur - string;
    }

  /* Any bytes after the second must also start with 10. */

  while (--extra > 0)
    {
    cur++;
    if ((*cur & 0xc0) != 0x80) return cur - string;
    }
  }

return -1;
}
#endif
4440 /*************************************************
4441 * Compile a Regular Expression *
4442 *************************************************/
4444 /* This function takes a string and returns a pointer to a block of store
4445 holding a compiled version of the expression.
4447 Arguments:
4448 pattern the regular expression
4449 options various option bits
4450 errorptr pointer to pointer to error text
4451 erroroffset ptr offset in pattern where error was detected
4452 tables pointer to character tables or NULL
4454 Returns: pointer to compiled data block, or NULL on error,
4455 with errorptr and erroroffset set
4458 EXPORT pcre *
4459 pcre_compile(const char *pattern, int options, const char **errorptr,
4460 int *erroroffset, const unsigned char *tables)
4462 real_pcre *re;
4463 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
4464 /* int runlength; not used L.M. 2004-09-14 */
4465 int c, firstbyte, reqbyte;
4466 int bracount = 0;
4467 int branch_extra = 0;
4468 int branch_newextra;
4469 int item_count = -1;
4470 int name_count = 0;
4471 int max_name_size = 0;
4472 int lastitemlength = 0;
4473 #ifdef SUPPORT_UTF8
4474 BOOL utf8;
4475 BOOL class_utf8;
4476 #endif
4477 BOOL inescq = FALSE;
4478 unsigned int brastackptr = 0;
4479 size_t size;
4480 uschar *code;
4481 const uschar *codestart;
4482 const uschar *ptr;
4483 compile_data compile_block;
4484 int brastack[BRASTACK_SIZE];
4485 uschar bralenstack[BRASTACK_SIZE];
4487 /* We can't pass back an error message if errorptr is NULL; I guess the best we
4488 can do is just return NULL. */
4490 if (errorptr == NULL) return NULL;
4491 *errorptr = NULL;
4493 /* However, we can give a message for this error */
4495 if (erroroffset == NULL)
4497 *errorptr = ERR16;
4498 return NULL;
4500 *erroroffset = 0;
4502 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
4504 #ifdef SUPPORT_UTF8
4505 utf8 = (options & PCRE_UTF8) != 0;
4506 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
4507 (*erroroffset = valid_utf8((uschar *)pattern, -1)) >= 0)
4509 *errorptr = ERR44;
4510 return NULL;
4512 #else
4513 if ((options & PCRE_UTF8) != 0)
4515 *errorptr = ERR32;
4516 return NULL;
4518 #endif
4520 if ((options & ~PUBLIC_OPTIONS) != 0)
4522 *errorptr = ERR17;
4523 return NULL;
4526 /* Set up pointers to the individual character tables */
4528 if (tables == NULL) tables = pcre_default_tables;
4529 compile_block.lcc = tables + lcc_offset;
4530 compile_block.fcc = tables + fcc_offset;
4531 compile_block.cbits = tables + cbits_offset;
4532 compile_block.ctypes = tables + ctypes_offset;
4534 /* Maximum back reference and backref bitmap. This is updated for numeric
4535 references during the first pass, but for named references during the actual
4536 compile pass. The bitmap records up to 31 back references to help in deciding
4537 whether (.*) can be treated as anchored or not. */
4539 compile_block.top_backref = 0;
4540 compile_block.backref_map = 0;
4542 /* Reflect pattern for debugging output */
4544 DPRINTF(("------------------------------------------------------------------\n"));
4545 DPRINTF(("%s\n", pattern));
4547 /* The first thing to do is to make a pass over the pattern to compute the
4548 amount of store required to hold the compiled code. This does not have to be
4549 perfect as long as errors are overestimates. At the same time we can detect any
4550 flag settings right at the start, and extract them. Make an attempt to correct
4551 for any counted white space if an "extended" flag setting appears late in the
4552 pattern. We can't be so clever for #-comments. */
4554 ptr = (const uschar *)(pattern - 1);
4555 while ((c = *(++ptr)) != 0)
4557 int min, max;
4558 int class_optcount;
4559 int bracket_length;
4560 int duplength;
4562 /* If we are inside a \Q...\E sequence, all chars are literal */
4564 if (inescq)
4566 if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
4567 goto NORMAL_CHAR;
4570 /* Otherwise, first check for ignored whitespace and comments */
4572 if ((options & PCRE_EXTENDED) != 0)
4574 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
4575 if (c == '#')
4577 /* The space before the ; is to avoid a warning on a silly compiler
4578 on the Macintosh. */
4579 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
4580 if (c == 0) break;
4581 continue;
4585 item_count++; /* Is zero for the first non-comment item */
4587 /* Allow space for auto callout before every item except quantifiers. */
4589 if ((options & PCRE_AUTO_CALLOUT) != 0 &&
4590 c != '*' && c != '+' && c != '?' &&
4591 (c != '{' || !is_counted_repeat(ptr + 1)))
4592 length += 2 + 2*LINK_SIZE;
4594 switch(c)
4596 /* A backslashed item may be an escaped data character or it may be a
4597 character type. */
4599 case '\\':
4600 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
4601 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4603 lastitemlength = 1; /* Default length of last item for repeats */
4605 if (c >= 0) /* Data character */
4607 length += 2; /* For a one-byte character */
4609 #ifdef SUPPORT_UTF8
4610 if (utf8 && c > 127)
4612 int i;
4613 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
4614 if (c <= utf8_table1[i]) break;
4615 length += i;
4616 lastitemlength += i;
4618 #endif
4620 continue;
4623 /* If \Q, enter "literal" mode */
4625 if (-c == ESC_Q)
4627 inescq = TRUE;
4628 continue;
4631 /* \X is supported only if Unicode property support is compiled */
4633 #ifndef SUPPORT_UCP
4634 if (-c == ESC_X)
4636 *errorptr = ERR45;
4637 goto PCRE_ERROR_RETURN;
4639 #endif
4641 /* \P and \p are for Unicode properties, but only when the support has
4642 been compiled. Each item needs 2 bytes. */
4644 else if (-c == ESC_P || -c == ESC_p)
4646 #ifdef SUPPORT_UCP
4647 BOOL negated;
4648 length += 2;
4649 lastitemlength = 2;
4650 if (get_ucp(&ptr, &negated, errorptr) < 0) goto PCRE_ERROR_RETURN;
4651 continue;
4652 #else
4653 *errorptr = ERR45;
4654 goto PCRE_ERROR_RETURN;
4655 #endif
4658 /* Other escapes need one byte */
4660 length++;
4662 /* A back reference needs an additional 2 bytes, plus either one or 5
4663 bytes for a repeat. We also need to keep the value of the highest
4664 back reference. */
4666 if (c <= -ESC_REF)
4668 int refnum = -c - ESC_REF;
4669 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4670 if (refnum > compile_block.top_backref)
4671 compile_block.top_backref = refnum;
4672 length += 2; /* For single back reference */
4673 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4675 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
4676 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4677 if ((min == 0 && (max == 1 || max == -1)) ||
4678 (min == 1 && max == -1))
4679 length++;
4680 else length += 5;
4681 if (ptr[1] == '?') ptr++;
4684 continue;
4686 case '^': /* Single-byte metacharacters */
4687 case '.':
4688 case '$':
4689 length++;
4690 lastitemlength = 1;
4691 continue;
4693 case '*': /* These repeats won't be after brackets; */
4694 case '+': /* those are handled separately */
4695 case '?':
4696 length++;
4697 goto POSESSIVE; /* A few lines below */
4699 /* This covers the cases of braced repeats after a single char, metachar,
4700 class, or back reference. */
4702 case '{':
4703 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4704 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
4705 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4707 /* These special cases just insert one extra opcode */
4709 if ((min == 0 && (max == 1 || max == -1)) ||
4710 (min == 1 && max == -1))
4711 length++;
4713 /* These cases might insert additional copies of a preceding character. */
4715 else
4717 if (min != 1)
4719 length -= lastitemlength; /* Uncount the original char or metachar */
4720 if (min > 0) length += 3 + lastitemlength;
4722 length += lastitemlength + ((max > 0)? 3 : 1);
4725 if (ptr[1] == '?') ptr++; /* Needs no extra length */
4727 POSESSIVE: /* Test for possessive quantifier */
4728 if (ptr[1] == '+')
4730 ptr++;
4731 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4733 continue;
4735 /* An alternation contains an offset to the next branch or ket. If any ims
4736 options changed in the previous branch(es), and/or if we are in a
4737 lookbehind assertion, extra space will be needed at the start of the
4738 branch. This is handled by branch_extra. */
4740 case '|':
4741 length += 1 + LINK_SIZE + branch_extra;
4742 continue;
4744 /* A character class uses 33 characters provided that all the character
4745 values are less than 256. Otherwise, it uses a bit map for low valued
4746 characters, and individual items for others. Don't worry about character
4747 types that aren't allowed in classes - they'll get picked up during the
4748 compile. A character class that contains only one single-byte character
4749 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4750 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4752 case '[':
4753 if (*(++ptr) == '^')
4755 class_optcount = 10; /* Greater than one */
4756 ptr++;
4758 else class_optcount = 0;
4760 #ifdef SUPPORT_UTF8
4761 class_utf8 = FALSE;
4762 #endif
4764 /* Written as a "do" so that an initial ']' is taken as data */
4766 if (*ptr != 0) do
4768 /* Inside \Q...\E everything is literal except \E */
4770 if (inescq)
4772 if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4773 inescq = FALSE;
4774 ptr += 1;
4775 continue;
4778 /* Outside \Q...\E, check for escapes */
4780 if (*ptr == '\\')
4782 c = check_escape(&ptr, errorptr, bracount, options, TRUE);
4783 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4785 /* \b is backspace inside a class; \X is literal */
4787 if (-c == ESC_b) c = '\b';
4788 else if (-c == ESC_X) c = 'X';
4790 /* \Q enters quoting mode */
4792 else if (-c == ESC_Q)
4794 inescq = TRUE;
4795 continue;
4798 /* Handle escapes that turn into characters */
4800 if (c >= 0) goto NON_SPECIAL_CHARACTER;
4802 /* Escapes that are meta-things. The normal ones just affect the
4803 bit map, but Unicode properties require an XCLASS extended item. */
4805 else
4807 class_optcount = 10; /* \d, \s etc; make sure > 1 */
4808 #ifdef SUPPORT_UTF8
4809 if (-c == ESC_p || -c == ESC_P)
4811 if (!class_utf8)
4813 class_utf8 = TRUE;
4814 length += LINK_SIZE + 2;
4816 length += 2;
4818 #endif
4822 /* Check the syntax for POSIX stuff. The bits we actually handle are
4823 checked during the real compile phase. */
4825 else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4827 ptr++;
4828 class_optcount = 10; /* Make sure > 1 */
4831 /* Anything else increments the possible optimization count. We have to
4832 detect ranges here so that we can compute the number of extra ranges for
4833 caseless wide characters when UCP support is available. If there are wide
4834 characters, we are going to have to use an XCLASS, even for single
4835 characters. */
4837 else
4839 int d;
4841 GET_ONE_CHARACTER:
4843 #ifdef SUPPORT_UTF8
4844 if (utf8)
4846 int extra = 0;
4847 GETCHARLEN(c, ptr, extra);
4848 ptr += extra;
4850 else c = *ptr;
4851 #else
4852 c = *ptr;
4853 #endif
4855 /* Come here from handling \ above when it escapes to a char value */
4857 NON_SPECIAL_CHARACTER:
4858 class_optcount++;
4860 d = -1;
4861 if (ptr[1] == '-')
4863 uschar const *hyptr = ptr++;
4864 if (ptr[1] == '\\')
4866 ptr++;
4867 d = check_escape(&ptr, errorptr, bracount, options, TRUE);
4868 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4869 if (-d == ESC_b) d = '\b'; /* backspace */
4870 else if (-d == ESC_X) d = 'X'; /* literal X in a class */
4872 else if (ptr[1] != 0 && ptr[1] != ']')
4874 ptr++;
4875 #ifdef SUPPORT_UTF8
4876 if (utf8)
4878 int extra = 0;
4879 GETCHARLEN(d, ptr, extra);
4880 ptr += extra;
4882 else
4883 #endif
4884 d = *ptr;
4886 if (d < 0) ptr = hyptr; /* go back to hyphen as data */
4889 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4890 127 for caseless matching, we will need to use an XCLASS. */
4892 if (d >= 0)
4894 class_optcount = 10; /* Ensure > 1 */
4895 if (d < c)
4897 *errorptr = ERR8;
4898 goto PCRE_ERROR_RETURN;
4901 #ifdef SUPPORT_UTF8
4902 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4904 uschar buffer[6];
4905 if (!class_utf8) /* Allow for XCLASS overhead */
4907 class_utf8 = TRUE;
4908 length += LINK_SIZE + 2;
4911 #ifdef SUPPORT_UCP
4912 /* If we have UCP support, find out how many extra ranges are
4913 needed to map the other case of characters within this range. We
4914 have to mimic the range optimization here, because extending the
4915 range upwards might push d over a boundary that makes it use
4916 another byte in the UTF-8 representation. */
4918 if ((options & PCRE_CASELESS) != 0)
4920 int occ, ocd;
4921 int cc = c;
4922 int origd = d;
4923 while (get_othercase_range(&cc, origd, &occ, &ocd))
4925 if (occ >= c && ocd <= d) continue; /* Skip embedded */
4927 if (occ < c && ocd >= c - 1) /* Extend the basic range */
4928 { /* if there is overlap, */
4929 c = occ; /* noting that if occ < c */
4930 continue; /* we can't have ocd > d */
4931 } /* because a subrange is */
4932 if (ocd > d && occ <= d + 1) /* always shorter than */
4933 { /* the basic range. */
4934 d = ocd;
4935 continue;
4938 /* An extra item is needed */
4940 length += 1 + ord2utf8(occ, buffer) +
4941 ((occ == ocd)? 0 : ord2utf8(ocd, buffer));
4944 #endif /* SUPPORT_UCP */
4946 /* The length of the (possibly extended) range */
4948 length += 1 + ord2utf8(c, buffer) + ord2utf8(d, buffer);
4950 #endif /* SUPPORT_UTF8 */
4954 /* We have a single character. There is nothing to be done unless we
4955 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
4956 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4957 support. */
4959 else
4961 #ifdef SUPPORT_UTF8
4962 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4964 uschar buffer[6];
4965 class_optcount = 10; /* Ensure > 1 */
4966 if (!class_utf8) /* Allow for XCLASS overhead */
4968 class_utf8 = TRUE;
4969 length += LINK_SIZE + 2;
4971 #ifdef SUPPORT_UCP
4972 length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4973 (1 + ord2utf8(c, buffer));
4974 #else /* SUPPORT_UCP */
4975 length += 1 + ord2utf8(c, buffer);
4976 #endif /* SUPPORT_UCP */
4978 #endif /* SUPPORT_UTF8 */
4982 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4984 if (*ptr == 0) /* Missing terminating ']' */
4986 *errorptr = ERR6;
4987 goto PCRE_ERROR_RETURN;
4990 /* We can optimize when there was only one optimizable character. Repeats
4991 for positive and negated single one-byte chars are handled by the general
4992 code. Here, we handle repeats for the class opcodes. */
4994 if (class_optcount == 1) length += 3; else
4996 length += 33;
4998 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
4999 we also need extra for wrapping the whole thing in a sub-pattern. */
5001 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
5003 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
5004 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
5005 if ((min == 0 && (max == 1 || max == -1)) ||
5006 (min == 1 && max == -1))
5007 length++;
5008 else length += 5;
5009 if (ptr[1] == '+')
5011 ptr++;
5012 length += 2 + 2*LINK_SIZE;
5014 else if (ptr[1] == '?') ptr++;
5017 continue;
5019 /* Brackets may be genuine groups or special things */
5021 case '(':
5022 branch_newextra = 0;
5023 bracket_length = 1 + LINK_SIZE;
5025 /* Handle special forms of bracket, which all start (? */
5027 if (ptr[1] == '?')
5029 int set, unset;
5030 int *optset;
5032 switch (c = ptr[2])
5034 /* Skip over comments entirely */
5035 case '#':
5036 ptr += 3;
5037 while (*ptr != 0 && *ptr != ')') ptr++;
5038 if (*ptr == 0)
5040 *errorptr = ERR18;
5041 goto PCRE_ERROR_RETURN;
5043 continue;
5045 /* Non-referencing groups and lookaheads just move the pointer on, and
5046 then behave like a non-special bracket, except that they don't increment
5047 the count of extracting brackets. Ditto for the "once only" bracket,
5048 which is in Perl from version 5.005. */
5050 case ':':
5051 case '=':
5052 case '!':
5053 case '>':
5054 ptr += 2;
5055 break;
5057 /* (?R) specifies a recursive call to the regex, which is an extension
5058 to provide the facility which can be obtained by (?p{perl-code}) in
5059 Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
5061 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
5062 the appropriate numbered brackets. This includes both recursive and
5063 non-recursive calls. (?R) is now synonymous with (?0). */
5065 case 'R':
5066 ptr++;
5068 case '0': case '1': case '2': case '3': case '4':
5069 case '5': case '6': case '7': case '8': case '9':
5070 ptr += 2;
5071 if (c != 'R')
5072 while ((digitab[*(++ptr)] & ctype_digit) != 0);
5073 if (*ptr != ')')
5075 *errorptr = ERR29;
5076 goto PCRE_ERROR_RETURN;
5078 length += 1 + LINK_SIZE;
5080 /* If this item is quantified, it will get wrapped inside brackets so
5081 as to use the code for quantified brackets. We jump down and use the
5082 code that handles this for real brackets. */
5084 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
5086 length += 2 + 2 * LINK_SIZE; /* to make bracketed */
5087 duplength = 5 + 3 * LINK_SIZE;
5088 goto HANDLE_QUANTIFIED_BRACKETS;
5090 continue;
5092 /* (?C) is an extension which provides "callout" - to provide a bit of
5093 the functionality of the Perl (?{...}) feature. An optional number may
5094 follow (default is zero). */
5096 case 'C':
5097 ptr += 2;
5098 while ((digitab[*(++ptr)] & ctype_digit) != 0);
5099 if (*ptr != ')')
5101 *errorptr = ERR39;
5102 goto PCRE_ERROR_RETURN;
5104 length += 2 + 2*LINK_SIZE;
5105 continue;
5107 /* Named subpatterns are an extension copied from Python */
5109 case 'P':
5110 ptr += 3;
5111 if (*ptr == '<')
5113 const uschar *p; /* Don't amalgamate; some compilers */
5114 p = ++ptr; /* grumble at autoincrement in declaration */
5115 while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
5116 if (*ptr != '>')
5118 *errorptr = ERR42;
5119 goto PCRE_ERROR_RETURN;
5121 name_count++;
5122 if (ptr - p > max_name_size) max_name_size = (ptr - p);
5123 break;
5126 if (*ptr == '=' || *ptr == '>')
5128 while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
5129 if (*ptr != ')')
5131 *errorptr = ERR42;
5132 goto PCRE_ERROR_RETURN;
5134 break;
5137 /* Unknown character after (?P */
5139 *errorptr = ERR41;
5140 goto PCRE_ERROR_RETURN;
5142 /* Lookbehinds are in Perl from version 5.005 */
5144 case '<':
5145 ptr += 3;
5146 if (*ptr == '=' || *ptr == '!')
5148 branch_newextra = 1 + LINK_SIZE;
5149 length += 1 + LINK_SIZE; /* For the first branch */
5150 break;
5152 *errorptr = ERR24;
5153 goto PCRE_ERROR_RETURN;
5155 /* Conditionals are in Perl from version 5.005. The bracket must either
5156 be followed by a number (for bracket reference) or by an assertion
5157 group, or (a PCRE extension) by 'R' for a recursion test. */
5159 case '(':
5160 if (ptr[3] == 'R' && ptr[4] == ')')
5162 ptr += 4;
5163 length += 3;
5165 else if ((digitab[ptr[3]] & ctype_digit) != 0)
5167 ptr += 4;
5168 length += 3;
5169 while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
5170 if (*ptr != ')')
5172 *errorptr = ERR26;
5173 goto PCRE_ERROR_RETURN;
5176 else /* An assertion must follow */
5178 ptr++; /* Can treat like ':' as far as spacing is concerned */
5179 if (ptr[2] != '?' ||
5180 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
5182 ptr += 2; /* To get right offset in message */
5183 *errorptr = ERR28;
5184 goto PCRE_ERROR_RETURN;
5187 break;
5189 /* Else loop checking valid options until ) is met. Anything else is an
5190 error. If we are without any brackets, i.e. at top level, the settings
5191 act as if specified in the options, so massage the options immediately.
5192 This is for backward compatibility with Perl 5.004. */
5194 default:
5195 set = unset = 0;
5196 optset = &set;
5197 ptr += 2;
5199 for (;; ptr++)
5201 c = *ptr;
5202 switch (c)
5204 case 'i':
5205 *optset |= PCRE_CASELESS;
5206 continue;
5208 case 'm':
5209 *optset |= PCRE_MULTILINE;
5210 continue;
5212 case 's':
5213 *optset |= PCRE_DOTALL;
5214 continue;
5216 case 'x':
5217 *optset |= PCRE_EXTENDED;
5218 continue;
5220 case 'X':
5221 *optset |= PCRE_EXTRA;
5222 continue;
5224 case 'U':
5225 *optset |= PCRE_UNGREEDY;
5226 continue;
5228 case '-':
5229 optset = &unset;
5230 continue;
5232 /* A termination by ')' indicates an options-setting-only item; if
5233 this is at the very start of the pattern (indicated by item_count
5234 being zero), we use it to set the global options. This is helpful
5235 when analyzing the pattern for first characters, etc. Otherwise
5236 nothing is done here and it is handled during the compiling
5237 process.
5239 [Historical note: Up to Perl 5.8, options settings at top level
5240 were always global settings, wherever they appeared in the pattern.
5241 That is, they were equivalent to an external setting. From 5.8
5242 onwards, they apply only to what follows (which is what you might
5243 expect).] */
5245 case ')':
5246 if (item_count == 0)
5248 options = (options | set) & (~unset);
5249 set = unset = 0; /* To save length */
5250 item_count--; /* To allow for several */
5253 /* Fall through */
5255 /* A termination by ':' indicates the start of a nested group with
5256 the given options set. This is again handled at compile time, but
5257 we must allow for compiled space if any of the ims options are
5258 set. We also have to allow for resetting space at the end of
5259 the group, which is why 4 is added to the length and not just 2.
5260 If there are several changes of options within the same group, this
5261 will lead to an over-estimate on the length, but this shouldn't
5262 matter very much. We also have to allow for resetting options at
5263 the start of any alternations, which we do by setting
5264 branch_newextra to 2. Finally, we record whether the case-dependent
5265 flag ever changes within the regex. This is used by the "required
5266 character" code. */
5268 case ':':
5269 if (((set|unset) & PCRE_IMS) != 0)
5271 length += 4;
5272 branch_newextra = 2;
5273 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
5275 goto END_OPTIONS;
5277 /* Unrecognized option character */
5279 default:
5280 *errorptr = ERR12;
5281 goto PCRE_ERROR_RETURN;
5285 /* If we hit a closing bracket, that's it - this is a freestanding
5286 option-setting. We need to ensure that branch_extra is updated if
5287 necessary. The only values branch_newextra can have here are 0 or 2.
5288 If the value is 2, then branch_extra must either be 2 or 5, depending
5289 on whether this is a lookbehind group or not. */
5291 END_OPTIONS:
5292 if (c == ')')
5294 if (branch_newextra == 2 &&
5295 (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
5296 branch_extra += branch_newextra;
5297 continue;
5300 /* If options were terminated by ':' control comes here. Fall through
5301 to handle the group below. */
5305 /* Extracting brackets must be counted so we can process escapes in a
5306 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
5307 need an additional 3 bytes of store per extracting bracket. However, if
5308 PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
5309 must leave the count alone (it will always be zero). */
5311 else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
5313 bracount++;
5314 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
5317 /* Save length for computing whole length at end if there's a repeat that
5318 requires duplication of the group. Also save the current value of
5319 branch_extra, and start the new group with the new value. If non-zero, this
5320 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
5322 if (brastackptr >= sizeof(brastack)/sizeof(int))
5324 *errorptr = ERR19;
5325 goto PCRE_ERROR_RETURN;
5328 bralenstack[brastackptr] = branch_extra;
5329 branch_extra = branch_newextra;
5331 brastack[brastackptr++] = length;
5332 length += bracket_length;
5333 continue;
5335 /* Handle ket. Look for subsequent max/min; for certain sets of values we
5336 have to replicate this bracket up to that many times. If brastackptr is
5337 0 this is an unmatched bracket which will generate an error, but take care
5338 not to try to access brastack[-1] when computing the length and restoring
5339 the branch_extra value. */
5341 case ')':
5342 length += 1 + LINK_SIZE;
5343 if (brastackptr > 0)
5345 duplength = length - brastack[--brastackptr];
5346 branch_extra = bralenstack[brastackptr];
5348 else duplength = 0;
5350 /* The following code is also used when a recursion such as (?3) is
5351 followed by a quantifier, because in that case, it has to be wrapped inside
5352 brackets so that the quantifier works. The value of duplength must be
5353 set before arrival. */
5355 HANDLE_QUANTIFIED_BRACKETS:
5357 /* Leave ptr at the final char; for read_repeat_counts this happens
5358 automatically; for the others we need an increment. */
5360 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
5362 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
5363 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
5365 else if (c == '*') { min = 0; max = -1; ptr++; }
5366 else if (c == '+') { min = 1; max = -1; ptr++; }
5367 else if (c == '?') { min = 0; max = 1; ptr++; }
5368 else { min = 1; max = 1; }
5370 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
5371 group, and if the maximum is greater than zero, we have to replicate
5372 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
5373 bracket set. */
5375 if (min == 0)
5377 length++;
5378 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
5381 /* When the minimum is greater than zero, we have to replicate up to
5382 minval-1 times, with no additions required in the copies. Then, if there
5383 is a limited maximum we have to replicate up to maxval-1 times allowing
5384 for a BRAZERO item before each optional copy and nesting brackets for all
5385 but one of the optional copies. */
5387 else
5389 length += (min - 1) * duplength;
5390 if (max > min) /* Need this test as max=-1 means no limit */
5391 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
5392 - (2 + 2*LINK_SIZE);
5395 /* Allow space for once brackets for "possessive quantifier" */
5397 if (ptr[1] == '+')
5399 ptr++;
5400 length += 2 + 2*LINK_SIZE;
5402 continue;
5404 /* Non-special character. It won't be space or # in extended mode, so it is
5405 always a genuine character. If we are in a \Q...\E sequence, check for the
5406 end; if not, we have a literal. */
5408 default:
5409 NORMAL_CHAR:
5411 if (inescq && c == '\\' && ptr[1] == 'E')
5413 inescq = FALSE;
5414 ptr++;
5415 continue;
5418 length += 2; /* For a one-byte character */
5419 lastitemlength = 1; /* Default length of last item for repeats */
5421 /* In UTF-8 mode, check for additional bytes. */
5423 #ifdef SUPPORT_UTF8
5424 if (utf8 && (c & 0xc0) == 0xc0)
5426 while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
5427 { /* because the end is marked */
5428 lastitemlength++; /* by a zero byte. */
5429 length++;
5430 ptr++;
5433 #endif
5435 continue;
5439 length += 2 + LINK_SIZE; /* For final KET and END */
5441 if ((options & PCRE_AUTO_CALLOUT) != 0)
5442 length += 2 + 2*LINK_SIZE; /* For final callout */
5444 if (length > MAX_PATTERN_SIZE)
5446 *errorptr = ERR20;
5447 return NULL;
5450 /* Compute the size of data block needed and get it, either from malloc or
5451 externally provided function. */
5453 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
5454 re = (real_pcre *)(pcre_malloc)(size);
5456 if (re == NULL)
5458 *errorptr = ERR21;
5459 return NULL;
5462 /* Put in the magic number, and save the sizes, options, and character table
5463 pointer. NULL is used for the default character tables. The nullpad field is at
5464 the end; it's there to help in the case when a regex compiled on a system with
5465 4-byte pointers is run on another with 8-byte pointers. */
5467 re->magic_number = MAGIC_NUMBER;
5468 re->size = size;
5469 re->options = options;
5470 re->dummy1 = re->dummy2 = 0;
5471 re->name_table_offset = sizeof(real_pcre);
5472 re->name_entry_size = max_name_size + 3;
5473 re->name_count = name_count;
5474 re->tables = (tables == pcre_default_tables)? NULL : tables;
5475 re->nullpad = NULL;
5477 /* The starting points of the name/number translation table and of the code are
5478 passed around in the compile data block. */
5480 compile_block.names_found = 0;
5481 compile_block.name_entry_size = max_name_size + 3;
5482 compile_block.name_table = (uschar *)re + re->name_table_offset;
5483 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
5484 compile_block.start_code = codestart;
5485 compile_block.start_pattern = (const uschar *)pattern;
5486 compile_block.req_varyopt = 0;
5487 compile_block.nopartial = FALSE;
5489 /* Set up a starting, non-extracting bracket, then compile the expression. On
5490 error, *errorptr will be set non-NULL, so we don't need to look at the result
5491 of the function here. */
5493 ptr = (const uschar *)pattern;
5494 code = (uschar *)codestart;
5495 *code = OP_BRA;
5496 bracount = 0;
5497 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
5498 errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
5499 re->top_bracket = bracount;
5500 re->top_backref = compile_block.top_backref;
5502 if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
5504 /* If not reached end of pattern on success, there's an excess bracket. */
5506 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
5508 /* Fill in the terminating state and check for disastrous overflow, but
5509 if debugging, leave the test till after things are printed out. */
5511 *code++ = OP_END;
5513 #ifndef DEBUG
5514 if (code - codestart > length) *errorptr = ERR23;
5515 #endif
5517 /* Give an error if there's back reference to a non-existent capturing
5518 subpattern. */
5520 if (re->top_backref > re->top_bracket) *errorptr = ERR15;
5522 /* Failed to compile, or error while post-processing */
5524 if (*errorptr != NULL)
5526 (pcre_free)(re);
5527 PCRE_ERROR_RETURN:
5528 *erroroffset = ptr - (const uschar *)pattern;
5529 return NULL;
5532 /* If the anchored option was not passed, set the flag if we can determine that
5533 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5534 as starting with .* when DOTALL is set).
5536 Otherwise, if we know what the first character has to be, save it, because that
5537 speeds up unanchored matches no end. If not, see if we can set the
5538 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5539 start with ^. and also when all branches start with .* for non-DOTALL matches.
5542 if ((options & PCRE_ANCHORED) == 0)
5544 int temp_options = options;
5545 if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
5546 re->options |= PCRE_ANCHORED;
5547 else
5549 if (firstbyte < 0)
5550 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5551 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5553 int ch = firstbyte & 255;
5554 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5555 compile_block.fcc[ch] == ch)? ch : firstbyte;
5556 re->options |= PCRE_FIRSTSET;
5558 else if (is_startline(codestart, 0, compile_block.backref_map))
5559 re->options |= PCRE_STARTLINE;
5563 /* For an anchored pattern, we use the "required byte" only if it follows a
5564 variable length item in the regex. Remove the caseless flag for non-caseable
5565 bytes. */
5567 if (reqbyte >= 0 &&
5568 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5570 int ch = reqbyte & 255;
5571 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5572 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5573 re->options |= PCRE_REQCHSET;
5576 /* Print out the compiled data for debugging */
5578 #ifdef DEBUG
5580 printf("Length = %d top_bracket = %d top_backref = %d\n",
5581 length, re->top_bracket, re->top_backref);
5583 if (re->options != 0)
5585 printf("%s%s%s%s%s%s%s%s%s%s\n",
5586 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5587 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5588 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5589 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
5590 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5591 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5592 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5593 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5594 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5595 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5598 if ((re->options & PCRE_FIRSTSET) != 0)
5600 int ch = re->first_byte & 255;
5601 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5602 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5603 else printf("First char = \\x%02x%s\n", ch, caseless);
5606 if ((re->options & PCRE_REQCHSET) != 0)
5608 int ch = re->req_byte & 255;
5609 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5610 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5611 else printf("Req char = \\x%02x%s\n", ch, caseless);
5614 print_internals(re, stdout);
5616 /* This check is done here in the debugging case so that the code that
5617 was compiled can be seen. */
5619 if (code - codestart > length)
5621 *errorptr = ERR23;
5622 (pcre_free)(re);
5623 *erroroffset = ptr - (uschar *)pattern;
5624 return NULL;
5626 #endif
5628 return (pcre *)re;
5633 /*************************************************
5634 * Match a back-reference *
5635 *************************************************/
5637 /* If a back reference hasn't been set, the length that is passed is greater
5638 than the number of characters left in the string, so the match fails.
5640 Arguments:
5641 offset index into the offset vector
5642 eptr points into the subject
5643 length length to be matched
5644 md points to match data block
5645 ims the ims flags
5647 Returns: TRUE if matched
5650 static BOOL
5651 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
5652 unsigned long int ims)
5654 const uschar *p = md->start_subject + md->offset_vector[offset];
5656 #ifdef DEBUG
5657 if (eptr >= md->end_subject)
5658 printf("matching subject <null>");
5659 else
5661 printf("matching subject ");
5662 pchars(eptr, length, TRUE, md);
5664 printf(" against backref ");
5665 pchars(p, length, FALSE, md);
5666 printf("\n");
5667 #endif
5669 /* Always fail if not enough characters left */
5671 if (length > md->end_subject - eptr) return FALSE;
5673 /* Separate the caselesss case for speed */
5675 if ((ims & PCRE_CASELESS) != 0)
5677 while (length-- > 0)
5678 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
5680 else
5681 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
5683 return TRUE;
#ifdef SUPPORT_UTF8
/*************************************************
*       Match character against an XCLASS        *
*************************************************/

/* This function is called from within the XCLASS code below, to match a
character against an extended class which might match values > 255. The class
data starts with a flag byte, optionally followed by a 32-byte bit map for
values below 256, and then a list of items terminated by XCL_END.

Arguments:
  c           the character
  data        points to the flag byte of the XCLASS data

Returns:      TRUE if character matches, else FALSE
*/

static BOOL
match_xclass(int c, const uschar *data)
{
int item;
int flagbyte = *data++;                 /* data now points past the flags */
BOOL has_map = (flagbyte & XCL_MAP) != 0;
BOOL negated = (flagbyte & XCL_NOT) != 0;

/* Character values < 256 are looked up in the bit map when there is one. A
miss here is not final, because the additional data may contain ranges that
start below 256. */

if (c < 256 && has_map && (data[c/8] & (1 << (c&7))) != 0)
  return !negated;   /* char found */

/* Step over the bit map, if any, to reach the item list. Each item is a
single character, a range, or (only when UCP support is compiled) a Unicode
property test. */

if (has_map) data += 32;

for (item = *data++; item != XCL_END; item = *data++)
  {
  int first, last;

  if (item == XCL_SINGLE)
    {
    GETCHARINC(first, data);
    if (c == first) return !negated;
    continue;
    }

  if (item == XCL_RANGE)
    {
    GETCHARINC(first, data);
    GETCHARINC(last, data);
    if (first <= c && c <= last) return !negated;
    continue;
    }

#ifdef SUPPORT_UCP
  /* XCL_PROP & XCL_NOTPROP: a required-type value of 128 or more is tested
  against the character's category (after subtracting 128); a smaller value is
  tested against its character type. */

    {
    int chartype, othercase;
    int rqdtype = *data++;
    int category = ucp_findchar(c, &chartype, &othercase);
    BOOL has_prop = (rqdtype >= 128)?
      (rqdtype - 128 == category) : (rqdtype == chartype);
    if (has_prop == (item == XCL_PROP)) return !negated;
    }
#endif  /* SUPPORT_UCP */
  }

return negated;   /* char did not match */
}
#endif
5762 /***************************************************************************
5763 ****************************************************************************
5764 RECURSION IN THE match() FUNCTION
5766 The match() function is highly recursive. Some regular expressions can cause
5767 it to recurse thousands of times. I was writing for Unix, so I just let it
5768 call itself recursively. This uses the stack for saving everything that has
5769 to be saved for a recursive call. On Unix, the stack can be large, and this
5770 works fine.
5772 It turns out that on non-Unix systems there are problems with programs that
5773 use a lot of stack. (This despite the fact that every last chip has oodles
5774 of memory these days, and techniques for extending the stack have been known
5775 for decades.) So....
5777 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
5778 calls by keeping local variables that need to be preserved in blocks of memory
5779 obtained from malloc instead of on the stack. Macros are used to
5780 achieve this so that the actual code doesn't look very different to what it
5781 always used to.
5782 ****************************************************************************
5783 ***************************************************************************/
/* These versions of the macros use the stack, as normal */

#ifndef NO_RECURSE
#define REGISTER register
#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) rx = match(ra,rb,rc,rd,re,rf,rg)
#define RRETURN(ra) return ra
#else


/* These versions of the macros manage a private stack on the heap. Note
that the rd argument of RMATCH isn't actually used. It's the md argument of
match(), which never changes.

RMATCH obtains a new frame, records the return point in the current frame with
setjmp(), fills in the new frame's argument fields, and jumps to HEAP_RECURSE.
When the "callee" finishes, RRETURN longjmps back and the result is picked up
from the frame. If the new frame cannot be obtained, no jump takes place and
rx is set to PCRE_ERROR_NOMEMORY, which is what the caller would have seen had
a truly recursive call to match() failed straight away with that error. */

#define REGISTER

#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL)\
    {\
    rx = PCRE_ERROR_NOMEMORY;\
    }\
  else if (setjmp(frame->Xwhere) == 0)\
    {\
    newframe->Xeptr = ra;\
    newframe->Xecode = rb;\
    newframe->Xoffset_top = rc;\
    newframe->Xims = re;\
    newframe->Xeptrb = rf;\
    newframe->Xflags = rg;\
    newframe->Xprevframe = frame;\
    frame = newframe;\
    DPRINTF(("restarting from line %d\n", __LINE__));\
    goto HEAP_RECURSE;\
    }\
  else\
    {\
    DPRINTF(("longjumped back to line %d\n", __LINE__));\
    frame = md->thisframe;\
    rx = frame->Xresult;\
    }\
  }

/* RRETURN frees the current frame. If there is a previous frame, the result is
stored in it and control goes back to the setjmp() in the RMATCH that created
the freed frame; otherwise this is the top level and the function really
returns. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    frame->Xresult = ra;\
    md->thisframe = frame;\
    longjmp(frame->Xwhere, 1);\
    }\
  return ra;\
  }


/* Structure for remembering the local variables in a private frame */

typedef struct heapframe {
  struct heapframe *Xprevframe;    /* Calling frame; NULL marks the top level */

  /* Function arguments that may change */

  const uschar *Xeptr;
  const uschar *Xecode;
  int Xoffset_top;
  unsigned long int Xims;          /* Same type as the ims argument of match() */
  eptrblock *Xeptrb;
  int Xflags;

  /* Function local variables */

  const uschar *Xcallpat;
  const uschar *Xcharptr;
  const uschar *Xdata;
  const uschar *Xnext;
  const uschar *Xpp;
  const uschar *Xprev;
  const uschar *Xsaved_eptr;

  recursion_info Xnew_recursive;

  BOOL Xcur_is_word;
  BOOL Xcondition;
  BOOL Xminimize;
  BOOL Xprev_is_word;

  unsigned long int Xoriginal_ims;

#ifdef SUPPORT_UCP
  int Xprop_type;
  int Xprop_fail_result;
  int Xprop_category;
  int Xprop_chartype;
  int Xprop_othercase;
  int Xprop_test_against;
  int *Xprop_test_variable;
#endif

  int Xctype;
  int Xfc;
  int Xfi;
  int Xlength;
  int Xmax;
  int Xmin;
  int Xnumber;
  int Xoffset;
  int Xop;
  int Xsave_capture_last;
  int Xsave_offset1, Xsave_offset2, Xsave_offset3;
  int Xstacksave[REC_STACK_SAVE_MAX];

  eptrblock Xnewptrb;

  /* Place to pass back result, and where to jump back to */

  int Xresult;
  jmp_buf Xwhere;

} heapframe;

#endif
5908 /***************************************************************************
5909 ***************************************************************************/
5913 /*************************************************
5914 * Match from current position *
5915 *************************************************/
5917 /* On entry ecode points to the first opcode, and eptr to the first character
5918 in the subject string, while eptrb holds the value of eptr at the start of the
5919 last bracketed group - used for breaking infinite loops matching zero-length
5920 strings. This function is called recursively in many circumstances. Whenever it
5921 returns a negative (error) response, the outer incarnation must also return the
5922 same response.
5924 Performance note: It might be tempting to extract commonly used fields from the
5925 md structure (e.g. utf8, end_subject) into individual variables to improve
5926 performance. Tests using gcc on a SPARC disproved this; in the first case, it
5927 made performance worse.
5929 Arguments:
5930 eptr pointer in subject
5931 ecode position in code
5932 offset_top current top pointer
5933 md pointer to "static" info for the match
5934 ims current /i, /m, and /s options
5935 eptrb pointer to chain of blocks containing eptr at start of
5936 brackets - for testing for empty matches
5937 flags can contain
5938 match_condassert - this is an assertion condition
5939 match_isgroup - this is the start of a bracketed group
5941 Returns: MATCH_MATCH if matched ) these values are >= 0
5942 MATCH_NOMATCH if failed to match )
5943 a negative PCRE_ERROR_xxx value if aborted by an error condition
5944 (e.g. stopped by recursion limit)
5947 static int
5948 match(REGISTER const uschar *eptr, REGISTER const uschar *ecode,
5949 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
5950 int flags)
5952 /* These variables do not need to be preserved over recursion in this function,
5953 so they can be ordinary variables in all cases. Mark them with "register"
5954 because they are used a lot in loops. */
5956 register int rrc; /* Returns from recursive calls */
5957 register int i; /* Used for loops not involving calls to RMATCH() */
5958 register int c; /* Character values not kept over RMATCH() calls */
5960 /* When recursion is not being used, all "local" variables that have to be
5961 preserved over calls to RMATCH() are part of a "frame" which is obtained from
5962 heap storage. Set up the top-level frame here; others are obtained from the
5963 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
5965 #ifdef NO_RECURSE
5966 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
5967 frame->Xprevframe = NULL; /* Marks the top level */
5969 /* Copy in the original argument variables */
5971 frame->Xeptr = eptr;
5972 frame->Xecode = ecode;
5973 frame->Xoffset_top = offset_top;
5974 frame->Xims = ims;
5975 frame->Xeptrb = eptrb;
5976 frame->Xflags = flags;
5978 /* This is where control jumps back to to effect "recursion" */
5980 HEAP_RECURSE:
5982 /* Macros make the argument variables come from the current frame */
5984 #define eptr frame->Xeptr
5985 #define ecode frame->Xecode
5986 #define offset_top frame->Xoffset_top
5987 #define ims frame->Xims
5988 #define eptrb frame->Xeptrb
5989 #define flags frame->Xflags
5991 /* Ditto for the local variables */
5993 #ifdef SUPPORT_UTF8
5994 #define charptr frame->Xcharptr
5995 #endif
5996 #define callpat frame->Xcallpat
5997 #define data frame->Xdata
5998 #define next frame->Xnext
5999 #define pp frame->Xpp
6000 #define prev frame->Xprev
6001 #define saved_eptr frame->Xsaved_eptr
6003 #define new_recursive frame->Xnew_recursive
6005 #define cur_is_word frame->Xcur_is_word
6006 #define condition frame->Xcondition
6007 #define minimize frame->Xminimize
6008 #define prev_is_word frame->Xprev_is_word
6010 #define original_ims frame->Xoriginal_ims
6012 #ifdef SUPPORT_UCP
6013 #define prop_type frame->Xprop_type
6014 #define prop_fail_result frame->Xprop_fail_result
6015 #define prop_category frame->Xprop_category
6016 #define prop_chartype frame->Xprop_chartype
6017 #define prop_othercase frame->Xprop_othercase
6018 #define prop_test_against frame->Xprop_test_against
6019 #define prop_test_variable frame->Xprop_test_variable
6020 #endif
6022 #define ctype frame->Xctype
6023 #define fc frame->Xfc
6024 #define fi frame->Xfi
6025 #define length frame->Xlength
6026 #define max frame->Xmax
6027 #define min frame->Xmin
6028 #define number frame->Xnumber
6029 #define offset frame->Xoffset
6030 #define op frame->Xop
6031 #define save_capture_last frame->Xsave_capture_last
6032 #define save_offset1 frame->Xsave_offset1
6033 #define save_offset2 frame->Xsave_offset2
6034 #define save_offset3 frame->Xsave_offset3
6035 #define stacksave frame->Xstacksave
6037 #define newptrb frame->Xnewptrb
6039 /* When recursion is being used, local variables are allocated on the stack and
6040 get preserved during recursion in the normal way. In this environment, fi and
6041 i, and fc and c, can be the same variables. */
6043 #else
6044 #define fi i
6045 #define fc c
6048 #ifdef SUPPORT_UTF8 /* Many of these variables are used ony */
6049 const uschar *charptr; /* small blocks of the code. My normal */
6050 #endif /* style of coding would have declared */
6051 const uschar *callpat; /* them within each of those blocks. */
6052 const uschar *data; /* However, in order to accommodate the */
6053 const uschar *next; /* version of this code that uses an */
6054 const uschar *pp; /* external "stack" implemented on the */
6055 const uschar *prev; /* heap, it is easier to declare them */
6056 const uschar *saved_eptr; /* all here, so the declarations can */
6057 /* be cut out in a block. The only */
6058 recursion_info new_recursive; /* declarations within blocks below are */
6059 /* for variables that do not have to */
6060 BOOL cur_is_word; /* be preserved over a recursive call */
6061 BOOL condition; /* to RMATCH(). */
6062 BOOL minimize;
6063 BOOL prev_is_word;
6065 unsigned long int original_ims;
6067 #ifdef SUPPORT_UCP
6068 int prop_type;
6069 int prop_fail_result;
6070 int prop_category;
6071 int prop_chartype;
6072 int prop_othercase;
6073 int prop_test_against;
6074 int *prop_test_variable;
6075 #endif
6077 int ctype;
6078 int length;
6079 int max;
6080 int min;
6081 int number;
6082 int offset;
6083 int op;
6084 int save_capture_last;
6085 int save_offset1, save_offset2, save_offset3;
6086 int stacksave[REC_STACK_SAVE_MAX];
6088 eptrblock newptrb;
6089 #endif
6091 /* These statements are here to stop the compiler complaining about uninitialized
6092 variables. */
6094 #ifdef SUPPORT_UCP
6095 prop_fail_result = 0;
6096 prop_test_against = 0;
6097 prop_test_variable = NULL;
6098 #endif
6100 /* OK, now we can get on with the real code of the function. Recursion is
6101 specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
6102 these just turn into a recursive call to match() and a "return", respectively.
6103 However, RMATCH isn't like a function call because it's quite a complicated
6104 macro. It has to be used in one particular way. This shouldn't, however, impact
6105 performance when true recursion is being used. */
6107 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
6109 original_ims = ims; /* Save for resetting on ')' */
6111 /* At the start of a bracketed group, add the current subject pointer to the
6112 stack of such pointers, to be re-instated at the end of the group when we hit
6113 the closing ket. When match() is called in other circumstances, we don't add to
6114 this stack. */
6116 if ((flags & match_isgroup) != 0)
6118 newptrb.epb_prev = eptrb;
6119 newptrb.epb_saved_eptr = eptr;
6120 eptrb = &newptrb;
6123 /* Now start processing the operations. */
6125 for (;;)
6127 op = *ecode;
6128 minimize = FALSE;
6130 /* For partial matching, remember if we ever hit the end of the subject after
6131 matching at least one subject character. */
6133 if (md->partial &&
6134 eptr >= md->end_subject &&
6135 eptr > md->start_match)
6136 md->hitend = TRUE;
6138 /* Opening capturing bracket. If there is space in the offset vector, save
6139 the current subject position in the working slot at the top of the vector. We
6140 mustn't change the current values of the data slot, because they may be set
6141 from a previous iteration of this group, and be referred to by a reference
6142 inside the group.
6144 If the bracket fails to match, we need to restore this value and also the
6145 values of the final offsets, in case they were set by a previous iteration of
6146 the same bracket.
6148 If there isn't enough space in the offset vector, treat this as if it were a
6149 non-capturing bracket. Don't worry about setting the flag for the error case
6150 here; that is handled in the code for KET. */
6152 if (op > OP_BRA)
6154 number = op - OP_BRA;
6156 /* For extended extraction brackets (large number), we have to fish out the
6157 number from a dummy opcode at the start. */
6159 if (number > EXTRACT_BASIC_MAX)
6160 number = GET2(ecode, 2+LINK_SIZE);
6161 offset = number << 1;
6163 #ifdef DEBUG
6164 printf("start bracket %d subject=", number);
6165 pchars(eptr, 16, TRUE, md);
6166 printf("\n");
6167 #endif
6169 if (offset < md->offset_max)
6171 save_offset1 = md->offset_vector[offset];
6172 save_offset2 = md->offset_vector[offset+1];
6173 save_offset3 = md->offset_vector[md->offset_end - number];
6174 save_capture_last = md->capture_last;
6176 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
6177 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
6181 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6182 match_isgroup);
6183 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6184 md->capture_last = save_capture_last;
6185 ecode += GET(ecode, 1);
6187 while (*ecode == OP_ALT);
6189 DPRINTF(("bracket %d failed\n", number));
6191 md->offset_vector[offset] = save_offset1;
6192 md->offset_vector[offset+1] = save_offset2;
6193 md->offset_vector[md->offset_end - number] = save_offset3;
6195 RRETURN(MATCH_NOMATCH);
6198 /* Insufficient room for saving captured contents */
6200 else op = OP_BRA;
6203 /* Other types of node can be handled by a switch */
6205 switch(op)
6207 case OP_BRA: /* Non-capturing bracket: optimized */
6208 DPRINTF(("start bracket 0\n"));
6211 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6212 match_isgroup);
6213 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6214 ecode += GET(ecode, 1);
6216 while (*ecode == OP_ALT);
6217 DPRINTF(("bracket 0 failed\n"));
6218 RRETURN(MATCH_NOMATCH);
6220 /* Conditional group: compilation checked that there are no more than
6221 two branches. If the condition is false, skipping the first branch takes us
6222 past the end if there is only one branch, but that's OK because that is
6223 exactly what going to the ket would do. */
6225 case OP_COND:
6226 if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
6228 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
6229 condition = (offset == CREF_RECURSE * 2)?
6230 (md->recursive != NULL) :
6231 (offset < offset_top && md->offset_vector[offset] >= 0);
6232 RMATCH(rrc, eptr, ecode + (condition?
6233 (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
6234 offset_top, md, ims, eptrb, match_isgroup);
6235 RRETURN(rrc);
6238 /* The condition is an assertion. Call match() to evaluate it - setting
6239 the final argument TRUE causes it to stop at the end of an assertion. */
6241 else
6243 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6244 match_condassert | match_isgroup);
6245 if (rrc == MATCH_MATCH)
6247 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
6248 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
6250 else if (rrc != MATCH_NOMATCH)
6252 RRETURN(rrc); /* Need braces because of following else */
6254 else ecode += GET(ecode, 1);
6255 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6256 match_isgroup);
6257 RRETURN(rrc);
6259 /* Control never reaches here */
6261 /* Skip over conditional reference or large extraction number data if
6262 encountered. */
6264 case OP_CREF:
6265 case OP_BRANUMBER:
6266 ecode += 3;
6267 break;
6269 /* End of the pattern. If we are in a recursion, we should restore the
6270 offsets appropriately and continue from after the call. */
6272 case OP_END:
6273 if (md->recursive != NULL && md->recursive->group_num == 0)
6275 recursion_info *rec = md->recursive;
6276 DPRINTF(("Hit the end in a (?0) recursion\n"));
6277 md->recursive = rec->prevrec;
6278 memmove(md->offset_vector, rec->offset_save,
6279 rec->saved_max * sizeof(int));
6280 md->start_match = rec->save_start;
6281 ims = original_ims;
6282 ecode = rec->after_call;
6283 break;
6286 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
6287 string - backtracking will then try other alternatives, if any. */
6289 if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
6290 md->end_match_ptr = eptr; /* Record where we ended */
6291 md->end_offset_top = offset_top; /* and how many extracts were taken */
6292 RRETURN(MATCH_MATCH);
6294 /* Change option settings */
6296 case OP_OPT:
6297 ims = ecode[1];
6298 ecode += 2;
6299 DPRINTF(("ims set to %02lx\n", ims));
6300 break;
6302 /* Assertion brackets. Check the alternative branches in turn - the
6303 matching won't pass the KET for an assertion. If any one branch matches,
6304 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
6305 start of each branch to move the current point backwards, so the code at
6306 this level is identical to the lookahead case. */
6308 case OP_ASSERT:
6309 case OP_ASSERTBACK:
6312 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6313 match_isgroup);
6314 if (rrc == MATCH_MATCH) break;
6315 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6316 ecode += GET(ecode, 1);
6318 while (*ecode == OP_ALT);
6319 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
6321 /* If checking an assertion for a condition, return MATCH_MATCH. */
6323 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
6325 /* Continue from after the assertion, updating the offsets high water
6326 mark, since extracts may have been taken during the assertion. */
6328 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6329 ecode += 1 + LINK_SIZE;
6330 offset_top = md->end_offset_top;
6331 continue;
6333 /* Negative assertion: all branches must fail to match */
6335 case OP_ASSERT_NOT:
6336 case OP_ASSERTBACK_NOT:
6339 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6340 match_isgroup);
6341 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
6342 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6343 ecode += GET(ecode,1);
6345 while (*ecode == OP_ALT);
6347 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
6349 ecode += 1 + LINK_SIZE;
6350 continue;
6352 /* Move the subject pointer back. This occurs only at the start of
6353 each branch of a lookbehind assertion. If we are too close to the start to
6354 move back, this match function fails. When working with UTF-8 we move
6355 back a number of characters, not bytes. */
6357 case OP_REVERSE:
6358 #ifdef SUPPORT_UTF8
6359 if (md->utf8)
6361 c = GET(ecode,1);
6362 for (i = 0; i < c; i++)
6364 eptr--;
6365 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
6366 BACKCHAR(eptr)
6369 else
6370 #endif
6372 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
6375 eptr -= GET(ecode,1);
6376 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
6379 /* Skip to next op code */
6381 ecode += 1 + LINK_SIZE;
6382 break;
6384 /* The callout item calls an external function, if one is provided, passing
6385 details of the match so far. This is mainly for debugging, though the
6386 function is able to force a failure. */
6388 case OP_CALLOUT:
6389 if (pcre_callout != NULL)
6391 pcre_callout_block cb;
6392 cb.version = 1; /* Version 1 of the callout block */
6393 cb.callout_number = ecode[1];
6394 cb.offset_vector = md->offset_vector;
6395 cb.subject = (const char *)md->start_subject;
6396 cb.subject_length = md->end_subject - md->start_subject;
6397 cb.start_match = md->start_match - md->start_subject;
6398 cb.current_position = eptr - md->start_subject;
6399 cb.pattern_position = GET(ecode, 2);
6400 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
6401 cb.capture_top = offset_top/2;
6402 cb.capture_last = md->capture_last;
6403 cb.callout_data = md->callout_data;
6404 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
6405 if (rrc < 0) RRETURN(rrc);
6407 ecode += 2 + 2*LINK_SIZE;
6408 break;
6410 /* Recursion either matches the current regex, or some subexpression. The
6411 offset data is the offset to the starting bracket from the start of the
6412 whole pattern. (This is so that it works from duplicated subpatterns.)
6414 If there are any capturing brackets started but not finished, we have to
6415 save their starting points and reinstate them after the recursion. However,
6416 we don't know how many such there are (offset_top records the completed
6417 total) so we just have to save all the potential data. There may be up to
6418 65535 such values, which is too large to put on the stack, but using malloc
6419 for small numbers seems expensive. As a compromise, the stack is used when
6420 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
6421 is used. A problem is what to do if the malloc fails ... there is no way of
6422 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
6423 values on the stack, and accept that the rest may be wrong.
6425 There are also other values that have to be saved. We use a chained
6426 sequence of blocks that actually live on the stack. Thanks to Robin Houston
6427 for the original version of this logic. */
6429 case OP_RECURSE:
6431 callpat = md->start_code + GET(ecode, 1);
6432 new_recursive.group_num = *callpat - OP_BRA;
6434 /* For extended extraction brackets (large number), we have to fish out
6435 the number from a dummy opcode at the start. */
6437 if (new_recursive.group_num > EXTRACT_BASIC_MAX)
6438 new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
6440 /* Add to "recursing stack" */
6442 new_recursive.prevrec = md->recursive;
6443 md->recursive = &new_recursive;
6445 /* Find where to continue from afterwards */
6447 ecode += 1 + LINK_SIZE;
6448 new_recursive.after_call = ecode;
6450 /* Now save the offset data. */
6452 new_recursive.saved_max = md->offset_end;
6453 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
6454 new_recursive.offset_save = stacksave;
6455 else
6457 new_recursive.offset_save =
6458 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
6459 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
6462 memcpy(new_recursive.offset_save, md->offset_vector,
6463 new_recursive.saved_max * sizeof(int));
6464 new_recursive.save_start = md->start_match;
6465 md->start_match = eptr;
6467 /* OK, now we can do the recursion. For each top-level alternative we
6468 restore the offset and recursion data. */
6470 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
6473 RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
6474 eptrb, match_isgroup);
6475 if (rrc == MATCH_MATCH)
6477 md->recursive = new_recursive.prevrec;
6478 if (new_recursive.offset_save != stacksave)
6479 (pcre_free)(new_recursive.offset_save);
6480 RRETURN(MATCH_MATCH);
6482 else if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6484 md->recursive = &new_recursive;
6485 memcpy(md->offset_vector, new_recursive.offset_save,
6486 new_recursive.saved_max * sizeof(int));
6487 callpat += GET(callpat, 1);
6489 while (*callpat == OP_ALT);
6491 DPRINTF(("Recursion didn't match\n"));
6492 md->recursive = new_recursive.prevrec;
6493 if (new_recursive.offset_save != stacksave)
6494 (pcre_free)(new_recursive.offset_save);
6495 RRETURN(MATCH_NOMATCH);
6497 /* Control never reaches here */
6499 /* "Once" brackets are like assertion brackets except that after a match,
6500 the point in the subject string is not moved back. Thus there can never be
6501 a move back into the brackets. Friedl calls these "atomic" subpatterns.
6502 Check the alternative branches in turn - the matching won't pass the KET
6503 for this kind of subpattern. If any one branch matches, we carry on as at
6504 the end of a normal bracket, leaving the subject pointer. */
6506 case OP_ONCE:
6508 prev = ecode;
6509 saved_eptr = eptr;
6513 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
6514 eptrb, match_isgroup);
6515 if (rrc == MATCH_MATCH) break;
6516 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6517 ecode += GET(ecode,1);
6519 while (*ecode == OP_ALT);
6521 /* If hit the end of the group (which could be repeated), fail */
6523 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
6525 /* Continue as from after the "once" group, updating the offsets high water
6526 mark, since extracts may have been taken. */
6528 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6530 offset_top = md->end_offset_top;
6531 eptr = md->end_match_ptr;
6533 /* For a non-repeating ket, just continue at this level. This also
6534 happens for a repeating ket if no characters were matched in the group.
6535 This is the forcible breaking of infinite loops as implemented in Perl
6536 5.005. If there is an options reset, it will get obeyed in the normal
6537 course of events. */
6539 if (*ecode == OP_KET || eptr == saved_eptr)
6541 ecode += 1+LINK_SIZE;
6542 break;
6545 /* The repeating kets try the rest of the pattern or restart from the
6546 preceding bracket, in the appropriate order. We need to reset any options
6547 that changed within the bracket before re-running it, so check the next
6548 opcode. */
6550 if (ecode[1+LINK_SIZE] == OP_OPT)
6552 ims = (ims & ~PCRE_IMS) | ecode[4];
6553 DPRINTF(("ims set to %02lx at group repeat\n", ims));
6556 if (*ecode == OP_KETRMIN)
6558 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
6559 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6560 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6561 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6563 else /* OP_KETRMAX */
6565 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6566 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6567 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6568 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6571 RRETURN(MATCH_NOMATCH);
6573 /* An alternation is the end of a branch; scan along to find the end of the
6574 bracketed group and go to there. */
6576 case OP_ALT:
6577 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6578 break;
6580 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
6581 that it may occur zero times. It may repeat infinitely, or not at all -
6582 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
6583 repeat limits are compiled as a number of copies, with the optional ones
6584 preceded by BRAZERO or BRAMINZERO. */
6586 case OP_BRAZERO:
6588 next = ecode+1;
6589 RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
6590 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6591 do next += GET(next,1); while (*next == OP_ALT);
6592 ecode = next + 1+LINK_SIZE;
6594 break;
6596 case OP_BRAMINZERO:
6598 next = ecode+1;
6599 do next += GET(next,1); while (*next == OP_ALT);
6600 RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
6601 match_isgroup);
6602 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6603 ecode++;
6605 break;
6607 /* End of a group, repeated or non-repeating. If we are at the end of
6608 an assertion "group", stop matching and return MATCH_MATCH, but record the
6609 current high water mark for use by positive assertions. Do this also
6610 for the "once" (atomic, non-backtracking) groups. */
6612 case OP_KET:
6613 case OP_KETRMIN:
6614 case OP_KETRMAX:
6616 prev = ecode - GET(ecode, 1);
6617 saved_eptr = eptrb->epb_saved_eptr;
6619 /* Back up the stack of bracket start pointers. */
6621 eptrb = eptrb->epb_prev;
6623 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
6624 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
6625 *prev == OP_ONCE)
6627 md->end_match_ptr = eptr; /* For ONCE */
6628 md->end_offset_top = offset_top;
6629 RRETURN(MATCH_MATCH);
6632 /* In all other cases except a conditional group we have to check the
6633 group number back at the start and if necessary complete handling an
6634 extraction by setting the offsets and bumping the high water mark. */
6636 if (*prev != OP_COND)
6638 number = *prev - OP_BRA;
6640 /* For extended extraction brackets (large number), we have to fish out
6641 the number from a dummy opcode at the start. */
6643 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
6644 offset = number << 1;
6646 #ifdef DEBUG
6647 printf("end bracket %d", number);
6648 printf("\n");
6649 #endif
6651 /* Test for a numbered group. This includes groups called as a result
6652 of recursion. Note that whole-pattern recursion is coded as a recurse
6653 into group 0, so it won't be picked up here. Instead, we catch it when
6654 the OP_END is reached. */
6656 if (number > 0)
6658 md->capture_last = number;
6659 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
6661 md->offset_vector[offset] =
6662 md->offset_vector[md->offset_end - number];
6663 md->offset_vector[offset+1] = eptr - md->start_subject;
6664 if (offset_top <= offset) offset_top = offset + 2;
6667 /* Handle a recursively called group. Restore the offsets
6668 appropriately and continue from after the call. */
6670 if (md->recursive != NULL && md->recursive->group_num == number)
6672 recursion_info *rec = md->recursive;
6673 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
6674 md->recursive = rec->prevrec;
6675 md->start_match = rec->save_start;
6676 memcpy(md->offset_vector, rec->offset_save,
6677 rec->saved_max * sizeof(int));
6678 ecode = rec->after_call;
6679 ims = original_ims;
6680 break;
6685 /* Reset the value of the ims flags, in case they got changed during
6686 the group. */
6688 ims = original_ims;
6689 DPRINTF(("ims reset to %02lx\n", ims));
6691 /* For a non-repeating ket, just continue at this level. This also
6692 happens for a repeating ket if no characters were matched in the group.
6693 This is the forcible breaking of infinite loops as implemented in Perl
6694 5.005. If there is an options reset, it will get obeyed in the normal
6695 course of events. */
6697 if (*ecode == OP_KET || eptr == saved_eptr)
6699 ecode += 1 + LINK_SIZE;
6700 break;
6703 /* The repeating kets try the rest of the pattern or restart from the
6704 preceding bracket, in the appropriate order. */
6706 if (*ecode == OP_KETRMIN)
6708 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6709 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6710 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6711 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6713 else /* OP_KETRMAX */
6715 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6716 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6717 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6718 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6722 RRETURN(MATCH_NOMATCH);
6724 /* Start of subject unless notbol, or after internal newline if multiline */
6726 case OP_CIRC:
6727 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
6728 if ((ims & PCRE_MULTILINE) != 0)
6730 if (eptr != md->start_subject && eptr[-1] != NEWLINE)
6731 RRETURN(MATCH_NOMATCH);
6732 ecode++;
6733 break;
6735 /* ... else fall through */
6737 /* Start of subject assertion */
6739 case OP_SOD:
6740 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
6741 ecode++;
6742 break;
6744 /* Start of match assertion */
6746 case OP_SOM:
6747 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
6748 ecode++;
6749 break;
6751 /* Assert before internal newline if multiline, or before a terminating
6752 newline unless endonly is set, else end of subject unless noteol is set. */
6754 case OP_DOLL:
6755 if ((ims & PCRE_MULTILINE) != 0)
6757 if (eptr < md->end_subject)
6758 { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
6759 else
6760 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
6761 ecode++;
6762 break;
6764 else
6766 if (md->noteol) RRETURN(MATCH_NOMATCH);
6767 if (!md->endonly)
6769 if (eptr < md->end_subject - 1 ||
6770 (eptr == md->end_subject - 1 && *eptr != NEWLINE))
6771 RRETURN(MATCH_NOMATCH);
6772 ecode++;
6773 break;
6776 /* ... else fall through */
6778 /* End of subject assertion (\z) */
6780 case OP_EOD:
6781 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
6782 ecode++;
6783 break;
6785 /* End of subject or ending \n assertion (\Z) */
6787 case OP_EODN:
6788 if (eptr < md->end_subject - 1 ||
6789 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
6790 ecode++;
6791 break;
6793 /* Word boundary assertions */
6795 case OP_NOT_WORD_BOUNDARY:
6796 case OP_WORD_BOUNDARY:
6799 /* Find out if the previous and current characters are "word" characters.
6800 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
6801 be "non-word" characters. */
6803 #ifdef SUPPORT_UTF8
6804 if (md->utf8)
6806 if (eptr == md->start_subject) prev_is_word = FALSE; else
6808 const uschar *lastptr = eptr - 1;
6809 while((*lastptr & 0xc0) == 0x80) lastptr--;
6810 GETCHAR(c, lastptr);
6811 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
6813 if (eptr >= md->end_subject) cur_is_word = FALSE; else
6815 GETCHAR(c, eptr);
6816 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
6819 else
6820 #endif
6822 /* More streamlined when not in UTF-8 mode */
6825 prev_is_word = (eptr != md->start_subject) &&
6826 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
6827 cur_is_word = (eptr < md->end_subject) &&
6828 ((md->ctypes[*eptr] & ctype_word) != 0);
6831 /* Now see if the situation is what we want */
6833 if ((*ecode++ == OP_WORD_BOUNDARY)?
6834 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
6835 RRETURN(MATCH_NOMATCH);
6837 break;
6839 /* Match a single character type; inline for speed */
6841 case OP_ANY:
6842 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
6843 RRETURN(MATCH_NOMATCH);
6844 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
6845 #ifdef SUPPORT_UTF8
6846 if (md->utf8)
6847 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6848 #endif
6849 ecode++;
6850 break;
6852 /* Match a single byte, even in UTF-8 mode. This opcode really does match
6853 any byte, even newline, independent of the setting of PCRE_DOTALL. */
6855 case OP_ANYBYTE:
6856 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
6857 ecode++;
6858 break;
6860 case OP_NOT_DIGIT:
6861 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6862 GETCHARINCTEST(c, eptr);
6863 if (
6864 #ifdef SUPPORT_UTF8
6865 c < 256 &&
6866 #endif
6867 (md->ctypes[c] & ctype_digit) != 0
6869 RRETURN(MATCH_NOMATCH);
6870 ecode++;
6871 break;
6873 case OP_DIGIT:
6874 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6875 GETCHARINCTEST(c, eptr);
6876 if (
6877 #ifdef SUPPORT_UTF8
6878 c >= 256 ||
6879 #endif
6880 (md->ctypes[c] & ctype_digit) == 0
6882 RRETURN(MATCH_NOMATCH);
6883 ecode++;
6884 break;
6886 case OP_NOT_WHITESPACE:
6887 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6888 GETCHARINCTEST(c, eptr);
6889 if (
6890 #ifdef SUPPORT_UTF8
6891 c < 256 &&
6892 #endif
6893 (md->ctypes[c] & ctype_space) != 0
6895 RRETURN(MATCH_NOMATCH);
6896 ecode++;
6897 break;
6899 case OP_WHITESPACE:
6900 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6901 GETCHARINCTEST(c, eptr);
6902 if (
6903 #ifdef SUPPORT_UTF8
6904 c >= 256 ||
6905 #endif
6906 (md->ctypes[c] & ctype_space) == 0
6908 RRETURN(MATCH_NOMATCH);
6909 ecode++;
6910 break;
6912 case OP_NOT_WORDCHAR:
6913 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6914 GETCHARINCTEST(c, eptr);
6915 if (
6916 #ifdef SUPPORT_UTF8
6917 c < 256 &&
6918 #endif
6919 (md->ctypes[c] & ctype_word) != 0
6921 RRETURN(MATCH_NOMATCH);
6922 ecode++;
6923 break;
6925 case OP_WORDCHAR:
6926 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6927 GETCHARINCTEST(c, eptr);
6928 if (
6929 #ifdef SUPPORT_UTF8
6930 c >= 256 ||
6931 #endif
6932 (md->ctypes[c] & ctype_word) == 0
6934 RRETURN(MATCH_NOMATCH);
6935 ecode++;
6936 break;
6938 #ifdef SUPPORT_UCP
6939 /* Check the next character by Unicode property. We will get here only
6940 if the support is in the binary; otherwise a compile-time error occurs. */
6942 case OP_PROP:
6943 case OP_NOTPROP:
6944 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6945 GETCHARINCTEST(c, eptr);
6947 int chartype, rqdtype;
6948 int othercase;
6949 int category = ucp_findchar(c, &chartype, &othercase);
6951 rqdtype = *(++ecode);
6952 ecode++;
6954 if (rqdtype >= 128)
6956 if ((rqdtype - 128 != category) == (op == OP_PROP))
6957 RRETURN(MATCH_NOMATCH);
6959 else
6961 if ((rqdtype != chartype) == (op == OP_PROP))
6962 RRETURN(MATCH_NOMATCH);
6965 break;
6967 /* Match an extended Unicode sequence. We will get here only if the support
6968 is in the binary; otherwise a compile-time error occurs. */
6970 case OP_EXTUNI:
6971 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6972 GETCHARINCTEST(c, eptr);
6974 int chartype;
6975 int othercase;
6976 int category = ucp_findchar(c, &chartype, &othercase);
6977 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
6978 while (eptr < md->end_subject)
6980 int len = 1;
6981 if (!md->utf8) c = *eptr; else
6983 GETCHARLEN(c, eptr, len);
6985 category = ucp_findchar(c, &chartype, &othercase);
6986 if (category != ucp_M) break;
6987 eptr += len;
6990 ecode++;
6991 break;
6992 #endif
6995 /* Match a back reference, possibly repeatedly. Look past the end of the
6996 item to see if there is repeat information following. The code is similar
6997 to that for character classes, but repeated for efficiency. Then obey
6998 similar code to character type repeats - written out again for speed.
6999 However, if the referenced string is the empty string, always treat
7000 it as matched, any number of times (otherwise there could be infinite
7001 loops). */
7003 case OP_REF:
7005 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
7006 ecode += 3; /* Advance past item */
7008 /* If the reference is unset, set the length to be longer than the amount
7009 of subject left; this ensures that every attempt at a match fails. We
7010 can't just fail here, because of the possibility of quantifiers with zero
7011 minima. */
7013 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
7014 md->end_subject - eptr + 1 :
7015 md->offset_vector[offset+1] - md->offset_vector[offset];
7017 /* Set up for repetition, or handle the non-repeated case */
7019 switch (*ecode)
7021 case OP_CRSTAR:
7022 case OP_CRMINSTAR:
7023 case OP_CRPLUS:
7024 case OP_CRMINPLUS:
7025 case OP_CRQUERY:
7026 case OP_CRMINQUERY:
7027 c = *ecode++ - OP_CRSTAR;
7028 minimize = (c & 1) != 0;
7029 min = rep_min[c]; /* Pick up values from tables; */
7030 max = rep_max[c]; /* zero for max => infinity */
7031 if (max == 0) max = INT_MAX;
7032 break;
7034 case OP_CRRANGE:
7035 case OP_CRMINRANGE:
7036 minimize = (*ecode == OP_CRMINRANGE);
7037 min = GET2(ecode, 1);
7038 max = GET2(ecode, 3);
7039 if (max == 0) max = INT_MAX;
7040 ecode += 5;
7041 break;
7043 default: /* No repeat follows */
7044 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
7045 eptr += length;
7046 continue; /* With the main loop */
7049 /* If the length of the reference is zero, just continue with the
7050 main loop. */
7052 if (length == 0) continue;
7054 /* First, ensure the minimum number of matches are present. We get back
7055 the length of the reference string explicitly rather than passing the
7056 address of eptr, so that eptr can be a register variable. */
7058 for (i = 1; i <= min; i++)
7060 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
7061 eptr += length;
7064 /* If min = max, continue at the same level without recursion.
7065 They are not both allowed to be zero. */
7067 if (min == max) continue;
7069 /* If minimizing, keep trying and advancing the pointer */
7071 if (minimize)
7073 for (fi = min;; fi++)
7075 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7076 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7077 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
7078 RRETURN(MATCH_NOMATCH);
7079 eptr += length;
7081 /* Control never gets here */
7084 /* If maximizing, find the longest string and work backwards */
7086 else
7088 pp = eptr;
7089 for (i = min; i < max; i++)
7091 if (!match_ref(offset, eptr, length, md, ims)) break;
7092 eptr += length;
7094 while (eptr >= pp)
7096 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7097 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7098 eptr -= length;
7100 RRETURN(MATCH_NOMATCH);
7103 /* Control never gets here */
7107 /* Match a bit-mapped character class, possibly repeatedly. This op code is
7108 used when all the characters in the class have values in the range 0-255,
7109 and either the matching is caseful, or the characters are in the range
7110 0-127 when UTF-8 processing is enabled. The only difference between
7111 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
7112 encountered.
7114 First, look past the end of the item to see if there is repeat information
7115 following. Then obey similar code to character type repeats - written out
7116 again for speed. */
7118 case OP_NCLASS:
7119 case OP_CLASS:
7121 data = ecode + 1; /* Save for matching */
7122 ecode += 33; /* Advance past the item */
7124 switch (*ecode)
7126 case OP_CRSTAR:
7127 case OP_CRMINSTAR:
7128 case OP_CRPLUS:
7129 case OP_CRMINPLUS:
7130 case OP_CRQUERY:
7131 case OP_CRMINQUERY:
7132 c = *ecode++ - OP_CRSTAR;
7133 minimize = (c & 1) != 0;
7134 min = rep_min[c]; /* Pick up values from tables; */
7135 max = rep_max[c]; /* zero for max => infinity */
7136 if (max == 0) max = INT_MAX;
7137 break;
7139 case OP_CRRANGE:
7140 case OP_CRMINRANGE:
7141 minimize = (*ecode == OP_CRMINRANGE);
7142 min = GET2(ecode, 1);
7143 max = GET2(ecode, 3);
7144 if (max == 0) max = INT_MAX;
7145 ecode += 5;
7146 break;
7148 default: /* No repeat follows */
7149 min = max = 1;
7150 break;
7153 /* First, ensure the minimum number of matches are present. */
7155 #ifdef SUPPORT_UTF8
7156 /* UTF-8 mode */
7157 if (md->utf8)
7159 for (i = 1; i <= min; i++)
7161 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7162 GETCHARINC(c, eptr);
7163 if (c > 255)
7165 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
7167 else
7169 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7173 else
7174 #endif
7175 /* Not UTF-8 mode */
7177 for (i = 1; i <= min; i++)
7179 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7180 c = *eptr++;
7181 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7185 /* If max == min we can continue with the main loop without the
7186 need to recurse. */
7188 if (min == max) continue;
7190 /* If minimizing, keep testing the rest of the expression and advancing
7191 the pointer while it matches the class. */
7193 if (minimize)
7195 #ifdef SUPPORT_UTF8
7196 /* UTF-8 mode */
7197 if (md->utf8)
7199 for (fi = min;; fi++)
7201 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7202 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7203 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7204 GETCHARINC(c, eptr);
7205 if (c > 255)
7207 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
7209 else
7211 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7215 else
7216 #endif
7217 /* Not UTF-8 mode */
7219 for (fi = min;; fi++)
7221 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7222 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7223 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7224 c = *eptr++;
7225 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7228 /* Control never gets here */
7231 /* If maximizing, find the longest possible run, then work backwards. */
7233 else
7235 pp = eptr;
7237 #ifdef SUPPORT_UTF8
7238 /* UTF-8 mode */
7239 if (md->utf8)
7241 for (i = min; i < max; i++)
7243 int len = 1;
7244 if (eptr >= md->end_subject) break;
7245 GETCHARLEN(c, eptr, len);
7246 if (c > 255)
7248 if (op == OP_CLASS) break;
7250 else
7252 if ((data[c/8] & (1 << (c&7))) == 0) break;
7254 eptr += len;
7256 for (;;)
7258 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7259 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7260 if (eptr-- == pp) break; /* Stop if tried at original pos */
7261 BACKCHAR(eptr);
7264 else
7265 #endif
7266 /* Not UTF-8 mode */
7268 for (i = min; i < max; i++)
7270 if (eptr >= md->end_subject) break;
7271 c = *eptr;
7272 if ((data[c/8] & (1 << (c&7))) == 0) break;
7273 eptr++;
7275 while (eptr >= pp)
7277 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7278 eptr--;
7279 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7283 RRETURN(MATCH_NOMATCH);
7286 /* Control never gets here */
7289 /* Match an extended character class. This opcode is encountered only
7290 in UTF-8 mode, because that's the only time it is compiled. */
7292 #ifdef SUPPORT_UTF8
7293 case OP_XCLASS:
7295 data = ecode + 1 + LINK_SIZE; /* Save for matching */
7296 ecode += GET(ecode, 1); /* Advance past the item */
7298 switch (*ecode)
7300 case OP_CRSTAR:
7301 case OP_CRMINSTAR:
7302 case OP_CRPLUS:
7303 case OP_CRMINPLUS:
7304 case OP_CRQUERY:
7305 case OP_CRMINQUERY:
7306 c = *ecode++ - OP_CRSTAR;
7307 minimize = (c & 1) != 0;
7308 min = rep_min[c]; /* Pick up values from tables; */
7309 max = rep_max[c]; /* zero for max => infinity */
7310 if (max == 0) max = INT_MAX;
7311 break;
7313 case OP_CRRANGE:
7314 case OP_CRMINRANGE:
7315 minimize = (*ecode == OP_CRMINRANGE);
7316 min = GET2(ecode, 1);
7317 max = GET2(ecode, 3);
7318 if (max == 0) max = INT_MAX;
7319 ecode += 5;
7320 break;
7322 default: /* No repeat follows */
7323 min = max = 1;
7324 break;
7327 /* First, ensure the minimum number of matches are present. */
7329 for (i = 1; i <= min; i++)
7331 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7332 GETCHARINC(c, eptr);
7333 if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH);
7336 /* If max == min we can continue with the main loop without the
7337 need to recurse. */
7339 if (min == max) continue;
7341 /* If minimizing, keep testing the rest of the expression and advancing
7342 the pointer while it matches the class. */
7344 if (minimize)
7346 for (fi = min;; fi++)
7348 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7349 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7350 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7351 GETCHARINC(c, eptr);
7352 if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH);
7354 /* Control never gets here */
7357 /* If maximizing, find the longest possible run, then work backwards. */
7359 else
7361 pp = eptr;
7362 for (i = min; i < max; i++)
7364 int len = 1;
7365 if (eptr >= md->end_subject) break;
7366 GETCHARLEN(c, eptr, len);
7367 if (!match_xclass(c, data)) break;
7368 eptr += len;
7370 for(;;)
7372 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7373 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7374 if (eptr-- == pp) break; /* Stop if tried at original pos */
7375 BACKCHAR(eptr)
7377 RRETURN(MATCH_NOMATCH);
7380 /* Control never gets here */
7382 #endif /* End of XCLASS */
7384 /* Match a single character, casefully */
7386 case OP_CHAR:
7387 #ifdef SUPPORT_UTF8
7388 if (md->utf8)
7390 length = 1;
7391 ecode++;
7392 GETCHARLEN(fc, ecode, length);
7393 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7394 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
7396 else
7397 #endif
7399 /* Non-UTF-8 mode */
7401 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
7402 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
7403 ecode += 2;
7405 break;
7407 /* Match a single character, caselessly */
7409 case OP_CHARNC:
7410 #ifdef SUPPORT_UTF8
7411 if (md->utf8)
7413 length = 1;
7414 ecode++;
7415 GETCHARLEN(fc, ecode, length);
7417 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7419 /* If the pattern character's value is < 128, we have only one byte, and
7420 can use the fast lookup table. */
7422 if (fc < 128)
7424 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7427 /* Otherwise we must pick up the subject character */
7429 else
7431 int dc;
7432 GETCHARINC(dc, eptr);
7433 ecode += length;
7435 /* If we have Unicode property support, we can use it to test the other
7436 case of the character, if there is one. The result of ucp_findchar() is
7437 < 0 if the char isn't found, and othercase is returned as zero if there
7438 isn't one. */
7440 if (fc != dc)
7442 #ifdef SUPPORT_UCP
7443 int chartype;
7444 int othercase;
7445 if (ucp_findchar(fc, &chartype, &othercase) < 0 || dc != othercase)
7446 #endif
7447 RRETURN(MATCH_NOMATCH);
7451 else
7452 #endif /* SUPPORT_UTF8 */
7454 /* Non-UTF-8 mode */
7456 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
7457 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7458 ecode += 2;
7460 break;
7462 /* Match a single character repeatedly; different opcodes share code. */
7464 case OP_EXACT:
7465 min = max = GET2(ecode, 1);
7466 ecode += 3;
7467 goto REPEATCHAR;
7469 case OP_UPTO:
7470 case OP_MINUPTO:
7471 min = 0;
7472 max = GET2(ecode, 1);
7473 minimize = *ecode == OP_MINUPTO;
7474 ecode += 3;
7475 goto REPEATCHAR;
7477 case OP_STAR:
7478 case OP_MINSTAR:
7479 case OP_PLUS:
7480 case OP_MINPLUS:
7481 case OP_QUERY:
7482 case OP_MINQUERY:
7483 c = *ecode++ - OP_STAR;
7484 minimize = (c & 1) != 0;
7485 min = rep_min[c]; /* Pick up values from tables; */
7486 max = rep_max[c]; /* zero for max => infinity */
7487 if (max == 0) max = INT_MAX;
7489 /* Common code for all repeated single-character matches. We can give
7490 up quickly if there are fewer than the minimum number of characters left in
7491 the subject. */
7493 REPEATCHAR:
7494 #ifdef SUPPORT_UTF8
7495 if (md->utf8)
7497 length = 1;
7498 charptr = ecode;
7499 GETCHARLEN(fc, ecode, length);
7500 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7501 ecode += length;
7503 /* Handle multibyte character matching specially here. There is
7504 support for caseless matching if UCP support is present. */
7506 if (length > 1)
7508 int oclength = 0;
7509 uschar occhars[8];
7511 #ifdef SUPPORT_UCP
7512 int othercase;
7513 int chartype;
7514 if ((ims & PCRE_CASELESS) != 0 &&
7515 ucp_findchar(fc, &chartype, &othercase) >= 0 &&
7516 othercase > 0)
7517 oclength = ord2utf8(othercase, occhars);
7518 #endif /* SUPPORT_UCP */
7520 for (i = 1; i <= min; i++)
7522 if (memcmp(eptr, charptr, length) == 0) eptr += length;
7523 /* Need braces because of following else */
7524 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
7525 else
7527 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
7528 eptr += oclength;
7532 if (min == max) continue;
7534 if (minimize)
7536 for (fi = min;; fi++)
7538 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7539 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7540 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7541 if (memcmp(eptr, charptr, length) == 0) eptr += length;
7542 /* Need braces because of following else */
7543 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
7544 else
7546 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
7547 eptr += oclength;
7550 /* Control never gets here */
7552 else
7554 pp = eptr;
7555 for (i = min; i < max; i++)
7557 if (eptr > md->end_subject - length) break;
7558 if (memcmp(eptr, charptr, length) == 0) eptr += length;
7559 else if (oclength == 0) break;
7560 else
7562 if (memcmp(eptr, occhars, oclength) != 0) break;
7563 eptr += oclength;
7566 while (eptr >= pp)
7568 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7569 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7570 eptr -= length;
7572 RRETURN(MATCH_NOMATCH);
7574 /* Control never gets here */
7577 /* If the length of a UTF-8 character is 1, we fall through here, and
7578 obey the code as for non-UTF-8 characters below, though in this case the
7579 value of fc will always be < 128. */
7581 else
7582 #endif /* SUPPORT_UTF8 */
7584 /* When not in UTF-8 mode, load a single-byte character. */
7586 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7587 fc = *ecode++;
7590 /* The value of fc at this point is always less than 256, though we may or
7591 may not be in UTF-8 mode. The code is duplicated for the caseless and
7592 caseful cases, for speed, since matching characters is likely to be quite
7593 common. First, ensure the minimum number of matches are present. If min =
7594 max, continue at the same level without recursing. Otherwise, if
7595 minimizing, keep trying the rest of the expression and advancing one
7596 matching character if failing, up to the maximum. Alternatively, if
7597 maximizing, find the maximum number of characters and work backwards. */
7599 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
7600 max, eptr));
7602 if ((ims & PCRE_CASELESS) != 0)
7604 fc = md->lcc[fc];
7605 for (i = 1; i <= min; i++)
7606 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7607 if (min == max) continue;
7608 if (minimize)
7610 for (fi = min;; fi++)
7612 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7613 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7614 if (fi >= max || eptr >= md->end_subject ||
7615 fc != md->lcc[*eptr++])
7616 RRETURN(MATCH_NOMATCH);
7618 /* Control never gets here */
7620 else
7622 pp = eptr;
7623 for (i = min; i < max; i++)
7625 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
7626 eptr++;
7628 while (eptr >= pp)
7630 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7631 eptr--;
7632 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7634 RRETURN(MATCH_NOMATCH);
7636 /* Control never gets here */
7639 /* Caseful comparisons (includes all multi-byte characters) */
7641 else
7643 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
7644 if (min == max) continue;
7645 if (minimize)
7647 for (fi = min;; fi++)
7649 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7650 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7651 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
7652 RRETURN(MATCH_NOMATCH);
7654 /* Control never gets here */
7656 else
7658 pp = eptr;
7659 for (i = min; i < max; i++)
7661 if (eptr >= md->end_subject || fc != *eptr) break;
7662 eptr++;
7664 while (eptr >= pp)
7666 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7667 eptr--;
7668 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7670 RRETURN(MATCH_NOMATCH);
7673 /* Control never gets here */
7675 /* Match a negated single one-byte character. The character we are
7676 checking in the subject can be multibyte. */
7678 case OP_NOT:
7679 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7680 ecode++;
7681 GETCHARINCTEST(c, eptr);
7682 if ((ims & PCRE_CASELESS) != 0)
7684 #ifdef SUPPORT_UTF8
7685 if (c < 256)
7686 #endif
7687 c = md->lcc[c];
7688 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
7690 else
7692 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
7694 break;
7696 /* Match a negated single one-byte character repeatedly. This is almost a
7697 repeat of the code for a repeated single character, but I haven't found a
7698 nice way of commoning these up that doesn't require a test of the
7699 positive/negative option for each character match. Maybe that wouldn't add
7700 very much to the time taken, but character matching *is* what this is all
7701 about... */
7703 case OP_NOTEXACT:
7704 min = max = GET2(ecode, 1);
7705 ecode += 3;
7706 goto REPEATNOTCHAR;
7708 case OP_NOTUPTO:
7709 case OP_NOTMINUPTO:
7710 min = 0;
7711 max = GET2(ecode, 1);
7712 minimize = *ecode == OP_NOTMINUPTO;
7713 ecode += 3;
7714 goto REPEATNOTCHAR;
7716 case OP_NOTSTAR:
7717 case OP_NOTMINSTAR:
7718 case OP_NOTPLUS:
7719 case OP_NOTMINPLUS:
7720 case OP_NOTQUERY:
7721 case OP_NOTMINQUERY:
7722 c = *ecode++ - OP_NOTSTAR;
7723 minimize = (c & 1) != 0;
7724 min = rep_min[c]; /* Pick up values from tables; */
7725 max = rep_max[c]; /* zero for max => infinity */
7726 if (max == 0) max = INT_MAX;
7728 /* Common code for all repeated single-byte matches. We can give up quickly
7729 if there are fewer than the minimum number of bytes left in the
7730 subject. */
7732 REPEATNOTCHAR:
7733 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7734 fc = *ecode++;
7736 /* The code is duplicated for the caseless and caseful cases, for speed,
7737 since matching characters is likely to be quite common. First, ensure the
7738 minimum number of matches are present. If min = max, continue at the same
7739 level without recursing. Otherwise, if minimizing, keep trying the rest of
7740 the expression and advancing one matching character if failing, up to the
7741 maximum. Alternatively, if maximizing, find the maximum number of
7742 characters and work backwards. */
7744 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
7745 max, eptr));
7747 if ((ims & PCRE_CASELESS) != 0)
7749 fc = md->lcc[fc];
7751 #ifdef SUPPORT_UTF8
7752 /* UTF-8 mode */
7753 if (md->utf8)
7755 register int d;
7756 for (i = 1; i <= min; i++)
7758 GETCHARINC(d, eptr);
7759 if (d < 256) d = md->lcc[d];
7760 if (fc == d) RRETURN(MATCH_NOMATCH);
7763 else
7764 #endif
7766 /* Not UTF-8 mode */
7768 for (i = 1; i <= min; i++)
7769 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7772 if (min == max) continue;
7774 if (minimize)
7776 #ifdef SUPPORT_UTF8
7777 /* UTF-8 mode */
7778 if (md->utf8)
7780 register int d;
7781 for (fi = min;; fi++)
7783 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7784 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7785 GETCHARINC(d, eptr);
7786 if (d < 256) d = md->lcc[d];
7787 if (fi >= max || eptr >= md->end_subject || fc == d)
7788 RRETURN(MATCH_NOMATCH);
7791 else
7792 #endif
7793 /* Not UTF-8 mode */
7795 for (fi = min;; fi++)
7797 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7798 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7799 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
7800 RRETURN(MATCH_NOMATCH);
7803 /* Control never gets here */
7806 /* Maximize case */
7808 else
7810 pp = eptr;
7812 #ifdef SUPPORT_UTF8
7813 /* UTF-8 mode */
7814 if (md->utf8)
7816 register int d;
7817 for (i = min; i < max; i++)
7819 int len = 1;
7820 if (eptr >= md->end_subject) break;
7821 GETCHARLEN(d, eptr, len);
7822 if (d < 256) d = md->lcc[d];
7823 if (fc == d) break;
7824 eptr += len;
7826 for(;;)
7828 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7829 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7830 if (eptr-- == pp) break; /* Stop if tried at original pos */
7831 BACKCHAR(eptr);
7834 else
7835 #endif
7836 /* Not UTF-8 mode */
7838 for (i = min; i < max; i++)
7840 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
7841 eptr++;
7843 while (eptr >= pp)
7845 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7846 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7847 eptr--;
7851 RRETURN(MATCH_NOMATCH);
7853 /* Control never gets here */
7856 /* Caseful comparisons */
7858 else
7860 #ifdef SUPPORT_UTF8
7861 /* UTF-8 mode */
7862 if (md->utf8)
7864 register int d;
7865 for (i = 1; i <= min; i++)
7867 GETCHARINC(d, eptr);
7868 if (fc == d) RRETURN(MATCH_NOMATCH);
7871 else
7872 #endif
7873 /* Not UTF-8 mode */
7875 for (i = 1; i <= min; i++)
7876 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
7879 if (min == max) continue;
7881 if (minimize)
7883 #ifdef SUPPORT_UTF8
7884 /* UTF-8 mode */
7885 if (md->utf8)
7887 register int d;
7888 for (fi = min;; fi++)
7890 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7891 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7892 GETCHARINC(d, eptr);
7893 if (fi >= max || eptr >= md->end_subject || fc == d)
7894 RRETURN(MATCH_NOMATCH);
7897 else
7898 #endif
7899 /* Not UTF-8 mode */
7901 for (fi = min;; fi++)
7903 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7904 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7905 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
7906 RRETURN(MATCH_NOMATCH);
7909 /* Control never gets here */
7912 /* Maximize case */
7914 else
7916 pp = eptr;
7918 #ifdef SUPPORT_UTF8
7919 /* UTF-8 mode */
7920 if (md->utf8)
7922 register int d;
7923 for (i = min; i < max; i++)
7925 int len = 1;
7926 if (eptr >= md->end_subject) break;
7927 GETCHARLEN(d, eptr, len);
7928 if (fc == d) break;
7929 eptr += len;
7931 for(;;)
7933 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7934 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7935 if (eptr-- == pp) break; /* Stop if tried at original pos */
7936 BACKCHAR(eptr);
7939 else
7940 #endif
7941 /* Not UTF-8 mode */
7943 for (i = min; i < max; i++)
7945 if (eptr >= md->end_subject || fc == *eptr) break;
7946 eptr++;
7948 while (eptr >= pp)
7950 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7951 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7952 eptr--;
7956 RRETURN(MATCH_NOMATCH);
7959 /* Control never gets here */
7961 /* Match a single character type repeatedly; several different opcodes
7962 share code. This is very similar to the code for single characters, but we
7963 repeat it in the interests of efficiency. */
7965 case OP_TYPEEXACT:
7966 min = max = GET2(ecode, 1);
7967 minimize = TRUE;
7968 ecode += 3;
7969 goto REPEATTYPE;
7971 case OP_TYPEUPTO:
7972 case OP_TYPEMINUPTO:
7973 min = 0;
7974 max = GET2(ecode, 1);
7975 minimize = *ecode == OP_TYPEMINUPTO;
7976 ecode += 3;
7977 goto REPEATTYPE;
7979 case OP_TYPESTAR:
7980 case OP_TYPEMINSTAR:
7981 case OP_TYPEPLUS:
7982 case OP_TYPEMINPLUS:
7983 case OP_TYPEQUERY:
7984 case OP_TYPEMINQUERY:
7985 c = *ecode++ - OP_TYPESTAR;
7986 minimize = (c & 1) != 0;
7987 min = rep_min[c]; /* Pick up values from tables; */
7988 max = rep_max[c]; /* zero for max => infinity */
7989 if (max == 0) max = INT_MAX;
7991 /* Common code for all repeated single character type matches. Note that
7992 in UTF-8 mode, '.' matches a character of any length, but for the other
7993 character types, the valid characters are all one-byte long. */
7995 REPEATTYPE:
7996 ctype = *ecode++; /* Code for the character type */
7998 #ifdef SUPPORT_UCP
7999 if (ctype == OP_PROP || ctype == OP_NOTPROP)
8001 prop_fail_result = ctype == OP_NOTPROP;
8002 prop_type = *ecode++;
8003 if (prop_type >= 128)
8005 prop_test_against = prop_type - 128;
8006 prop_test_variable = &prop_category;
8008 else
8010 prop_test_against = prop_type;
8011 prop_test_variable = &prop_chartype;
8014 else prop_type = -1;
8015 #endif
8017 /* First, ensure the minimum number of matches are present. Use inline
8018 code for maximizing the speed, and do the type test once at the start
8019 (i.e. keep it out of the loop). Also we can test that there are at least
8020 the minimum number of bytes before we start. This isn't as effective in
8021 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
8022 is tidier. Also separate the UCP code, which can be the same for both UTF-8
8023 and single-bytes. */
8025 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
8026 if (min > 0)
8028 #ifdef SUPPORT_UCP
8029 if (prop_type > 0)
8031 for (i = 1; i <= min; i++)
8033 GETCHARINC(c, eptr);
8034 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8035 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8036 RRETURN(MATCH_NOMATCH);
8040 /* Match extended Unicode sequences. We will get here only if the
8041 support is in the binary; otherwise a compile-time error occurs. */
8043 else if (ctype == OP_EXTUNI)
8045 for (i = 1; i <= min; i++)
8047 GETCHARINCTEST(c, eptr);
8048 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8049 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
8050 while (eptr < md->end_subject)
8052 int len = 1;
8053 if (!md->utf8) c = *eptr; else
8055 GETCHARLEN(c, eptr, len);
8057 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8058 if (prop_category != ucp_M) break;
8059 eptr += len;
8064 else
8065 #endif /* SUPPORT_UCP */
8067 /* Handle all other cases when the coding is UTF-8 */
8069 #ifdef SUPPORT_UTF8
8070 if (md->utf8) switch(ctype)
8072 case OP_ANY:
8073 for (i = 1; i <= min; i++)
8075 if (eptr >= md->end_subject ||
8076 (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
8077 RRETURN(MATCH_NOMATCH);
8078 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8080 break;
8082 case OP_ANYBYTE:
8083 eptr += min;
8084 break;
8086 case OP_NOT_DIGIT:
8087 for (i = 1; i <= min; i++)
8089 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8090 GETCHARINC(c, eptr);
8091 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
8092 RRETURN(MATCH_NOMATCH);
8094 break;
8096 case OP_DIGIT:
8097 for (i = 1; i <= min; i++)
8099 if (eptr >= md->end_subject ||
8100 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
8101 RRETURN(MATCH_NOMATCH);
8102 /* No need to skip more bytes - we know it's a 1-byte character */
8104 break;
8106 case OP_NOT_WHITESPACE:
8107 for (i = 1; i <= min; i++)
8109 if (eptr >= md->end_subject ||
8110 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
8111 RRETURN(MATCH_NOMATCH);
8112 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8114 break;
8116 case OP_WHITESPACE:
8117 for (i = 1; i <= min; i++)
8119 if (eptr >= md->end_subject ||
8120 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
8121 RRETURN(MATCH_NOMATCH);
8122 /* No need to skip more bytes - we know it's a 1-byte character */
8124 break;
8126 case OP_NOT_WORDCHAR:
8127 for (i = 1; i <= min; i++)
8129 if (eptr >= md->end_subject ||
8130 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
8131 RRETURN(MATCH_NOMATCH);
8132 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8134 break;
8136 case OP_WORDCHAR:
8137 for (i = 1; i <= min; i++)
8139 if (eptr >= md->end_subject ||
8140 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
8141 RRETURN(MATCH_NOMATCH);
8142 /* No need to skip more bytes - we know it's a 1-byte character */
8144 break;
8146 default:
8147 RRETURN(PCRE_ERROR_INTERNAL);
8148 } /* End switch(ctype) */
8150 else
8151 #endif /* SUPPORT_UTF8 */
8153 /* Code for the non-UTF-8 case for minimum matching of operators other
8154 than OP_PROP and OP_NOTPROP. */
8156 switch(ctype)
8158 case OP_ANY:
8159 if ((ims & PCRE_DOTALL) == 0)
8161 for (i = 1; i <= min; i++)
8162 if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
8164 else eptr += min;
8165 break;
8167 case OP_ANYBYTE:
8168 eptr += min;
8169 break;
8171 case OP_NOT_DIGIT:
8172 for (i = 1; i <= min; i++)
8173 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
8174 break;
8176 case OP_DIGIT:
8177 for (i = 1; i <= min; i++)
8178 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
8179 break;
8181 case OP_NOT_WHITESPACE:
8182 for (i = 1; i <= min; i++)
8183 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
8184 break;
8186 case OP_WHITESPACE:
8187 for (i = 1; i <= min; i++)
8188 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
8189 break;
8191 case OP_NOT_WORDCHAR:
8192 for (i = 1; i <= min; i++)
8193 if ((md->ctypes[*eptr++] & ctype_word) != 0)
8194 RRETURN(MATCH_NOMATCH);
8195 break;
8197 case OP_WORDCHAR:
8198 for (i = 1; i <= min; i++)
8199 if ((md->ctypes[*eptr++] & ctype_word) == 0)
8200 RRETURN(MATCH_NOMATCH);
8201 break;
8203 default:
8204 RRETURN(PCRE_ERROR_INTERNAL);
8208 /* If min = max, continue at the same level without recursing */
8210 if (min == max) continue;
8212 /* If minimizing, we have to test the rest of the pattern before each
8213 subsequent match. Again, separate the UTF-8 case for speed, and also
8214 separate the UCP cases. */
8216 if (minimize)
8218 #ifdef SUPPORT_UCP
8219 if (prop_type > 0)
8221 for (fi = min;; fi++)
8223 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8224 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8225 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8226 GETCHARINC(c, eptr);
8227 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8228 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8229 RRETURN(MATCH_NOMATCH);
8233 /* Match extended Unicode sequences. We will get here only if the
8234 support is in the binary; otherwise a compile-time error occurs. */
8236 else if (ctype == OP_EXTUNI)
8238 for (fi = min;; fi++)
8240 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8241 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8242 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8243 GETCHARINCTEST(c, eptr);
8244 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8245 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
8246 while (eptr < md->end_subject)
8248 int len = 1;
8249 if (!md->utf8) c = *eptr; else
8251 GETCHARLEN(c, eptr, len);
8253 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8254 if (prop_category != ucp_M) break;
8255 eptr += len;
8260 else
8261 #endif /* SUPPORT_UCP */
8263 #ifdef SUPPORT_UTF8
8264 /* UTF-8 mode */
8265 if (md->utf8)
8267 for (fi = min;; fi++)
8269 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8270 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8271 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8273 GETCHARINC(c, eptr);
8274 switch(ctype)
8276 case OP_ANY:
8277 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
8278 break;
8280 case OP_ANYBYTE:
8281 break;
8283 case OP_NOT_DIGIT:
8284 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
8285 RRETURN(MATCH_NOMATCH);
8286 break;
8288 case OP_DIGIT:
8289 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
8290 RRETURN(MATCH_NOMATCH);
8291 break;
8293 case OP_NOT_WHITESPACE:
8294 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
8295 RRETURN(MATCH_NOMATCH);
8296 break;
8298 case OP_WHITESPACE:
8299 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
8300 RRETURN(MATCH_NOMATCH);
8301 break;
8303 case OP_NOT_WORDCHAR:
8304 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
8305 RRETURN(MATCH_NOMATCH);
8306 break;
8308 case OP_WORDCHAR:
8309 if (c >= 256 && (md->ctypes[c] & ctype_word) == 0)
8310 RRETURN(MATCH_NOMATCH);
8311 break;
8313 default:
8314 RRETURN(PCRE_ERROR_INTERNAL);
8318 else
8319 #endif
8320 /* Not UTF-8 mode */
8322 for (fi = min;; fi++)
8324 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8325 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8326 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8327 c = *eptr++;
8328 switch(ctype)
8330 case OP_ANY:
8331 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
8332 break;
8334 case OP_ANYBYTE:
8335 break;
8337 case OP_NOT_DIGIT:
8338 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
8339 break;
8341 case OP_DIGIT:
8342 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
8343 break;
8345 case OP_NOT_WHITESPACE:
8346 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
8347 break;
8349 case OP_WHITESPACE:
8350 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
8351 break;
8353 case OP_NOT_WORDCHAR:
8354 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
8355 break;
8357 case OP_WORDCHAR:
8358 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
8359 break;
8361 default:
8362 RRETURN(PCRE_ERROR_INTERNAL);
8366 /* Control never gets here */
8369 /* If maximizing it is worth using inline code for speed, doing the type
8370 test once at the start (i.e. keep it out of the loop). Again, keep the
8371 UTF-8 and UCP stuff separate. */
8373 else
8375 pp = eptr; /* Remember where we started */
8377 #ifdef SUPPORT_UCP
8378 if (prop_type > 0)
8380 for (i = min; i < max; i++)
8382 int len = 1;
8383 if (eptr >= md->end_subject) break;
8384 GETCHARLEN(c, eptr, len);
8385 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8386 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8387 break;
8388 eptr+= len;
8391 /* eptr is now past the end of the maximum run */
8393 for(;;)
8395 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8396 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8397 if (eptr-- == pp) break; /* Stop if tried at original pos */
8398 BACKCHAR(eptr);
8402 /* Match extended Unicode sequences. We will get here only if the
8403 support is in the binary; otherwise a compile-time error occurs. */
8405 else if (ctype == OP_EXTUNI)
8407 for (i = min; i < max; i++)
8409 if (eptr >= md->end_subject) break;
8410 GETCHARINCTEST(c, eptr);
8411 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8412 if (prop_category == ucp_M) break;
8413 while (eptr < md->end_subject)
8415 int len = 1;
8416 if (!md->utf8) c = *eptr; else
8418 GETCHARLEN(c, eptr, len);
8420 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8421 if (prop_category != ucp_M) break;
8422 eptr += len;
8426 /* eptr is now past the end of the maximum run */
8428 for(;;)
8430 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8431 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8432 if (eptr-- == pp) break; /* Stop if tried at original pos */
8433 for (;;) /* Move back over one extended */
8435 int len = 1;
8436 BACKCHAR(eptr);
8437 if (!md->utf8) c = *eptr; else
8439 GETCHARLEN(c, eptr, len);
8441 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8442 if (prop_category != ucp_M) break;
8443 eptr--;
8448 else
8449 #endif /* SUPPORT_UCP */
8451 #ifdef SUPPORT_UTF8
8452 /* UTF-8 mode */
8454 if (md->utf8)
8456 switch(ctype)
8458 case OP_ANY:
8460 /* Special code is required for UTF8, but when the maximum is unlimited
8461 we don't need it, so we repeat the non-UTF8 code. This is probably
8462 worth it, because .* is quite a common idiom. */
8464 if (max < INT_MAX)
8466 if ((ims & PCRE_DOTALL) == 0)
8468 for (i = min; i < max; i++)
8470 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8471 eptr++;
8472 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8475 else
8477 for (i = min; i < max; i++)
8479 eptr++;
8480 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8485 /* Handle unlimited UTF-8 repeat */
8487 else
8489 if ((ims & PCRE_DOTALL) == 0)
8491 for (i = min; i < max; i++)
8493 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8494 eptr++;
8496 break;
8498 else
8500 c = max - min;
8501 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8502 eptr += c;
8505 break;
8507 /* The byte case is the same as non-UTF8 */
8509 case OP_ANYBYTE:
8510 c = max - min;
8511 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8512 eptr += c;
8513 break;
8515 case OP_NOT_DIGIT:
8516 for (i = min; i < max; i++)
8518 int len = 1;
8519 if (eptr >= md->end_subject) break;
8520 GETCHARLEN(c, eptr, len);
8521 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
8522 eptr+= len;
8524 break;
8526 case OP_DIGIT:
8527 for (i = min; i < max; i++)
8529 int len = 1;
8530 if (eptr >= md->end_subject) break;
8531 GETCHARLEN(c, eptr, len);
8532 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
8533 eptr+= len;
8535 break;
8537 case OP_NOT_WHITESPACE:
8538 for (i = min; i < max; i++)
8540 int len = 1;
8541 if (eptr >= md->end_subject) break;
8542 GETCHARLEN(c, eptr, len);
8543 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
8544 eptr+= len;
8546 break;
8548 case OP_WHITESPACE:
8549 for (i = min; i < max; i++)
8551 int len = 1;
8552 if (eptr >= md->end_subject) break;
8553 GETCHARLEN(c, eptr, len);
8554 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
8555 eptr+= len;
8557 break;
8559 case OP_NOT_WORDCHAR:
8560 for (i = min; i < max; i++)
8562 int len = 1;
8563 if (eptr >= md->end_subject) break;
8564 GETCHARLEN(c, eptr, len);
8565 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
8566 eptr+= len;
8568 break;
8570 case OP_WORDCHAR:
8571 for (i = min; i < max; i++)
8573 int len = 1;
8574 if (eptr >= md->end_subject) break;
8575 GETCHARLEN(c, eptr, len);
8576 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
8577 eptr+= len;
8579 break;
8581 default:
8582 RRETURN(PCRE_ERROR_INTERNAL);
8585 /* eptr is now past the end of the maximum run */
8587 for(;;)
8589 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8590 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8591 if (eptr-- == pp) break; /* Stop if tried at original pos */
8592 BACKCHAR(eptr);
8595 else
8596 #endif
8598 /* Not UTF-8 mode */
8600 switch(ctype)
8602 case OP_ANY:
8603 if ((ims & PCRE_DOTALL) == 0)
8605 for (i = min; i < max; i++)
8607 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8608 eptr++;
8610 break;
8612 /* For DOTALL case, fall through and treat as \C */
8614 case OP_ANYBYTE:
8615 c = max - min;
8616 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8617 eptr += c;
8618 break;
8620 case OP_NOT_DIGIT:
8621 for (i = min; i < max; i++)
8623 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
8624 break;
8625 eptr++;
8627 break;
8629 case OP_DIGIT:
8630 for (i = min; i < max; i++)
8632 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
8633 break;
8634 eptr++;
8636 break;
8638 case OP_NOT_WHITESPACE:
8639 for (i = min; i < max; i++)
8641 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
8642 break;
8643 eptr++;
8645 break;
8647 case OP_WHITESPACE:
8648 for (i = min; i < max; i++)
8650 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
8651 break;
8652 eptr++;
8654 break;
8656 case OP_NOT_WORDCHAR:
8657 for (i = min; i < max; i++)
8659 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
8660 break;
8661 eptr++;
8663 break;
8665 case OP_WORDCHAR:
8666 for (i = min; i < max; i++)
8668 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
8669 break;
8670 eptr++;
8672 break;
8674 default:
8675 RRETURN(PCRE_ERROR_INTERNAL);
8678 /* eptr is now past the end of the maximum run */
8680 while (eptr >= pp)
8682 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8683 eptr--;
8684 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8688 /* Get here if we can't make it match with any permitted repetitions */
8690 RRETURN(MATCH_NOMATCH);
8692 /* Control never gets here */
8694 /* There's been some horrible disaster. Since all codes > OP_BRA are
8695 for capturing brackets, and there shouldn't be any gaps between 0 and
8696 OP_BRA, arrival here can only mean there is something seriously wrong
8697 in the code above or the OP_xxx definitions. */
8699 default:
8700 DPRINTF(("Unknown opcode %d\n", *ecode));
8701 RRETURN(PCRE_ERROR_UNKNOWN_NODE);
8704 /* Do not stick any code in here without much thought; it is assumed
8705 that "continue" in the code above comes out to here to repeat the main
8706 loop. */
8708 } /* End of main loop */
8709 /* Control never reaches here */
8713 /***************************************************************************
8714 ****************************************************************************
8715 RECURSION IN THE match() FUNCTION
8717 Undefine all the macros that were defined above to handle this. */
8719 #ifdef NO_RECURSE /* the names below are undefined only in NO_RECURSE builds */
8720 #undef eptr
8721 #undef ecode
8722 #undef offset_top
8723 #undef ims
8724 #undef eptrb
8725 #undef flags
8727 #undef callpat
8728 #undef charptr
8729 #undef data
8730 #undef next
8731 #undef pp
8732 #undef prev
8733 #undef saved_eptr
8735 #undef new_recursive
8737 #undef cur_is_word
8738 #undef condition
8739 #undef minimize
8740 #undef prev_is_word
8742 #undef original_ims
8744 #undef ctype
8745 #undef length
8746 #undef max
8747 #undef min
8748 #undef number
8749 #undef offset
8750 #undef op
8751 #undef save_capture_last
8752 #undef save_offset1
8753 #undef save_offset2
8754 #undef save_offset3
8755 #undef stacksave
8757 #undef newptrb
8759 #endif /* NO_RECURSE */
8761 /* fc and fi are defined as macros both with and without NO_RECURSE, so they
     are undefined unconditionally, outside the #ifdef above. */
8763 #undef fc
8764 #undef fi
8766 /***************************************************************************
8767 ***************************************************************************/
8771 /*************************************************
8772 * Execute a Regular Expression *
8773 *************************************************/
8775 /* This function applies a compiled re to a subject string and picks out
8776 portions of the string if it matches. Two elements in the vector are set for
8777 each substring: the offsets to the start and end of the substring.
8779 Arguments:
8780 argument_re points to the compiled expression
8781 extra_data points to extra data or is NULL
8782 subject points to the subject string
8783 length length of subject string (may contain binary zeros)
8784 start_offset where to start in the subject string
8785 options option bits
8786 offsets points to a vector of ints to be filled in with offsets
8787 offsetcount the number of elements in the vector
8789 Returns: > 0 => success; value is the number of elements filled in
8790 = 0 => success, but offsets is not big enough
8791 -1 => failed to match
8792 < -1 => some kind of unexpected problem
8795 EXPORT int
8796 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
8797 const char *subject, int length, int start_offset, int options, int *offsets,
8798 int offsetcount)
8800 int rc, resetcount, ocount;
8801 int first_byte = -1;
8802 int req_byte = -1;
8803 int req_byte2 = -1;
8804 unsigned long int ims = 0;
8805 BOOL using_temporary_offsets = FALSE;
8806 BOOL anchored;
8807 BOOL startline;
8808 BOOL first_byte_caseless = FALSE;
8809 BOOL req_byte_caseless = FALSE;
8810 match_data match_block;
8811 const uschar *tables;
8812 const uschar *start_bits = NULL;
8813 const uschar *start_match = (const uschar *)subject + start_offset;
8814 const uschar *end_subject;
8815 const uschar *req_byte_ptr = start_match - 1;
8817 pcre_study_data internal_study;
8818 const pcre_study_data *study;
8820 real_pcre internal_re;
8821 const real_pcre *external_re = (const real_pcre *)argument_re;
8822 const real_pcre *re = external_re;
8824 /* Plausibility checks */
8826 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
8827 if (re == NULL || subject == NULL ||
8828 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
8829 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
8831 /* Fish out the optional data from the extra_data structure, first setting
8832 the default values. */
8834 study = NULL;
8835 match_block.match_limit = MATCH_LIMIT;
8836 match_block.callout_data = NULL;
8838 /* The table pointer is always in native byte order. */
8840 tables = external_re->tables;
8842 if (extra_data != NULL)
8844 register unsigned int flags = extra_data->flags;
8845 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
8846 study = (const pcre_study_data *)extra_data->study_data;
8847 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
8848 match_block.match_limit = extra_data->match_limit;
8849 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
8850 match_block.callout_data = extra_data->callout_data;
8851 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
8854 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
8855 is a feature that makes it possible to save compiled regexes and re-use them
8856 in other programs later. */
8858 if (tables == NULL) tables = pcre_default_tables;
8860 /* Check that the first field in the block is the magic number. If it is not,
8861 test for a regex that was compiled on a host of opposite endianness. If this is
8862 the case, flipped values are put in internal_re and internal_study if there was
8863 study data too. */
8865 if (re->magic_number != MAGIC_NUMBER)
8867 re = try_flipped(re, &internal_re, study, &internal_study);
8868 if (re == NULL) return PCRE_ERROR_BADMAGIC;
8869 if (study != NULL) study = &internal_study;
8872 /* Set up other data */
8874 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
8875 startline = (re->options & PCRE_STARTLINE) != 0;
8877 /* The code starts after the real_pcre block and the capture name table. */
8879 match_block.start_code = (const uschar *)external_re + re->name_table_offset +
8880 re->name_count * re->name_entry_size;
8882 match_block.start_subject = (const uschar *)subject;
8883 match_block.start_offset = start_offset;
8884 match_block.end_subject = match_block.start_subject + length;
8885 end_subject = match_block.end_subject;
8887 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
8888 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
8890 match_block.notbol = (options & PCRE_NOTBOL) != 0;
8891 match_block.noteol = (options & PCRE_NOTEOL) != 0;
8892 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
8893 match_block.partial = (options & PCRE_PARTIAL) != 0;
8894 match_block.hitend = FALSE;
8896 match_block.recursive = NULL; /* No recursion at top level */
8898 match_block.lcc = tables + lcc_offset;
8899 match_block.ctypes = tables + ctypes_offset;
8901 /* Partial matching is supported only for a restricted set of regexes at the
8902 moment. */
8904 if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
8905 return PCRE_ERROR_BADPARTIAL;
8907 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
8908 back the character offset. */
8910 #ifdef SUPPORT_UTF8
8911 if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
8913 if (valid_utf8((uschar *)subject, length) >= 0)
8914 return PCRE_ERROR_BADUTF8;
8915 if (start_offset > 0 && start_offset < length)
8917 int tb = ((uschar *)subject)[start_offset];
8918 if (tb > 127)
8920 tb &= 0xc0;
8921 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
8925 #endif
8927 /* The ims options can vary during the matching as a result of the presence
8928 of (?ims) items in the pattern. They are kept in a local variable so that
8929 restoring at the exit of a group is easy. */
8931 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
8933 /* If the expression has got more back references than the offsets supplied can
8934 hold, we get a temporary chunk of working store to use during the matching.
8935 Otherwise, we can use the vector supplied, rounding down its size to a multiple
8936 of 3. */
8938 ocount = offsetcount - (offsetcount % 3);
8940 if (re->top_backref > 0 && re->top_backref >= ocount/3)
8942 ocount = re->top_backref * 3 + 3;
8943 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
8944 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
8945 using_temporary_offsets = TRUE;
8946 DPRINTF(("Got memory to hold back references\n"));
8948 else match_block.offset_vector = offsets;
8950 match_block.offset_end = ocount;
8951 match_block.offset_max = (2*ocount)/3;
8952 match_block.offset_overflow = FALSE;
8953 match_block.capture_last = -1;
8955 /* Compute the minimum number of offsets that we need to reset each time. Doing
8956 this makes a huge difference to execution time when there aren't many brackets
8957 in the pattern. */
8959 resetcount = 2 + re->top_bracket * 2;
8960 if (resetcount > offsetcount) resetcount = ocount;
8962 /* Reset the working variables associated with each extraction. These should
8963 never be used unless previously set, but they get saved and restored, and so we
8964 initialize them to avoid reading uninitialized locations. */
8966 if (match_block.offset_vector != NULL)
8968 register int *iptr = match_block.offset_vector + ocount;
8969 register int *iend = iptr - resetcount/2 + 1;
8970 while (--iptr >= iend) *iptr = -1;
8973 /* Set up the first character to match, if available. The first_byte value is
8974 never set for an anchored regular expression, but the anchoring may be forced
8975 at run time, so we have to test for anchoring. The first char may be unset for
8976 an unanchored pattern, of course. If there's no first char and the pattern was
8977 studied, there may be a bitmap of possible first characters. */
8979 if (!anchored)
8981 if ((re->options & PCRE_FIRSTSET) != 0)
8983 first_byte = re->first_byte & 255;
8984 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
8985 first_byte = match_block.lcc[first_byte];
8987 else
8988 if (!startline && study != NULL &&
8989 (study->options & PCRE_STUDY_MAPPED) != 0)
8990 start_bits = study->start_bits;
8993 /* For anchored or unanchored matches, there may be a "last known required
8994 character" set. */
8996 if ((re->options & PCRE_REQCHSET) != 0)
8998 req_byte = re->req_byte & 255;
8999 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
9000 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
9003 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
9004 the loop runs just once. */
9008 /* Reset the maximum number of extractions we might see. */
9010 if (match_block.offset_vector != NULL)
9012 register int *iptr = match_block.offset_vector;
9013 register int *iend = iptr + resetcount;
9014 while (iptr < iend) *iptr++ = -1;
9017 /* Advance to a unique first char if possible */
9019 if (first_byte >= 0)
9021 if (first_byte_caseless)
9022 while (start_match < end_subject &&
9023 match_block.lcc[*start_match] != first_byte)
9024 start_match++;
9025 else
9026 while (start_match < end_subject && *start_match != first_byte)
9027 start_match++;
9030 /* Or to just after \n for a multiline match if possible */
9032 else if (startline)
9034 if (start_match > match_block.start_subject + start_offset)
9036 while (start_match < end_subject && start_match[-1] != NEWLINE)
9037 start_match++;
9041 /* Or to a non-unique first char after study */
9043 else if (start_bits != NULL)
9045 while (start_match < end_subject)
9047 register unsigned int c = *start_match;
9048 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
9052 #ifdef DEBUG /* Sigh. Some compilers never learn. */
9053 printf(">>>> Match against: ");
9054 pchars(start_match, end_subject - start_match, TRUE, &match_block);
9055 printf("\n");
9056 #endif
9058 /* If req_byte is set, we know that that character must appear in the subject
9059 for the match to succeed. If the first character is set, req_byte must be
9060 later in the subject; otherwise the test starts at the match point. This
9061 optimization can save a huge amount of backtracking in patterns with nested
9062 unlimited repeats that aren't going to match. Writing separate code for
9063 cased/caseless versions makes it go faster, as does using an autoincrement
9064 and backing off on a match.
9066 HOWEVER: when the subject string is very, very long, searching to its end can
9067 take a long time, and give bad performance on quite ordinary patterns. This
9068 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
9069 don't do this when the string is sufficiently long.
9071 ALSO: this processing is disabled when partial matching is requested.
9074 if (req_byte >= 0 &&
9075 end_subject - start_match < REQ_BYTE_MAX &&
9076 !match_block.partial)
9078 register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
9080 /* We don't need to repeat the search if we haven't yet reached the
9081 place we found it at last time. */
9083 if (p > req_byte_ptr)
9085 if (req_byte_caseless)
9087 while (p < end_subject)
9089 register int pp = *p++;
9090 if (pp == req_byte || pp == req_byte2) { p--; break; }
9093 else
9095 while (p < end_subject)
9097 if (*p++ == req_byte) { p--; break; }
9101 /* If we can't find the required character, break the matching loop */
9103 if (p >= end_subject) break;
9105 /* If we have found the required character, save the point where we
9106 found it, so that we don't search again next time round the loop if
9107 the start hasn't passed this character yet. */
9109 req_byte_ptr = p;
9113 /* When a match occurs, substrings will be set for all internal extractions;
9114 we just need to set up the whole thing as substring 0 before returning. If
9115 there were too many extractions, set the return code to zero. In the case
9116 where we had to get some local store to hold offsets for backreferences, copy
9117 those back references that we can. In this case there need not be overflow
9118 if certain parts of the pattern were not used. */
9120 match_block.start_match = start_match;
9121 match_block.match_call_count = 0;
9123 rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
9124 match_isgroup);
9126 if (rc == MATCH_NOMATCH)
9128 start_match++;
9129 #ifdef SUPPORT_UTF8
9130 if (match_block.utf8)
9131 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
9132 start_match++;
9133 #endif
9134 continue;
9137 if (rc != MATCH_MATCH)
9139 DPRINTF((">>>> error: returning %d\n", rc));
9140 return rc;
9143 /* We have a match! Copy the offset information from temporary store if
9144 necessary */
9146 if (using_temporary_offsets)
9148 if (offsetcount >= 4)
9150 memcpy(offsets + 2, match_block.offset_vector + 2,
9151 (offsetcount - 2) * sizeof(int));
9152 DPRINTF(("Copied offsets from temporary memory\n"));
9154 if (match_block.end_offset_top > offsetcount)
9155 match_block.offset_overflow = TRUE;
9157 DPRINTF(("Freeing temporary memory\n"));
9158 (pcre_free)(match_block.offset_vector);
9161 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
9163 if (offsetcount < 2) rc = 0; else
9165 offsets[0] = start_match - match_block.start_subject;
9166 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
9169 DPRINTF((">>>> returning %d\n", rc));
9170 return rc;
9173 /* This "while" is the end of the "do" above */
9175 while (!anchored && start_match <= end_subject);
9177 if (using_temporary_offsets)
9179 DPRINTF(("Freeing temporary memory\n"));
9180 (pcre_free)(match_block.offset_vector);
9183 if (match_block.partial && match_block.hitend)
9185 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
9186 return PCRE_ERROR_PARTIAL;
9188 else
9190 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
9191 return PCRE_ERROR_NOMATCH;
9195 /* End of pcre.c */