Bug 591908: Remove Math.floor from microbenchmark iterations/sec calculation (r=fklockii)
[tamarin-stm.git] / pcre / pcre_compile.cpp
blob222cc16516e09f83dfdfa408d2cb1e54274ca7cb
1 #include "avmplus.h"
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
7 /* PCRE is a library of functions to support regular expressions whose syntax
8 and semantics are as close as possible to those of the Perl 5 language.
10 Written by Philip Hazel
11 Copyright (c) 1997-2007 University of Cambridge
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
43 /* This module contains the external function pcre_compile(), along with
44 supporting internal functions that are not used by other modules. */
47 #include "config.h"
49 #define NLBLOCK cd /* Block containing newline information */
50 #define PSSTART start_pattern /* Field containing processed string start */
51 #define PSEND end_pattern /* Field containing processed string end */
53 #include "pcre_internal.h"
56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
57 used by pcretest. DEBUG is not defined when building a production library. */
59 #ifdef PCRE_DEBUG
60 #include "pcre_printint.src"
61 #endif
64 /* Macro for setting individual bits in class bitmaps. */
66 #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
68 /* Maximum length value to check against when making sure that the integer that
69 holds the compiled pattern length does not overflow. We make it a bit less than
70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
71 to check them every time. */
73 #define OFLOW_MAX (INT_MAX - 20)
76 /*************************************************
77 * Code parameters and static tables *
78 *************************************************/
80 /* This value specifies the size of stack workspace that is used during the
81 first pre-compile phase that determines how much memory is required. The regex
82 is partly compiled into this space, but the compiled parts are discarded as
83 soon as they can be, so that hopefully there will never be an overrun. The code
84 does, however, check for an overrun. The largest amount I've seen used is 218,
85 so this number is very generous.
87 The same workspace is used during the second, actual compile phase for
88 remembering forward references to groups so that they can be filled in at the
89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
90 is 4 there is plenty of room. */
92 //#define COMPILE_WORK_SIZE (4096)
93 #define COMPILE_WORK_SIZE (2048) //tierney: smaller size to avoid stack overflow when compiling patterns with many nested ()'s
96 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
97 are simple data values; negative values are for special things like \d and so
98 on. Zero means further processing is needed (for things like \x), or the escape
99 is invalid. */
101 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
102 static const short int escapes[] = {
103 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
104 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
105 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
106 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
107 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
108 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
109 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
110 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
111 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
112 0, 0, -ESC_z /* x - z */
115 #else /* This is the "abnormal" table for EBCDIC systems */
116 static const short int escapes[] = {
117 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
118 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
119 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
120 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
121 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
122 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
123 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
124 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
125 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
126 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
127 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
128 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
129 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
130 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
131 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
132 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
133 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
134 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
135 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
136 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
137 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
138 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
139 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
141 #endif
144 /* Table of special "verbs" like (*PRUNE) */
146 typedef struct verbitem {
147 const char *name;
148 int len;
149 int op;
150 } verbitem;
152 static verbitem verbs[] = {
153 { "ACCEPT", 6, OP_ACCEPT },
154 { "COMMIT", 6, OP_COMMIT },
155 { "F", 1, OP_FAIL },
156 { "FAIL", 4, OP_FAIL },
157 { "PRUNE", 5, OP_PRUNE },
158 { "SKIP", 4, OP_SKIP },
159 { "THEN", 4, OP_THEN }
162 static int verbcount = sizeof(verbs)/sizeof(verbitem);
165 /* Tables of names of POSIX character classes and their lengths. The list is
166 terminated by a zero length entry. The first three must be alpha, lower, upper,
167 as this is assumed for handling case independence. */
169 static const char *const posix_names[] = {
170 "alpha", "lower", "upper",
171 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
172 "print", "punct", "space", "word", "xdigit" };
174 static const uschar posix_name_lengths[] = {
175 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
177 /* Table of class bit maps for each POSIX class. Each class is formed from a
178 base map, with an optional addition or removal of another map. Then, for some
179 classes, there is some additional tweaking: for [:blank:] the vertical space
180 characters are removed, and for [:alpha:] and [:alnum:] the underscore
181 character is removed. The triples in the table consist of the base map offset,
182 second map offset or -1 if no second map, and a non-negative value for map
183 addition or a negative value for map subtraction (if there are two maps). The
184 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
185 remove vertical space characters, 2 => remove underscore. */
187 static const int posix_class_maps[] = {
188 cbit_word, cbit_digit, -2, /* alpha */
189 cbit_lower, -1, 0, /* lower */
190 cbit_upper, -1, 0, /* upper */
191 cbit_word, -1, 2, /* alnum - word without underscore */
192 cbit_print, cbit_cntrl, 0, /* ascii */
193 cbit_space, -1, 1, /* blank - a GNU extension */
194 cbit_cntrl, -1, 0, /* cntrl */
195 cbit_digit, -1, 0, /* digit */
196 cbit_graph, -1, 0, /* graph */
197 cbit_print, -1, 0, /* print */
198 cbit_punct, -1, 0, /* punct */
199 cbit_space, -1, 0, /* space */
200 cbit_word, -1, 0, /* word - a Perl extension */
201 cbit_xdigit,-1, 0 /* xdigit */
205 #define STRING(a) # a
206 #define XSTRING(s) STRING(s)
208 /* The texts of compile-time error messages. These are "char *" because they
209 are passed to the outside world. Do not ever re-use any error number, because
210 they are documented. Always add a new error instead. Messages marked DEAD below
211 are no longer used. */
213 static const char *error_texts[] = {
214 #ifdef AVMPLUS_PCRE
220 /* 5 */
226 /* 10 */
232 /* 15 */
238 /* 20 */
244 /* 25 */
250 /* 30 */
256 /* 35 */
262 /* 40 */
268 /* 45 */
274 /* 50 */
275 "", /** DEAD **/
280 /* 55 */
286 /* 60 */
289 #else // AVMPLUS_PCRE
290 "no error",
291 "\\ at end of pattern",
292 "\\c at end of pattern",
293 "unrecognized character follows \\",
294 "numbers out of order in {} quantifier",
295 /* 5 */
296 "number too big in {} quantifier",
297 "missing terminating ] for character class",
298 "invalid escape sequence in character class",
299 "range out of order in character class",
300 "nothing to repeat",
301 /* 10 */
302 "operand of unlimited repeat could match the empty string", /** DEAD **/
303 "internal error: unexpected repeat",
304 "unrecognized character after (?",
305 "POSIX named classes are supported only within a class",
306 "missing )",
307 /* 15 */
308 "reference to non-existent subpattern",
309 "erroffset passed as NULL",
310 "unknown option bit(s) set",
311 "missing ) after comment",
312 "parentheses nested too deeply", /** DEAD **/
313 /* 20 */
314 "regular expression is too large",
315 "failed to get memory",
316 "unmatched parentheses",
317 "internal error: code overflow",
318 "unrecognized character after (?<",
319 /* 25 */
320 "lookbehind assertion is not fixed length",
321 "malformed number or name after (?(",
322 "conditional group contains more than two branches",
323 "assertion expected after (?(",
324 "(?R or (?[+-]digits must be followed by )",
325 /* 30 */
326 "unknown POSIX class name",
327 "POSIX collating elements are not supported",
328 "this version of PCRE is not compiled with PCRE_UTF8 support",
329 "spare error", /** DEAD **/
330 "character value in \\x{...} sequence is too large",
331 /* 35 */
332 "invalid condition (?(0)",
333 "\\C not allowed in lookbehind assertion",
334 "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
335 "number after (?C is > 255",
336 "closing ) for (?C expected",
337 /* 40 */
338 "recursive call could loop indefinitely",
339 "unrecognized character after (?P",
340 "syntax error in subpattern name (missing terminator)",
341 "two named subpatterns have the same name",
342 "invalid UTF-8 string",
343 /* 45 */
344 "support for \\P, \\p, and \\X has not been compiled",
345 "malformed \\P or \\p sequence",
346 "unknown property name after \\P or \\p",
347 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
348 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
349 /* 50 */
350 "repeated subpattern is too long", /** DEAD **/
351 "octal value is greater than \\377 (not in UTF-8 mode)",
352 "internal error: overran compiling workspace",
353 "internal error: previously-checked referenced subpattern not found",
354 "DEFINE group contains more than one branch",
355 /* 55 */
356 "repeating a DEFINE group is not allowed",
357 "inconsistent NEWLINE options",
358 "\\g is not followed by a braced name or an optionally braced non-zero number",
359 "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
360 "(*VERB) with an argument is not supported",
361 /* 60 */
362 "(*VERB) not recognized",
363 "number is too big"
364 #endif // AVMPLUS_PCRE
368 /* Table to identify digits and hex digits. This is used when compiling
369 patterns. Note that the tables in chartables are dependent on the locale, and
370 may mark arbitrary characters as digits - but the PCRE compiling code expects
371 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
372 a private table here. It costs 256 bytes, but it is a lot faster than doing
373 character value tests (at least in some simple cases I timed), and in some
374 applications one wants PCRE to compile efficiently as well as match
375 efficiently.
377 For convenience, we use the same bit definitions as in chartables:
379 0x04 decimal digit
380 0x08 hexadecimal digit
382 Then we can use ctype_digit and ctype_xdigit in the code. */
384 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
385 static const unsigned char digitab[] =
387 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
388 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
392 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
393 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
394 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
395 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
396 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
397 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
398 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
399 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
400 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
401 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
402 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
403 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
404 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
405 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
406 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
407 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
408 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
409 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
410 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
411 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
412 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
413 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
414 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
415 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
416 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
417 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
418 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
420 #else /* This is the "abnormal" case, for EBCDIC systems */
421 static const unsigned char digitab[] =
423 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
424 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
425 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
426 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
427 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
428 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
429 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
430 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
431 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
432 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
433 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
434 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
435 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
436 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
437 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
438 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
439 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
440 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
441 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
442 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
443 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
444 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
445 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
446 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
447 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
448 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
449 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
450 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
451 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
452 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
453 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
454 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
456 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
457 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
458 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
459 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
460 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
461 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
462 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
463 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
464 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
465 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
466 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
467 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
468 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
469 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
470 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
471 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
472 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
473 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
474 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
475 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
476 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
477 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
478 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
479 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
480 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
481 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
482 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
483 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
484 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
485 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
486 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
487 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
488 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
489 #endif
492 /* Definition to allow mutual recursion */
494 static BOOL
495 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
496 int *, int *, branch_chain *, compile_data *, int *);
500 /*************************************************
501 * Handle escapes *
502 *************************************************/
504 /* This function is called when a \ has been encountered. It either returns a
505 positive value for a simple escape such as \n, or a negative value which
506 encodes one of the more complicated things such as \d. A backreference to group
507 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
508 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
509 ptr is pointing at the \. On exit, it is on the final character of the escape
510 sequence.
512 Arguments:
513 ptrptr points to the pattern position pointer
514 errorcodeptr points to the errorcode variable
515 bracount number of previous extracting brackets
516 options the options bits
517 isclass TRUE if inside a character class
519 Returns: zero or positive => a data character
520 negative => a special escape sequence
521 on error, errorcodeptr is set
524 static int
525 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
526 int options, BOOL isclass)
528 BOOL utf8 = (options & PCRE_UTF8) != 0;
529 const uschar *ptr = *ptrptr + 1;
530 int c, i;
532 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
533 ptr--; /* Set pointer back to the last byte */
535 /* If backslash is at the end of the pattern, it's an error. */
537 if (c == 0) *errorcodeptr = ERR1;
539 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
540 a table. A non-zero result is something that can be returned immediately.
541 Otherwise further processing may be required. */
543 #ifndef EBCDIC /* ASCII coding */
544 else if (c < '0' || c > 'z') {} /* Not alphameric */
545 else if ((i = escapes[c - '0']) != 0) c = i;
547 #else /* EBCDIC coding */
548 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
549 else if ((i = escapes[c - 0x48]) != 0) c = i;
550 #endif
552 /* Escapes that need further processing, or are illegal. */
554 else
556 const uschar *oldptr;
557 BOOL braced, negated;
559 switch (c)
561 /* A number of Perl escapes are not handled by PCRE. We give an explicit
562 error. */
564 case 'l':
565 case 'L':
566 case 'N':
567 case 'u':
568 case 'U':
569 *errorcodeptr = ERR37;
570 break;
572 /* \g must be followed by a number, either plain or braced. If positive, it
573 is an absolute backreference. If negative, it is a relative backreference.
574 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
575 reference to a named group. This is part of Perl's movement towards a
576 unified syntax for back references. As this is synonymous with \k{name}, we
577 fudge it up by pretending it really was \k. */
579 case 'g':
580 if (ptr[1] == '{')
582 const uschar *p;
583 for (p = ptr+2; *p != 0 && *p != '}'; p++)
584 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
585 if (*p != 0 && *p != '}')
587 c = -ESC_k;
588 break;
590 braced = TRUE;
591 ptr++;
593 else braced = FALSE;
595 if (ptr[1] == '-')
597 negated = TRUE;
598 ptr++;
600 else negated = FALSE;
602 c = 0;
603 while ((digitab[ptr[1]] & ctype_digit) != 0)
604 c = c * 10 + *(++ptr) - '0';
606 if (c < 0)
608 *errorcodeptr = ERR61;
609 break;
612 if (c == 0 || (braced && *(++ptr) != '}'))
614 *errorcodeptr = ERR57;
615 break;
618 if (negated)
620 if (c > bracount)
622 *errorcodeptr = ERR15;
623 break;
625 c = bracount - (c - 1);
628 c = -(ESC_REF + c);
629 break;
631 /* The handling of escape sequences consisting of a string of digits
632 starting with one that is not zero is not straightforward. By experiment,
633 the way Perl works seems to be as follows:
635 Outside a character class, the digits are read as a decimal number. If the
636 number is less than 10, or if there are that many previous extracting
637 left brackets, then it is a back reference. Otherwise, up to three octal
638 digits are read to form an escaped byte. Thus \123 is likely to be octal
639 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
640 value is greater than 377, the least significant 8 bits are taken. Inside a
641 character class, \ followed by a digit is always an octal number. */
643 case '1': case '2': case '3': case '4': case '5':
644 case '6': case '7': case '8': case '9':
646 if (!isclass)
648 oldptr = ptr;
649 c -= '0';
650 while ((digitab[ptr[1]] & ctype_digit) != 0)
651 c = c * 10 + *(++ptr) - '0';
652 if (c < 0)
654 *errorcodeptr = ERR61;
655 break;
657 if (c < 10 || c <= bracount)
659 c = -(ESC_REF + c);
660 break;
662 ptr = oldptr; /* Put the pointer back and fall through */
665 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
666 generates a binary zero byte and treats the digit as a following literal.
667 Thus we have to pull back the pointer by one. */
669 if ((c = *ptr) >= '8')
671 ptr--;
672 c = 0;
673 break;
676 /* \0 always starts an octal number, but we may drop through to here with a
677 larger first octal digit. The original code used just to take the least
678 significant 8 bits of octal numbers (I think this is what early Perls used
679 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
680 than 3 octal digits. */
682 case '0':
683 c -= '0';
684 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
685 c = c * 8 + *(++ptr) - '0';
686 if (!utf8 && c > 255) *errorcodeptr = ERR51;
687 break;
689 /* \x is complicated. \x{ddd} is a character number which can be greater
690 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
691 treated as a data character. */
693 case 'x':
694 if (ptr[1] == '{')
696 const uschar *pt = ptr + 2;
697 int count = 0;
699 c = 0;
700 while ((digitab[*pt] & ctype_xdigit) != 0)
702 register int cc = *pt++;
703 if (c == 0 && cc == '0') continue; /* Leading zeroes */
704 count++;
706 #ifndef EBCDIC /* ASCII coding */
707 if (cc >= 'a') cc -= 32; /* Convert to upper case */
708 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
709 #else /* EBCDIC coding */
710 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
711 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
712 #endif
715 if (*pt == '}')
717 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
718 ptr = pt;
719 break;
722 /* If the sequence of hex digits does not end with '}', then we don't
723 recognize this construct; fall through to the normal \x handling. */
726 /* Read just a single-byte hex-defined char */
728 c = 0;
729 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
731 int cc; /* Some compilers don't like ++ */
732 cc = *(++ptr); /* in initializers */
733 #ifndef EBCDIC /* ASCII coding */
734 if (cc >= 'a') cc -= 32; /* Convert to upper case */
735 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
736 #else /* EBCDIC coding */
737 if (cc <= 'z') cc += 64; /* Convert to upper case */
738 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
739 #endif
741 break;
743 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
744 This coding is ASCII-specific, but then the whole concept of \cx is
745 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
747 case 'c':
748 c = *(++ptr);
749 if (c == 0)
751 *errorcodeptr = ERR2;
752 break;
755 #ifndef EBCDIC /* ASCII coding */
756 if (c >= 'a' && c <= 'z') c -= 32;
757 c ^= 0x40;
758 #else /* EBCDIC coding */
759 if (c >= 'a' && c <= 'z') c += 64;
760 c ^= 0xC0;
761 #endif
762 break;
764 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
765 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
766 for Perl compatibility, it is a literal. This code looks a bit odd, but
767 there used to be some cases other than the default, and there may be again
768 in future, so I haven't "optimized" it. */
770 default:
771 #ifdef AVMPLUS_PCRE
772 if ((options & PCRE_EXTRA) != 0)
774 *errorcodeptr = ERR3;
776 #else
777 if ((options & PCRE_EXTRA) != 0) switch(c)
779 default:
780 *errorcodeptr = ERR3;
781 break;
783 #endif
784 break;
788 *ptrptr = ptr;
789 return c;
794 #ifdef SUPPORT_UCP
795 /*************************************************
796 * Handle \P and \p *
797 *************************************************/
799 /* This function is called after \P or \p has been encountered, provided that
800 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
801 pointing at the P or p. On exit, it is pointing at the final character of the
802 escape sequence.
804 Argument:
805 ptrptr points to the pattern position pointer
806 negptr points to a boolean that is set TRUE for negation else FALSE
807 dptr points to an int that is set to the detailed property value
808 errorcodeptr points to the error code variable
810 Returns: type value from ucp_type_table, or -1 for an invalid type
813 static int
814 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
816 int c, i, bot, top;
817 const uschar *ptr = *ptrptr;
818 char name[32];
820 c = *(++ptr);
821 if (c == 0) goto ERROR_RETURN;
823 *negptr = FALSE;
825 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
826 negation. */
828 if (c == '{')
830 if (ptr[1] == '^')
832 *negptr = TRUE;
833 ptr++;
835 for (i = 0; i < (int)sizeof(name) - 1; i++)
837 c = *(++ptr);
838 if (c == 0) goto ERROR_RETURN;
839 if (c == '}') break;
840 name[i] = c;
842 if (c !='}') goto ERROR_RETURN;
843 name[i] = 0;
846 /* Otherwise there is just one following character */
848 else
850 name[0] = c;
851 name[1] = 0;
854 *ptrptr = ptr;
856 /* Search for a recognized property name using binary chop */
858 bot = 0;
859 top = _pcre_utt_size;
861 while (bot < top)
863 i = (bot + top) >> 1;
864 c = VMPI_strcmp(name, _pcre_utt[i].name);
865 if (c == 0)
867 *dptr = _pcre_utt[i].value;
868 return _pcre_utt[i].type;
870 if (c > 0) bot = i + 1; else top = i;
873 *errorcodeptr = ERR47;
874 *ptrptr = ptr;
875 return -1;
877 ERROR_RETURN:
878 *errorcodeptr = ERR46;
879 *ptrptr = ptr;
880 return -1;
882 #endif
887 /*************************************************
888 * Check for counted repeat *
889 *************************************************/
891 /* This function is called when a '{' is encountered in a place where it might
892 start a quantifier. It looks ahead to see if it really is a quantifier or not.
893 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
894 where the ddds are digits.
896 Arguments:
897 p pointer to the first char after '{'
899 Returns: TRUE or FALSE
902 static BOOL
903 is_counted_repeat(const uschar *p)
905 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
906 while ((digitab[*p] & ctype_digit) != 0) p++;
907 if (*p == '}') return TRUE;
909 if (*p++ != ',') return FALSE;
910 if (*p == '}') return TRUE;
912 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
913 while ((digitab[*p] & ctype_digit) != 0) p++;
915 return (*p == '}');
920 /*************************************************
921 * Read repeat counts *
922 *************************************************/
924 /* Read an item of the form {n,m} and return the values. This is called only
925 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
926 so the syntax is guaranteed to be correct, but we need to check the values.
928 Arguments:
929 p pointer to first char after '{'
930 minp pointer to int for min
931 maxp pointer to int for max
932 returned as -1 if no max
933 errorcodeptr points to error code variable
935 Returns: pointer to '}' on success;
936 current ptr on error, with errorcodeptr set non-zero
939 static const uschar *
940 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
942 int min = 0;
943 int max = -1;
945 /* Read the minimum value and do a paranoid check: a negative value indicates
946 an integer overflow. */
948 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
949 if (min < 0 || min > 65535)
951 *errorcodeptr = ERR5;
952 return p;
955 /* Read the maximum value if there is one, and again do a paranoid on its size.
956 Also, max must not be less than min. */
958 if (*p == '}') max = min; else
960 if (*(++p) != '}')
962 max = 0;
963 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
964 if (max < 0 || max > 65535)
966 *errorcodeptr = ERR5;
967 return p;
969 if (max < min)
971 *errorcodeptr = ERR4;
972 return p;
977 /* Fill in the required variables, and pass back the pointer to the terminating
978 '}'. */
980 *minp = min;
981 *maxp = max;
982 return p;
987 /*************************************************
988 * Find forward referenced subpattern *
989 *************************************************/
991 /* This function scans along a pattern's text looking for capturing
992 subpatterns, and counting them. If it finds a named pattern that matches the
993 name it is given, it returns its number. Alternatively, if the name is NULL, it
994 returns when it reaches a given numbered subpattern. This is used for forward
995 references to subpatterns. We know that if (?P< is encountered, the name will
996 be terminated by '>' because that is checked in the first pass.
998 Arguments:
999 ptr current position in the pattern
1000 count current count of capturing parens so far encountered
1001 name name to seek, or NULL if seeking a numbered subpattern
1002 lorn name length, or subpattern number if name is NULL
1003 xmode TRUE if we are in /x mode
1005 Returns: the number of the named subpattern, or -1 if not found
1008 static int
1009 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
1010 BOOL xmode)
1012 const uschar *thisname;
1014 for (; *ptr != 0; ptr++)
1016 int term;
1018 /* Skip over backslashed characters and also entire \Q...\E */
1020 if (*ptr == '\\')
1022 if (*(++ptr) == 0) return -1;
1023 if (*ptr == 'Q') for (;;)
1025 while (*(++ptr) != 0 && *ptr != '\\') {}
1026 if (*ptr == 0) return -1;
1027 if (*(++ptr) == 'E') break;
1029 continue;
1032 /* Skip over character classes */
1034 if (*ptr == '[')
1036 while (*(++ptr) != ']')
1038 if (*ptr == 0) return -1;
1039 if (*ptr == '\\')
1041 if (*(++ptr) == 0) return -1;
1042 if (*ptr == 'Q') for (;;)
1044 while (*(++ptr) != 0 && *ptr != '\\'){}
1045 if (*ptr == 0) return -1;
1046 if (*(++ptr) == 'E') break;
1048 continue;
1051 continue;
1054 /* Skip comments in /x mode */
1056 if (xmode && *ptr == '#')
1058 while (*(++ptr) != 0 && *ptr != '\n'){}
1059 if (*ptr == 0) return -1;
1060 continue;
1063 /* An opening parens must now be a real metacharacter */
1065 if (*ptr != '(') continue;
1066 if (ptr[1] != '?' && ptr[1] != '*')
1068 count++;
1069 if (name == NULL && count == lorn) return count;
1070 continue;
1073 ptr += 2;
1074 if (*ptr == 'P') ptr++; /* Allow optional P */
1076 /* We have to disambiguate (?<! and (?<= from (?<name> */
1078 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1079 *ptr != '\'')
1080 continue;
1082 count++;
1084 if (name == NULL && count == lorn) return count;
1085 term = *ptr++;
1086 if (term == '<') term = '>';
1087 thisname = ptr;
1088 while (*ptr != term) ptr++;
1089 if (name != NULL && lorn == ptr - thisname &&
1090 VMPI_strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1091 return count;
1094 return -1;
1099 /*************************************************
1100 * Find first significant op code *
1101 *************************************************/
1103 /* This is called by several functions that scan a compiled expression looking
1104 for a fixed first character, or an anchoring op code etc. It skips over things
1105 that do not influence this. For some calls, a change of option is important.
1106 For some calls, it makes sense to skip negative forward and all backward
1107 assertions, and also the \b assertion; for others it does not.
1109 Arguments:
1110 code pointer to the start of the group
1111 options pointer to external options
1112 optbit the option bit whose changing is significant, or
1113 zero if none are
1114 skipassert TRUE if certain assertions are to be skipped
1116 Returns: pointer to the first significant opcode
1119 static const uschar*
1120 first_significant_code(const uschar *code, int *options, int optbit,
1121 BOOL skipassert)
1123 for (;;)
1125 switch ((int)*code)
1127 case OP_OPT:
1128 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1129 *options = (int)code[1];
1130 code += 2;
1131 break;
1133 case OP_ASSERT_NOT:
1134 case OP_ASSERTBACK:
1135 case OP_ASSERTBACK_NOT:
1136 if (!skipassert) return code;
1137 do code += GET(code, 1); while (*code == OP_ALT);
1138 code += _pcre_OP_lengths[*code];
1139 break;
1141 case OP_WORD_BOUNDARY:
1142 case OP_NOT_WORD_BOUNDARY:
1143 if (!skipassert) return code;
1144 /* Fall through */
1146 case OP_CALLOUT:
1147 case OP_CREF:
1148 case OP_RREF:
1149 case OP_DEF:
1150 code += _pcre_OP_lengths[*code];
1151 break;
1153 default:
1154 return code;
1157 /* Control never reaches here */
1161 /*****************************************************************************
1162 * Generic stack used by find_fixedlength, is_anchored, is_startline *
1163 * to avoid unbounded recursion *
1164 *****************************************************************************/
1166 #define TYPED_SEGMENT_SIZE 20
1168 template <class T>
1169 class TypedStack {
1170 public:
1171 TypedStack();
1172 ~TypedStack();
1174 bool isEmpty(); // true iff the stack is empty
1175 void push(T& value); // push value onto the stack (by copy)
1176 void pop(T& value); // pop value off the stack (by copy)
1177 T& stackTop(); // reference to top element
1179 private:
1180 struct TypedSegment {
1181 T memory[TYPED_SEGMENT_SIZE]; // stack memory
1182 T *top; // for non-top segments, the saved 'top' pointer
1183 TypedSegment *next; // next segment on the stack
1186 void pushSegment();
1187 void popSegment();
1188 void invariants();
1190 TypedSegment* topSegment; // segment at the top of stack
1191 T *top; // next free address in that segment
1192 T *limit; // limit address in that segment
1193 #ifdef DEBUG
1194 uint32_t depth; // current stack depth
1195 uint32_t maxdepth; // peak stack depth
1196 #endif
1199 template <class T>
1200 void TypedStack<T>::invariants()
1202 AvmAssert(topSegment == NULL || top > topSegment->memory);
1203 AvmAssert(topSegment == NULL || limit == topSegment->memory + TYPED_SEGMENT_SIZE);
1204 AvmAssert(top <= limit);
1207 template <class T>
1208 TypedStack<T>::TypedStack()
1209 : topSegment(NULL)
1210 , top(NULL)
1211 , limit(NULL)
1212 #ifdef DEBUG
1213 , depth(0)
1214 , maxdepth(0)
1215 #endif
1217 invariants();
1220 template <class T>
1221 TypedStack<T>::~TypedStack()
1223 #if 0 && defined DEBUG
1224 // For the morbidly curious only. See bugzilla 502589 for some data.
1225 FILE *fp = fopen("typedstack.txt", "a");
1226 fprintf(fp, "PCRE TypedStack depth = %u\n", unsigned(maxdepth));
1227 fclose(fp);
1228 #endif
1229 invariants();
1230 while (topSegment != NULL)
1231 popSegment();
1234 template <class T>
1235 bool TypedStack<T>::isEmpty()
1237 return topSegment == NULL;
1240 template <class T>
1241 T& TypedStack<T>::stackTop()
1243 invariants();
1244 AvmAssert(!isEmpty());
1245 return top[-1];
1248 template <class T>
1249 void TypedStack<T>::push(T& value)
1251 invariants();
1252 if (top == limit)
1253 pushSegment();
1254 *top++ = value;
1255 #ifdef DEBUG
1256 depth++;
1257 if (depth > maxdepth)
1258 maxdepth = depth;
1259 #endif
1260 invariants();
1263 template <class T>
1264 void TypedStack<T>::pop(T& value)
1266 invariants();
1267 #ifdef DEBUG
1268 depth--;
1269 #endif
1270 top--;
1271 value = *top;
1272 if (top == topSegment->memory)
1273 popSegment();
1274 invariants();
1277 template <class T>
1278 void TypedStack<T>::pushSegment()
1280 TypedSegment *s = (TypedSegment*)pcre_malloc(sizeof(TypedSegment));
1281 if (topSegment != NULL)
1282 topSegment->top = top;
1283 s->next = topSegment;
1284 topSegment = s;
1285 top = s->memory;
1286 limit = s->memory + TYPED_SEGMENT_SIZE;
1289 template <class T>
1290 void TypedStack<T>::popSegment()
1292 TypedSegment *s = topSegment;
1293 topSegment = topSegment->next;
1294 pcre_free(s);
1295 if (topSegment != NULL)
1297 top = topSegment->top;
1298 limit = topSegment->memory + TYPED_SEGMENT_SIZE;
1300 else
1302 top = NULL;
1303 limit = NULL;
1307 /*************************************************
1308 * Find the fixed length of a pattern *
1309 *************************************************/
1311 /* Scan a pattern and compute the fixed length of subject that will match it,
1312 if the length is fixed. This is needed for dealing with backward assertions.
1313 In UTF8 mode, the result is in characters rather than bytes.
1315 Arguments:
1316 code points to the start of the pattern (the bracket)
1317 options the compiling options
1319 Returns: the fixed length, or -1 if there is no fixed length,
1320 or -2 if \C was encountered
1323 // processing information for find_fixedlength
1324 typedef struct pi_ffl {
1325 uschar* code;
1326 int length;
1327 int branchlength;
1328 /* the work item from which this work item was derived */
1329 pi_ffl* parent;
1330 } pi_ffl;
1332 #define FFL_NEWITEM(ITEM) \
1333 ITEM.code = 0; \
1334 ITEM.length = -1; \
1335 ITEM.branchlength = 0; \
1336 ITEM.parent = 0
1338 static int
1339 find_fixedlength(uschar * const code, const int options)
1341 (void)options;
1343 TypedStack<pi_ffl> work_stack;
1344 pi_ffl work_item;
1346 /* set up the first work item */
1347 FFL_NEWITEM(work_item);
1348 work_item.code = code;
1349 work_stack.push(work_item);
1351 /* start the processing loop */
1352 while (!work_stack.isEmpty()) {
1353 work_stack.pop(work_item);
1354 int length = work_item.length;
1355 register int branchlength = work_item.branchlength;
1356 register uschar *cc = work_item.code + 1 + LINK_SIZE;
1357 BOOL skip_to_next_item = FALSE;
1359 for (;!skip_to_next_item;)
1361 register int op = *cc;
1362 switch (op)
1364 case OP_CBRA:
1365 case OP_BRA:
1366 case OP_ONCE:
1367 case OP_COND:
1370 - create a new workitem for the remainder of this item (or alter the
1371 current one)
1372 - create a new workitem with the bracketed code and put it on the stack
1373 - this results in the following stack state: new_item -> work_item -> ...
1374 this way we keep the processing order as if recursion would have been
1375 used
1376 - move the code pointer of work_item after the bracketed code
1377 - start the next iteration (with new_item)
1378 * */
1379 work_item.length = length;
1380 work_item.branchlength = branchlength;
1381 work_stack.push(work_item);
1383 pi_ffl new_item;
1384 FFL_NEWITEM(new_item);
1385 new_item.code = cc + ((op == OP_CBRA)? 2:0);
1386 pi_ffl* parent_item = &work_stack.stackTop();
1387 new_item.parent = parent_item;
1388 work_stack.push(new_item);
1390 do cc += GET(cc, 1); while (*cc == OP_ALT);
1391 // original code added 1 + LINK_SIZE to the pointer
1392 // we don't need this as the processing of an item start with that
1393 parent_item->code = cc;
1395 skip_to_next_item = TRUE;
1396 continue;
1399 /* Reached end of a branch; if it's a ket it is the end of a nested
1400 call. If it's ALT it is an alternation in a nested call. If it is
1401 END it's the end of the outer call. All can be handled by the same code.*/
1403 case OP_ALT:
1404 case OP_KET:
1405 case OP_KETRMAX:
1406 case OP_KETRMIN:
1407 case OP_END:
1408 if (length < 0) length = branchlength;
1409 /* clean up the stack and current item and return -1 */
1410 else if (length != branchlength)
1411 { return -1; }
1412 if (*cc != OP_ALT)
1414 /* end of the pattern, clean up and return */
1415 if (*cc == OP_END)
1416 { return length; }
1417 else
1418 /* KET is the end of in internal group, propagate the brenchlength back
1419 to the parent group's branchlength */
1421 work_item.parent->branchlength += branchlength;
1422 skip_to_next_item = TRUE;
1423 continue;
1426 cc += 1 + LINK_SIZE;
1427 branchlength = 0;
1428 break;
1430 /* Skip over assertive subpatterns */
1432 case OP_ASSERT:
1433 case OP_ASSERT_NOT:
1434 case OP_ASSERTBACK:
1435 case OP_ASSERTBACK_NOT:
1436 do cc += GET(cc, 1); while (*cc == OP_ALT);
1437 /* Fall through */
1439 /* Skip over things that don't match chars */
1441 case OP_REVERSE:
1442 case OP_CREF:
1443 case OP_RREF:
1444 case OP_DEF:
1445 case OP_OPT:
1446 case OP_CALLOUT:
1447 case OP_SOD:
1448 case OP_SOM:
1449 case OP_EOD:
1450 case OP_EODN:
1451 case OP_CIRC:
1452 case OP_DOLL:
1453 case OP_NOT_WORD_BOUNDARY:
1454 case OP_WORD_BOUNDARY:
1455 cc += _pcre_OP_lengths[*cc];
1456 break;
1458 /* Handle literal characters */
1460 case OP_CHAR:
1461 case OP_CHARNC:
1462 case OP_NOT:
1463 branchlength++;
1464 cc += 2;
1465 #ifdef SUPPORT_UTF8
1466 if ((options & PCRE_UTF8) != 0)
1468 while ((*cc & 0xc0) == 0x80) cc++;
1470 #endif
1471 break;
1473 /* Handle exact repetitions. The count is already in characters, but we
1474 need to skip over a multibyte character in UTF8 mode. */
1476 case OP_EXACT:
1477 branchlength += GET2(cc,1);
1478 cc += 4;
1479 #ifdef SUPPORT_UTF8
1480 if ((options & PCRE_UTF8) != 0)
1482 while((*cc & 0x80) == 0x80) cc++;
1484 #endif
1485 break;
1487 case OP_TYPEEXACT:
1488 branchlength += GET2(cc,1);
1489 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1490 cc += 4;
1491 break;
1493 /* Handle single-char matchers */
1495 case OP_PROP:
1496 case OP_NOTPROP:
1497 cc += 2;
1498 /* Fall through */
1500 case OP_NOT_DIGIT:
1501 case OP_DIGIT:
1502 case OP_NOT_WHITESPACE:
1503 case OP_WHITESPACE:
1504 case OP_NOT_WORDCHAR:
1505 case OP_WORDCHAR:
1506 case OP_ANY:
1507 branchlength++;
1508 cc++;
1509 break;
1511 /* The single-byte matcher isn't allowed */
1513 case OP_ANYBYTE:
1514 /* free the proc stack and the current item*/
1515 return -2;
1517 /* Check a class for variable quantification */
1519 #ifdef SUPPORT_UTF8
1520 case OP_XCLASS:
1521 cc += GET(cc, 1) - 33;
1522 /* Fall through */
1523 #endif
1525 case OP_CLASS:
1526 case OP_NCLASS:
1527 cc += 33;
1529 switch (*cc)
1531 case OP_CRSTAR:
1532 case OP_CRMINSTAR:
1533 case OP_CRQUERY:
1534 case OP_CRMINQUERY:
1535 /* free the proc stack and the current item*/
1536 return -1;
1538 case OP_CRRANGE:
1539 case OP_CRMINRANGE:
1540 if (GET2(cc,1) != GET2(cc,3)) return -1;
1541 branchlength += GET2(cc,1);
1542 cc += 5;
1543 break;
1545 default:
1546 branchlength++;
1548 break;
1550 /* Anything else is variable length */
1552 default:
1553 /* free the proc stack and the current item*/
1554 return -1;
1558 /* Control never gets here */
1559 AvmAssert(0);
1560 return -1;
1563 #undef FFL_NEWITEM
1567 /*************************************************
1568 * Scan compiled regex for numbered bracket *
1569 *************************************************/
1571 /* This little function scans through a compiled pattern until it finds a
1572 capturing bracket with the given number.
1574 Arguments:
1575 code points to start of expression
1576 utf8 TRUE in UTF-8 mode
1577 number the required bracket number
1579 Returns: pointer to the opcode for the bracket, or NULL if not found
1582 static const uschar *
1583 find_bracket(const uschar *code, BOOL utf8, int number)
1585 for (;;)
1587 register int c = *code;
1588 if (c == OP_END) return NULL;
1590 /* XCLASS is used for classes that cannot be represented just by a bit
1591 map. This includes negated single high-valued characters. The length in
1592 the table is zero; the actual length is stored in the compiled code. */
1594 if (c == OP_XCLASS) code += GET(code, 1);
1596 /* Handle capturing bracket */
1598 else if (c == OP_CBRA)
1600 int n = GET2(code, 1+LINK_SIZE);
1601 if (n == number) return (uschar *)code;
1602 code += _pcre_OP_lengths[c];
1605 /* Otherwise, we can get the item's length from the table, except that for
1606 repeated character types, we have to test for \p and \P, which have an extra
1607 two bytes of parameters. */
1609 else
1611 switch(c)
1613 case OP_TYPESTAR:
1614 case OP_TYPEMINSTAR:
1615 case OP_TYPEPLUS:
1616 case OP_TYPEMINPLUS:
1617 case OP_TYPEQUERY:
1618 case OP_TYPEMINQUERY:
1619 case OP_TYPEPOSSTAR:
1620 case OP_TYPEPOSPLUS:
1621 case OP_TYPEPOSQUERY:
1622 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1623 break;
1625 case OP_TYPEUPTO:
1626 case OP_TYPEMINUPTO:
1627 case OP_TYPEEXACT:
1628 case OP_TYPEPOSUPTO:
1629 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1630 break;
1633 /* Add in the fixed length from the table */
1635 code += _pcre_OP_lengths[c];
1637 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1638 a multi-byte character. The length in the table is a minimum, so we have to
1639 arrange to skip the extra bytes. */
1641 #ifdef SUPPORT_UTF8
1642 if (utf8) switch(c)
1644 case OP_CHAR:
1645 case OP_CHARNC:
1646 case OP_EXACT:
1647 case OP_UPTO:
1648 case OP_MINUPTO:
1649 case OP_POSUPTO:
1650 case OP_STAR:
1651 case OP_MINSTAR:
1652 case OP_POSSTAR:
1653 case OP_PLUS:
1654 case OP_MINPLUS:
1655 case OP_POSPLUS:
1656 case OP_QUERY:
1657 case OP_MINQUERY:
1658 case OP_POSQUERY:
1659 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1660 break;
1662 #endif
1669 /*************************************************
1670 * Scan compiled regex for recursion reference *
1671 *************************************************/
1673 /* This little function scans through a compiled pattern until it finds an
1674 instance of OP_RECURSE.
1676 Arguments:
1677 code points to start of expression
1678 utf8 TRUE in UTF-8 mode
1680 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1683 static const uschar *
1684 find_recurse(const uschar *code, BOOL utf8)
1686 for (;;)
1688 register int c = *code;
1689 if (c == OP_END) return NULL;
1690 if (c == OP_RECURSE) return code;
1692 /* XCLASS is used for classes that cannot be represented just by a bit
1693 map. This includes negated single high-valued characters. The length in
1694 the table is zero; the actual length is stored in the compiled code. */
1696 if (c == OP_XCLASS) code += GET(code, 1);
1698 /* Otherwise, we can get the item's length from the table, except that for
1699 repeated character types, we have to test for \p and \P, which have an extra
1700 two bytes of parameters. */
1702 else
1704 switch(c)
1706 case OP_TYPESTAR:
1707 case OP_TYPEMINSTAR:
1708 case OP_TYPEPLUS:
1709 case OP_TYPEMINPLUS:
1710 case OP_TYPEQUERY:
1711 case OP_TYPEMINQUERY:
1712 case OP_TYPEPOSSTAR:
1713 case OP_TYPEPOSPLUS:
1714 case OP_TYPEPOSQUERY:
1715 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1716 break;
1718 case OP_TYPEPOSUPTO:
1719 case OP_TYPEUPTO:
1720 case OP_TYPEMINUPTO:
1721 case OP_TYPEEXACT:
1722 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1723 break;
1726 /* Add in the fixed length from the table */
1728 code += _pcre_OP_lengths[c];
1730 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1731 by a multi-byte character. The length in the table is a minimum, so we have
1732 to arrange to skip the extra bytes. */
1734 #ifdef SUPPORT_UTF8
1735 if (utf8) switch(c)
1737 case OP_CHAR:
1738 case OP_CHARNC:
1739 case OP_EXACT:
1740 case OP_UPTO:
1741 case OP_MINUPTO:
1742 case OP_POSUPTO:
1743 case OP_STAR:
1744 case OP_MINSTAR:
1745 case OP_POSSTAR:
1746 case OP_PLUS:
1747 case OP_MINPLUS:
1748 case OP_POSPLUS:
1749 case OP_QUERY:
1750 case OP_MINQUERY:
1751 case OP_POSQUERY:
1752 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1753 break;
1755 #endif
1762 /*************************************************
1763 * Scan compiled branch for non-emptiness *
1764 *************************************************/
1766 /* This function scans through a branch of a compiled pattern to see whether it
1767 can match the empty string or not. It is called from could_be_empty()
1768 below and from compile_branch() when checking for an unlimited repeat of a
1769 group that can match nothing. Note that first_significant_code() skips over
1770 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1771 struck an inner bracket whose current branch will already have been scanned.
1773 Arguments:
1774 code points to start of search
1775 endcode points to where to stop
1776 utf8 TRUE if in UTF8 mode
1778 Returns: TRUE if what is matched could be empty
1781 static BOOL
1782 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1784 register int c;
1785 avmplus::AvmCore::checkPCREStackOverflow();
1786 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1787 code < endcode;
1788 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1790 const uschar *ccode;
1792 c = *code;
1794 /* Groups with zero repeats can of course be empty; skip them. */
1796 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1798 code += _pcre_OP_lengths[c];
1799 do code += GET(code, 1); while (*code == OP_ALT);
1800 c = *code;
1801 continue;
1804 /* For other groups, scan the branches. */
1806 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1808 BOOL empty_branch;
1809 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1811 /* Scan a closed bracket */
1813 empty_branch = FALSE;
1816 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1817 empty_branch = TRUE;
1818 code += GET(code, 1);
1820 while (*code == OP_ALT);
1821 if (!empty_branch) return FALSE; /* All branches are non-empty */
1822 c = *code;
1823 continue;
1826 /* Handle the other opcodes */
1828 switch (c)
1830 /* Check for quantifiers after a class. XCLASS is used for classes that
1831 cannot be represented just by a bit map. This includes negated single
1832 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1833 actual length is stored in the compiled code, so we must update "code"
1834 here. */
1836 #ifdef SUPPORT_UTF8
1837 case OP_XCLASS:
1838 ccode = code += GET(code, 1);
1839 goto CHECK_CLASS_REPEAT;
1840 #endif
1842 case OP_CLASS:
1843 case OP_NCLASS:
1844 ccode = code + 33;
1846 #ifdef SUPPORT_UTF8
1847 CHECK_CLASS_REPEAT:
1848 #endif
1850 switch (*ccode)
1852 case OP_CRSTAR: /* These could be empty; continue */
1853 case OP_CRMINSTAR:
1854 case OP_CRQUERY:
1855 case OP_CRMINQUERY:
1856 break;
1858 default: /* Non-repeat => class must match */
1859 case OP_CRPLUS: /* These repeats aren't empty */
1860 case OP_CRMINPLUS:
1861 return FALSE;
1863 case OP_CRRANGE:
1864 case OP_CRMINRANGE:
1865 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1866 break;
1868 break;
1870 /* Opcodes that must match a character */
1872 case OP_PROP:
1873 case OP_NOTPROP:
1874 case OP_EXTUNI:
1875 case OP_NOT_DIGIT:
1876 case OP_DIGIT:
1877 case OP_NOT_WHITESPACE:
1878 case OP_WHITESPACE:
1879 case OP_NOT_WORDCHAR:
1880 case OP_WORDCHAR:
1881 case OP_ANY:
1882 case OP_ANYBYTE:
1883 case OP_CHAR:
1884 case OP_CHARNC:
1885 case OP_NOT:
1886 case OP_PLUS:
1887 case OP_MINPLUS:
1888 case OP_POSPLUS:
1889 case OP_EXACT:
1890 case OP_NOTPLUS:
1891 case OP_NOTMINPLUS:
1892 case OP_NOTPOSPLUS:
1893 case OP_NOTEXACT:
1894 case OP_TYPEPLUS:
1895 case OP_TYPEMINPLUS:
1896 case OP_TYPEPOSPLUS:
1897 case OP_TYPEEXACT:
1898 return FALSE;
1900 /* These are going to continue, as they may be empty, but we have to
1901 fudge the length for the \p and \P cases. */
1903 case OP_TYPESTAR:
1904 case OP_TYPEMINSTAR:
1905 case OP_TYPEPOSSTAR:
1906 case OP_TYPEQUERY:
1907 case OP_TYPEMINQUERY:
1908 case OP_TYPEPOSQUERY:
1909 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1910 break;
1912 /* Same for these */
1914 case OP_TYPEUPTO:
1915 case OP_TYPEMINUPTO:
1916 case OP_TYPEPOSUPTO:
1917 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1918 break;
1920 /* End of branch */
1922 case OP_KET:
1923 case OP_KETRMAX:
1924 case OP_KETRMIN:
1925 case OP_ALT:
1926 return TRUE;
1928 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1929 MINUPTO, and POSUPTO may be followed by a multibyte character */
1931 #ifdef SUPPORT_UTF8
1932 case OP_STAR:
1933 case OP_MINSTAR:
1934 case OP_POSSTAR:
1935 case OP_QUERY:
1936 case OP_MINQUERY:
1937 case OP_POSQUERY:
1938 case OP_UPTO:
1939 case OP_MINUPTO:
1940 case OP_POSUPTO:
1941 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1942 break;
1943 #endif
1947 return TRUE;
1952 /*************************************************
1953 * Scan compiled regex for non-emptiness *
1954 *************************************************/
1956 /* This function is called to check for left recursive calls. We want to check
1957 the current branch of the current pattern to see if it could match the empty
1958 string. If it could, we must look outwards for branches at other levels,
1959 stopping when we pass beyond the bracket which is the subject of the recursion.
1961 Arguments:
1962 code points to start of the recursion
1963 endcode points to where to stop (current RECURSE item)
1964 bcptr points to the chain of current (unclosed) branch starts
1965 utf8 TRUE if in UTF-8 mode
1967 Returns: TRUE if what is matched could be empty
1970 static BOOL
1971 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1972 BOOL utf8)
1974 while (bcptr != NULL && bcptr->current >= code)
1976 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1977 bcptr = bcptr->outer;
1979 return TRUE;
1984 /*************************************************
1985 * Check for POSIX class syntax *
1986 *************************************************/
1988 /* This function is called when the sequence "[:" or "[." or "[=" is
1989 encountered in a character class. It checks whether this is followed by an
1990 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1991 ".]" or "=]".
1993 Argument:
1994 ptr pointer to the initial [
1995 endptr where to return the end pointer
1996 cd pointer to compile data
1998 Returns: TRUE or FALSE
2001 static BOOL
2002 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
2004 int terminator; /* Don't combine these lines; the Solaris cc */
2005 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
2006 if (*(++ptr) == '^') ptr++;
2007 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
2008 if (*ptr == terminator && ptr[1] == ']')
2010 *endptr = ptr;
2011 return TRUE;
2013 return FALSE;
2019 /*************************************************
2020 * Check POSIX class name *
2021 *************************************************/
2023 /* This function is called to check the name given in a POSIX-style class entry
2024 such as [:alnum:].
2026 Arguments:
2027 ptr points to the first letter
2028 len the length of the name
2030 Returns: a value representing the name, or -1 if unknown
2033 static int
2034 check_posix_name(const uschar *ptr, int len)
2036 register int yield = 0;
2037 while (posix_name_lengths[yield] != 0)
2039 if (len == posix_name_lengths[yield] &&
2040 VMPI_strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
2041 yield++;
2043 return -1;
2047 /*************************************************
2048 * Adjust OP_RECURSE items in repeated group *
2049 *************************************************/
2051 /* OP_RECURSE items contain an offset from the start of the regex to the group
2052 that is referenced. This means that groups can be replicated for fixed
2053 repetition simply by copying (because the recursion is allowed to refer to
2054 earlier groups that are outside the current group). However, when a group is
2055 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
2056 it, after it has been compiled. This means that any OP_RECURSE items within it
2057 that refer to the group itself or any contained groups have to have their
2058 offsets adjusted. That one of the jobs of this function. Before it is called,
2059 the partially compiled regex must be temporarily terminated with OP_END.
2061 This function has been extended with the possibility of forward references for
2062 recursions and subroutine calls. It must also check the list of such references
2063 for the group we are dealing with. If it finds that one of the recursions in
2064 the current group is on this list, it adjusts the offset in the list, not the
2065 value in the reference (which is a group number).
2067 Arguments:
2068 group points to the start of the group
2069 adjust the amount by which the group is to be moved
2070 utf8 TRUE in UTF-8 mode
2071 cd contains pointers to tables etc.
2072 save_hwm the hwm forward reference pointer at the start of the group
2074 Returns: nothing
2077 static void
2078 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2079 uschar *save_hwm)
2081 uschar *ptr = group;
2083 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2085 int offset;
2086 uschar *hc;
2088 /* See if this recursion is on the forward reference list. If so, adjust the
2089 reference. */
2091 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2093 offset = GET(hc, 0);
2094 if (cd->start_code + offset == ptr + 1)
2096 PUT(hc, 0, offset + adjust);
2097 break;
2101 /* Otherwise, adjust the recursion offset if it's after the start of this
2102 group. */
2104 if (hc >= cd->hwm)
2106 offset = GET(ptr, 1);
2107 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2110 ptr += 1 + LINK_SIZE;
2116 /*************************************************
2117 * Insert an automatic callout point *
2118 *************************************************/
2120 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2121 callout points before each pattern item.
2123 Arguments:
2124 code current code pointer
2125 ptr current pattern pointer
2126 cd pointers to tables etc
2128 Returns: new code pointer
2131 static uschar *
2132 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2134 *code++ = OP_CALLOUT;
2135 *code++ = 255;
2136 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
2137 PUT(code, LINK_SIZE, 0); /* Default length */
2138 return code + 2*LINK_SIZE;
2143 /*************************************************
2144 * Complete a callout item *
2145 *************************************************/
2147 /* A callout item contains the length of the next item in the pattern, which
2148 we can't fill in till after we have reached the relevant point. This is used
2149 for both automatic and manual callouts.
2151 Arguments:
2152 previous_callout points to previous callout item
2153 ptr current pattern pointer
2154 cd pointers to tables etc
2156 Returns: nothing
2159 static void
2160 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2162 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2163 PUT(previous_callout, 2 + LINK_SIZE, length);
2168 #ifdef SUPPORT_UCP
2169 /*************************************************
2170 * Get othercase range *
2171 *************************************************/
2173 /* This function is passed the start and end of a class range, in UTF-8 mode
2174 with UCP support. It searches up the characters, looking for internal ranges of
2175 characters in the "other" case. Each call returns the next one, updating the
2176 start address.
2178 Arguments:
2179 cptr points to starting character value; updated
2180 d end value
2181 ocptr where to put start of othercase range
2182 odptr where to put end of othercase range
2184 Yield: TRUE when range returned; FALSE when no more
2187 static BOOL
2188 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2189 unsigned int *odptr)
2191 unsigned int c, othercase, next;
2193 for (c = *cptr; c <= d; c++)
2194 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
2196 if (c > d) return FALSE;
2198 *ocptr = othercase;
2199 next = othercase + 1;
2201 for (++c; c <= d; c++)
2203 if (_pcre_ucp_othercase(c) != next) break;
2204 next++;
2207 *odptr = next - 1;
2208 *cptr = c;
2210 return TRUE;
2212 #endif /* SUPPORT_UCP */
2216 /*************************************************
2217 * Check if auto-possessifying is possible *
2218 *************************************************/
2220 /* This function is called for unlimited repeats of certain items, to see
2221 whether the next thing could possibly match the repeated item. If not, it makes
2222 sense to automatically possessify the repeated item.
2224 Arguments:
2225 op_code the repeated op code
2226 this data for this item, depends on the opcode
2227 utf8 TRUE in UTF-8 mode
2228 utf8_char used for utf8 character bytes, NULL if not relevant
2229 ptr next character in pattern
2230 options options bits
2231 cd contains pointers to tables etc.
2233 Returns: TRUE if possessifying is wanted
2236 static BOOL
2237 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2238 const uschar *ptr, int options, compile_data *cd)
2240 int next;
2242 /* Skip whitespace and comments in extended mode */
2244 if ((options & PCRE_EXTENDED) != 0)
2246 for (;;)
2248 while ( true ) //(*ptr < 256 && (cd->ctypes[*ptr] & ctype_space) != 0) || (*ptr >= 256 && isUnicodeWhiteSpace(*ptr)) )
2250 if( (cd->ctypes[*ptr] & ctype_space) != 0 )
2252 ptr++;
2253 continue;
2255 break;
2257 if (*ptr == '#')
2259 while (*(++ptr) != 0)
2260 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2262 else break;
2266 /* If the next item is one that we can handle, get its value. A non-negative
2267 value is a character, a negative value is an escape value. */
2269 if (*ptr == '\\')
2271 int temperrorcode = 0;
2272 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2273 if (temperrorcode != 0) return FALSE;
2274 ptr++; /* Point after the escape sequence */
2277 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2279 #ifdef SUPPORT_UTF8
2280 if (utf8) { GETCHARINC(next, ptr); } else
2281 #endif
2282 next = *ptr++;
2285 else return FALSE;
2287 /* Skip whitespace and comments in extended mode */
2289 if ((options & PCRE_EXTENDED) != 0)
2291 for (;;)
2293 while ( true ) //(*ptr < 256 && (cd->ctypes[*ptr] & ctype_space) != 0) || (*ptr >= 256 && isUnicodeWhiteSpace(*ptr)) )
2295 if( (cd->ctypes[*ptr] & ctype_space) != 0 )
2297 ptr++;
2298 continue;
2300 break;
2302 if (*ptr == '#')
2304 while (*(++ptr) != 0)
2305 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2307 else break;
2311 /* If the next thing is itself optional, we have to give up. */
2313 if (*ptr == '*' || *ptr == '?' || VMPI_strncmp((char *)ptr, "{0,", 3) == 0)
2314 return FALSE;
2316 /* Now compare the next item with the previous opcode. If the previous is a
2317 positive single character match, "item" either contains the character or, if
2318 "item" is greater than 127 in utf8 mode, the character's bytes are in
2319 utf8_char. */
2322 /* Handle cases when the next item is a character. */
2324 if (next >= 0) switch(op_code)
2326 case OP_CHAR:
2327 #ifdef SUPPORT_UTF8
2328 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2329 #endif
2330 return item != next;
2332 /* For CHARNC (caseless character) we must check the other case. If we have
2333 Unicode property support, we can use it to test the other case of
2334 high-valued characters. */
2336 case OP_CHARNC:
2337 #ifdef SUPPORT_UTF8
2338 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2339 #endif
2340 if (item == next) return FALSE;
2341 #ifdef SUPPORT_UTF8
2342 if (utf8)
2344 unsigned int othercase;
2345 if (next < 128) othercase = cd->fcc[next]; else
2346 #ifdef SUPPORT_UCP
2347 othercase = _pcre_ucp_othercase((unsigned int)next);
2348 #else
2349 othercase = NOTACHAR;
2350 #endif
2351 return (unsigned int)item != othercase;
2353 else
2354 #endif /* SUPPORT_UTF8 */
2355 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2357 /* For OP_NOT, "item" must be a single-byte character. */
2359 case OP_NOT:
2360 if (next < 0) return FALSE; /* Not a character */
2361 if (item == next) return TRUE;
2362 if ((options & PCRE_CASELESS) == 0) return FALSE;
2363 #ifdef SUPPORT_UTF8
2364 if (utf8)
2366 unsigned int othercase;
2367 if (next < 128) othercase = cd->fcc[next]; else
2368 #ifdef SUPPORT_UCP
2369 othercase = _pcre_ucp_othercase(next);
2370 #else
2371 othercase = NOTACHAR;
2372 #endif
2373 return (unsigned int)item == othercase;
2375 else
2376 #endif /* SUPPORT_UTF8 */
2377 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2379 case OP_DIGIT:
2380 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2382 case OP_NOT_DIGIT:
2383 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2385 case OP_WHITESPACE:
2386 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2388 case OP_NOT_WHITESPACE:
2389 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2391 case OP_WORDCHAR:
2392 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2394 case OP_NOT_WORDCHAR:
2395 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2397 case OP_HSPACE:
2398 case OP_NOT_HSPACE:
2399 switch(next)
2401 case 0x09:
2402 case 0x20:
2403 case 0xa0:
2404 case 0x1680:
2405 case 0x180e:
2406 case 0x2000:
2407 case 0x2001:
2408 case 0x2002:
2409 case 0x2003:
2410 case 0x2004:
2411 case 0x2005:
2412 case 0x2006:
2413 case 0x2007:
2414 case 0x2008:
2415 case 0x2009:
2416 case 0x200A:
2417 case 0x202f:
2418 case 0x205f:
2419 case 0x3000:
2420 return op_code != OP_HSPACE;
2421 default:
2422 return op_code == OP_HSPACE;
2425 case OP_VSPACE:
2426 case OP_NOT_VSPACE:
2427 switch(next)
2429 case 0x0a:
2430 case 0x0b:
2431 case 0x0c:
2432 case 0x0d:
2433 case 0x85:
2434 case 0x2028:
2435 case 0x2029:
2436 return op_code != OP_VSPACE;
2437 default:
2438 return op_code == OP_VSPACE;
2441 default:
2442 return FALSE;
2446 /* Handle the case when the next item is \d, \s, etc. */
2448 switch(op_code)
2450 case OP_CHAR:
2451 case OP_CHARNC:
2452 #ifdef SUPPORT_UTF8
2453 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2454 #endif
2455 switch(-next)
2457 case ESC_d:
2458 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2460 case ESC_D:
2461 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2463 case ESC_s:
2464 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2466 case ESC_S:
2467 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2469 case ESC_w:
2470 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2472 case ESC_W:
2473 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2475 case ESC_h:
2476 case ESC_H:
2477 switch(item)
2479 case 0x09:
2480 case 0x20:
2481 case 0xa0:
2482 case 0x1680:
2483 case 0x180e:
2484 case 0x2000:
2485 case 0x2001:
2486 case 0x2002:
2487 case 0x2003:
2488 case 0x2004:
2489 case 0x2005:
2490 case 0x2006:
2491 case 0x2007:
2492 case 0x2008:
2493 case 0x2009:
2494 case 0x200A:
2495 case 0x202f:
2496 case 0x205f:
2497 case 0x3000:
2498 return -next != ESC_h;
2499 default:
2500 return -next == ESC_h;
2503 case ESC_v:
2504 case ESC_V:
2505 switch(item)
2507 case 0x0a:
2508 case 0x0b:
2509 case 0x0c:
2510 case 0x0d:
2511 case 0x85:
2512 case 0x2028:
2513 case 0x2029:
2514 return -next != ESC_v;
2515 default:
2516 return -next == ESC_v;
2519 default:
2520 return FALSE;
2523 case OP_DIGIT:
2524 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2525 next == -ESC_h || next == -ESC_v;
2527 case OP_NOT_DIGIT:
2528 return next == -ESC_d;
2530 case OP_WHITESPACE:
2531 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2533 case OP_NOT_WHITESPACE:
2534 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2536 case OP_HSPACE:
2537 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2539 case OP_NOT_HSPACE:
2540 return next == -ESC_h;
2542 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2543 case OP_VSPACE:
2544 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2546 case OP_NOT_VSPACE:
2547 return next == -ESC_v;
2549 case OP_WORDCHAR:
2550 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2552 case OP_NOT_WORDCHAR:
2553 return next == -ESC_w || next == -ESC_d;
2555 default:
2556 return FALSE;
2559 /* Control does not reach here */
2564 /*************************************************
2565 * Compile one branch *
2566 *************************************************/
2568 /* Scan the pattern, compiling it into the a vector. If the options are
2569 changed during the branch, the pointer is used to change the external options
2570 bits. This function is used during the pre-compile phase when we are trying
2571 to find out the amount of memory needed, as well as during the real compile
2572 phase. The value of lengthptr distinguishes the two phases.
2574 Arguments:
2575 optionsptr pointer to the option bits
2576 codeptr points to the pointer to the current code point
2577 ptrptr points to the current pattern pointer
2578 errorcodeptr points to error code variable
2579 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2580 reqbyteptr set to the last literal character required, else < 0
2581 bcptr points to current branch chain
2582 cd contains pointers to tables etc.
2583 lengthptr NULL during the real compile phase
2584 points to length accumulator during pre-compile phase
2586 Returns: TRUE on success
2587 FALSE, with *errorcodeptr set non-zero on error
2590 static BOOL
2591 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2592 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2593 compile_data *cd, int *lengthptr)
2595 int repeat_type, op_type;
2596 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2597 int bravalue = 0;
2598 int greedy_default, greedy_non_default;
2599 int firstbyte, reqbyte;
2600 int zeroreqbyte, zerofirstbyte;
2601 int req_caseopt, reqvary, tempreqvary;
2602 int options = *optionsptr;
2603 int after_manual_callout = 0;
2604 int length_prevgroup = 0;
2605 register int c;
2606 register uschar *code = *codeptr;
2607 uschar *last_code = code;
2608 uschar *orig_code = code;
2609 uschar *tempcode;
2610 BOOL inescq = FALSE;
2611 BOOL groupsetfirstbyte = FALSE;
2612 const uschar *ptr = *ptrptr;
2613 const uschar *tempptr;
2614 uschar *previous = NULL;
2615 uschar *previous_callout = NULL;
2616 uschar *save_hwm = NULL;
2617 uschar classbits[32];
2619 #ifdef SUPPORT_UTF8
2620 BOOL class_utf8;
2621 BOOL utf8 = (options & PCRE_UTF8) != 0;
2622 uschar *class_utf8data;
2623 uschar *class_utf8data_base;
2624 uschar utf8_char[6];
2625 #else
2626 BOOL utf8 = FALSE;
2627 uschar *utf8_char = NULL;
2628 #endif
2630 #ifdef PCRE_DEBUG
2631 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2632 #endif
2634 /* Set up the default and non-default settings for greediness */
2636 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2637 greedy_non_default = greedy_default ^ 1;
2639 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2640 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2641 matches a non-fixed char first char; reqbyte just remains unset if we never
2642 find one.
2644 When we hit a repeat whose minimum is zero, we may have to adjust these values
2645 to take the zero repeat into account. This is implemented by setting them to
2646 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2647 item types that can be repeated set these backoff variables appropriately. */
2649 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2651 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2652 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2653 value > 255. It is added into the firstbyte or reqbyte variables to record the
2654 case status of the value. This is used only for ASCII characters. */
2656 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2658 /* Switch on next character until the end of the branch */
2660 for (;; ptr++)
2662 BOOL negate_class;
2663 BOOL inverted_class = FALSE;
2664 BOOL possessive_quantifier;
2665 BOOL is_quantifier;
2666 BOOL is_recurse;
2667 BOOL reset_bracount;
2668 int class_charcount;
2669 int class_lastchar;
2670 int newoptions;
2671 int recno;
2672 int refsign;
2673 int skipbytes;
2674 int subreqbyte;
2675 int subfirstbyte;
2676 int terminator;
2677 int mclength;
2678 uschar mcbuffer[8];
2680 /* Get next byte in the pattern */
2682 c = *ptr;
2684 /* If we are in the pre-compile phase, accumulate the length used for the
2685 previous cycle of this loop. */
2687 if (lengthptr != NULL)
2689 #ifdef PCRE_DEBUG
2690 if (code > cd->hwm) cd->hwm = code; /* High water info */
2691 #endif
2692 if (code > cd->start_workspace + (COMPILE_WORK_SIZE-10)) /* tierney: added -10, since actual overwriting corrupts the stack Check for overrun */
2694 *errorcodeptr = ERR52;
2695 goto FAILED;
2698 /* There is at least one situation where code goes backwards: this is the
2699 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2700 the class is simply eliminated. However, it is created first, so we have to
2701 allow memory for it. Therefore, don't ever reduce the length at this point.
2704 if (code < last_code) code = last_code;
2706 /* Paranoid check for integer overflow */
2708 if (OFLOW_MAX - *lengthptr < code - last_code)
2710 *errorcodeptr = ERR20;
2711 goto FAILED;
2714 *lengthptr += code - last_code;
2715 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2717 /* If "previous" is set and it is not at the start of the work space, move
2718 it back to there, in order to avoid filling up the work space. Otherwise,
2719 if "previous" is NULL, reset the current code pointer to the start. */
2721 if (previous != NULL)
2723 if (previous > orig_code)
2725 VMPI_memmove(orig_code, previous, code - previous);
2726 code -= previous - orig_code;
2727 previous = orig_code;
2730 else code = orig_code;
2732 /* Remember where this code item starts so we can pick up the length
2733 next time round. */
2735 last_code = code;
2738 /* In the real compile phase, just check the workspace used by the forward
2739 reference list. */
2741 else if (cd->hwm > cd->start_workspace + (COMPILE_WORK_SIZE-10)) // tierney: added -10, since actual overwriting corrupts the stack
2743 *errorcodeptr = ERR52;
2744 goto FAILED;
2747 /* If in \Q...\E, check for the end; if not, we have a literal */
2749 if (inescq && c != 0)
2751 if (c == '\\' && ptr[1] == 'E')
2753 inescq = FALSE;
2754 ptr++;
2755 continue;
2757 else
2759 if (previous_callout != NULL)
2761 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2762 complete_callout(previous_callout, ptr, cd);
2763 previous_callout = NULL;
2765 if ((options & PCRE_AUTO_CALLOUT) != 0)
2767 previous_callout = code;
2768 code = auto_callout(code, ptr, cd);
2770 goto NORMAL_CHAR;
2774 /* Fill in length of a previous callout, except when the next thing is
2775 a quantifier. */
2777 is_quantifier = c == '*' || c == '+' || c == '?' ||
2778 (c == '{' && is_counted_repeat(ptr+1));
2780 if (!is_quantifier && previous_callout != NULL &&
2781 after_manual_callout-- <= 0)
2783 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2784 complete_callout(previous_callout, ptr, cd);
2785 previous_callout = NULL;
2788 /* In extended mode, skip white space and comments */
2790 if ((options & PCRE_EXTENDED) != 0)
2792 if (c < 256)
2794 if ((cd->ctypes[c] & ctype_space) != 0)
2795 continue;
2797 else if (isUnicodeWhiteSpace(c) == true)
2799 continue;
2801 if (c == '#')
2803 while (*(++ptr) != 0)
2805 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2807 if (*ptr != 0) continue;
2809 /* Else fall through to handle end of string */
2810 c = 0;
2814 /* No auto callout for quantifiers. */
2816 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2818 previous_callout = code;
2819 code = auto_callout(code, ptr, cd);
2822 switch(c)
2824 /* ===================================================================*/
2825 case 0: /* The branch terminates at string end */
2826 case '|': /* or | or ) */
2827 case ')':
2828 *firstbyteptr = firstbyte;
2829 *reqbyteptr = reqbyte;
2830 *codeptr = code;
2831 *ptrptr = ptr;
2832 if (lengthptr != NULL)
2834 if (OFLOW_MAX - *lengthptr < code - last_code)
2836 *errorcodeptr = ERR20;
2837 goto FAILED;
2839 *lengthptr += code - last_code; /* To include callout length */
2840 DPRINTF((">> end branch\n"));
2842 return TRUE;
2845 /* ===================================================================*/
2846 /* Handle single-character metacharacters. In multiline mode, ^ disables
2847 the setting of any following char as a first character. */
2849 case '^':
2850 if ((options & PCRE_MULTILINE) != 0)
2852 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2854 previous = NULL;
2855 *code++ = OP_CIRC;
2856 break;
2858 case '$':
2859 previous = NULL;
2860 *code++ = OP_DOLL;
2861 break;
2863 /* There can never be a first char if '.' is first, whatever happens about
2864 repeats. The value of reqbyte doesn't change either. */
2866 case '.':
2867 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2868 zerofirstbyte = firstbyte;
2869 zeroreqbyte = reqbyte;
2870 previous = code;
2871 *code++ = OP_ANY;
2872 break;
2875 /* ===================================================================*/
2876 /* Character classes. If the included characters are all < 256, we build a
2877 32-byte bitmap of the permitted characters, except in the special case
2878 where there is only one such character. For negated classes, we build the
2879 map as usual, then invert it at the end. However, we use a different opcode
2880 so that data characters > 255 can be handled correctly.
2882 If the class contains characters outside the 0-255 range, a different
2883 opcode is compiled. It may optionally have a bit map for characters < 256,
2884 but those above are are explicitly listed afterwards. A flag byte tells
2885 whether the bitmap is present, and whether this is a negated class or not.
2888 case '[':
2889 previous = code;
2891 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2892 they are encountered at the top level, so we'll do that too. */
2894 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2895 check_posix_syntax(ptr, &tempptr, cd))
2897 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2898 goto FAILED;
2901 /* If the first character is '^', set the negation flag and skip it. Also,
2902 if the first few characters (either before or after ^) are \Q\E or \E we
2903 skip them too. This makes for compatibility with Perl. */
2905 negate_class = FALSE;
2906 for (;;)
2908 c = *(++ptr);
2909 if (c == '\\')
2911 if (ptr[1] == 'E') ptr++;
2912 else if (VMPI_strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2913 else break;
2915 else if (!negate_class && c == '^')
2916 negate_class = TRUE;
2917 else break;
2920 /* Keep a count of chars with values < 256 so that we can optimize the case
2921 of just a single character (as long as it's < 256). However, For higher
2922 valued UTF-8 characters, we don't yet do any optimization. */
2924 class_charcount = 0;
2925 class_lastchar = -1;
2927 /* Initialize the 32-char bit map to all zeros. We build the map in a
2928 temporary bit of memory, in case the class contains only 1 character (less
2929 than 256), because in that case the compiled code doesn't use the bit map.
2932 VMPI_memset(classbits, 0, 32 * sizeof(uschar));
2934 #ifdef SUPPORT_UTF8
2935 class_utf8 = FALSE; /* No chars >= 256 */
2936 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2937 class_utf8data_base = class_utf8data; /* For resetting in pass 1 */
2938 #endif
2940 /* Process characters until ] is reached. By writing this as a "do" it
2941 means that an initial ] is taken as a data character. At the start of the
2942 loop, c contains the first byte of the character. */
2944 /* cn: Due to bug http://flashqa.macromedia.com/bugapp/detail.asp?ID=110290
2945 we need to handle [^] as matching any character (its not not a character...)
2946 The ^ is already stripped off, so we just need to catch a leading ']'
2948 if (c == ']') // [] is still invalid, so we know we must have [^]. [^] means any character other than null
2950 class_lastchar = 0;
2951 class_charcount = 1;
2953 else if (c != 0) do
2955 const uschar *oldptr;
2957 #ifdef SUPPORT_UTF8
2958 if (utf8 && c > 127)
2959 { /* Braces are required because the */
2960 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2963 /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2964 data and reset the pointer, to ensure that large classes that cannot
2965 overwrite the work space. */
2967 if (lengthptr != NULL)
2969 *lengthptr += class_utf8data - class_utf8data_base;
2970 class_utf8data = class_utf8data_base;
2973 #endif
2975 /* Inside \Q...\E everything is literal except \E */
2977 if (inescq)
2979 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2981 inescq = FALSE; /* Reset literal state */
2982 ptr++; /* Skip the 'E' */
2983 continue; /* Carry on with next */
2985 goto CHECK_RANGE; /* Could be range if \E follows */
2988 /* Handle POSIX class names. Perl allows a negation extension of the
2989 form [:^name:]. A square bracket that doesn't match the syntax is
2990 treated as a literal. We also recognize the POSIX constructions
2991 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2992 5.6 and 5.8 do. */
2994 if (c == '[' &&
2995 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2996 check_posix_syntax(ptr, &tempptr, cd))
2998 BOOL local_negate = FALSE;
2999 int posix_class, taboffset, tabopt;
3000 register const uschar *cbits = cd->cbits;
3001 uschar pbits[32];
3003 if (ptr[1] != ':')
3005 *errorcodeptr = ERR31;
3006 goto FAILED;
3009 ptr += 2;
3010 if (*ptr == '^')
3012 local_negate = TRUE;
3013 ptr++;
3016 posix_class = check_posix_name(ptr, tempptr - ptr);
3017 if (posix_class < 0)
3019 *errorcodeptr = ERR30;
3020 goto FAILED;
3023 /* If matching is caseless, upper and lower are converted to
3024 alpha. This relies on the fact that the class table starts with
3025 alpha, lower, upper as the first 3 entries. */
3027 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3028 posix_class = 0;
3030 /* We build the bit map for the POSIX class in a chunk of local store
3031 because we may be adding and subtracting from it, and we don't want to
3032 subtract bits that may be in the main map already. At the end we or the
3033 result into the bit map that is being built. */
3035 posix_class *= 3;
3037 /* Copy in the first table (always present) */
3039 VMPI_memcpy(pbits, cbits + posix_class_maps[posix_class],
3040 32 * sizeof(uschar));
3042 /* If there is a second table, add or remove it as required. */
3044 taboffset = posix_class_maps[posix_class + 1];
3045 tabopt = posix_class_maps[posix_class + 2];
3047 if (taboffset >= 0)
3049 if (tabopt >= 0)
3050 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3051 else
3052 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3055 /* Not see if we need to remove any special characters. An option
3056 value of 1 removes vertical space and 2 removes underscore. */
3058 if (tabopt < 0) tabopt = -tabopt;
3059 if (tabopt == 1) pbits[1] &= ~0x3c;
3060 else if (tabopt == 2) pbits[11] &= 0x7f;
3062 /* Add the POSIX table or its complement into the main table that is
3063 being built and we are done. */
3065 if (local_negate)
3066 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3067 else
3068 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3070 ptr = tempptr + 1;
3071 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
3072 continue; /* End of POSIX syntax handling */
3075 /* Backslash may introduce a single character, or it may introduce one
3076 of the specials, which just set a flag. The sequence \b is a special
3077 case. Inside a class (and only there) it is treated as backspace.
3078 Elsewhere it marks a word boundary. Other escapes have preset maps ready
3079 to 'or' into the one we are building. We assume they have more than one
3080 character in them, so set class_charcount bigger than one. */
3082 if (c == '\\')
3084 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3085 if (*errorcodeptr != 0) goto FAILED;
3087 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
3088 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
3089 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
3090 else if (-c == ESC_Q) /* Handle start of quoted string */
3092 if (ptr[1] == '\\' && ptr[2] == 'E')
3094 ptr += 2; /* avoid empty string */
3096 else inescq = TRUE;
3097 continue;
3099 else if (-c == ESC_E) continue; /* Ignore orphan \E */
3101 if (c < 0)
3103 register const uschar *cbits = cd->cbits;
3104 class_charcount += 2; /* Greater than 1 is what matters */
3106 /* Save time by not doing this in the pre-compile phase. */
3108 if (lengthptr == NULL) switch (-c)
3110 case ESC_d:
3111 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3112 continue;
3114 case ESC_D:
3115 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3116 inverted_class = TRUE;
3117 continue;
3119 case ESC_w:
3120 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3121 continue;
3123 case ESC_W:
3124 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3125 inverted_class = TRUE;
3126 continue;
3128 case ESC_s:
3129 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3130 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
3131 continue;
3133 case ESC_S:
3134 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3135 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
3136 inverted_class = TRUE;
3137 continue;
3139 case ESC_E: /* Perl ignores an orphan \E */
3140 continue;
3142 default: /* Not recognized; fall through */
3143 break; /* Need "default" setting to stop compiler warning. */
3146 /* In the pre-compile phase, just do the recognition. */
3148 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3149 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3151 /* We need to deal with \H, \h, \V, and \v in both phases because
3152 they use extra memory. */
3154 if (-c == ESC_h)
3156 SETBIT(classbits, 0x09); /* VT */
3157 SETBIT(classbits, 0x20); /* SPACE */
3158 SETBIT(classbits, 0xa0); /* NSBP */
3159 #ifdef SUPPORT_UTF8
3160 if (utf8)
3162 class_utf8 = TRUE;
3163 *class_utf8data++ = XCL_SINGLE;
3164 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3165 *class_utf8data++ = XCL_SINGLE;
3166 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3167 *class_utf8data++ = XCL_RANGE;
3168 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3169 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3170 *class_utf8data++ = XCL_SINGLE;
3171 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3172 *class_utf8data++ = XCL_SINGLE;
3173 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3174 *class_utf8data++ = XCL_SINGLE;
3175 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3177 #endif
3178 continue;
3181 if (-c == ESC_H)
3183 for (c = 0; c < 32; c++)
3185 int x = 0xff;
3186 switch (c)
3188 case 0x09/8: x ^= 1 << (0x09%8); break;
3189 case 0x20/8: x ^= 1 << (0x20%8); break;
3190 case 0xa0/8: x ^= 1 << (0xa0%8); break;
3191 default: break;
3193 classbits[c] |= x;
3196 #ifdef SUPPORT_UTF8
3197 if (utf8)
3199 class_utf8 = TRUE;
3200 *class_utf8data++ = XCL_RANGE;
3201 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3202 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3203 *class_utf8data++ = XCL_RANGE;
3204 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3205 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3206 *class_utf8data++ = XCL_RANGE;
3207 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3208 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3209 *class_utf8data++ = XCL_RANGE;
3210 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3211 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3212 *class_utf8data++ = XCL_RANGE;
3213 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3214 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3215 *class_utf8data++ = XCL_RANGE;
3216 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3217 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3218 *class_utf8data++ = XCL_RANGE;
3219 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3220 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3222 #endif
3223 continue;
3226 if (-c == ESC_v)
3228 SETBIT(classbits, 0x0a); /* LF */
3229 SETBIT(classbits, 0x0b); /* VT */
3230 SETBIT(classbits, 0x0c); /* FF */
3231 SETBIT(classbits, 0x0d); /* CR */
3232 SETBIT(classbits, 0x85); /* NEL */
3233 #ifdef SUPPORT_UTF8
3234 if (utf8)
3236 class_utf8 = TRUE;
3237 *class_utf8data++ = XCL_RANGE;
3238 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3239 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3241 #endif
3242 continue;
3245 if (-c == ESC_V)
3247 for (c = 0; c < 32; c++)
3249 int x = 0xff;
3250 switch (c)
3252 case 0x0a/8: x ^= 1 << (0x0a%8);
3253 x ^= 1 << (0x0b%8);
3254 x ^= 1 << (0x0c%8);
3255 x ^= 1 << (0x0d%8);
3256 break;
3257 case 0x85/8: x ^= 1 << (0x85%8); break;
3258 default: break;
3260 classbits[c] |= x;
3263 #ifdef SUPPORT_UTF8
3264 if (utf8)
3266 class_utf8 = TRUE;
3267 *class_utf8data++ = XCL_RANGE;
3268 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3269 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3270 *class_utf8data++ = XCL_RANGE;
3271 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3272 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3274 #endif
3275 continue;
3278 /* We need to deal with \P and \p in both phases. */
3280 #ifdef SUPPORT_UCP
3281 if (-c == ESC_p || -c == ESC_P)
3283 BOOL negated;
3284 int pdata;
3285 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3286 if (ptype < 0) goto FAILED;
3287 class_utf8 = TRUE;
3288 *class_utf8data++ = ((-c == ESC_p) != negated)?
3289 XCL_PROP : XCL_NOTPROP;
3290 *class_utf8data++ = ptype;
3291 *class_utf8data++ = pdata;
3292 class_charcount -= 2; /* Not a < 256 character */
3293 continue;
3295 #endif
3296 /* Unrecognized escapes are faulted if PCRE is running in its
3297 strict mode. By default, for compatibility with Perl, they are
3298 treated as literals. */
3300 if ((options & PCRE_EXTRA) != 0)
3302 *errorcodeptr = ERR7;
3303 goto FAILED;
3306 class_charcount -= 2; /* Undo the default count from above */
3307 c = *ptr; /* Get the final character and fall through */
3310 /* Fall through if we have a single character (c >= 0). This may be
3311 greater than 256 in UTF-8 mode. */
3313 } /* End of backslash handling */
3315 /* A single character may be followed by '-' to form a range. However,
3316 Perl does not permit ']' to be the end of the range. A '-' character
3317 at the end is treated as a literal. Perl ignores orphaned \E sequences
3318 entirely. The code for handling \Q and \E is messy. */
3320 CHECK_RANGE:
3321 while (ptr[1] == '\\' && ptr[2] == 'E')
3323 inescq = FALSE;
3324 ptr += 2;
3327 oldptr = ptr;
3329 if (!inescq && ptr[1] == '-')
3331 int d;
3332 ptr += 2;
3333 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3335 /* If we hit \Q (not followed by \E) at this point, go into escaped
3336 mode. */
3338 while (*ptr == '\\' && ptr[1] == 'Q')
3340 ptr += 2;
3341 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3342 inescq = TRUE;
3343 break;
3346 if (*ptr == 0 || (!inescq && *ptr == ']'))
3348 ptr = oldptr;
3349 goto LONE_SINGLE_CHARACTER;
3352 #ifdef SUPPORT_UTF8
3353 if (utf8)
3354 { /* Braces are required because the */
3355 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3357 else
3358 #endif
3359 d = *ptr; /* Not UTF-8 mode */
3361 /* The second part of a range can be a single-character escape, but
3362 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3363 in such circumstances. */
3365 if (!inescq && d == '\\')
3367 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3368 if (*errorcodeptr != 0) goto FAILED;
3370 /* \b is backslash; \X is literal X; \R is literal R; any other
3371 special means the '-' was literal */
3373 if (d < 0)
3375 if (d == -ESC_b) d = '\b';
3376 else if (d == -ESC_X) d = 'X';
3377 else if (d == -ESC_R) d = 'R'; else
3379 ptr = oldptr;
3380 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3385 /* Check that the two values are in the correct order. Optimize
3386 one-character ranges */
3388 if (d < c)
3390 *errorcodeptr = ERR8;
3391 goto FAILED;
3394 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3396 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3397 matching, we have to use an XCLASS with extra data items. Caseless
3398 matching for characters > 127 is available only if UCP support is
3399 available. */
3401 #ifdef SUPPORT_UTF8
3402 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3404 class_utf8 = TRUE;
3406 /* With UCP support, we can find the other case equivalents of
3407 the relevant characters. There may be several ranges. Optimize how
3408 they fit with the basic range. */
3410 #ifdef SUPPORT_UCP
3411 if ((options & PCRE_CASELESS) != 0)
3413 unsigned int occ, ocd;
3414 unsigned int cc = c;
3415 unsigned int origd = d;
3416 while (get_othercase_range(&cc, origd, &occ, &ocd))
3418 if (occ >= (unsigned int)c &&
3419 ocd <= (unsigned int)d)
3420 continue; /* Skip embedded ranges */
3422 if (occ < (unsigned int)c &&
3423 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3424 { /* if there is overlap, */
3425 c = occ; /* noting that if occ < c */
3426 continue; /* we can't have ocd > d */
3427 } /* because a subrange is */
3428 if (ocd > (unsigned int)d &&
3429 occ <= (unsigned int)d + 1) /* always shorter than */
3430 { /* the basic range. */
3431 d = ocd;
3432 continue;
3435 if (occ == ocd)
3437 *class_utf8data++ = XCL_SINGLE;
3439 else
3441 *class_utf8data++ = XCL_RANGE;
3442 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3444 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3447 #endif /* SUPPORT_UCP */
3449 /* Now record the original range, possibly modified for UCP caseless
3450 overlapping ranges. */
3452 *class_utf8data++ = XCL_RANGE;
3453 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3454 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3456 /* With UCP support, we are done. Without UCP support, there is no
3457 caseless matching for UTF-8 characters > 127; we can use the bit map
3458 for the smaller ones. */
3460 #ifdef SUPPORT_UCP
3461 continue; /* With next character in the class */
3462 #else
3463 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3465 /* Adjust upper limit and fall through to set up the map */
3467 d = 127;
3469 #endif /* SUPPORT_UCP */
3471 #endif /* SUPPORT_UTF8 */
3473 /* We use the bit map for all cases when not in UTF-8 mode; else
3474 ranges that lie entirely within 0-127 when there is UCP support; else
3475 for partial ranges without UCP support. */
3477 class_charcount += d - c + 1;
3478 class_lastchar = d;
3480 /* We can save a bit of time by skipping this in the pre-compile. */
3482 if (lengthptr == NULL) for (; c <= d; c++)
3484 classbits[c/8] |= (1 << (c&7));
3485 if ((options & PCRE_CASELESS) != 0)
3487 int uc = cd->fcc[c]; /* flip case */
3488 classbits[uc/8] |= (1 << (uc&7));
3492 continue; /* Go get the next char in the class */
3495 /* Handle a lone single character - we can get here for a normal
3496 non-escape char, or after \ that introduces a single character or for an
3497 apparent range that isn't. */
3499 LONE_SINGLE_CHARACTER:
3501 /* Handle a character that cannot go in the bit map */
3503 #ifdef SUPPORT_UTF8
3504 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3506 class_utf8 = TRUE;
3507 *class_utf8data++ = XCL_SINGLE;
3508 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3510 #ifdef SUPPORT_UCP
3511 if ((options & PCRE_CASELESS) != 0)
3513 unsigned int othercase;
3514 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3516 *class_utf8data++ = XCL_SINGLE;
3517 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3520 #endif /* SUPPORT_UCP */
3523 else
3524 #endif /* SUPPORT_UTF8 */
3526 /* Handle a single-byte character */
3528 classbits[c/8] |= (1 << (c&7));
3529 if ((options & PCRE_CASELESS) != 0)
3531 c = cd->fcc[c]; /* flip case */
3532 classbits[c/8] |= (1 << (c&7));
3534 class_charcount++;
3535 class_lastchar = c;
3539 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3541 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3543 if (c == 0) /* Missing terminating ']' */
3545 *errorcodeptr = ERR6;
3546 goto FAILED;
3549 /* Remember whether \r or \n are in this class */
3551 if (negate_class)
3553 if ((classbits[1] & 0x24) != 0x24) cd->external_options |= PCRE_HASCRORLF;
3555 else
3557 if ((classbits[1] & 0x24) != 0) cd->external_options |= PCRE_HASCRORLF;
3559 /* If class_charcount is 1, we saw precisely one character whose value is
3560 less than 256. As long as there were no characters >= 128 and there was no
3561 use of \p or \P, in other words, no use of any XCLASS features, we can
3562 optimize.
3564 In UTF-8 mode, we can optimize the negative case only if there were no
3565 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3566 operate on single-bytes only. This is an historical hangover. Maybe one day
3567 we can tidy these opcodes to handle multi-byte characters.
3569 The optimization throws away the bit map. We turn the item into a
3570 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3571 that OP_NOT does not support multibyte characters. In the positive case, it
3572 can cause firstbyte to be set. Otherwise, there can be no first char if
3573 this item is first, whatever repeat count may follow. In the case of
3574 reqbyte, save the previous value for reinstating. */
3576 #ifdef SUPPORT_UTF8
3577 if (class_charcount == 1 && !class_utf8 &&
3578 (!utf8 || !negate_class || class_lastchar < 128))
3579 #else
3580 if (class_charcount == 1)
3581 #endif
3583 zeroreqbyte = reqbyte;
3585 /* The OP_NOT opcode works on one-byte characters only. */
3587 if (negate_class)
3589 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3590 zerofirstbyte = firstbyte;
3591 *code++ = OP_NOT;
3592 *code++ = class_lastchar;
3593 break;
3596 /* For a single, positive character, get the value into mcbuffer, and
3597 then we can handle this with the normal one-character code. */
3599 #ifdef SUPPORT_UTF8
3600 if (utf8 && class_lastchar > 127)
3601 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3602 else
3603 #endif
3605 mcbuffer[0] = class_lastchar;
3606 mclength = 1;
3608 goto ONE_CHAR;
3609 } /* End of 1-char optimization */
3611 /* The general case - not the one-char optimization. If this is the first
3612 thing in the branch, there can be no first char setting, whatever the
3613 repeat count. Any reqbyte setting must remain unchanged after any kind of
3614 repeat. */
3616 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3617 zerofirstbyte = firstbyte;
3618 zeroreqbyte = reqbyte;
3620 /* If there are characters with values > 255, we have to compile an
3621 extended class, with its own opcode. If there are no characters < 256,
3622 we can omit the bitmap in the actual compiled code. */
3624 #ifdef SUPPORT_UTF8
3625 if (class_utf8)
3627 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3628 *code++ = OP_XCLASS;
3629 code += LINK_SIZE;
3630 *code = negate_class? XCL_NOT : 0;
3632 /* If the map is required, move up the extra data to make room for it;
3633 otherwise just move the code pointer to the end of the extra data. */
3635 if (class_charcount > 0)
3637 *code++ |= XCL_MAP;
3638 VMPI_memmove(code + 32, code, class_utf8data - code);
3639 VMPI_memcpy(code, classbits, 32);
3640 code = class_utf8data + 32;
3642 else code = class_utf8data;
3644 /* Now fill in the complete length of the item */
3646 PUT(previous, 1, code - previous);
3647 break; /* End of class handling */
3649 #endif
3651 /* If there are no characters > 255, negate the 32-byte map if necessary,
3652 and copy it into the code vector. If this is the first thing in the branch,
3653 there can be no first char setting, whatever the repeat count. Any reqbyte
3654 setting must remain unchanged after any kind of repeat. */
3656 if (negate_class)
3658 *code++ = OP_NCLASS;
3659 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3660 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3662 else
3664 /* cn: code added here for proper UTF-8 handling of \S (not whitespace char) \D (not a digit char) and \W (not a word char) */
3665 // The class maps for \S \D \W are the inverted maps for \s \d and \w. The class maps only handle comparisions with
3666 // chars < 255, however, and we need successful matches for chars > 255 when comparing to \S \D and \W. The
3667 // interpreter code for OP_CLASS will fail the match for any character > 255, (which is correct for \s \d \w).
3668 // The interpreter code for OP_NCLASS will succeed for any character > 255 (which is correct of ^\s, ^\d, ^\w).
3669 // Since \S is equivalent to ^\s, we need to use OP_NCLASS for \S \D and \W (inverted_class == true in that case).
3670 // Another option would be to create an OP_XCLASS custom bitmap just to handle all chars > 255 for \S \D and \W.
3671 // though that would execute slower.
3673 // original code: *code++ = OP_CLASS;
3674 *code++ = (inverted_class ? OP_NCLASS : OP_CLASS);
3676 /* cn: end modified code */
3677 VMPI_memcpy(code, classbits, 32);
3679 code += 32;
3680 break;
3683 /* ===================================================================*/
3684 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3685 has been tested above. */
3687 case '{':
3688 if (!is_quantifier) goto NORMAL_CHAR;
3689 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3690 if (*errorcodeptr != 0) goto FAILED;
3691 goto REPEAT;
3693 case '*':
3694 repeat_min = 0;
3695 repeat_max = -1;
3696 goto REPEAT;
3698 case '+':
3699 repeat_min = 1;
3700 repeat_max = -1;
3701 goto REPEAT;
3703 case '?':
3704 repeat_min = 0;
3705 repeat_max = 1;
3707 REPEAT:
3708 if (previous == NULL)
3710 *errorcodeptr = ERR9;
3711 goto FAILED;
3714 if (repeat_min == 0)
3716 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3717 reqbyte = zeroreqbyte; /* Ditto */
3720 /* Remember whether this is a variable length repeat */
3722 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3724 op_type = 0; /* Default single-char op codes */
3725 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3727 /* Save start of previous item, in case we have to move it up to make space
3728 for an inserted OP_ONCE for the additional '+' extension. */
3730 tempcode = previous;
3732 /* If the next character is '+', we have a possessive quantifier. This
3733 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3734 If the next character is '?' this is a minimizing repeat, by default,
3735 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3736 repeat type to the non-default. */
3738 if (ptr[1] == '+')
3740 repeat_type = 0; /* Force greedy */
3741 possessive_quantifier = TRUE;
3742 ptr++;
3744 else if (ptr[1] == '?')
3746 repeat_type = greedy_non_default;
3747 ptr++;
3749 else repeat_type = greedy_default;
3751 /* If previous was a character match, abolish the item and generate a
3752 repeat item instead. If a char item has a minumum of more than one, ensure
3753 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3754 the first thing in a branch because the x will have gone into firstbyte
3755 instead. */
3757 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3759 /* Deal with UTF-8 characters that take up more than one byte. It's
3760 easier to write this out separately than try to macrify it. Use c to
3761 hold the length of the character in bytes, plus 0x80 to flag that it's a
3762 length rather than a small character. */
3764 #ifdef SUPPORT_UTF8
3765 if (utf8 && (code[-1] & 0x80) != 0)
3767 uschar *lastchar = code - 1;
3768 while((*lastchar & 0xc0) == 0x80) lastchar--;
3769 c = code - lastchar; /* Length of UTF-8 character */
3770 VMPI_memcpy(utf8_char, lastchar, c); /* Save the char */
3771 c |= 0x80; /* Flag c as a length */
3773 else
3774 #endif
3776 /* Handle the case of a single byte - either with no UTF8 support, or
3777 with UTF-8 disabled, or for a UTF-8 character < 128. */
3780 c = code[-1];
3781 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3784 /* If the repetition is unlimited, it pays to see if the next thing on
3785 the line is something that cannot possibly match this character. If so,
3786 automatically possessifying this item gains some performance in the case
3787 where the match fails. */
3789 if (!possessive_quantifier &&
3790 repeat_max < 0 &&
3791 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3792 options, cd))
3794 repeat_type = 0; /* Force greedy */
3795 possessive_quantifier = TRUE;
3798 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3801 /* If previous was a single negated character ([^a] or similar), we use
3802 one of the special opcodes, replacing it. The code is shared with single-
3803 character repeats by setting opt_type to add a suitable offset into
3804 repeat_type. We can also test for auto-possessification. OP_NOT is
3805 currently used only for single-byte chars. */
3807 else if (*previous == OP_NOT)
3809 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3810 c = previous[1];
3811 if (!possessive_quantifier &&
3812 repeat_max < 0 &&
3813 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3815 repeat_type = 0; /* Force greedy */
3816 possessive_quantifier = TRUE;
3818 goto OUTPUT_SINGLE_REPEAT;
3821 /* If previous was a character type match (\d or similar), abolish it and
3822 create a suitable repeat item. The code is shared with single-character
3823 repeats by setting op_type to add a suitable offset into repeat_type. Note
3824 the the Unicode property types will be present only when SUPPORT_UCP is
3825 defined, but we don't wrap the little bits of code here because it just
3826 makes it horribly messy. */
3828 else if (*previous < OP_EODN)
3830 uschar *oldcode;
3831 int prop_type, prop_value;
3832 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3833 c = *previous;
3835 if (!possessive_quantifier &&
3836 repeat_max < 0 &&
3837 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3839 repeat_type = 0; /* Force greedy */
3840 possessive_quantifier = TRUE;
3843 OUTPUT_SINGLE_REPEAT:
3844 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3846 prop_type = previous[1];
3847 prop_value = previous[2];
3849 else prop_type = prop_value = -1;
3851 oldcode = code;
3852 code = previous; /* Usually overwrite previous item */
3854 /* If the maximum is zero then the minimum must also be zero; Perl allows
3855 this case, so we do too - by simply omitting the item altogether. */
3857 if (repeat_max == 0) goto END_REPEAT;
3859 /* All real repeats make it impossible to handle partial matching (maybe
3860 one day we will be able to remove this restriction). */
3862 if (repeat_max != 1) cd->nopartial = TRUE;
3864 /* Combine the op_type with the repeat_type */
3866 repeat_type += op_type;
3868 /* A minimum of zero is handled either as the special case * or ?, or as
3869 an UPTO, with the maximum given. */
3871 if (repeat_min == 0)
3873 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3874 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3875 else
3877 *code++ = OP_UPTO + repeat_type;
3878 PUT2INC(code, 0, repeat_max);
3882 /* A repeat minimum of 1 is optimized into some special cases. If the
3883 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3884 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3885 one less than the maximum. */
3887 else if (repeat_min == 1)
3889 if (repeat_max == -1)
3890 *code++ = OP_PLUS + repeat_type;
3891 else
3893 code = oldcode; /* leave previous item in place */
3894 if (repeat_max == 1) goto END_REPEAT;
3895 *code++ = OP_UPTO + repeat_type;
3896 PUT2INC(code, 0, repeat_max - 1);
3900 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3901 handled as an EXACT followed by an UPTO. */
3903 else
3905 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3906 PUT2INC(code, 0, repeat_min);
3908 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3909 we have to insert the character for the previous code. For a repeated
3910 Unicode property match, there are two extra bytes that define the
3911 required property. In UTF-8 mode, long characters have their length in
3912 c, with the 0x80 bit as a flag. */
3914 if (repeat_max < 0)
3916 #ifdef SUPPORT_UTF8
3917 if (utf8 && c >= 128)
3919 VMPI_memcpy(code, utf8_char, c & 7);
3920 code += c & 7;
3922 else
3923 #endif
3925 *code++ = c;
3926 if (prop_type >= 0)
3928 *code++ = prop_type;
3929 *code++ = prop_value;
3932 *code++ = OP_STAR + repeat_type;
3935 /* Else insert an UPTO if the max is greater than the min, again
3936 preceded by the character, for the previously inserted code. If the
3937 UPTO is just for 1 instance, we can use QUERY instead. */
3939 else if (repeat_max != repeat_min)
3941 #ifdef SUPPORT_UTF8
3942 if (utf8 && c >= 128)
3944 VMPI_memcpy(code, utf8_char, c & 7);
3945 code += c & 7;
3947 else
3948 #endif
3949 *code++ = c;
3950 if (prop_type >= 0)
3952 *code++ = prop_type;
3953 *code++ = prop_value;
3955 repeat_max -= repeat_min;
3957 if (repeat_max == 1)
3959 *code++ = OP_QUERY + repeat_type;
3961 else
3963 *code++ = OP_UPTO + repeat_type;
3964 PUT2INC(code, 0, repeat_max);
3969 /* The character or character type itself comes last in all cases. */
3971 #ifdef SUPPORT_UTF8
3972 if (utf8 && c >= 128)
3974 VMPI_memcpy(code, utf8_char, c & 7);
3975 code += c & 7;
3977 else
3978 #endif
3979 *code++ = c;
3981 /* For a repeated Unicode property match, there are two extra bytes that
3982 define the required property. */
3984 #ifdef SUPPORT_UCP
3985 if (prop_type >= 0)
3987 *code++ = prop_type;
3988 *code++ = prop_value;
3990 #endif
3993 /* If previous was a character class or a back reference, we put the repeat
3994 stuff after it, but just skip the item if the repeat was {0,0}. */
3996 else if (*previous == OP_CLASS ||
3997 *previous == OP_NCLASS ||
3998 #ifdef SUPPORT_UTF8
3999 *previous == OP_XCLASS ||
4000 #endif
4001 *previous == OP_REF)
4003 if (repeat_max == 0)
4005 code = previous;
4006 goto END_REPEAT;
4009 /* All real repeats make it impossible to handle partial matching (maybe
4010 one day we will be able to remove this restriction). */
4012 if (repeat_max != 1) cd->nopartial = TRUE;
4014 if (repeat_min == 0 && repeat_max == -1)
4015 *code++ = OP_CRSTAR + repeat_type;
4016 else if (repeat_min == 1 && repeat_max == -1)
4017 *code++ = OP_CRPLUS + repeat_type;
4018 else if (repeat_min == 0 && repeat_max == 1)
4019 *code++ = OP_CRQUERY + repeat_type;
4020 else
4022 *code++ = OP_CRRANGE + repeat_type;
4023 PUT2INC(code, 0, repeat_min);
4024 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
4025 PUT2INC(code, 0, repeat_max);
4029 /* If previous was a bracket group, we may have to replicate it in certain
4030 cases. */
4032 else if (*previous == OP_BRA || *previous == OP_CBRA ||
4033 *previous == OP_ONCE || *previous == OP_COND)
4035 register int i;
4036 int ketoffset = 0;
4037 int len = code - previous;
4038 uschar *bralink = NULL;
4040 /* Repeating a DEFINE group is pointless */
4042 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4044 *errorcodeptr = ERR55;
4045 goto FAILED;
4048 /* If the maximum repeat count is unlimited, find the end of the bracket
4049 by scanning through from the start, and compute the offset back to it
4050 from the current code pointer. There may be an OP_OPT setting following
4051 the final KET, so we can't find the end just by going back from the code
4052 pointer. */
4054 if (repeat_max == -1)
4056 register uschar *ket = previous;
4057 do ket += GET(ket, 1); while (*ket != OP_KET);
4058 ketoffset = code - ket;
4061 /* The case of a zero minimum is special because of the need to stick
4062 OP_BRAZERO in front of it, and because the group appears once in the
4063 data, whereas in other cases it appears the minimum number of times. For
4064 this reason, it is simplest to treat this case separately, as otherwise
4065 the code gets far too messy. There are several special subcases when the
4066 minimum is zero. */
4068 if (repeat_min == 0)
4070 /* If the maximum is also zero, we just omit the group from the output
4071 altogether. */
4073 if (repeat_max == 0)
4075 code = previous;
4076 goto END_REPEAT;
4079 /* If the maximum is 1 or unlimited, we just have to stick in the
4080 BRAZERO and do no more at this point. However, we do need to adjust
4081 any OP_RECURSE calls inside the group that refer to the group itself or
4082 any internal or forward referenced group, because the offset is from
4083 the start of the whole regex. Temporarily terminate the pattern while
4084 doing this. */
4086 if (repeat_max <= 1)
4088 *code = OP_END;
4089 adjust_recurse(previous, 1, utf8, cd, save_hwm);
4090 VMPI_memmove(previous+1, previous, len);
4091 code++;
4092 *previous++ = OP_BRAZERO + repeat_type;
4095 /* If the maximum is greater than 1 and limited, we have to replicate
4096 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
4097 The first one has to be handled carefully because it's the original
4098 copy, which has to be moved up. The remainder can be handled by code
4099 that is common with the non-zero minimum case below. We have to
4100 adjust the value or repeat_max, since one less copy is required. Once
4101 again, we may have to adjust any OP_RECURSE calls inside the group. */
4103 else
4105 int offset;
4106 *code = OP_END;
4107 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4108 VMPI_memmove(previous + 2 + LINK_SIZE, previous, len);
4109 code += 2 + LINK_SIZE;
4110 *previous++ = OP_BRAZERO + repeat_type;
4111 *previous++ = OP_BRA;
4113 /* We chain together the bracket offset fields that have to be
4114 filled in later when the ends of the brackets are reached. */
4116 offset = (bralink == NULL)? 0 : previous - bralink;
4117 bralink = previous;
4118 PUTINC(previous, 0, offset);
4121 repeat_max--;
4124 /* If the minimum is greater than zero, replicate the group as many
4125 times as necessary, and adjust the maximum to the number of subsequent
4126 copies that we need. If we set a first char from the group, and didn't
4127 set a required char, copy the latter from the former. If there are any
4128 forward reference subroutine calls in the group, there will be entries on
4129 the workspace list; replicate these with an appropriate increment. */
4131 else
4133 if (repeat_min > 1)
4135 /* In the pre-compile phase, we don't actually do the replication. We
4136 just adjust the length as if we had. Do some paranoid checks for
4137 potential integer overflow. */
4139 if (lengthptr != NULL)
4141 int delta = (repeat_min - 1)*length_prevgroup;
4142 if ((double)(repeat_min - 1)*(double)length_prevgroup >
4143 (double)INT_MAX ||
4144 OFLOW_MAX - *lengthptr < delta)
4146 *errorcodeptr = ERR20;
4147 goto FAILED;
4149 *lengthptr += delta;
4152 /* This is compiling for real */
4154 else
4156 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4157 for (i = 1; i < repeat_min; i++)
4159 uschar *hc;
4160 uschar *this_hwm = cd->hwm;
4161 VMPI_memcpy(code, previous, len);
4162 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4164 PUT(cd->hwm, 0, GET(hc, 0) + len);
4165 cd->hwm += LINK_SIZE;
4167 save_hwm = this_hwm;
4168 code += len;
4173 if (repeat_max > 0) repeat_max -= repeat_min;
4176 /* This code is common to both the zero and non-zero minimum cases. If
4177 the maximum is limited, it replicates the group in a nested fashion,
4178 remembering the bracket starts on a stack. In the case of a zero minimum,
4179 the first one was set up above. In all cases the repeat_max now specifies
4180 the number of additional copies needed. Again, we must remember to
4181 replicate entries on the forward reference list. */
4183 if (repeat_max >= 0)
4185 /* In the pre-compile phase, we don't actually do the replication. We
4186 just adjust the length as if we had. For each repetition we must add 1
4187 to the length for BRAZERO and for all but the last repetition we must
4188 add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
4189 paranoid checks to avoid integer overflow. */
4191 if (lengthptr != NULL && repeat_max > 0)
4193 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4194 2 - 2*LINK_SIZE; /* Last one doesn't nest */
4195 if ((double)repeat_max *
4196 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4197 > (double)INT_MAX ||
4198 OFLOW_MAX - *lengthptr < delta)
4200 *errorcodeptr = ERR20;
4201 goto FAILED;
4203 *lengthptr += delta;
4206 /* This is compiling for real */
4208 else for (i = repeat_max - 1; i >= 0; i--)
4210 uschar *hc;
4211 uschar *this_hwm = cd->hwm;
4213 *code++ = OP_BRAZERO + repeat_type;
4215 /* All but the final copy start a new nesting, maintaining the
4216 chain of brackets outstanding. */
4218 if (i != 0)
4220 int offset;
4221 *code++ = OP_BRA;
4222 offset = (bralink == NULL)? 0 : code - bralink;
4223 bralink = code;
4224 PUTINC(code, 0, offset);
4227 VMPI_memcpy(code, previous, len);
4228 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4230 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4231 cd->hwm += LINK_SIZE;
4233 save_hwm = this_hwm;
4234 code += len;
4237 /* Now chain through the pending brackets, and fill in their length
4238 fields (which are holding the chain links pro tem). */
4240 while (bralink != NULL)
4242 int oldlinkoffset;
4243 int offset = code - bralink + 1;
4244 uschar *bra = code - offset;
4245 oldlinkoffset = GET(bra, 1);
4246 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4247 *code++ = OP_KET;
4248 PUTINC(code, 0, offset);
4249 PUT(bra, 1, offset);
4253 /* If the maximum is unlimited, set a repeater in the final copy. We
4254 can't just offset backwards from the current code point, because we
4255 don't know if there's been an options resetting after the ket. The
4256 correct offset was computed above.
4258 Then, when we are doing the actual compile phase, check to see whether
4259 this group is a non-atomic one that could match an empty string. If so,
4260 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4261 that runtime checking can be done. [This check is also applied to
4262 atomic groups at runtime, but in a different way.] */
4264 else
4266 uschar *ketcode = code - ketoffset;
4267 uschar *bracode = ketcode - GET(ketcode, 1);
4268 *ketcode = OP_KETRMAX + repeat_type;
4269 if (lengthptr == NULL && *bracode != OP_ONCE)
4271 uschar *scode = bracode;
4274 if (could_be_empty_branch(scode, ketcode, utf8))
4276 *bracode += OP_SBRA - OP_BRA;
4277 break;
4279 scode += GET(scode, 1);
4281 while (*scode == OP_ALT);
4286 /* Else there's some kind of shambles */
4288 else
4290 *errorcodeptr = ERR11;
4291 goto FAILED;
4294 /* If the character following a repeat is '+', or if certain optimization
4295 tests above succeeded, possessive_quantifier is TRUE. For some of the
4296 simpler opcodes, there is an special alternative opcode for this. For
4297 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4298 The '+' notation is just syntactic sugar, taken from Sun's Java package,
4299 but the special opcodes can optimize it a bit. The repeated item starts at
4300 tempcode, not at previous, which might be the first part of a string whose
4301 (former) last char we repeated.
4303 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4304 an 'upto' may follow. We skip over an 'exact' item, and then test the
4305 length of what remains before proceeding. */
4307 if (possessive_quantifier)
4309 int len;
4310 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4311 *tempcode == OP_NOTEXACT)
4312 tempcode += _pcre_OP_lengths[*tempcode];
4313 len = code - tempcode;
4314 if (len > 0) switch (*tempcode)
4316 case OP_STAR: *tempcode = OP_POSSTAR; break;
4317 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4318 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4319 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4321 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4322 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4323 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4324 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4326 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4327 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4328 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4329 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4331 default:
4332 VMPI_memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4333 code += 1 + LINK_SIZE;
4334 len += 1 + LINK_SIZE;
4335 tempcode[0] = OP_ONCE;
4336 *code++ = OP_KET;
4337 PUTINC(code, 0, len);
4338 PUT(tempcode, 1, len);
4339 break;
4343 /* In all case we no longer have a previous item. We also set the
4344 "follows varying string" flag for subsequently encountered reqbytes if
4345 it isn't already set and we have just passed a varying length item. */
4347 END_REPEAT:
4348 previous = NULL;
4349 cd->req_varyopt |= reqvary;
4350 break;
4353 /* ===================================================================*/
4354 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4355 lookbehind or option setting or condition or all the other extended
4356 parenthesis forms. */
4358 case '(':
4359 newoptions = options;
4360 skipbytes = 0;
4361 bravalue = OP_CBRA;
4362 save_hwm = cd->hwm;
4363 reset_bracount = FALSE;
4365 /* First deal with various "verbs" that can be introduced by '*'. */
4367 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4369 int i, namelen;
4370 const uschar *name = ++ptr;
4371 previous = NULL;
4372 while ((cd->ctypes[*++ptr] & ctype_letter) != 0){}
4373 if (*ptr == ':')
4375 *errorcodeptr = ERR59; /* Not supported */
4376 goto FAILED;
4378 if (*ptr != ')')
4380 *errorcodeptr = ERR60;
4381 goto FAILED;
4383 namelen = ptr - name;
4384 for (i = 0; i < verbcount; i++)
4386 if (namelen == verbs[i].len &&
4387 VMPI_strncmp((char *)name, verbs[i].name, namelen) == 0)
4389 *code = verbs[i].op;
4390 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4391 break;
4394 if (i < verbcount) continue;
4395 *errorcodeptr = ERR60;
4396 goto FAILED;
4399 /* Deal with the extended parentheses; all are introduced by '?', and the
4400 appearance of any of them means that this is not a capturing group. */
4402 else if (*ptr == '?')
4404 int i, set, unset, namelen;
4405 int *optset;
4406 const uschar *name;
4407 uschar *slot;
4409 switch (*(++ptr))
4411 case '#': /* Comment; skip to ket */
4412 ptr++;
4413 while (*ptr != 0 && *ptr != ')') ptr++;
4414 if (*ptr == 0)
4416 *errorcodeptr = ERR18;
4417 goto FAILED;
4419 continue;
4422 /* ------------------------------------------------------------ */
4423 case '|': /* Reset capture count for each branch */
4424 reset_bracount = TRUE;
4425 /* Fall through */
4427 /* ------------------------------------------------------------ */
4428 case ':': /* Non-capturing bracket */
4429 bravalue = OP_BRA;
4430 ptr++;
4431 break;
4434 /* ------------------------------------------------------------ */
4435 case '(':
4436 bravalue = OP_COND; /* Conditional group */
4438 /* A condition can be an assertion, a number (referring to a numbered
4439 group), a name (referring to a named group), or 'R', referring to
4440 recursion. R<digits> and R&name are also permitted for recursion tests.
4442 There are several syntaxes for testing a named group: (?(name)) is used
4443 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4445 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4446 be the recursive thing or the name 'R' (and similarly for 'R' followed
4447 by digits), and (b) a number could be a name that consists of digits.
4448 In both cases, we look for a name first; if not found, we try the other
4449 cases. */
4451 /* For conditions that are assertions, check the syntax, and then exit
4452 the switch. This will take control down to where bracketed groups,
4453 including assertions, are processed. */
4455 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4456 break;
4458 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4459 below), and all need to skip 3 bytes at the start of the group. */
4461 code[1+LINK_SIZE] = OP_CREF;
4462 skipbytes = 3;
4463 refsign = -1;
4465 /* Check for a test for recursion in a named group. */
4467 if (ptr[1] == 'R' && ptr[2] == '&')
4469 terminator = -1;
4470 ptr += 2;
4471 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4474 /* Check for a test for a named group's having been set, using the Perl
4475 syntax (?(<name>) or (?('name') */
4477 else if (ptr[1] == '<')
4479 terminator = '>';
4480 ptr++;
4482 else if (ptr[1] == '\'')
4484 terminator = '\'';
4485 ptr++;
4487 else
4489 terminator = 0;
4490 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4493 /* We now expect to read a name; any thing else is an error */
4495 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4497 ptr += 1; /* To get the right offset */
4498 *errorcodeptr = ERR28;
4499 goto FAILED;
4502 /* Read the name, but also get it as a number if it's all digits */
4504 recno = 0;
4505 name = ++ptr;
4506 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4508 if (recno >= 0)
4509 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4510 recno * 10 + *ptr - '0' : -1;
4511 ptr++;
4513 namelen = ptr - name;
4515 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4517 ptr--; /* Error offset */
4518 *errorcodeptr = ERR26;
4519 goto FAILED;
4522 /* Do no further checking in the pre-compile phase. */
4524 if (lengthptr != NULL) break;
4526 /* In the real compile we do the work of looking for the actual
4527 reference. If the string started with "+" or "-" we require the rest to
4528 be digits, in which case recno will be set. */
4530 if (refsign > 0)
4532 if (recno <= 0)
4534 *errorcodeptr = ERR58;
4535 goto FAILED;
4537 if (refsign == '-')
4539 recno = cd->bracount - recno + 1;
4540 if (recno <= 0)
4542 *errorcodeptr = ERR15;
4543 goto FAILED;
4546 else recno += cd->bracount;
4547 PUT2(code, 2+LINK_SIZE, recno);
4548 break;
4551 /* Otherwise (did not start with "+" or "-"), start by looking for the
4552 name. */
4554 slot = cd->name_table;
4555 for (i = 0; i < cd->names_found; i++)
4557 if (VMPI_strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4558 slot += cd->name_entry_size;
4561 /* Found a previous named subpattern */
4563 if (i < cd->names_found)
4565 recno = GET2(slot, 0);
4566 PUT2(code, 2+LINK_SIZE, recno);
4569 /* Search the pattern for a forward reference */
4571 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4572 (options & PCRE_EXTENDED) != 0)) > 0)
4574 PUT2(code, 2+LINK_SIZE, i);
4577 /* If terminator == 0 it means that the name followed directly after
4578 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4579 some further alternatives to try. For the cases where terminator != 0
4580 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4581 now checked all the possibilities, so give an error. */
4583 else if (terminator != 0)
4585 *errorcodeptr = ERR15;
4586 goto FAILED;
4589 /* Check for (?(R) for recursion. Allow digits after R to specify a
4590 specific group number. */
4592 else if (*name == 'R')
4594 recno = 0;
4595 for (i = 1; i < namelen; i++)
4597 if ((digitab[name[i]] & ctype_digit) == 0)
4599 *errorcodeptr = ERR15;
4600 goto FAILED;
4602 recno = recno * 10 + name[i] - '0';
4604 if (recno == 0) recno = RREF_ANY;
4605 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4606 PUT2(code, 2+LINK_SIZE, recno);
4609 /* Similarly, check for the (?(DEFINE) "condition", which is always
4610 false. */
4612 else if (namelen == 6 && VMPI_strncmp((char *)name, "DEFINE", 6) == 0)
4614 code[1+LINK_SIZE] = OP_DEF;
4615 skipbytes = 1;
4618 /* Check for the "name" actually being a subpattern number. */
4620 else if (recno > 0)
4622 PUT2(code, 2+LINK_SIZE, recno);
4625 /* Either an unidentified subpattern, or a reference to (?(0) */
4627 else
4629 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4630 goto FAILED;
4632 break;
4635 /* ------------------------------------------------------------ */
4636 case '=': /* Positive lookahead */
4637 bravalue = OP_ASSERT;
4638 ptr++;
4639 break;
4642 /* ------------------------------------------------------------ */
4643 case '!': /* Negative lookahead */
4644 ptr++;
4645 if (*ptr == ')') /* Optimize (?!) */
4647 *code++ = OP_FAIL;
4648 previous = NULL;
4649 continue;
4651 bravalue = OP_ASSERT_NOT;
4652 break;
4655 /* ------------------------------------------------------------ */
4656 case '<': /* Lookbehind or named define */
4657 switch (ptr[1])
4659 case '=': /* Positive lookbehind */
4660 bravalue = OP_ASSERTBACK;
4661 ptr += 2;
4662 break;
4664 case '!': /* Negative lookbehind */
4665 bravalue = OP_ASSERTBACK_NOT;
4666 ptr += 2;
4667 break;
4669 default: /* Could be name define, else bad */
4670 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4671 ptr++; /* Correct offset for error */
4672 *errorcodeptr = ERR24;
4673 goto FAILED;
4675 break;
4678 /* ------------------------------------------------------------ */
4679 case '>': /* One-time brackets */
4680 bravalue = OP_ONCE;
4681 ptr++;
4682 break;
4685 /* ------------------------------------------------------------ */
4686 case 'C': /* Callout - may be followed by digits; */
4687 previous_callout = code; /* Save for later completion */
4688 after_manual_callout = 1; /* Skip one item before completing */
4689 *code++ = OP_CALLOUT;
4691 int n = 0;
4692 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4693 n = n * 10 + *ptr - '0';
4694 if (*ptr != ')')
4696 *errorcodeptr = ERR39;
4697 goto FAILED;
4699 if (n > 255)
4701 *errorcodeptr = ERR38;
4702 goto FAILED;
4704 *code++ = n;
4705 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4706 PUT(code, LINK_SIZE, 0); /* Default length */
4707 code += 2 * LINK_SIZE;
4709 previous = NULL;
4710 continue;
4713 /* ------------------------------------------------------------ */
4714 case 'P': /* Python-style named subpattern handling */
4715 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4717 is_recurse = *ptr == '>';
4718 terminator = ')';
4719 goto NAMED_REF_OR_RECURSE;
4721 else if (*ptr != '<') /* Test for Python-style definition */
4723 *errorcodeptr = ERR41;
4724 goto FAILED;
4726 /* Fall through to handle (?P< as (?< is handled */
4729 /* ------------------------------------------------------------ */
4730 DEFINE_NAME: /* Come here from (?< handling */
4731 case '\'':
4733 terminator = (*ptr == '<')? '>' : '\'';
4734 name = ++ptr;
4736 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4737 namelen = ptr - name;
4739 /* In the pre-compile phase, just do a syntax check. */
4741 if (lengthptr != NULL)
4743 if (*ptr != terminator)
4745 *errorcodeptr = ERR42;
4746 goto FAILED;
4748 if (cd->names_found >= MAX_NAME_COUNT)
4750 *errorcodeptr = ERR49;
4751 goto FAILED;
4753 if (namelen + 3 > cd->name_entry_size)
4755 cd->name_entry_size = namelen + 3;
4756 if (namelen > MAX_NAME_SIZE)
4758 *errorcodeptr = ERR48;
4759 goto FAILED;
4764 /* In the real compile, create the entry in the table */
4766 else
4768 slot = cd->name_table;
4769 for (i = 0; i < cd->names_found; i++)
4771 int crc = VMPI_memcmp(name, slot+2, namelen);
4772 if (crc == 0)
4774 if (slot[2+namelen] == 0)
4776 if ((options & PCRE_DUPNAMES) == 0)
4778 *errorcodeptr = ERR43;
4779 goto FAILED;
4782 else crc = -1; /* Current name is substring */
4784 if (crc < 0)
4786 VMPI_memmove(slot + cd->name_entry_size, slot,
4787 (cd->names_found - i) * cd->name_entry_size);
4788 break;
4790 slot += cd->name_entry_size;
4793 PUT2(slot, 0, cd->bracount + 1);
4794 VMPI_memcpy(slot + 2, name, namelen);
4795 slot[2+namelen] = 0;
4799 /* In both cases, count the number of names we've encountered. */
4801 ptr++; /* Move past > or ' */
4802 cd->names_found++;
4803 goto NUMBERED_GROUP;
4806 /* ------------------------------------------------------------ */
4807 case '&': /* Perl recursion/subroutine syntax */
4808 terminator = ')';
4809 is_recurse = TRUE;
4810 /* Fall through */
4812 /* We come here from the Python syntax above that handles both
4813 references (?P=name) and recursion (?P>name), as well as falling
4814 through from the Perl recursion syntax (?&name). */
4816 NAMED_REF_OR_RECURSE:
4817 name = ++ptr;
4818 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4819 namelen = ptr - name;
4821 /* In the pre-compile phase, do a syntax check and set a dummy
4822 reference number. */
4824 if (lengthptr != NULL)
4826 if (*ptr != terminator)
4828 *errorcodeptr = ERR42;
4829 goto FAILED;
4831 if (namelen > MAX_NAME_SIZE)
4833 *errorcodeptr = ERR48;
4834 goto FAILED;
4836 recno = 0;
4839 /* In the real compile, seek the name in the table */
4841 else
4843 slot = cd->name_table;
4844 for (i = 0; i < cd->names_found; i++)
4846 if (VMPI_strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4847 slot += cd->name_entry_size;
4850 if (i < cd->names_found) /* Back reference */
4852 recno = GET2(slot, 0);
4854 else if ((recno = /* Forward back reference */
4855 find_parens(ptr, cd->bracount, name, namelen,
4856 (options & PCRE_EXTENDED) != 0)) <= 0)
4858 *errorcodeptr = ERR15;
4859 goto FAILED;
4863 /* In both phases, we can now go to the code than handles numerical
4864 recursion or backreferences. */
4866 if (is_recurse) goto HANDLE_RECURSION;
4867 else goto HANDLE_REFERENCE;
4870 /* ------------------------------------------------------------ */
4871 case 'R': /* Recursion */
4872 ptr++; /* Same as (?0) */
4873 /* Fall through */
4876 /* ------------------------------------------------------------ */
4877 case '-': case '+':
4878 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4879 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4881 const uschar *called;
4883 if ((refsign = *ptr) == '+') ptr++;
4884 else if (refsign == '-')
4886 if ((digitab[ptr[1]] & ctype_digit) == 0)
4887 goto OTHER_CHAR_AFTER_QUERY;
4888 ptr++;
4891 recno = 0;
4892 while((digitab[*ptr] & ctype_digit) != 0)
4893 recno = recno * 10 + *ptr++ - '0';
4895 if (*ptr != ')')
4897 *errorcodeptr = ERR29;
4898 goto FAILED;
4901 if (refsign == '-')
4903 if (recno == 0)
4905 *errorcodeptr = ERR58;
4906 goto FAILED;
4908 recno = cd->bracount - recno + 1;
4909 if (recno <= 0)
4911 *errorcodeptr = ERR15;
4912 goto FAILED;
4915 else if (refsign == '+')
4917 if (recno == 0)
4919 *errorcodeptr = ERR58;
4920 goto FAILED;
4922 recno += cd->bracount;
4925 /* Come here from code above that handles a named recursion */
4927 HANDLE_RECURSION:
4929 previous = code;
4930 called = cd->start_code;
4932 /* When we are actually compiling, find the bracket that is being
4933 referenced. Temporarily end the regex in case it doesn't exist before
4934 this point. If we end up with a forward reference, first check that
4935 the bracket does occur later so we can give the error (and position)
4936 now. Then remember this forward reference in the workspace so it can
4937 be filled in at the end. */
4939 if (lengthptr == NULL)
4941 *code = OP_END;
4942 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4944 /* Forward reference */
4946 if (called == NULL)
4948 if (find_parens(ptr, cd->bracount, NULL, recno,
4949 (options & PCRE_EXTENDED) != 0) < 0)
4951 *errorcodeptr = ERR15;
4952 goto FAILED;
4954 called = cd->start_code + recno;
4955 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4958 /* If not a forward reference, and the subpattern is still open,
4959 this is a recursive call. We check to see if this is a left
4960 recursion that could loop for ever, and diagnose that case. */
4962 else if (GET(called, 1) == 0 &&
4963 could_be_empty(called, code, bcptr, utf8))
4965 *errorcodeptr = ERR40;
4966 goto FAILED;
4970 /* Insert the recursion/subroutine item, automatically wrapped inside
4971 "once" brackets. Set up a "previous group" length so that a
4972 subsequent quantifier will work. */
4974 *code = OP_ONCE;
4975 PUT(code, 1, 2 + 2*LINK_SIZE);
4976 code += 1 + LINK_SIZE;
4978 *code = OP_RECURSE;
4979 PUT(code, 1, called - cd->start_code);
4980 code += 1 + LINK_SIZE;
4982 *code = OP_KET;
4983 PUT(code, 1, 2 + 2*LINK_SIZE);
4984 code += 1 + LINK_SIZE;
4986 length_prevgroup = 3 + 3*LINK_SIZE;
4989 /* Can't determine a first byte now */
4991 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4992 continue;
4995 /* ------------------------------------------------------------ */
4996 default: /* Other characters: check option setting */
4997 OTHER_CHAR_AFTER_QUERY:
4998 set = unset = 0;
4999 optset = &set;
5001 while (*ptr != ')' && *ptr != ':')
5003 switch (*ptr++)
5005 case '-': optset = &unset; break;
5007 case 'J': /* Record that it changed in the external options */
5008 *optset |= PCRE_DUPNAMES;
5009 cd->external_options |= PCRE_JCHANGED;
5010 break;
5012 case 'i': *optset |= PCRE_CASELESS; break;
5013 case 'm': *optset |= PCRE_MULTILINE; break;
5014 case 's': *optset |= PCRE_DOTALL; break;
5015 case 'x': *optset |= PCRE_EXTENDED; break;
5016 case 'U': *optset |= PCRE_UNGREEDY; break;
5017 case 'X': *optset |= PCRE_EXTRA; break;
5019 default: *errorcodeptr = ERR12;
5020 ptr--; /* Correct the offset */
5021 goto FAILED;
5025 /* Set up the changed option bits, but don't change anything yet. */
5027 newoptions = (options | set) & (~unset);
5029 /* If the options ended with ')' this is not the start of a nested
5030 group with option changes, so the options change at this level. If this
5031 item is right at the start of the pattern, the options can be
5032 abstracted and made external in the pre-compile phase, and ignored in
5033 the compile phase. This can be helpful when matching -- for instance in
5034 caseless checking of required bytes.
5036 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
5037 definitely *not* at the start of the pattern because something has been
5038 compiled. In the pre-compile phase, however, the code pointer can have
5039 that value after the start, because it gets reset as code is discarded
5040 during the pre-compile. However, this can happen only at top level - if
5041 we are within parentheses, the starting BRA will still be present. At
5042 any parenthesis level, the length value can be used to test if anything
5043 has been compiled at that level. Thus, a test for both these conditions
5044 is necessary to ensure we correctly detect the start of the pattern in
5045 both phases.
5047 If we are not at the pattern start, compile code to change the ims
5048 options if this setting actually changes any of them, and reset the
5049 greedy defaults and the case value for firstbyte and reqbyte. */
5051 if (*ptr == ')')
5053 if (code == cd->start_code + 1 + LINK_SIZE &&
5054 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
5056 cd->external_options = newoptions;
5058 else
5060 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
5062 *code++ = OP_OPT;
5063 *code++ = newoptions & PCRE_IMS;
5065 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
5066 greedy_non_default = greedy_default ^ 1;
5067 req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
5070 /* Change options at this level, and pass them back for use
5071 in subsequent branches. When not at the start of the pattern, this
5072 information is also necessary so that a resetting item can be
5073 compiled at the end of a group (if we are in a group). */
5075 *optionsptr = options = newoptions;
5076 previous = NULL; /* This item can't be repeated */
5077 continue; /* It is complete */
5080 /* If the options ended with ':' we are heading into a nested group
5081 with possible change of options. Such groups are non-capturing and are
5082 not assertions of any kind. All we need to do is skip over the ':';
5083 the newoptions value is handled below. */
5085 bravalue = OP_BRA;
5086 ptr++;
5087 } /* End of switch for character following (? */
5088 } /* End of (? handling */
5090 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
5091 all unadorned brackets become non-capturing and behave like (?:...)
5092 brackets. */
5094 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
5096 bravalue = OP_BRA;
5099 /* Else we have a capturing group. */
5101 else
5103 NUMBERED_GROUP:
5104 cd->bracount += 1;
5105 PUT2(code, 1+LINK_SIZE, cd->bracount);
5106 skipbytes = 2;
5109 /* Process nested bracketed regex. Assertions may not be repeated, but
5110 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
5111 non-register variable in order to be able to pass its address because some
5112 compilers complain otherwise. Pass in a new setting for the ims options if
5113 they have changed. */
5115 previous = (bravalue >= OP_ONCE)? code : NULL;
5116 *code = bravalue;
5117 tempcode = code;
5118 tempreqvary = cd->req_varyopt; /* Save value before bracket */
5119 length_prevgroup = 0; /* Initialize for pre-compile phase */
5121 if (!compile_regex(
5122 newoptions, /* The complete new option state */
5123 options & PCRE_IMS, /* The previous ims option state */
5124 &tempcode, /* Where to put code (updated) */
5125 &ptr, /* Input pointer (updated) */
5126 errorcodeptr, /* Where to put an error message */
5127 (bravalue == OP_ASSERTBACK ||
5128 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5129 reset_bracount, /* True if (?| group */
5130 skipbytes, /* Skip over bracket number */
5131 &subfirstbyte, /* For possible first char */
5132 &subreqbyte, /* For possible last char */
5133 bcptr, /* Current branch chain */
5134 cd, /* Tables block */
5135 (lengthptr == NULL)? NULL : /* Actual compile phase */
5136 &length_prevgroup /* Pre-compile phase */
5138 goto FAILED;
5140 /* At the end of compiling, code is still pointing to the start of the
5141 group, while tempcode has been updated to point past the end of the group
5142 and any option resetting that may follow it. The pattern pointer (ptr)
5143 is on the bracket. */
5145 /* If this is a conditional bracket, check that there are no more than
5146 two branches in the group, or just one if it's a DEFINE group. We do this
5147 in the real compile phase, not in the pre-pass, where the whole group may
5148 not be available. */
5150 if (bravalue == OP_COND && lengthptr == NULL)
5152 uschar *tc = code;
5153 int condcount = 0;
5155 do {
5156 condcount++;
5157 tc += GET(tc,1);
5159 while (*tc != OP_KET);
5161 /* A DEFINE group is never obeyed inline (the "condition" is always
5162 false). It must have only one branch. */
5164 if (code[LINK_SIZE+1] == OP_DEF)
5166 if (condcount > 1)
5168 *errorcodeptr = ERR54;
5169 goto FAILED;
5171 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
5174 /* A "normal" conditional group. If there is just one branch, we must not
5175 make use of its firstbyte or reqbyte, because this is equivalent to an
5176 empty second branch. */
5178 else
5180 if (condcount > 2)
5182 *errorcodeptr = ERR27;
5183 goto FAILED;
5185 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5189 /* Error if hit end of pattern */
5191 if (*ptr != ')')
5193 *errorcodeptr = ERR14;
5194 goto FAILED;
5197 /* In the pre-compile phase, update the length by the length of the group,
5198 less the brackets at either end. Then reduce the compiled code to just a
5199 set of non-capturing brackets so that it doesn't use much memory if it is
5200 duplicated by a quantifier.*/
5202 if (lengthptr != NULL)
5204 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5206 *errorcodeptr = ERR20;
5207 goto FAILED;
5209 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5210 *code++ = OP_BRA;
5211 PUTINC(code, 0, 1 + LINK_SIZE);
5212 *code++ = OP_KET;
5213 PUTINC(code, 0, 1 + LINK_SIZE);
5214 break; /* No need to waste time with special character handling */
5217 /* Otherwise update the main code pointer to the end of the group. */
5219 code = tempcode;
5221 /* For a DEFINE group, required and first character settings are not
5222 relevant. */
5224 if (bravalue == OP_DEF) break;
5226 /* Handle updating of the required and first characters for other types of
5227 group. Update for normal brackets of all kinds, and conditions with two
5228 branches (see code above). If the bracket is followed by a quantifier with
5229 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5230 zerofirstbyte outside the main loop so that they can be accessed for the
5231 back off. */
5233 zeroreqbyte = reqbyte;
5234 zerofirstbyte = firstbyte;
5235 groupsetfirstbyte = FALSE;
5237 if (bravalue >= OP_ONCE)
5239 /* If we have not yet set a firstbyte in this branch, take it from the
5240 subpattern, remembering that it was set here so that a repeat of more
5241 than one can replicate it as reqbyte if necessary. If the subpattern has
5242 no firstbyte, set "none" for the whole branch. In both cases, a zero
5243 repeat forces firstbyte to "none". */
5245 if (firstbyte == REQ_UNSET)
5247 if (subfirstbyte >= 0)
5249 firstbyte = subfirstbyte;
5250 groupsetfirstbyte = TRUE;
5252 else firstbyte = REQ_NONE;
5253 zerofirstbyte = REQ_NONE;
5256 /* If firstbyte was previously set, convert the subpattern's firstbyte
5257 into reqbyte if there wasn't one, using the vary flag that was in
5258 existence beforehand. */
5260 else if (subfirstbyte >= 0 && subreqbyte < 0)
5261 subreqbyte = subfirstbyte | tempreqvary;
5263 /* If the subpattern set a required byte (or set a first byte that isn't
5264 really the first byte - see above), set it. */
5266 if (subreqbyte >= 0) reqbyte = subreqbyte;
5269 /* For a forward assertion, we take the reqbyte, if set. This can be
5270 helpful if the pattern that follows the assertion doesn't set a different
5271 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5272 for an assertion, however because it leads to incorrect effect for patterns
5273 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5274 of a firstbyte. This is overcome by a scan at the end if there's no
5275 firstbyte, looking for an asserted first char. */
5277 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5278 break; /* End of processing '(' */
5281 /* ===================================================================*/
5282 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5283 are arranged to be the negation of the corresponding OP_values. For the
5284 back references, the values are ESC_REF plus the reference number. Only
5285 back references and those types that consume a character may be repeated.
5286 We can test for values between ESC_b and ESC_Z for the latter; this may
5287 have to change if any new ones are ever created. */
5289 case '\\':
5290 tempptr = ptr;
5291 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5292 if (*errorcodeptr != 0) goto FAILED;
5294 if (c < 0)
5296 if (-c == ESC_Q) /* Handle start of quoted string */
5298 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5299 else inescq = TRUE;
5300 continue;
5303 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5305 /* For metasequences that actually match a character, we disable the
5306 setting of a first character if it hasn't already been set. */
5308 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5309 firstbyte = REQ_NONE;
5311 /* Set values to reset to if this is followed by a zero repeat. */
5313 zerofirstbyte = firstbyte;
5314 zeroreqbyte = reqbyte;
5316 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5317 We also support \k{name} (.NET syntax) */
5319 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
5321 is_recurse = FALSE;
5322 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
5323 goto NAMED_REF_OR_RECURSE;
5326 /* Back references are handled specially; must disable firstbyte if
5327 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5328 ':' later. */
5330 if (-c >= ESC_REF)
5332 recno = -c - ESC_REF;
5334 HANDLE_REFERENCE: /* Come here from named backref handling */
5335 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5336 previous = code;
5337 *code++ = OP_REF;
5338 PUT2INC(code, 0, recno);
5339 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5340 if (recno > cd->top_backref) cd->top_backref = recno;
5343 /* So are Unicode property matches, if supported. */
5345 #ifdef SUPPORT_UCP
5346 else if (-c == ESC_P || -c == ESC_p)
5348 BOOL negated;
5349 int pdata;
5350 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5351 if (ptype < 0) goto FAILED;
5352 previous = code;
5353 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5354 *code++ = ptype;
5355 *code++ = pdata;
5357 #else
5359 /* If Unicode properties are not supported, \X, \P, and \p are not
5360 allowed. */
5362 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5364 *errorcodeptr = ERR45;
5365 goto FAILED;
5367 #endif
5369 /* For the rest (including \X when Unicode properties are supported), we
5370 can obtain the OP value by negating the escape value. */
5372 else
5374 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5375 *code++ = -c;
5377 continue;
5380 /* We have a data character whose value is in c. In UTF-8 mode it may have
5381 a value > 127. We set its representation in the length/buffer, and then
5382 handle it as a data character. */
5384 #ifdef SUPPORT_UTF8
5385 if (utf8 && c > 127)
5386 mclength = _pcre_ord2utf8(c, mcbuffer);
5387 else
5388 #endif
5391 mcbuffer[0] = c;
5392 mclength = 1;
5394 goto ONE_CHAR;
5397 /* ===================================================================*/
5398 /* Handle a literal character. It is guaranteed not to be whitespace or #
5399 when the extended flag is set. If we are in UTF-8 mode, it may be a
5400 multi-byte literal character. */
5402 default:
5403 NORMAL_CHAR:
5404 mclength = 1;
5405 mcbuffer[0] = c;
5407 #ifdef SUPPORT_UTF8
5408 if (utf8 && c >= 0xc0)
5410 while ((ptr[1] & 0xc0) == 0x80)
5411 mcbuffer[mclength++] = *(++ptr);
5413 #endif
5415 /* At this point we have the character's bytes in mcbuffer, and the length
5416 in mclength. When not in UTF-8 mode, the length is always 1. */
5418 ONE_CHAR:
5419 previous = code;
5420 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5421 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5423 /* Remember if \r or \n were seen */
5425 if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5426 cd->external_options |= PCRE_HASCRORLF;
5428 /* Set the first and required bytes appropriately. If no previous first
5429 byte, set it from this character, but revert to none on a zero repeat.
5430 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5431 repeat. */
5433 if (firstbyte == REQ_UNSET)
5435 zerofirstbyte = REQ_NONE;
5436 zeroreqbyte = reqbyte;
5438 /* If the character is more than one byte long, we can set firstbyte
5439 only if it is not to be matched caselessly. */
5441 if (mclength == 1 || req_caseopt == 0)
5443 firstbyte = mcbuffer[0] | req_caseopt;
5444 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5446 else firstbyte = reqbyte = REQ_NONE;
5449 /* firstbyte was previously set; we can set reqbyte only the length is
5450 1 or the matching is caseful. */
5452 else
5454 zerofirstbyte = firstbyte;
5455 zeroreqbyte = reqbyte;
5456 if (mclength == 1 || req_caseopt == 0)
5457 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5460 break; /* End of literal character handling */
5462 } /* end of big loop */
5465 /* Control never reaches here by falling through, only by a goto for all the
5466 error states. Pass back the position in the pattern so that it can be displayed
5467 to the user for diagnosing the error. */
5469 FAILED:
5470 *ptrptr = ptr;
5471 return FALSE;
5477 /*************************************************
5478 * Compile sequence of alternatives *
5479 *************************************************/
5481 /* On entry, ptr is pointing past the bracket character, but on return it
5482 points to the closing bracket, or vertical bar, or end of string. The code
5483 variable is pointing at the byte into which the BRA operator has been stored.
5484 If the ims options are changed at the start (for a (?ims: group) or during any
5485 branch, we need to insert an OP_OPT item at the start of every following branch
5486 to ensure they get set correctly at run time, and also pass the new options
5487 into every subsequent branch compile.
5489 This function is used during the pre-compile phase when we are trying to find
5490 out the amount of memory needed, as well as during the real compile phase. The
5491 value of lengthptr distinguishes the two phases.
5493 Arguments:
5494 options option bits, including any changes for this subpattern
5495 oldims previous settings of ims option bits
5496 codeptr -> the address of the current code pointer
5497 ptrptr -> the address of the current pattern pointer
5498 errorcodeptr -> pointer to error code variable
5499 lookbehind TRUE if this is a lookbehind assertion
5500 reset_bracount TRUE to reset the count for each branch
5501 skipbytes skip this many bytes at start (for brackets and OP_COND)
5502 firstbyteptr place to put the first required character, or a negative number
5503 reqbyteptr place to put the last required character, or a negative number
5504 bcptr pointer to the chain of currently open branches
5505 cd points to the data block with tables pointers etc.
5506 lengthptr NULL during the real compile phase
5507 points to length accumulator during pre-compile phase
5509 Returns: TRUE on success
5512 static BOOL
5513 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5514 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5515 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5516 int *lengthptr)
5518 const uschar *ptr = *ptrptr;
5519 uschar *code = *codeptr;
5520 uschar *last_branch = code;
5521 uschar *start_bracket = code;
5522 uschar *reverse_count = NULL;
5523 int firstbyte, reqbyte;
5524 int branchfirstbyte, branchreqbyte;
5525 int length;
5526 int orig_bracount;
5527 int max_bracount;
5528 branch_chain bc;
5530 avmplus::AvmCore::checkPCREStackOverflow();
5532 bc.outer = bcptr;
5533 bc.current = code;
5535 firstbyte = reqbyte = REQ_UNSET;
5537 /* Accumulate the length for use in the pre-compile phase. Start with the
5538 length of the BRA and KET and any extra bytes that are required at the
5539 beginning. We accumulate in a local variable to save frequent testing of
5540 lenthptr for NULL. We cannot do this by looking at the value of code at the
5541 start and end of each alternative, because compiled items are discarded during
5542 the pre-compile phase so that the work space is not exceeded. */
5544 length = 2 + 2*LINK_SIZE + skipbytes;
5546 /* WARNING: If the above line is changed for any reason, you must also change
5547 the code that abstracts option settings at the start of the pattern and makes
5548 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5549 pre-compile phase to find out whether anything has yet been compiled or not. */
5551 /* Offset is set zero to mark that this bracket is still open */
5553 PUT(code, 1, 0);
5554 code += 1 + LINK_SIZE + skipbytes;
5556 /* Loop for each alternative branch */
5558 orig_bracount = max_bracount = cd->bracount;
5559 for (;;)
5561 /* For a (?| group, reset the capturing bracket count so that each branch
5562 uses the same numbers. */
5564 if (reset_bracount) cd->bracount = orig_bracount;
5566 /* Handle a change of ims options at the start of the branch */
5568 if ((options & PCRE_IMS) != oldims)
5570 *code++ = OP_OPT;
5571 *code++ = options & PCRE_IMS;
5572 length += 2;
5575 /* Set up dummy OP_REVERSE if lookbehind assertion */
5577 if (lookbehind)
5579 *code++ = OP_REVERSE;
5580 reverse_count = code;
5581 PUTINC(code, 0, 0);
5582 length += 1 + LINK_SIZE;
5585 /* Now compile the branch; in the pre-compile phase its length gets added
5586 into the length. */
5588 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5589 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5591 *ptrptr = ptr;
5592 return FALSE;
5595 /* Keep the highest bracket count in case (?| was used and some branch
5596 has fewer than the rest. */
5598 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5600 /* In the real compile phase, there is some post-processing to be done. */
5602 if (lengthptr == NULL)
5604 /* If this is the first branch, the firstbyte and reqbyte values for the
5605 branch become the values for the regex. */
5607 if (*last_branch != OP_ALT)
5609 firstbyte = branchfirstbyte;
5610 reqbyte = branchreqbyte;
5613 /* If this is not the first branch, the first char and reqbyte have to
5614 match the values from all the previous branches, except that if the
5615 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5616 and we set REQ_VARY for the regex. */
5618 else
5620 /* If we previously had a firstbyte, but it doesn't match the new branch,
5621 we have to abandon the firstbyte for the regex, but if there was
5622 previously no reqbyte, it takes on the value of the old firstbyte. */
5624 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5626 if (reqbyte < 0) reqbyte = firstbyte;
5627 firstbyte = REQ_NONE;
5630 /* If we (now or from before) have no firstbyte, a firstbyte from the
5631 branch becomes a reqbyte if there isn't a branch reqbyte. */
5633 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5634 branchreqbyte = branchfirstbyte;
5636 /* Now ensure that the reqbytes match */
5638 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5639 reqbyte = REQ_NONE;
5640 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5643 /* If lookbehind, check that this branch matches a fixed-length string, and
5644 put the length into the OP_REVERSE item. Temporarily mark the end of the
5645 branch with OP_END. */
5647 if (lookbehind)
5649 int fixed_length;
5650 *code = OP_END;
5651 fixed_length = find_fixedlength(last_branch, options);
5652 DPRINTF(("fixed length = %d\n", fixed_length));
5653 if (fixed_length < 0)
5655 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5656 *ptrptr = ptr;
5657 return FALSE;
5659 PUT(reverse_count, 0, fixed_length);
5663 /* Reached end of expression, either ')' or end of pattern. In the real
5664 compile phase, go back through the alternative branches and reverse the chain
5665 of offsets, with the field in the BRA item now becoming an offset to the
5666 first alternative. If there are no alternatives, it points to the end of the
5667 group. The length in the terminating ket is always the length of the whole
5668 bracketed item. If any of the ims options were changed inside the group,
5669 compile a resetting op-code following, except at the very end of the pattern.
5670 Return leaving the pointer at the terminating char. */
5672 if (*ptr != '|')
5674 if (lengthptr == NULL)
5676 int branch_length = code - last_branch;
5679 int prev_length = GET(last_branch, 1);
5680 PUT(last_branch, 1, branch_length);
5681 branch_length = prev_length;
5682 last_branch -= branch_length;
5684 while (branch_length > 0);
5687 /* Fill in the ket */
5689 *code = OP_KET;
5690 PUT(code, 1, code - start_bracket);
5691 code += 1 + LINK_SIZE;
5693 /* Resetting option if needed */
5695 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5697 *code++ = OP_OPT;
5698 *code++ = oldims;
5699 length += 2;
5702 /* Retain the highest bracket number, in case resetting was used. */
5704 cd->bracount = max_bracount;
5706 /* Set values to pass back */
5708 *codeptr = code;
5709 *ptrptr = ptr;
5710 *firstbyteptr = firstbyte;
5711 *reqbyteptr = reqbyte;
5712 if (lengthptr != NULL)
5714 if (OFLOW_MAX - *lengthptr < length)
5716 *errorcodeptr = ERR20;
5717 return FALSE;
5719 *lengthptr += length;
5721 return TRUE;
5724 /* Another branch follows. In the pre-compile phase, we can move the code
5725 pointer back to where it was for the start of the first branch. (That is,
5726 pretend that each branch is the only one.)
5728 In the real compile phase, insert an ALT node. Its length field points back
5729 to the previous branch while the bracket remains open. At the end the chain
5730 is reversed. It's done like this so that the start of the bracket has a
5731 zero offset until it is closed, making it possible to detect recursion. */
5733 if (lengthptr != NULL)
5735 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5736 length += 1 + LINK_SIZE;
5738 else
5740 *code = OP_ALT;
5741 PUT(code, 1, code - last_branch);
5742 bc.current = last_branch = code;
5743 code += 1 + LINK_SIZE;
5746 ptr++;
5748 /* Control never reaches here */
5753 /************************************************************************
5754 * Stack implementation used in is_anchored and is_startline functions. *
5755 ************************************************************************/
5757 typedef struct pi_ia_is {
5758 const uschar *code;
5759 unsigned int bracket_map;
5760 } pi_ia_is;
5763 /*************************************************
5764 * Check for anchored expression *
5765 *************************************************/
5767 /* Try to find out if this is an anchored regular expression. Consider each
5768 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5769 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5770 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5771 counts, since OP_CIRC can match in the middle.
5773 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5774 This is the code for \G, which means "match at start of match position, taking
5775 into account the match offset".
5777 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5778 because that will try the rest of the pattern at all possible matching points,
5779 so there is no point trying again.... er ....
5781 .... except when the .* appears inside capturing parentheses, and there is a
5782 subsequent back reference to those parentheses. We haven't enough information
5783 to catch that case precisely.
5785 At first, the best we could do was to detect when .* was in capturing brackets
5786 and the highest back reference was greater than or equal to that level.
5787 However, by keeping a bitmap of the first 31 back references, we can catch some
5788 of the more common cases more precisely.
5790 Arguments:
5791 code points to start of expression (the bracket)
5792 options points to the options setting
5793 bracket_map a bitmap of which brackets we are inside while testing; this
5794 handles up to substring 31; after that we just have to take
5795 the less precise approach
5796 backref_map the back reference bitmap
5798 Returns: TRUE or FALSE
5801 static BOOL
5802 is_anchored(register const uschar * const code, int * const options, const unsigned int bracket_map,
5803 const unsigned int backref_map)
5805 TypedStack<pi_ia_is> work_stack;
5806 pi_ia_is work_item;
5808 /* set up the first work item */
5809 work_item.code = code;
5810 work_item.bracket_map = bracket_map;
5811 work_stack.push(work_item);
5813 /* start the processing loop */
5814 while (!work_stack.isEmpty())
5816 work_stack.pop(work_item);
5817 const uschar *wi_code = work_item.code;
5818 unsigned int wi_bracket_map = work_item.bracket_map;
5819 do {
5820 const uschar *scode = first_significant_code(
5821 wi_code + _pcre_OP_lengths[*wi_code], options, PCRE_MULTILINE, FALSE);
5822 register int op = *scode;
5824 /* Non-capturing brackets */
5826 if (op == OP_BRA)
5828 pi_ia_is new_item;
5829 new_item.code = scode;
5830 new_item.bracket_map = wi_bracket_map;
5831 work_stack.push(new_item);
5834 /* Capturing brackets */
5836 else if (op == OP_CBRA)
5838 pi_ia_is new_item;
5839 new_item.code = scode;
5840 int n = GET2(scode, 1+LINK_SIZE);
5841 int new_map = wi_bracket_map | ((n < 32)? (1 << n) : 1);
5842 new_item.bracket_map = new_map;
5843 work_stack.push(new_item);
5846 /* Other brackets */
5848 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5850 pi_ia_is new_item;
5851 new_item.code = scode;
5852 new_item.bracket_map = wi_bracket_map;
5853 work_stack.push(new_item);
5856 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5857 are or may be referenced. */
5859 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5860 op == OP_TYPEPOSSTAR) && (*options & PCRE_DOTALL) != 0)
5862 if (scode[1] != OP_ANY || (wi_bracket_map & backref_map) != 0)
5864 return FALSE;
5868 /* Check for explicit anchoring */
5870 else if (op != OP_SOD && op != OP_SOM &&
5871 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5873 return FALSE;
5876 wi_code += GET(wi_code, 1);
5877 } while (*wi_code == OP_ALT); /* Loop for each alternative */
5878 } //while
5879 AvmAssert(work_stack.isEmpty());
5880 return TRUE;
5886 /*************************************************
5887 * Check for starting with ^ or .* *
5888 *************************************************/
5890 /* This is called to find out if every branch starts with ^ or .* so that
5891 "first char" processing can be done to speed things up in multiline
5892 matching and for non-DOTALL patterns that start with .* (which must start at
5893 the beginning or after \n). As in the case of is_anchored() (see above), we
5894 have to take account of back references to capturing brackets that contain .*
5895 because in that case we can't make the assumption.
5897 Arguments:
5898 code points to start of expression (the bracket)
5899 bracket_map a bitmap of which brackets we are inside while testing; this
5900 handles up to substring 31; after that we just have to take
5901 the less precise approach
5902 backref_map the back reference bitmap
5904 Returns: TRUE or FALSE
5907 static BOOL
5908 is_startline(const uschar * const code, const unsigned int bracket_map,
5909 const unsigned int backref_map)
5911 TypedStack<pi_ia_is> work_stack;
5912 pi_ia_is work_item;
5914 /* set up the first work item */
5915 work_item.code = code;
5916 work_item.bracket_map = bracket_map;
5917 work_stack.push(work_item);
5919 /* start the processing loop */
5920 while (!work_stack.isEmpty())
5922 work_stack.pop(work_item);
5923 const uschar *wi_code = work_item.code;
5924 unsigned int wi_bracket_map = work_item.bracket_map;
5925 do {
5926 const uschar *scode = first_significant_code(
5927 wi_code + _pcre_OP_lengths[*wi_code], NULL, 0, FALSE);
5928 register int op = *scode;
5930 /* Non-capturing brackets */
5932 if (op == OP_BRA)
5934 pi_ia_is new_item;
5935 new_item.code = scode;
5936 new_item.bracket_map = wi_bracket_map;
5937 work_stack.push(new_item);
5940 /* Capturing brackets */
5942 else if (op == OP_CBRA)
5944 pi_ia_is new_item;
5945 new_item.code = scode;
5946 int n = GET2(scode, 1+LINK_SIZE);
5947 int new_map = wi_bracket_map | ((n < 32)? (1 << n) : 1);
5948 new_item.bracket_map = new_map;
5949 work_stack.push(new_item);
5952 /* Other brackets */
5954 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5956 pi_ia_is new_item;
5957 new_item.code = scode;
5958 new_item.bracket_map = wi_bracket_map;
5959 work_stack.push(new_item);
5962 /* .* means "start at start or after \n" if it isn't in brackets that
5963 may be referenced. */
5965 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5967 if (scode[1] != OP_ANY || (wi_bracket_map & backref_map) != 0)
5969 return FALSE;
5973 /* Check for explicit circumflex */
5975 else if (op != OP_CIRC)
5977 return FALSE;
5980 /* Move on to the next alternative */
5982 wi_code += GET(wi_code, 1);
5983 } while (*wi_code == OP_ALT); /* Loop for each alternative */
5984 } //while
5985 AvmAssert(work_stack.isEmpty());
5986 return TRUE;
5991 /*************************************************
5992 * Check for asserted fixed first char *
5993 *************************************************/
5995 /* During compilation, the "first char" settings from forward assertions are
5996 discarded, because they can cause conflicts with actual literals that follow.
5997 However, if we end up without a first char setting for an unanchored pattern,
5998 it is worth scanning the regex to see if there is an initial asserted first
5999 char. If all branches start with the same asserted char, or with a bracket all
6000 of whose alternatives start with the same asserted char (recurse ad lib), then
6001 we return that char, otherwise -1.
6003 Arguments:
6004 code points to start of expression (the bracket)
6005 options pointer to the options (used to check casing changes)
6006 inassert TRUE if in an assertion
6008 Returns: -1 or the fixed first char
6011 typedef struct pi_ffac {
6012 const uschar* code;
6013 } pi_ffac;
6015 static int
6016 find_firstassertedchar(const uschar * const code, int * const options)
6018 register int c = -1;
6019 BOOL inassert = FALSE;
6020 BOOL caseless = (*options & PCRE_CASELESS) != 0;
6021 TypedStack<pi_ffac> work_stack;
6022 pi_ffac work_item;
6024 /* set up the first work item */
6025 work_item.code = code;
6026 work_stack.push(work_item);
6028 /* start the processing loop */
6029 while (!work_stack.isEmpty())
6031 BOOL exit = FALSE;
6032 work_stack.pop(work_item);
6033 const uschar *wi_code = work_item.code;
6034 do {
6035 const uschar *scode =
6036 first_significant_code(wi_code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
6037 register int op = *scode;
6039 switch(op)
6041 default:
6042 return -1;
6044 case OP_BRA:
6045 case OP_CBRA:
6046 case OP_ASSERT:
6047 case OP_ONCE:
6048 case OP_COND:
6049 inassert = (op == OP_ASSERT);
6050 pi_ffac new_item;
6051 new_item.code = scode;
6052 work_stack.push(new_item);
6053 exit = TRUE;
6054 break;
6056 case OP_EXACT: /* Fall through */
6057 scode += 2;
6059 case OP_CHAR:
6060 case OP_CHARNC:
6061 case OP_PLUS:
6062 case OP_MINPLUS:
6063 case OP_POSPLUS:
6064 if (!inassert)
6066 return -1;
6068 if (c < 0)
6070 c = scode[1];
6071 if (caseless) c |= REQ_CASELESS;
6073 else if ((!caseless && c != scode[1]) ||
6074 (caseless && c != (scode[1] | REQ_CASELESS)))
6076 return -1;
6078 break;
6080 if(exit) break;
6081 wi_code += GET(wi_code, 1);
6082 if(*wi_code == OP_KET) wi_code += GET(work_item.code, 1);
6084 while (*wi_code == OP_ALT);
6085 } //while
6086 return c;
6090 /*************************************************
6091 * Compile a Regular Expression *
6092 *************************************************/
6094 /* This function takes a string and returns a pointer to a block of store
6095 holding a compiled version of the expression. The original API for this
6096 function had no error code return variable; it is retained for backwards
6097 compatibility. The new function is given a new name.
6099 Arguments:
6100 pattern the regular expression
6101 options various option bits
6102 errorcodeptr pointer to error code variable (pcre_compile2() only)
6103 can be NULL if you don't want a code value
6104 errorptr pointer to pointer to error text
6105 erroroffset ptr offset in pattern where error was detected
6106 tables pointer to character tables or NULL
6108 Returns: pointer to compiled data block, or NULL on error,
6109 with errorptr and erroroffset set
6112 PCRE_EXP_DEFN pcre *
6113 pcre_compile(const char *pattern, int options, const char **errorptr,
6114 int *erroroffset, const unsigned char *tables)
6116 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
6120 PCRE_EXP_DEFN pcre *
6121 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
6122 const char **errorptr, int *erroroffset, const unsigned char *tables)
6124 real_pcre *re;
6125 int length = 1; /* For final END opcode */
6126 int firstbyte, reqbyte, newline;
6127 int errorcode = 0;
6128 int skipatstart = 0;
6129 #ifdef SUPPORT_UTF8
6130 BOOL utf8;
6131 #endif
6132 size_t size;
6133 uschar *code;
6134 const uschar *codestart;
6135 const uschar *ptr;
6136 compile_data compile_block;
6137 compile_data *cd = &compile_block;
6139 /* This space is used for "compiling" into during the first phase, when we are
6140 computing the amount of memory that is needed. Compiled items are thrown away
6141 as soon as possible, so that a fairly large buffer should be sufficient for
6142 this purpose. The same space is used in the second phase for remembering where
6143 to fill in forward references to subpatterns. */
6145 uschar cworkspace[COMPILE_WORK_SIZE];
6148 /* Set this early so that early errors get offset 0. */
6150 ptr = (const uschar *)pattern;
6152 /* We can't pass back an error message if errorptr is NULL; I guess the best we
6153 can do is just return NULL, but we can set a code value if there is a code
6154 pointer. */
6156 if (errorptr == NULL)
6158 if (errorcodeptr != NULL) *errorcodeptr = 99;
6159 return NULL;
6162 *errorptr = NULL;
6163 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
6165 /* However, we can give a message for this error */
6167 if (erroroffset == NULL)
6169 errorcode = ERR16;
6170 goto PCRE_EARLY_ERROR_RETURN2;
6173 *erroroffset = 0;
6175 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
6177 #ifdef SUPPORT_UTF8
6178 utf8 = (options & PCRE_UTF8) != 0;
6179 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6180 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
6182 errorcode = ERR44;
6183 goto PCRE_EARLY_ERROR_RETURN2;
6185 #else
6186 if ((options & PCRE_UTF8) != 0)
6188 errorcode = ERR32;
6189 goto PCRE_EARLY_ERROR_RETURN;
6191 #endif
6193 if ((options & ~PUBLIC_OPTIONS) != 0)
6195 errorcode = ERR17;
6196 goto PCRE_EARLY_ERROR_RETURN;
6199 /* Set up pointers to the individual character tables */
6201 if (tables == NULL) tables = _pcre_default_tables;
6202 cd->lcc = tables + lcc_offset;
6203 cd->fcc = tables + fcc_offset;
6204 cd->cbits = tables + cbits_offset;
6205 cd->ctypes = tables + ctypes_offset;
6207 /* Check for newline settings at the start of the pattern, and remember the
6208 offset for later. */
6210 if (ptr[0] == '(' && ptr[1] == '*')
6212 int newnl = 0;
6213 if (VMPI_strncmp((char *)(ptr+2), "CR)", 3) == 0)
6214 { skipatstart = 5; newnl = PCRE_NEWLINE_CR; }
6215 else if (VMPI_strncmp((char *)(ptr+2), "LF)", 3) == 0)
6216 { skipatstart = 5; newnl = PCRE_NEWLINE_LF; }
6217 else if (VMPI_strncmp((char *)(ptr+2), "CRLF)", 5) == 0)
6218 { skipatstart = 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6219 else if (VMPI_strncmp((char *)(ptr+2), "ANY)", 4) == 0)
6220 { skipatstart = 6; newnl = PCRE_NEWLINE_ANY; }
6221 else if (VMPI_strncmp((char *)(ptr+2), "ANYCRLF)", 8) == 0)
6222 { skipatstart = 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6223 if (skipatstart > 0)
6224 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
6227 /* Handle different types of newline. The three bits give seven cases. The
6228 current code allows for fixed one- or two-byte sequences, plus "any" and
6229 "anycrlf". */
6231 switch (options & PCRE_NEWLINE_BITS)
6233 case 0: newline = NEWLINE; break; /* Build-time default */
6234 case PCRE_NEWLINE_CR: newline = '\r'; break;
6235 case PCRE_NEWLINE_LF: newline = '\n'; break;
6236 case PCRE_NEWLINE_CR+
6237 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
6238 case PCRE_NEWLINE_ANY: newline = -1; break;
6239 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6240 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6243 if (newline == -2)
6245 cd->nltype = NLTYPE_ANYCRLF;
6247 else if (newline < 0)
6249 cd->nltype = NLTYPE_ANY;
6251 else
6253 cd->nltype = NLTYPE_FIXED;
6254 if (newline > 255)
6256 cd->nllen = 2;
6257 cd->nl[0] = (newline >> 8) & 255;
6258 cd->nl[1] = newline & 255;
6260 else
6262 cd->nllen = 1;
6263 cd->nl[0] = newline;
6267 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
6268 references to help in deciding whether (.*) can be treated as anchored or not.
6271 cd->top_backref = 0;
6272 cd->backref_map = 0;
6274 /* Reflect pattern for debugging output */
6276 DPRINTF(("------------------------------------------------------------------\n"));
6277 DPRINTF(("%s\n", pattern));
6279 /* Pretend to compile the pattern while actually just accumulating the length
6280 of memory required. This behaviour is triggered by passing a non-NULL final
6281 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
6282 to compile parts of the pattern into; the compiled code is discarded when it is
6283 no longer needed, so hopefully this workspace will never overflow, though there
6284 is a test for its doing so. */
6286 cd->bracount = 0;
6287 cd->names_found = 0;
6288 cd->name_entry_size = 0;
6289 cd->name_table = NULL;
6290 cd->start_workspace = cworkspace;
6291 cd->start_code = cworkspace;
6292 cd->hwm = cworkspace;
6293 cd->start_pattern = (const uschar *)pattern;
6294 cd->end_pattern = (const uschar *)(pattern + VMPI_strlen(pattern));
6295 cd->req_varyopt = 0;
6296 cd->nopartial = FALSE;
6297 cd->external_options = options;
6299 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6300 don't need to look at the result of the function here. The initial options have
6301 been put into the cd block so that they can be changed if an option setting is
6302 found within the regex right at the beginning. Bringing initial option settings
6303 outside can help speed up starting point checks. */
6305 ptr += skipatstart;
6306 code = cworkspace;
6307 *code = OP_BRA;
6308 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6309 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6310 &length);
6311 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6313 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6314 cd->hwm - cworkspace));
6316 if (length > MAX_PATTERN_SIZE)
6318 errorcode = ERR20;
6319 goto PCRE_EARLY_ERROR_RETURN;
6322 /* Compute the size of data block needed and get it, either from malloc or
6323 externally provided function. Integer overflow should no longer be possible
6324 because nowadays we limit the maximum value of cd->names_found and
6325 cd->name_entry_size. */
6327 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6328 re = (real_pcre *)(pcre_malloc)(size);
6330 if (re == NULL)
6332 errorcode = ERR21;
6333 goto PCRE_EARLY_ERROR_RETURN;
6336 /* Put in the magic number, and save the sizes, initial options, and character
6337 table pointer. NULL is used for the default character tables. The nullpad field
6338 is at the end; it's there to help in the case when a regex compiled on a system
6339 with 4-byte pointers is run on another with 8-byte pointers. */
6341 re->magic_number = MAGIC_NUMBER;
6342 re->size = (pcre_uint32)size;
6343 re->options = cd->external_options;
6344 re->dummy1 = 0;
6345 re->first_byte = 0;
6346 re->req_byte = 0;
6347 re->name_table_offset = sizeof(real_pcre);
6348 re->name_entry_size = cd->name_entry_size;
6349 re->name_count = cd->names_found;
6350 re->ref_count = 0;
6351 re->tables = (tables == _pcre_default_tables)? NULL : tables;
6352 re->nullpad = NULL;
6354 /* The starting points of the name/number translation table and of the code are
6355 passed around in the compile data block. The start/end pattern and initial
6356 options are already set from the pre-compile phase, as is the name_entry_size
6357 field. Reset the bracket count and the names_found field. Also reset the hwm
6358 field; this time it's used for remembering forward references to subpatterns.
6361 cd->bracount = 0;
6362 cd->names_found = 0;
6363 cd->name_table = (uschar *)re + re->name_table_offset;
6364 codestart = cd->name_table + re->name_entry_size * re->name_count;
6365 cd->start_code = codestart;
6366 cd->hwm = cworkspace;
6367 cd->req_varyopt = 0;
6368 cd->nopartial = FALSE;
6369 cd->had_accept = FALSE;
6371 /* Set up a starting, non-extracting bracket, then compile the expression. On
6372 error, errorcode will be set non-zero, so we don't need to look at the result
6373 of the function here. */
6375 ptr = (const uschar *)pattern + skipatstart;
6376 code = (uschar *)codestart;
6377 *code = OP_BRA;
6378 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6379 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6380 re->top_bracket = cd->bracount;
6381 re->top_backref = cd->top_backref;
6383 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
6384 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
6386 /* If not reached end of pattern on success, there's an excess bracket. */
6388 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6390 /* Fill in the terminating state and check for disastrous overflow, but
6391 if debugging, leave the test till after things are printed out. */
6393 *code++ = OP_END;
6395 #ifndef PCRE_DEBUG
6396 if (code - codestart > length) errorcode = ERR23;
6397 #endif
6399 /* Fill in any forward references that are required. */
6401 while (errorcode == 0 && cd->hwm > cworkspace)
6403 int offset, recno;
6404 const uschar *groupptr;
6405 cd->hwm -= LINK_SIZE;
6406 offset = GET(cd->hwm, 0);
6407 recno = GET(codestart, offset);
6408 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6409 if (groupptr == NULL) errorcode = ERR53;
6410 else PUT(((uschar *)codestart), offset, groupptr - codestart);
6413 /* Give an error if there's back reference to a non-existent capturing
6414 subpattern. */
6416 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6418 /* Failed to compile, or error while post-processing */
6420 if (errorcode != 0)
6422 (pcre_free)(re);
6423 PCRE_EARLY_ERROR_RETURN:
6424 *erroroffset = ptr - (const uschar *)pattern;
6425 PCRE_EARLY_ERROR_RETURN2:
6426 *errorptr = error_texts[errorcode];
6427 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6428 return NULL;
6431 /* If the anchored option was not passed, set the flag if we can determine that
6432 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6433 as starting with .* when DOTALL is set).
6435 Otherwise, if we know what the first byte has to be, save it, because that
6436 speeds up unanchored matches no end. If not, see if we can set the
6437 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6438 start with ^. and also when all branches start with .* for non-DOTALL matches.
6441 if ((re->options & PCRE_ANCHORED) == 0)
6443 int temp_options = re->options; /* May get changed during these scans */
6444 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6445 re->options |= PCRE_ANCHORED;
6446 else
6448 if (firstbyte < 0)
6449 firstbyte = find_firstassertedchar(codestart, &temp_options);
6450 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
6452 int ch = firstbyte & 255;
6453 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6454 cd->fcc[ch] == ch)? ch : firstbyte;
6455 re->options |= PCRE_FIRSTSET;
6457 else if (is_startline(codestart, 0, cd->backref_map))
6458 re->options |= PCRE_STARTLINE;
6462 /* For an anchored pattern, we use the "required byte" only if it follows a
6463 variable length item in the regex. Remove the caseless flag for non-caseable
6464 bytes. */
6466 if (reqbyte >= 0 &&
6467 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6469 int ch = reqbyte & 255;
6470 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6471 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6472 re->options |= PCRE_REQCHSET;
6475 /* Print out the compiled data if debugging is enabled. This is never the
6476 case when building a production library. */
6478 #ifdef PCRE_DEBUG
6480 avmplus::AvmLog("Length = %d top_bracket = %d top_backref = %d\n",
6481 length, re->top_bracket, re->top_backref);
6483 avmplus::AvmLog("Options=%08x\n", re->options);
6485 if ((re->options & PCRE_FIRSTSET) != 0)
6487 int ch = re->first_byte & 255;
6488 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6489 "" : " (caseless)";
6490 if (VMPI_isprint(ch)) avmplus::AvmLog("First char = %c%s\n", ch, caseless);
6491 else avmplus::AvmLog("First char = \\x%02x%s\n", ch, caseless);
6494 if ((re->options & PCRE_REQCHSET) != 0)
6496 int ch = re->req_byte & 255;
6497 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6498 "" : " (caseless)";
6499 if (VMPI_isprint(ch)) avmplus::AvmLog("Req char = %c%s\n", ch, caseless);
6500 else avmplus::AvmLog("Req char = \\x%02x%s\n", ch, caseless);
6503 pcre_printint(re, stdout, TRUE);
6505 /* This check is done here in the debugging case so that the code that
6506 was compiled can be seen. */
6508 if (code - codestart > length)
6510 (pcre_free)(re);
6511 *errorptr = error_texts[ERR23];
6512 *erroroffset = ptr - (uschar *)pattern;
6513 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6514 return NULL;
6516 #endif /* DEBUG */
6518 return (pcre *)re;
6521 /* End of pcre_compile.c */