More warning fixes (SF #732742: compiling errors).
[nedit.git] / source / regularExp.c
blobdac7a21bf0c6fd4637db9911cdbb6a8a68c755b7
1 static const char CVSID[] = "$Id: regularExp.c,v 1.22 2003/05/07 10:51:52 edg Exp $";
2 /*------------------------------------------------------------------------*
3 * `CompileRE', `ExecRE', and `substituteRE' -- regular expression parsing
5 * This is a HIGHLY ALTERED VERSION of Henry Spencer's `regcomp' and
6 * `regexec' code adapted for NEdit.
8 * .-------------------------------------------------------------------.
9 * | ORIGINAL COPYRIGHT NOTICE: |
10 * | |
11 * | Copyright (c) 1986 by University of Toronto. |
12 * | Written by Henry Spencer. Not derived from licensed software. |
13 * | |
14 * | Permission is granted to anyone to use this software for any |
15 * | purpose on any computer system, and to redistribute it freely, |
16 * | subject to the following restrictions: |
17 * | |
18 * | 1. The author is not responsible for the consequences of use of |
19 * | this software, no matter how awful, even if they arise |
20 * | from defects in it. |
21 * | |
22 * | 2. The origin of this software must not be misrepresented, either |
23 * | by explicit claim or by omission. |
24 * | |
25 * | 3. Altered versions must be plainly marked as such, and must not |
26 * | be misrepresented as being the original software. |
27 * `-------------------------------------------------------------------'
29 * BEWARE that some of this code is subtly aware of the way operator
30 * precedence is structured in regular expressions. Serious changes in
31 * regular-expression syntax might require a total rethink.
32 * -- Henry Spencer
33 * (Yes, it did!) -- Christopher Conrad, Dec. 1999
35 * January, 1994, Mark Edel
36 * Consolidated files, changed names of external functions to avoid
37 * potential conflicts with native regcomp and regexec functions, changed
38 * error reporting to NEdit form, added multi-line and reverse searching,
39 * and added \n \t \u \U \l \L.
41 * June, 1996, Mark Edel
42 * Bug in NEXT macro, didn't work for expressions which compiled to over
43 * 256 bytes.
45 * December, 1999, Christopher Conrad
46 * Reformatted code for readability, improved error output, added octal and
47 * hexadecimal escapes, added back-references (\1-\9), added positive look
48 * ahead: (?=...), added negative lookahead: (?!...), added non-capturing
49 * parentheses: (?:...), added case insensitive constructs (?i...) and
50 * (?I...), added newline matching constructs (?n...) and (?N...), added
51 * regex comments: (?#...), added shortcut escapes: \d\D\l\L\s\S\w\W\y\Y.
52 * Added "not a word boundary" anchor \B.
54 * July, 2002, Eddy De Greef
55 * Added look behind, both positive (?<=...) and negative (?<!...) for
56 * bounded-length patterns.
59 #ifdef HAVE_CONFIG_H
60 #include "../config.h"
61 #endif
63 #include "regularExp.h"
65 #include <stdio.h>
66 #include <stdlib.h>
67 #include <string.h>
68 #include <ctype.h>
69 #include <limits.h>
71 #ifdef HAVE_DEBUG_H
72 #include "../debug.h"
73 #endif
76 /* The first byte of the regexp internal `program' is a magic number to help
77 gaurd against corrupted data; the compiled regex code really begins in the
78 second byte. */
80 #define MAGIC 0234
82 /* The "internal use only" fields in `regexp.h' are present to pass info from
83 * `CompileRE' to `ExecRE' which permits the execute phase to run lots faster on
84 * simple cases. They are:
86 * match_start Character that must begin a match; '\0' if none obvious.
87 * anchor Is the match anchored (at beginning-of-line only)?
89 * `match_start' and `anchor' permit very fast decisions on suitable starting
90 * points for a match, considerably reducing the work done by ExecRE. */
92 /* STRUCTURE FOR A REGULAR EXPRESSION (regex) `PROGRAM'.
94 * This is essentially a linear encoding of a nondeterministic finite-state
95 * machine or NFA (aka syntax charts or `railroad normal form' in parsing
96 * technology). Each node is an opcode plus a NEXT pointer, possibly
97 * followed by operands. NEXT pointers of all nodes except BRANCH implement
98 * concatenation; a NEXT pointer with a BRANCH on both ends of it is
99 * connecting two alternatives. (Here we have one of the subtle syntax
100 * dependencies: an individual BRANCH (as opposed to a collection of them) is
101 * never concatenated with anything because of operator precedence.) The
102 * operand of some types of nodes is a literal string; for others, it is a node
103 * leading into a sub-FSM. In particular, the operand of a BRANCH node is the
104 * first node of the branch. (NB this is _NOT_ a tree structure: the tail of
105 * the branch connects to the thing following the set of BRANCHes.)
107 * The opcodes are: */
109 /* DEFINITION VALUE MEANING */
111 #define END 1 /* End of program. */
113 /* Zero width positional assertions. */
115 #define BOL 2 /* Match position at beginning of line. */
116 #define EOL 3 /* Match position at end of line. */
117 #define BOWORD 4 /* Match "" representing word delimiter or BOL */
118 #define EOWORD 5 /* Match "" representing word delimiter or EOL */
119 #define NOT_BOUNDARY 6 /* Not word boundary (\B, opposite of < and >) */
121 /* Op codes with null terminated string operands. */
123 #define EXACTLY 7 /* Match this string. */
124 #define SIMILAR 8 /* Match this case insensitive string */
125 #define ANY_OF 9 /* Match any character in the set. */
126 #define ANY_BUT 10 /* Match any character not in the set. */
128 /* Op codes to match any character. */
130 #define ANY 11 /* Match any one character (implements '.') */
131 #define EVERY 12 /* Same as ANY but matches newline. */
133 /* Shortcut escapes, \d, \D, \l, \L, \s, \S, \w, \W, \y, \Y. */
135 #define DIGIT 13 /* Match any digit, i.e. [0123456789] */
136 #define NOT_DIGIT 14 /* Match any non-digit, i.e. [^0123456789] */
137 #define LETTER 15 /* Match any letter character [a-zA-Z] */
138 #define NOT_LETTER 16 /* Match any non-letter character [^a-zA-Z] */
139 #define SPACE 17 /* Match any whitespace character EXCEPT \n */
140 #define SPACE_NL 18 /* Match any whitespace character INCLUDING \n */
141 #define NOT_SPACE 19 /* Match any non-whitespace character */
142 #define NOT_SPACE_NL 20 /* Same as NOT_SPACE but matches newline. */
143 #define WORD_CHAR 21 /* Match any word character [a-zA-Z0-9_] */
144 #define NOT_WORD_CHAR 22 /* Match any non-word character [^a-zA-Z0-9_] */
145 #define IS_DELIM 23 /* Match any character that's a word delimiter */
146 #define NOT_DELIM 24 /* Match any character NOT a word delimiter */
148 /* Quantifier nodes. (Only applied to SIMPLE nodes. Quantifiers applied to non
149 SIMPLE nodes or larger atoms are implemented using complex constructs.)*/
151 #define STAR 25 /* Match this (simple) thing 0 or more times. */
152 #define LAZY_STAR 26 /* Minimal matching STAR */
153 #define QUESTION 27 /* Match this (simple) thing 0 or 1 times. */
154 #define LAZY_QUESTION 28 /* Minimal matching QUESTION */
155 #define PLUS 29 /* Match this (simple) thing 1 or more times. */
156 #define LAZY_PLUS 30 /* Minimal matching PLUS */
157 #define BRACE 31 /* Match this (simple) thing m to n times. */
158 #define LAZY_BRACE 32 /* Minimal matching BRACE */
160 /* Nodes used to build complex constructs. */
162 #define NOTHING 33 /* Match empty string (always matches) */
163 #define BRANCH 34 /* Match this alternative, or the next... */
164 #define BACK 35 /* Always matches, NEXT ptr points backward. */
165 #define INIT_COUNT 36 /* Initialize {m,n} counter to zero */
166 #define INC_COUNT 37 /* Increment {m,n} counter by one */
167 #define TEST_COUNT 38 /* Test {m,n} counter against operand */
169 /* Back Reference nodes. */
171 #define BACK_REF 39 /* Match latest matched parenthesized text */
172 #define BACK_REF_CI 40 /* Case insensitive version of BACK_REF */
173 #define X_REGEX_BR 41 /* Cross-Regex Back-Ref for syntax highlighting */
174 #define X_REGEX_BR_CI 42 /* Case insensitive version of X_REGEX_BR_CI */
176 /* Various nodes used to implement parenthetical constructs. */
178 #define POS_AHEAD_OPEN 43 /* Begin positive look ahead */
179 #define NEG_AHEAD_OPEN 44 /* Begin negative look ahead */
180 #define LOOK_AHEAD_CLOSE 45 /* End positive or negative look ahead */
182 #define POS_BEHIND_OPEN 46 /* Begin positive look behind */
183 #define NEG_BEHIND_OPEN 47 /* Begin negative look behind */
184 #define LOOK_BEHIND_CLOSE 48 /* Close look behind */
186 #define OPEN 49 /* Open for capturing parentheses. */
188 /* OPEN+1 is number 1, etc. */
189 #define CLOSE (OPEN + NSUBEXP) /* Close for capturing parentheses. */
191 #define LAST_PAREN (CLOSE + NSUBEXP)
193 #if (LAST_PAREN > UCHAR_MAX)
194 #error "Too many parentheses for storage in an unsigned char (LAST_PAREN too big.)"
195 #endif
197 /* The next_ptr () function can consume up to 30% of the time during matching
198 because it is called an immense number of times (an average of 25
199 next_ptr() calls per match() call was witnessed for Perl syntax
200 highlighting). Therefore it is well worth removing some of the function
201 call overhead by selectively inlining the next_ptr() calls. Moreover,
202 the inlined code can be simplified for matching because one of the tests,
203 only necessary during compilation, can be left out.
204 The net result of using this inlined version at two critical places is
205 a 25% speedup (again, witnesses on Perl syntax highlighting). */
207 #define NEXT_PTR(in_ptr, out_ptr)\
208 next_ptr_offset = GET_OFFSET (in_ptr);\
209 if (next_ptr_offset == 0)\
210 out_ptr = NULL;\
211 else {\
212 if (GET_OP_CODE (in_ptr) == BACK)\
213 out_ptr = in_ptr - next_ptr_offset;\
214 else \
215 out_ptr = in_ptr + next_ptr_offset;\
218 /* OPCODE NOTES:
219 ------------
221 All nodes consist of an 8 bit op code followed by 2 bytes that make up a 16
222 bit NEXT pointer. Some nodes have a null terminated character string operand
223 following the NEXT pointer. Other nodes may have an 8 bit index operand.
224 The TEST_COUNT node has an index operand followed by a 16 bit test value.
225 The BRACE and LAZY_BRACE nodes have two 16 bit values for min and max but no
226 index value.
228 SIMILAR
229 Operand(s): null terminated string
231 Implements a case insensitive match of a string. Mostly intended for use
232 in syntax highlighting patterns for keywords of languages like FORTRAN
233 and Ada that are case insensitive. The regex text in this node is
234 converted to lower case during regex compile.
236 DIGIT, NOT_DIGIT, LETTER, NOT_LETTER, SPACE, NOT_SPACE, WORD_CHAR,
237 NOT_WORD_CHAR
238 Operand(s): None
240 Implements shortcut escapes \d, \D, \l, \L, \s, \S, \w, \W. The locale
241 aware ANSI functions isdigit(), isalpha(), isalnun(), and isspace() are
242 used to implement these in the hopes of increasing portability.
244 NOT_BOUNDARY
245 Operand(s): None
247 Implements \B as a zero width assertion that the current character is
248 NOT on a word boundary. Word boundaries are defined to be the position
249 between two characters where one of those characters is one of the
250 dynamically defined word delimiters, and the other character is not.
252 IS_DELIM
253 Operand(s): None
255 Implements \y as any character that is one of the dynamically
256 specified word delimiters.
258 NOT_DELIM
259 Operand(s): None
261 Implements \Y as any character that is NOT one of the dynamically
262 specified word delimiters.
264 STAR, PLUS, QUESTION, and complex '*', '+', and '?'
265 Operand(s): None (Note: NEXT pointer is usually zero. The code that
266 processes this node skips over it.)
268 Complex (parenthesized) versions implemented as circular BRANCH
269 structures using BACK. SIMPLE versions (one character per match) are
270 implemented separately for speed and to minimize recursion.
272 BRACE, LAZY_BRACE
273 Operand(s): minimum value (2 bytes), maximum value (2 bytes)
275 Implements the {m,n} construct for atoms that are SIMPLE.
277 BRANCH
278 Operand(s): None
280 The set of branches constituting a single choice are hooked together
281 with their NEXT pointers, since precedence prevents anything being
282 concatenated to any individual branch. The NEXT pointer of the last
283 BRANCH in a choice points to the thing following the whole choice. This
284 is also where the final NEXT pointer of each individual branch points;
285 each branch starts with the operand node of a BRANCH node.
287 BACK
288 Operand(s): None
290 Normal NEXT pointers all implicitly point forward. Back implicitly
291 points backward. BACK exists to make loop structures possible.
293 INIT_COUNT
294 Operand(s): index (1 byte)
296 Initializes the count array element referenced by the index operand.
297 This node is used to build general (i.e. parenthesized) {m,n} constructs.
299 INC_COUNT
300 Operand(s): index (1 byte)
302 Increments the count array element referenced by the index operand.
303 This node is used to build general (i.e. parenthesized) {m,n} constructs.
305 TEST_COUNT
306 Operand(s): index (1 byte), test value (2 bytes)
308 Tests the current value of the count array element specified by the
309 index operand against the test value. If the current value is less than
310 the test value, control passes to the node after that TEST_COUNT node.
311 Otherwise control passes to the node referenced by the NEXT pointer for
312 the TEST_COUNT node. This node is used to build general (i.e.
313 parenthesized) {m,n} constructs.
315 BACK_REF, BACK_REF_CI
316 Operand(s): index (1 byte, value 1-9)
318 Implements back references. This node will attempt to match whatever text
319 was most recently captured by the index'th set of parentheses.
320 BACK_REF_CI is case insensitive version.
322 X_REGEX_BR, X_REGEX_BR_CI
323 (NOT IMPLEMENTED YET)
325 Operand(s): index (1 byte, value 1-9)
327 Implements back references into a previously matched but separate regular
328 expression. This is used by syntax highlighting patterns. This node will
329 attempt to match whatever text was most captured by the index'th set of
330 parentheses of the separate regex passed to ExecRE. X_REGEX_BR_CI is case
331 insensitive version.
333 POS_AHEAD_OPEN, NEG_AHEAD_OPEN, LOOK_AHEAD_CLOSE
335 Operand(s): None
337 Implements positive and negative look ahead. Look ahead is an assertion
338 that something is either there or not there. Once this is determined the
339 regex engine backtracks to where it was just before the look ahead was
340 encountered, i.e. look ahead is a zero width assertion.
342 POS_BEHIND_OPEN, NEG_BEHIND_OPEN, LOOK_BEHIND_CLOSE
344 Operand(s): 2x2 bytes for OPEN (match boundaries), None for CLOSE
346 Implements positive and negative look behind. Look behind is an assertion
347 that something is either there or not there in front of the current
348 position. Look behind is a zero width assertion, with the additional
349 constraint that it must have a bounded length (for complexity and
350 efficiency reasons; note that most other implementation even impose
351 fixed length).
353 OPEN, CLOSE
355 Operand(s): None
357 OPEN + n = Start of parenthesis 'n', CLOSE + n = Close of parenthesis
358 'n', and are numbered at compile time.
361 /* A node is one char of opcode followed by two chars of NEXT pointer plus
362 * any operands. NEXT pointers are stored as two 8-bit pieces, high order
363 * first. The value is a positive offset from the opcode of the node
364 * containing it. An operand, if any, simply follows the node. (Note that
365 * much of the code generation knows about this implicit relationship.)
367 * Using two bytes for NEXT_PTR_SIZE is vast overkill for most things,
368 * but allows patterns to get big without disasters. */
370 #define OP_CODE_SIZE 1
371 #define NEXT_PTR_SIZE 2
372 #define INDEX_SIZE 1
373 #define LENGTH_SIZE 4
374 #define NODE_SIZE (NEXT_PTR_SIZE + OP_CODE_SIZE)
376 #define GET_OP_CODE(p) (*(unsigned char *)(p))
377 #define OPERAND(p) ((p) + NODE_SIZE)
378 #define GET_OFFSET(p) ((( *((p) + 1) & 0377) << 8) + (( *((p) + 2)) & 0377))
379 #define PUT_OFFSET_L(v) (unsigned char)(((v) >> 8) & 0377)
380 #define PUT_OFFSET_R(v) (unsigned char) ((v) & 0377)
381 #define GET_LOWER(p) ((( *((p) + NODE_SIZE) & 0377) << 8) + \
382 (( *((p) + NODE_SIZE+1)) & 0377))
383 #define GET_UPPER(p) ((( *((p) + NODE_SIZE+2) & 0377) << 8) + \
384 (( *((p) + NODE_SIZE+3)) & 0377))
386 /* Utility definitions. */
388 #define REG_FAIL(m) {*Error_Ptr = (m); return (NULL);}
389 #define IS_QUANTIFIER(c) ((c) == '*' || (c) == '+' || \
390 (c) == '?' || (c) == Brace_Char)
391 #define SET_BIT(i,n) ((i) |= (1 << ((n) - 1)))
392 #define TEST_BIT(i,n) ((i) & (1 << ((n) - 1)))
393 #define U_CHAR_AT(p) ((unsigned int) *(unsigned char *)(p))
395 /* Flags to be passed up and down via function parameters during compile. */
397 #define WORST 0 /* Worst case. No assumptions can be made.*/
398 #define HAS_WIDTH 1 /* Known never to match null string. */
399 #define SIMPLE 2 /* Simple enough to be STAR/PLUS operand. */
401 #define NO_PAREN 0 /* Only set by initial call to "chunk". */
402 #define PAREN 1 /* Used for normal capturing parentheses. */
403 #define NO_CAPTURE 2 /* Non-capturing parentheses (grouping only). */
404 #define INSENSITIVE 3 /* Case insensitive parenthetical construct */
405 #define SENSITIVE 4 /* Case sensitive parenthetical construct */
406 #define NEWLINE 5 /* Construct to match newlines in most cases */
407 #define NO_NEWLINE 6 /* Construct to match newlines normally */
409 #define REG_INFINITY 0UL
410 #define REG_ZERO 0UL
411 #define REG_ONE 1UL
413 /* Flags for function shortcut_escape() */
415 #define CHECK_ESCAPE 0 /* Check an escape sequence for validity only. */
416 #define CHECK_CLASS_ESCAPE 1 /* Check the validity of an escape within a
417 character class */
418 #define EMIT_CLASS_BYTES 2 /* Emit equivalent character class bytes,
419 e.g \d=0123456789 */
420 #define EMIT_NODE 3 /* Emit the appropriate node. */
422 /* Array sizes for arrays used by function init_ansi_classes. */
424 #define WHITE_SPACE_SIZE 16
425 #define ALNUM_CHAR_SIZE 256
427 /* Number of bytes to offset from the beginning of the regex program to the start
428 of the actual compiled regex code, i.e. skipping over the MAGIC number and
429 the two counters at the front. */
431 #define REGEX_START_OFFSET 3
433 #define MAX_COMPILED_SIZE 32767UL /* Largest size a compiled regex can be.
434 Probably could be 65535UL. */
436 /* Global work variables for `CompileRE'. */
438 static unsigned char *Reg_Parse; /* Input scan ptr (scans user's regex) */
439 static int Total_Paren; /* Parentheses, (), counter. */
440 static int Num_Braces; /* Number of general {m,n} constructs.
441 {m,n} quantifiers of SIMPLE atoms are
442 not included in this count. */
443 static int Closed_Parens; /* Bit flags indicating () closure. */
444 static int Paren_Has_Width; /* Bit flags indicating ()'s that are
445 known to not match the empty string */
446 static unsigned char Compute_Size; /* Address of this used as flag. */
447 static unsigned char *Code_Emit_Ptr; /* When Code_Emit_Ptr is set to
448 &Compute_Size no code is emitted.
449 Instead, the size of code that WOULD
450 have been generated is accumulated in
451 Reg_Size. Otherwise, Code_Emit_Ptr
452 points to where compiled regex code is
453 to be written. */
454 static unsigned long Reg_Size; /* Size of compiled regex code. */
455 static char **Error_Ptr; /* Place to store error messages so
456 they can be returned by `CompileRE' */
457 static char Error_Text [128];/* Sting to build error messages in. */
459 static unsigned char White_Space [WHITE_SPACE_SIZE]; /* Arrays used by */
460 static unsigned char Word_Char [ALNUM_CHAR_SIZE]; /* functions */
461 static unsigned char Letter_Char [ALNUM_CHAR_SIZE]; /* init_ansi_classes () */
462 /* and
463 shortcut_escape (). */
465 static unsigned char ASCII_Digits [] = "0123456789"; /* Same for all */
466 /* locales. */
467 static int Is_Case_Insensitive;
468 static int Match_Newline;
470 static int Enable_Counting_Quantifier = 1;
471 static unsigned char Brace_Char;
472 static unsigned char Default_Meta_Char [] = "{.*+?[(|)^<>$";
473 static unsigned char *Meta_Char;
475 typedef struct { long lower; long upper; } len_range;
477 /* Forward declarations for functions used by `CompileRE'. */
479 static unsigned char * alternative (int *flag_param, len_range *range_param);
480 static unsigned char * back_ref (unsigned char *c, int *flag_param,
481 int emit);
482 static unsigned char * chunk (int paren, int *flag_param, len_range *range_param);
483 static void emit_byte (unsigned char c);
484 static void emit_class_byte (unsigned char c);
485 static unsigned char * emit_node (int op_code);
486 static unsigned char * emit_special (unsigned char op_code,
487 unsigned long test_val,
488 int index);
489 static unsigned char literal_escape (unsigned char c);
490 static unsigned char numeric_escape (unsigned char c, unsigned char **parse);
491 static unsigned char * atom (int *flag_param, len_range *range_param);
492 static void reg_error (char *str);
493 static unsigned char * insert (unsigned char op, unsigned char *opnd,
494 long min, long max, int index);
495 static unsigned char * next_ptr (unsigned char *ptr);
496 static void offset_tail (unsigned char *ptr, int offset,
497 unsigned char *val);
498 static void branch_tail (unsigned char *ptr, int offset,
499 unsigned char *val);
500 static unsigned char * piece (int *flag_param, len_range *range_param);
501 static void tail (unsigned char *search_from,
502 unsigned char *point_t);
503 static unsigned char * shortcut_escape (unsigned char c, int *flag_param,
504 int emit);
506 static int init_ansi_classes (void);
508 /*----------------------------------------------------------------------*
509 * CompileRE
511 * Compiles a regular expression into the internal format used by
512 * `ExecRE'.
514 * The default behaviour wrt. case sensitivity and newline matching can
515 * be controlled through the defaultFlags argument (Markus Schwarzenberg).
516 * Future extensions are possible by using other flag bits.
517 * Note that currently only the case sensitivity flag is effectively used.
519 * Beware that the optimization and preparation code in here knows about
520 * some of the structure of the compiled regexp.
521 *----------------------------------------------------------------------*/
523 regexp * CompileRE (const char *exp, char **errorText, int defaultFlags) {
525 register regexp *comp_regex = NULL;
526 register unsigned char *scan;
527 int flags_local, pass;
528 len_range range_local;
530 if (Enable_Counting_Quantifier) {
531 Brace_Char = '{';
532 Meta_Char = &Default_Meta_Char [0];
533 } else {
534 Brace_Char = '*'; /* Bypass the '{' in */
535 Meta_Char = &Default_Meta_Char [1]; /* Default_Meta_Char */
538 /* Set up errorText to receive failure reports. */
540 Error_Ptr = errorText;
541 *Error_Ptr = "";
543 if (exp == NULL) REG_FAIL ("NULL argument, `CompileRE\'");
545 /* Initialize arrays used by function `shortcut_escape'. */
547 if (!init_ansi_classes ()) REG_FAIL ("internal error #1, `CompileRE\'");
549 Code_Emit_Ptr = &Compute_Size;
550 Reg_Size = 0UL;
552 /* We can't allocate space until we know how big the compiled form will be,
553 but we can't compile it (and thus know how big it is) until we've got a
554 place to put the code. So we cheat: we compile it twice, once with code
555 generation turned off and size counting turned on, and once "for real".
556 This also means that we don't allocate space until we are sure that the
557 thing really will compile successfully, and we never have to move the
558 code and thus invalidate pointers into it. (Note that it has to be in
559 one piece because free() must be able to free it all.) */
561 for (pass = 1; pass <= 2; pass++) {
562 /*-------------------------------------------*
563 * FIRST PASS: Determine size and legality. *
564 * SECOND PASS: Emit code. *
565 *-------------------------------------------*/
567 /* Schwarzenberg:
568 * If defaultFlags = 0 use standard defaults:
569 * Is_Case_Insensitive: Case sensitive is the default
570 * Match_Newline: Newlines are NOT matched by default
571 * in character classes
573 Is_Case_Insensitive = ((defaultFlags & REDFLT_CASE_INSENSITIVE) ? 1 : 0);
574 Match_Newline = 0; /* ((defaultFlags & REDFLT_MATCH_NEWLINE) ? 1 : 0);
575 Currently not used. Uncomment if needed. */
577 Reg_Parse = (unsigned char *) exp;
578 Total_Paren = 1;
579 Num_Braces = 0;
580 Closed_Parens = 0;
581 Paren_Has_Width = 0;
583 emit_byte (MAGIC);
584 emit_byte ('%'); /* Placeholder for num of capturing parentheses. */
585 emit_byte ('%'); /* Placeholder for num of general {m,n} constructs. */
587 if (chunk (NO_PAREN, &flags_local, &range_local) == NULL)
588 return (NULL); /* Something went wrong */
589 if (pass == 1) {
590 if (Reg_Size >= MAX_COMPILED_SIZE) {
591 /* Too big for NEXT pointers NEXT_PTR_SIZE bytes long to span.
592 This is a real issue since the first BRANCH node usually points
593 to the end of the compiled regex code. */
595 sprintf (Error_Text, "regexp > %lu bytes", MAX_COMPILED_SIZE);
596 REG_FAIL (Error_Text);
599 /* Allocate memory. */
601 comp_regex = (regexp *) malloc (sizeof (regexp) + Reg_Size);
603 if (comp_regex == NULL) REG_FAIL ("out of memory in `CompileRE\'");
605 Code_Emit_Ptr = (unsigned char *) comp_regex->program;
609 comp_regex->program [1] = (unsigned char) Total_Paren - 1;
610 comp_regex->program [2] = (unsigned char) Num_Braces;
612 /*----------------------------------------*
613 * Dig out information for optimizations. *
614 *----------------------------------------*/
616 comp_regex->match_start = '\0'; /* Worst-case defaults. */
617 comp_regex->anchor = 0;
619 /* First BRANCH. */
621 scan = (unsigned char *) (comp_regex->program + REGEX_START_OFFSET);
623 if (GET_OP_CODE (next_ptr (scan)) == END) { /* Only one top-level choice. */
624 scan = OPERAND (scan);
626 /* Starting-point info. */
628 if (GET_OP_CODE (scan) == EXACTLY) {
629 comp_regex->match_start = *OPERAND (scan);
631 } else if (PLUS <= GET_OP_CODE (scan) &&
632 GET_OP_CODE (scan) <= LAZY_PLUS) {
634 /* Allow x+ or x+? at the start of the regex to be
635 optimized. */
637 if (GET_OP_CODE (scan + NODE_SIZE) == EXACTLY) {
638 comp_regex->match_start = *OPERAND (scan + NODE_SIZE);
640 } else if (GET_OP_CODE (scan) == BOL) {
641 comp_regex->anchor++;
645 return (comp_regex);
648 /*----------------------------------------------------------------------*
649 * chunk *
651 * Process main body of regex or process a parenthesized "thing". *
653 * Caller must absorb opening parenthesis. *
655 * Combining parenthesis handling with the base level of regular *
656 * expression is a trifle forced, but the need to tie the tails of the *
657 * branches to what follows makes it hard to avoid. *
658 *----------------------------------------------------------------------*/
660 static unsigned char * chunk (int paren, int *flag_param,
661 len_range *range_param) {
663 register unsigned char *ret_val = NULL;
664 register unsigned char *this_branch;
665 register unsigned char *ender = NULL;
666 register int this_paren = 0;
667 int flags_local, first = 1, zero_width, i;
668 int old_sensitive = Is_Case_Insensitive;
669 int old_newline = Match_Newline;
670 len_range range_local;
671 int look_only = 0;
672 unsigned char *emit_look_behind_bounds = NULL;
675 *flag_param = HAS_WIDTH; /* Tentatively. */
676 range_param->lower = 0; /* Idem */
677 range_param->upper = 0;
679 /* Make an OPEN node, if parenthesized. */
681 if (paren == PAREN) {
682 if (Total_Paren >= NSUBEXP) {
683 sprintf (Error_Text, "number of ()'s > %d", (int) NSUBEXP);
684 REG_FAIL (Error_Text);
687 this_paren = Total_Paren; Total_Paren++;
688 ret_val = emit_node (OPEN + this_paren);
689 } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
690 *flag_param = WORST; /* Look ahead is zero width. */
691 look_only = 1;
692 ret_val = emit_node (paren);
693 } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
694 *flag_param = WORST; /* Look behind is zero width. */
695 look_only = 1;
696 /* We'll overwrite the zero length later on, so we save the ptr */
697 ret_val = emit_special (paren, 0, 0);
698 emit_look_behind_bounds = ret_val + NODE_SIZE;
699 } else if (paren == INSENSITIVE) {
700 Is_Case_Insensitive = 1;
701 } else if (paren == SENSITIVE) {
702 Is_Case_Insensitive = 0;
703 } else if (paren == NEWLINE) {
704 Match_Newline = 1;
705 } else if (paren == NO_NEWLINE) {
706 Match_Newline = 0;
709 /* Pick up the branches, linking them together. */
711 do {
712 this_branch = alternative (&flags_local, &range_local);
714 if (this_branch == NULL) return (NULL);
716 if (first) {
717 first = 0;
718 *range_param = range_local;
719 if (ret_val == NULL) ret_val = this_branch;
720 } else if (range_param->lower >= 0) {
721 if (range_local.lower >= 0) {
722 if (range_local.lower < range_param->lower)
723 range_param->lower = range_local.lower;
724 if (range_local.upper > range_param->upper)
725 range_param->upper = range_local.upper;
726 } else {
727 range_param->lower = -1; /* Branches have different lengths */
728 range_param->upper = -1;
732 tail (ret_val, this_branch); /* Connect BRANCH -> BRANCH. */
734 /* If any alternative could be zero width, consider the whole
735 parenthisized thing to be zero width. */
737 if (!(flags_local & HAS_WIDTH)) *flag_param &= ~HAS_WIDTH;
739 /* Are there more alternatives to process? */
741 if (*Reg_Parse != '|') break;
743 Reg_Parse++;
744 } while (1);
746 /* Make a closing node, and hook it on the end. */
748 if (paren == PAREN) {
749 ender = emit_node (CLOSE + this_paren);
751 } else if (paren == NO_PAREN) {
752 ender = emit_node (END);
754 } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
755 ender = emit_node (LOOK_AHEAD_CLOSE);
757 } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
758 ender = emit_node (LOOK_BEHIND_CLOSE);
760 } else {
761 ender = emit_node (NOTHING);
764 tail (ret_val, ender);
766 /* Hook the tails of the branch alternatives to the closing node. */
768 for (this_branch = ret_val; this_branch != NULL; ) {
769 branch_tail (this_branch, NODE_SIZE, ender);
770 this_branch = next_ptr (this_branch);
773 /* Check for proper termination. */
775 if (paren != NO_PAREN && *Reg_Parse++ != ')') {
776 REG_FAIL ("missing right parenthesis \')\'");
777 } else if (paren == NO_PAREN && *Reg_Parse != '\0') {
778 if (*Reg_Parse == ')') {
779 REG_FAIL ("missing left parenthesis \'(\'");
780 } else {
781 REG_FAIL ("junk on end"); /* "Can't happen" - NOTREACHED */
785 /* Check whether look behind has a fixed size */
787 if (emit_look_behind_bounds) {
788 if (range_param->lower < 0) {
789 REG_FAIL ("look-behind does not have a bounded size");
791 if (range_param->upper > 65535L) {
792 REG_FAIL ("max. look-behind size is too large (>65535)")
794 if (Code_Emit_Ptr != &Compute_Size) {
795 *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->lower);
796 *emit_look_behind_bounds++ = PUT_OFFSET_R (range_param->lower);
797 *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->upper);
798 *emit_look_behind_bounds = PUT_OFFSET_R (range_param->upper);
802 /* For look ahead/behind, the length must be set to zero again */
803 if (look_only) {
804 range_param->lower = 0;
805 range_param->upper = 0;
808 zero_width = 0;
810 /* Set a bit in Closed_Parens to let future calls to function `back_ref'
811 know that we have closed this set of parentheses. */
813 if (paren == PAREN && this_paren <= (int)sizeof (Closed_Parens) * CHAR_BIT) {
814 SET_BIT (Closed_Parens, this_paren);
816 /* Determine if a parenthesized expression is modified by a quantifier
817 that can have zero width. */
819 if (*(Reg_Parse) == '?' || *(Reg_Parse) == '*') {
820 zero_width++;
821 } else if (*(Reg_Parse) == '{' && Brace_Char == '{') {
822 if (*(Reg_Parse + 1) == ',' || *(Reg_Parse + 1) == '}') {
823 zero_width++;
824 } else if (*(Reg_Parse + 1) == '0') {
825 i = 2;
827 while (*(Reg_Parse + i) == '0') i++;
829 if (*(Reg_Parse + i) == ',') zero_width++;
834 /* If this set of parentheses is known to never match the empty string, set
835 a bit in Paren_Has_Width to let future calls to function back_ref know
836 that this set of parentheses has non-zero width. This will allow star
837 (*) or question (?) quantifiers to be aplied to a back-reference that
838 refers to this set of parentheses. */
840 if ((*flag_param & HAS_WIDTH) &&
841 paren == PAREN &&
842 !zero_width &&
843 this_paren <= (int)(sizeof (Paren_Has_Width) * CHAR_BIT)) {
845 SET_BIT (Paren_Has_Width, this_paren);
848 Is_Case_Insensitive = old_sensitive;
849 Match_Newline = old_newline;
851 return (ret_val);
854 /*----------------------------------------------------------------------*
855 * alternative
857 * Processes one alternative of an '|' operator. Connects the NEXT
858 * pointers of each regex atom together sequentialy.
859 *----------------------------------------------------------------------*/
861 static unsigned char * alternative (int *flag_param, len_range *range_param) {
863 register unsigned char *ret_val;
864 register unsigned char *chain;
865 register unsigned char *latest;
866 int flags_local;
867 len_range range_local;
869 *flag_param = WORST; /* Tentatively. */
870 range_param->lower = 0; /* Idem */
871 range_param->upper = 0;
873 ret_val = emit_node (BRANCH);
874 chain = NULL;
876 /* Loop until we hit the start of the next alternative, the end of this set
877 of alternatives (end of parentheses), or the end of the regex. */
879 while (*Reg_Parse != '|' && *Reg_Parse != ')' && *Reg_Parse != '\0') {
880 latest = piece (&flags_local, &range_local);
882 if (latest == NULL) return (NULL); /* Something went wrong. */
884 *flag_param |= flags_local & HAS_WIDTH;
885 if (range_local.lower < 0) {
886 /* Not a fixed length */
887 range_param->lower = -1;
888 range_param->upper = -1;
889 } else if (range_param->lower >= 0) {
890 range_param->lower += range_local.lower;
891 range_param->upper += range_local.upper;
894 if (chain != NULL) { /* Connect the regex atoms together sequentialy. */
895 tail (chain, latest);
898 chain = latest;
901 if (chain == NULL) { /* Loop ran zero times. */
902 (void) emit_node (NOTHING);
905 return (ret_val);
908 /*----------------------------------------------------------------------*
909 * piece - something followed by possible '*', '+', '?', or "{m,n}"
911 * Note that the branching code sequences used for the general cases of
912 * *, +. ?, and {m,n} are somewhat optimized: they use the same
913 * NOTHING node as both the endmarker for their branch list and the
914 * body of the last branch. It might seem that this node could be
915 * dispensed with entirely, but the endmarker role is not redundant.
916 *----------------------------------------------------------------------*/
918 static unsigned char * piece (int *flag_param, len_range *range_param) {
920 register unsigned char *ret_val;
921 register unsigned char *next;
922 register unsigned char op_code;
923 unsigned long min_max [2] = {REG_ZERO, REG_INFINITY};
924 int flags_local, i, brace_present = 0;
925 int lazy = 0, comma_present = 0;
926 int digit_present [2] = {0,0};
927 len_range range_local;
929 ret_val = atom (&flags_local, &range_local);
931 if (ret_val == NULL) return (NULL); /* Something went wrong. */
933 op_code = *Reg_Parse;
935 if (!IS_QUANTIFIER (op_code)) {
936 *flag_param = flags_local;
937 *range_param = range_local;
938 return (ret_val);
939 } else if (op_code == '{') { /* {n,m} quantifier present */
940 brace_present++;
941 Reg_Parse++;
943 /* This code will allow specifying a counting range in any of the
944 following forms:
946 {m,n} between m and n.
947 {,n} same as {0,n} or between 0 and infinity.
948 {m,} same as {m,0} or between m and infinity.
949 {m} same as {m,m} or exactly m.
950 {,} same as {0,0} or between 0 and infinity or just '*'.
951 {} same as {0,0} or between 0 and infinity or just '*'.
953 Note that specifying a max of zero, {m,0} is not allowed in the regex
954 itself, but it is implemented internally that way to support '*', '+',
955 and {min,} constructs and signals an unlimited number. */
957 for (i = 0; i < 2; i++) {
958 /* Look for digits of number and convert as we go. The numeric maximum
959 value for max and min of 65,535 is due to using 2 bytes to store
960 each value in the compiled regex code. */
962 while (isdigit (*Reg_Parse)) {
963 /* (6553 * 10 + 6) > 65535 (16 bit max) */
965 if ((min_max [i] == 6553UL && (*Reg_Parse - '0') <= 5) ||
966 (min_max [i] <= 6552UL)) {
968 min_max [i] = (min_max [i] * 10UL) +
969 (unsigned long) (*Reg_Parse - '0');
970 Reg_Parse++;
972 digit_present [i]++;
973 } else {
974 if (i == 0) {
975 sprintf (Error_Text, "min operand of {%lu%c,???} > 65535",
976 min_max [0], *Reg_Parse);
977 } else {
978 sprintf (Error_Text, "max operand of {%lu,%lu%c} > 65535",
979 min_max [0], min_max [1], *Reg_Parse);
982 REG_FAIL (Error_Text);
986 if (!comma_present && *Reg_Parse == ',') {
987 comma_present++;
988 Reg_Parse++;
992 /* A max of zero can not be specified directly in the regex since it would
993 signal a max of infinity. This code specifically disallows `{0,0}',
994 `{,0}', and `{0}' which really means nothing to humans but would be
995 interpreted as `{0,infinity}' or `*' if we didn't make this check. */
997 if (digit_present [0] && (min_max [0] == REG_ZERO) && !comma_present) {
999 REG_FAIL ("{0} is an invalid range");
1000 } else if (digit_present [0] && (min_max [0] == REG_ZERO) &&
1001 digit_present [1] && (min_max [1] == REG_ZERO)) {
1003 REG_FAIL ("{0,0} is an invalid range");
1004 } else if (digit_present [1] && (min_max [1] == REG_ZERO)) {
1005 if (digit_present [0]) {
1006 sprintf (Error_Text, "{%lu,0} is an invalid range", min_max [0]);
1007 REG_FAIL (Error_Text);
1008 } else {
1009 REG_FAIL ("{,0} is an invalid range");
1013 if (!comma_present) min_max [1] = min_max [0]; /* {x} means {x,x} */
1015 if (*Reg_Parse != '}') {
1016 REG_FAIL ("{m,n} specification missing right \'}\'");
1018 } else if (min_max [1] != REG_INFINITY && min_max [0] > min_max [1]) {
1019 /* Disallow a backward range. */
1021 sprintf (Error_Text, "{%lu,%lu} is an invalid range",
1022 min_max [0], min_max [1]);
1023 REG_FAIL (Error_Text);
1027 Reg_Parse++;
1029 /* Check for a minimal matching (non-greedy or "lazy") specification. */
1031 if (*Reg_Parse == '?') {
1032 lazy = 1;
1033 Reg_Parse++;
1036 /* Avoid overhead of counting if possible */
1038 if (op_code == '{') {
1039 if (min_max [0] == REG_ZERO && min_max [1] == REG_INFINITY) {
1040 op_code = '*';
1041 } else if (min_max [0] == REG_ONE && min_max [1] == REG_INFINITY) {
1042 op_code = '+';
1043 } else if (min_max [0] == REG_ZERO && min_max [1] == REG_ONE) {
1044 op_code = '?';
1045 } else if (min_max [0] == REG_ONE && min_max [1] == REG_ONE) {
1046 /* "x{1,1}" is the same as "x". No need to pollute the compiled
1047 regex with such nonsense. */
1049 *flag_param = flags_local;
1050 *range_param = range_local;
1051 return (ret_val);
1052 } else if (Num_Braces > (int)UCHAR_MAX) {
1053 sprintf (Error_Text, "number of {m,n} constructs > %d", UCHAR_MAX);
1054 REG_FAIL (Error_Text);
1058 if (op_code == '+') min_max [0] = REG_ONE;
1059 if (op_code == '?') min_max [1] = REG_ONE;
1061 /* It is dangerous to apply certain quantifiers to a possibly zero width
1062 item. */
1064 if (!(flags_local & HAS_WIDTH)) {
1065 if (brace_present) {
1066 sprintf (Error_Text, "{%lu,%lu} operand could be empty",
1067 min_max [0], min_max [1]);
1068 } else {
1069 sprintf (Error_Text, "%c operand could be empty", op_code);
1072 REG_FAIL (Error_Text);
1075 *flag_param = (min_max [0] > REG_ZERO) ? (WORST | HAS_WIDTH) : WORST;
1076 if (range_local.lower >= 0) {
1077 if (min_max[1] != REG_INFINITY) {
1078 range_param->lower = range_local.lower * min_max[0];
1079 range_param->upper = range_local.upper * min_max[1];
1080 } else {
1081 range_param->lower = -1; /* Not a fixed-size length */
1082 range_param->upper = -1;
1084 } else {
1085 range_param->lower = -1; /* Not a fixed-size length */
1086 range_param->upper = -1;
1089 /*---------------------------------------------------------------------*
1090 * Symbol Legend For Node Structure Diagrams
1091 *---------------------------------------------------------------------*
1092 * (...) = general grouped thing
1093 * B = (B)ranch, K = bac(K), N = (N)othing
1094 * I = (I)nitialize count, C = Increment (C)ount
1095 * T~m = (T)est against mini(m)um- go to NEXT pointer if >= operand
1096 * T~x = (T)est against ma(x)imum- go to NEXT pointer if >= operand
1097 * '~' = NEXT pointer, \___| = forward pointer, |___/ = Backward pointer
1098 *---------------------------------------------------------------------*/
1100 if (op_code == '*' && (flags_local & SIMPLE)) {
1101 insert ((lazy ? LAZY_STAR : STAR), ret_val, 0UL, 0UL, 0);
1103 } else if (op_code == '+' && (flags_local & SIMPLE)) {
1104 insert (lazy ? LAZY_PLUS : PLUS, ret_val, 0UL, 0UL, 0);
1106 } else if (op_code == '?' && (flags_local & SIMPLE)) {
1107 insert (lazy ? LAZY_QUESTION : QUESTION, ret_val, 0UL, 0UL, 0);
1109 } else if (op_code == '{' && (flags_local & SIMPLE)) {
1110 insert (lazy ? LAZY_BRACE : BRACE, ret_val, min_max [0], min_max [1], 0);
1112 } else if ((op_code == '*' || op_code == '+') && lazy) {
1113 /* Node structure for (x)*? Node structure for (x)+? construct.
1114 * construct. (Same as (x)*? except for initial
1115 * forward jump into parenthesis.)
1117 * ___6____
1118 * _______5_______ /________|______
1119 * | _4__ 1_\ /| ____ | _\
1120 * |/ | / |\ / |/ | | / |\
1121 * B~ N~ B~ (...)~ K~ N~ N~ B~ N~ B~ (...)~ K~ N~
1122 * \ \___2_______| \ \___________|
1123 * \_____3_______| \_____________|
1127 tail (ret_val, emit_node (BACK)); /* 1 */
1128 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 2,4 */
1129 (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 3 */
1131 next = emit_node (NOTHING); /* 2,3 */
1133 offset_tail (ret_val, NODE_SIZE, next); /* 2 */
1134 tail (ret_val, next); /* 3 */
1135 insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 4,5 */
1136 tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 4 */
1137 offset_tail (ret_val, 3 * NODE_SIZE, ret_val); /* 5 */
1139 if (op_code == '+') {
1140 insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 6 */
1141 tail (ret_val, ret_val + (4 * NODE_SIZE)); /* 6 */
1143 } else if (op_code == '*') {
1144 /* Node structure for (x)* construct.
1145 * ____1_____
1146 * | \
1147 * B~ (...)~ K~ B~ N~
1148 * \ \_|2 |\_|
1149 * \__3_______| 4
1152 insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 1,3 */
1153 offset_tail (ret_val, NODE_SIZE, emit_node (BACK)); /* 2 */
1154 offset_tail (ret_val, NODE_SIZE, ret_val); /* 1 */
1155 tail (ret_val, emit_node (BRANCH)); /* 3 */
1156 tail (ret_val, emit_node (NOTHING)); /* 4 */
1157 } else if (op_code == '+') {
1158 /* Node structure for (x)+ construct.
1160 * ____2_____
1161 * | \
1162 * (...)~ B~ K~ B~ N~
1163 * \_|\____|\_|
1164 * 1 3 4
1167 next = emit_node (BRANCH); /* 1 */
1169 tail (ret_val, next); /* 1 */
1170 tail (emit_node (BACK), ret_val); /* 2 */
1171 tail (next, emit_node (BRANCH)); /* 3 */
1172 tail (ret_val, emit_node (NOTHING)); /* 4 */
1173 } else if (op_code == '?' && lazy) {
1174 /* Node structure for (x)?? construct.
1175 * _4__ 1_
1176 * / | / |
1177 * B~ N~ B~ (...)~ N~
1178 * \ \___2____|
1179 * \_____3____|
1182 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 2,4 */
1183 (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 3 */
1185 next = emit_node (NOTHING); /* 1,2,3 */
1187 offset_tail (ret_val, 2 * NODE_SIZE, next); /* 1 */
1188 offset_tail (ret_val, NODE_SIZE, next); /* 2 */
1189 tail (ret_val, next); /* 3 */
1190 insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 4 */
1191 tail (ret_val, (ret_val + (2 * NODE_SIZE))); /* 4 */
1193 } else if (op_code == '?') {
1194 /* Node structure for (x)? construct.
1195 * ___1____ _2
1196 * / |/ |
1197 * B~ (...)~ B~ N~
1198 * \__3_|
1201 insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 1 */
1202 tail (ret_val, emit_node (BRANCH)); /* 1 */
1204 next = emit_node (NOTHING); /* 2,3 */
1206 tail (ret_val, next); /* 2 */
1207 offset_tail (ret_val, NODE_SIZE, next); /* 3 */
1208 } else if (op_code == '{' && min_max [0] == min_max [1]) {
1209 /* Node structure for (x){m}, (x){m}?, (x){m,m}, or (x){m,m}? constructs.
1210 * Note that minimal and maximal matching mean the same thing when we
1211 * specify the minimum and maximum to be the same value.
1212 * _______3_____
1213 * | 1_ _2 \
1214 * | / |/ | \
1215 * I~ (...)~ C~ T~m K~ N~
1216 * \_| \_____|
1217 * 5 4
1220 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1221 tail (ret_val, emit_special (TEST_COUNT, min_max [0], Num_Braces));/* 2 */
1222 tail (emit_node (BACK), ret_val); /* 3 */
1223 tail (ret_val, emit_node (NOTHING)); /* 4 */
1225 next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 5 */
1227 tail (ret_val, next); /* 5 */
1229 Num_Braces++;
1230 } else if (op_code == '{' && lazy) {
1231 if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
1232 /* Node structure for (x){0,n}? or {,n}? construct.
1233 * _________3____________
1234 * 8_| _4__ 1_ _2 \
1235 * / |/ | / |/ | \
1236 * I~ B~ N~ B~ (...)~ C~ T~x K~ N~
1237 * \ \ \__7__|
1238 * \ \_________6_______|
1239 * \______5____________|
1242 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1244 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,7 */
1246 tail (ret_val, next); /* 2 */
1247 (void) insert (BRANCH, ret_val, 0UL, 0UL, Num_Braces); /* 4,6 */
1248 (void) insert (NOTHING, ret_val, 0UL, 0UL, Num_Braces); /* 5 */
1249 (void) insert (BRANCH, ret_val, 0UL, 0UL, Num_Braces); /* 3,4,8 */
1250 tail (emit_node (BACK), ret_val); /* 3 */
1251 tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 4 */
1253 next = emit_node (NOTHING); /* 5,6,7 */
1255 offset_tail (ret_val, NODE_SIZE, next); /* 5 */
1256 offset_tail (ret_val, 2 * NODE_SIZE, next); /* 6 */
1257 offset_tail (ret_val, 3 * NODE_SIZE, next); /* 7 */
1259 next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 8 */
1261 tail (ret_val, next); /* 8 */
1263 } else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
1264 /* Node structure for (x){m,}? construct.
1265 * ______8_________________
1266 * | _______3_____ \
1267 * | _7__ | 1_ _2 \ \
1268 * |/ | | / |/ | \ \
1269 * I~ B~ N~ B~ (...)~ C~ T~m K~ K~ N~
1270 * \_____\__\_| \_4___| |
1271 * 9 \ \_________5__________|
1272 * \_______6______________|
1275 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1277 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,4 */
1279 tail (ret_val, next); /* 2 */
1280 tail (emit_node (BACK), ret_val); /* 3 */
1281 tail (ret_val, emit_node (BACK)); /* 4 */
1282 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 5,7 */
1283 (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 6 */
1285 next = emit_node (NOTHING); /* 5,6 */
1287 offset_tail (ret_val, NODE_SIZE, next); /* 5 */
1288 tail (ret_val, next); /* 6 */
1289 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 7,8 */
1290 tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 7 */
1291 offset_tail (ret_val, 3 * NODE_SIZE, ret_val); /* 8 */
1292 (void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
1293 tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE)); /* 9 */
1295 } else {
1296 /* Node structure for (x){m,n}? construct.
1297 * ______9_____________________
1298 * | _____________3___ \
1299 * | __8_ | 1_ _2 \ \
1300 * |/ | | / |/ | \ \
1301 * I~ B~ N~ B~ (...)~ C~ T~x T~m K~ K~ N~
1302 * \_____\__\_| \ \__4__| |
1303 * 10 \ \ \_7_________|
1304 * \ \_________6_____________|
1305 * \_______5_________________|
1308 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1310 next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,7 */
1312 tail (ret_val, next); /* 2 */
1314 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
1316 tail (emit_node (BACK), ret_val); /* 3 */
1317 tail (next, emit_node (BACK)); /* 4 */
1318 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 6,8 */
1319 (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 5 */
1320 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 8,9 */
1322 next = emit_node (NOTHING); /* 5,6,7 */
1324 offset_tail (ret_val, NODE_SIZE, next); /* 5 */
1325 offset_tail (ret_val, 2 * NODE_SIZE, next); /* 6 */
1326 offset_tail (ret_val, 3 * NODE_SIZE, next); /* 7 */
1327 tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 8 */
1328 offset_tail (next, -NODE_SIZE, ret_val); /* 9 */
1329 insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 10 */
1330 tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE)); /* 10 */
1333 Num_Braces++;
1334 } else if (op_code == '{') {
1335 if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
1336 /* Node structure for (x){0,n} or (x){,n} construct.
1338 * ___3____________
1339 * | 1_ _2 \ 5_
1340 * | / |/ | \ / |
1341 * I~ B~ (...)~ C~ T~x K~ B~ N~
1342 * \_|\ \_6___|__|
1343 * 7 \________4________|
1346 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1348 next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,6 */
1350 tail (ret_val, next); /* 2 */
1351 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 3,4,7 */
1352 tail (emit_node (BACK), ret_val); /* 3 */
1354 next = emit_node (BRANCH); /* 4,5 */
1356 tail (ret_val, next); /* 4 */
1357 tail (next, emit_node (NOTHING)); /* 5,6 */
1358 offset_tail (ret_val, NODE_SIZE, next); /* 6 */
1360 next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 7 */
1362 tail (ret_val, next); /* 7 */
1364 } else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
1365 /* Node structure for (x){m,} construct.
1366 * __________4________
1367 * | __3__________ \
1368 * _|___| 1_ _2 \ \ _7
1369 * / | 8 | / |/ | \ \ / |
1370 * I~ B~ (...)~ C~ T~m K~ K~ B~ N~
1371 * \ \_5___| |
1372 * \__________6__________|
1375 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1377 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2 */
1379 tail (ret_val, next); /* 2 */
1380 tail (emit_node (BACK), ret_val); /* 3 */
1381 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 4,6 */
1383 next = emit_node (BACK); /* 4 */
1385 tail (next, ret_val); /* 4 */
1386 offset_tail (ret_val, NODE_SIZE, next); /* 5 */
1387 tail (ret_val, emit_node (BRANCH)); /* 6 */
1388 tail (ret_val, emit_node (NOTHING)); /* 7 */
1390 insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 8 */
1392 tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE)); /* 8 */
1394 } else {
1395 /* Node structure for (x){m,n} construct.
1396 * _____6________________
1397 * | _____________3___ \
1398 * 9_|__| 1_ _2 \ \ _8
1399 * / | | / |/ | \ \ / |
1400 * I~ B~ (...)~ C~ T~x T~m K~ K~ B~ N~
1401 * \ \ \__4__| | |
1402 * \ \_7_________|__|
1403 * \_________5_____________|
1406 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1408 next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,4 */
1410 tail (ret_val, next); /* 2 */
1412 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
1414 tail (emit_node (BACK), ret_val); /* 3 */
1415 tail (next, emit_node (BACK)); /* 4 */
1416 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 5,6 */
1418 next = emit_node (BRANCH); /* 5,8 */
1420 tail (ret_val, next); /* 5 */
1421 offset_tail (next, -NODE_SIZE, ret_val); /* 6 */
1423 next = emit_node (NOTHING); /* 7,8 */
1425 offset_tail (ret_val, NODE_SIZE, next); /* 7 */
1427 offset_tail (next, -NODE_SIZE, next); /* 8 */
1428 (void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
1429 tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE)); /* 9 */
1432 Num_Braces++;
1433 } else {
1434 /* We get here if the IS_QUANTIFIER macro is not coordinated properly
1435 with this function. */
1437 REG_FAIL ("internal error #2, `piece\'");
1440 if (IS_QUANTIFIER (*Reg_Parse)) {
1441 if (op_code == '{') {
1442 sprintf (Error_Text, "nested quantifiers, {m,n}%c", *Reg_Parse);
1443 } else {
1444 sprintf (Error_Text, "nested quantifiers, %c%c", op_code, *Reg_Parse);
1447 REG_FAIL (Error_Text);
1450 return (ret_val);
1453 /*----------------------------------------------------------------------*
1454 * atom
1456 * Process one regex item at the lowest level
1458 * OPTIMIZATION: Lumps a continuous sequence of ordinary characters
1459 * together so that it can turn them into a single EXACTLY node, which
1460 * is smaller to store and faster to run.
1461 *----------------------------------------------------------------------*/
1463 static unsigned char * atom (int *flag_param, len_range *range_param) {
1465 register unsigned char *ret_val;
1466 unsigned char test;
1467 int flags_local;
1468 len_range range_local;
1470 *flag_param = WORST; /* Tentatively. */
1471 range_param->lower = 0; /* Idem */
1472 range_param->upper = 0;
1474 /* Process any regex comments, e.g. `(?# match next token->)'. The
1475 terminating right parenthesis can not be escaped. The comment stops at
1476 the first right parenthesis encountered (or the end of the regex
1477 string)... period. Handles multiple sequential comments,
1478 e.g. `(?# one)(?# two)...' */
1480 while (*Reg_Parse == '(' &&
1481 *(Reg_Parse + 1) == '?' &&
1482 *(Reg_Parse + 2) == '#') {
1484 Reg_Parse += 3;
1486 while (*Reg_Parse != ')' && *Reg_Parse != '\0') {
1487 Reg_Parse++;
1490 if (*Reg_Parse == ')') {
1491 Reg_Parse++;
1494 if (*Reg_Parse == ')' || *Reg_Parse == '|' || *Reg_Parse == '\0') {
1495 /* Hit end of regex string or end of parenthesized regex; have to
1496 return "something" (i.e. a NOTHING node) to avoid generating an
1497 error. */
1499 ret_val = emit_node (NOTHING);
1501 return (ret_val);
1505 switch (*Reg_Parse++) {
1506 case '^':
1507 ret_val = emit_node (BOL);
1508 break;
1510 case '$':
1511 ret_val = emit_node (EOL);
1512 break;
1514 case '<':
1515 ret_val = emit_node (BOWORD);
1516 break;
1518 case '>':
1519 ret_val = emit_node (EOWORD);
1520 break;
1522 case '.':
1523 if (Match_Newline) {
1524 ret_val = emit_node (EVERY);
1525 } else {
1526 ret_val = emit_node (ANY);
1529 *flag_param |= (HAS_WIDTH | SIMPLE);
1530 range_param->lower = 1;
1531 range_param->upper = 1;
1532 break;
1534 case '(':
1535 if (*Reg_Parse == '?') { /* Special parenthetical expression */
1536 Reg_Parse++;
1537 range_local.lower = 0; /* Make sure it is always used */
1538 range_local.upper = 0;
1540 if (*Reg_Parse == ':') {
1541 Reg_Parse++;
1542 ret_val = chunk (NO_CAPTURE, &flags_local, &range_local);
1543 } else if (*Reg_Parse == '=') {
1544 Reg_Parse++;
1545 ret_val = chunk (POS_AHEAD_OPEN, &flags_local, &range_local);
1546 } else if (*Reg_Parse == '!') {
1547 Reg_Parse++;
1548 ret_val = chunk (NEG_AHEAD_OPEN, &flags_local, &range_local);
1549 } else if (*Reg_Parse == 'i') {
1550 Reg_Parse++;
1551 ret_val = chunk (INSENSITIVE, &flags_local, &range_local);
1552 } else if (*Reg_Parse == 'I') {
1553 Reg_Parse++;
1554 ret_val = chunk (SENSITIVE, &flags_local, &range_local);
1555 } else if (*Reg_Parse == 'n') {
1556 Reg_Parse++;
1557 ret_val = chunk (NEWLINE, &flags_local, &range_local);
1558 } else if (*Reg_Parse == 'N') {
1559 Reg_Parse++;
1560 ret_val = chunk (NO_NEWLINE, &flags_local, &range_local);
1561 } else if (*Reg_Parse == '<') {
1562 Reg_Parse++;
1563 if (*Reg_Parse == '=') {
1564 Reg_Parse++;
1565 ret_val = chunk (POS_BEHIND_OPEN, &flags_local, &range_local);
1566 } else if (*Reg_Parse == '!') {
1567 Reg_Parse++;
1568 ret_val = chunk (NEG_BEHIND_OPEN, &flags_local, &range_local);
1569 } else {
1570 sprintf (Error_Text,
1571 "invalid look-behind syntax, \"(?<%c...)\"",
1572 *Reg_Parse);
1574 REG_FAIL (Error_Text);
1576 } else {
1577 sprintf (Error_Text,
1578 "invalid grouping syntax, \"(?%c...)\"",
1579 *Reg_Parse);
1581 REG_FAIL (Error_Text);
1583 } else { /* Normal capturing parentheses */
1584 ret_val = chunk (PAREN, &flags_local, &range_local);
1587 if (ret_val == NULL) return (NULL); /* Something went wrong. */
1589 /* Add HAS_WIDTH flag if it was set by call to chunk. */
1591 *flag_param |= flags_local & HAS_WIDTH;
1592 *range_param = range_local;
1594 break;
1596 case '\0':
1597 case '|':
1598 case ')':
1599 REG_FAIL ("internal error #3, `atom\'"); /* Supposed to be */
1600 /* caught earlier. */
1601 case '?':
1602 case '+':
1603 case '*':
1604 sprintf (Error_Text, "%c follows nothing", *(Reg_Parse - 1));
1605 REG_FAIL (Error_Text);
1607 case '{':
1608 if (Enable_Counting_Quantifier) {
1609 REG_FAIL ("{m,n} follows nothing");
1610 } else {
1611 ret_val = emit_node (EXACTLY); /* Treat braces as literals. */
1612 emit_byte ('{');
1613 emit_byte ('\0');
1614 range_param->lower = 1;
1615 range_param->upper = 1;
1618 break;
1620 case '[':
1622 register unsigned int second_value;
1623 register unsigned int last_value;
1624 unsigned char last_emit = 0;
1626 /* Handle characters that can only occur at the start of a class. */
1628 if (*Reg_Parse == '^') { /* Complement of range. */
1629 ret_val = emit_node (ANY_BUT);
1630 Reg_Parse++;
1632 /* All negated classes include newline unless escaped with
1633 a "(?n)" switch. */
1635 if (!Match_Newline) emit_byte ('\n');
1636 } else {
1637 ret_val = emit_node (ANY_OF);
1640 if (*Reg_Parse == ']' || *Reg_Parse == '-') {
1641 /* If '-' or ']' is the first character in a class,
1642 it is a literal character in the class. */
1644 last_emit = *Reg_Parse;
1645 emit_byte (*Reg_Parse);
1646 Reg_Parse++;
1649 /* Handle the rest of the class characters. */
1651 while (*Reg_Parse != '\0' && *Reg_Parse != ']') {
1652 if (*Reg_Parse == '-') { /* Process a range, e.g [a-z]. */
1653 Reg_Parse++;
1655 if (*Reg_Parse == ']' || *Reg_Parse == '\0') {
1656 /* If '-' is the last character in a class it is a literal
1657 character. If `Reg_Parse' points to the end of the
1658 regex string, an error will be generated later. */
1660 emit_byte ('-');
1661 last_emit = '-';
1662 } else {
1663 /* We must get the range starting character value from the
1664 emitted code since it may have been an escaped
1665 character. `second_value' is set one larger than the
1666 just emitted character value. This is done since
1667 `second_value' is used as the start value for the loop
1668 that emits the values in the range. Since we have
1669 already emitted the first character of the class, we do
1670 not want to emit it again. */
1672 second_value = ((unsigned int) last_emit) + 1;
1674 if (*Reg_Parse == '\\') {
1675 /* Handle escaped characters within a class range.
1676 Specifically disallow shortcut escapes as the end of
1677 a class range. To allow this would be ambiguous
1678 since shortcut escapes represent a set of characters,
1679 and it would not be clear which character of the
1680 class should be treated as the "last" character. */
1682 Reg_Parse++;
1684 if ((test = numeric_escape (*Reg_Parse, &Reg_Parse))) {
1685 last_value = (unsigned int) test;
1686 } else if ((test = literal_escape (*Reg_Parse))) {
1687 last_value = (unsigned int) test;
1688 } else if (shortcut_escape (*Reg_Parse,
1689 NULL,
1690 CHECK_CLASS_ESCAPE)) {
1691 sprintf (Error_Text,
1692 "\\%c is not allowed as range operand",
1693 *Reg_Parse);
1695 REG_FAIL (Error_Text);
1696 } else {
1697 sprintf (
1698 Error_Text,
1699 "\\%c is an invalid char class escape sequence",
1700 *Reg_Parse);
1702 REG_FAIL (Error_Text);
1704 } else {
1705 last_value = U_CHAR_AT (Reg_Parse);
1708 if (Is_Case_Insensitive) {
1709 second_value =
1710 (unsigned int) tolower ((int) second_value);
1711 last_value =
1712 (unsigned int) tolower ((int) last_value);
1715 /* For case insensitive, something like [A-_] will
1716 generate an error here since ranges are converted to
1717 lower case. */
1719 if (second_value - 1 > last_value) {
1720 REG_FAIL ("invalid [] range");
1723 /* If only one character in range (e.g [a-a]) then this
1724 loop is not run since the first character of any range
1725 was emitted by the previous iteration of while loop. */
1727 for (; second_value <= last_value; second_value++) {
1728 emit_class_byte (second_value);
1731 last_emit = (unsigned char) last_value;
1733 Reg_Parse++;
1735 } /* End class character range code. */
1736 } else if (*Reg_Parse == '\\') {
1737 Reg_Parse++;
1739 if ((test = numeric_escape (*Reg_Parse, &Reg_Parse)) != '\0') {
1740 emit_class_byte (test);
1742 last_emit = test;
1743 } else if ((test = literal_escape (*Reg_Parse)) != '\0') {
1744 emit_byte (test);
1745 last_emit = test;
1746 } else if (shortcut_escape (*Reg_Parse,
1747 NULL,
1748 CHECK_CLASS_ESCAPE)) {
1750 if (*(Reg_Parse + 1) == '-') {
1751 /* Specifically disallow shortcut escapes as the start
1752 of a character class range (see comment above.) */
1754 sprintf (Error_Text,
1755 "\\%c not allowed as range operand",
1756 *Reg_Parse);
1758 REG_FAIL (Error_Text);
1759 } else {
1760 /* Emit the bytes that are part of the shortcut
1761 escape sequence's range (e.g. \d = 0123456789) */
1763 shortcut_escape (*Reg_Parse, NULL, EMIT_CLASS_BYTES);
1765 } else {
1766 sprintf (Error_Text,
1767 "\\%c is an invalid char class escape sequence",
1768 *Reg_Parse);
1770 REG_FAIL (Error_Text);
1773 Reg_Parse++;
1775 /* End of class escaped sequence code */
1776 } else {
1777 emit_class_byte (*Reg_Parse); /* Ordinary class character. */
1779 last_emit = *Reg_Parse;
1780 Reg_Parse++;
1782 } /* End of while (*Reg_Parse != '\0' && *Reg_Parse != ']') */
1784 if (*Reg_Parse != ']') REG_FAIL ("missing right \']\'");
1786 emit_byte('\0');
1788 /* NOTE: it is impossible to specify an empty class. This is
1789 because [] would be interpreted as "begin character class"
1790 followed by a literal ']' character and no "end character class"
1791 delimiter (']'). Because of this, it is always safe to assume
1792 that a class HAS_WIDTH. */
1794 Reg_Parse++;
1795 *flag_param |= HAS_WIDTH | SIMPLE;
1796 range_param->lower = 1;
1797 range_param->upper = 1;
1800 break; /* End of character class code. */
1802 case '\\':
1803 /* Force Error_Text to have a length of zero. This way we can tell if
1804 either of the calls to shortcut_escape() or back_ref() fill
1805 Error_Text with an error message. */
1807 Error_Text [0] = '\0';
1809 if ((ret_val = shortcut_escape (*Reg_Parse, flag_param, EMIT_NODE))) {
1811 Reg_Parse++;
1812 range_param->lower = 1;
1813 range_param->upper = 1;
1814 break;
1816 } else if ((ret_val = back_ref (Reg_Parse, flag_param, EMIT_NODE))) {
1817 /* Can't make any assumptions about a back-reference as to SIMPLE
1818 or HAS_WIDTH. For example (^|<) is neither simple nor has
1819 width. So we don't flip bits in flag_param here. */
1821 Reg_Parse++;
1822 /* Back-references always have an unknown length */
1823 range_param->lower = -1;
1824 range_param->upper = -1;
1825 break;
1828 if (strlen (Error_Text) > 0) REG_FAIL (Error_Text);
1830 /* At this point it is apparent that the escaped character is not a
1831 shortcut escape or back-reference. Back up one character to allow
1832 the default code to include it as an ordinary character. */
1834 /* Fall through to Default case to handle literal escapes and numeric
1835 escapes. */
1837 default:
1838 Reg_Parse--; /* If we fell through from the above code, we are now
1839 pointing at the back slash (\) character. */
1841 unsigned char *parse_save;
1842 int len = 0;
1844 if (Is_Case_Insensitive) {
1845 ret_val = emit_node (SIMILAR);
1846 } else {
1847 ret_val = emit_node (EXACTLY);
1850 /* Loop until we find a meta character, shortcut escape, back
1851 reference, or end of regex string. */
1853 for (; *Reg_Parse != '\0' &&
1854 !strchr ((char *) Meta_Char, (int) *Reg_Parse);
1855 len++) {
1857 /* Save where we are in case we have to back
1858 this character out. */
1860 parse_save = Reg_Parse;
1862 if (*Reg_Parse == '\\') {
1863 Reg_Parse++; /* Point to escaped character */
1865 Error_Text [0] = '\0'; /* See comment above. */
1867 if ((test = numeric_escape (*Reg_Parse, &Reg_Parse))) {
1868 if (Is_Case_Insensitive) {
1869 emit_byte (tolower (test));
1870 } else {
1871 emit_byte (test);
1873 } else if ((test = literal_escape (*Reg_Parse))) {
1874 emit_byte (test);
1875 } else if (back_ref (Reg_Parse, NULL, CHECK_ESCAPE)) {
1876 /* Leave back reference for next `atom' call */
1878 Reg_Parse--; break;
1879 } else if (shortcut_escape (*Reg_Parse, NULL, CHECK_ESCAPE)) {
1880 /* Leave shortcut escape for next `atom' call */
1882 Reg_Parse--; break;
1883 } else {
1884 if (strlen (Error_Text) == 0) {
1885 /* None of the above calls generated an error message
1886 so generate our own here. */
1888 sprintf (Error_Text,
1889 "\\%c is an invalid escape sequence",
1890 *Reg_Parse);
1893 REG_FAIL (Error_Text);
1896 Reg_Parse++;
1897 } else {
1898 /* Ordinary character */
1900 if (Is_Case_Insensitive) {
1901 emit_byte (tolower (*Reg_Parse));
1902 } else {
1903 emit_byte (*Reg_Parse);
1906 Reg_Parse++;
1909 /* If next regex token is a quantifier (?, +. *, or {m,n}) and
1910 our EXACTLY node so far is more than one character, leave the
1911 last character to be made into an EXACTLY node one character
1912 wide for the multiplier to act on. For example 'abcd* would
1913 have an EXACTLY node with an 'abc' operand followed by a STAR
1914 node followed by another EXACTLY node with a 'd' operand. */
1916 if (IS_QUANTIFIER (*Reg_Parse) && len > 0) {
1917 Reg_Parse = parse_save; /* Point to previous regex token. */
1919 if (Code_Emit_Ptr == &Compute_Size) {
1920 Reg_Size--;
1921 } else {
1922 Code_Emit_Ptr--; /* Write over previously emitted byte. */
1925 break;
1929 if (len <= 0) REG_FAIL ("internal error #4, `atom\'");
1931 *flag_param |= HAS_WIDTH;
1933 if (len == 1) *flag_param |= SIMPLE;
1935 range_param->lower = len;
1936 range_param->upper = len;
1938 emit_byte ('\0');
1940 } /* END switch (*Reg_Parse++) */
1942 return (ret_val);
1945 /*----------------------------------------------------------------------*
1946 * emit_node
1948 * Emit (if appropriate) the op code for a regex node atom.
1950 * The NEXT pointer is initialized to NULL.
1952 * Returns a pointer to the START of the emitted node.
1953 *----------------------------------------------------------------------*/
1955 static unsigned char * emit_node (int op_code) {
1957 register unsigned char *ret_val;
1958 register unsigned char *ptr;
1960 ret_val = Code_Emit_Ptr; /* Return address of start of node */
1962 if (ret_val == &Compute_Size) {
1963 Reg_Size += NODE_SIZE;
1964 } else {
1965 ptr = ret_val;
1966 *ptr++ = (unsigned char) op_code;
1967 *ptr++ = '\0'; /* Null "NEXT" pointer. */
1968 *ptr++ = '\0';
1970 Code_Emit_Ptr = ptr;
1973 return (ret_val);
1976 /*----------------------------------------------------------------------*
1977 * emit_byte
1979 * Emit (if appropriate) a byte of code (usually part of an operand.)
1980 *----------------------------------------------------------------------*/
1982 static void emit_byte (unsigned char c) {
1984 if (Code_Emit_Ptr == &Compute_Size) {
1985 Reg_Size++;
1986 } else {
1987 *Code_Emit_Ptr++ = c;
1991 /*----------------------------------------------------------------------*
1992 * emit_class_byte
1994 * Emit (if appropriate) a byte of code (usually part of a character
1995 * class operand.)
1996 *----------------------------------------------------------------------*/
1998 static void emit_class_byte (unsigned char c) {
2000 if (Code_Emit_Ptr == &Compute_Size) {
2001 Reg_Size++;
2003 if (Is_Case_Insensitive && isalpha (c)) Reg_Size++;
2004 } else if (Is_Case_Insensitive && isalpha (c)) {
2005 /* For case insensitive character classes, emit both upper and lower case
2006 versions of alphabetical characters. */
2008 *Code_Emit_Ptr++ = tolower (c);
2009 *Code_Emit_Ptr++ = toupper (c);
2010 } else {
2011 *Code_Emit_Ptr++ = c;
2015 /*----------------------------------------------------------------------*
2016 * emit_special
2018 * Emit nodes that need special processing.
2019 *----------------------------------------------------------------------*/
2021 static unsigned char * emit_special (
2022 unsigned char op_code,
2023 unsigned long test_val,
2024 int index) {
2026 register unsigned char *ret_val = &Compute_Size;
2027 register unsigned char *ptr;
2029 if (Code_Emit_Ptr == &Compute_Size) {
2030 switch (op_code) {
2031 case POS_BEHIND_OPEN:
2032 case NEG_BEHIND_OPEN:
2033 Reg_Size += LENGTH_SIZE; /* Length of the look-behind match */
2034 Reg_Size += NODE_SIZE; /* Make room for the node */
2035 break;
2037 case TEST_COUNT:
2038 Reg_Size += NEXT_PTR_SIZE; /* Make room for a test value. */
2040 case INC_COUNT:
2041 Reg_Size += INDEX_SIZE; /* Make room for an index value. */
2043 default:
2044 Reg_Size += NODE_SIZE; /* Make room for the node. */
2046 } else {
2047 ret_val = emit_node (op_code); /* Return the address for start of node. */
2048 ptr = Code_Emit_Ptr;
2050 if (op_code == INC_COUNT || op_code == TEST_COUNT) {
2051 *ptr++ = (unsigned char) index;
2053 if (op_code == TEST_COUNT) {
2054 *ptr++ = PUT_OFFSET_L (test_val);
2055 *ptr++ = PUT_OFFSET_R (test_val);
2057 } else if (op_code == POS_BEHIND_OPEN || op_code == NEG_BEHIND_OPEN) {
2058 *ptr++ = PUT_OFFSET_L (test_val);
2059 *ptr++ = PUT_OFFSET_R (test_val);
2060 *ptr++ = PUT_OFFSET_L (test_val);
2061 *ptr++ = PUT_OFFSET_R (test_val);
2064 Code_Emit_Ptr = ptr;
2067 return (ret_val);
2070 /*----------------------------------------------------------------------*
2071 * insert
2073 * Insert a node in front of already emitted node(s). Means relocating
2074 * the operand. Code_Emit_Ptr points one byte past the just emitted
2075 * node and operand. The parameter `insert_pos' points to the location
2076 * where the new node is to be inserted.
2077 *----------------------------------------------------------------------*/
2079 static unsigned char * insert (
2080 unsigned char op,
2081 unsigned char *insert_pos,
2082 long min,
2083 long max,
2084 int index) {
2086 register unsigned char *src;
2087 register unsigned char *dst;
2088 unsigned char *place;
2089 int insert_size = NODE_SIZE;
2091 if (op == BRACE || op == LAZY_BRACE) {
2092 /* Make room for the min and max values. */
2094 insert_size += (2 * NEXT_PTR_SIZE);
2095 } else if (op == INIT_COUNT) {
2096 /* Make room for an index value . */
2098 insert_size += INDEX_SIZE;
2101 if (Code_Emit_Ptr == &Compute_Size) {
2102 Reg_Size += insert_size;
2103 return &Compute_Size;
2106 src = Code_Emit_Ptr;
2107 Code_Emit_Ptr += insert_size;
2108 dst = Code_Emit_Ptr;
2110 /* Relocate the existing emitted code to make room for the new node. */
2112 while (src > insert_pos) *--dst = *--src;
2114 place = insert_pos; /* Where operand used to be. */
2115 *place++ = op; /* Inserted operand. */
2116 *place++ = '\0'; /* NEXT pointer for inserted operand. */
2117 *place++ = '\0';
2119 if (op == BRACE || op == LAZY_BRACE) {
2120 *place++ = PUT_OFFSET_L (min);
2121 *place++ = PUT_OFFSET_R (min);
2123 *place++ = PUT_OFFSET_L (max);
2124 *place++ = PUT_OFFSET_R (max);
2125 } else if (op == INIT_COUNT) {
2126 *place++ = (unsigned char) index;
2129 return place; /* Return a pointer to the start of the code moved. */
2132 /*----------------------------------------------------------------------*
2133 * tail - Set the next-pointer at the end of a node chain.
2134 *----------------------------------------------------------------------*/
2136 static void tail (unsigned char *search_from, unsigned char *point_to) {
2138 register unsigned char *scan;
2139 register unsigned char *next;
2140 register int offset;
2142 if (search_from == &Compute_Size) return;
2144 /* Find the last node in the chain (node with a null NEXT pointer) */
2146 scan = search_from;
2148 for (;;) {
2149 next = next_ptr (scan);
2151 if (!next) break;
2153 scan = next;
2156 if (GET_OP_CODE (scan) == BACK) {
2157 offset = scan - point_to;
2158 } else {
2159 offset = point_to - scan;
2162 /* Set NEXT pointer */
2164 *(scan + 1) = PUT_OFFSET_L (offset);
2165 *(scan + 2) = PUT_OFFSET_R (offset);
2168 /*--------------------------------------------------------------------*
2169 * offset_tail
2171 * Perform a tail operation on (ptr + offset).
2172 *--------------------------------------------------------------------*/
2174 static void offset_tail (unsigned char *ptr, int offset, unsigned char *val) {
2176 if (ptr == &Compute_Size || ptr == NULL) return;
2178 tail (ptr + offset, val);
2181 /*--------------------------------------------------------------------*
2182 * branch_tail
2184 * Perform a tail operation on (ptr + offset) but only if `ptr' is a
2185 * BRANCH node.
2186 *--------------------------------------------------------------------*/
2188 static void branch_tail (unsigned char *ptr, int offset, unsigned char *val) {
2190 if (ptr == &Compute_Size || ptr == NULL ||GET_OP_CODE (ptr) != BRANCH) {
2191 return;
2194 tail (ptr + offset, val);
2197 /*--------------------------------------------------------------------*
2198 * shortcut_escape
2200 * Implements convenient escape sequences that represent entire
2201 * character classes or special location assertions (similar to escapes
2202 * supported by Perl)
2204 * \d Digits [0-9] |
2205 * \D NOT a digit [^0-9] | (Examples
2206 * \l Letters [a-zA-Z] | at left
2207 * \L NOT a Letter [^a-zA-Z] | are
2208 * \s Whitespace [ \t\n\r\f\v] | for
2209 * \S NOT Whitespace [^ \t\n\r\f\v] | C
2210 * \w "Word" character [a-zA-Z0-9_] | Locale)
2211 * \W NOT a "Word" character [^a-zA-Z0-9_] _|
2213 * \B Matches any character that is NOT a word-delimiter
2215 * Codes for the "emit" parameter:
2217 * EMIT_NODE
2218 * Emit a shortcut node. Shortcut nodes have an implied set of
2219 * class characters. This helps keep the compiled regex string
2220 * small.
2222 * EMIT_CLASS_BYTES
2223 * Emit just the equivalent characters of the class. This makes
2224 * the escape usable from within a class, e.g. [a-fA-F\d]. Only
2225 * \d, \D, \s, \S, \w, and \W can be used within a class.
2227 * CHECK_ESCAPE
2228 * Only verify that this is a valid shortcut escape.
2230 * CHECK_CLASS_ESCAPE
2231 * Same as CHECK_ESCAPE but only allows characters valid within
2232 * a class.
2234 *--------------------------------------------------------------------*/
2236 static unsigned char * shortcut_escape (
2237 unsigned char c,
2238 int *flag_param,
2239 int emit) {
2241 register unsigned char *class = NULL;
2242 static unsigned char *codes = (unsigned char *) "ByYdDlLsSwW";
2243 unsigned char *ret_val = (unsigned char *) 1; /* Assume success. */
2244 unsigned char *valid_codes;
2246 if (emit == EMIT_CLASS_BYTES || emit == CHECK_CLASS_ESCAPE) {
2247 valid_codes = codes + 3; /* \B, \y and \Y are not allowed in classes */
2248 } else {
2249 valid_codes = codes;
2252 if (!strchr ((char *) valid_codes, (int) c)) {
2253 return NULL; /* Not a valid shortcut escape sequence */
2254 } else if (emit == CHECK_ESCAPE || emit == CHECK_CLASS_ESCAPE) {
2255 return ret_val; /* Just checking if this is a valid shortcut escape. */
2258 switch (c) {
2259 case 'd':
2260 case 'D':
2261 if (emit == EMIT_CLASS_BYTES) {
2262 class = ASCII_Digits;
2263 } else if (emit == EMIT_NODE) {
2264 ret_val = (islower (c) ? emit_node (DIGIT)
2265 : emit_node (NOT_DIGIT));
2268 break;
2270 case 'l':
2271 case 'L':
2272 if (emit == EMIT_CLASS_BYTES) {
2273 class = Letter_Char;
2274 } else if (emit == EMIT_NODE) {
2275 ret_val = (islower (c) ? emit_node (LETTER)
2276 : emit_node (NOT_LETTER));
2279 break;
2281 case 's':
2282 case 'S':
2283 if (emit == EMIT_CLASS_BYTES) {
2284 if (Match_Newline) emit_byte ('\n');
2286 class = White_Space;
2287 } else if (emit == EMIT_NODE) {
2288 if (Match_Newline) {
2289 ret_val = (islower (c) ? emit_node (SPACE_NL)
2290 : emit_node (NOT_SPACE_NL));
2291 } else {
2292 ret_val = (islower (c) ? emit_node (SPACE)
2293 : emit_node (NOT_SPACE));
2297 break;
2299 case 'w':
2300 case 'W':
2301 if (emit == EMIT_CLASS_BYTES) {
2302 class = Word_Char;
2303 } else if (emit == EMIT_NODE) {
2304 ret_val = (islower (c) ? emit_node (WORD_CHAR)
2305 : emit_node (NOT_WORD_CHAR));
2308 break;
2310 /* Since the delimiter table is not available at regex compile time \B,
2311 \Y and \Y can only generate a node. At run time, the delimiter table
2312 will be available for these nodes to use. */
2314 case 'y':
2316 if (emit == EMIT_NODE) {
2317 ret_val = emit_node (IS_DELIM);
2318 } else {
2319 REG_FAIL ("internal error #5 `shortcut_escape\'");
2322 break;
2324 case 'Y':
2326 if (emit == EMIT_NODE) {
2327 ret_val = emit_node (NOT_DELIM);
2328 } else {
2329 REG_FAIL ("internal error #6 `shortcut_escape\'");
2332 break;
2334 case 'B':
2336 if (emit == EMIT_NODE) {
2337 ret_val = emit_node (NOT_BOUNDARY);
2338 } else {
2339 REG_FAIL ("internal error #7 `shortcut_escape\'");
2342 break;
2344 default:
2345 /* We get here if there isn't a case for every character in
2346 the string "codes" */
2348 REG_FAIL ("internal error #8 `shortcut_escape\'");
2351 if (emit == EMIT_NODE && c != 'B') {
2352 *flag_param |= (HAS_WIDTH | SIMPLE);
2355 if (class) {
2356 /* Emit bytes within a character class operand. */
2358 while (*class != '\0') {
2359 emit_byte (*class++);
2363 return ret_val;
2366 /*--------------------------------------------------------------------*
2367 * numeric_escape
2369 * Implements hex and octal numeric escape sequence syntax.
2371 * Hexadecimal Escape: \x## Max of two digits Must have leading 'x'.
2372 * Octal Escape: \0### Max of three digits and not greater
2373 * than 377 octal. Must have leading zero.
2375 * Returns the actual character value or NULL if not a valid hex or
2376 * octal escape. REG_FAIL is called if \x0, \x00, \0, \00, \000, or
2377 * \0000 is specified.
2378 *--------------------------------------------------------------------*/
2380 static unsigned char numeric_escape (
2381 unsigned char c,
2382 unsigned char **parse) {
2384 static unsigned char digits [] = "fedcbaFEDCBA9876543210";
2386 static unsigned int digit_val [] = {
2387 15, 14, 13, 12, 11, 10, /* Lower case Hex digits */
2388 15, 14, 13, 12, 11, 10, /* Upper case Hex digits */
2389 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* Decimal Digits */
2391 register unsigned char *scan;
2392 register unsigned char *pos_ptr;
2393 register unsigned char *digit_str;
2394 unsigned int value = 0;
2395 unsigned int radix = 8;
2396 int width = 3; /* Can not be bigger than \0377 */
2397 int pos_delta = 14;
2398 int i, pos;
2400 switch (c) {
2401 case '0':
2402 digit_str = digits + pos_delta; /* Only use Octal digits, i.e. 0-7. */
2404 break;
2406 case 'x':
2407 case 'X':
2408 width = 2; /* Can not be bigger than \0377 */
2409 radix = 16;
2410 pos_delta = 0;
2411 digit_str = digits; /* Use all of the digit characters. */
2413 break;
2415 default:
2416 return ('\0'); /* Not a numeric escape */
2419 scan = *parse; scan++; /* Only change *parse on success. */
2421 pos_ptr = (unsigned char *) strchr ((char *) digit_str, (int) *scan);
2423 for (i = 0; pos_ptr != NULL && (i < width); i++) {
2424 pos = (pos_ptr - digit_str) + pos_delta;
2425 value = (value * radix) + digit_val [pos];
2427 /* If this digit makes the value over 255, treat this digit as a literal
2428 character instead of part of the numeric escape. For example, \0777
2429 will be processed as \077 (an 'M') and a literal '7' character, NOT
2430 511 decimal which is > 255. */
2432 if (value > 255) {
2433 /* Back out calculations for last digit processed. */
2435 value -= digit_val [pos];
2436 value /= radix;
2438 break; /* Note that scan will not be incremented and still points to
2439 the digit that caused overflow. It will be decremented by
2440 the "else" below to point to the last character that is
2441 considered to be part of the octal escape. */
2444 scan++;
2445 pos_ptr = (unsigned char *) strchr ((char *) digit_str, (int) *scan);
2448 /* Handle the case of "\0" i.e. trying to specify a NULL character. */
2450 if (value == 0) {
2451 if (c == '0') {
2452 sprintf (Error_Text, "\\00 is an invalid octal escape");
2453 } else {
2454 sprintf (Error_Text, "\\%c0 is an invalid hexadecimal escape", c);
2456 } else {
2457 /* Point to the last character of the number on success. */
2459 scan--;
2460 *parse = scan;
2463 return (unsigned char) value;
2466 /*--------------------------------------------------------------------*
2467 * literal_escape
2469 * Recognize escaped literal characters (prefixed with backslash),
2470 * and translate them into the corresponding character.
2472 * Returns the proper character value or NULL if not a valid literal
2473 * escape.
2474 *--------------------------------------------------------------------*/
2476 static unsigned char literal_escape (unsigned char c) {
2478 static unsigned char valid_escape [] = {
2479 'a', 'b',
2480 'e',
2481 'f', 'n', 'r', 't', 'v', '(', ')', '-', '[', ']',
2482 '<', '>', '{', '}', '.', '\\', '|', '^', '$', '*',
2483 '+', '?', '&', '\0'
2486 static unsigned char value [] = {
2487 '\a', '\b',
2488 #ifdef EBCDIC_CHARSET
2489 0x27, /* Escape character in IBM's EBCDIC character set. */
2490 #else
2491 0x1B, /* Escape character in ASCII character set. */
2492 #endif
2493 '\f', '\n', '\r', '\t', '\v', '(', ')', '-', '[', ']',
2494 '<', '>', '{', '}', '.', '\\', '|', '^', '$', '*',
2495 '+', '?', '&', '\0'
2498 int i;
2500 for (i = 0; valid_escape [i] != '\0'; i++) {
2501 if (c == valid_escape [i]) return value [i];
2504 return '\0';
2507 /*--------------------------------------------------------------------*
2508 * back_ref
2510 * Process a request to match a previous parenthesized thing.
2511 * Parenthetical entities are numbered beginning at 1 by counting
2512 * opening parentheses from left to to right. \0 would represent
2513 * whole match, but would confuse numeric_escape as an octal escape,
2514 * so it is forbidden.
2516 * Constructs of the form \~1, \~2, etc. are cross-regex back
2517 * references and are used in syntax highlighting patterns to match
2518 * text previously matched by another regex. *** IMPLEMENT LATER ***
2519 *--------------------------------------------------------------------*/
2521 static unsigned char * back_ref (
2522 unsigned char *c,
2523 int *flag_param,
2524 int emit) {
2526 int paren_no, c_offset = 0, is_cross_regex = 0;
2528 unsigned char *ret_val;
2530 /* Implement cross regex backreferences later. */
2532 /* if (*c == (unsigned char) ('~')) {
2533 c_offset++;
2534 is_cross_regex++;
2535 } */
2537 paren_no = (int) (*(c + c_offset) - (unsigned char) ('0'));
2539 if (!isdigit (*(c + c_offset)) || /* Only \1, \2, ... \9 are supported. */
2540 paren_no == 0) { /* Should be caught by numeric_escape. */
2542 return NULL;
2545 /* Make sure parentheses for requested back-reference are complete. */
2547 if (!is_cross_regex && !TEST_BIT (Closed_Parens, paren_no)) {
2548 sprintf (Error_Text, "\\%d is an illegal back reference", paren_no);
2549 return NULL;
2552 if (emit == EMIT_NODE) {
2553 if (is_cross_regex) {
2554 Reg_Parse++; /* Skip past the '~' in a cross regex back reference.
2555 We only do this if we are emitting code. */
2557 if (Is_Case_Insensitive) {
2558 ret_val = emit_node (X_REGEX_BR_CI);
2559 } else {
2560 ret_val = emit_node (X_REGEX_BR);
2562 } else {
2563 if (Is_Case_Insensitive) {
2564 ret_val = emit_node (BACK_REF_CI);
2565 } else {
2566 ret_val = emit_node (BACK_REF);
2570 emit_byte ((unsigned char) paren_no);
2572 if (is_cross_regex || TEST_BIT (Paren_Has_Width, paren_no)) {
2573 *flag_param |= HAS_WIDTH;
2575 } else if (emit == CHECK_ESCAPE) {
2576 ret_val = (unsigned char *) 1;
2577 } else {
2578 ret_val = NULL;
2581 return ret_val;
2584 /*======================================================================*
2585 * Regex execution related code
2586 *======================================================================*/
2588 /* Global work variables for `ExecRE'. */
2590 static unsigned char *Reg_Input; /* String-input pointer. */
2591 static unsigned char *Start_Of_String; /* Beginning of input, for ^ */
2592 /* and < checks. */
2593 static unsigned char *Look_Behind_To; /* Position till were look behind
2594 can safely check back */
2595 static unsigned char **Start_Ptr_Ptr; /* Pointer to `startp' array. */
2596 static unsigned char **End_Ptr_Ptr; /* Ditto for `endp'. */
2597 static unsigned char *Extent_Ptr_FW; /* Forward extent pointer */
2598 static unsigned char *Extent_Ptr_BW; /* Backward extent pointer */
2599 static unsigned char *Back_Ref_Start [10]; /* Back_Ref_Start [0] and */
2600 static unsigned char *Back_Ref_End [10]; /* Back_Ref_End [0] are not */
2601 /* used. This simplifies */
2602 /* indexing. */
2603 /* static regexp *Cross_Regex_Backref; */
2605 static int Prev_Is_BOL;
2606 static int Succ_Is_EOL;
2607 /* static int Prev_Is_Delim;
2608 static int Succ_Is_Delim; */
2609 static int Prev_Is_WordChar;
2610 static int Succ_Is_WordChar;
2612 /* Define a pointer to an array to hold general (...){m,n} counts. */
2614 typedef struct brace_counts {
2615 unsigned long count [1]; /* More unwarranted chumminess with compiler. */
2616 } brace_counts;
2618 static struct brace_counts *Brace;
2620 /* Default table for determining whether a character is a word delimiter. */
2622 static unsigned char Default_Delimiters [UCHAR_MAX] = {0};
2624 static unsigned char *Current_Delimiters; /* Current delimiter table */
2626 /* Forward declarations of functions used by `ExecRE' */
2628 static int attempt (regexp *, unsigned char *);
2629 static int match (unsigned char *, int *);
2630 static unsigned long greedy (unsigned char *, long);
2631 static void adjustcase (unsigned char *, int, unsigned char);
2632 static unsigned char * makeDelimiterTable (unsigned char *, unsigned char *);
2635 * ExecRE - match a `regexp' structure against a string
2637 * If `end' is non-NULL, matches may not BEGIN past end, but may extend past
2638 * it. If reverse is true, `end' must be specified, and searching begins at
2639 * `end'. "isbol" should be set to true if the beginning of the string is the
2640 * actual beginning of a line (since `ExecRE' can't look backwards from the
2641 * beginning to find whether there was a newline before). Likewise, "isbow"
2642 * asks whether the string is preceded by a word delimiter. End of string is
2643 * always treated as a word and line boundary (there may be cases where it
2644 * shouldn't be, in which case, this should be changed). "delimit" (if
2645 * non-null) specifies a null-terminated string of characters to be considered
2646 * word delimiters matching "<" and ">". if "delimit" is NULL, the default
2647 * delimiters (as set in SetREDefaultWordDelimiters) are used.
2648 * Finally, look_behind_to indicates the position till where it is safe to
2649 * perform look-behind matches. If set, it should be smaller than or equal
2650 * to the start position of the search (pointed at by string). If it is NULL,
2651 * it defaults to the start position. */
2653 int ExecRE (
2654 regexp *prog,
2655 regexp *cross_regex_backref,
2656 const char *string,
2657 const char *end,
2658 int reverse,
2659 char prev_char,
2660 char succ_char,
2661 const char *delimiters,
2662 const char *look_behind_to) {
2664 register unsigned char *str;
2665 unsigned char **s_ptr;
2666 unsigned char **e_ptr;
2667 int ret_val = 0;
2668 unsigned char tempDelimitTable [256];
2669 int i;
2671 s_ptr = (unsigned char **) prog->startp;
2672 e_ptr = (unsigned char **) prog->endp;
2674 /* Check for valid parameters. */
2676 if (prog == NULL || string == NULL) {
2677 reg_error ("NULL parameter to `ExecRE\'");
2678 goto SINGLE_RETURN;
2681 /* Check validity of program. */
2683 if (U_CHAR_AT (prog->program) != MAGIC) {
2684 reg_error ("corrupted program");
2685 goto SINGLE_RETURN;
2688 /* If caller has supplied delimiters, make a delimiter table */
2690 if (delimiters == NULL) {
2691 Current_Delimiters = Default_Delimiters;
2692 } else {
2693 Current_Delimiters = makeDelimiterTable (
2694 (unsigned char *) delimiters,
2695 (unsigned char *) tempDelimitTable);
2698 if (end == NULL && reverse) {
2699 for (end = string; *end != '\0'; end++) ;
2700 succ_char = '\n';
2701 } else if (end == NULL) {
2702 succ_char = '\n';
2705 /* Initialize arrays used by shortcut_escape. */
2707 if (!init_ansi_classes ()) goto SINGLE_RETURN;
2709 /* Remember the beginning of the string for matching BOL */
2711 Start_Of_String = (unsigned char *) string;
2712 Look_Behind_To = (unsigned char *) (look_behind_to?look_behind_to:string);
2714 Prev_Is_BOL = ((prev_char == '\n') || (prev_char == '\0') ? 1 : 0);
2715 Succ_Is_EOL = ((succ_char == '\n') || (succ_char == '\0') ? 1 : 0);
2716 /* Prev_Is_Delim = (Current_Delimiters [(unsigned char)prev_char] ? 1 : 0);
2717 Succ_Is_Delim = (Current_Delimiters [(unsigned char)succ_char] ? 1 : 0);*/
2718 Prev_Is_WordChar = ((isalnum ((int)prev_char) || prev_char == '_') ? 1 : 0);
2719 Succ_Is_WordChar = ((isalnum ((int)succ_char) || succ_char == '_') ? 1 : 0);
2721 Total_Paren = (int) (prog->program [1]);
2722 Num_Braces = (int) (prog->program [2]);
2724 /* Cross_Regex_Backref = cross_regex_backref; */
2726 /* Allocate memory for {m,n} construct counting variables if need be. */
2728 if (Num_Braces > 0) {
2729 Brace =
2730 (brace_counts *) malloc (sizeof (brace_counts) + (size_t) Num_Braces);
2732 if (Brace == NULL) {
2733 reg_error ("out of memory in `ExecRE\'");
2734 goto SINGLE_RETURN;
2736 } else {
2737 Brace = NULL;
2740 /* Initialize the first nine (9) capturing parentheses start and end
2741 pointers to point to the start of the search string. This is to prevent
2742 crashes when later trying to reference captured parens that do not exist
2743 in the compiled regex. We only need to do the first nine since users
2744 can only specify \1, \2, ... \9. */
2746 for (i = 9; i > 0; i--) {
2747 *s_ptr++ = (unsigned char *) string;
2748 *e_ptr++ = (unsigned char *) string;
2751 if (!reverse) { /* Forward Search */
2752 if (prog->anchor) {
2753 /* Search is anchored at BOL */
2755 if (attempt (prog, (unsigned char *) string)) {
2756 ret_val = 1;
2757 goto SINGLE_RETURN;
2760 for (str = (unsigned char *) string;
2761 *str != '\0' && str != (unsigned char *) end;
2762 str++) {
2764 if (*str == '\n') {
2765 if (attempt (prog, str + 1)) {
2766 ret_val = 1;
2767 break;
2772 goto SINGLE_RETURN;
2774 } else if (prog->match_start != '\0') {
2775 /* We know what char match must start with. */
2777 for (str = (unsigned char *) string;
2778 *str != '\0' && str != (unsigned char *) end;
2779 str++) {
2781 if (*str == (unsigned char)prog->match_start) {
2782 if (attempt (prog, str)) {
2783 ret_val = 1;
2784 break;
2789 goto SINGLE_RETURN;
2790 } else {
2791 /* General case */
2793 for (str = (unsigned char *) string;
2794 *str != '\0' && str != (unsigned char *) end;
2795 str++) {
2797 if (attempt (prog, str)) {
2798 ret_val = 1;
2799 break;
2803 /* Beware of a single $ matching \0 */
2804 if (!ret_val && *str == '\0' && str != (unsigned char *) end) {
2805 if (attempt (prog, str)) {
2806 ret_val = 1;
2810 goto SINGLE_RETURN;
2812 } else { /* Search reverse, same as forward, but loops run backward */
2814 if (prog->anchor) {
2815 /* Search is anchored at BOL */
2817 for (str = (unsigned char *)(end - 1);
2818 str >= (unsigned char *) string;
2819 str--) {
2821 if (*str == '\n') {
2822 if (attempt (prog, str + 1)) {
2823 ret_val = 1;
2824 goto SINGLE_RETURN;
2829 if (attempt (prog, (unsigned char *) string)) {
2830 ret_val = 1;
2831 goto SINGLE_RETURN;
2834 goto SINGLE_RETURN;
2835 } else if (prog->match_start != '\0') {
2836 /* We know what char match must start with. */
2838 for (str = (unsigned char *) end;
2839 str >= (unsigned char *) string;
2840 str--) {
2842 if (*str == (unsigned char)prog->match_start) {
2843 if (attempt (prog, str)) {
2844 ret_val = 1;
2845 break;
2850 goto SINGLE_RETURN;
2851 } else {
2852 /* General case */
2854 for (str = (unsigned char *) end;
2855 str >= (unsigned char *) string;
2856 str--) {
2858 if (attempt (prog, str)) {
2859 ret_val = 1;
2860 break;
2866 SINGLE_RETURN: if (Brace) free (Brace);
2868 return (ret_val);
2871 /*--------------------------------------------------------------------*
2872 * init_ansi_classes
2874 * Generate character class sets using locale aware ANSI C functions.
2876 *--------------------------------------------------------------------*/
2878 static int init_ansi_classes (void) {
2880 static int initialized = 0;
2881 static int underscore = (int) '_';
2882 int i, word_count, letter_count, space_count;
2884 if (!initialized) {
2885 initialized = 1; /* Only need to generate character sets once. */
2886 word_count = 0;
2887 letter_count = 0;
2888 space_count = 0;
2890 for (i = 1; i < (int)UCHAR_MAX; i++) {
2891 if (isalnum (i) || i == underscore) {
2892 Word_Char [word_count++] = (unsigned char) i;
2895 if (isalpha (i)) {
2896 Letter_Char [letter_count++] = (unsigned char) i;
2899 /* Note: Whether or not newline is considered to be whitespace is
2900 handled by switches within the original regex and is thus omitted
2901 here. */
2903 if (isspace (i) && (i != (int) '\n')) {
2904 White_Space [space_count++] = (unsigned char) i;
2907 /* Make sure arrays are big enough. ("- 2" because of zero array
2908 origin and we need to leave room for the NULL terminator.) */
2910 if (word_count > (ALNUM_CHAR_SIZE - 2) ||
2911 space_count > (WHITE_SPACE_SIZE - 2) ||
2912 letter_count > (ALNUM_CHAR_SIZE - 2)) {
2914 reg_error ("internal error #9 `init_ansi_classes\'");
2915 return (0);
2919 Word_Char [word_count] = '\0';
2920 Letter_Char [word_count] = '\0';
2921 White_Space [space_count] = '\0';
2924 return (1);
2927 /*----------------------------------------------------------------------*
2928 * attempt - try match at specific point, returns: 0 failure, 1 success
2929 *----------------------------------------------------------------------*/
2931 static int attempt (regexp *prog, unsigned char *string) {
2933 register int i;
2934 register unsigned char **s_ptr;
2935 register unsigned char **e_ptr;
2936 int branch_index = 0; /* Must be set to zero ! */
2938 Reg_Input = string;
2939 Start_Ptr_Ptr = (unsigned char **) prog->startp;
2940 End_Ptr_Ptr = (unsigned char **) prog->endp;
2941 s_ptr = (unsigned char **) prog->startp;
2942 e_ptr = (unsigned char **) prog->endp;
2944 /* Overhead due to capturing parentheses. */
2946 Extent_Ptr_BW = string;
2947 Extent_Ptr_FW = NULL;
2949 for (i = Total_Paren + 1; i > 0; i--) {
2950 *s_ptr++ = NULL;
2951 *e_ptr++ = NULL;
2954 if (match ((unsigned char *) (prog->program + REGEX_START_OFFSET),
2955 &branch_index)) {
2956 prog->startp [0] = (char *) string;
2957 prog->endp [0] = (char *) Reg_Input; /* <-- One char AFTER */
2958 prog->extentpBW = (char *) Extent_Ptr_BW; /* matched string! */
2959 prog->extentpFW = (char *) Extent_Ptr_FW;
2960 prog->top_branch = branch_index;
2962 return (1);
2963 } else {
2964 return (0);
2968 /*----------------------------------------------------------------------*
2969 * match - main matching routine
2971 * Conceptually the strategy is simple: check to see whether the
2972 * current node matches, call self recursively to see whether the rest
2973 * matches, and then act accordingly. In practice we make some effort
2974 * to avoid recursion, in particular by going through "ordinary" nodes
2975 * (that don't need to know whether the rest of the match failed) by a
2976 * loop instead of by recursion. Returns 0 failure, 1 success.
2977 *----------------------------------------------------------------------*/
2979 static int match (unsigned char *prog, int *branch_index_param) {
2981 register unsigned char *scan; /* Current node. */
2982 unsigned char *next; /* Next node. */
2983 register int next_ptr_offset; /* Used by the NEXT_PTR () macro */
2986 scan = prog;
2988 while (scan != NULL) {
2989 NEXT_PTR (scan, next);
2991 switch (GET_OP_CODE (scan)) {
2992 case BRANCH:
2994 register unsigned char *save;
2995 register int branch_index_local = 0;
2997 if (GET_OP_CODE (next) != BRANCH) { /* No choice. */
2998 next = OPERAND (scan); /* Avoid recursion. */
2999 } else {
3000 do {
3001 save = Reg_Input;
3003 if (match (OPERAND (scan), NULL))
3005 if (branch_index_param)
3006 *branch_index_param = branch_index_local;
3007 return (1);
3010 ++branch_index_local;
3012 Reg_Input = save; /* Backtrack. */
3013 NEXT_PTR (scan, scan);
3014 } while (scan != NULL && GET_OP_CODE (scan) == BRANCH);
3016 return (0); /* NOT REACHED */
3020 break;
3022 case EXACTLY:
3024 register int len;
3025 register unsigned char *opnd;
3027 opnd = OPERAND (scan);
3029 /* Inline the first character, for speed. */
3031 if (*opnd != *Reg_Input) return (0);
3033 len = strlen ((char *) opnd);
3035 if (len > 1 &&
3036 strncmp ((char *) opnd, (char *) Reg_Input, len) != 0) {
3038 return (0);
3041 Reg_Input += len;
3044 break;
3046 case SIMILAR:
3048 register unsigned char *opnd;
3049 register unsigned char test;
3051 opnd = OPERAND (scan);
3053 /* Note: the SIMILAR operand was converted to lower case during
3054 regex compile. */
3056 while ((test = *opnd++) != '\0') {
3057 if (tolower (*Reg_Input++) != test) return (0);
3061 break;
3063 case BOL: /* `^' (beginning of line anchor) */
3064 if (Reg_Input == Start_Of_String) {
3065 if (Prev_Is_BOL) break;
3066 } else if (*(Reg_Input - 1) == '\n') {
3067 break;
3070 return (0);
3072 case EOL: /* `$' anchor matches end of line and end of string */
3073 if (*Reg_Input == '\n' || (*Reg_Input == '\0' && Succ_Is_EOL)) {
3074 break;
3077 return (0);
3079 case BOWORD: /* `<' (beginning of word anchor) */
3080 /* Check to see if the current character is a word character
3081 and the preceding character is not. */
3083 int prev_is_wchar;
3084 if (Reg_Input == Start_Of_String) {
3085 prev_is_wchar = Prev_Is_WordChar;
3086 } else {
3087 prev_is_wchar = (isalnum (*(Reg_Input-1))
3088 || *(Reg_Input-1) == '_');
3090 if (!prev_is_wchar) {
3091 int current_is_wchar;
3092 if (*Reg_Input == '\0') {
3093 current_is_wchar = Succ_Is_WordChar;
3094 } else {
3095 current_is_wchar = (isalnum (*Reg_Input)
3096 || *Reg_Input == '_');
3098 if (current_is_wchar) break;
3102 return (0);
3104 case EOWORD: /* `>' (end of word anchor) */
3105 /* Check to see if the current character is not a word character
3106 and the preceding character is. */
3108 int prev_is_wchar;
3109 if (Reg_Input == Start_Of_String) {
3110 prev_is_wchar = Prev_Is_WordChar;
3111 } else {
3112 prev_is_wchar = (isalnum (*(Reg_Input-1))
3113 || *(Reg_Input-1) == '_');
3115 if (prev_is_wchar) {
3116 int current_is_wchar;
3117 if (*Reg_Input == '\0') {
3118 current_is_wchar = Succ_Is_WordChar;
3119 } else {
3120 current_is_wchar = (isalnum (*Reg_Input)
3121 || *Reg_Input == '_');
3123 if (!current_is_wchar) break;
3127 return (0);
3129 case NOT_BOUNDARY: /* \B (NOT a word boundary) */
3131 int prev_is_wchar;
3132 int current_is_wchar;
3133 if (Reg_Input == Start_Of_String) {
3134 prev_is_wchar = Prev_Is_WordChar;
3135 } else {
3136 prev_is_wchar = (isalnum (*(Reg_Input-1))
3137 || *(Reg_Input-1) == '_');
3139 if (*Reg_Input == '\0') {
3140 current_is_wchar = Succ_Is_WordChar;
3141 } else {
3142 current_is_wchar = (isalnum (*Reg_Input)
3143 || *Reg_Input == '_');
3145 if (!(prev_is_wchar ^ current_is_wchar)) break;
3148 return (0);
3150 case IS_DELIM: /* \y (A word delimiter character.) */
3151 if (Current_Delimiters [ *Reg_Input ]) {
3152 Reg_Input++; break;
3155 return (0);
3157 case NOT_DELIM: /* \Y (NOT a word delimiter character.) */
3158 if (!Current_Delimiters [ *Reg_Input ]) {
3159 Reg_Input++; break;
3162 return (0);
3164 case WORD_CHAR: /* \w (word character; alpha-numeric or underscore) */
3165 if (isalnum ((int) *Reg_Input) || *Reg_Input == '_') {
3166 Reg_Input++; break;
3169 return (0);
3171 case NOT_WORD_CHAR:/* \W (NOT a word character) */
3172 if (isalnum ((int) *Reg_Input) ||
3173 *Reg_Input == '_' ||
3174 *Reg_Input == '\n' ||
3175 *Reg_Input == '\0') return (0);
3177 Reg_Input++; break;
3179 case ANY: /* `.' (matches any character EXCEPT newline) */
3180 if (*Reg_Input == '\0' || *Reg_Input == '\n') return (0);
3182 Reg_Input++; break;
3184 case EVERY: /* `.' (matches any character INCLUDING newline) */
3185 if (*Reg_Input == '\0') return (0);
3187 Reg_Input++; break;
3189 case DIGIT: /* \d, same as [0123456789] */
3190 if (!isdigit ((int) *Reg_Input)) return (0);
3192 Reg_Input++; break;
3194 case NOT_DIGIT: /* \D, same as [^0123456789] */
3195 if (isdigit ((int) *Reg_Input) ||
3196 *Reg_Input == '\n' ||
3197 *Reg_Input == '\0') return (0);
3199 Reg_Input++; break;
3201 case LETTER: /* \l, same as [a-zA-Z] */
3202 if (!isalpha ((int) *Reg_Input)) return (0);
3204 Reg_Input++; break;
3206 case NOT_LETTER: /* \L, same as [^0123456789] */
3207 if (isalpha ((int) *Reg_Input) ||
3208 *Reg_Input == '\n' ||
3209 *Reg_Input == '\0') return (0);
3211 Reg_Input++; break;
3213 case SPACE: /* \s, same as [ \t\r\f\v] */
3214 if (!isspace ((int) *Reg_Input) || *Reg_Input == '\n') return (0);
3216 Reg_Input++; break;
3218 case SPACE_NL: /* \s, same as [\n \t\r\f\v] */
3219 if (!isspace ((int) *Reg_Input)) return (0);
3221 Reg_Input++; break;
3223 case NOT_SPACE: /* \S, same as [^\n \t\r\f\v] */
3224 if (isspace ((int) *Reg_Input) || *Reg_Input == '\0') return (0);
3226 Reg_Input++; break;
3228 case NOT_SPACE_NL: /* \S, same as [^ \t\r\f\v] */
3229 if ((isspace ((int) *Reg_Input) && *Reg_Input != '\n') ||
3230 *Reg_Input == '\0') return (0);
3232 Reg_Input++; break;
3234 case ANY_OF: /* [...] character class. */
3235 if (*Reg_Input == '\0') return (0); /* Needed because strchr ()
3236 considers \0 as a member
3237 of the character set. */
3239 if (strchr ((char *) OPERAND (scan), (int) *Reg_Input) == NULL) {
3240 return (0);
3243 Reg_Input++; break;
3245 case ANY_BUT: /* [^...] Negated character class-- does NOT normally
3246 match newline (\n added usually to operand at compile
3247 time.) */
3249 if (*Reg_Input == '\0') return (0); /* See comment for ANY_OF. */
3251 if (strchr ((char *) OPERAND (scan), (int) *Reg_Input) != NULL) {
3252 return (0);
3255 Reg_Input++; break;
3257 case NOTHING:
3258 case BACK:
3259 break;
3261 case STAR:
3262 case PLUS:
3263 case QUESTION:
3264 case BRACE:
3266 case LAZY_STAR:
3267 case LAZY_PLUS:
3268 case LAZY_QUESTION:
3269 case LAZY_BRACE:
3271 register unsigned long num_matched = REG_ZERO;
3272 register unsigned long min = ULONG_MAX, max = REG_ZERO;
3273 register unsigned char *save;
3274 register unsigned char next_char;
3275 unsigned char *next_op;
3276 int lazy = 0;
3278 /* Lookahead (when possible) to avoid useless match attempts
3279 when we know what character comes next. */
3281 if (GET_OP_CODE (next) == EXACTLY) {
3282 next_char = *OPERAND (next);
3283 } else {
3284 next_char = '\0';/* i.e. Don't know what next character is. */
3287 next_op = OPERAND (scan);
3289 switch (GET_OP_CODE (scan)) {
3290 case LAZY_STAR:
3291 lazy = 1;
3292 case STAR:
3293 min = REG_ZERO;
3294 max = ULONG_MAX;
3295 break;
3297 case LAZY_PLUS:
3298 lazy = 1;
3299 case PLUS:
3300 min = REG_ONE;
3301 max = ULONG_MAX;
3302 break;
3304 case LAZY_QUESTION:
3305 lazy = 1;
3306 case QUESTION:
3307 min = REG_ZERO;
3308 max = REG_ONE;
3309 break;
3311 case LAZY_BRACE:
3312 lazy = 1;
3313 case BRACE:
3314 min = (unsigned long)
3315 GET_OFFSET (scan + NEXT_PTR_SIZE);
3317 max = (unsigned long)
3318 GET_OFFSET (scan + (2 * NEXT_PTR_SIZE));
3320 if (max <= REG_INFINITY) max = ULONG_MAX;
3322 next_op = OPERAND (scan + (2 * NEXT_PTR_SIZE));
3325 save = Reg_Input;
3327 if (lazy) {
3328 if ( min > REG_ZERO) num_matched = greedy (next_op, min);
3329 } else {
3330 num_matched = greedy (next_op, max);
3333 while (min <= num_matched && num_matched <= max) {
3334 if (next_char == '\0' || next_char == *Reg_Input) {
3335 if (match (next, NULL)) return (1);
3338 /* Couldn't or didn't match. */
3340 if (lazy) {
3341 if (!greedy (next_op, 1)) return (0);
3343 num_matched++; /* Inch forward. */
3344 } else if (num_matched > REG_ZERO) {
3345 num_matched--; /* Back up. */
3346 } else if (min == REG_ZERO && num_matched == REG_ZERO) {
3347 break;
3350 Reg_Input = save + num_matched;
3353 return (0);
3356 break;
3358 case END:
3359 if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
3360 Extent_Ptr_FW = Reg_Input;
3363 return (1); /* Success! */
3365 break;
3367 case INIT_COUNT:
3368 Brace->count [*OPERAND (scan)] = REG_ZERO;
3370 break;
3372 case INC_COUNT:
3373 Brace->count [*OPERAND (scan)]++;
3375 break;
3377 case TEST_COUNT:
3378 if (Brace->count [*OPERAND (scan)] <
3379 (unsigned long) GET_OFFSET (scan + NEXT_PTR_SIZE + INDEX_SIZE)) {
3381 next = scan + NODE_SIZE + INDEX_SIZE + NEXT_PTR_SIZE;
3384 break;
3386 case BACK_REF:
3387 case BACK_REF_CI:
3388 /* case X_REGEX_BR: */
3389 /* case X_REGEX_BR_CI: *** IMPLEMENT LATER */
3391 register unsigned char *captured, *finish;
3392 int paren_no;
3394 paren_no = (int) *OPERAND (scan);
3396 /* if (GET_OP_CODE (scan) == X_REGEX_BR ||
3397 GET_OP_CODE (scan) == X_REGEX_BR_CI) {
3399 if (Cross_Regex_Backref == NULL) return (0);
3401 captured =
3402 (unsigned char *) Cross_Regex_Backref->startp [paren_no];
3404 finish =
3405 (unsigned char *) Cross_Regex_Backref->endp [paren_no];
3406 } else { */
3407 captured = Back_Ref_Start [paren_no];
3408 finish = Back_Ref_End [paren_no];
3409 /* } */
3411 if ((captured != NULL) && (finish != NULL)) {
3412 if (captured > finish) return (0);
3414 if (GET_OP_CODE (scan) == BACK_REF_CI /* ||
3415 GET_OP_CODE (scan) == X_REGEX_BR_CI*/ ) {
3417 while (captured < finish) {
3418 if (tolower (*captured++) != tolower (*Reg_Input++)) {
3419 return (0);
3422 } else {
3423 while (captured < finish) {
3424 if (*captured++ != *Reg_Input++) return (0);
3428 break;
3429 } else {
3430 return (0);
3434 case POS_AHEAD_OPEN:
3435 case NEG_AHEAD_OPEN:
3437 register unsigned char *save;
3438 int answer;
3440 save = Reg_Input;
3441 answer = match (next, NULL); /* Does the look-ahead regex match? */
3443 if ((GET_OP_CODE (scan) == POS_AHEAD_OPEN) ? answer : !answer) {
3444 /* Remember the last (most to the right) character position
3445 that we consume in the input for a successful match. This
3446 is info that may be needed should an attempt be made to
3447 match the exact same text at the exact same place. Since
3448 look-aheads backtrack, a regex with a trailing look-ahead
3449 may need more text than it matches to accomplish a
3450 re-match. */
3452 if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
3453 Extent_Ptr_FW = Reg_Input;
3456 Reg_Input = save; /* Backtrack to look-ahead start. */
3458 /* Jump to the node just after the (?=...) or (?!...)
3459 Construct. */
3461 next = next_ptr (OPERAND (scan)); /* Skip 1st branch */
3462 /* Skip the chain of branches inside the look-ahead */
3463 while(GET_OP_CODE(next) == BRANCH)
3464 next = next_ptr (next);
3465 next = next_ptr (next); /* Skip the LOOK_AHEAD_CLOSE */
3466 } else {
3467 Reg_Input = save; /* Backtrack to look-ahead start. */
3469 return (0);
3473 break;
3475 case POS_BEHIND_OPEN:
3476 case NEG_BEHIND_OPEN:
3478 register unsigned char *save;
3479 int answer;
3480 register int offset, upper;
3481 int lower;
3482 int found = 0;
3483 unsigned char save_char;
3485 save = Reg_Input;
3486 save_char = *Reg_Input;
3488 /* Prevent overshoot (greedy matching could end past the
3489 current position). This means that look-ahead at the end
3490 of a look-behind pattern won't work, but it is highly
3491 unlikely that anyone ever needs this. It is documented as
3492 being not supported anyway. It is possible to lift this
3493 limitation, but only at the cost of increased complexity
3494 and an overall decrease in performance of the regex matching
3495 code, so it's probably not worth it. */
3496 *Reg_Input = '\0';
3498 lower = GET_LOWER (scan);
3499 upper = GET_UPPER (scan);
3501 /* Start with the shortest match first. This is the most
3502 efficient direction in general.
3503 Note! Negative look behind is _very_ tricky when the length
3504 is not constant: we have to make sure the expression doesn't
3505 match for _any_ of the starting positions. */
3506 for (offset = lower; offset <= upper; ++offset) {
3507 Reg_Input = save - offset;
3509 if (Reg_Input < Look_Behind_To) {
3510 /* No need to look any further */
3511 break;
3514 answer = match (next, NULL); /* Does the look-behind regex match? */
3516 /* The match must have ended at the current position;
3517 otherwise it is invalid */
3518 if (answer && Reg_Input == save) {
3519 /* It matched, exactly far enough */
3520 found = 1;
3522 /* Remember the last (most to the left) character position
3523 that we consume in the input for a successful match.
3524 This is info that may be needed should an attempt be
3525 made to match the exact same text at the exact same
3526 place. Since look-behind backtracks, a regex with a
3527 leading look-behind may need more text than it matches
3528 to accomplish a re-match. */
3530 if (Extent_Ptr_BW == NULL ||
3531 (Extent_Ptr_BW - (save - offset)) > 0) {
3532 Extent_Ptr_BW = save - offset;
3535 break;
3539 /* Always restore the position and repair the string */
3540 Reg_Input = save;
3541 *Reg_Input = save_char;
3543 if ((GET_OP_CODE (scan) == POS_BEHIND_OPEN) ? found : !found) {
3544 /* The look-behind matches, so we must jump to the next
3545 node. The look-behind node is followed by a chain of
3546 branches (contents of the look-behind expression), and
3547 terminated by a look-behind-close node. */
3548 next = next_ptr (OPERAND (scan) + LENGTH_SIZE); /* 1st branch */
3549 /* Skip the chained branches inside the look-ahead */
3550 while (GET_OP_CODE (next) == BRANCH)
3551 next = next_ptr (next);
3552 next = next_ptr (next); /* Skip LOOK_BEHIND_CLOSE */
3553 } else {
3554 /* Not a match */
3555 return (0);
3558 break;
3560 case LOOK_AHEAD_CLOSE:
3561 case LOOK_BEHIND_CLOSE:
3562 return (1); /* We have reached the end of the look-ahead or
3563 look-behind which implies that we matched it,
3564 so return TRUE. */
3565 default:
3566 if ((GET_OP_CODE (scan) > OPEN) &&
3567 (GET_OP_CODE (scan) < OPEN + NSUBEXP)) {
3569 register int no;
3570 register unsigned char *save;
3572 no = GET_OP_CODE (scan) - OPEN;
3573 save = Reg_Input;
3575 if (no < 10) {
3576 Back_Ref_Start [no] = save;
3577 Back_Ref_End [no] = NULL;
3580 if (match (next, NULL)) {
3581 /* Do not set `Start_Ptr_Ptr' if some later invocation (think
3582 recursion) of the same parentheses already has. */
3584 if (Start_Ptr_Ptr [no] == NULL) Start_Ptr_Ptr [no] = save;
3586 return (1);
3587 } else {
3588 return (0);
3590 } else if ((GET_OP_CODE (scan) > CLOSE) &&
3591 (GET_OP_CODE (scan) < CLOSE + NSUBEXP)) {
3593 register int no;
3594 register unsigned char *save;
3596 no = GET_OP_CODE (scan) - CLOSE;
3597 save = Reg_Input;
3599 if (no < 10) Back_Ref_End [no] = save;
3601 if (match (next, NULL)) {
3602 /* Do not set `End_Ptr_Ptr' if some later invocation of the
3603 same parentheses already has. */
3605 if (End_Ptr_Ptr [no] == NULL) End_Ptr_Ptr [no] = save;
3607 return (1);
3608 } else {
3609 return (0);
3611 } else {
3612 reg_error ("memory corruption, `match\'");
3614 return (0);
3617 break;
3620 scan = next;
3623 /* We get here only if there's trouble -- normally "case END" is
3624 the terminating point. */
3626 reg_error ("corrupted pointers, `match\'");
3628 return (0);
3631 /*----------------------------------------------------------------------*
3632 * greedy
3634 * Repeatedly match something simple up to "max" times. If max <= 0
3635 * then match as much as possible (max = infinity). Uses unsigned long
3636 * variables to maximize the amount of text matchable for unbounded
3637 * qualifiers like '*' and '+'. This will allow at least 4,294,967,295
3638 * matches (4 Gig!) for an ANSI C compliant compiler. If you are
3639 * applying a regex to something bigger than that, you shouldn't be
3640 * using NEdit!
3642 * Returns the actual number of matches.
3643 *----------------------------------------------------------------------*/
3645 static unsigned long greedy (unsigned char *p, long max) {
3647 register unsigned char *input_str;
3648 register unsigned char *operand;
3649 register unsigned long count = REG_ZERO;
3650 register unsigned long max_cmp;
3652 input_str = Reg_Input;
3653 operand = OPERAND (p); /* Literal char or start of class characters. */
3654 max_cmp = (max > 0) ? (unsigned long) max : ULONG_MAX;
3656 switch (GET_OP_CODE (p)) {
3657 case ANY:
3658 /* Race to the end of the line or string. Dot DOESN'T match
3659 newline. */
3661 while (count < max_cmp && *input_str != '\0' && *input_str != '\n') {
3662 count++; input_str++;
3665 break;
3667 case EVERY:
3668 /* Race to the end of the line or string. Dot DOES match newline. */
3670 while (count < max_cmp && *input_str != '\0') {
3671 count++; input_str++;
3674 break;
3676 case EXACTLY: /* Count occurrences of single character operand. */
3677 while (count < max_cmp && *operand == *input_str) {
3678 count++; input_str++;
3681 break;
3683 case SIMILAR: /* Case insensitive version of EXACTLY */
3684 while (count < max_cmp && *operand == tolower (*input_str)) {
3685 count++; input_str++;
3688 break;
3690 case ANY_OF: /* [...] character class. */
3691 while (count < max_cmp &&
3692 *input_str != '\0' &&
3693 strchr ((char *) operand, (int) *input_str) != NULL) {
3695 count++; input_str++;
3698 break;
3700 case ANY_BUT: /* [^...] Negated character class- does NOT normally
3701 match newline (\n added usually to operand at compile
3702 time.) */
3704 while (count < max_cmp &&
3705 *input_str != '\0' &&
3706 strchr ((char *) operand, (int) *input_str) == NULL) {
3708 count++; input_str++;
3711 break;
3713 case IS_DELIM: /* \y (not a word delimiter char)
3714 NOTE: '\n' and '\0' are always word delimiters. */
3716 while (count < max_cmp && Current_Delimiters [ *input_str ]) {
3717 count++; input_str++;
3720 break;
3722 case NOT_DELIM: /* \Y (not a word delimiter char)
3723 NOTE: '\n' and '\0' are always word delimiters. */
3725 while (count < max_cmp && !Current_Delimiters [ *input_str ]) {
3726 count++; input_str++;
3729 break;
3731 case WORD_CHAR: /* \w (word character, alpha-numeric or underscore) */
3732 while (count < max_cmp &&
3733 (isalnum ((int) *input_str) ||
3734 *input_str == (unsigned char) '_')) {
3736 count++; input_str++;
3739 break;
3741 case NOT_WORD_CHAR:/* \W (NOT a word character) */
3742 while (count < max_cmp &&
3743 !isalnum ((int) *input_str) &&
3744 *input_str != (unsigned char) '_' &&
3745 *input_str != (unsigned char) '\n' &&
3746 *input_str != (unsigned char) '\0') {
3748 count++; input_str++;
3751 break;
3753 case DIGIT: /* same as [0123456789] */
3754 while (count < max_cmp && isdigit ((int) *input_str)) {
3755 count++; input_str++;
3758 break;
3760 case NOT_DIGIT: /* same as [^0123456789] */
3761 while (count < max_cmp &&
3762 !isdigit ((int) *input_str) &&
3763 *input_str != '\n' &&
3764 *input_str != '\0') {
3766 count++; input_str++;
3769 break;
3771 case SPACE: /* same as [ \t\r\f\v]-- doesn't match newline. */
3772 while (count < max_cmp &&
3773 isspace ((int) *input_str) &&
3774 *input_str != '\n' &&
3775 *input_str != '\0') {
3777 count++; input_str++;
3780 break;
3782 case SPACE_NL: /* same as [\n \t\r\f\v]-- matches newline. */
3783 while (count < max_cmp &&
3784 isspace ((int) *input_str) &&
3785 *input_str != '\0') {
3787 count++; input_str++;
3790 break;
3792 case NOT_SPACE: /* same as [^\n \t\r\f\v]-- doesn't match newline. */
3793 while (count < max_cmp &&
3794 !isspace ((int) *input_str) &&
3795 *input_str != '\0') {
3797 count++; input_str++;
3800 break;
3802 case NOT_SPACE_NL: /* same as [^ \t\r\f\v]-- matches newline. */
3803 while (count < max_cmp &&
3804 (!isspace ((int) *input_str) || *input_str == '\n') &&
3805 *input_str != '\0') {
3807 count++; input_str++;
3810 break;
3812 case LETTER: /* same as [a-zA-Z] */
3813 while (count < max_cmp &&
3814 isalpha ((int) *input_str) &&
3815 *input_str != '\0') {
3817 count++; input_str++;
3820 break;
3822 case NOT_LETTER: /* same as [^a-zA-Z] */
3823 while (count < max_cmp &&
3824 !isalpha ((int) *input_str) &&
3825 *input_str != '\n' &&
3826 *input_str != '\0') {
3828 count++; input_str++;
3831 break;
3833 default:
3834 /* Called inappropriately. Only atoms that are SIMPLE should
3835 generate a call to greedy. The above cases should cover
3836 all the atoms that are SIMPLE. */
3838 reg_error ("internal error #10 `greedy\'");
3839 count = 0U; /* Best we can do. */
3842 /* Point to character just after last matched character. */
3844 Reg_Input = input_str;
3846 return (count);
3849 /*----------------------------------------------------------------------*
3850 * next_ptr - compute the address of a node's "NEXT" pointer.
3851 * Note: a simplified inline version is available via the NEXT_PTR() macro,
3852 * but that one is only to be used at time-critical places (see the
3853 * description of the macro).
3854 *----------------------------------------------------------------------*/
3856 static unsigned char * next_ptr (unsigned char *ptr) {
3858 register int offset;
3860 if (ptr == &Compute_Size) return (NULL);
3862 offset = GET_OFFSET (ptr);
3864 if (offset == 0) return (NULL);
3866 if (GET_OP_CODE (ptr) == BACK) {
3867 return (ptr - offset);
3868 } else {
3869 return (ptr + offset);
3873 /*----------------------------------------------------------------------*
3874 * SubstituteRE - Perform substitutions after a `regexp' match.
3875 *----------------------------------------------------------------------*/
3877 void SubstituteRE (
3878 regexp *prog,
3879 const char *source,
3880 char *dest,
3881 int max) {
3883 register unsigned char *src;
3884 unsigned char *src_alias;
3885 register unsigned char *dst;
3886 register unsigned char c;
3887 register unsigned char test;
3888 register int paren_no;
3889 register int len;
3890 register unsigned char chgcase;
3892 if (prog == NULL || source == NULL || dest == NULL) {
3893 reg_error ("NULL parm to `SubstituteRE\'");
3895 return;
3898 if (U_CHAR_AT (prog->program) != MAGIC) {
3899 reg_error ("damaged regexp passed to `SubstituteRE\'");
3901 return;
3904 src = (unsigned char *) source;
3905 dst = (unsigned char *) dest;
3907 while ((c = *src++) != '\0') {
3908 chgcase = '\0';
3909 paren_no = -1;
3911 if (c == '\\') {
3912 /* Process any case altering tokens, i.e \u, \U, \l, \L. */
3914 if (*src == 'u' || *src == 'U' || *src == 'l' || *src == 'L') {
3915 chgcase = *src;
3916 src++;
3917 c = *src++;
3919 if (c == '\0') break;
3923 if (c == '&') {
3924 paren_no = 0;
3925 } else if (c == '\\') {
3926 /* Can not pass register variable `&src' to function `numeric_escape'
3927 so make a non-register copy that we can take the address of. */
3929 src_alias = src;
3931 if ('1' <= *src && *src <= '9') {
3932 paren_no = (int) *src++ - (int) '0';
3934 } else if ((test = literal_escape (*src)) != '\0') {
3935 c = test; src++;
3937 } else if ((test = numeric_escape (*src, &src_alias)) != '\0') {
3938 c = test;
3939 src = src_alias; src++;
3941 /* NOTE: if an octal escape for zero is attempted (e.g. \000), it
3942 will be treated as a literal string. */
3943 } else if (*src == '\0') {
3944 /* If '\' is the last character of the replacement string, it is
3945 interpreted as a literal backslash. */
3947 c = '\\';
3948 } else {
3949 c = *src++; /* Allow any escape sequence (This is */
3950 } /* INCONSISTENT with the `CompileRE' */
3951 } /* mind set of issuing an error! */
3953 if (paren_no < 0) { /* Ordinary character. */
3954 if (((char *) dst - (char *) dest) >= (max - 1)) {
3955 break;
3956 } else {
3957 *dst++ = c;
3959 } else if (prog->startp [paren_no] != NULL &&
3960 prog->endp [paren_no] != NULL) {
3962 len = prog->endp [paren_no] - prog->startp [paren_no];
3964 if (((char *) dst + len - (char *) dest) >= max-1) {
3965 len = max - ((char *) dst - (char *) dest) - 1;
3968 (void) strncpy ((char *) dst, (char *) prog->startp [paren_no], len);
3970 if (chgcase != '\0') adjustcase (dst, len, chgcase);
3972 dst += len;
3974 if (len != 0 && *(dst - 1) == '\0') { /* strncpy hit NUL. */
3975 reg_error ("damaged match string in `SubstituteRE\'");
3977 return;
3982 *dst = '\0';
3985 static void adjustcase (unsigned char *str, int len, unsigned char chgcase) {
3987 register unsigned char *string = str;
3988 int i;
3990 /* The tokens \u and \l only modify the first character while the tokens
3991 \U and \L modify the entire string. */
3993 if (islower (chgcase) && len > 0) len = 1;
3995 switch (chgcase) {
3996 case 'u':
3997 case 'U':
3998 for (i = 0; i < len; i++) {
3999 *(string + i) = toupper ((int) *(string + i));
4002 break;
4004 case 'l':
4005 case 'L':
4006 for (i = 0; i < len; i++) {
4007 *(string + i) = tolower ((int) *(string + i));
4010 break;
4014 /*----------------------------------------------------------------------*
4015 * reg_error
4016 *----------------------------------------------------------------------*/
4018 static void reg_error (char *str) {
4020 fprintf (
4021 stderr,
4022 "NEdit: Internal error processing regular expression (%s)\n",
4023 str);
4026 /*----------------------------------------------------------------------*
4027 * makeDelimiterTable
4029 * Translate a null-terminated string of delimiters into a 256 byte
4030 * lookup table for determining whether a character is a delimiter or
4031 * not.
4033 * Table must be allocated by the caller.
4035 * Return value is a pointer to the table.
4036 *----------------------------------------------------------------------*/
4038 static unsigned char * makeDelimiterTable (
4039 unsigned char *delimiters,
4040 unsigned char *table) {
4042 unsigned char *c;
4044 memset (table, 0, 256);
4046 for (c = (unsigned char *) delimiters; *c != '\0'; c++) {
4047 table [*c] = 1;
4050 table [(int) NULL] = 1; /* These */
4051 table [(int) '\t'] = 1; /* characters */
4052 table [(int) '\n'] = 1; /* are always */
4053 table [(int) ' ' ] = 1; /* delimiters. */
4055 return table;
4058 /*----------------------------------------------------------------------*
4059 * SetREDefaultWordDelimiters
4061 * Builds a default delimiter table that persists across `ExecRE' calls.
4062 *----------------------------------------------------------------------*/
4064 void SetREDefaultWordDelimiters (char *delimiters) {
4065 makeDelimiterTable ((unsigned char *) delimiters, Default_Delimiters);
4068 void EnableCountingQuantifier (int is_enabled) {
4069 Enable_Counting_Quantifier = is_enabled;