Revive upgrade message to 5.4 file format
[nedit.git] / source / regularExp.c
blob0786eaae728ee18600d61ac4127d474c798aec00
1 static const char CVSID[] = "$Id: regularExp.c,v 1.18 2002/07/17 15:14:37 edg Exp $";
2 /*------------------------------------------------------------------------*
3 * `CompileRE', `ExecRE', and `substituteRE' -- regular expression parsing
5 * This is a HIGHLY ALTERED VERSION of Henry Spencer's `regcomp' and
6 * `regexec' code adapted for NEdit.
8 * .-------------------------------------------------------------------.
9 * | ORIGINAL COPYRIGHT NOTICE: |
10 * | |
11 * | Copyright (c) 1986 by University of Toronto. |
12 * | Written by Henry Spencer. Not derived from licensed software. |
13 * | |
14 * | Permission is granted to anyone to use this software for any |
15 * | purpose on any computer system, and to redistribute it freely, |
16 * | subject to the following restrictions: |
17 * | |
18 * | 1. The author is not responsible for the consequences of use of |
19 * | this software, no matter how awful, even if they arise |
20 * | from defects in it. |
21 * | |
22 * | 2. The origin of this software must not be misrepresented, either |
23 * | by explicit claim or by omission. |
24 * | |
25 * | 3. Altered versions must be plainly marked as such, and must not |
26 * | be misrepresented as being the original software. |
27 * `-------------------------------------------------------------------'
29 * BEWARE that some of this code is subtly aware of the way operator
30 * precedence is structured in regular expressions. Serious changes in
31 * regular-expression syntax might require a total rethink.
32 * -- Henry Spencer
33 * (Yes, it did!) -- Christopher Conrad, Dec. 1999
35 * January, 1994, Mark Edel
36 * Consolidated files, changed names of external functions to avoid
37 * potential conflicts with native regcomp and regexec functions, changed
38 * error reporting to NEdit form, added multi-line and reverse searching,
39 * and added \n \t \u \U \l \L.
41 * June, 1996, Mark Edel
42 * Bug in NEXT macro, didn't work for expressions which compiled to over
43 * 256 bytes.
45 * December, 1999, Christopher Conrad
46 * Reformatted code for readability, improved error output, added octal and
47 * hexadecimal escapes, added back-references (\1-\9), added positive look
48 * ahead: (?=...), added negative lookahead: (?!...), added non-capturing
49 * parentheses: (?:...), added case insensitive constructs (?i...) and
50 * (?I...), added newline matching constructs (?n...) and (?N...), added
51 * regex comments: (?#...), added shortcut escapes: \d\D\l\L\s\S\w\W\y\Y.
52 * Added "not a word boundary" anchor \B.
54 * July, 2002, Eddy De Greef
55 * Added look behind, both positive (?<=...) and negative (?<!...) for
56 * bounded-length patterns.
59 #ifdef HAVE_CONFIG_H
60 #include "../config.h"
61 #endif
63 #include "regularExp.h"
65 #include <stdio.h>
66 #include <stdlib.h>
67 #include <string.h>
68 #include <ctype.h>
69 #include <limits.h>
71 #ifdef HAVE_DEBUG_H
72 #include "../debug.h"
73 #endif
76 /* The first byte of the regexp internal `program' is a magic number to help
77 gaurd against corrupted data; the compiled regex code really begins in the
78 second byte. */
80 #define MAGIC 0234
82 /* The "internal use only" fields in `regexp.h' are present to pass info from
83 * `CompileRE' to `ExecRE' which permits the execute phase to run lots faster on
84 * simple cases. They are:
86 * match_start Character that must begin a match; '\0' if none obvious.
87 * anchor Is the match anchored (at beginning-of-line only)?
89 * `match_start' and `anchor' permit very fast decisions on suitable starting
90 * points for a match, considerably reducing the work done by ExecRE. */
92 /* STRUCTURE FOR A REGULAR EXPRESSION (regex) `PROGRAM'.
94 * This is essentially a linear encoding of a nondeterministic finite-state
95 * machine or NFA (aka syntax charts or `railroad normal form' in parsing
96 * technology). Each node is an opcode plus a NEXT pointer, possibly
97 * followed by operands. NEXT pointers of all nodes except BRANCH implement
98 * concatenation; a NEXT pointer with a BRANCH on both ends of it is
99 * connecting two alternatives. (Here we have one of the subtle syntax
100 * dependencies: an individual BRANCH (as opposed to a collection of them) is
101 * never concatenated with anything because of operator precedence.) The
102 * operand of some types of nodes is a literal string; for others, it is a node
103 * leading into a sub-FSM. In particular, the operand of a BRANCH node is the
104 * first node of the branch. (NB this is _NOT_ a tree structure: the tail of
105 * the branch connects to the thing following the set of BRANCHes.)
107 * The opcodes are: */
109 /* DEFINITION VALUE MEANING */
111 #define END 1 /* End of program. */
113 /* Zero width positional assertions. */
115 #define BOL 2 /* Match position at beginning of line. */
116 #define EOL 3 /* Match position at end of line. */
117 #define BOWORD 4 /* Match "" representing word delimiter or BOL */
118 #define EOWORD 5 /* Match "" representing word delimiter or EOL */
119 #define NOT_BOUNDARY 6 /* Not word boundary (\B, opposite of < and >) */
121 /* Op codes with null terminated string operands. */
123 #define EXACTLY 7 /* Match this string. */
124 #define SIMILAR 8 /* Match this case insensitive string */
125 #define ANY_OF 9 /* Match any character in the set. */
126 #define ANY_BUT 10 /* Match any character not in the set. */
128 /* Op codes to match any character. */
130 #define ANY 11 /* Match any one character (implements '.') */
131 #define EVERY 12 /* Same as ANY but matches newline. */
133 /* Shortcut escapes, \d, \D, \l, \L, \s, \S, \w, \W, \y, \Y. */
135 #define DIGIT 13 /* Match any digit, i.e. [0123456789] */
136 #define NOT_DIGIT 14 /* Match any non-digit, i.e. [^0123456789] */
137 #define LETTER 15 /* Match any letter character [a-zA-Z] */
138 #define NOT_LETTER 16 /* Match any non-letter character [^a-zA-Z] */
139 #define SPACE 17 /* Match any whitespace character EXCEPT \n */
140 #define SPACE_NL 18 /* Match any whitespace character INCLUDING \n */
141 #define NOT_SPACE 19 /* Match any non-whitespace character */
142 #define NOT_SPACE_NL 20 /* Same as NOT_SPACE but matches newline. */
143 #define WORD_CHAR 21 /* Match any word character [a-zA-Z0-9_] */
144 #define NOT_WORD_CHAR 22 /* Match any non-word character [^a-zA-Z0-9_] */
145 #define IS_DELIM 23 /* Match any character that's a word delimiter */
146 #define NOT_DELIM 24 /* Match any character NOT a word delimiter */
148 /* Quantifier nodes. (Only applied to SIMPLE nodes. Quantifiers applied to non
149 SIMPLE nodes or larger atoms are implemented using complex constructs.)*/
151 #define STAR 25 /* Match this (simple) thing 0 or more times. */
152 #define LAZY_STAR 26 /* Minimal matching STAR */
153 #define QUESTION 27 /* Match this (simple) thing 0 or 1 times. */
154 #define LAZY_QUESTION 28 /* Minimal matching QUESTION */
155 #define PLUS 29 /* Match this (simple) thing 1 or more times. */
156 #define LAZY_PLUS 30 /* Minimal matching PLUS */
157 #define BRACE 31 /* Match this (simple) thing m to n times. */
158 #define LAZY_BRACE 32 /* Minimal matching BRACE */
160 /* Nodes used to build complex constructs. */
162 #define NOTHING 33 /* Match empty string (always matches) */
163 #define BRANCH 34 /* Match this alternative, or the next... */
164 #define BACK 35 /* Always matches, NEXT ptr points backward. */
165 #define INIT_COUNT 36 /* Initialize {m,n} counter to zero */
166 #define INC_COUNT 37 /* Increment {m,n} counter by one */
167 #define TEST_COUNT 38 /* Test {m,n} counter against operand */
169 /* Back Reference nodes. */
171 #define BACK_REF 39 /* Match latest matched parenthesized text */
172 #define BACK_REF_CI 40 /* Case insensitive version of BACK_REF */
173 #define X_REGEX_BR 41 /* Cross-Regex Back-Ref for syntax highlighting */
174 #define X_REGEX_BR_CI 42 /* Case insensitive version of X_REGEX_BR_CI */
176 /* Various nodes used to implement parenthetical constructs. */
178 #define POS_AHEAD_OPEN 43 /* Begin positive look ahead */
179 #define NEG_AHEAD_OPEN 44 /* Begin negative look ahead */
180 #define LOOK_AHEAD_CLOSE 45 /* End positive or negative look ahead */
182 #define POS_BEHIND_OPEN 46 /* Begin positive look behind */
183 #define NEG_BEHIND_OPEN 47 /* Begin negative look behind */
184 #define LOOK_BEHIND_CLOSE 48 /* Close look behind */
186 #define OPEN 49 /* Open for capturing parentheses. */
188 /* OPEN+1 is number 1, etc. */
189 #define CLOSE (OPEN + NSUBEXP) /* Close for capturing parentheses. */
191 #define LAST_PAREN (CLOSE + NSUBEXP)
193 #if (LAST_PAREN > UCHAR_MAX)
194 #error "Too many parentheses for storage in an unsigned char (LAST_PAREN too big.)"
195 #endif
197 /* The next_ptr () function can consume up to 30% of the time during matching
198 because it is called an immense number of times (an average of 25
199 next_ptr() calls per match() call was witnessed for Perl syntax
200 highlighting). Therefore it is well worth removing some of the function
201 call overhead by selectively inlining the next_ptr() calls. Moreover,
202 the inlined code can be simplified for matching because one of the tests,
203 only necessary during compilation, can be left out.
204 The net result of using this inlined version at two critical places is
205 a 25% speedup (again, witnesses on Perl syntax highlighting). */
207 #define NEXT_PTR(in_ptr, out_ptr)\
208 next_ptr_offset = GET_OFFSET (in_ptr);\
209 if (next_ptr_offset == 0)\
210 out_ptr = NULL;\
211 else {\
212 if (GET_OP_CODE (in_ptr) == BACK)\
213 out_ptr = in_ptr - next_ptr_offset;\
214 else \
215 out_ptr = in_ptr + next_ptr_offset;\
218 /* OPCODE NOTES:
219 ------------
221 All nodes consist of an 8 bit op code followed by 2 bytes that make up a 16
222 bit NEXT pointer. Some nodes have a null terminated character string operand
223 following the NEXT pointer. Other nodes may have an 8 bit index operand.
224 The TEST_COUNT node has an index operand followed by a 16 bit test value.
225 The BRACE and LAZY_BRACE nodes have two 16 bit values for min and max but no
226 index value.
228 SIMILAR
229 Operand(s): null terminated string
231 Implements a case insensitive match of a string. Mostly intended for use
232 in syntax highlighting patterns for keywords of languages like FORTRAN
233 and Ada that are case insensitive. The regex text in this node is
234 converted to lower case during regex compile.
236 DIGIT, NOT_DIGIT, LETTER, NOT_LETTER, SPACE, NOT_SPACE, WORD_CHAR,
237 NOT_WORD_CHAR
238 Operand(s): None
240 Implements shortcut escapes \d, \D, \l, \L, \s, \S, \w, \W. The locale
241 aware ANSI functions isdigit(), isalpha(), isalnun(), and isspace() are
242 used to implement these in the hopes of increasing portability.
244 NOT_BOUNDARY
245 Operand(s): None
247 Implements \B as a zero width assertion that the current character is
248 NOT on a word boundary. Word boundaries are defined to be the position
249 between two characters where one of those characters is one of the
250 dynamically defined word delimiters, and the other character is not.
252 IS_DELIM
253 Operand(s): None
255 Implements \y as any character that is one of the dynamically
256 specified word delimiters.
258 NOT_DELIM
259 Operand(s): None
261 Implements \Y as any character that is NOT one of the dynamically
262 specified word delimiters.
264 STAR, PLUS, QUESTION, and complex '*', '+', and '?'
265 Operand(s): None (Note: NEXT pointer is usually zero. The code that
266 processes this node skips over it.)
268 Complex (parenthesized) versions implemented as circular BRANCH
269 structures using BACK. SIMPLE versions (one character per match) are
270 implemented separately for speed and to minimize recursion.
272 BRACE, LAZY_BRACE
273 Operand(s): minimum value (2 bytes), maximum value (2 bytes)
275 Implements the {m,n} construct for atoms that are SIMPLE.
277 BRANCH
278 Operand(s): None
280 The set of branches constituting a single choice are hooked together
281 with their NEXT pointers, since precedence prevents anything being
282 concatenated to any individual branch. The NEXT pointer of the last
283 BRANCH in a choice points to the thing following the whole choice. This
284 is also where the final NEXT pointer of each individual branch points;
285 each branch starts with the operand node of a BRANCH node.
287 BACK
288 Operand(s): None
290 Normal NEXT pointers all implicitly point forward. Back implicitly
291 points backward. BACK exists to make loop structures possible.
293 INIT_COUNT
294 Operand(s): index (1 byte)
296 Initializes the count array element referenced by the index operand.
297 This node is used to build general (i.e. parenthesized) {m,n} constructs.
299 INC_COUNT
300 Operand(s): index (1 byte)
302 Increments the count array element referenced by the index operand.
303 This node is used to build general (i.e. parenthesized) {m,n} constructs.
305 TEST_COUNT
306 Operand(s): index (1 byte), test value (2 bytes)
308 Tests the current value of the count array element specified by the
309 index operand against the test value. If the current value is less than
310 the test value, control passes to the node after that TEST_COUNT node.
311 Otherwise control passes to the node referenced by the NEXT pointer for
312 the TEST_COUNT node. This node is used to build general (i.e.
313 parenthesized) {m,n} constructs.
315 BACK_REF, BACK_REF_CI
316 Operand(s): index (1 byte, value 1-9)
318 Implements back references. This node will attempt to match whatever text
319 was most recently captured by the index'th set of parentheses.
320 BACK_REF_CI is case insensitive version.
322 X_REGEX_BR, X_REGEX_BR_CI
323 (NOT IMPLEMENTED YET)
325 Operand(s): index (1 byte, value 1-9)
327 Implements back references into a previously matched but separate regular
328 expression. This is used by syntax highlighting patterns. This node will
329 attempt to match whatever text was most captured by the index'th set of
330 parentheses of the separate regex passed to ExecRE. X_REGEX_BR_CI is case
331 insensitive version.
333 POS_AHEAD_OPEN, NEG_AHEAD_OPEN, LOOK_AHEAD_CLOSE
335 Operand(s): None
337 Implements positive and negative look ahead. Look ahead is an assertion
338 that something is either there or not there. Once this is determined the
339 regex engine backtracks to where it was just before the look ahead was
340 encountered, i.e. look ahead is a zero width assertion.
342 POS_BEHIND_OPEN, NEG_BEHIND_OPEN, LOOK_BEHIND_CLOSE
344 Operand(s): 2x2 bytes for OPEN (match boundaries), None for CLOSE
346 Implements positive and negative look behind. Look behind is an assertion
347 that something is either there or not there in front of the current
348 position. Look behind is a zero width assertion, with the additional
349 constraint that it must have a bounded length (for complexity and
350 efficiency reasons; note that most other implementation even impose
351 fixed length).
353 OPEN, CLOSE
355 Operand(s): None
357 OPEN + n = Start of parenthesis 'n', CLOSE + n = Close of parenthesis
358 'n', and are numbered at compile time.
361 /* A node is one char of opcode followed by two chars of NEXT pointer plus
362 * any operands. NEXT pointers are stored as two 8-bit pieces, high order
363 * first. The value is a positive offset from the opcode of the node
364 * containing it. An operand, if any, simply follows the node. (Note that
365 * much of the code generation knows about this implicit relationship.)
367 * Using two bytes for NEXT_PTR_SIZE is vast overkill for most things,
368 * but allows patterns to get big without disasters. */
370 #define OP_CODE_SIZE 1
371 #define NEXT_PTR_SIZE 2
372 #define INDEX_SIZE 1
373 #define LENGTH_SIZE 4
374 #define NODE_SIZE (NEXT_PTR_SIZE + OP_CODE_SIZE)
376 #define GET_OP_CODE(p) (*(unsigned char *)(p))
377 #define OPERAND(p) ((p) + NODE_SIZE)
378 #define GET_OFFSET(p) ((( *((p) + 1) & 0377) << 8) + (( *((p) + 2)) & 0377))
379 #define PUT_OFFSET_L(v) (unsigned char)(((v) >> 8) & 0377)
380 #define PUT_OFFSET_R(v) (unsigned char) ((v) & 0377)
381 #define GET_LOWER(p) ((( *((p) + NODE_SIZE) & 0377) << 8) + \
382 (( *((p) + NODE_SIZE+1)) & 0377))
383 #define GET_UPPER(p) ((( *((p) + NODE_SIZE+2) & 0377) << 8) + \
384 (( *((p) + NODE_SIZE+3)) & 0377))
386 /* Utility definitions. */
388 #define REG_FAIL(m) {*Error_Ptr = (m); return (NULL);}
389 #define IS_QUANTIFIER(c) ((c) == '*' || (c) == '+' || \
390 (c) == '?' || (c) == Brace_Char)
391 #define SET_BIT(i,n) ((i) |= (1 << ((n) - 1)))
392 #define TEST_BIT(i,n) ((i) & (1 << ((n) - 1)))
393 #define U_CHAR_AT(p) ((unsigned int) *(unsigned char *)(p))
395 /* Flags to be passed up and down via function parameters during compile. */
397 #define WORST 0 /* Worst case. No assumptions can be made.*/
398 #define HAS_WIDTH 1 /* Known never to match null string. */
399 #define SIMPLE 2 /* Simple enough to be STAR/PLUS operand. */
401 #define NO_PAREN 0 /* Only set by initial call to "chunk". */
402 #define PAREN 1 /* Used for normal capturing parentheses. */
403 #define NO_CAPTURE 2 /* Non-capturing parentheses (grouping only). */
404 #define INSENSITIVE 3 /* Case insensitive parenthetical construct */
405 #define SENSITIVE 4 /* Case sensitive parenthetical construct */
406 #define NEWLINE 5 /* Construct to match newlines in most cases */
407 #define NO_NEWLINE 6 /* Construct to match newlines normally */
409 #define REG_INFINITY 0UL
410 #define REG_ZERO 0UL
411 #define REG_ONE 1UL
413 /* Flags for function shortcut_escape() */
415 #define CHECK_ESCAPE 0 /* Check an escape sequence for validity only. */
416 #define CHECK_CLASS_ESCAPE 1 /* Check the validity of an escape within a
417 character class */
418 #define EMIT_CLASS_BYTES 2 /* Emit equivalent character class bytes,
419 e.g \d=0123456789 */
420 #define EMIT_NODE 3 /* Emit the appropriate node. */
422 /* Array sizes for arrays used by function init_ansi_classes. */
424 #define WHITE_SPACE_SIZE 16
425 #define ALNUM_CHAR_SIZE 256
427 /* Number of bytes to offset from the beginning of the regex program to the start
428 of the actual compiled regex code, i.e. skipping over the MAGIC number and
429 the two counters at the front. */
431 #define REGEX_START_OFFSET 3
433 #define MAX_COMPILED_SIZE 32767UL /* Largest size a compiled regex can be.
434 Probably could be 65535UL. */
436 /* Global work variables for `CompileRE'. */
438 static unsigned char *Reg_Parse; /* Input scan ptr (scans user's regex) */
439 static int Total_Paren; /* Parentheses, (), counter. */
440 static int Num_Braces; /* Number of general {m,n} constructs.
441 {m,n} quantifiers of SIMPLE atoms are
442 not included in this count. */
443 static int Closed_Parens; /* Bit flags indicating () closure. */
444 static int Paren_Has_Width; /* Bit flags indicating ()'s that are
445 known to not match the empty string */
446 static unsigned char Compute_Size; /* Address of this used as flag. */
447 static unsigned char *Code_Emit_Ptr; /* When Code_Emit_Ptr is set to
448 &Compute_Size no code is emitted.
449 Instead, the size of code that WOULD
450 have been generated is accumulated in
451 Reg_Size. Otherwise, Code_Emit_Ptr
452 points to where compiled regex code is
453 to be written. */
454 static unsigned long Reg_Size; /* Size of compiled regex code. */
455 static char **Error_Ptr; /* Place to store error messages so
456 they can be returned by `CompileRE' */
457 static char Error_Text [128];/* Sting to build error messages in. */
459 static unsigned char White_Space [WHITE_SPACE_SIZE]; /* Arrays used by */
460 static unsigned char Word_Char [ALNUM_CHAR_SIZE]; /* functions */
461 static unsigned char Letter_Char [ALNUM_CHAR_SIZE]; /* init_ansi_classes () */
462 /* and
463 shortcut_escape (). */
465 static unsigned char ASCII_Digits [] = "0123456789"; /* Same for all */
466 /* locales. */
467 static int Is_Case_Insensitive;
468 static int Match_Newline;
470 static int Enable_Counting_Quantifier = 1;
471 static unsigned char Brace_Char;
472 static unsigned char Default_Meta_Char [] = "{.*+?[(|)^<>$";
473 static unsigned char *Meta_Char;
475 typedef struct { long lower; long upper; } len_range;
477 /* Forward declarations for functions used by `CompileRE'. */
479 static unsigned char * alternative (int *flag_param, len_range *range_param);
480 static unsigned char * back_ref (unsigned char *c, int *flag_param,
481 int emit);
482 static unsigned char * chunk (int paren, int *flag_param, len_range *range_param);
483 static void emit_byte (unsigned char c);
484 static void emit_class_byte (unsigned char c);
485 static unsigned char * emit_node (int op_code);
486 static unsigned char * emit_special (unsigned char op_code,
487 unsigned long test_val,
488 int index);
489 static unsigned char literal_escape (unsigned char c);
490 static unsigned char numeric_escape (unsigned char c, unsigned char **parse);
491 static unsigned char * atom (int *flag_param, len_range *range_param);
492 static void reg_error (char *str);
493 static unsigned char * insert (unsigned char op, unsigned char *opnd,
494 long min, long max, int index);
495 static unsigned char * next_ptr (unsigned char *ptr);
496 static void offset_tail (unsigned char *ptr, int offset,
497 unsigned char *val);
498 static void branch_tail (unsigned char *ptr, int offset,
499 unsigned char *val);
500 static unsigned char * piece (int *flag_param, len_range *range_param);
501 static void tail (unsigned char *search_from,
502 unsigned char *point_t);
503 static unsigned char * shortcut_escape (unsigned char c, int *flag_param,
504 int emit);
506 static int init_ansi_classes (void);
508 /*----------------------------------------------------------------------*
509 * CompileRE
511 * Compiles a regular expression into the internal format used by
512 * `ExecRE'.
514 * The default behaviour wrt. case sensitivity and newline matching can
515 * be controlled through the defaultFlags argument (Markus Schwarzenberg).
516 * Future extensions are possible by using other flag bits.
517 * Note that currently only the case sensitivity flag is effectively used.
519 * Beware that the optimization and preparation code in here knows about
520 * some of the structure of the compiled regexp.
521 *----------------------------------------------------------------------*/
523 regexp * CompileRE (const char *exp, char **errorText, int defaultFlags) {
525 register regexp *comp_regex = NULL;
526 register unsigned char *scan;
527 int flags_local, pass;
528 len_range range_local;
530 if (Enable_Counting_Quantifier) {
531 Brace_Char = '{';
532 Meta_Char = &Default_Meta_Char [0];
533 } else {
534 Brace_Char = '*'; /* Bypass the '{' in */
535 Meta_Char = &Default_Meta_Char [1]; /* Default_Meta_Char */
538 /* Set up errorText to receive failure reports. */
540 Error_Ptr = errorText;
541 *Error_Ptr = "";
543 if (exp == NULL) REG_FAIL ("NULL argument, `CompileRE\'");
545 /* Initialize arrays used by function `shortcut_escape'. */
547 if (!init_ansi_classes ()) REG_FAIL ("internal error #1, `CompileRE\'");
549 Code_Emit_Ptr = &Compute_Size;
550 Reg_Size = 0UL;
552 /* We can't allocate space until we know how big the compiled form will be,
553 but we can't compile it (and thus know how big it is) until we've got a
554 place to put the code. So we cheat: we compile it twice, once with code
555 generation turned off and size counting turned on, and once "for real".
556 This also means that we don't allocate space until we are sure that the
557 thing really will compile successfully, and we never have to move the
558 code and thus invalidate pointers into it. (Note that it has to be in
559 one piece because free() must be able to free it all.) */
561 for (pass = 1; pass <= 2; pass++) {
562 /*-------------------------------------------*
563 * FIRST PASS: Determine size and legality. *
564 * SECOND PASS: Emit code. *
565 *-------------------------------------------*/
567 /* Schwarzenberg:
568 * If defaultFlags = 0 use standard defaults:
569 * Is_Case_Insensitive: Case sensitive is the default
570 * Match_Newline: Newlines are NOT matched by default
571 * in character classes
573 Is_Case_Insensitive = ((defaultFlags & REDFLT_CASE_INSENSITIVE) ? 1 : 0);
574 Match_Newline = 0; /* ((defaultFlags & REDFLT_MATCH_NEWLINE) ? 1 : 0);
575 Currently not used. Uncomment if needed. */
577 Reg_Parse = (unsigned char *) exp;
578 Total_Paren = 1;
579 Num_Braces = 0;
580 Closed_Parens = 0;
581 Paren_Has_Width = 0;
583 emit_byte (MAGIC);
584 emit_byte ('%'); /* Placeholder for num of capturing parentheses. */
585 emit_byte ('%'); /* Placeholder for num of general {m,n} constructs. */
587 if (chunk (NO_PAREN, &flags_local, &range_local) == NULL)
588 return (NULL); /* Something went wrong */
589 if (pass == 1) {
590 if (Reg_Size >= MAX_COMPILED_SIZE) {
591 /* Too big for NEXT pointers NEXT_PTR_SIZE bytes long to span.
592 This is a real issue since the first BRANCH node usually points
593 to the end of the compiled regex code. */
595 sprintf (Error_Text, "regexp > %lu bytes", MAX_COMPILED_SIZE);
596 REG_FAIL (Error_Text);
599 /* Allocate memory. */
601 comp_regex = (regexp *) malloc (sizeof (regexp) + Reg_Size);
603 if (comp_regex == NULL) REG_FAIL ("out of memory in `CompileRE\'");
605 Code_Emit_Ptr = (unsigned char *) comp_regex->program;
609 comp_regex->program [1] = (unsigned char) Total_Paren - 1;
610 comp_regex->program [2] = (unsigned char) Num_Braces;
612 /*----------------------------------------*
613 * Dig out information for optimizations. *
614 *----------------------------------------*/
616 comp_regex->match_start = '\0'; /* Worst-case defaults. */
617 comp_regex->anchor = 0;
619 /* First BRANCH. */
621 scan = (unsigned char *) (comp_regex->program + REGEX_START_OFFSET);
623 if (GET_OP_CODE (next_ptr (scan)) == END) { /* Only one top-level choice. */
624 scan = OPERAND (scan);
626 /* Starting-point info. */
628 if (GET_OP_CODE (scan) == EXACTLY) {
629 comp_regex->match_start = *OPERAND (scan);
631 } else if (PLUS <= GET_OP_CODE (scan) &&
632 GET_OP_CODE (scan) <= LAZY_PLUS) {
634 /* Allow x+ or x+? at the start of the regex to be
635 optimized. */
637 if (GET_OP_CODE (scan + NODE_SIZE) == EXACTLY) {
638 comp_regex->match_start = *OPERAND (scan + NODE_SIZE);
640 } else if (GET_OP_CODE (scan) == BOL) {
641 comp_regex->anchor++;
645 return (comp_regex);
648 /*----------------------------------------------------------------------*
649 * chunk *
651 * Process main body of regex or process a parenthesized "thing". *
653 * Caller must absorb opening parenthesis. *
655 * Combining parenthesis handling with the base level of regular *
656 * expression is a trifle forced, but the need to tie the tails of the *
657 * branches to what follows makes it hard to avoid. *
658 *----------------------------------------------------------------------*/
660 static unsigned char * chunk (int paren, int *flag_param,
661 len_range *range_param) {
663 register unsigned char *ret_val = NULL;
664 register unsigned char *this_branch;
665 register unsigned char *ender = NULL;
666 register int this_paren = 0;
667 int flags_local, first = 1, zero_width, i;
668 int old_sensitive = Is_Case_Insensitive;
669 int old_newline = Match_Newline;
670 len_range range_local;
671 int look_only = 0;
672 unsigned char *emit_look_behind_bounds = NULL;
675 *flag_param = HAS_WIDTH; /* Tentatively. */
676 range_param->lower = 0; /* Idem */
677 range_param->upper = 0;
679 /* Make an OPEN node, if parenthesized. */
681 if (paren == PAREN) {
682 if (Total_Paren >= NSUBEXP) {
683 sprintf (Error_Text, "number of ()'s > %d", (int) NSUBEXP);
684 REG_FAIL (Error_Text);
687 this_paren = Total_Paren; Total_Paren++;
688 ret_val = emit_node (OPEN + this_paren);
689 } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
690 *flag_param = WORST; /* Look ahead is zero width. */
691 look_only = 1;
692 ret_val = emit_node (paren);
693 } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
694 *flag_param = WORST; /* Look behind is zero width. */
695 look_only = 1;
696 /* We'll overwrite the zero length later on, so we save the ptr */
697 ret_val = emit_special (paren, 0, 0);
698 emit_look_behind_bounds = ret_val + NODE_SIZE;
699 } else if (paren == INSENSITIVE) {
700 Is_Case_Insensitive = 1;
701 } else if (paren == SENSITIVE) {
702 Is_Case_Insensitive = 0;
703 } else if (paren == NEWLINE) {
704 Match_Newline = 1;
705 } else if (paren == NO_NEWLINE) {
706 Match_Newline = 0;
709 /* Pick up the branches, linking them together. */
711 do {
712 this_branch = alternative (&flags_local, &range_local);
714 if (this_branch == NULL) return (NULL);
716 if (first) {
717 first = 0;
718 *range_param = range_local;
719 if (ret_val == NULL) ret_val = this_branch;
720 } else if (range_param->lower >= 0) {
721 if (range_local.lower >= 0) {
722 if (range_local.lower < range_param->lower)
723 range_param->lower = range_local.lower;
724 if (range_local.upper > range_param->upper)
725 range_param->upper = range_local.upper;
726 } else {
727 range_param->lower = -1; /* Branches have different lengths */
728 range_param->upper = -1;
732 tail (ret_val, this_branch); /* Connect BRANCH -> BRANCH. */
734 /* If any alternative could be zero width, consider the whole
735 parenthisized thing to be zero width. */
737 if (!(flags_local & HAS_WIDTH)) *flag_param &= ~HAS_WIDTH;
739 /* Are there more alternatives to process? */
741 if (*Reg_Parse != '|') break;
743 Reg_Parse++;
744 } while (1);
746 /* Make a closing node, and hook it on the end. */
748 if (paren == PAREN) {
749 ender = emit_node (CLOSE + this_paren);
751 } else if (paren == NO_PAREN) {
752 ender = emit_node (END);
754 } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
755 ender = emit_node (LOOK_AHEAD_CLOSE);
757 } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
758 ender = emit_node (LOOK_BEHIND_CLOSE);
760 } else {
761 ender = emit_node (NOTHING);
764 tail (ret_val, ender);
766 /* Hook the tails of the branch alternatives to the closing node. */
768 for (this_branch = ret_val; this_branch != NULL; ) {
769 branch_tail (this_branch, NODE_SIZE, ender);
770 this_branch = next_ptr (this_branch);
773 /* Check for proper termination. */
775 if (paren != NO_PAREN && *Reg_Parse++ != ')') {
776 REG_FAIL ("missing right parenthesis \')\'");
777 } else if (paren == NO_PAREN && *Reg_Parse != '\0') {
778 if (*Reg_Parse == ')') {
779 REG_FAIL ("missing left parenthesis \'(\'");
780 } else {
781 REG_FAIL ("junk on end"); /* "Can't happen" - NOTREACHED */
785 /* Check whether look behind has a fixed size */
787 if (emit_look_behind_bounds) {
788 if (range_param->lower < 0) {
789 REG_FAIL ("look-behind does not have a bounded size");
791 if (range_param->upper > 65535L) {
792 REG_FAIL ("max. look-behind size is too large (>65535)")
794 if (Code_Emit_Ptr != &Compute_Size) {
795 *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->lower);
796 *emit_look_behind_bounds++ = PUT_OFFSET_R (range_param->lower);
797 *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->upper);
798 *emit_look_behind_bounds = PUT_OFFSET_R (range_param->upper);
802 /* For look ahead/behind, the length must be set to zero again */
803 if (look_only) {
804 range_param->lower = 0;
805 range_param->upper = 0;
808 zero_width = 0;
810 /* Set a bit in Closed_Parens to let future calls to function `back_ref'
811 know that we have closed this set of parentheses. */
813 if (paren == PAREN && this_paren <= (int)sizeof (Closed_Parens) * CHAR_BIT) {
814 SET_BIT (Closed_Parens, this_paren);
816 /* Determine if a parenthesized expression is modified by a quantifier
817 that can have zero width. */
819 if (*(Reg_Parse) == '?' || *(Reg_Parse) == '*') {
820 zero_width++;
821 } else if (*(Reg_Parse) == '{' && Brace_Char == '{') {
822 if (*(Reg_Parse + 1) == ',' || *(Reg_Parse + 1) == '}') {
823 zero_width++;
824 } else if (*(Reg_Parse + 1) == '0') {
825 i = 2;
827 while (*(Reg_Parse + i) == '0') i++;
829 if (*(Reg_Parse + i) == ',') zero_width++;
834 /* If this set of parentheses is known to never match the empty string, set
835 a bit in Paren_Has_Width to let future calls to function back_ref know
836 that this set of parentheses has non-zero width. This will allow star
837 (*) or question (?) quantifiers to be aplied to a back-reference that
838 refers to this set of parentheses. */
840 if ((*flag_param & HAS_WIDTH) &&
841 paren == PAREN &&
842 !zero_width &&
843 this_paren <= (int)(sizeof (Paren_Has_Width) * CHAR_BIT)) {
845 SET_BIT (Paren_Has_Width, this_paren);
848 Is_Case_Insensitive = old_sensitive;
849 Match_Newline = old_newline;
851 return (ret_val);
854 /*----------------------------------------------------------------------*
855 * alternative
857 * Processes one alternative of an '|' operator. Connects the NEXT
858 * pointers of each regex atom together sequentialy.
859 *----------------------------------------------------------------------*/
861 static unsigned char * alternative (int *flag_param, len_range *range_param) {
863 register unsigned char *ret_val;
864 register unsigned char *chain;
865 register unsigned char *latest;
866 int flags_local;
867 len_range range_local;
869 *flag_param = WORST; /* Tentatively. */
870 range_param->lower = 0; /* Idem */
871 range_param->upper = 0;
873 ret_val = emit_node (BRANCH);
874 chain = NULL;
876 /* Loop until we hit the start of the next alternative, the end of this set
877 of alternatives (end of parentheses), or the end of the regex. */
879 while (*Reg_Parse != '|' && *Reg_Parse != ')' && *Reg_Parse != '\0') {
880 latest = piece (&flags_local, &range_local);
882 if (latest == NULL) return (NULL); /* Something went wrong. */
884 *flag_param |= flags_local & HAS_WIDTH;
885 if (range_local.lower < 0) {
886 /* Not a fixed length */
887 range_param->lower = -1;
888 range_param->upper = -1;
889 } else if (range_param->lower >= 0) {
890 range_param->lower += range_local.lower;
891 range_param->upper += range_local.upper;
894 if (chain != NULL) { /* Connect the regex atoms together sequentialy. */
895 tail (chain, latest);
898 chain = latest;
901 if (chain == NULL) { /* Loop ran zero times. */
902 (void) emit_node (NOTHING);
905 return (ret_val);
908 /*----------------------------------------------------------------------*
909 * piece - something followed by possible '*', '+', '?', or "{m,n}"
911 * Note that the branching code sequences used for the general cases of
912 * *, +. ?, and {m,n} are somewhat optimized: they use the same
913 * NOTHING node as both the endmarker for their branch list and the
914 * body of the last branch. It might seem that this node could be
915 * dispensed with entirely, but the endmarker role is not redundant.
916 *----------------------------------------------------------------------*/
918 static unsigned char * piece (int *flag_param, len_range *range_param) {
920 register unsigned char *ret_val;
921 register unsigned char *next;
922 register unsigned char op_code;
923 unsigned long min_max [2] = {REG_ZERO, REG_INFINITY};
924 int flags_local, i, brace_present = 0;
925 int lazy = 0, comma_present = 0;
926 int digit_present [2] = {0,0};
927 len_range range_local;
929 ret_val = atom (&flags_local, &range_local);
931 if (ret_val == NULL) return (NULL); /* Something went wrong. */
933 op_code = *Reg_Parse;
935 if (!IS_QUANTIFIER (op_code)) {
936 *flag_param = flags_local;
937 *range_param = range_local;
938 return (ret_val);
939 } else if (op_code == '{') { /* {n,m} quantifier present */
940 brace_present++;
941 Reg_Parse++;
943 /* This code will allow specifying a counting range in any of the
944 following forms:
946 {m,n} between m and n.
947 {,n} same as {0,n} or between 0 and infinity.
948 {m,} same as {m,0} or between m and infinity.
949 {m} same as {m,m} or exactly m.
950 {,} same as {0,0} or between 0 and infinity or just '*'.
951 {} same as {0,0} or between 0 and infinity or just '*'.
953 Note that specifying a max of zero, {m,0} is not allowed in the regex
954 itself, but it is implemented internally that way to support '*', '+',
955 and {min,} constructs and signals an unlimited number. */
957 for (i = 0; i < 2; i++) {
958 /* Look for digits of number and convert as we go. The numeric maximum
959 value for max and min of 65,535 is due to using 2 bytes to store
960 each value in the compiled regex code. */
962 while (isdigit (*Reg_Parse)) {
963 /* (6553 * 10 + 6) > 65535 (16 bit max) */
965 if ((min_max [i] == 6553UL && (*Reg_Parse - '0') <= 5) ||
966 (min_max [i] <= 6552UL)) {
968 min_max [i] = (min_max [i] * 10UL) +
969 (unsigned long) (*Reg_Parse - '0');
970 Reg_Parse++;
972 digit_present [i]++;
973 } else {
974 if (i == 0) {
975 sprintf (Error_Text, "min operand of {%lu%c,???} > 65535",
976 min_max [0], *Reg_Parse);
977 } else {
978 sprintf (Error_Text, "max operand of {%lu,%lu%c} > 65535",
979 min_max [0], min_max [1], *Reg_Parse);
982 REG_FAIL (Error_Text);
986 if (!comma_present && *Reg_Parse == ',') {
987 comma_present++;
988 Reg_Parse++;
992 /* A max of zero can not be specified directly in the regex since it would
993 signal a max of infinity. This code specifically disallows `{0,0}',
994 `{,0}', and `{0}' which really means nothing to humans but would be
995 interpreted as `{0,infinity}' or `*' if we didn't make this check. */
997 if (digit_present [0] && (min_max [0] == REG_ZERO) && !comma_present) {
999 REG_FAIL ("{0} is an invalid range");
1000 } else if (digit_present [0] && (min_max [0] == REG_ZERO) &&
1001 digit_present [1] && (min_max [1] == REG_ZERO)) {
1003 REG_FAIL ("{0,0} is an invalid range");
1004 } else if (digit_present [1] && (min_max [1] == REG_ZERO)) {
1005 if (digit_present [0]) {
1006 sprintf (Error_Text, "{%lu,0} is an invalid range", min_max [0]);
1007 REG_FAIL (Error_Text);
1008 } else {
1009 REG_FAIL ("{,0} is an invalid range");
1013 if (!comma_present) min_max [1] = min_max [0]; /* {x} means {x,x} */
1015 if (*Reg_Parse != '}') {
1016 REG_FAIL ("{m,n} specification missing right \'}\'");
1018 } else if (min_max [1] != REG_INFINITY && min_max [0] > min_max [1]) {
1019 /* Disallow a backward range. */
1021 sprintf (Error_Text, "{%lu,%lu} is an invalid range",
1022 min_max [0], min_max [1]);
1023 REG_FAIL (Error_Text);
1027 Reg_Parse++;
1029 /* Check for a minimal matching (non-greedy or "lazy") specification. */
1031 if (*Reg_Parse == '?') {
1032 lazy = 1;
1033 Reg_Parse++;
1036 /* Avoid overhead of counting if possible */
1038 if (op_code == '{') {
1039 if (min_max [0] == REG_ZERO && min_max [1] == REG_INFINITY) {
1040 op_code = '*';
1041 } else if (min_max [0] == REG_ONE && min_max [1] == REG_INFINITY) {
1042 op_code = '+';
1043 } else if (min_max [0] == REG_ZERO && min_max [1] == REG_ONE) {
1044 op_code = '?';
1045 } else if (min_max [0] == REG_ONE && min_max [1] == REG_ONE) {
1046 /* "x{1,1}" is the same as "x". No need to pollute the compiled
1047 regex with such nonsense. */
1049 *flag_param = flags_local;
1050 *range_param = range_local;
1051 return (ret_val);
1052 } else if (Num_Braces > (int)UCHAR_MAX) {
1053 sprintf (Error_Text, "number of {m,n} constructs > %d", UCHAR_MAX);
1054 REG_FAIL (Error_Text);
1058 if (op_code == '+') min_max [0] = REG_ONE;
1059 if (op_code == '?') min_max [1] = REG_ONE;
1061 /* It is dangerous to apply certain quantifiers to a possibly zero width
1062 item. */
1064 if (!(flags_local & HAS_WIDTH)) {
1065 if (brace_present) {
1066 sprintf (Error_Text, "{%lu,%lu} operand could be empty",
1067 min_max [0], min_max [1]);
1068 } else {
1069 sprintf (Error_Text, "%c operand could be empty", op_code);
1072 REG_FAIL (Error_Text);
1075 *flag_param = (min_max [0] > REG_ZERO) ? (WORST | HAS_WIDTH) : WORST;
1076 if (range_local.lower >= 0) {
1077 if (min_max[1] != REG_INFINITY) {
1078 range_param->lower = range_local.lower * min_max[0];
1079 range_param->upper = range_local.upper * min_max[1];
1080 } else {
1081 range_param->lower = -1; /* Not a fixed-size length */
1082 range_param->upper = -1;
1084 } else {
1085 range_param->lower = -1; /* Not a fixed-size length */
1086 range_param->upper = -1;
1089 /*---------------------------------------------------------------------*
1090 * Symbol Legend For Node Structure Diagrams
1091 *---------------------------------------------------------------------*
1092 * (...) = general grouped thing
1093 * B = (B)ranch, K = bac(K), N = (N)othing
1094 * I = (I)nitialize count, C = Increment (C)ount
1095 * T~m = (T)est against mini(m)um- go to NEXT pointer if >= operand
1096 * T~x = (T)est against ma(x)imum- go to NEXT pointer if >= operand
1097 * '~' = NEXT pointer, \___| = forward pointer, |___/ = Backward pointer
1098 *---------------------------------------------------------------------*/
1100 if (op_code == '*' && (flags_local & SIMPLE)) {
1101 insert ((lazy ? LAZY_STAR : STAR), ret_val, 0UL, 0UL, 0);
1103 } else if (op_code == '+' && (flags_local & SIMPLE)) {
1104 insert (lazy ? LAZY_PLUS : PLUS, ret_val, 0UL, 0UL, 0);
1106 } else if (op_code == '?' && (flags_local & SIMPLE)) {
1107 insert (lazy ? LAZY_QUESTION : QUESTION, ret_val, 0UL, 0UL, 0);
1109 } else if (op_code == '{' && (flags_local & SIMPLE)) {
1110 insert (lazy ? LAZY_BRACE : BRACE, ret_val, min_max [0], min_max [1], 0);
1112 } else if ((op_code == '*' || op_code == '+') && lazy) {
1113 /* Node structure for (x)*? Node structure for (x)+? construct.
1114 * construct. (Same as (x)*? except for initial
1115 * forward jump into parenthesis.)
1117 * ___6____
1118 * _______5_______ /________|______
1119 * | _4__ 1_\ /| ____ | _\
1120 * |/ | / |\ / |/ | | / |\
1121 * B~ N~ B~ (...)~ K~ N~ N~ B~ N~ B~ (...)~ K~ N~
1122 * \ \___2_______| \ \___________|
1123 * \_____3_______| \_____________|
1127 tail (ret_val, emit_node (BACK)); /* 1 */
1128 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 2,4 */
1129 (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 3 */
1131 next = emit_node (NOTHING); /* 2,3 */
1133 offset_tail (ret_val, NODE_SIZE, next); /* 2 */
1134 tail (ret_val, next); /* 3 */
1135 insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 4,5 */
1136 tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 4 */
1137 offset_tail (ret_val, 3 * NODE_SIZE, ret_val); /* 5 */
1139 if (op_code == '+') {
1140 insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 6 */
1141 tail (ret_val, ret_val + (4 * NODE_SIZE)); /* 6 */
1143 } else if (op_code == '*') {
1144 /* Node structure for (x)* construct.
1145 * ____1_____
1146 * | \
1147 * B~ (...)~ K~ B~ N~
1148 * \ \_|2 |\_|
1149 * \__3_______| 4
1152 insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 1,3 */
1153 offset_tail (ret_val, NODE_SIZE, emit_node (BACK)); /* 2 */
1154 offset_tail (ret_val, NODE_SIZE, ret_val); /* 1 */
1155 tail (ret_val, emit_node (BRANCH)); /* 3 */
1156 tail (ret_val, emit_node (NOTHING)); /* 4 */
1157 } else if (op_code == '+') {
1158 /* Node structure for (x)+ construct.
1160 * ____2_____
1161 * | \
1162 * (...)~ B~ K~ B~ N~
1163 * \_|\____|\_|
1164 * 1 3 4
1167 next = emit_node (BRANCH); /* 1 */
1169 tail (ret_val, next); /* 1 */
1170 tail (emit_node (BACK), ret_val); /* 2 */
1171 tail (next, emit_node (BRANCH)); /* 3 */
1172 tail (ret_val, emit_node (NOTHING)); /* 4 */
1173 } else if (op_code == '?' && lazy) {
1174 /* Node structure for (x)?? construct.
1175 * _4__ 1_
1176 * / | / |
1177 * B~ N~ B~ (...)~ N~
1178 * \ \___2____|
1179 * \_____3____|
1182 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 2,4 */
1183 (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 3 */
1185 next = emit_node (NOTHING); /* 1,2,3 */
1187 offset_tail (ret_val, 2 * NODE_SIZE, next); /* 1 */
1188 offset_tail (ret_val, NODE_SIZE, next); /* 2 */
1189 tail (ret_val, next); /* 3 */
1190 insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 4 */
1191 tail (ret_val, (ret_val + (2 * NODE_SIZE))); /* 4 */
1193 } else if (op_code == '?') {
1194 /* Node structure for (x)? construct.
1195 * ___1____ _2
1196 * / |/ |
1197 * B~ (...)~ B~ N~
1198 * \__3_|
1201 insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 1 */
1202 tail (ret_val, emit_node (BRANCH)); /* 1 */
1204 next = emit_node (NOTHING); /* 2,3 */
1206 tail (ret_val, next); /* 2 */
1207 offset_tail (ret_val, NODE_SIZE, next); /* 3 */
1208 } else if (op_code == '{' && min_max [0] == min_max [1]) {
1209 /* Node structure for (x){m}, (x){m}?, (x){m,m}, or (x){m,m}? constructs.
1210 * Note that minimal and maximal matching mean the same thing when we
1211 * specify the minimum and maximum to be the same value.
1212 * _______3_____
1213 * | 1_ _2 \
1214 * | / |/ | \
1215 * I~ (...)~ C~ T~m K~ N~
1216 * \_| \_____|
1217 * 5 4
1220 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1221 tail (ret_val, emit_special (TEST_COUNT, min_max [0], Num_Braces));/* 2 */
1222 tail (emit_node (BACK), ret_val); /* 3 */
1223 tail (ret_val, emit_node (NOTHING)); /* 4 */
1225 next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 5 */
1227 tail (ret_val, next); /* 5 */
1229 Num_Braces++;
1230 } else if (op_code == '{' && lazy) {
1231 if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
1232 /* Node structure for (x){0,n}? or {,n}? construct.
1233 * _________3____________
1234 * 8_| _4__ 1_ _2 \
1235 * / |/ | / |/ | \
1236 * I~ B~ N~ B~ (...)~ C~ T~x K~ N~
1237 * \ \ \__7__|
1238 * \ \_________6_______|
1239 * \______5____________|
1242 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1244 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,7 */
1246 tail (ret_val, next); /* 2 */
1247 (void) insert (BRANCH, ret_val, 0UL, 0UL, Num_Braces); /* 4,6 */
1248 (void) insert (NOTHING, ret_val, 0UL, 0UL, Num_Braces); /* 5 */
1249 (void) insert (BRANCH, ret_val, 0UL, 0UL, Num_Braces); /* 3,4,8 */
1250 tail (emit_node (BACK), ret_val); /* 3 */
1251 tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 4 */
1253 next = emit_node (NOTHING); /* 5,6,7 */
1255 offset_tail (ret_val, NODE_SIZE, next); /* 5 */
1256 offset_tail (ret_val, 2 * NODE_SIZE, next); /* 6 */
1257 offset_tail (ret_val, 3 * NODE_SIZE, next); /* 7 */
1259 next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 8 */
1261 tail (ret_val, next); /* 8 */
1263 } else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
1264 /* Node structure for (x){m,}? construct.
1265 * ______8_________________
1266 * | _______3_____ \
1267 * | _7__ | 1_ _2 \ \
1268 * |/ | | / |/ | \ \
1269 * I~ B~ N~ B~ (...)~ C~ T~m K~ K~ N~
1270 * \_____\__\_| \_4___| |
1271 * 9 \ \_________5__________|
1272 * \_______6______________|
1275 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1277 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,4 */
1279 tail (ret_val, next); /* 2 */
1280 tail (emit_node (BACK), ret_val); /* 3 */
1281 tail (ret_val, emit_node (BACK)); /* 4 */
1282 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 5,7 */
1283 (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 6 */
1285 next = emit_node (NOTHING); /* 5,6 */
1287 offset_tail (ret_val, NODE_SIZE, next); /* 5 */
1288 tail (ret_val, next); /* 6 */
1289 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 7,8 */
1290 tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 7 */
1291 offset_tail (ret_val, 3 * NODE_SIZE, ret_val); /* 8 */
1292 (void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
1293 tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE)); /* 9 */
1295 } else {
1296 /* Node structure for (x){m,n}? construct.
1297 * ______9_____________________
1298 * | _____________3___ \
1299 * | __8_ | 1_ _2 \ \
1300 * |/ | | / |/ | \ \
1301 * I~ B~ N~ B~ (...)~ C~ T~x T~m K~ K~ N~
1302 * \_____\__\_| \ \__4__| |
1303 * 10 \ \ \_7_________|
1304 * \ \_________6_____________|
1305 * \_______5_________________|
1308 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1310 next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,7 */
1312 tail (ret_val, next); /* 2 */
1314 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
1316 tail (emit_node (BACK), ret_val); /* 3 */
1317 tail (next, emit_node (BACK)); /* 4 */
1318 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 6,8 */
1319 (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 5 */
1320 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 8,9 */
1322 next = emit_node (NOTHING); /* 5,6,7 */
1324 offset_tail (ret_val, NODE_SIZE, next); /* 5 */
1325 offset_tail (ret_val, 2 * NODE_SIZE, next); /* 6 */
1326 offset_tail (ret_val, 3 * NODE_SIZE, next); /* 7 */
1327 tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 8 */
1328 offset_tail (next, -NODE_SIZE, ret_val); /* 9 */
1329 insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 10 */
1330 tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE)); /* 10 */
1333 Num_Braces++;
1334 } else if (op_code == '{') {
1335 if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
1336 /* Node structure for (x){0,n} or (x){,n} construct.
1338 * ___3____________
1339 * | 1_ _2 \ 5_
1340 * | / |/ | \ / |
1341 * I~ B~ (...)~ C~ T~x K~ B~ N~
1342 * \_|\ \_6___|__|
1343 * 7 \________4________|
1346 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1348 next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,6 */
1350 tail (ret_val, next); /* 2 */
1351 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 3,4,7 */
1352 tail (emit_node (BACK), ret_val); /* 3 */
1354 next = emit_node (BRANCH); /* 4,5 */
1356 tail (ret_val, next); /* 4 */
1357 tail (next, emit_node (NOTHING)); /* 5,6 */
1358 offset_tail (ret_val, NODE_SIZE, next); /* 6 */
1360 next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 7 */
1362 tail (ret_val, next); /* 7 */
1364 } else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
1365 /* Node structure for (x){m,} construct.
1366 * __________4________
1367 * | __3__________ \
1368 * _|___| 1_ _2 \ \ _7
1369 * / | 8 | / |/ | \ \ / |
1370 * I~ B~ (...)~ C~ T~m K~ K~ B~ N~
1371 * \ \_5___| |
1372 * \__________6__________|
1375 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1377 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2 */
1379 tail (ret_val, next); /* 2 */
1380 tail (emit_node (BACK), ret_val); /* 3 */
1381 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 4,6 */
1383 next = emit_node (BACK); /* 4 */
1385 tail (next, ret_val); /* 4 */
1386 offset_tail (ret_val, NODE_SIZE, next); /* 5 */
1387 tail (ret_val, emit_node (BRANCH)); /* 6 */
1388 tail (ret_val, emit_node (NOTHING)); /* 7 */
1390 insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 8 */
1392 tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE)); /* 8 */
1394 } else {
1395 /* Node structure for (x){m,n} construct.
1396 * _____6________________
1397 * | _____________3___ \
1398 * 9_|__| 1_ _2 \ \ _8
1399 * / | | / |/ | \ \ / |
1400 * I~ B~ (...)~ C~ T~x T~m K~ K~ B~ N~
1401 * \ \ \__4__| | |
1402 * \ \_7_________|__|
1403 * \_________5_____________|
1406 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1408 next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,4 */
1410 tail (ret_val, next); /* 2 */
1412 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
1414 tail (emit_node (BACK), ret_val); /* 3 */
1415 tail (next, emit_node (BACK)); /* 4 */
1416 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 5,6 */
1418 next = emit_node (BRANCH); /* 5,8 */
1420 tail (ret_val, next); /* 5 */
1421 offset_tail (next, -NODE_SIZE, ret_val); /* 6 */
1423 next = emit_node (NOTHING); /* 7,8 */
1425 offset_tail (ret_val, NODE_SIZE, next); /* 7 */
1427 offset_tail (next, -NODE_SIZE, next); /* 8 */
1428 (void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
1429 tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE)); /* 9 */
1432 Num_Braces++;
1433 } else {
1434 /* We get here if the IS_QUANTIFIER macro is not coordinated properly
1435 with this function. */
1437 REG_FAIL ("internal error #2, `piece\'");
1440 if (IS_QUANTIFIER (*Reg_Parse)) {
1441 if (op_code == '{') {
1442 sprintf (Error_Text, "nested quantifiers, {m,n}%c", *Reg_Parse);
1443 } else {
1444 sprintf (Error_Text, "nested quantifiers, %c%c", op_code, *Reg_Parse);
1447 REG_FAIL (Error_Text);
1450 return (ret_val);
1453 /*----------------------------------------------------------------------*
1454 * atom
1456 * Process one regex item at the lowest level
1458 * OPTIMIZATION: Lumps a continuous sequence of ordinary characters
1459 * together so that it can turn them into a single EXACTLY node, which
1460 * is smaller to store and faster to run.
1461 *----------------------------------------------------------------------*/
1463 static unsigned char * atom (int *flag_param, len_range *range_param) {
1465 register unsigned char *ret_val;
1466 unsigned char test;
1467 int flags_local;
1468 len_range range_local;
1470 *flag_param = WORST; /* Tentatively. */
1471 range_param->lower = 0; /* Idem */
1472 range_param->upper = 0;
1474 /* Process any regex comments, e.g. `(?# match next token->)'. The
1475 terminating right parenthesis can not be escaped. The comment stops at
1476 the first right parenthesis encountered (or the end of the regex
1477 string)... period. Handles multiple sequential comments,
1478 e.g. `(?# one)(?# two)...' */
1480 while (*Reg_Parse == '(' &&
1481 *(Reg_Parse + 1) == '?' &&
1482 *(Reg_Parse + 2) == '#') {
1484 Reg_Parse += 3;
1486 while (*Reg_Parse != ')' && *Reg_Parse != '\0') {
1487 Reg_Parse++;
1490 if (*Reg_Parse == ')') {
1491 Reg_Parse++;
1494 if (*Reg_Parse == ')' || *Reg_Parse == '|' || *Reg_Parse == '\0') {
1495 /* Hit end of regex string or end of parenthesized regex; have to
1496 return "something" (i.e. a NOTHING node) to avoid generating an
1497 error. */
1499 ret_val = emit_node (NOTHING);
1501 return (ret_val);
1505 switch (*Reg_Parse++) {
1506 case '^':
1507 ret_val = emit_node (BOL);
1508 break;
1510 case '$':
1511 ret_val = emit_node (EOL);
1512 break;
1514 case '<':
1515 ret_val = emit_node (BOWORD);
1516 break;
1518 case '>':
1519 ret_val = emit_node (EOWORD);
1520 break;
1522 case '.':
1523 if (Match_Newline) {
1524 ret_val = emit_node (EVERY);
1525 } else {
1526 ret_val = emit_node (ANY);
1529 *flag_param |= (HAS_WIDTH | SIMPLE);
1530 range_param->lower = 1;
1531 range_param->upper = 1;
1532 break;
1534 case '(':
1535 if (*Reg_Parse == '?') { /* Special parenthetical expression */
1536 Reg_Parse++;
1537 range_local.lower = 0; /* Make sure it is always used */
1538 range_local.upper = 0;
1540 if (*Reg_Parse == ':') {
1541 Reg_Parse++;
1542 ret_val = chunk (NO_CAPTURE, &flags_local, &range_local);
1543 } else if (*Reg_Parse == '=') {
1544 Reg_Parse++;
1545 ret_val = chunk (POS_AHEAD_OPEN, &flags_local, &range_local);
1546 } else if (*Reg_Parse == '!') {
1547 Reg_Parse++;
1548 ret_val = chunk (NEG_AHEAD_OPEN, &flags_local, &range_local);
1549 } else if (*Reg_Parse == 'i') {
1550 Reg_Parse++;
1551 ret_val = chunk (INSENSITIVE, &flags_local, &range_local);
1552 } else if (*Reg_Parse == 'I') {
1553 Reg_Parse++;
1554 ret_val = chunk (SENSITIVE, &flags_local, &range_local);
1555 } else if (*Reg_Parse == 'n') {
1556 Reg_Parse++;
1557 ret_val = chunk (NEWLINE, &flags_local, &range_local);
1558 } else if (*Reg_Parse == 'N') {
1559 Reg_Parse++;
1560 ret_val = chunk (NO_NEWLINE, &flags_local, &range_local);
1561 } else if (*Reg_Parse == '<') {
1562 Reg_Parse++;
1563 if (*Reg_Parse == '=') {
1564 Reg_Parse++;
1565 ret_val = chunk (POS_BEHIND_OPEN, &flags_local, &range_local);
1566 } else if (*Reg_Parse == '!') {
1567 Reg_Parse++;
1568 ret_val = chunk (NEG_BEHIND_OPEN, &flags_local, &range_local);
1569 } else {
1570 sprintf (Error_Text,
1571 "invalid look-behind syntax, \"(?<%c...)\"",
1572 *Reg_Parse);
1574 REG_FAIL (Error_Text);
1576 } else {
1577 sprintf (Error_Text,
1578 "invalid grouping syntax, \"(?%c...)\"",
1579 *Reg_Parse);
1581 REG_FAIL (Error_Text);
1583 } else { /* Normal capturing parentheses */
1584 ret_val = chunk (PAREN, &flags_local, &range_local);
1587 if (ret_val == NULL) return (NULL); /* Something went wrong. */
1589 /* Add HAS_WIDTH flag if it was set by call to chunk. */
1591 *flag_param |= flags_local & HAS_WIDTH;
1592 *range_param = range_local;
1594 break;
1596 case '\0':
1597 case '|':
1598 case ')':
1599 REG_FAIL ("internal error #3, `atom\'"); /* Supposed to be */
1600 /* caught earlier. */
1601 case '?':
1602 case '+':
1603 case '*':
1604 sprintf (Error_Text, "%c follows nothing", *(Reg_Parse - 1));
1605 REG_FAIL (Error_Text);
1607 case '{':
1608 if (Enable_Counting_Quantifier) {
1609 REG_FAIL ("{m,n} follows nothing");
1610 } else {
1611 ret_val = emit_node (EXACTLY); /* Treat braces as literals. */
1612 emit_byte ('{');
1613 emit_byte ('\0');
1614 range_param->lower = 1;
1615 range_param->upper = 1;
1618 break;
1620 case '[':
1622 register unsigned int second_value;
1623 register unsigned int last_value;
1624 unsigned char last_emit = 0;
1626 /* Handle characters that can only occur at the start of a class. */
1628 if (*Reg_Parse == '^') { /* Complement of range. */
1629 ret_val = emit_node (ANY_BUT);
1630 Reg_Parse++;
1632 /* All negated classes include newline unless escaped with
1633 a "(?n)" switch. */
1635 if (!Match_Newline) emit_byte ('\n');
1636 } else {
1637 ret_val = emit_node (ANY_OF);
1640 if (*Reg_Parse == ']' || *Reg_Parse == '-') {
1641 /* If '-' or ']' is the first character in a class,
1642 it is a literal character in the class. */
1644 last_emit = *Reg_Parse;
1645 emit_byte (*Reg_Parse);
1646 Reg_Parse++;
1649 /* Handle the rest of the class characters. */
1651 while (*Reg_Parse != '\0' && *Reg_Parse != ']') {
1652 if (*Reg_Parse == '-') { /* Process a range, e.g [a-z]. */
1653 Reg_Parse++;
1655 if (*Reg_Parse == ']' || *Reg_Parse == '\0') {
1656 /* If '-' is the last character in a class it is a literal
1657 character. If `Reg_Parse' points to the end of the
1658 regex string, an error will be generated later. */
1660 emit_byte ('-');
1661 last_emit = '-';
1662 } else {
1663 /* We must get the range starting character value from the
1664 emitted code since it may have been an escaped
1665 character. `second_value' is set one larger than the
1666 just emitted character value. This is done since
1667 `second_value' is used as the start value for the loop
1668 that emits the values in the range. Since we have
1669 already emitted the first character of the class, we do
1670 not want to emit it again. */
1672 second_value = ((unsigned int) last_emit) + 1;
1674 if (*Reg_Parse == '\\') {
1675 /* Handle escaped characters within a class range.
1676 Specifically disallow shortcut escapes as the end of
1677 a class range. To allow this would be ambiguous
1678 since shortcut escapes represent a set of characters,
1679 and it would not be clear which character of the
1680 class should be treated as the "last" character. */
1682 Reg_Parse++;
1684 if ((test = numeric_escape (*Reg_Parse, &Reg_Parse))) {
1685 last_value = (unsigned int) test;
1686 } else if ((test = literal_escape (*Reg_Parse))) {
1687 last_value = (unsigned int) test;
1688 } else if (shortcut_escape (*Reg_Parse,
1689 NULL,
1690 CHECK_CLASS_ESCAPE)) {
1691 sprintf (Error_Text,
1692 "\\%c is not allowed as range operand",
1693 *Reg_Parse);
1695 REG_FAIL (Error_Text);
1696 } else {
1697 sprintf (
1698 Error_Text,
1699 "\\%c is an invalid char class escape sequence",
1700 *Reg_Parse);
1702 REG_FAIL (Error_Text);
1704 } else {
1705 last_value = U_CHAR_AT (Reg_Parse);
1708 if (Is_Case_Insensitive) {
1709 second_value =
1710 (unsigned int) tolower ((int) second_value);
1711 last_value =
1712 (unsigned int) tolower ((int) last_value);
1715 /* For case insensitive, something like [A-_] will
1716 generate an error here since ranges are converted to
1717 lower case. */
1719 if (second_value - 1 > last_value) {
1720 REG_FAIL ("invalid [] range");
1723 /* If only one character in range (e.g [a-a]) then this
1724 loop is not run since the first character of any range
1725 was emitted by the previous iteration of while loop. */
1727 for (; second_value <= last_value; second_value++) {
1728 emit_class_byte (second_value);
1731 last_emit = (unsigned char) last_value;
1733 Reg_Parse++;
1735 } /* End class character range code. */
1736 } else if (*Reg_Parse == '\\') {
1737 Reg_Parse++;
1739 if ((test = numeric_escape (*Reg_Parse, &Reg_Parse)) != '\0') {
1740 emit_class_byte (test);
1742 last_emit = test;
1743 } else if ((test = literal_escape (*Reg_Parse)) != '\0') {
1744 emit_byte (test);
1745 last_emit = test;
1746 } else if (shortcut_escape (*Reg_Parse,
1747 NULL,
1748 CHECK_CLASS_ESCAPE)) {
1750 if (*(Reg_Parse + 1) == '-') {
1751 /* Specifically disallow shortcut escapes as the start
1752 of a character class range (see comment above.) */
1754 sprintf (Error_Text,
1755 "\\%c not allowed as range operand",
1756 *Reg_Parse);
1758 REG_FAIL (Error_Text);
1759 } else {
1760 /* Emit the bytes that are part of the shortcut
1761 escape sequence's range (e.g. \d = 0123456789) */
1763 shortcut_escape (*Reg_Parse, NULL, EMIT_CLASS_BYTES);
1765 } else {
1766 sprintf (Error_Text,
1767 "\\%c is an invalid char class escape sequence",
1768 *Reg_Parse);
1770 REG_FAIL (Error_Text);
1773 Reg_Parse++;
1775 /* End of class escaped sequence code */
1776 } else {
1777 emit_class_byte (*Reg_Parse); /* Ordinary class character. */
1779 last_emit = *Reg_Parse;
1780 Reg_Parse++;
1782 } /* End of while (*Reg_Parse != '\0' && *Reg_Parse != ']') */
1784 if (*Reg_Parse != ']') REG_FAIL ("missing right \']\'");
1786 emit_byte('\0');
1788 /* NOTE: it is impossible to specify an empty class. This is
1789 because [] would be interpreted as "begin character class"
1790 followed by a literal ']' character and no "end character class"
1791 delimiter (']'). Because of this, it is always safe to assume
1792 that a class HAS_WIDTH. */
1794 Reg_Parse++;
1795 *flag_param |= HAS_WIDTH | SIMPLE;
1796 range_param->lower = 1;
1797 range_param->upper = 1;
1800 break; /* End of character class code. */
1802 case '\\':
1803 /* Force Error_Text to have a length of zero. This way we can tell if
1804 either of the calls to shortcut_escape() or back_ref() fill
1805 Error_Text with an error message. */
1807 Error_Text [0] = '\0';
1809 if ((ret_val = shortcut_escape (*Reg_Parse, flag_param, EMIT_NODE))) {
1811 Reg_Parse++;
1812 range_param->lower = 1;
1813 range_param->upper = 1;
1814 break;
1816 } else if ((ret_val = back_ref (Reg_Parse, flag_param, EMIT_NODE))) {
1817 /* Can't make any assumptions about a back-reference as to SIMPLE
1818 or HAS_WIDTH. For example (^|<) is neither simple nor has
1819 width. So we don't flip bits in flag_param here. */
1821 Reg_Parse++;
1822 /* Back-references always have an unknown length */
1823 range_param->lower = -1;
1824 range_param->upper = -1;
1825 break;
1828 if (strlen (Error_Text) > 0) REG_FAIL (Error_Text);
1830 /* At this point it is apparent that the escaped character is not a
1831 shortcut escape or back-reference. Back up one character to allow
1832 the default code to include it as an ordinary character. */
1834 /* Fall through to Default case to handle literal escapes and numeric
1835 escapes. */
1837 default:
1838 Reg_Parse--; /* If we fell through from the above code, we are now
1839 pointing at the back slash (\) character. */
1841 unsigned char *parse_save;
1842 int len = 0;
1844 if (Is_Case_Insensitive) {
1845 ret_val = emit_node (SIMILAR);
1846 } else {
1847 ret_val = emit_node (EXACTLY);
1850 /* Loop until we find a meta character, shortcut escape, back
1851 reference, or end of regex string. */
1853 for (; *Reg_Parse != '\0' &&
1854 !strchr ((char *) Meta_Char, (int) *Reg_Parse);
1855 len++) {
1857 /* Save where we are in case we have to back
1858 this character out. */
1860 parse_save = Reg_Parse;
1862 if (*Reg_Parse == '\\') {
1863 Reg_Parse++; /* Point to escaped character */
1865 Error_Text [0] = '\0'; /* See comment above. */
1867 if ((test = numeric_escape (*Reg_Parse, &Reg_Parse))) {
1868 if (Is_Case_Insensitive) {
1869 emit_byte (tolower (test));
1870 } else {
1871 emit_byte (test);
1873 } else if ((test = literal_escape (*Reg_Parse))) {
1874 emit_byte (test);
1875 } else if (back_ref (Reg_Parse, NULL, CHECK_ESCAPE)) {
1876 /* Leave back reference for next `atom' call */
1878 Reg_Parse--; break;
1879 } else if (shortcut_escape (*Reg_Parse, NULL, CHECK_ESCAPE)) {
1880 /* Leave shortcut escape for next `atom' call */
1882 Reg_Parse--; break;
1883 } else {
1884 if (strlen (Error_Text) == 0) {
1885 /* None of the above calls generated an error message
1886 so generate our own here. */
1888 sprintf (Error_Text,
1889 "\\%c is an invalid escape sequence",
1890 *Reg_Parse);
1893 REG_FAIL (Error_Text);
1896 Reg_Parse++;
1897 } else {
1898 /* Ordinary character */
1900 if (Is_Case_Insensitive) {
1901 emit_byte (tolower (*Reg_Parse));
1902 } else {
1903 emit_byte (*Reg_Parse);
1906 Reg_Parse++;
1909 /* If next regex token is a quantifier (?, +. *, or {m,n}) and
1910 our EXACTLY node so far is more than one character, leave the
1911 last character to be made into an EXACTLY node one character
1912 wide for the multiplier to act on. For example 'abcd* would
1913 have an EXACTLY node with an 'abc' operand followed by a STAR
1914 node followed by another EXACTLY node with a 'd' operand. */
1916 if (IS_QUANTIFIER (*Reg_Parse) && len > 0) {
1917 Reg_Parse = parse_save; /* Point to previous regex token. */
1919 if (Code_Emit_Ptr == &Compute_Size) {
1920 Reg_Size--;
1921 } else {
1922 Code_Emit_Ptr--; /* Write over previously emitted byte. */
1925 break;
1929 if (len <= 0) REG_FAIL ("internal error #4, `atom\'");
1931 *flag_param |= HAS_WIDTH;
1933 if (len == 1) *flag_param |= SIMPLE;
1935 range_param->lower = len;
1936 range_param->upper = len;
1938 emit_byte ('\0');
1940 } /* END switch (*Reg_Parse++) */
1942 return (ret_val);
1945 /*----------------------------------------------------------------------*
1946 * emit_node
1948 * Emit (if appropriate) the op code for a regex node atom.
1950 * The NEXT pointer is initialized to NULL.
1952 * Returns a pointer to the START of the emitted node.
1953 *----------------------------------------------------------------------*/
1955 static unsigned char * emit_node (int op_code) {
1957 register unsigned char *ret_val;
1958 register unsigned char *ptr;
1960 ret_val = Code_Emit_Ptr; /* Return address of start of node */
1962 if (ret_val == &Compute_Size) {
1963 Reg_Size += NODE_SIZE;
1964 } else {
1965 ptr = ret_val;
1966 *ptr++ = (unsigned char) op_code;
1967 *ptr++ = '\0'; /* Null "NEXT" pointer. */
1968 *ptr++ = '\0';
1970 Code_Emit_Ptr = ptr;
1973 return (ret_val);
1976 /*----------------------------------------------------------------------*
1977 * emit_byte
1979 * Emit (if appropriate) a byte of code (usually part of an operand.)
1980 *----------------------------------------------------------------------*/
1982 static void emit_byte (unsigned char c) {
1984 if (Code_Emit_Ptr == &Compute_Size) {
1985 Reg_Size++;
1986 } else {
1987 *Code_Emit_Ptr++ = c;
1991 /*----------------------------------------------------------------------*
1992 * emit_class_byte
1994 * Emit (if appropriate) a byte of code (usually part of a character
1995 * class operand.)
1996 *----------------------------------------------------------------------*/
1998 static void emit_class_byte (unsigned char c) {
2000 if (Code_Emit_Ptr == &Compute_Size) {
2001 Reg_Size++;
2003 if (Is_Case_Insensitive && isalpha (c)) Reg_Size++;
2004 } else if (Is_Case_Insensitive && isalpha (c)) {
2005 /* For case insensitive character classes, emit both upper and lower case
2006 versions of alphabetical characters. */
2008 *Code_Emit_Ptr++ = tolower (c);
2009 *Code_Emit_Ptr++ = toupper (c);
2010 } else {
2011 *Code_Emit_Ptr++ = c;
2015 /*----------------------------------------------------------------------*
2016 * emit_special
2018 * Emit nodes that need special processing.
2019 *----------------------------------------------------------------------*/
2021 static unsigned char * emit_special (
2022 unsigned char op_code,
2023 unsigned long test_val,
2024 int index) {
2026 register unsigned char *ret_val = &Compute_Size;
2027 register unsigned char *ptr;
2029 if (Code_Emit_Ptr == &Compute_Size) {
2030 switch (op_code) {
2031 case POS_BEHIND_OPEN:
2032 case NEG_BEHIND_OPEN:
2033 Reg_Size += LENGTH_SIZE; /* Length of the look-behind match */
2034 Reg_Size += NODE_SIZE; /* Make room for the node */
2035 break;
2037 case TEST_COUNT:
2038 Reg_Size += NEXT_PTR_SIZE; /* Make room for a test value. */
2040 case INC_COUNT:
2041 Reg_Size += INDEX_SIZE; /* Make room for an index value. */
2043 default:
2044 Reg_Size += NODE_SIZE; /* Make room for the node. */
2046 } else {
2047 ret_val = emit_node (op_code); /* Return the address for start of node. */
2048 ptr = Code_Emit_Ptr;
2050 if (op_code == INC_COUNT || op_code == TEST_COUNT) {
2051 *ptr++ = (unsigned char) index;
2053 if (op_code == TEST_COUNT) {
2054 *ptr++ = PUT_OFFSET_L (test_val);
2055 *ptr++ = PUT_OFFSET_R (test_val);
2057 } else if (op_code == POS_BEHIND_OPEN || op_code == NEG_BEHIND_OPEN) {
2058 *ptr++ = PUT_OFFSET_L (test_val);
2059 *ptr++ = PUT_OFFSET_R (test_val);
2060 *ptr++ = PUT_OFFSET_L (test_val);
2061 *ptr++ = PUT_OFFSET_R (test_val);
2064 Code_Emit_Ptr = ptr;
2067 return (ret_val);
2070 /*----------------------------------------------------------------------*
2071 * insert
2073 * Insert a node in front of already emitted node(s). Means relocating
2074 * the operand. Code_Emit_Ptr points one byte past the just emitted
2075 * node and operand. The parameter `insert_pos' points to the location
2076 * where the new node is to be inserted.
2077 *----------------------------------------------------------------------*/
2079 static unsigned char * insert (
2080 unsigned char op,
2081 unsigned char *insert_pos,
2082 long min,
2083 long max,
2084 int index) {
2086 register unsigned char *src;
2087 register unsigned char *dst;
2088 unsigned char *place;
2089 int insert_size = NODE_SIZE;
2091 if (op == BRACE || op == LAZY_BRACE) {
2092 /* Make room for the min and max values. */
2094 insert_size += (2 * NEXT_PTR_SIZE);
2095 } else if (op == INIT_COUNT) {
2096 /* Make room for an index value . */
2098 insert_size += INDEX_SIZE;
2101 if (Code_Emit_Ptr == &Compute_Size) {
2102 Reg_Size += insert_size;
2103 return &Compute_Size;
2106 src = Code_Emit_Ptr;
2107 Code_Emit_Ptr += insert_size;
2108 dst = Code_Emit_Ptr;
2110 /* Relocate the existing emitted code to make room for the new node. */
2112 while (src > insert_pos) *--dst = *--src;
2114 place = insert_pos; /* Where operand used to be. */
2115 *place++ = op; /* Inserted operand. */
2116 *place++ = '\0'; /* NEXT pointer for inserted operand. */
2117 *place++ = '\0';
2119 if (op == BRACE || op == LAZY_BRACE) {
2120 *place++ = PUT_OFFSET_L (min);
2121 *place++ = PUT_OFFSET_R (min);
2123 *place++ = PUT_OFFSET_L (max);
2124 *place++ = PUT_OFFSET_R (max);
2125 } else if (op == INIT_COUNT) {
2126 *place++ = (unsigned char) index;
2129 return place; /* Return a pointer to the start of the code moved. */
2132 /*----------------------------------------------------------------------*
2133 * tail - Set the next-pointer at the end of a node chain.
2134 *----------------------------------------------------------------------*/
2136 static void tail (unsigned char *search_from, unsigned char *point_to) {
2138 register unsigned char *scan;
2139 register unsigned char *next;
2140 register int offset;
2142 if (search_from == &Compute_Size) return;
2144 /* Find the last node in the chain (node with a null NEXT pointer) */
2146 scan = search_from;
2148 for (;;) {
2149 next = next_ptr (scan);
2151 if (!next) break;
2153 scan = next;
2156 if (GET_OP_CODE (scan) == BACK) {
2157 offset = scan - point_to;
2158 } else {
2159 offset = point_to - scan;
2162 /* Set NEXT pointer */
2164 *(scan + 1) = PUT_OFFSET_L (offset);
2165 *(scan + 2) = PUT_OFFSET_R (offset);
2168 /*--------------------------------------------------------------------*
2169 * offset_tail
2171 * Perform a tail operation on (ptr + offset).
2172 *--------------------------------------------------------------------*/
2174 static void offset_tail (unsigned char *ptr, int offset, unsigned char *val) {
2176 if (ptr == &Compute_Size || ptr == NULL) return;
2178 tail (ptr + offset, val);
2181 /*--------------------------------------------------------------------*
2182 * branch_tail
2184 * Perform a tail operation on (ptr + offset) but only if `ptr' is a
2185 * BRANCH node.
2186 *--------------------------------------------------------------------*/
2188 static void branch_tail (unsigned char *ptr, int offset, unsigned char *val) {
2190 if (ptr == &Compute_Size || ptr == NULL ||GET_OP_CODE (ptr) != BRANCH) {
2191 return;
2194 tail (ptr + offset, val);
2197 /*--------------------------------------------------------------------*
2198 * shortcut_escape
2200 * Implements convenient escape sequences that represent entire
2201 * character classes or special location assertions (similar to escapes
2202 * supported by Perl)
2204 * \d Digits [0-9] |
2205 * \D NOT a digit [^0-9] | (Examples
2206 * \l Letters [a-zA-Z] | at left
2207 * \L NOT a Letter [^a-zA-Z] | are
2208 * \s Whitespace [ \t\n\r\f\v] | for
2209 * \S NOT Whitespace [^ \t\n\r\f\v] | C
2210 * \w "Word" character [a-zA-Z0-9_] | Locale)
2211 * \W NOT a "Word" character [^a-zA-Z0-9_] _|
2213 * \B Matches any character that is NOT a word-delimiter
2215 * Codes for the "emit" parameter:
2217 * EMIT_NODE
2218 * Emit a shortcut node. Shortcut nodes have an implied set of
2219 * class characters. This helps keep the compiled regex string
2220 * small.
2222 * EMIT_CLASS_BYTES
2223 * Emit just the equivalent characters of the class. This makes
2224 * the escape usable from within a class, e.g. [a-fA-F\d]. Only
2225 * \d, \D, \s, \S, \w, and \W can be used within a class.
2227 * CHECK_ESCAPE
2228 * Only verify that this is a valid shortcut escape.
2230 * CHECK_CLASS_ESCAPE
2231 * Same as CHECK_ESCAPE but only allows characters valid within
2232 * a class.
2234 *--------------------------------------------------------------------*/
2236 static unsigned char * shortcut_escape (
2237 unsigned char c,
2238 int *flag_param,
2239 int emit) {
2241 register unsigned char *class = NULL;
2242 static unsigned char *codes = (unsigned char *) "ByYdDlLsSwW";
2243 unsigned char *ret_val = (unsigned char *) 1; /* Assume success. */
2244 unsigned char *valid_codes;
2246 if (emit == EMIT_CLASS_BYTES || emit == CHECK_CLASS_ESCAPE) {
2247 valid_codes = codes + 3; /* \B, \y and \Y are not allowed in classes */
2248 } else {
2249 valid_codes = codes;
2252 if (!strchr ((char *) valid_codes, (int) c)) {
2253 return NULL; /* Not a valid shortcut escape sequence */
2254 } else if (emit == CHECK_ESCAPE || emit == CHECK_CLASS_ESCAPE) {
2255 return ret_val; /* Just checking if this is a valid shortcut escape. */
2258 switch (c) {
2259 case 'd':
2260 case 'D':
2261 if (emit == EMIT_CLASS_BYTES) {
2262 class = ASCII_Digits;
2263 } else if (emit == EMIT_NODE) {
2264 ret_val = (islower (c) ? emit_node (DIGIT)
2265 : emit_node (NOT_DIGIT));
2268 break;
2270 case 'l':
2271 case 'L':
2272 if (emit == EMIT_CLASS_BYTES) {
2273 class = Letter_Char;
2274 } else if (emit == EMIT_NODE) {
2275 ret_val = (islower (c) ? emit_node (LETTER)
2276 : emit_node (NOT_LETTER));
2279 break;
2281 case 's':
2282 case 'S':
2283 if (emit == EMIT_CLASS_BYTES) {
2284 if (Match_Newline) emit_byte ('\n');
2286 class = White_Space;
2287 } else if (emit == EMIT_NODE) {
2288 if (Match_Newline) {
2289 ret_val = (islower (c) ? emit_node (SPACE_NL)
2290 : emit_node (NOT_SPACE_NL));
2291 } else {
2292 ret_val = (islower (c) ? emit_node (SPACE)
2293 : emit_node (NOT_SPACE));
2297 break;
2299 case 'w':
2300 case 'W':
2301 if (emit == EMIT_CLASS_BYTES) {
2302 class = Word_Char;
2303 } else if (emit == EMIT_NODE) {
2304 ret_val = (islower (c) ? emit_node (WORD_CHAR)
2305 : emit_node (NOT_WORD_CHAR));
2308 break;
2310 /* Since the delimiter table is not available at regex compile time \B,
2311 \Y and \Y can only generate a node. At run time, the delimiter table
2312 will be available for these nodes to use. */
2314 case 'y':
2316 if (emit == EMIT_NODE) {
2317 ret_val = emit_node (IS_DELIM);
2318 } else {
2319 REG_FAIL ("internal error #5 `shortcut_escape\'");
2322 break;
2324 case 'Y':
2326 if (emit == EMIT_NODE) {
2327 ret_val = emit_node (NOT_DELIM);
2328 } else {
2329 REG_FAIL ("internal error #6 `shortcut_escape\'");
2332 break;
2334 case 'B':
2336 if (emit == EMIT_NODE) {
2337 ret_val = emit_node (NOT_BOUNDARY);
2338 } else {
2339 REG_FAIL ("internal error #7 `shortcut_escape\'");
2342 break;
2344 default:
2345 /* We get here if there isn't a case for every character in
2346 the string "codes" */
2348 REG_FAIL ("internal error #8 `shortcut_escape\'");
2351 if (emit == EMIT_NODE && c != 'B') {
2352 *flag_param |= (HAS_WIDTH | SIMPLE);
2355 if (class) {
2356 /* Emit bytes within a character class operand. */
2358 while (*class != '\0') {
2359 emit_byte (*class++);
2363 return ret_val;
2366 /*--------------------------------------------------------------------*
2367 * numeric_escape
2369 * Implements hex and octal numeric escape sequence syntax.
2371 * Hexadecimal Escape: \x## Max of two digits Must have leading 'x'.
2372 * Octal Escape: \0### Max of three digits and not greater
2373 * than 377 octal. Must have leading zero.
2375 * Returns the actual character value or NULL if not a valid hex or
2376 * octal escape. REG_FAIL is called if \x0, \x00, \0, \00, \000, or
2377 * \0000 is specified.
2378 *--------------------------------------------------------------------*/
2380 static unsigned char numeric_escape (
2381 unsigned char c,
2382 unsigned char **parse) {
2384 static unsigned char digits [] = "fedcbaFEDCBA9876543210";
2386 static unsigned int digit_val [] = {
2387 15, 14, 13, 12, 11, 10, /* Lower case Hex digits */
2388 15, 14, 13, 12, 11, 10, /* Upper case Hex digits */
2389 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* Decimal Digits */
2391 register unsigned char *scan;
2392 register unsigned char *pos_ptr;
2393 register unsigned char *digit_str;
2394 unsigned int value = 0;
2395 unsigned int radix = 8;
2396 int width = 3; /* Can not be bigger than \0377 */
2397 int pos_delta = 14;
2398 int i, pos;
2400 switch (c) {
2401 case '0':
2402 digit_str = digits + pos_delta; /* Only use Octal digits, i.e. 0-7. */
2404 break;
2406 case 'x':
2407 case 'X':
2408 width = 2; /* Can not be bigger than \0377 */
2409 radix = 16;
2410 pos_delta = 0;
2411 digit_str = digits; /* Use all of the digit characters. */
2413 break;
2415 default:
2416 return ('\0'); /* Not a numeric escape */
2419 scan = *parse; scan++; /* Only change *parse on success. */
2421 pos_ptr = (unsigned char *) strchr ((char *) digit_str, (int) *scan);
2423 for (i = 0; pos_ptr != NULL && (i < width); i++) {
2424 pos = (pos_ptr - digit_str) + pos_delta;
2425 value = (value * radix) + digit_val [pos];
2427 /* If this digit makes the value over 255, treat this digit as a literal
2428 character instead of part of the numeric escape. For example, \0777
2429 will be processed as \077 (an 'M') and a literal '7' character, NOT
2430 511 decimal which is > 255. */
2432 if (value > 255) {
2433 /* Back out calculations for last digit processed. */
2435 value -= digit_val [pos];
2436 value /= radix;
2438 break; /* Note that scan will not be incremented and still points to
2439 the digit that caused overflow. It will be decremented by
2440 the "else" below to point to the last character that is
2441 considered to be part of the octal escape. */
2444 scan++;
2445 pos_ptr = (unsigned char *) strchr ((char *) digit_str, (int) *scan);
2448 /* Handle the case of "\0" i.e. trying to specify a NULL character. */
2450 if (value == 0) {
2451 if (c == '0') {
2452 sprintf (Error_Text, "\\00 is an invalid octal escape");
2453 } else {
2454 sprintf (Error_Text, "\\%c0 is an invalid hexadecimal escape", c);
2456 } else {
2457 /* Point to the last character of the number on success. */
2459 scan--;
2460 *parse = scan;
2463 return (unsigned char) value;
2466 /*--------------------------------------------------------------------*
2467 * literal_escape
2469 * Recognize escaped literal characters (prefixed with backslash),
2470 * and translate them into the corresponding character.
2472 * Returns the proper character value or NULL if not a valid literal
2473 * escape.
2474 *--------------------------------------------------------------------*/
2476 static unsigned char literal_escape (unsigned char c) {
2478 static unsigned char valid_escape [] = {
2479 'a', 'b',
2480 'e',
2481 'f', 'n', 'r', 't', 'v', '(', ')', '-', '[', ']',
2482 '<', '>', '{', '}', '.', '\\', '|', '^', '$', '*',
2483 '+', '?', '&', '\0'
2486 static unsigned char value [] = {
2487 '\a', '\b',
2488 #ifdef EBCDIC_CHARSET
2489 0x27, /* Escape character in IBM's EBCDIC character set. */
2490 #else
2491 0x1B, /* Escape character in ASCII character set. */
2492 #endif
2493 '\f', '\n', '\r', '\t', '\v', '(', ')', '-', '[', ']',
2494 '<', '>', '{', '}', '.', '\\', '|', '^', '$', '*',
2495 '+', '?', '&', '\0'
2498 int i;
2500 for (i = 0; valid_escape [i] != '\0'; i++) {
2501 if (c == valid_escape [i]) return value [i];
2504 return '\0';
2507 /*--------------------------------------------------------------------*
2508 * back_ref
2510 * Process a request to match a previous parenthesized thing.
2511 * Parenthetical entities are numbered beginning at 1 by counting
2512 * opening parentheses from left to to right. \0 would represent
2513 * whole match, but would confuse numeric_escape as an octal escape,
2514 * so it is forbidden.
2516 * Constructs of the form \~1, \~2, etc. are cross-regex back
2517 * references and are used in syntax highlighting patterns to match
2518 * text previously matched by another regex. *** IMPLEMENT LATER ***
2519 *--------------------------------------------------------------------*/
2521 static unsigned char * back_ref (
2522 unsigned char *c,
2523 int *flag_param,
2524 int emit) {
2526 int paren_no, c_offset = 0, is_cross_regex = 0;
2528 unsigned char *ret_val;
2530 /* Implement cross regex backreferences later. */
2532 /* if (*c == (unsigned char) ('~')) {
2533 c_offset++;
2534 is_cross_regex++;
2535 } */
2537 paren_no = (int) (*(c + c_offset) - (unsigned char) ('0'));
2539 if (!isdigit (*(c + c_offset)) || /* Only \1, \2, ... \9 are supported. */
2540 paren_no == 0) { /* Should be caught by numeric_escape. */
2542 return NULL;
2545 /* Make sure parentheses for requested back-reference are complete. */
2547 if (!is_cross_regex && !TEST_BIT (Closed_Parens, paren_no)) {
2548 sprintf (Error_Text, "\\%d is an illegal back reference", paren_no);
2549 return NULL;
2552 if (emit == EMIT_NODE) {
2553 if (is_cross_regex) {
2554 Reg_Parse++; /* Skip past the '~' in a cross regex back reference.
2555 We only do this if we are emitting code. */
2557 if (Is_Case_Insensitive) {
2558 ret_val = emit_node (X_REGEX_BR_CI);
2559 } else {
2560 ret_val = emit_node (X_REGEX_BR);
2562 } else {
2563 if (Is_Case_Insensitive) {
2564 ret_val = emit_node (BACK_REF_CI);
2565 } else {
2566 ret_val = emit_node (BACK_REF);
2570 emit_byte ((unsigned char) paren_no);
2572 if (is_cross_regex || TEST_BIT (Paren_Has_Width, paren_no)) {
2573 *flag_param |= HAS_WIDTH;
2575 } else if (emit == CHECK_ESCAPE) {
2576 ret_val = (unsigned char *) 1;
2577 } else {
2578 ret_val = NULL;
2581 return ret_val;
2584 /*======================================================================*
2585 * Regex execution related code
2586 *======================================================================*/
2588 /* Global work variables for `ExecRE'. */
2590 static unsigned char *Reg_Input; /* String-input pointer. */
2591 static unsigned char *Start_Of_String; /* Beginning of input, for ^ */
2592 /* and < checks. */
2593 static unsigned char *Look_Behind_To; /* Position till were look behind
2594 can safely check back */
2595 static unsigned char **Start_Ptr_Ptr; /* Pointer to `startp' array. */
2596 static unsigned char **End_Ptr_Ptr; /* Ditto for `endp'. */
2597 static unsigned char *Extent_Ptr_FW; /* Forward extent pointer */
2598 static unsigned char *Extent_Ptr_BW; /* Backward extent pointer */
2599 static unsigned char *Back_Ref_Start [10]; /* Back_Ref_Start [0] and */
2600 static unsigned char *Back_Ref_End [10]; /* Back_Ref_End [0] are not */
2601 /* used. This simplifies */
2602 /* indexing. */
2603 static regexp *Cross_Regex_Backref;
2605 static int Prev_Is_BOL;
2606 static int Succ_Is_EOL;
2607 static int Prev_Is_Delim;
2608 static int Succ_Is_Delim;
2610 /* Define a pointer to an array to hold general (...){m,n} counts. */
2612 typedef struct brace_counts {
2613 unsigned long count [1]; /* More unwarranted chumminess with compiler. */
2614 } brace_counts;
2616 static struct brace_counts *Brace;
2618 /* Default table for determining whether a character is a word delimiter. */
2620 static unsigned char Default_Delimiters [UCHAR_MAX] = {0};
2622 static unsigned char *Current_Delimiters; /* Current delimiter table */
2624 /* Forward declarations of functions used by `ExecRE' */
2626 static int attempt (regexp *, unsigned char *);
2627 static int match (unsigned char *, int *);
2628 static unsigned long greedy (unsigned char *, long);
2629 static void adjustcase (unsigned char *, int, unsigned char);
2630 static unsigned char * makeDelimiterTable (unsigned char *, unsigned char *);
2633 * ExecRE - match a `regexp' structure against a string
2635 * If `end' is non-NULL, matches may not BEGIN past end, but may extend past
2636 * it. If reverse is true, `end' must be specified, and searching begins at
2637 * `end'. "isbol" should be set to true if the beginning of the string is the
2638 * actual beginning of a line (since `ExecRE' can't look backwards from the
2639 * beginning to find whether there was a newline before). Likewise, "isbow"
2640 * asks whether the string is preceded by a word delimiter. End of string is
2641 * always treated as a word and line boundary (there may be cases where it
2642 * shouldn't be, in which case, this should be changed). "delimit" (if
2643 * non-null) specifies a null-terminated string of characters to be considered
2644 * word delimiters matching "<" and ">". if "delimit" is NULL, the default
2645 * delimiters (as set in SetREDefaultWordDelimiters) are used.
2646 * Finally, look_behind_to indicates the position till where it is safe to
2647 * perform look-behind matches. If set, it should be smaller than or equal
2648 * to the start position of the search (pointed at by string). If it is NULL,
2649 * it defaults to the start position. */
2651 int ExecRE (
2652 regexp *prog,
2653 regexp *cross_regex_backref,
2654 const char *string,
2655 const char *end,
2656 int reverse,
2657 char prev_char,
2658 char succ_char,
2659 const char *delimiters,
2660 const char *look_behind_to) {
2662 register unsigned char *str;
2663 unsigned char **s_ptr;
2664 unsigned char **e_ptr;
2665 int ret_val = 0;
2666 unsigned char tempDelimitTable [256];
2667 int i;
2669 s_ptr = (unsigned char **) prog->startp;
2670 e_ptr = (unsigned char **) prog->endp;
2672 /* Check for valid parameters. */
2674 if (prog == NULL || string == NULL) {
2675 reg_error ("NULL parameter to `ExecRE\'");
2676 goto SINGLE_RETURN;
2679 /* Check validity of program. */
2681 if (U_CHAR_AT (prog->program) != MAGIC) {
2682 reg_error ("corrupted program");
2683 goto SINGLE_RETURN;
2686 /* If caller has supplied delimiters, make a delimiter table */
2688 if (delimiters == NULL) {
2689 Current_Delimiters = Default_Delimiters;
2690 } else {
2691 Current_Delimiters = makeDelimiterTable (
2692 (unsigned char *) delimiters,
2693 (unsigned char *) tempDelimitTable);
2696 if (end == NULL && reverse) {
2697 for (end = string; *end != '\0'; end++) ;
2698 succ_char = '\n';
2699 } else if (end == NULL) {
2700 succ_char = '\n';
2703 /* Initialize arrays used by shortcut_escape. */
2705 if (!init_ansi_classes ()) goto SINGLE_RETURN;
2707 /* Remember the beginning of the string for matching BOL */
2709 Start_Of_String = (unsigned char *) string;
2710 Look_Behind_To = (unsigned char *) (look_behind_to?look_behind_to:string);
2712 Prev_Is_BOL = ((prev_char == '\n') || (prev_char == '\0') ? 1 : 0);
2713 Succ_Is_EOL = ((succ_char == '\n') || (succ_char == '\0') ? 1 : 0);
2714 Prev_Is_Delim = (Current_Delimiters [(unsigned char)prev_char] ? 1 : 0);
2715 Succ_Is_Delim = (Current_Delimiters [(unsigned char)succ_char] ? 1 : 0);
2717 Total_Paren = (int) (prog->program [1]);
2718 Num_Braces = (int) (prog->program [2]);
2720 Cross_Regex_Backref = cross_regex_backref;
2722 /* Allocate memory for {m,n} construct counting variables if need be. */
2724 if (Num_Braces > 0) {
2725 Brace =
2726 (brace_counts *) malloc (sizeof (brace_counts) + (size_t) Num_Braces);
2728 if (Brace == NULL) {
2729 reg_error ("out of memory in `ExecRE\'");
2730 goto SINGLE_RETURN;
2732 } else {
2733 Brace = NULL;
2736 /* Initialize the first nine (9) capturing parentheses start and end
2737 pointers to point to the start of the search string. This is to prevent
2738 crashes when later trying to reference captured parens that do not exist
2739 in the compiled regex. We only need to do the first nine since users
2740 can only specify \1, \2, ... \9. */
2742 for (i = 9; i > 0; i--) {
2743 *s_ptr++ = (unsigned char *) string;
2744 *e_ptr++ = (unsigned char *) string;
2747 if (!reverse) { /* Forward Search */
2748 if (prog->anchor) {
2749 /* Search is anchored at BOL */
2751 if (attempt (prog, (unsigned char *) string)) {
2752 ret_val = 1;
2753 goto SINGLE_RETURN;
2756 for (str = (unsigned char *) string;
2757 *str != '\0' && str != (unsigned char *) end;
2758 str++) {
2760 if (*str == '\n') {
2761 if (attempt (prog, str + 1)) {
2762 ret_val = 1;
2763 break;
2768 goto SINGLE_RETURN;
2770 } else if (prog->match_start != '\0') {
2771 /* We know what char match must start with. */
2773 for (str = (unsigned char *) string;
2774 *str != '\0' && str != (unsigned char *) end;
2775 str++) {
2777 if (*str == (unsigned char)prog->match_start) {
2778 if (attempt (prog, str)) {
2779 ret_val = 1;
2780 break;
2785 goto SINGLE_RETURN;
2786 } else {
2787 /* General case */
2789 for (str = (unsigned char *) string;
2790 *str != '\0' && str != (unsigned char *) end;
2791 str++) {
2793 if (attempt (prog, str)) {
2794 ret_val = 1;
2795 break;
2799 goto SINGLE_RETURN;
2801 } else { /* Search reverse, same as forward, but loops run backward */
2803 if (prog->anchor) {
2804 /* Search is anchored at BOL */
2806 for (str = (unsigned char *)(end - 1);
2807 str >= (unsigned char *) string;
2808 str--) {
2810 if (*str == '\n') {
2811 if (attempt (prog, str + 1)) {
2812 ret_val = 1;
2813 goto SINGLE_RETURN;
2818 if (attempt (prog, (unsigned char *) string)) {
2819 ret_val = 1;
2820 goto SINGLE_RETURN;
2823 goto SINGLE_RETURN;
2824 } else if (prog->match_start != '\0') {
2825 /* We know what char match must start with. */
2827 for (str = (unsigned char *) end;
2828 str >= (unsigned char *) string;
2829 str--) {
2831 if (*str == (unsigned char)prog->match_start) {
2832 if (attempt (prog, str)) {
2833 ret_val = 1;
2834 break;
2839 goto SINGLE_RETURN;
2840 } else {
2841 /* General case */
2843 for (str = (unsigned char *) end;
2844 str >= (unsigned char *) string;
2845 str--) {
2847 if (attempt (prog, str)) {
2848 ret_val = 1;
2849 break;
2855 SINGLE_RETURN: if (Brace) free (Brace);
2857 return (ret_val);
2860 /*--------------------------------------------------------------------*
2861 * init_ansi_classes
2863 * Generate character class sets using locale aware ANSI C functions.
2865 *--------------------------------------------------------------------*/
2867 static int init_ansi_classes (void) {
2869 static int initialized = 0;
2870 static int underscore = (int) '_';
2871 int i, word_count, letter_count, space_count;
2873 if (!initialized) {
2874 initialized = 1; /* Only need to generate character sets once. */
2875 word_count = 0;
2876 letter_count = 0;
2877 space_count = 0;
2879 for (i = 1; i < (int)UCHAR_MAX; i++) {
2880 if (isalnum (i) || i == underscore) {
2881 Word_Char [word_count++] = (unsigned char) i;
2884 if (isalpha (i)) {
2885 Letter_Char [letter_count++] = (unsigned char) i;
2888 /* Note: Whether or not newline is considered to be whitespace is
2889 handled by switches within the original regex and is thus omitted
2890 here. */
2892 if (isspace (i) && (i != (int) '\n')) {
2893 White_Space [space_count++] = (unsigned char) i;
2896 /* Make sure arrays are big enough. ("- 2" because of zero array
2897 origin and we need to leave room for the NULL terminator.) */
2899 if (word_count > (ALNUM_CHAR_SIZE - 2) ||
2900 space_count > (WHITE_SPACE_SIZE - 2) ||
2901 letter_count > (ALNUM_CHAR_SIZE - 2)) {
2903 reg_error ("internal error #9 `init_ansi_classes\'");
2904 return (0);
2908 Word_Char [word_count] = '\0';
2909 Letter_Char [word_count] = '\0';
2910 White_Space [space_count] = '\0';
2913 return (1);
2916 /*----------------------------------------------------------------------*
2917 * attempt - try match at specific point, returns: 0 failure, 1 success
2918 *----------------------------------------------------------------------*/
2920 static int attempt (regexp *prog, unsigned char *string) {
2922 register int i;
2923 register unsigned char **s_ptr;
2924 register unsigned char **e_ptr;
2925 int branch_index = 0; /* Must be set to zero ! */
2927 Reg_Input = string;
2928 Start_Ptr_Ptr = (unsigned char **) prog->startp;
2929 End_Ptr_Ptr = (unsigned char **) prog->endp;
2930 s_ptr = (unsigned char **) prog->startp;
2931 e_ptr = (unsigned char **) prog->endp;
2933 /* Overhead due to capturing parentheses. */
2935 Extent_Ptr_BW = string;
2936 Extent_Ptr_FW = NULL;
2938 for (i = Total_Paren + 1; i > 0; i--) {
2939 *s_ptr++ = NULL;
2940 *e_ptr++ = NULL;
2943 if (match ((unsigned char *) (prog->program + REGEX_START_OFFSET),
2944 &branch_index)) {
2945 prog->startp [0] = (char *) string;
2946 prog->endp [0] = (char *) Reg_Input; /* <-- One char AFTER */
2947 prog->extentpBW = (char *) Extent_Ptr_BW; /* matched string! */
2948 prog->extentpFW = (char *) Extent_Ptr_FW;
2949 prog->top_branch = branch_index;
2951 return (1);
2952 } else {
2953 return (0);
2957 /*----------------------------------------------------------------------*
2958 * match - main matching routine
2960 * Conceptually the strategy is simple: check to see whether the
2961 * current node matches, call self recursively to see whether the rest
2962 * matches, and then act accordingly. In practice we make some effort
2963 * to avoid recursion, in particular by going through "ordinary" nodes
2964 * (that don't need to know whether the rest of the match failed) by a
2965 * loop instead of by recursion. Returns 0 failure, 1 success.
2966 *----------------------------------------------------------------------*/
2968 static int match (unsigned char *prog, int *branch_index_param) {
2970 register unsigned char *scan; /* Current node. */
2971 unsigned char *next; /* Next node. */
2972 register int next_ptr_offset; /* Used by the NEXT_PTR () macro */
2975 scan = prog;
2977 while (scan != NULL) {
2978 NEXT_PTR (scan, next);
2980 switch (GET_OP_CODE (scan)) {
2981 case BRANCH:
2983 register unsigned char *save;
2984 register int branch_index_local = 0;
2986 if (GET_OP_CODE (next) != BRANCH) { /* No choice. */
2987 next = OPERAND (scan); /* Avoid recursion. */
2988 } else {
2989 do {
2990 save = Reg_Input;
2992 if (match (OPERAND (scan), NULL))
2994 if (branch_index_param)
2995 *branch_index_param = branch_index_local;
2996 return (1);
2999 ++branch_index_local;
3001 Reg_Input = save; /* Backtrack. */
3002 NEXT_PTR (scan, scan);
3003 } while (scan != NULL && GET_OP_CODE (scan) == BRANCH);
3005 return (0); /* NOT REACHED */
3009 break;
3011 case EXACTLY:
3013 register int len;
3014 register unsigned char *opnd;
3016 opnd = OPERAND (scan);
3018 /* Inline the first character, for speed. */
3020 if (*opnd != *Reg_Input) return (0);
3022 len = strlen ((char *) opnd);
3024 if (len > 1 &&
3025 strncmp ((char *) opnd, (char *) Reg_Input, len) != 0) {
3027 return (0);
3030 Reg_Input += len;
3033 break;
3035 case SIMILAR:
3037 register unsigned char *opnd;
3038 register unsigned char test;
3040 opnd = OPERAND (scan);
3042 /* Note: the SIMILAR operand was converted to lower case during
3043 regex compile. */
3045 while ((test = *opnd++) != '\0') {
3046 if (tolower (*Reg_Input++) != test) return (0);
3050 break;
3052 case BOL: /* `^' (beginning of line anchor) */
3053 if (Reg_Input == Start_Of_String) {
3054 if (Prev_Is_BOL) break;
3055 } else if (*(Reg_Input - 1) == '\n') {
3056 break;
3059 return (0);
3061 case EOL: /* `$' anchor matches end of line and end of string */
3062 if (*Reg_Input == '\n' || (*Reg_Input == '\0' && Succ_Is_EOL)) {
3063 break;
3066 return (0);
3068 case BOWORD: /* `<' (beginning of word anchor) */
3069 /* Check to see if preceding character is a delimiter. */
3071 if (Reg_Input == Start_Of_String) {
3072 if (Prev_Is_Delim) break;
3073 } else if (Current_Delimiters [ *(Reg_Input - 1) ]) {
3074 break;
3077 return (0);
3079 case EOWORD: /* `>' (end of word anchor) */
3080 /* Check to see if current character is a delimiter. */
3082 if (Current_Delimiters [ *(Reg_Input) ]) break;
3084 return (0);
3086 case NOT_BOUNDARY: /* \B (NOT a word boundary) */
3087 if (Reg_Input == Start_Of_String) {
3088 if (!(Prev_Is_Delim ^ Current_Delimiters [*Reg_Input])) break;
3089 } else if (*Reg_Input == '\0') {
3090 if (!((Current_Delimiters [ *(Reg_Input - 1) ]) ^
3091 Succ_Is_Delim)) break;
3092 } else {
3093 if (!(Current_Delimiters [ *(Reg_Input - 1) ] ^
3094 Current_Delimiters [*Reg_Input])) break;
3097 return (0);
3099 case IS_DELIM: /* \y (A word delimiter character.) */
3100 if (Current_Delimiters [ *Reg_Input ]) {
3101 Reg_Input++; break;
3104 return (0);
3106 case NOT_DELIM: /* \Y (NOT a word delimiter character.) */
3107 if (!Current_Delimiters [ *Reg_Input ]) {
3108 Reg_Input++; break;
3111 return (0);
3113 case WORD_CHAR: /* \w (word character; alpha-numeric or underscore) */
3114 if (isalnum ((int) *Reg_Input) || *Reg_Input == '_') {
3115 Reg_Input++; break;
3118 return (0);
3120 case NOT_WORD_CHAR:/* \W (NOT a word character) */
3121 if (isalnum ((int) *Reg_Input) ||
3122 *Reg_Input == '_' ||
3123 *Reg_Input == '\n') return (0);
3125 Reg_Input++; break;
3127 case ANY: /* `.' (matches any character EXCEPT newline) */
3128 if (*Reg_Input == '\0' || *Reg_Input == '\n') return (0);
3130 Reg_Input++; break;
3132 case EVERY: /* `.' (matches any character INCLUDING newline) */
3133 if (*Reg_Input == '\0') return (0);
3135 Reg_Input++; break;
3137 case DIGIT: /* \d, same as [0123456789] */
3138 if (!isdigit ((int) *Reg_Input)) return (0);
3140 Reg_Input++; break;
3142 case NOT_DIGIT: /* \D, same as [^0123456789] */
3143 if (isdigit ((int) *Reg_Input) || *Reg_Input == '\n') return (0);
3145 Reg_Input++; break;
3147 case LETTER: /* \l, same as [a-zA-Z] */
3148 if (!isalpha ((int) *Reg_Input)) return (0);
3150 Reg_Input++; break;
3152 case NOT_LETTER: /* \L, same as [^0123456789] */
3153 if (isalpha ((int) *Reg_Input) || *Reg_Input == '\n') return (0);
3155 Reg_Input++; break;
3157 case SPACE: /* \s, same as [ \t\r\f\v] */
3158 if (!isspace ((int) *Reg_Input) || *Reg_Input == '\n') return (0);
3160 Reg_Input++; break;
3162 case SPACE_NL: /* \s, same as [\n \t\r\f\v] */
3163 if (!isspace ((int) *Reg_Input)) return (0);
3165 Reg_Input++; break;
3167 case NOT_SPACE: /* \S, same as [^\n \t\r\f\v] */
3168 if (isspace ((int) *Reg_Input)) return (0);
3170 Reg_Input++; break;
3172 case NOT_SPACE_NL: /* \S, same as [^ \t\r\f\v] */
3173 if (isspace ((int) *Reg_Input) && *Reg_Input != '\n') return (0);
3175 Reg_Input++; break;
3177 case ANY_OF: /* [...] character class. */
3178 if (*Reg_Input == '\0') return (0); /* Needed because strchr ()
3179 considers \0 as a member
3180 of the character set. */
3182 if (strchr ((char *) OPERAND (scan), (int) *Reg_Input) == NULL) {
3183 return (0);
3186 Reg_Input++; break;
3188 case ANY_BUT: /* [^...] Negated character class-- does NOT normally
3189 match newline (\n added usually to operand at compile
3190 time.) */
3192 if (*Reg_Input == '\0') return (0); /* See comment for ANY_OF. */
3194 if (strchr ((char *) OPERAND (scan), (int) *Reg_Input) != NULL) {
3195 return (0);
3198 Reg_Input++; break;
3200 case NOTHING:
3201 case BACK:
3202 break;
3204 case STAR:
3205 case PLUS:
3206 case QUESTION:
3207 case BRACE:
3209 case LAZY_STAR:
3210 case LAZY_PLUS:
3211 case LAZY_QUESTION:
3212 case LAZY_BRACE:
3214 register unsigned long num_matched = REG_ZERO;
3215 register unsigned long min = ULONG_MAX, max = REG_ZERO;
3216 register unsigned char *save;
3217 register unsigned char next_char;
3218 unsigned char *next_op;
3219 int lazy = 0;
3221 /* Lookahead (when possible) to avoid useless match attempts
3222 when we know what character comes next. */
3224 if (GET_OP_CODE (next) == EXACTLY) {
3225 next_char = *OPERAND (next);
3226 } else {
3227 next_char = '\0';/* i.e. Don't know what next character is. */
3230 next_op = OPERAND (scan);
3232 switch (GET_OP_CODE (scan)) {
3233 case LAZY_STAR:
3234 lazy = 1;
3235 case STAR:
3236 min = REG_ZERO;
3237 max = ULONG_MAX;
3238 break;
3240 case LAZY_PLUS:
3241 lazy = 1;
3242 case PLUS:
3243 min = REG_ONE;
3244 max = ULONG_MAX;
3245 break;
3247 case LAZY_QUESTION:
3248 lazy = 1;
3249 case QUESTION:
3250 min = REG_ZERO;
3251 max = REG_ONE;
3252 break;
3254 case LAZY_BRACE:
3255 lazy = 1;
3256 case BRACE:
3257 min = (unsigned long)
3258 GET_OFFSET (scan + NEXT_PTR_SIZE);
3260 max = (unsigned long)
3261 GET_OFFSET (scan + (2 * NEXT_PTR_SIZE));
3263 if (max <= REG_INFINITY) max = ULONG_MAX;
3265 next_op = OPERAND (scan + (2 * NEXT_PTR_SIZE));
3268 save = Reg_Input;
3270 if (lazy) {
3271 if ( min > REG_ZERO) num_matched = greedy (next_op, min);
3272 } else {
3273 num_matched = greedy (next_op, max);
3276 while (min <= num_matched && num_matched <= max) {
3277 if (next_char == '\0' || next_char == *Reg_Input) {
3278 if (match (next, NULL)) return (1);
3281 /* Couldn't or didn't match. */
3283 if (lazy) {
3284 if (!greedy (next_op, 1)) return (0);
3286 num_matched++; /* Inch forward. */
3287 } else if (num_matched > REG_ZERO) {
3288 num_matched--; /* Back up. */
3289 } else if (min == REG_ZERO && num_matched == REG_ZERO) {
3290 break;
3293 Reg_Input = save + num_matched;
3296 return (0);
3299 break;
3301 case END:
3302 if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
3303 Extent_Ptr_FW = Reg_Input;
3306 return (1); /* Success! */
3308 break;
3310 case INIT_COUNT:
3311 Brace->count [*OPERAND (scan)] = REG_ZERO;
3313 break;
3315 case INC_COUNT:
3316 Brace->count [*OPERAND (scan)]++;
3318 break;
3320 case TEST_COUNT:
3321 if (Brace->count [*OPERAND (scan)] <
3322 (unsigned long) GET_OFFSET (scan + NEXT_PTR_SIZE + INDEX_SIZE)) {
3324 next = scan + NODE_SIZE + INDEX_SIZE + NEXT_PTR_SIZE;
3327 break;
3329 case BACK_REF:
3330 case BACK_REF_CI:
3331 /* case X_REGEX_BR: */
3332 /* case X_REGEX_BR_CI: *** IMPLEMENT LATER */
3334 register unsigned char *captured, *finish;
3335 int paren_no;
3337 paren_no = (int) *OPERAND (scan);
3339 /* if (GET_OP_CODE (scan) == X_REGEX_BR ||
3340 GET_OP_CODE (scan) == X_REGEX_BR_CI) {
3342 if (Cross_Regex_Backref == NULL) return (0);
3344 captured =
3345 (unsigned char *) Cross_Regex_Backref->startp [paren_no];
3347 finish =
3348 (unsigned char *) Cross_Regex_Backref->endp [paren_no];
3349 } else { */
3350 captured = Back_Ref_Start [paren_no];
3351 finish = Back_Ref_End [paren_no];
3352 /* } */
3354 if ((captured != NULL) && (finish != NULL)) {
3355 if (captured > finish) return (0);
3357 if (GET_OP_CODE (scan) == BACK_REF_CI /* ||
3358 GET_OP_CODE (scan) == X_REGEX_BR_CI*/ ) {
3360 while (captured < finish) {
3361 if (tolower (*captured++) != tolower (*Reg_Input++)) {
3362 return (0);
3365 } else {
3366 while (captured < finish) {
3367 if (*captured++ != *Reg_Input++) return (0);
3371 break;
3372 } else {
3373 return (0);
3377 case POS_AHEAD_OPEN:
3378 case NEG_AHEAD_OPEN:
3380 register unsigned char *save;
3381 int answer;
3383 save = Reg_Input;
3384 answer = match (next, NULL); /* Does the look-ahead regex match? */
3386 if ((GET_OP_CODE (scan) == POS_AHEAD_OPEN) ? answer : !answer) {
3387 /* Remember the last (most to the right) character position
3388 that we consume in the input for a successful match. This
3389 is info that may be needed should an attempt be made to
3390 match the exact same text at the exact same place. Since
3391 look-aheads backtrack, a regex with a trailing look-ahead
3392 may need more text than it matches to accomplish a
3393 re-match. */
3395 if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
3396 Extent_Ptr_FW = Reg_Input;
3399 Reg_Input = save; /* Backtrack to look-ahead start. */
3401 /* Jump to the node just after the (?=...) or (?!...)
3402 Construct. */
3404 next = next_ptr (OPERAND (scan)); /* Skip 1st branch */
3405 /* Skip the chain of branches inside the look-ahead */
3406 while(GET_OP_CODE(next) == BRANCH)
3407 next = next_ptr (next);
3408 next = next_ptr (next); /* Skip the LOOK_AHEAD_CLOSE */
3409 } else {
3410 Reg_Input = save; /* Backtrack to look-ahead start. */
3412 return (0);
3416 break;
3418 case POS_BEHIND_OPEN:
3419 case NEG_BEHIND_OPEN:
3421 register unsigned char *save;
3422 int answer;
3423 register int offset, upper;
3424 int lower;
3425 int found = 0;
3426 unsigned char save_char;
3428 save = Reg_Input;
3429 save_char = *Reg_Input;
3431 /* Prevent overshoot (greedy matching could end past the
3432 current position). This means that look-ahead at the end
3433 of a look-behind pattern won't work, but it is highly
3434 unlikely that anyone ever needs this. It is documented as
3435 being not supported anyway. It is possible to lift this
3436 limitation, but only at the cost of increased complexity
3437 and an overall decrease in performance of the regex matching
3438 code, so it's probably not worth it. */
3439 *Reg_Input = '\0';
3441 lower = GET_LOWER (scan);
3442 upper = GET_UPPER (scan);
3444 /* Start with the shortest match first. This is the most
3445 efficient direction in general.
3446 Note! Negative look behind is _very_ tricky when the length
3447 is not constant: we have to make sure the expression doesn't
3448 match for _any_ of the starting positions. */
3449 for (offset = lower; offset <= upper; ++offset) {
3450 Reg_Input = save - offset;
3452 if (Reg_Input < Look_Behind_To) {
3453 /* No need to look any further */
3454 break;
3457 answer = match (next, NULL); /* Does the look-behind regex match? */
3459 /* The match must have ended at the current position;
3460 otherwise it is invalid */
3461 if (answer && Reg_Input == save) {
3462 /* It matched, exactly far enough */
3463 found = 1;
3465 /* Remember the last (most to the left) character position
3466 that we consume in the input for a successful match.
3467 This is info that may be needed should an attempt be
3468 made to match the exact same text at the exact same
3469 place. Since look-behind backtracks, a regex with a
3470 leading look-behind may need more text than it matches
3471 to accomplish a re-match. */
3473 if (Extent_Ptr_BW == NULL ||
3474 (Extent_Ptr_BW - (save - offset)) > 0) {
3475 Extent_Ptr_BW = save - offset;
3478 break;
3482 /* Always restore the position and repair the string */
3483 Reg_Input = save;
3484 *Reg_Input = save_char;
3486 if ((GET_OP_CODE (scan) == POS_BEHIND_OPEN) ? found : !found) {
3487 /* The look-behind matches, so we must jump to the next
3488 node. The look-behind node is followed by a chain of
3489 branches (contents of the look-behind expression), and
3490 terminated by a look-behind-close node. */
3491 next = next_ptr (OPERAND (scan) + LENGTH_SIZE); /* 1st branch */
3492 /* Skip the chained branches inside the look-ahead */
3493 while (GET_OP_CODE (next) == BRANCH)
3494 next = next_ptr (next);
3495 next = next_ptr (next); /* Skip LOOK_BEHIND_CLOSE */
3496 } else {
3497 /* Not a match */
3498 return (0);
3501 break;
3503 case LOOK_AHEAD_CLOSE:
3504 case LOOK_BEHIND_CLOSE:
3505 return (1); /* We have reached the end of the look-ahead or
3506 look-behind which implies that we matched it,
3507 so return TRUE. */
3508 default:
3509 if ((GET_OP_CODE (scan) > OPEN) &&
3510 (GET_OP_CODE (scan) < OPEN + NSUBEXP)) {
3512 register int no;
3513 register unsigned char *save;
3515 no = GET_OP_CODE (scan) - OPEN;
3516 save = Reg_Input;
3518 if (no < 10) {
3519 Back_Ref_Start [no] = save;
3520 Back_Ref_End [no] = NULL;
3523 if (match (next, NULL)) {
3524 /* Do not set `Start_Ptr_Ptr' if some later invocation (think
3525 recursion) of the same parentheses already has. */
3527 if (Start_Ptr_Ptr [no] == NULL) Start_Ptr_Ptr [no] = save;
3529 return (1);
3530 } else {
3531 return (0);
3533 } else if ((GET_OP_CODE (scan) > CLOSE) &&
3534 (GET_OP_CODE (scan) < CLOSE + NSUBEXP)) {
3536 register int no;
3537 register unsigned char *save;
3539 no = GET_OP_CODE (scan) - CLOSE;
3540 save = Reg_Input;
3542 if (no < 10) Back_Ref_End [no] = save;
3544 if (match (next, NULL)) {
3545 /* Do not set `End_Ptr_Ptr' if some later invocation of the
3546 same parentheses already has. */
3548 if (End_Ptr_Ptr [no] == NULL) End_Ptr_Ptr [no] = save;
3550 return (1);
3551 } else {
3552 return (0);
3554 } else {
3555 reg_error ("memory corruption, `match\'");
3557 return (0);
3560 break;
3563 scan = next;
3566 /* We get here only if there's trouble -- normally "case END" is
3567 the terminating point. */
3569 reg_error ("corrupted pointers, `match\'");
3571 return (0);
3574 /*----------------------------------------------------------------------*
3575 * greedy
3577 * Repeatedly match something simple up to "max" times. If max <= 0
3578 * then match as much as possible (max = infinity). Uses unsigned long
3579 * variables to maximize the amount of text matchable for unbounded
3580 * qualifiers like '*' and '+'. This will allow at least 4,294,967,295
3581 * matches (4 Gig!) for an ANSI C compliant compiler. If you are
3582 * applying a regex to something bigger than that, you shouldn't be
3583 * using NEdit!
3585 * Returns the actual number of matches.
3586 *----------------------------------------------------------------------*/
3588 static unsigned long greedy (unsigned char *p, long max) {
3590 register unsigned char *input_str;
3591 register unsigned char *operand;
3592 register unsigned long count = REG_ZERO;
3593 register unsigned long max_cmp;
3595 input_str = Reg_Input;
3596 operand = OPERAND (p); /* Literal char or start of class characters. */
3597 max_cmp = (max > 0) ? (unsigned long) max : ULONG_MAX;
3599 switch (GET_OP_CODE (p)) {
3600 case ANY:
3601 /* Race to the end of the line or string. Dot DOESN'T match
3602 newline. */
3604 while (count < max_cmp && *input_str != '\0' && *input_str != '\n') {
3605 count++; input_str++;
3608 break;
3610 case EVERY:
3611 /* Race to the end of the line or string. Dot DOES match newline. */
3613 while (count < max_cmp && *input_str != '\0') {
3614 count++; input_str++;
3617 break;
3619 case EXACTLY: /* Count occurrences of single character operand. */
3620 while (count < max_cmp && *operand == *input_str) {
3621 count++; input_str++;
3624 break;
3626 case SIMILAR: /* Case insensitive version of EXACTLY */
3627 while (count < max_cmp && *operand == tolower (*input_str)) {
3628 count++; input_str++;
3631 break;
3633 case ANY_OF: /* [...] character class. */
3634 while (count < max_cmp &&
3635 *input_str != '\0' &&
3636 strchr ((char *) operand, (int) *input_str) != NULL) {
3638 count++; input_str++;
3641 break;
3643 case ANY_BUT: /* [^...] Negated character class- does NOT normally
3644 match newline (\n added usually to operand at compile
3645 time.) */
3647 while (count < max_cmp &&
3648 *input_str != '\0' &&
3649 strchr ((char *) operand, (int) *input_str) == NULL) {
3651 count++; input_str++;
3654 break;
3656 case IS_DELIM: /* \y (not a word delimiter char)
3657 NOTE: '\n' and '\0' are always word delimiters. */
3659 while (count < max_cmp && Current_Delimiters [ *input_str ]) {
3660 count++; input_str++;
3663 break;
3665 case NOT_DELIM: /* \Y (not a word delimiter char)
3666 NOTE: '\n' and '\0' are always word delimiters. */
3668 while (count < max_cmp && !Current_Delimiters [ *input_str ]) {
3669 count++; input_str++;
3672 break;
3674 case WORD_CHAR: /* \w (word character, alpha-numeric or underscore) */
3675 while (count < max_cmp &&
3676 (isalnum ((int) *input_str) ||
3677 *input_str == (unsigned char) '_')) {
3679 count++; input_str++;
3682 break;
3684 case NOT_WORD_CHAR:/* \W (NOT a word character) */
3685 while (count < max_cmp &&
3686 !isalnum ((int) *input_str) &&
3687 *input_str != (unsigned char) '_' &&
3688 *input_str != (unsigned char) '\n' &&
3689 *input_str != (unsigned char) '\0') {
3691 count++; input_str++;
3694 break;
3696 case DIGIT: /* same as [0123456789] */
3697 while (count < max_cmp && isdigit ((int) *input_str)) {
3698 count++; input_str++;
3701 break;
3703 case NOT_DIGIT: /* same as [^0123456789] */
3704 while (count < max_cmp &&
3705 !isdigit ((int) *input_str) &&
3706 *input_str != '\n' &&
3707 *input_str != '\0') {
3709 count++; input_str++;
3712 break;
3714 case SPACE: /* same as [ \t\r\f\v]-- doesn't match newline. */
3715 while (count < max_cmp &&
3716 isspace ((int) *input_str) &&
3717 *input_str != '\n' &&
3718 *input_str != '\0') {
3720 count++; input_str++;
3723 break;
3725 case SPACE_NL: /* same as [\n \t\r\f\v]-- matches newline. */
3726 while (count < max_cmp &&
3727 isspace ((int) *input_str) &&
3728 *input_str != '\0') {
3730 count++; input_str++;
3733 break;
3735 case NOT_SPACE: /* same as [^\n \t\r\f\v]-- doesn't match newline. */
3736 while (count < max_cmp &&
3737 !isspace ((int) *input_str) &&
3738 *input_str != '\0') {
3740 count++; input_str++;
3743 break;
3745 case NOT_SPACE_NL: /* same as [^ \t\r\f\v]-- matches newline. */
3746 while (count < max_cmp &&
3747 (!isspace ((int) *input_str) || *input_str == '\n') &&
3748 *input_str != '\0') {
3750 count++; input_str++;
3753 break;
3755 case LETTER: /* same as [a-zA-Z] */
3756 while (count < max_cmp &&
3757 isalpha ((int) *input_str) &&
3758 *input_str != '\0') {
3760 count++; input_str++;
3763 break;
3765 case NOT_LETTER: /* same as [^a-zA-Z] */
3766 while (count < max_cmp &&
3767 !isalpha ((int) *input_str) &&
3768 *input_str != '\n' &&
3769 *input_str != '\0') {
3771 count++; input_str++;
3774 break;
3776 default:
3777 /* Called inappropriately. Only atoms that are SIMPLE should
3778 generate a call to greedy. The above cases should cover
3779 all the atoms that are SIMPLE. */
3781 reg_error ("internal error #10 `greedy\'");
3782 count = 0U; /* Best we can do. */
3785 /* Point to character just after last matched character. */
3787 Reg_Input = input_str;
3789 return (count);
3792 /*----------------------------------------------------------------------*
3793 * next_ptr - compute the address of a node's "NEXT" pointer.
3794 * Note: a simplified inline version is available via the NEXT_PTR() macro,
3795 * but that one is only to be used at time-critical places (see the
3796 * description of the macro).
3797 *----------------------------------------------------------------------*/
3799 static unsigned char * next_ptr (unsigned char *ptr) {
3801 register int offset;
3803 if (ptr == &Compute_Size) return (NULL);
3805 offset = GET_OFFSET (ptr);
3807 if (offset == 0) return (NULL);
3809 if (GET_OP_CODE (ptr) == BACK) {
3810 return (ptr - offset);
3811 } else {
3812 return (ptr + offset);
3816 /*----------------------------------------------------------------------*
3817 * SubstituteRE - Perform substitutions after a `regexp' match.
3818 *----------------------------------------------------------------------*/
3820 void SubstituteRE (
3821 regexp *prog,
3822 const char *source,
3823 char *dest,
3824 int max) {
3826 register unsigned char *src;
3827 unsigned char *src_alias;
3828 register unsigned char *dst;
3829 register unsigned char c;
3830 register unsigned char test;
3831 register int paren_no;
3832 register int len;
3833 register unsigned char chgcase;
3835 if (prog == NULL || source == NULL || dest == NULL) {
3836 reg_error ("NULL parm to `SubstituteRE\'");
3838 return;
3841 if (U_CHAR_AT (prog->program) != MAGIC) {
3842 reg_error ("damaged regexp passed to `SubstituteRE\'");
3844 return;
3847 src = (unsigned char *) source;
3848 dst = (unsigned char *) dest;
3850 while ((c = *src++) != '\0') {
3851 chgcase = '\0';
3852 paren_no = -1;
3854 if (c == '\\') {
3855 /* Process any case altering tokens, i.e \u, \U, \l, \L. */
3857 if (*src == 'u' || *src == 'U' || *src == 'l' || *src == 'L') {
3858 chgcase = *src;
3859 src++;
3860 c = *src++;
3862 if (c == '\0') break;
3866 if (c == '&') {
3867 paren_no = 0;
3868 } else if (c == '\\') {
3869 /* Can not pass register variable `&src' to function `numeric_escape'
3870 so make a non-register copy that we can take the address of. */
3872 src_alias = src;
3874 if ('1' <= *src && *src <= '9') {
3875 paren_no = (int) *src++ - (int) '0';
3877 } else if ((test = literal_escape (*src)) != '\0') {
3878 c = test; src++;
3880 } else if ((test = numeric_escape (*src, &src_alias)) != '\0') {
3881 c = test;
3882 src = src_alias; src++;
3884 /* NOTE: if an octal escape for zero is attempted (e.g. \000), it
3885 will be treated as a literal string. */
3886 } else if (*src == '\0') {
3887 /* If '\' is the last character of the replacement string, it is
3888 interpreted as a literal backslash. */
3890 c = '\\';
3891 } else {
3892 c = *src++; /* Allow any escape sequence (This is */
3893 } /* INCONSISTENT with the `CompileRE' */
3894 } /* mind set of issuing an error! */
3896 if (paren_no < 0) { /* Ordinary character. */
3897 if (((char *) dst - (char *) dest) >= (max - 1)) {
3898 break;
3899 } else {
3900 *dst++ = c;
3902 } else if (prog->startp [paren_no] != NULL &&
3903 prog->endp [paren_no] != NULL) {
3905 len = prog->endp [paren_no] - prog->startp [paren_no];
3907 if (((char *) dst + len - (char *) dest) >= max-1) {
3908 len = max - ((char *) dst - (char *) dest) - 1;
3911 (void) strncpy ((char *) dst, (char *) prog->startp [paren_no], len);
3913 if (chgcase != '\0') adjustcase (dst, len, chgcase);
3915 dst += len;
3917 if (len != 0 && *(dst - 1) == '\0') { /* strncpy hit NUL. */
3918 reg_error ("damaged match string in `SubstituteRE\'");
3920 return;
3925 *dst = '\0';
3928 static void adjustcase (unsigned char *str, int len, unsigned char chgcase) {
3930 register unsigned char *string = str;
3931 int i;
3933 /* The tokens \u and \l only modify the first character while the tokens
3934 \U and \L modify the entire string. */
3936 if (islower (chgcase) && len > 0) len = 1;
3938 switch (chgcase) {
3939 case 'u':
3940 case 'U':
3941 for (i = 0; i < len; i++) {
3942 *(string + i) = toupper ((int) *(string + i));
3945 break;
3947 case 'l':
3948 case 'L':
3949 for (i = 0; i < len; i++) {
3950 *(string + i) = tolower ((int) *(string + i));
3953 break;
3957 /*----------------------------------------------------------------------*
3958 * reg_error
3959 *----------------------------------------------------------------------*/
3961 static void reg_error (char *str) {
3963 fprintf (
3964 stderr,
3965 "NEdit: Internal error processing regular expression (%s)\n",
3966 str);
3969 /*----------------------------------------------------------------------*
3970 * makeDelimiterTable
3972 * Translate a null-terminated string of delimiters into a 256 byte
3973 * lookup table for determining whether a character is a delimiter or
3974 * not.
3976 * Table must be allocated by the caller.
3978 * Return value is a pointer to the table.
3979 *----------------------------------------------------------------------*/
3981 static unsigned char * makeDelimiterTable (
3982 unsigned char *delimiters,
3983 unsigned char *table) {
3985 unsigned char *c;
3987 memset (table, 0, 256);
3989 for (c = (unsigned char *) delimiters; *c != '\0'; c++) {
3990 table [*c] = 1;
3993 table [(int) NULL] = 1; /* These */
3994 table [(int) '\t'] = 1; /* characters */
3995 table [(int) '\n'] = 1; /* are always */
3996 table [(int) ' ' ] = 1; /* delimiters. */
3998 return table;
4001 /*----------------------------------------------------------------------*
4002 * SetREDefaultWordDelimiters
4004 * Builds a default delimiter table that persists across `ExecRE' calls.
4005 *----------------------------------------------------------------------*/
4007 void SetREDefaultWordDelimiters (char *delimiters) {
4008 makeDelimiterTable ((unsigned char *) delimiters, Default_Delimiters);
4011 void EnableCountingQuantifier (int is_enabled) {
4012 Enable_Counting_Quantifier = is_enabled;