Fix for SF bug #1067402: Error in syntax highlighting with sub-pattern.
[nedit.git] / source / regularExp.c
blobe1794dd160d137be3f81f290b217107ecbc0a617
1 static const char CVSID[] = "$Id: regularExp.c,v 1.26 2004/11/26 18:25:51 edg Exp $";
2 /*------------------------------------------------------------------------*
3 * `CompileRE', `ExecRE', and `substituteRE' -- regular expression parsing
5 * This is a HIGHLY ALTERED VERSION of Henry Spencer's `regcomp' and
6 * `regexec' code adapted for NEdit.
8 * .-------------------------------------------------------------------.
9 * | ORIGINAL COPYRIGHT NOTICE: |
10 * | |
11 * | Copyright (c) 1986 by University of Toronto. |
12 * | Written by Henry Spencer. Not derived from licensed software. |
13 * | |
14 * | Permission is granted to anyone to use this software for any |
15 * | purpose on any computer system, and to redistribute it freely, |
16 * | subject to the following restrictions: |
17 * | |
18 * | 1. The author is not responsible for the consequences of use of |
19 * | this software, no matter how awful, even if they arise |
20 * | from defects in it. |
21 * | |
22 * | 2. The origin of this software must not be misrepresented, either |
23 * | by explicit claim or by omission. |
24 * | |
25 * | 3. Altered versions must be plainly marked as such, and must not |
26 * | be misrepresented as being the original software. |
27 * `-------------------------------------------------------------------'
29 * This is free software; you can redistribute it and/or modify it under the
30 * terms of the GNU General Public License as published by the Free Software
31 * Foundation; either version 2 of the License, or (at your option) any later
32 * version. In addition, you may distribute version of this program linked to
33 * Motif or Open Motif. See README for details.
35 * This software is distributed in the hope that it will be useful, but WITHOUT
36 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
37 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
38 * for more details.
40 * You should have received a copy of the GNU General Public License along with
41 * software; if not, write to the Free Software Foundation, Inc., 59 Temple
42 * Place, Suite 330, Boston, MA 02111-1307 USA
45 * BEWARE that some of this code is subtly aware of the way operator
46 * precedence is structured in regular expressions. Serious changes in
47 * regular-expression syntax might require a total rethink.
48 * -- Henry Spencer
49 * (Yes, it did!) -- Christopher Conrad, Dec. 1999
51 * January, 1994, Mark Edel
52 * Consolidated files, changed names of external functions to avoid
53 * potential conflicts with native regcomp and regexec functions, changed
54 * error reporting to NEdit form, added multi-line and reverse searching,
55 * and added \n \t \u \U \l \L.
57 * June, 1996, Mark Edel
58 * Bug in NEXT macro, didn't work for expressions which compiled to over
59 * 256 bytes.
61 * December, 1999, Christopher Conrad
62 * Reformatted code for readability, improved error output, added octal and
63 * hexadecimal escapes, added back-references (\1-\9), added positive look
64 * ahead: (?=...), added negative lookahead: (?!...), added non-capturing
65 * parentheses: (?:...), added case insensitive constructs (?i...) and
66 * (?I...), added newline matching constructs (?n...) and (?N...), added
67 * regex comments: (?#...), added shortcut escapes: \d\D\l\L\s\S\w\W\y\Y.
68 * Added "not a word boundary" anchor \B.
70 * July, 2002, Eddy De Greef
71 * Added look behind, both positive (?<=...) and negative (?<!...) for
72 * bounded-length patterns.
74 * November, 2004, Eddy De Greef
75 * Added constrained matching (allowing specification of the logical end
76 * of the string iso. matching till \0), and fixed several (probably
77 * very old) string overrun errors that could easily result in crashes,
78 * especially in client code.
81 #ifdef HAVE_CONFIG_H
82 #include "../config.h"
83 #endif
85 #include "regularExp.h"
87 #include <stdio.h>
88 #include <stdlib.h>
89 #include <string.h>
90 #include <ctype.h>
91 #include <limits.h>
93 #ifdef HAVE_DEBUG_H
94 #include "../debug.h"
95 #endif
98 /* The first byte of the regexp internal `program' is a magic number to help
99 gaurd against corrupted data; the compiled regex code really begins in the
100 second byte. */
102 #define MAGIC 0234
104 /* The "internal use only" fields in `regexp.h' are present to pass info from
105 * `CompileRE' to `ExecRE' which permits the execute phase to run lots faster on
106 * simple cases. They are:
108 * match_start Character that must begin a match; '\0' if none obvious.
109 * anchor Is the match anchored (at beginning-of-line only)?
111 * `match_start' and `anchor' permit very fast decisions on suitable starting
112 * points for a match, considerably reducing the work done by ExecRE. */
114 /* STRUCTURE FOR A REGULAR EXPRESSION (regex) `PROGRAM'.
116 * This is essentially a linear encoding of a nondeterministic finite-state
117 * machine or NFA (aka syntax charts or `railroad normal form' in parsing
118 * technology). Each node is an opcode plus a NEXT pointer, possibly
119 * followed by operands. NEXT pointers of all nodes except BRANCH implement
120 * concatenation; a NEXT pointer with a BRANCH on both ends of it is
121 * connecting two alternatives. (Here we have one of the subtle syntax
122 * dependencies: an individual BRANCH (as opposed to a collection of them) is
123 * never concatenated with anything because of operator precedence.) The
124 * operand of some types of nodes is a literal string; for others, it is a node
125 * leading into a sub-FSM. In particular, the operand of a BRANCH node is the
126 * first node of the branch. (NB this is _NOT_ a tree structure: the tail of
127 * the branch connects to the thing following the set of BRANCHes.)
129 * The opcodes are: */
131 /* DEFINITION VALUE MEANING */
133 #define END 1 /* End of program. */
135 /* Zero width positional assertions. */
137 #define BOL 2 /* Match position at beginning of line. */
138 #define EOL 3 /* Match position at end of line. */
139 #define BOWORD 4 /* Match "" representing word delimiter or BOL */
140 #define EOWORD 5 /* Match "" representing word delimiter or EOL */
141 #define NOT_BOUNDARY 6 /* Not word boundary (\B, opposite of < and >) */
143 /* Op codes with null terminated string operands. */
145 #define EXACTLY 7 /* Match this string. */
146 #define SIMILAR 8 /* Match this case insensitive string */
147 #define ANY_OF 9 /* Match any character in the set. */
148 #define ANY_BUT 10 /* Match any character not in the set. */
150 /* Op codes to match any character. */
152 #define ANY 11 /* Match any one character (implements '.') */
153 #define EVERY 12 /* Same as ANY but matches newline. */
155 /* Shortcut escapes, \d, \D, \l, \L, \s, \S, \w, \W, \y, \Y. */
157 #define DIGIT 13 /* Match any digit, i.e. [0123456789] */
158 #define NOT_DIGIT 14 /* Match any non-digit, i.e. [^0123456789] */
159 #define LETTER 15 /* Match any letter character [a-zA-Z] */
160 #define NOT_LETTER 16 /* Match any non-letter character [^a-zA-Z] */
161 #define SPACE 17 /* Match any whitespace character EXCEPT \n */
162 #define SPACE_NL 18 /* Match any whitespace character INCLUDING \n */
163 #define NOT_SPACE 19 /* Match any non-whitespace character */
164 #define NOT_SPACE_NL 20 /* Same as NOT_SPACE but matches newline. */
165 #define WORD_CHAR 21 /* Match any word character [a-zA-Z0-9_] */
166 #define NOT_WORD_CHAR 22 /* Match any non-word character [^a-zA-Z0-9_] */
167 #define IS_DELIM 23 /* Match any character that's a word delimiter */
168 #define NOT_DELIM 24 /* Match any character NOT a word delimiter */
170 /* Quantifier nodes. (Only applied to SIMPLE nodes. Quantifiers applied to non
171 SIMPLE nodes or larger atoms are implemented using complex constructs.)*/
173 #define STAR 25 /* Match this (simple) thing 0 or more times. */
174 #define LAZY_STAR 26 /* Minimal matching STAR */
175 #define QUESTION 27 /* Match this (simple) thing 0 or 1 times. */
176 #define LAZY_QUESTION 28 /* Minimal matching QUESTION */
177 #define PLUS 29 /* Match this (simple) thing 1 or more times. */
178 #define LAZY_PLUS 30 /* Minimal matching PLUS */
179 #define BRACE 31 /* Match this (simple) thing m to n times. */
180 #define LAZY_BRACE 32 /* Minimal matching BRACE */
182 /* Nodes used to build complex constructs. */
184 #define NOTHING 33 /* Match empty string (always matches) */
185 #define BRANCH 34 /* Match this alternative, or the next... */
186 #define BACK 35 /* Always matches, NEXT ptr points backward. */
187 #define INIT_COUNT 36 /* Initialize {m,n} counter to zero */
188 #define INC_COUNT 37 /* Increment {m,n} counter by one */
189 #define TEST_COUNT 38 /* Test {m,n} counter against operand */
191 /* Back Reference nodes. */
193 #define BACK_REF 39 /* Match latest matched parenthesized text */
194 #define BACK_REF_CI 40 /* Case insensitive version of BACK_REF */
195 #define X_REGEX_BR 41 /* Cross-Regex Back-Ref for syntax highlighting */
196 #define X_REGEX_BR_CI 42 /* Case insensitive version of X_REGEX_BR_CI */
198 /* Various nodes used to implement parenthetical constructs. */
200 #define POS_AHEAD_OPEN 43 /* Begin positive look ahead */
201 #define NEG_AHEAD_OPEN 44 /* Begin negative look ahead */
202 #define LOOK_AHEAD_CLOSE 45 /* End positive or negative look ahead */
204 #define POS_BEHIND_OPEN 46 /* Begin positive look behind */
205 #define NEG_BEHIND_OPEN 47 /* Begin negative look behind */
206 #define LOOK_BEHIND_CLOSE 48 /* Close look behind */
208 #define OPEN 49 /* Open for capturing parentheses. */
210 /* OPEN+1 is number 1, etc. */
211 #define CLOSE (OPEN + NSUBEXP) /* Close for capturing parentheses. */
213 #define LAST_PAREN (CLOSE + NSUBEXP)
215 #if (LAST_PAREN > UCHAR_MAX)
216 #error "Too many parentheses for storage in an unsigned char (LAST_PAREN too big.)"
217 #endif
219 /* The next_ptr () function can consume up to 30% of the time during matching
220 because it is called an immense number of times (an average of 25
221 next_ptr() calls per match() call was witnessed for Perl syntax
222 highlighting). Therefore it is well worth removing some of the function
223 call overhead by selectively inlining the next_ptr() calls. Moreover,
224 the inlined code can be simplified for matching because one of the tests,
225 only necessary during compilation, can be left out.
226 The net result of using this inlined version at two critical places is
227 a 25% speedup (again, witnesses on Perl syntax highlighting). */
229 #define NEXT_PTR(in_ptr, out_ptr)\
230 next_ptr_offset = GET_OFFSET (in_ptr);\
231 if (next_ptr_offset == 0)\
232 out_ptr = NULL;\
233 else {\
234 if (GET_OP_CODE (in_ptr) == BACK)\
235 out_ptr = in_ptr - next_ptr_offset;\
236 else \
237 out_ptr = in_ptr + next_ptr_offset;\
240 /* OPCODE NOTES:
241 ------------
243 All nodes consist of an 8 bit op code followed by 2 bytes that make up a 16
244 bit NEXT pointer. Some nodes have a null terminated character string operand
245 following the NEXT pointer. Other nodes may have an 8 bit index operand.
246 The TEST_COUNT node has an index operand followed by a 16 bit test value.
247 The BRACE and LAZY_BRACE nodes have two 16 bit values for min and max but no
248 index value.
250 SIMILAR
251 Operand(s): null terminated string
253 Implements a case insensitive match of a string. Mostly intended for use
254 in syntax highlighting patterns for keywords of languages like FORTRAN
255 and Ada that are case insensitive. The regex text in this node is
256 converted to lower case during regex compile.
258 DIGIT, NOT_DIGIT, LETTER, NOT_LETTER, SPACE, NOT_SPACE, WORD_CHAR,
259 NOT_WORD_CHAR
260 Operand(s): None
262 Implements shortcut escapes \d, \D, \l, \L, \s, \S, \w, \W. The locale
263 aware ANSI functions isdigit(), isalpha(), isalnun(), and isspace() are
264 used to implement these in the hopes of increasing portability.
266 NOT_BOUNDARY
267 Operand(s): None
269 Implements \B as a zero width assertion that the current character is
270 NOT on a word boundary. Word boundaries are defined to be the position
271 between two characters where one of those characters is one of the
272 dynamically defined word delimiters, and the other character is not.
274 IS_DELIM
275 Operand(s): None
277 Implements \y as any character that is one of the dynamically
278 specified word delimiters.
280 NOT_DELIM
281 Operand(s): None
283 Implements \Y as any character that is NOT one of the dynamically
284 specified word delimiters.
286 STAR, PLUS, QUESTION, and complex '*', '+', and '?'
287 Operand(s): None (Note: NEXT pointer is usually zero. The code that
288 processes this node skips over it.)
290 Complex (parenthesized) versions implemented as circular BRANCH
291 structures using BACK. SIMPLE versions (one character per match) are
292 implemented separately for speed and to minimize recursion.
294 BRACE, LAZY_BRACE
295 Operand(s): minimum value (2 bytes), maximum value (2 bytes)
297 Implements the {m,n} construct for atoms that are SIMPLE.
299 BRANCH
300 Operand(s): None
302 The set of branches constituting a single choice are hooked together
303 with their NEXT pointers, since precedence prevents anything being
304 concatenated to any individual branch. The NEXT pointer of the last
305 BRANCH in a choice points to the thing following the whole choice. This
306 is also where the final NEXT pointer of each individual branch points;
307 each branch starts with the operand node of a BRANCH node.
309 BACK
310 Operand(s): None
312 Normal NEXT pointers all implicitly point forward. Back implicitly
313 points backward. BACK exists to make loop structures possible.
315 INIT_COUNT
316 Operand(s): index (1 byte)
318 Initializes the count array element referenced by the index operand.
319 This node is used to build general (i.e. parenthesized) {m,n} constructs.
321 INC_COUNT
322 Operand(s): index (1 byte)
324 Increments the count array element referenced by the index operand.
325 This node is used to build general (i.e. parenthesized) {m,n} constructs.
327 TEST_COUNT
328 Operand(s): index (1 byte), test value (2 bytes)
330 Tests the current value of the count array element specified by the
331 index operand against the test value. If the current value is less than
332 the test value, control passes to the node after that TEST_COUNT node.
333 Otherwise control passes to the node referenced by the NEXT pointer for
334 the TEST_COUNT node. This node is used to build general (i.e.
335 parenthesized) {m,n} constructs.
337 BACK_REF, BACK_REF_CI
338 Operand(s): index (1 byte, value 1-9)
340 Implements back references. This node will attempt to match whatever text
341 was most recently captured by the index'th set of parentheses.
342 BACK_REF_CI is case insensitive version.
344 X_REGEX_BR, X_REGEX_BR_CI
345 (NOT IMPLEMENTED YET)
347 Operand(s): index (1 byte, value 1-9)
349 Implements back references into a previously matched but separate regular
350 expression. This is used by syntax highlighting patterns. This node will
351 attempt to match whatever text was most captured by the index'th set of
352 parentheses of the separate regex passed to ExecRE. X_REGEX_BR_CI is case
353 insensitive version.
355 POS_AHEAD_OPEN, NEG_AHEAD_OPEN, LOOK_AHEAD_CLOSE
357 Operand(s): None
359 Implements positive and negative look ahead. Look ahead is an assertion
360 that something is either there or not there. Once this is determined the
361 regex engine backtracks to where it was just before the look ahead was
362 encountered, i.e. look ahead is a zero width assertion.
364 POS_BEHIND_OPEN, NEG_BEHIND_OPEN, LOOK_BEHIND_CLOSE
366 Operand(s): 2x2 bytes for OPEN (match boundaries), None for CLOSE
368 Implements positive and negative look behind. Look behind is an assertion
369 that something is either there or not there in front of the current
370 position. Look behind is a zero width assertion, with the additional
371 constraint that it must have a bounded length (for complexity and
372 efficiency reasons; note that most other implementation even impose
373 fixed length).
375 OPEN, CLOSE
377 Operand(s): None
379 OPEN + n = Start of parenthesis 'n', CLOSE + n = Close of parenthesis
380 'n', and are numbered at compile time.
383 /* A node is one char of opcode followed by two chars of NEXT pointer plus
384 * any operands. NEXT pointers are stored as two 8-bit pieces, high order
385 * first. The value is a positive offset from the opcode of the node
386 * containing it. An operand, if any, simply follows the node. (Note that
387 * much of the code generation knows about this implicit relationship.)
389 * Using two bytes for NEXT_PTR_SIZE is vast overkill for most things,
390 * but allows patterns to get big without disasters. */
392 #define OP_CODE_SIZE 1
393 #define NEXT_PTR_SIZE 2
394 #define INDEX_SIZE 1
395 #define LENGTH_SIZE 4
396 #define NODE_SIZE (NEXT_PTR_SIZE + OP_CODE_SIZE)
398 #define GET_OP_CODE(p) (*(unsigned char *)(p))
399 #define OPERAND(p) ((p) + NODE_SIZE)
400 #define GET_OFFSET(p) ((( *((p) + 1) & 0377) << 8) + (( *((p) + 2)) & 0377))
401 #define PUT_OFFSET_L(v) (unsigned char)(((v) >> 8) & 0377)
402 #define PUT_OFFSET_R(v) (unsigned char) ((v) & 0377)
403 #define GET_LOWER(p) ((( *((p) + NODE_SIZE) & 0377) << 8) + \
404 (( *((p) + NODE_SIZE+1)) & 0377))
405 #define GET_UPPER(p) ((( *((p) + NODE_SIZE+2) & 0377) << 8) + \
406 (( *((p) + NODE_SIZE+3)) & 0377))
408 /* Utility definitions. */
410 #define REG_FAIL(m) {*Error_Ptr = (m); return (NULL);}
411 #define IS_QUANTIFIER(c) ((c) == '*' || (c) == '+' || \
412 (c) == '?' || (c) == Brace_Char)
413 #define SET_BIT(i,n) ((i) |= (1 << ((n) - 1)))
414 #define TEST_BIT(i,n) ((i) & (1 << ((n) - 1)))
415 #define U_CHAR_AT(p) ((unsigned int) *(unsigned char *)(p))
417 /* Flags to be passed up and down via function parameters during compile. */
419 #define WORST 0 /* Worst case. No assumptions can be made.*/
420 #define HAS_WIDTH 1 /* Known never to match null string. */
421 #define SIMPLE 2 /* Simple enough to be STAR/PLUS operand. */
423 #define NO_PAREN 0 /* Only set by initial call to "chunk". */
424 #define PAREN 1 /* Used for normal capturing parentheses. */
425 #define NO_CAPTURE 2 /* Non-capturing parentheses (grouping only). */
426 #define INSENSITIVE 3 /* Case insensitive parenthetical construct */
427 #define SENSITIVE 4 /* Case sensitive parenthetical construct */
428 #define NEWLINE 5 /* Construct to match newlines in most cases */
429 #define NO_NEWLINE 6 /* Construct to match newlines normally */
431 #define REG_INFINITY 0UL
432 #define REG_ZERO 0UL
433 #define REG_ONE 1UL
435 /* Flags for function shortcut_escape() */
437 #define CHECK_ESCAPE 0 /* Check an escape sequence for validity only. */
438 #define CHECK_CLASS_ESCAPE 1 /* Check the validity of an escape within a
439 character class */
440 #define EMIT_CLASS_BYTES 2 /* Emit equivalent character class bytes,
441 e.g \d=0123456789 */
442 #define EMIT_NODE 3 /* Emit the appropriate node. */
444 /* Array sizes for arrays used by function init_ansi_classes. */
446 #define WHITE_SPACE_SIZE 16
447 #define ALNUM_CHAR_SIZE 256
449 /* Number of bytes to offset from the beginning of the regex program to the start
450 of the actual compiled regex code, i.e. skipping over the MAGIC number and
451 the two counters at the front. */
453 #define REGEX_START_OFFSET 3
455 #define MAX_COMPILED_SIZE 32767UL /* Largest size a compiled regex can be.
456 Probably could be 65535UL. */
458 /* Global work variables for `CompileRE'. */
460 static unsigned char *Reg_Parse; /* Input scan ptr (scans user's regex) */
461 static int Total_Paren; /* Parentheses, (), counter. */
462 static int Num_Braces; /* Number of general {m,n} constructs.
463 {m,n} quantifiers of SIMPLE atoms are
464 not included in this count. */
465 static int Closed_Parens; /* Bit flags indicating () closure. */
466 static int Paren_Has_Width; /* Bit flags indicating ()'s that are
467 known to not match the empty string */
468 static unsigned char Compute_Size; /* Address of this used as flag. */
469 static unsigned char *Code_Emit_Ptr; /* When Code_Emit_Ptr is set to
470 &Compute_Size no code is emitted.
471 Instead, the size of code that WOULD
472 have been generated is accumulated in
473 Reg_Size. Otherwise, Code_Emit_Ptr
474 points to where compiled regex code is
475 to be written. */
476 static unsigned long Reg_Size; /* Size of compiled regex code. */
477 static char **Error_Ptr; /* Place to store error messages so
478 they can be returned by `CompileRE' */
479 static char Error_Text [128];/* Sting to build error messages in. */
481 static unsigned char White_Space [WHITE_SPACE_SIZE]; /* Arrays used by */
482 static unsigned char Word_Char [ALNUM_CHAR_SIZE]; /* functions */
483 static unsigned char Letter_Char [ALNUM_CHAR_SIZE]; /* init_ansi_classes () */
484 /* and
485 shortcut_escape (). */
487 static unsigned char ASCII_Digits [] = "0123456789"; /* Same for all */
488 /* locales. */
489 static int Is_Case_Insensitive;
490 static int Match_Newline;
492 static int Enable_Counting_Quantifier = 1;
493 static unsigned char Brace_Char;
494 static unsigned char Default_Meta_Char [] = "{.*+?[(|)^<>$";
495 static unsigned char *Meta_Char;
497 typedef struct { long lower; long upper; } len_range;
499 /* Forward declarations for functions used by `CompileRE'. */
501 static unsigned char * alternative (int *flag_param, len_range *range_param);
502 static unsigned char * back_ref (unsigned char *c, int *flag_param,
503 int emit);
504 static unsigned char * chunk (int paren, int *flag_param, len_range *range_param);
505 static void emit_byte (unsigned char c);
506 static void emit_class_byte (unsigned char c);
507 static unsigned char * emit_node (int op_code);
508 static unsigned char * emit_special (unsigned char op_code,
509 unsigned long test_val,
510 int index);
511 static unsigned char literal_escape (unsigned char c);
512 static unsigned char numeric_escape (unsigned char c, unsigned char **parse);
513 static unsigned char * atom (int *flag_param, len_range *range_param);
514 static void reg_error (char *str);
515 static unsigned char * insert (unsigned char op, unsigned char *opnd,
516 long min, long max, int index);
517 static unsigned char * next_ptr (unsigned char *ptr);
518 static void offset_tail (unsigned char *ptr, int offset,
519 unsigned char *val);
520 static void branch_tail (unsigned char *ptr, int offset,
521 unsigned char *val);
522 static unsigned char * piece (int *flag_param, len_range *range_param);
523 static void tail (unsigned char *search_from,
524 unsigned char *point_t);
525 static unsigned char * shortcut_escape (unsigned char c, int *flag_param,
526 int emit);
528 static int init_ansi_classes (void);
530 /*----------------------------------------------------------------------*
531 * CompileRE
533 * Compiles a regular expression into the internal format used by
534 * `ExecRE'.
536 * The default behaviour wrt. case sensitivity and newline matching can
537 * be controlled through the defaultFlags argument (Markus Schwarzenberg).
538 * Future extensions are possible by using other flag bits.
539 * Note that currently only the case sensitivity flag is effectively used.
541 * Beware that the optimization and preparation code in here knows about
542 * some of the structure of the compiled regexp.
543 *----------------------------------------------------------------------*/
545 regexp * CompileRE (const char *exp, char **errorText, int defaultFlags) {
547 register regexp *comp_regex = NULL;
548 register unsigned char *scan;
549 int flags_local, pass;
550 len_range range_local;
552 if (Enable_Counting_Quantifier) {
553 Brace_Char = '{';
554 Meta_Char = &Default_Meta_Char [0];
555 } else {
556 Brace_Char = '*'; /* Bypass the '{' in */
557 Meta_Char = &Default_Meta_Char [1]; /* Default_Meta_Char */
560 /* Set up errorText to receive failure reports. */
562 Error_Ptr = errorText;
563 *Error_Ptr = "";
565 if (exp == NULL) REG_FAIL ("NULL argument, `CompileRE\'");
567 /* Initialize arrays used by function `shortcut_escape'. */
569 if (!init_ansi_classes ()) REG_FAIL ("internal error #1, `CompileRE\'");
571 Code_Emit_Ptr = &Compute_Size;
572 Reg_Size = 0UL;
574 /* We can't allocate space until we know how big the compiled form will be,
575 but we can't compile it (and thus know how big it is) until we've got a
576 place to put the code. So we cheat: we compile it twice, once with code
577 generation turned off and size counting turned on, and once "for real".
578 This also means that we don't allocate space until we are sure that the
579 thing really will compile successfully, and we never have to move the
580 code and thus invalidate pointers into it. (Note that it has to be in
581 one piece because free() must be able to free it all.) */
583 for (pass = 1; pass <= 2; pass++) {
584 /*-------------------------------------------*
585 * FIRST PASS: Determine size and legality. *
586 * SECOND PASS: Emit code. *
587 *-------------------------------------------*/
589 /* Schwarzenberg:
590 * If defaultFlags = 0 use standard defaults:
591 * Is_Case_Insensitive: Case sensitive is the default
592 * Match_Newline: Newlines are NOT matched by default
593 * in character classes
595 Is_Case_Insensitive = ((defaultFlags & REDFLT_CASE_INSENSITIVE) ? 1 : 0);
596 Match_Newline = 0; /* ((defaultFlags & REDFLT_MATCH_NEWLINE) ? 1 : 0);
597 Currently not used. Uncomment if needed. */
599 Reg_Parse = (unsigned char *) exp;
600 Total_Paren = 1;
601 Num_Braces = 0;
602 Closed_Parens = 0;
603 Paren_Has_Width = 0;
605 emit_byte (MAGIC);
606 emit_byte ('%'); /* Placeholder for num of capturing parentheses. */
607 emit_byte ('%'); /* Placeholder for num of general {m,n} constructs. */
609 if (chunk (NO_PAREN, &flags_local, &range_local) == NULL)
610 return (NULL); /* Something went wrong */
611 if (pass == 1) {
612 if (Reg_Size >= MAX_COMPILED_SIZE) {
613 /* Too big for NEXT pointers NEXT_PTR_SIZE bytes long to span.
614 This is a real issue since the first BRANCH node usually points
615 to the end of the compiled regex code. */
617 sprintf (Error_Text, "regexp > %lu bytes", MAX_COMPILED_SIZE);
618 REG_FAIL (Error_Text);
621 /* Allocate memory. */
623 comp_regex = (regexp *) malloc (sizeof (regexp) + Reg_Size);
625 if (comp_regex == NULL) REG_FAIL ("out of memory in `CompileRE\'");
627 Code_Emit_Ptr = (unsigned char *) comp_regex->program;
631 comp_regex->program [1] = (unsigned char) Total_Paren - 1;
632 comp_regex->program [2] = (unsigned char) Num_Braces;
634 /*----------------------------------------*
635 * Dig out information for optimizations. *
636 *----------------------------------------*/
638 comp_regex->match_start = '\0'; /* Worst-case defaults. */
639 comp_regex->anchor = 0;
641 /* First BRANCH. */
643 scan = (unsigned char *) (comp_regex->program + REGEX_START_OFFSET);
645 if (GET_OP_CODE (next_ptr (scan)) == END) { /* Only one top-level choice. */
646 scan = OPERAND (scan);
648 /* Starting-point info. */
650 if (GET_OP_CODE (scan) == EXACTLY) {
651 comp_regex->match_start = *OPERAND (scan);
653 } else if (PLUS <= GET_OP_CODE (scan) &&
654 GET_OP_CODE (scan) <= LAZY_PLUS) {
656 /* Allow x+ or x+? at the start of the regex to be
657 optimized. */
659 if (GET_OP_CODE (scan + NODE_SIZE) == EXACTLY) {
660 comp_regex->match_start = *OPERAND (scan + NODE_SIZE);
662 } else if (GET_OP_CODE (scan) == BOL) {
663 comp_regex->anchor++;
667 return (comp_regex);
670 /*----------------------------------------------------------------------*
671 * chunk *
673 * Process main body of regex or process a parenthesized "thing". *
675 * Caller must absorb opening parenthesis. *
677 * Combining parenthesis handling with the base level of regular *
678 * expression is a trifle forced, but the need to tie the tails of the *
679 * branches to what follows makes it hard to avoid. *
680 *----------------------------------------------------------------------*/
682 static unsigned char * chunk (int paren, int *flag_param,
683 len_range *range_param) {
685 register unsigned char *ret_val = NULL;
686 register unsigned char *this_branch;
687 register unsigned char *ender = NULL;
688 register int this_paren = 0;
689 int flags_local, first = 1, zero_width, i;
690 int old_sensitive = Is_Case_Insensitive;
691 int old_newline = Match_Newline;
692 len_range range_local;
693 int look_only = 0;
694 unsigned char *emit_look_behind_bounds = NULL;
697 *flag_param = HAS_WIDTH; /* Tentatively. */
698 range_param->lower = 0; /* Idem */
699 range_param->upper = 0;
701 /* Make an OPEN node, if parenthesized. */
703 if (paren == PAREN) {
704 if (Total_Paren >= NSUBEXP) {
705 sprintf (Error_Text, "number of ()'s > %d", (int) NSUBEXP);
706 REG_FAIL (Error_Text);
709 this_paren = Total_Paren; Total_Paren++;
710 ret_val = emit_node (OPEN + this_paren);
711 } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
712 *flag_param = WORST; /* Look ahead is zero width. */
713 look_only = 1;
714 ret_val = emit_node (paren);
715 } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
716 *flag_param = WORST; /* Look behind is zero width. */
717 look_only = 1;
718 /* We'll overwrite the zero length later on, so we save the ptr */
719 ret_val = emit_special (paren, 0, 0);
720 emit_look_behind_bounds = ret_val + NODE_SIZE;
721 } else if (paren == INSENSITIVE) {
722 Is_Case_Insensitive = 1;
723 } else if (paren == SENSITIVE) {
724 Is_Case_Insensitive = 0;
725 } else if (paren == NEWLINE) {
726 Match_Newline = 1;
727 } else if (paren == NO_NEWLINE) {
728 Match_Newline = 0;
731 /* Pick up the branches, linking them together. */
733 do {
734 this_branch = alternative (&flags_local, &range_local);
736 if (this_branch == NULL) return (NULL);
738 if (first) {
739 first = 0;
740 *range_param = range_local;
741 if (ret_val == NULL) ret_val = this_branch;
742 } else if (range_param->lower >= 0) {
743 if (range_local.lower >= 0) {
744 if (range_local.lower < range_param->lower)
745 range_param->lower = range_local.lower;
746 if (range_local.upper > range_param->upper)
747 range_param->upper = range_local.upper;
748 } else {
749 range_param->lower = -1; /* Branches have different lengths */
750 range_param->upper = -1;
754 tail (ret_val, this_branch); /* Connect BRANCH -> BRANCH. */
756 /* If any alternative could be zero width, consider the whole
757 parenthisized thing to be zero width. */
759 if (!(flags_local & HAS_WIDTH)) *flag_param &= ~HAS_WIDTH;
761 /* Are there more alternatives to process? */
763 if (*Reg_Parse != '|') break;
765 Reg_Parse++;
766 } while (1);
768 /* Make a closing node, and hook it on the end. */
770 if (paren == PAREN) {
771 ender = emit_node (CLOSE + this_paren);
773 } else if (paren == NO_PAREN) {
774 ender = emit_node (END);
776 } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
777 ender = emit_node (LOOK_AHEAD_CLOSE);
779 } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
780 ender = emit_node (LOOK_BEHIND_CLOSE);
782 } else {
783 ender = emit_node (NOTHING);
786 tail (ret_val, ender);
788 /* Hook the tails of the branch alternatives to the closing node. */
790 for (this_branch = ret_val; this_branch != NULL; ) {
791 branch_tail (this_branch, NODE_SIZE, ender);
792 this_branch = next_ptr (this_branch);
795 /* Check for proper termination. */
797 if (paren != NO_PAREN && *Reg_Parse++ != ')') {
798 REG_FAIL ("missing right parenthesis \')\'");
799 } else if (paren == NO_PAREN && *Reg_Parse != '\0') {
800 if (*Reg_Parse == ')') {
801 REG_FAIL ("missing left parenthesis \'(\'");
802 } else {
803 REG_FAIL ("junk on end"); /* "Can't happen" - NOTREACHED */
807 /* Check whether look behind has a fixed size */
809 if (emit_look_behind_bounds) {
810 if (range_param->lower < 0) {
811 REG_FAIL ("look-behind does not have a bounded size");
813 if (range_param->upper > 65535L) {
814 REG_FAIL ("max. look-behind size is too large (>65535)")
816 if (Code_Emit_Ptr != &Compute_Size) {
817 *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->lower);
818 *emit_look_behind_bounds++ = PUT_OFFSET_R (range_param->lower);
819 *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->upper);
820 *emit_look_behind_bounds = PUT_OFFSET_R (range_param->upper);
824 /* For look ahead/behind, the length must be set to zero again */
825 if (look_only) {
826 range_param->lower = 0;
827 range_param->upper = 0;
830 zero_width = 0;
832 /* Set a bit in Closed_Parens to let future calls to function `back_ref'
833 know that we have closed this set of parentheses. */
835 if (paren == PAREN && this_paren <= (int)sizeof (Closed_Parens) * CHAR_BIT) {
836 SET_BIT (Closed_Parens, this_paren);
838 /* Determine if a parenthesized expression is modified by a quantifier
839 that can have zero width. */
841 if (*(Reg_Parse) == '?' || *(Reg_Parse) == '*') {
842 zero_width++;
843 } else if (*(Reg_Parse) == '{' && Brace_Char == '{') {
844 if (*(Reg_Parse + 1) == ',' || *(Reg_Parse + 1) == '}') {
845 zero_width++;
846 } else if (*(Reg_Parse + 1) == '0') {
847 i = 2;
849 while (*(Reg_Parse + i) == '0') i++;
851 if (*(Reg_Parse + i) == ',') zero_width++;
856 /* If this set of parentheses is known to never match the empty string, set
857 a bit in Paren_Has_Width to let future calls to function back_ref know
858 that this set of parentheses has non-zero width. This will allow star
859 (*) or question (?) quantifiers to be aplied to a back-reference that
860 refers to this set of parentheses. */
862 if ((*flag_param & HAS_WIDTH) &&
863 paren == PAREN &&
864 !zero_width &&
865 this_paren <= (int)(sizeof (Paren_Has_Width) * CHAR_BIT)) {
867 SET_BIT (Paren_Has_Width, this_paren);
870 Is_Case_Insensitive = old_sensitive;
871 Match_Newline = old_newline;
873 return (ret_val);
876 /*----------------------------------------------------------------------*
877 * alternative
879 * Processes one alternative of an '|' operator. Connects the NEXT
880 * pointers of each regex atom together sequentialy.
881 *----------------------------------------------------------------------*/
883 static unsigned char * alternative (int *flag_param, len_range *range_param) {
885 register unsigned char *ret_val;
886 register unsigned char *chain;
887 register unsigned char *latest;
888 int flags_local;
889 len_range range_local;
891 *flag_param = WORST; /* Tentatively. */
892 range_param->lower = 0; /* Idem */
893 range_param->upper = 0;
895 ret_val = emit_node (BRANCH);
896 chain = NULL;
898 /* Loop until we hit the start of the next alternative, the end of this set
899 of alternatives (end of parentheses), or the end of the regex. */
901 while (*Reg_Parse != '|' && *Reg_Parse != ')' && *Reg_Parse != '\0') {
902 latest = piece (&flags_local, &range_local);
904 if (latest == NULL) return (NULL); /* Something went wrong. */
906 *flag_param |= flags_local & HAS_WIDTH;
907 if (range_local.lower < 0) {
908 /* Not a fixed length */
909 range_param->lower = -1;
910 range_param->upper = -1;
911 } else if (range_param->lower >= 0) {
912 range_param->lower += range_local.lower;
913 range_param->upper += range_local.upper;
916 if (chain != NULL) { /* Connect the regex atoms together sequentialy. */
917 tail (chain, latest);
920 chain = latest;
923 if (chain == NULL) { /* Loop ran zero times. */
924 (void) emit_node (NOTHING);
927 return (ret_val);
930 /*----------------------------------------------------------------------*
931 * piece - something followed by possible '*', '+', '?', or "{m,n}"
933 * Note that the branching code sequences used for the general cases of
934 * *, +. ?, and {m,n} are somewhat optimized: they use the same
935 * NOTHING node as both the endmarker for their branch list and the
936 * body of the last branch. It might seem that this node could be
937 * dispensed with entirely, but the endmarker role is not redundant.
938 *----------------------------------------------------------------------*/
940 static unsigned char * piece (int *flag_param, len_range *range_param) {
942 register unsigned char *ret_val;
943 register unsigned char *next;
944 register unsigned char op_code;
945 unsigned long min_max [2] = {REG_ZERO, REG_INFINITY};
946 int flags_local, i, brace_present = 0;
947 int lazy = 0, comma_present = 0;
948 int digit_present [2] = {0,0};
949 len_range range_local;
951 ret_val = atom (&flags_local, &range_local);
953 if (ret_val == NULL) return (NULL); /* Something went wrong. */
955 op_code = *Reg_Parse;
957 if (!IS_QUANTIFIER (op_code)) {
958 *flag_param = flags_local;
959 *range_param = range_local;
960 return (ret_val);
961 } else if (op_code == '{') { /* {n,m} quantifier present */
962 brace_present++;
963 Reg_Parse++;
965 /* This code will allow specifying a counting range in any of the
966 following forms:
968 {m,n} between m and n.
969 {,n} same as {0,n} or between 0 and infinity.
970 {m,} same as {m,0} or between m and infinity.
971 {m} same as {m,m} or exactly m.
972 {,} same as {0,0} or between 0 and infinity or just '*'.
973 {} same as {0,0} or between 0 and infinity or just '*'.
975 Note that specifying a max of zero, {m,0} is not allowed in the regex
976 itself, but it is implemented internally that way to support '*', '+',
977 and {min,} constructs and signals an unlimited number. */
979 for (i = 0; i < 2; i++) {
980 /* Look for digits of number and convert as we go. The numeric maximum
981 value for max and min of 65,535 is due to using 2 bytes to store
982 each value in the compiled regex code. */
984 while (isdigit (*Reg_Parse)) {
985 /* (6553 * 10 + 6) > 65535 (16 bit max) */
987 if ((min_max [i] == 6553UL && (*Reg_Parse - '0') <= 5) ||
988 (min_max [i] <= 6552UL)) {
990 min_max [i] = (min_max [i] * 10UL) +
991 (unsigned long) (*Reg_Parse - '0');
992 Reg_Parse++;
994 digit_present [i]++;
995 } else {
996 if (i == 0) {
997 sprintf (Error_Text, "min operand of {%lu%c,???} > 65535",
998 min_max [0], *Reg_Parse);
999 } else {
1000 sprintf (Error_Text, "max operand of {%lu,%lu%c} > 65535",
1001 min_max [0], min_max [1], *Reg_Parse);
1004 REG_FAIL (Error_Text);
1008 if (!comma_present && *Reg_Parse == ',') {
1009 comma_present++;
1010 Reg_Parse++;
1014 /* A max of zero can not be specified directly in the regex since it would
1015 signal a max of infinity. This code specifically disallows `{0,0}',
1016 `{,0}', and `{0}' which really means nothing to humans but would be
1017 interpreted as `{0,infinity}' or `*' if we didn't make this check. */
1019 if (digit_present [0] && (min_max [0] == REG_ZERO) && !comma_present) {
1021 REG_FAIL ("{0} is an invalid range");
1022 } else if (digit_present [0] && (min_max [0] == REG_ZERO) &&
1023 digit_present [1] && (min_max [1] == REG_ZERO)) {
1025 REG_FAIL ("{0,0} is an invalid range");
1026 } else if (digit_present [1] && (min_max [1] == REG_ZERO)) {
1027 if (digit_present [0]) {
1028 sprintf (Error_Text, "{%lu,0} is an invalid range", min_max [0]);
1029 REG_FAIL (Error_Text);
1030 } else {
1031 REG_FAIL ("{,0} is an invalid range");
1035 if (!comma_present) min_max [1] = min_max [0]; /* {x} means {x,x} */
1037 if (*Reg_Parse != '}') {
1038 REG_FAIL ("{m,n} specification missing right \'}\'");
1040 } else if (min_max [1] != REG_INFINITY && min_max [0] > min_max [1]) {
1041 /* Disallow a backward range. */
1043 sprintf (Error_Text, "{%lu,%lu} is an invalid range",
1044 min_max [0], min_max [1]);
1045 REG_FAIL (Error_Text);
1049 Reg_Parse++;
1051 /* Check for a minimal matching (non-greedy or "lazy") specification. */
1053 if (*Reg_Parse == '?') {
1054 lazy = 1;
1055 Reg_Parse++;
1058 /* Avoid overhead of counting if possible */
1060 if (op_code == '{') {
1061 if (min_max [0] == REG_ZERO && min_max [1] == REG_INFINITY) {
1062 op_code = '*';
1063 } else if (min_max [0] == REG_ONE && min_max [1] == REG_INFINITY) {
1064 op_code = '+';
1065 } else if (min_max [0] == REG_ZERO && min_max [1] == REG_ONE) {
1066 op_code = '?';
1067 } else if (min_max [0] == REG_ONE && min_max [1] == REG_ONE) {
1068 /* "x{1,1}" is the same as "x". No need to pollute the compiled
1069 regex with such nonsense. */
1071 *flag_param = flags_local;
1072 *range_param = range_local;
1073 return (ret_val);
1074 } else if (Num_Braces > (int)UCHAR_MAX) {
1075 sprintf (Error_Text, "number of {m,n} constructs > %d", UCHAR_MAX);
1076 REG_FAIL (Error_Text);
1080 if (op_code == '+') min_max [0] = REG_ONE;
1081 if (op_code == '?') min_max [1] = REG_ONE;
1083 /* It is dangerous to apply certain quantifiers to a possibly zero width
1084 item. */
1086 if (!(flags_local & HAS_WIDTH)) {
1087 if (brace_present) {
1088 sprintf (Error_Text, "{%lu,%lu} operand could be empty",
1089 min_max [0], min_max [1]);
1090 } else {
1091 sprintf (Error_Text, "%c operand could be empty", op_code);
1094 REG_FAIL (Error_Text);
1097 *flag_param = (min_max [0] > REG_ZERO) ? (WORST | HAS_WIDTH) : WORST;
1098 if (range_local.lower >= 0) {
1099 if (min_max[1] != REG_INFINITY) {
1100 range_param->lower = range_local.lower * min_max[0];
1101 range_param->upper = range_local.upper * min_max[1];
1102 } else {
1103 range_param->lower = -1; /* Not a fixed-size length */
1104 range_param->upper = -1;
1106 } else {
1107 range_param->lower = -1; /* Not a fixed-size length */
1108 range_param->upper = -1;
1111 /*---------------------------------------------------------------------*
1112 * Symbol Legend For Node Structure Diagrams
1113 *---------------------------------------------------------------------*
1114 * (...) = general grouped thing
1115 * B = (B)ranch, K = bac(K), N = (N)othing
1116 * I = (I)nitialize count, C = Increment (C)ount
1117 * T~m = (T)est against mini(m)um- go to NEXT pointer if >= operand
1118 * T~x = (T)est against ma(x)imum- go to NEXT pointer if >= operand
1119 * '~' = NEXT pointer, \___| = forward pointer, |___/ = Backward pointer
1120 *---------------------------------------------------------------------*/
1122 if (op_code == '*' && (flags_local & SIMPLE)) {
1123 insert ((lazy ? LAZY_STAR : STAR), ret_val, 0UL, 0UL, 0);
1125 } else if (op_code == '+' && (flags_local & SIMPLE)) {
1126 insert (lazy ? LAZY_PLUS : PLUS, ret_val, 0UL, 0UL, 0);
1128 } else if (op_code == '?' && (flags_local & SIMPLE)) {
1129 insert (lazy ? LAZY_QUESTION : QUESTION, ret_val, 0UL, 0UL, 0);
1131 } else if (op_code == '{' && (flags_local & SIMPLE)) {
1132 insert (lazy ? LAZY_BRACE : BRACE, ret_val, min_max [0], min_max [1], 0);
1134 } else if ((op_code == '*' || op_code == '+') && lazy) {
1135 /* Node structure for (x)*? Node structure for (x)+? construct.
1136 * construct. (Same as (x)*? except for initial
1137 * forward jump into parenthesis.)
1139 * ___6____
1140 * _______5_______ /________|______
1141 * | _4__ 1_\ /| ____ | _\
1142 * |/ | / |\ / |/ | | / |\
1143 * B~ N~ B~ (...)~ K~ N~ N~ B~ N~ B~ (...)~ K~ N~
1144 * \ \___2_______| \ \___________|
1145 * \_____3_______| \_____________|
1149 tail (ret_val, emit_node (BACK)); /* 1 */
1150 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 2,4 */
1151 (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 3 */
1153 next = emit_node (NOTHING); /* 2,3 */
1155 offset_tail (ret_val, NODE_SIZE, next); /* 2 */
1156 tail (ret_val, next); /* 3 */
1157 insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 4,5 */
1158 tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 4 */
1159 offset_tail (ret_val, 3 * NODE_SIZE, ret_val); /* 5 */
1161 if (op_code == '+') {
1162 insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 6 */
1163 tail (ret_val, ret_val + (4 * NODE_SIZE)); /* 6 */
1165 } else if (op_code == '*') {
1166 /* Node structure for (x)* construct.
1167 * ____1_____
1168 * | \
1169 * B~ (...)~ K~ B~ N~
1170 * \ \_|2 |\_|
1171 * \__3_______| 4
1174 insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 1,3 */
1175 offset_tail (ret_val, NODE_SIZE, emit_node (BACK)); /* 2 */
1176 offset_tail (ret_val, NODE_SIZE, ret_val); /* 1 */
1177 tail (ret_val, emit_node (BRANCH)); /* 3 */
1178 tail (ret_val, emit_node (NOTHING)); /* 4 */
1179 } else if (op_code == '+') {
1180 /* Node structure for (x)+ construct.
1182 * ____2_____
1183 * | \
1184 * (...)~ B~ K~ B~ N~
1185 * \_|\____|\_|
1186 * 1 3 4
1189 next = emit_node (BRANCH); /* 1 */
1191 tail (ret_val, next); /* 1 */
1192 tail (emit_node (BACK), ret_val); /* 2 */
1193 tail (next, emit_node (BRANCH)); /* 3 */
1194 tail (ret_val, emit_node (NOTHING)); /* 4 */
1195 } else if (op_code == '?' && lazy) {
1196 /* Node structure for (x)?? construct.
1197 * _4__ 1_
1198 * / | / |
1199 * B~ N~ B~ (...)~ N~
1200 * \ \___2____|
1201 * \_____3____|
1204 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 2,4 */
1205 (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 3 */
1207 next = emit_node (NOTHING); /* 1,2,3 */
1209 offset_tail (ret_val, 2 * NODE_SIZE, next); /* 1 */
1210 offset_tail (ret_val, NODE_SIZE, next); /* 2 */
1211 tail (ret_val, next); /* 3 */
1212 insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 4 */
1213 tail (ret_val, (ret_val + (2 * NODE_SIZE))); /* 4 */
1215 } else if (op_code == '?') {
1216 /* Node structure for (x)? construct.
1217 * ___1____ _2
1218 * / |/ |
1219 * B~ (...)~ B~ N~
1220 * \__3_|
1223 insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 1 */
1224 tail (ret_val, emit_node (BRANCH)); /* 1 */
1226 next = emit_node (NOTHING); /* 2,3 */
1228 tail (ret_val, next); /* 2 */
1229 offset_tail (ret_val, NODE_SIZE, next); /* 3 */
1230 } else if (op_code == '{' && min_max [0] == min_max [1]) {
1231 /* Node structure for (x){m}, (x){m}?, (x){m,m}, or (x){m,m}? constructs.
1232 * Note that minimal and maximal matching mean the same thing when we
1233 * specify the minimum and maximum to be the same value.
1234 * _______3_____
1235 * | 1_ _2 \
1236 * | / |/ | \
1237 * I~ (...)~ C~ T~m K~ N~
1238 * \_| \_____|
1239 * 5 4
1242 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1243 tail (ret_val, emit_special (TEST_COUNT, min_max [0], Num_Braces));/* 2 */
1244 tail (emit_node (BACK), ret_val); /* 3 */
1245 tail (ret_val, emit_node (NOTHING)); /* 4 */
1247 next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 5 */
1249 tail (ret_val, next); /* 5 */
1251 Num_Braces++;
1252 } else if (op_code == '{' && lazy) {
1253 if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
1254 /* Node structure for (x){0,n}? or {,n}? construct.
1255 * _________3____________
1256 * 8_| _4__ 1_ _2 \
1257 * / |/ | / |/ | \
1258 * I~ B~ N~ B~ (...)~ C~ T~x K~ N~
1259 * \ \ \__7__|
1260 * \ \_________6_______|
1261 * \______5____________|
1264 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1266 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,7 */
1268 tail (ret_val, next); /* 2 */
1269 (void) insert (BRANCH, ret_val, 0UL, 0UL, Num_Braces); /* 4,6 */
1270 (void) insert (NOTHING, ret_val, 0UL, 0UL, Num_Braces); /* 5 */
1271 (void) insert (BRANCH, ret_val, 0UL, 0UL, Num_Braces); /* 3,4,8 */
1272 tail (emit_node (BACK), ret_val); /* 3 */
1273 tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 4 */
1275 next = emit_node (NOTHING); /* 5,6,7 */
1277 offset_tail (ret_val, NODE_SIZE, next); /* 5 */
1278 offset_tail (ret_val, 2 * NODE_SIZE, next); /* 6 */
1279 offset_tail (ret_val, 3 * NODE_SIZE, next); /* 7 */
1281 next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 8 */
1283 tail (ret_val, next); /* 8 */
1285 } else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
1286 /* Node structure for (x){m,}? construct.
1287 * ______8_________________
1288 * | _______3_____ \
1289 * | _7__ | 1_ _2 \ \
1290 * |/ | | / |/ | \ \
1291 * I~ B~ N~ B~ (...)~ C~ T~m K~ K~ N~
1292 * \_____\__\_| \_4___| |
1293 * 9 \ \_________5__________|
1294 * \_______6______________|
1297 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1299 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,4 */
1301 tail (ret_val, next); /* 2 */
1302 tail (emit_node (BACK), ret_val); /* 3 */
1303 tail (ret_val, emit_node (BACK)); /* 4 */
1304 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 5,7 */
1305 (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 6 */
1307 next = emit_node (NOTHING); /* 5,6 */
1309 offset_tail (ret_val, NODE_SIZE, next); /* 5 */
1310 tail (ret_val, next); /* 6 */
1311 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 7,8 */
1312 tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 7 */
1313 offset_tail (ret_val, 3 * NODE_SIZE, ret_val); /* 8 */
1314 (void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
1315 tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE)); /* 9 */
1317 } else {
1318 /* Node structure for (x){m,n}? construct.
1319 * ______9_____________________
1320 * | _____________3___ \
1321 * | __8_ | 1_ _2 \ \
1322 * |/ | | / |/ | \ \
1323 * I~ B~ N~ B~ (...)~ C~ T~x T~m K~ K~ N~
1324 * \_____\__\_| \ \__4__| |
1325 * 10 \ \ \_7_________|
1326 * \ \_________6_____________|
1327 * \_______5_________________|
1330 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1332 next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,7 */
1334 tail (ret_val, next); /* 2 */
1336 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
1338 tail (emit_node (BACK), ret_val); /* 3 */
1339 tail (next, emit_node (BACK)); /* 4 */
1340 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 6,8 */
1341 (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 5 */
1342 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 8,9 */
1344 next = emit_node (NOTHING); /* 5,6,7 */
1346 offset_tail (ret_val, NODE_SIZE, next); /* 5 */
1347 offset_tail (ret_val, 2 * NODE_SIZE, next); /* 6 */
1348 offset_tail (ret_val, 3 * NODE_SIZE, next); /* 7 */
1349 tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 8 */
1350 offset_tail (next, -NODE_SIZE, ret_val); /* 9 */
1351 insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 10 */
1352 tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE)); /* 10 */
1355 Num_Braces++;
1356 } else if (op_code == '{') {
1357 if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
1358 /* Node structure for (x){0,n} or (x){,n} construct.
1360 * ___3____________
1361 * | 1_ _2 \ 5_
1362 * | / |/ | \ / |
1363 * I~ B~ (...)~ C~ T~x K~ B~ N~
1364 * \_|\ \_6___|__|
1365 * 7 \________4________|
1368 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1370 next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,6 */
1372 tail (ret_val, next); /* 2 */
1373 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 3,4,7 */
1374 tail (emit_node (BACK), ret_val); /* 3 */
1376 next = emit_node (BRANCH); /* 4,5 */
1378 tail (ret_val, next); /* 4 */
1379 tail (next, emit_node (NOTHING)); /* 5,6 */
1380 offset_tail (ret_val, NODE_SIZE, next); /* 6 */
1382 next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 7 */
1384 tail (ret_val, next); /* 7 */
1386 } else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
1387 /* Node structure for (x){m,} construct.
1388 * __________4________
1389 * | __3__________ \
1390 * _|___| 1_ _2 \ \ _7
1391 * / | 8 | / |/ | \ \ / |
1392 * I~ B~ (...)~ C~ T~m K~ K~ B~ N~
1393 * \ \_5___| |
1394 * \__________6__________|
1397 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1399 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2 */
1401 tail (ret_val, next); /* 2 */
1402 tail (emit_node (BACK), ret_val); /* 3 */
1403 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 4,6 */
1405 next = emit_node (BACK); /* 4 */
1407 tail (next, ret_val); /* 4 */
1408 offset_tail (ret_val, NODE_SIZE, next); /* 5 */
1409 tail (ret_val, emit_node (BRANCH)); /* 6 */
1410 tail (ret_val, emit_node (NOTHING)); /* 7 */
1412 insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 8 */
1414 tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE)); /* 8 */
1416 } else {
1417 /* Node structure for (x){m,n} construct.
1418 * _____6________________
1419 * | _____________3___ \
1420 * 9_|__| 1_ _2 \ \ _8
1421 * / | | / |/ | \ \ / |
1422 * I~ B~ (...)~ C~ T~x T~m K~ K~ B~ N~
1423 * \ \ \__4__| | |
1424 * \ \_7_________|__|
1425 * \_________5_____________|
1428 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1430 next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,4 */
1432 tail (ret_val, next); /* 2 */
1434 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
1436 tail (emit_node (BACK), ret_val); /* 3 */
1437 tail (next, emit_node (BACK)); /* 4 */
1438 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 5,6 */
1440 next = emit_node (BRANCH); /* 5,8 */
1442 tail (ret_val, next); /* 5 */
1443 offset_tail (next, -NODE_SIZE, ret_val); /* 6 */
1445 next = emit_node (NOTHING); /* 7,8 */
1447 offset_tail (ret_val, NODE_SIZE, next); /* 7 */
1449 offset_tail (next, -NODE_SIZE, next); /* 8 */
1450 (void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
1451 tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE)); /* 9 */
1454 Num_Braces++;
1455 } else {
1456 /* We get here if the IS_QUANTIFIER macro is not coordinated properly
1457 with this function. */
1459 REG_FAIL ("internal error #2, `piece\'");
1462 if (IS_QUANTIFIER (*Reg_Parse)) {
1463 if (op_code == '{') {
1464 sprintf (Error_Text, "nested quantifiers, {m,n}%c", *Reg_Parse);
1465 } else {
1466 sprintf (Error_Text, "nested quantifiers, %c%c", op_code, *Reg_Parse);
1469 REG_FAIL (Error_Text);
1472 return (ret_val);
1475 /*----------------------------------------------------------------------*
1476 * atom
1478 * Process one regex item at the lowest level
1480 * OPTIMIZATION: Lumps a continuous sequence of ordinary characters
1481 * together so that it can turn them into a single EXACTLY node, which
1482 * is smaller to store and faster to run.
1483 *----------------------------------------------------------------------*/
1485 static unsigned char * atom (int *flag_param, len_range *range_param) {
1487 register unsigned char *ret_val;
1488 unsigned char test;
1489 int flags_local;
1490 len_range range_local;
1492 *flag_param = WORST; /* Tentatively. */
1493 range_param->lower = 0; /* Idem */
1494 range_param->upper = 0;
1496 /* Process any regex comments, e.g. `(?# match next token->)'. The
1497 terminating right parenthesis can not be escaped. The comment stops at
1498 the first right parenthesis encountered (or the end of the regex
1499 string)... period. Handles multiple sequential comments,
1500 e.g. `(?# one)(?# two)...' */
1502 while (*Reg_Parse == '(' &&
1503 *(Reg_Parse + 1) == '?' &&
1504 *(Reg_Parse + 2) == '#') {
1506 Reg_Parse += 3;
1508 while (*Reg_Parse != ')' && *Reg_Parse != '\0') {
1509 Reg_Parse++;
1512 if (*Reg_Parse == ')') {
1513 Reg_Parse++;
1516 if (*Reg_Parse == ')' || *Reg_Parse == '|' || *Reg_Parse == '\0') {
1517 /* Hit end of regex string or end of parenthesized regex; have to
1518 return "something" (i.e. a NOTHING node) to avoid generating an
1519 error. */
1521 ret_val = emit_node (NOTHING);
1523 return (ret_val);
1527 switch (*Reg_Parse++) {
1528 case '^':
1529 ret_val = emit_node (BOL);
1530 break;
1532 case '$':
1533 ret_val = emit_node (EOL);
1534 break;
1536 case '<':
1537 ret_val = emit_node (BOWORD);
1538 break;
1540 case '>':
1541 ret_val = emit_node (EOWORD);
1542 break;
1544 case '.':
1545 if (Match_Newline) {
1546 ret_val = emit_node (EVERY);
1547 } else {
1548 ret_val = emit_node (ANY);
1551 *flag_param |= (HAS_WIDTH | SIMPLE);
1552 range_param->lower = 1;
1553 range_param->upper = 1;
1554 break;
1556 case '(':
1557 if (*Reg_Parse == '?') { /* Special parenthetical expression */
1558 Reg_Parse++;
1559 range_local.lower = 0; /* Make sure it is always used */
1560 range_local.upper = 0;
1562 if (*Reg_Parse == ':') {
1563 Reg_Parse++;
1564 ret_val = chunk (NO_CAPTURE, &flags_local, &range_local);
1565 } else if (*Reg_Parse == '=') {
1566 Reg_Parse++;
1567 ret_val = chunk (POS_AHEAD_OPEN, &flags_local, &range_local);
1568 } else if (*Reg_Parse == '!') {
1569 Reg_Parse++;
1570 ret_val = chunk (NEG_AHEAD_OPEN, &flags_local, &range_local);
1571 } else if (*Reg_Parse == 'i') {
1572 Reg_Parse++;
1573 ret_val = chunk (INSENSITIVE, &flags_local, &range_local);
1574 } else if (*Reg_Parse == 'I') {
1575 Reg_Parse++;
1576 ret_val = chunk (SENSITIVE, &flags_local, &range_local);
1577 } else if (*Reg_Parse == 'n') {
1578 Reg_Parse++;
1579 ret_val = chunk (NEWLINE, &flags_local, &range_local);
1580 } else if (*Reg_Parse == 'N') {
1581 Reg_Parse++;
1582 ret_val = chunk (NO_NEWLINE, &flags_local, &range_local);
1583 } else if (*Reg_Parse == '<') {
1584 Reg_Parse++;
1585 if (*Reg_Parse == '=') {
1586 Reg_Parse++;
1587 ret_val = chunk (POS_BEHIND_OPEN, &flags_local, &range_local);
1588 } else if (*Reg_Parse == '!') {
1589 Reg_Parse++;
1590 ret_val = chunk (NEG_BEHIND_OPEN, &flags_local, &range_local);
1591 } else {
1592 sprintf (Error_Text,
1593 "invalid look-behind syntax, \"(?<%c...)\"",
1594 *Reg_Parse);
1596 REG_FAIL (Error_Text);
1598 } else {
1599 sprintf (Error_Text,
1600 "invalid grouping syntax, \"(?%c...)\"",
1601 *Reg_Parse);
1603 REG_FAIL (Error_Text);
1605 } else { /* Normal capturing parentheses */
1606 ret_val = chunk (PAREN, &flags_local, &range_local);
1609 if (ret_val == NULL) return (NULL); /* Something went wrong. */
1611 /* Add HAS_WIDTH flag if it was set by call to chunk. */
1613 *flag_param |= flags_local & HAS_WIDTH;
1614 *range_param = range_local;
1616 break;
1618 case '\0':
1619 case '|':
1620 case ')':
1621 REG_FAIL ("internal error #3, `atom\'"); /* Supposed to be */
1622 /* caught earlier. */
1623 case '?':
1624 case '+':
1625 case '*':
1626 sprintf (Error_Text, "%c follows nothing", *(Reg_Parse - 1));
1627 REG_FAIL (Error_Text);
1629 case '{':
1630 if (Enable_Counting_Quantifier) {
1631 REG_FAIL ("{m,n} follows nothing");
1632 } else {
1633 ret_val = emit_node (EXACTLY); /* Treat braces as literals. */
1634 emit_byte ('{');
1635 emit_byte ('\0');
1636 range_param->lower = 1;
1637 range_param->upper = 1;
1640 break;
1642 case '[':
1644 register unsigned int second_value;
1645 register unsigned int last_value;
1646 unsigned char last_emit = 0;
1648 /* Handle characters that can only occur at the start of a class. */
1650 if (*Reg_Parse == '^') { /* Complement of range. */
1651 ret_val = emit_node (ANY_BUT);
1652 Reg_Parse++;
1654 /* All negated classes include newline unless escaped with
1655 a "(?n)" switch. */
1657 if (!Match_Newline) emit_byte ('\n');
1658 } else {
1659 ret_val = emit_node (ANY_OF);
1662 if (*Reg_Parse == ']' || *Reg_Parse == '-') {
1663 /* If '-' or ']' is the first character in a class,
1664 it is a literal character in the class. */
1666 last_emit = *Reg_Parse;
1667 emit_byte (*Reg_Parse);
1668 Reg_Parse++;
1671 /* Handle the rest of the class characters. */
1673 while (*Reg_Parse != '\0' && *Reg_Parse != ']') {
1674 if (*Reg_Parse == '-') { /* Process a range, e.g [a-z]. */
1675 Reg_Parse++;
1677 if (*Reg_Parse == ']' || *Reg_Parse == '\0') {
1678 /* If '-' is the last character in a class it is a literal
1679 character. If `Reg_Parse' points to the end of the
1680 regex string, an error will be generated later. */
1682 emit_byte ('-');
1683 last_emit = '-';
1684 } else {
1685 /* We must get the range starting character value from the
1686 emitted code since it may have been an escaped
1687 character. `second_value' is set one larger than the
1688 just emitted character value. This is done since
1689 `second_value' is used as the start value for the loop
1690 that emits the values in the range. Since we have
1691 already emitted the first character of the class, we do
1692 not want to emit it again. */
1694 second_value = ((unsigned int) last_emit) + 1;
1696 if (*Reg_Parse == '\\') {
1697 /* Handle escaped characters within a class range.
1698 Specifically disallow shortcut escapes as the end of
1699 a class range. To allow this would be ambiguous
1700 since shortcut escapes represent a set of characters,
1701 and it would not be clear which character of the
1702 class should be treated as the "last" character. */
1704 Reg_Parse++;
1706 if ((test = numeric_escape (*Reg_Parse, &Reg_Parse))) {
1707 last_value = (unsigned int) test;
1708 } else if ((test = literal_escape (*Reg_Parse))) {
1709 last_value = (unsigned int) test;
1710 } else if (shortcut_escape (*Reg_Parse,
1711 NULL,
1712 CHECK_CLASS_ESCAPE)) {
1713 sprintf (Error_Text,
1714 "\\%c is not allowed as range operand",
1715 *Reg_Parse);
1717 REG_FAIL (Error_Text);
1718 } else {
1719 sprintf (
1720 Error_Text,
1721 "\\%c is an invalid char class escape sequence",
1722 *Reg_Parse);
1724 REG_FAIL (Error_Text);
1726 } else {
1727 last_value = U_CHAR_AT (Reg_Parse);
1730 if (Is_Case_Insensitive) {
1731 second_value =
1732 (unsigned int) tolower ((int) second_value);
1733 last_value =
1734 (unsigned int) tolower ((int) last_value);
1737 /* For case insensitive, something like [A-_] will
1738 generate an error here since ranges are converted to
1739 lower case. */
1741 if (second_value - 1 > last_value) {
1742 REG_FAIL ("invalid [] range");
1745 /* If only one character in range (e.g [a-a]) then this
1746 loop is not run since the first character of any range
1747 was emitted by the previous iteration of while loop. */
1749 for (; second_value <= last_value; second_value++) {
1750 emit_class_byte (second_value);
1753 last_emit = (unsigned char) last_value;
1755 Reg_Parse++;
1757 } /* End class character range code. */
1758 } else if (*Reg_Parse == '\\') {
1759 Reg_Parse++;
1761 if ((test = numeric_escape (*Reg_Parse, &Reg_Parse)) != '\0') {
1762 emit_class_byte (test);
1764 last_emit = test;
1765 } else if ((test = literal_escape (*Reg_Parse)) != '\0') {
1766 emit_byte (test);
1767 last_emit = test;
1768 } else if (shortcut_escape (*Reg_Parse,
1769 NULL,
1770 CHECK_CLASS_ESCAPE)) {
1772 if (*(Reg_Parse + 1) == '-') {
1773 /* Specifically disallow shortcut escapes as the start
1774 of a character class range (see comment above.) */
1776 sprintf (Error_Text,
1777 "\\%c not allowed as range operand",
1778 *Reg_Parse);
1780 REG_FAIL (Error_Text);
1781 } else {
1782 /* Emit the bytes that are part of the shortcut
1783 escape sequence's range (e.g. \d = 0123456789) */
1785 shortcut_escape (*Reg_Parse, NULL, EMIT_CLASS_BYTES);
1787 } else {
1788 sprintf (Error_Text,
1789 "\\%c is an invalid char class escape sequence",
1790 *Reg_Parse);
1792 REG_FAIL (Error_Text);
1795 Reg_Parse++;
1797 /* End of class escaped sequence code */
1798 } else {
1799 emit_class_byte (*Reg_Parse); /* Ordinary class character. */
1801 last_emit = *Reg_Parse;
1802 Reg_Parse++;
1804 } /* End of while (*Reg_Parse != '\0' && *Reg_Parse != ']') */
1806 if (*Reg_Parse != ']') REG_FAIL ("missing right \']\'");
1808 emit_byte('\0');
1810 /* NOTE: it is impossible to specify an empty class. This is
1811 because [] would be interpreted as "begin character class"
1812 followed by a literal ']' character and no "end character class"
1813 delimiter (']'). Because of this, it is always safe to assume
1814 that a class HAS_WIDTH. */
1816 Reg_Parse++;
1817 *flag_param |= HAS_WIDTH | SIMPLE;
1818 range_param->lower = 1;
1819 range_param->upper = 1;
1822 break; /* End of character class code. */
1824 case '\\':
1825 /* Force Error_Text to have a length of zero. This way we can tell if
1826 either of the calls to shortcut_escape() or back_ref() fill
1827 Error_Text with an error message. */
1829 Error_Text [0] = '\0';
1831 if ((ret_val = shortcut_escape (*Reg_Parse, flag_param, EMIT_NODE))) {
1833 Reg_Parse++;
1834 range_param->lower = 1;
1835 range_param->upper = 1;
1836 break;
1838 } else if ((ret_val = back_ref (Reg_Parse, flag_param, EMIT_NODE))) {
1839 /* Can't make any assumptions about a back-reference as to SIMPLE
1840 or HAS_WIDTH. For example (^|<) is neither simple nor has
1841 width. So we don't flip bits in flag_param here. */
1843 Reg_Parse++;
1844 /* Back-references always have an unknown length */
1845 range_param->lower = -1;
1846 range_param->upper = -1;
1847 break;
1850 if (strlen (Error_Text) > 0) REG_FAIL (Error_Text);
1852 /* At this point it is apparent that the escaped character is not a
1853 shortcut escape or back-reference. Back up one character to allow
1854 the default code to include it as an ordinary character. */
1856 /* Fall through to Default case to handle literal escapes and numeric
1857 escapes. */
1859 default:
1860 Reg_Parse--; /* If we fell through from the above code, we are now
1861 pointing at the back slash (\) character. */
1863 unsigned char *parse_save;
1864 int len = 0;
1866 if (Is_Case_Insensitive) {
1867 ret_val = emit_node (SIMILAR);
1868 } else {
1869 ret_val = emit_node (EXACTLY);
1872 /* Loop until we find a meta character, shortcut escape, back
1873 reference, or end of regex string. */
1875 for (; *Reg_Parse != '\0' &&
1876 !strchr ((char *) Meta_Char, (int) *Reg_Parse);
1877 len++) {
1879 /* Save where we are in case we have to back
1880 this character out. */
1882 parse_save = Reg_Parse;
1884 if (*Reg_Parse == '\\') {
1885 Reg_Parse++; /* Point to escaped character */
1887 Error_Text [0] = '\0'; /* See comment above. */
1889 if ((test = numeric_escape (*Reg_Parse, &Reg_Parse))) {
1890 if (Is_Case_Insensitive) {
1891 emit_byte (tolower (test));
1892 } else {
1893 emit_byte (test);
1895 } else if ((test = literal_escape (*Reg_Parse))) {
1896 emit_byte (test);
1897 } else if (back_ref (Reg_Parse, NULL, CHECK_ESCAPE)) {
1898 /* Leave back reference for next `atom' call */
1900 Reg_Parse--; break;
1901 } else if (shortcut_escape (*Reg_Parse, NULL, CHECK_ESCAPE)) {
1902 /* Leave shortcut escape for next `atom' call */
1904 Reg_Parse--; break;
1905 } else {
1906 if (strlen (Error_Text) == 0) {
1907 /* None of the above calls generated an error message
1908 so generate our own here. */
1910 sprintf (Error_Text,
1911 "\\%c is an invalid escape sequence",
1912 *Reg_Parse);
1915 REG_FAIL (Error_Text);
1918 Reg_Parse++;
1919 } else {
1920 /* Ordinary character */
1922 if (Is_Case_Insensitive) {
1923 emit_byte (tolower (*Reg_Parse));
1924 } else {
1925 emit_byte (*Reg_Parse);
1928 Reg_Parse++;
1931 /* If next regex token is a quantifier (?, +. *, or {m,n}) and
1932 our EXACTLY node so far is more than one character, leave the
1933 last character to be made into an EXACTLY node one character
1934 wide for the multiplier to act on. For example 'abcd* would
1935 have an EXACTLY node with an 'abc' operand followed by a STAR
1936 node followed by another EXACTLY node with a 'd' operand. */
1938 if (IS_QUANTIFIER (*Reg_Parse) && len > 0) {
1939 Reg_Parse = parse_save; /* Point to previous regex token. */
1941 if (Code_Emit_Ptr == &Compute_Size) {
1942 Reg_Size--;
1943 } else {
1944 Code_Emit_Ptr--; /* Write over previously emitted byte. */
1947 break;
1951 if (len <= 0) REG_FAIL ("internal error #4, `atom\'");
1953 *flag_param |= HAS_WIDTH;
1955 if (len == 1) *flag_param |= SIMPLE;
1957 range_param->lower = len;
1958 range_param->upper = len;
1960 emit_byte ('\0');
1962 } /* END switch (*Reg_Parse++) */
1964 return (ret_val);
1967 /*----------------------------------------------------------------------*
1968 * emit_node
1970 * Emit (if appropriate) the op code for a regex node atom.
1972 * The NEXT pointer is initialized to NULL.
1974 * Returns a pointer to the START of the emitted node.
1975 *----------------------------------------------------------------------*/
1977 static unsigned char * emit_node (int op_code) {
1979 register unsigned char *ret_val;
1980 register unsigned char *ptr;
1982 ret_val = Code_Emit_Ptr; /* Return address of start of node */
1984 if (ret_val == &Compute_Size) {
1985 Reg_Size += NODE_SIZE;
1986 } else {
1987 ptr = ret_val;
1988 *ptr++ = (unsigned char) op_code;
1989 *ptr++ = '\0'; /* Null "NEXT" pointer. */
1990 *ptr++ = '\0';
1992 Code_Emit_Ptr = ptr;
1995 return (ret_val);
1998 /*----------------------------------------------------------------------*
1999 * emit_byte
2001 * Emit (if appropriate) a byte of code (usually part of an operand.)
2002 *----------------------------------------------------------------------*/
2004 static void emit_byte (unsigned char c) {
2006 if (Code_Emit_Ptr == &Compute_Size) {
2007 Reg_Size++;
2008 } else {
2009 *Code_Emit_Ptr++ = c;
2013 /*----------------------------------------------------------------------*
2014 * emit_class_byte
2016 * Emit (if appropriate) a byte of code (usually part of a character
2017 * class operand.)
2018 *----------------------------------------------------------------------*/
2020 static void emit_class_byte (unsigned char c) {
2022 if (Code_Emit_Ptr == &Compute_Size) {
2023 Reg_Size++;
2025 if (Is_Case_Insensitive && isalpha (c)) Reg_Size++;
2026 } else if (Is_Case_Insensitive && isalpha (c)) {
2027 /* For case insensitive character classes, emit both upper and lower case
2028 versions of alphabetical characters. */
2030 *Code_Emit_Ptr++ = tolower (c);
2031 *Code_Emit_Ptr++ = toupper (c);
2032 } else {
2033 *Code_Emit_Ptr++ = c;
2037 /*----------------------------------------------------------------------*
2038 * emit_special
2040 * Emit nodes that need special processing.
2041 *----------------------------------------------------------------------*/
2043 static unsigned char * emit_special (
2044 unsigned char op_code,
2045 unsigned long test_val,
2046 int index) {
2048 register unsigned char *ret_val = &Compute_Size;
2049 register unsigned char *ptr;
2051 if (Code_Emit_Ptr == &Compute_Size) {
2052 switch (op_code) {
2053 case POS_BEHIND_OPEN:
2054 case NEG_BEHIND_OPEN:
2055 Reg_Size += LENGTH_SIZE; /* Length of the look-behind match */
2056 Reg_Size += NODE_SIZE; /* Make room for the node */
2057 break;
2059 case TEST_COUNT:
2060 Reg_Size += NEXT_PTR_SIZE; /* Make room for a test value. */
2062 case INC_COUNT:
2063 Reg_Size += INDEX_SIZE; /* Make room for an index value. */
2065 default:
2066 Reg_Size += NODE_SIZE; /* Make room for the node. */
2068 } else {
2069 ret_val = emit_node (op_code); /* Return the address for start of node. */
2070 ptr = Code_Emit_Ptr;
2072 if (op_code == INC_COUNT || op_code == TEST_COUNT) {
2073 *ptr++ = (unsigned char) index;
2075 if (op_code == TEST_COUNT) {
2076 *ptr++ = PUT_OFFSET_L (test_val);
2077 *ptr++ = PUT_OFFSET_R (test_val);
2079 } else if (op_code == POS_BEHIND_OPEN || op_code == NEG_BEHIND_OPEN) {
2080 *ptr++ = PUT_OFFSET_L (test_val);
2081 *ptr++ = PUT_OFFSET_R (test_val);
2082 *ptr++ = PUT_OFFSET_L (test_val);
2083 *ptr++ = PUT_OFFSET_R (test_val);
2086 Code_Emit_Ptr = ptr;
2089 return (ret_val);
2092 /*----------------------------------------------------------------------*
2093 * insert
2095 * Insert a node in front of already emitted node(s). Means relocating
2096 * the operand. Code_Emit_Ptr points one byte past the just emitted
2097 * node and operand. The parameter `insert_pos' points to the location
2098 * where the new node is to be inserted.
2099 *----------------------------------------------------------------------*/
2101 static unsigned char * insert (
2102 unsigned char op,
2103 unsigned char *insert_pos,
2104 long min,
2105 long max,
2106 int index) {
2108 register unsigned char *src;
2109 register unsigned char *dst;
2110 unsigned char *place;
2111 int insert_size = NODE_SIZE;
2113 if (op == BRACE || op == LAZY_BRACE) {
2114 /* Make room for the min and max values. */
2116 insert_size += (2 * NEXT_PTR_SIZE);
2117 } else if (op == INIT_COUNT) {
2118 /* Make room for an index value . */
2120 insert_size += INDEX_SIZE;
2123 if (Code_Emit_Ptr == &Compute_Size) {
2124 Reg_Size += insert_size;
2125 return &Compute_Size;
2128 src = Code_Emit_Ptr;
2129 Code_Emit_Ptr += insert_size;
2130 dst = Code_Emit_Ptr;
2132 /* Relocate the existing emitted code to make room for the new node. */
2134 while (src > insert_pos) *--dst = *--src;
2136 place = insert_pos; /* Where operand used to be. */
2137 *place++ = op; /* Inserted operand. */
2138 *place++ = '\0'; /* NEXT pointer for inserted operand. */
2139 *place++ = '\0';
2141 if (op == BRACE || op == LAZY_BRACE) {
2142 *place++ = PUT_OFFSET_L (min);
2143 *place++ = PUT_OFFSET_R (min);
2145 *place++ = PUT_OFFSET_L (max);
2146 *place++ = PUT_OFFSET_R (max);
2147 } else if (op == INIT_COUNT) {
2148 *place++ = (unsigned char) index;
2151 return place; /* Return a pointer to the start of the code moved. */
2154 /*----------------------------------------------------------------------*
2155 * tail - Set the next-pointer at the end of a node chain.
2156 *----------------------------------------------------------------------*/
2158 static void tail (unsigned char *search_from, unsigned char *point_to) {
2160 register unsigned char *scan;
2161 register unsigned char *next;
2162 register int offset;
2164 if (search_from == &Compute_Size) return;
2166 /* Find the last node in the chain (node with a null NEXT pointer) */
2168 scan = search_from;
2170 for (;;) {
2171 next = next_ptr (scan);
2173 if (!next) break;
2175 scan = next;
2178 if (GET_OP_CODE (scan) == BACK) {
2179 offset = scan - point_to;
2180 } else {
2181 offset = point_to - scan;
2184 /* Set NEXT pointer */
2186 *(scan + 1) = PUT_OFFSET_L (offset);
2187 *(scan + 2) = PUT_OFFSET_R (offset);
2190 /*--------------------------------------------------------------------*
2191 * offset_tail
2193 * Perform a tail operation on (ptr + offset).
2194 *--------------------------------------------------------------------*/
2196 static void offset_tail (unsigned char *ptr, int offset, unsigned char *val) {
2198 if (ptr == &Compute_Size || ptr == NULL) return;
2200 tail (ptr + offset, val);
2203 /*--------------------------------------------------------------------*
2204 * branch_tail
2206 * Perform a tail operation on (ptr + offset) but only if `ptr' is a
2207 * BRANCH node.
2208 *--------------------------------------------------------------------*/
2210 static void branch_tail (unsigned char *ptr, int offset, unsigned char *val) {
2212 if (ptr == &Compute_Size || ptr == NULL ||GET_OP_CODE (ptr) != BRANCH) {
2213 return;
2216 tail (ptr + offset, val);
2219 /*--------------------------------------------------------------------*
2220 * shortcut_escape
2222 * Implements convenient escape sequences that represent entire
2223 * character classes or special location assertions (similar to escapes
2224 * supported by Perl)
2226 * \d Digits [0-9] |
2227 * \D NOT a digit [^0-9] | (Examples
2228 * \l Letters [a-zA-Z] | at left
2229 * \L NOT a Letter [^a-zA-Z] | are
2230 * \s Whitespace [ \t\n\r\f\v] | for
2231 * \S NOT Whitespace [^ \t\n\r\f\v] | C
2232 * \w "Word" character [a-zA-Z0-9_] | Locale)
2233 * \W NOT a "Word" character [^a-zA-Z0-9_] _|
2235 * \B Matches any character that is NOT a word-delimiter
2237 * Codes for the "emit" parameter:
2239 * EMIT_NODE
2240 * Emit a shortcut node. Shortcut nodes have an implied set of
2241 * class characters. This helps keep the compiled regex string
2242 * small.
2244 * EMIT_CLASS_BYTES
2245 * Emit just the equivalent characters of the class. This makes
2246 * the escape usable from within a class, e.g. [a-fA-F\d]. Only
2247 * \d, \D, \s, \S, \w, and \W can be used within a class.
2249 * CHECK_ESCAPE
2250 * Only verify that this is a valid shortcut escape.
2252 * CHECK_CLASS_ESCAPE
2253 * Same as CHECK_ESCAPE but only allows characters valid within
2254 * a class.
2256 *--------------------------------------------------------------------*/
2258 static unsigned char * shortcut_escape (
2259 unsigned char c,
2260 int *flag_param,
2261 int emit) {
2263 register unsigned char *class = NULL;
2264 static unsigned char *codes = (unsigned char *) "ByYdDlLsSwW";
2265 unsigned char *ret_val = (unsigned char *) 1; /* Assume success. */
2266 unsigned char *valid_codes;
2268 if (emit == EMIT_CLASS_BYTES || emit == CHECK_CLASS_ESCAPE) {
2269 valid_codes = codes + 3; /* \B, \y and \Y are not allowed in classes */
2270 } else {
2271 valid_codes = codes;
2274 if (!strchr ((char *) valid_codes, (int) c)) {
2275 return NULL; /* Not a valid shortcut escape sequence */
2276 } else if (emit == CHECK_ESCAPE || emit == CHECK_CLASS_ESCAPE) {
2277 return ret_val; /* Just checking if this is a valid shortcut escape. */
2280 switch (c) {
2281 case 'd':
2282 case 'D':
2283 if (emit == EMIT_CLASS_BYTES) {
2284 class = ASCII_Digits;
2285 } else if (emit == EMIT_NODE) {
2286 ret_val = (islower (c) ? emit_node (DIGIT)
2287 : emit_node (NOT_DIGIT));
2290 break;
2292 case 'l':
2293 case 'L':
2294 if (emit == EMIT_CLASS_BYTES) {
2295 class = Letter_Char;
2296 } else if (emit == EMIT_NODE) {
2297 ret_val = (islower (c) ? emit_node (LETTER)
2298 : emit_node (NOT_LETTER));
2301 break;
2303 case 's':
2304 case 'S':
2305 if (emit == EMIT_CLASS_BYTES) {
2306 if (Match_Newline) emit_byte ('\n');
2308 class = White_Space;
2309 } else if (emit == EMIT_NODE) {
2310 if (Match_Newline) {
2311 ret_val = (islower (c) ? emit_node (SPACE_NL)
2312 : emit_node (NOT_SPACE_NL));
2313 } else {
2314 ret_val = (islower (c) ? emit_node (SPACE)
2315 : emit_node (NOT_SPACE));
2319 break;
2321 case 'w':
2322 case 'W':
2323 if (emit == EMIT_CLASS_BYTES) {
2324 class = Word_Char;
2325 } else if (emit == EMIT_NODE) {
2326 ret_val = (islower (c) ? emit_node (WORD_CHAR)
2327 : emit_node (NOT_WORD_CHAR));
2330 break;
2332 /* Since the delimiter table is not available at regex compile time \B,
2333 \Y and \Y can only generate a node. At run time, the delimiter table
2334 will be available for these nodes to use. */
2336 case 'y':
2338 if (emit == EMIT_NODE) {
2339 ret_val = emit_node (IS_DELIM);
2340 } else {
2341 REG_FAIL ("internal error #5 `shortcut_escape\'");
2344 break;
2346 case 'Y':
2348 if (emit == EMIT_NODE) {
2349 ret_val = emit_node (NOT_DELIM);
2350 } else {
2351 REG_FAIL ("internal error #6 `shortcut_escape\'");
2354 break;
2356 case 'B':
2358 if (emit == EMIT_NODE) {
2359 ret_val = emit_node (NOT_BOUNDARY);
2360 } else {
2361 REG_FAIL ("internal error #7 `shortcut_escape\'");
2364 break;
2366 default:
2367 /* We get here if there isn't a case for every character in
2368 the string "codes" */
2370 REG_FAIL ("internal error #8 `shortcut_escape\'");
2373 if (emit == EMIT_NODE && c != 'B') {
2374 *flag_param |= (HAS_WIDTH | SIMPLE);
2377 if (class) {
2378 /* Emit bytes within a character class operand. */
2380 while (*class != '\0') {
2381 emit_byte (*class++);
2385 return ret_val;
2388 /*--------------------------------------------------------------------*
2389 * numeric_escape
2391 * Implements hex and octal numeric escape sequence syntax.
2393 * Hexadecimal Escape: \x## Max of two digits Must have leading 'x'.
2394 * Octal Escape: \0### Max of three digits and not greater
2395 * than 377 octal. Must have leading zero.
2397 * Returns the actual character value or NULL if not a valid hex or
2398 * octal escape. REG_FAIL is called if \x0, \x00, \0, \00, \000, or
2399 * \0000 is specified.
2400 *--------------------------------------------------------------------*/
2402 static unsigned char numeric_escape (
2403 unsigned char c,
2404 unsigned char **parse) {
2406 static unsigned char digits [] = "fedcbaFEDCBA9876543210";
2408 static unsigned int digit_val [] = {
2409 15, 14, 13, 12, 11, 10, /* Lower case Hex digits */
2410 15, 14, 13, 12, 11, 10, /* Upper case Hex digits */
2411 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* Decimal Digits */
2413 register unsigned char *scan;
2414 register unsigned char *pos_ptr;
2415 register unsigned char *digit_str;
2416 unsigned int value = 0;
2417 unsigned int radix = 8;
2418 int width = 3; /* Can not be bigger than \0377 */
2419 int pos_delta = 14;
2420 int i, pos;
2422 switch (c) {
2423 case '0':
2424 digit_str = digits + pos_delta; /* Only use Octal digits, i.e. 0-7. */
2426 break;
2428 case 'x':
2429 case 'X':
2430 width = 2; /* Can not be bigger than \0377 */
2431 radix = 16;
2432 pos_delta = 0;
2433 digit_str = digits; /* Use all of the digit characters. */
2435 break;
2437 default:
2438 return ('\0'); /* Not a numeric escape */
2441 scan = *parse; scan++; /* Only change *parse on success. */
2443 pos_ptr = (unsigned char *) strchr ((char *) digit_str, (int) *scan);
2445 for (i = 0; pos_ptr != NULL && (i < width); i++) {
2446 pos = (pos_ptr - digit_str) + pos_delta;
2447 value = (value * radix) + digit_val [pos];
2449 /* If this digit makes the value over 255, treat this digit as a literal
2450 character instead of part of the numeric escape. For example, \0777
2451 will be processed as \077 (an 'M') and a literal '7' character, NOT
2452 511 decimal which is > 255. */
2454 if (value > 255) {
2455 /* Back out calculations for last digit processed. */
2457 value -= digit_val [pos];
2458 value /= radix;
2460 break; /* Note that scan will not be incremented and still points to
2461 the digit that caused overflow. It will be decremented by
2462 the "else" below to point to the last character that is
2463 considered to be part of the octal escape. */
2466 scan++;
2467 pos_ptr = (unsigned char *) strchr ((char *) digit_str, (int) *scan);
2470 /* Handle the case of "\0" i.e. trying to specify a NULL character. */
2472 if (value == 0) {
2473 if (c == '0') {
2474 sprintf (Error_Text, "\\00 is an invalid octal escape");
2475 } else {
2476 sprintf (Error_Text, "\\%c0 is an invalid hexadecimal escape", c);
2478 } else {
2479 /* Point to the last character of the number on success. */
2481 scan--;
2482 *parse = scan;
2485 return (unsigned char) value;
2488 /*--------------------------------------------------------------------*
2489 * literal_escape
2491 * Recognize escaped literal characters (prefixed with backslash),
2492 * and translate them into the corresponding character.
2494 * Returns the proper character value or NULL if not a valid literal
2495 * escape.
2496 *--------------------------------------------------------------------*/
2498 static unsigned char literal_escape (unsigned char c) {
2500 static unsigned char valid_escape [] = {
2501 'a', 'b',
2502 'e',
2503 'f', 'n', 'r', 't', 'v', '(', ')', '-', '[', ']',
2504 '<', '>', '{', '}', '.', '\\', '|', '^', '$', '*',
2505 '+', '?', '&', '\0'
2508 static unsigned char value [] = {
2509 '\a', '\b',
2510 #ifdef EBCDIC_CHARSET
2511 0x27, /* Escape character in IBM's EBCDIC character set. */
2512 #else
2513 0x1B, /* Escape character in ASCII character set. */
2514 #endif
2515 '\f', '\n', '\r', '\t', '\v', '(', ')', '-', '[', ']',
2516 '<', '>', '{', '}', '.', '\\', '|', '^', '$', '*',
2517 '+', '?', '&', '\0'
2520 int i;
2522 for (i = 0; valid_escape [i] != '\0'; i++) {
2523 if (c == valid_escape [i]) return value [i];
2526 return '\0';
2529 /*--------------------------------------------------------------------*
2530 * back_ref
2532 * Process a request to match a previous parenthesized thing.
2533 * Parenthetical entities are numbered beginning at 1 by counting
2534 * opening parentheses from left to to right. \0 would represent
2535 * whole match, but would confuse numeric_escape as an octal escape,
2536 * so it is forbidden.
2538 * Constructs of the form \~1, \~2, etc. are cross-regex back
2539 * references and are used in syntax highlighting patterns to match
2540 * text previously matched by another regex. *** IMPLEMENT LATER ***
2541 *--------------------------------------------------------------------*/
2543 static unsigned char * back_ref (
2544 unsigned char *c,
2545 int *flag_param,
2546 int emit) {
2548 int paren_no, c_offset = 0, is_cross_regex = 0;
2550 unsigned char *ret_val;
2552 /* Implement cross regex backreferences later. */
2554 /* if (*c == (unsigned char) ('~')) {
2555 c_offset++;
2556 is_cross_regex++;
2557 } */
2559 paren_no = (int) (*(c + c_offset) - (unsigned char) ('0'));
2561 if (!isdigit (*(c + c_offset)) || /* Only \1, \2, ... \9 are supported. */
2562 paren_no == 0) { /* Should be caught by numeric_escape. */
2564 return NULL;
2567 /* Make sure parentheses for requested back-reference are complete. */
2569 if (!is_cross_regex && !TEST_BIT (Closed_Parens, paren_no)) {
2570 sprintf (Error_Text, "\\%d is an illegal back reference", paren_no);
2571 return NULL;
2574 if (emit == EMIT_NODE) {
2575 if (is_cross_regex) {
2576 Reg_Parse++; /* Skip past the '~' in a cross regex back reference.
2577 We only do this if we are emitting code. */
2579 if (Is_Case_Insensitive) {
2580 ret_val = emit_node (X_REGEX_BR_CI);
2581 } else {
2582 ret_val = emit_node (X_REGEX_BR);
2584 } else {
2585 if (Is_Case_Insensitive) {
2586 ret_val = emit_node (BACK_REF_CI);
2587 } else {
2588 ret_val = emit_node (BACK_REF);
2592 emit_byte ((unsigned char) paren_no);
2594 if (is_cross_regex || TEST_BIT (Paren_Has_Width, paren_no)) {
2595 *flag_param |= HAS_WIDTH;
2597 } else if (emit == CHECK_ESCAPE) {
2598 ret_val = (unsigned char *) 1;
2599 } else {
2600 ret_val = NULL;
2603 return ret_val;
2606 /*======================================================================*
2607 * Regex execution related code
2608 *======================================================================*/
2610 /* Global work variables for `ExecRE'. */
2612 static unsigned char *Reg_Input; /* String-input pointer. */
2613 static unsigned char *Start_Of_String; /* Beginning of input, for ^ */
2614 /* and < checks. */
2615 static unsigned char *End_Of_String; /* Logical end of input (if
2616 supplied, till \0 otherwise) */
2617 static unsigned char *Look_Behind_To; /* Position till were look behind
2618 can safely check back */
2619 static unsigned char **Start_Ptr_Ptr; /* Pointer to `startp' array. */
2620 static unsigned char **End_Ptr_Ptr; /* Ditto for `endp'. */
2621 static unsigned char *Extent_Ptr_FW; /* Forward extent pointer */
2622 static unsigned char *Extent_Ptr_BW; /* Backward extent pointer */
2623 static unsigned char *Back_Ref_Start [10]; /* Back_Ref_Start [0] and */
2624 static unsigned char *Back_Ref_End [10]; /* Back_Ref_End [0] are not */
2625 /* used. This simplifies */
2626 /* indexing. */
2628 * Measured recursion limits:
2629 * Linux: +/- 40 000 (up to 110 000)
2630 * Solaris: +/- 85 000
2631 * HP-UX 11: +/- 325 000
2633 * So 10 000 ought to be safe.
2635 #define REGEX_RECURSION_LIMIT 10000
2636 static int Recursion_Count; /* Recursion counter */
2637 static int Recursion_Limit_Exceeded; /* Recursion limit exceeded flag */
2639 #define AT_END_OF_STRING(X) (*(X) == (unsigned char)'\0' ||\
2640 (End_Of_String != NULL && (X) >= End_Of_String))
2642 /* static regexp *Cross_Regex_Backref; */
2644 static int Prev_Is_BOL;
2645 static int Succ_Is_EOL;
2646 static int Prev_Is_Delim;
2647 static int Succ_Is_Delim;
2649 /* Define a pointer to an array to hold general (...){m,n} counts. */
2651 typedef struct brace_counts {
2652 unsigned long count [1]; /* More unwarranted chumminess with compiler. */
2653 } brace_counts;
2655 static struct brace_counts *Brace;
2657 /* Default table for determining whether a character is a word delimiter. */
2659 static unsigned char Default_Delimiters [UCHAR_MAX] = {0};
2661 static unsigned char *Current_Delimiters; /* Current delimiter table */
2663 /* Forward declarations of functions used by `ExecRE' */
2665 static int attempt (regexp *, unsigned char *);
2666 static int match (unsigned char *, int *);
2667 static unsigned long greedy (unsigned char *, long);
2668 static void adjustcase (unsigned char *, int, unsigned char);
2669 static unsigned char * makeDelimiterTable (unsigned char *, unsigned char *);
2672 * ExecRE - match a `regexp' structure against a string
2674 * If `end' is non-NULL, matches may not BEGIN past end, but may extend past
2675 * it. If reverse is true, `end' must be specified, and searching begins at
2676 * `end'. "isbol" should be set to true if the beginning of the string is the
2677 * actual beginning of a line (since `ExecRE' can't look backwards from the
2678 * beginning to find whether there was a newline before). Likewise, "isbow"
2679 * asks whether the string is preceded by a word delimiter. End of string is
2680 * always treated as a word and line boundary (there may be cases where it
2681 * shouldn't be, in which case, this should be changed). "delimit" (if
2682 * non-null) specifies a null-terminated string of characters to be considered
2683 * word delimiters matching "<" and ">". if "delimit" is NULL, the default
2684 * delimiters (as set in SetREDefaultWordDelimiters) are used.
2685 * Look_behind_to indicates the position till where it is safe to
2686 * perform look-behind matches. If set, it should be smaller than or equal
2687 * to the start position of the search (pointed at by string). If it is NULL,
2688 * it defaults to the start position.
2689 * Finally, match_to indicates the logical end of the string, till where
2690 * matches are allowed to extend. Note that look-ahead patterns may look
2691 * past that boundary. If match_to is set to NULL, the terminating \0 is
2692 * assumed to correspond to the logical boundary. Match_to, if set, must be
2693 * larger than or equal to end, if set.
2696 int ExecRE (
2697 regexp *prog,
2698 regexp *cross_regex_backref,
2699 const char *string,
2700 const char *end,
2701 int reverse,
2702 char prev_char,
2703 char succ_char,
2704 const char *delimiters,
2705 const char *look_behind_to,
2706 const char *match_to) {
2708 register unsigned char *str;
2709 unsigned char **s_ptr;
2710 unsigned char **e_ptr;
2711 int ret_val = 0;
2712 unsigned char tempDelimitTable [256];
2713 int i;
2715 s_ptr = (unsigned char **) prog->startp;
2716 e_ptr = (unsigned char **) prog->endp;
2718 /* Check for valid parameters. */
2720 if (prog == NULL || string == NULL) {
2721 reg_error ("NULL parameter to `ExecRE\'");
2722 goto SINGLE_RETURN;
2725 /* Check validity of program. */
2727 if (U_CHAR_AT (prog->program) != MAGIC) {
2728 reg_error ("corrupted program");
2729 goto SINGLE_RETURN;
2732 /* If caller has supplied delimiters, make a delimiter table */
2734 if (delimiters == NULL) {
2735 Current_Delimiters = Default_Delimiters;
2736 } else {
2737 Current_Delimiters = makeDelimiterTable (
2738 (unsigned char *) delimiters,
2739 (unsigned char *) tempDelimitTable);
2742 /* Remember the logical end of the string. */
2744 End_Of_String = (unsigned char *) match_to;
2746 if (end == NULL && reverse) {
2747 for (end = string; !AT_END_OF_STRING((unsigned char*)end); end++) ;
2748 succ_char = '\n';
2749 } else if (end == NULL) {
2750 succ_char = '\n';
2753 /* Initialize arrays used by shortcut_escape. */
2755 if (!init_ansi_classes ()) goto SINGLE_RETURN;
2757 /* Remember the beginning of the string for matching BOL */
2759 Start_Of_String = (unsigned char *) string;
2760 Look_Behind_To = (unsigned char *) (look_behind_to?look_behind_to:string);
2762 Prev_Is_BOL = ((prev_char == '\n') || (prev_char == '\0') ? 1 : 0);
2763 Succ_Is_EOL = ((succ_char == '\n') || (succ_char == '\0') ? 1 : 0);
2764 Prev_Is_Delim = (Current_Delimiters [(unsigned char)prev_char] ? 1 : 0);
2765 Succ_Is_Delim = (Current_Delimiters [(unsigned char)succ_char] ? 1 : 0);
2767 Total_Paren = (int) (prog->program [1]);
2768 Num_Braces = (int) (prog->program [2]);
2770 /* Reset the recursion detection flag */
2771 Recursion_Limit_Exceeded = 0;
2773 /* Cross_Regex_Backref = cross_regex_backref; */
2775 /* Allocate memory for {m,n} construct counting variables if need be. */
2777 if (Num_Braces > 0) {
2778 Brace =
2779 (brace_counts *) malloc (sizeof (brace_counts) + (size_t) Num_Braces);
2781 if (Brace == NULL) {
2782 reg_error ("out of memory in `ExecRE\'");
2783 goto SINGLE_RETURN;
2785 } else {
2786 Brace = NULL;
2789 /* Initialize the first nine (9) capturing parentheses start and end
2790 pointers to point to the start of the search string. This is to prevent
2791 crashes when later trying to reference captured parens that do not exist
2792 in the compiled regex. We only need to do the first nine since users
2793 can only specify \1, \2, ... \9. */
2795 for (i = 9; i > 0; i--) {
2796 *s_ptr++ = (unsigned char *) string;
2797 *e_ptr++ = (unsigned char *) string;
2800 if (!reverse) { /* Forward Search */
2801 if (prog->anchor) {
2802 /* Search is anchored at BOL */
2804 if (attempt (prog, (unsigned char *) string)) {
2805 ret_val = 1;
2806 goto SINGLE_RETURN;
2809 for (str = (unsigned char *) string;
2810 !AT_END_OF_STRING(str) && str != (unsigned char *) end && !Recursion_Limit_Exceeded;
2811 str++) {
2813 if (*str == '\n') {
2814 if (attempt (prog, str + 1)) {
2815 ret_val = 1;
2816 break;
2821 goto SINGLE_RETURN;
2823 } else if (prog->match_start != '\0') {
2824 /* We know what char match must start with. */
2826 for (str = (unsigned char *) string;
2827 !AT_END_OF_STRING(str) && str != (unsigned char *) end && !Recursion_Limit_Exceeded;
2828 str++) {
2830 if (*str == (unsigned char)prog->match_start) {
2831 if (attempt (prog, str)) {
2832 ret_val = 1;
2833 break;
2838 goto SINGLE_RETURN;
2839 } else {
2840 /* General case */
2842 for (str = (unsigned char *) string;
2843 !AT_END_OF_STRING(str) && str != (unsigned char *) end && !Recursion_Limit_Exceeded;
2844 str++) {
2846 if (attempt (prog, str)) {
2847 ret_val = 1;
2848 break;
2852 /* Beware of a single $ matching \0 */
2853 if (!Recursion_Limit_Exceeded && !ret_val && AT_END_OF_STRING(str) && str != (unsigned char *) end) {
2854 if (attempt (prog, str)) {
2855 ret_val = 1;
2859 goto SINGLE_RETURN;
2861 } else { /* Search reverse, same as forward, but loops run backward */
2863 /* Make sure that we don't start matching beyond the logical end */
2864 if (End_Of_String != NULL && (unsigned char*)end > End_Of_String) {
2865 end = (const char*)End_Of_String;
2868 if (prog->anchor) {
2869 /* Search is anchored at BOL */
2871 for (str = (unsigned char *)(end - 1);
2872 str >= (unsigned char *) string && !Recursion_Limit_Exceeded;
2873 str--) {
2875 if (*str == '\n') {
2876 if (attempt (prog, str + 1)) {
2877 ret_val = 1;
2878 goto SINGLE_RETURN;
2883 if (!Recursion_Limit_Exceeded && attempt (prog, (unsigned char *) string)) {
2884 ret_val = 1;
2885 goto SINGLE_RETURN;
2888 goto SINGLE_RETURN;
2889 } else if (prog->match_start != '\0') {
2890 /* We know what char match must start with. */
2892 for (str = (unsigned char *) end;
2893 str >= (unsigned char *) string && !Recursion_Limit_Exceeded;
2894 str--) {
2896 if (*str == (unsigned char)prog->match_start) {
2897 if (attempt (prog, str)) {
2898 ret_val = 1;
2899 break;
2904 goto SINGLE_RETURN;
2905 } else {
2906 /* General case */
2908 for (str = (unsigned char *) end;
2909 str >= (unsigned char *) string && !Recursion_Limit_Exceeded;
2910 str--) {
2912 if (attempt (prog, str)) {
2913 ret_val = 1;
2914 break;
2920 SINGLE_RETURN: if (Brace) free (Brace);
2922 if (Recursion_Limit_Exceeded) return (0);
2924 return (ret_val);
2927 /*--------------------------------------------------------------------*
2928 * init_ansi_classes
2930 * Generate character class sets using locale aware ANSI C functions.
2932 *--------------------------------------------------------------------*/
2934 static int init_ansi_classes (void) {
2936 static int initialized = 0;
2937 static int underscore = (int) '_';
2938 int i, word_count, letter_count, space_count;
2940 if (!initialized) {
2941 initialized = 1; /* Only need to generate character sets once. */
2942 word_count = 0;
2943 letter_count = 0;
2944 space_count = 0;
2946 for (i = 1; i < (int)UCHAR_MAX; i++) {
2947 if (isalnum (i) || i == underscore) {
2948 Word_Char [word_count++] = (unsigned char) i;
2951 if (isalpha (i)) {
2952 Letter_Char [letter_count++] = (unsigned char) i;
2955 /* Note: Whether or not newline is considered to be whitespace is
2956 handled by switches within the original regex and is thus omitted
2957 here. */
2959 if (isspace (i) && (i != (int) '\n')) {
2960 White_Space [space_count++] = (unsigned char) i;
2963 /* Make sure arrays are big enough. ("- 2" because of zero array
2964 origin and we need to leave room for the NULL terminator.) */
2966 if (word_count > (ALNUM_CHAR_SIZE - 2) ||
2967 space_count > (WHITE_SPACE_SIZE - 2) ||
2968 letter_count > (ALNUM_CHAR_SIZE - 2)) {
2970 reg_error ("internal error #9 `init_ansi_classes\'");
2971 return (0);
2975 Word_Char [word_count] = '\0';
2976 Letter_Char [word_count] = '\0';
2977 White_Space [space_count] = '\0';
2980 return (1);
2983 /*----------------------------------------------------------------------*
2984 * attempt - try match at specific point, returns: 0 failure, 1 success
2985 *----------------------------------------------------------------------*/
2987 static int attempt (regexp *prog, unsigned char *string) {
2989 register int i;
2990 register unsigned char **s_ptr;
2991 register unsigned char **e_ptr;
2992 int branch_index = 0; /* Must be set to zero ! */
2994 Reg_Input = string;
2995 Start_Ptr_Ptr = (unsigned char **) prog->startp;
2996 End_Ptr_Ptr = (unsigned char **) prog->endp;
2997 s_ptr = (unsigned char **) prog->startp;
2998 e_ptr = (unsigned char **) prog->endp;
3000 /* Reset the recursion counter. */
3001 Recursion_Count = 0;
3003 /* Overhead due to capturing parentheses. */
3005 Extent_Ptr_BW = string;
3006 Extent_Ptr_FW = NULL;
3008 for (i = Total_Paren + 1; i > 0; i--) {
3009 *s_ptr++ = NULL;
3010 *e_ptr++ = NULL;
3013 if (match ((unsigned char *) (prog->program + REGEX_START_OFFSET),
3014 &branch_index)) {
3015 prog->startp [0] = (char *) string;
3016 prog->endp [0] = (char *) Reg_Input; /* <-- One char AFTER */
3017 prog->extentpBW = (char *) Extent_Ptr_BW; /* matched string! */
3018 prog->extentpFW = (char *) Extent_Ptr_FW;
3019 prog->top_branch = branch_index;
3021 return (1);
3022 } else {
3023 return (0);
3027 /*----------------------------------------------------------------------*
3028 * match - main matching routine
3030 * Conceptually the strategy is simple: check to see whether the
3031 * current node matches, call self recursively to see whether the rest
3032 * matches, and then act accordingly. In practice we make some effort
3033 * to avoid recursion, in particular by going through "ordinary" nodes
3034 * (that don't need to know whether the rest of the match failed) by a
3035 * loop instead of by recursion. Returns 0 failure, 1 success.
3036 *----------------------------------------------------------------------*/
3037 #define MATCH_RETURN(X)\
3038 { --Recursion_Count; return (X); }
3039 #define CHECK_RECURSION_LIMIT\
3040 if (Recursion_Limit_Exceeded) MATCH_RETURN (0);
3042 static int match (unsigned char *prog, int *branch_index_param) {
3044 register unsigned char *scan; /* Current node. */
3045 unsigned char *next; /* Next node. */
3046 register int next_ptr_offset; /* Used by the NEXT_PTR () macro */
3048 if (++Recursion_Count > REGEX_RECURSION_LIMIT) {
3049 if (!Recursion_Limit_Exceeded) /* Prevent duplicate errors */
3050 reg_error("recursion limit exceeded, please respecify expression");
3051 Recursion_Limit_Exceeded = 1;
3052 MATCH_RETURN (0);
3056 scan = prog;
3058 while (scan != NULL) {
3059 NEXT_PTR (scan, next);
3061 switch (GET_OP_CODE (scan)) {
3062 case BRANCH:
3064 register unsigned char *save;
3065 register int branch_index_local = 0;
3067 if (GET_OP_CODE (next) != BRANCH) { /* No choice. */
3068 next = OPERAND (scan); /* Avoid recursion. */
3069 } else {
3070 do {
3071 save = Reg_Input;
3073 if (match (OPERAND (scan), NULL))
3075 if (branch_index_param)
3076 *branch_index_param = branch_index_local;
3077 MATCH_RETURN (1);
3080 CHECK_RECURSION_LIMIT
3082 ++branch_index_local;
3084 Reg_Input = save; /* Backtrack. */
3085 NEXT_PTR (scan, scan);
3086 } while (scan != NULL && GET_OP_CODE (scan) == BRANCH);
3088 MATCH_RETURN (0); /* NOT REACHED */
3092 break;
3094 case EXACTLY:
3096 register int len;
3097 register unsigned char *opnd;
3099 opnd = OPERAND (scan);
3101 /* Inline the first character, for speed. */
3103 if (*opnd != *Reg_Input) MATCH_RETURN (0);
3105 len = strlen ((char *) opnd);
3107 if (End_Of_String != NULL && Reg_Input + len > End_Of_String) {
3108 MATCH_RETURN (0);
3111 if (len > 1 &&
3112 strncmp ((char *) opnd, (char *) Reg_Input, len) != 0) {
3114 MATCH_RETURN (0);
3117 Reg_Input += len;
3120 break;
3122 case SIMILAR:
3124 register unsigned char *opnd;
3125 register unsigned char test;
3127 opnd = OPERAND (scan);
3129 /* Note: the SIMILAR operand was converted to lower case during
3130 regex compile. */
3132 while ((test = *opnd++) != '\0') {
3133 if (AT_END_OF_STRING(Reg_Input) ||
3134 tolower (*Reg_Input++) != test) {
3136 MATCH_RETURN (0);
3141 break;
3143 case BOL: /* `^' (beginning of line anchor) */
3144 if (Reg_Input == Start_Of_String) {
3145 if (Prev_Is_BOL) break;
3146 } else if (*(Reg_Input - 1) == '\n') {
3147 break;
3150 MATCH_RETURN (0);
3152 case EOL: /* `$' anchor matches end of line and end of string */
3153 if (*Reg_Input == '\n' || (AT_END_OF_STRING(Reg_Input) && Succ_Is_EOL)) {
3154 break;
3157 MATCH_RETURN (0);
3159 case BOWORD: /* `<' (beginning of word anchor) */
3160 /* Check to see if the current character is not a delimiter
3161 and the preceding character is. */
3163 int prev_is_delim;
3164 if (Reg_Input == Start_Of_String) {
3165 prev_is_delim = Prev_Is_Delim;
3166 } else {
3167 prev_is_delim = Current_Delimiters [ *(Reg_Input - 1) ];
3169 if (prev_is_delim) {
3170 int current_is_delim;
3171 if (AT_END_OF_STRING(Reg_Input)) {
3172 current_is_delim = Succ_Is_Delim;
3173 } else {
3174 current_is_delim = Current_Delimiters [ *Reg_Input ];
3176 if (!current_is_delim) break;
3180 MATCH_RETURN (0);
3182 case EOWORD: /* `>' (end of word anchor) */
3183 /* Check to see if the current character is a delimiter
3184 and the preceding character is not. */
3186 int prev_is_delim;
3187 if (Reg_Input == Start_Of_String) {
3188 prev_is_delim = Prev_Is_Delim;
3189 } else {
3190 prev_is_delim = Current_Delimiters [ *(Reg_Input-1) ];
3192 if (!prev_is_delim) {
3193 int current_is_delim;
3194 if (AT_END_OF_STRING(Reg_Input)) {
3195 current_is_delim = Succ_Is_Delim;
3196 } else {
3197 current_is_delim = Current_Delimiters [ *Reg_Input ];
3199 if (current_is_delim) break;
3203 MATCH_RETURN (0);
3205 case NOT_BOUNDARY: /* \B (NOT a word boundary) */
3207 int prev_is_delim;
3208 int current_is_delim;
3209 if (Reg_Input == Start_Of_String) {
3210 prev_is_delim = Prev_Is_Delim;
3211 } else {
3212 prev_is_delim = Current_Delimiters [ *(Reg_Input-1) ];
3214 if (AT_END_OF_STRING(Reg_Input)) {
3215 current_is_delim = Succ_Is_Delim;
3216 } else {
3217 current_is_delim = Current_Delimiters [ *Reg_Input ];
3219 if (!(prev_is_delim ^ current_is_delim)) break;
3222 MATCH_RETURN (0);
3224 case IS_DELIM: /* \y (A word delimiter character.) */
3225 if (Current_Delimiters [ *Reg_Input ] &&
3226 !AT_END_OF_STRING(Reg_Input)) {
3227 Reg_Input++; break;
3230 MATCH_RETURN (0);
3232 case NOT_DELIM: /* \Y (NOT a word delimiter character.) */
3233 if (!Current_Delimiters [ *Reg_Input ] &&
3234 !AT_END_OF_STRING(Reg_Input)) {
3235 Reg_Input++; break;
3238 MATCH_RETURN (0);
3240 case WORD_CHAR: /* \w (word character; alpha-numeric or underscore) */
3241 if ((isalnum ((int) *Reg_Input) || *Reg_Input == '_') &&
3242 !AT_END_OF_STRING(Reg_Input)) {
3243 Reg_Input++; break;
3246 MATCH_RETURN (0);
3248 case NOT_WORD_CHAR:/* \W (NOT a word character) */
3249 if (isalnum ((int) *Reg_Input) ||
3250 *Reg_Input == '_' ||
3251 *Reg_Input == '\n' ||
3252 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3254 Reg_Input++; break;
3256 case ANY: /* `.' (matches any character EXCEPT newline) */
3257 if (AT_END_OF_STRING(Reg_Input) || *Reg_Input == '\n') MATCH_RETURN (0);
3259 Reg_Input++; break;
3261 case EVERY: /* `.' (matches any character INCLUDING newline) */
3262 if (AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3264 Reg_Input++; break;
3266 case DIGIT: /* \d, same as [0123456789] */
3267 if (!isdigit ((int) *Reg_Input) ||
3268 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3270 Reg_Input++; break;
3272 case NOT_DIGIT: /* \D, same as [^0123456789] */
3273 if (isdigit ((int) *Reg_Input) ||
3274 *Reg_Input == '\n' ||
3275 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3277 Reg_Input++; break;
3279 case LETTER: /* \l, same as [a-zA-Z] */
3280 if (!isalpha ((int) *Reg_Input) ||
3281 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3283 Reg_Input++; break;
3285 case NOT_LETTER: /* \L, same as [^0123456789] */
3286 if (isalpha ((int) *Reg_Input) ||
3287 *Reg_Input == '\n' ||
3288 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3290 Reg_Input++; break;
3292 case SPACE: /* \s, same as [ \t\r\f\v] */
3293 if (!isspace ((int) *Reg_Input) ||
3294 *Reg_Input == '\n' ||
3295 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3297 Reg_Input++; break;
3299 case SPACE_NL: /* \s, same as [\n \t\r\f\v] */
3300 if (!isspace ((int) *Reg_Input) ||
3301 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3303 Reg_Input++; break;
3305 case NOT_SPACE: /* \S, same as [^\n \t\r\f\v] */
3306 if (isspace ((int) *Reg_Input) ||
3307 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3309 Reg_Input++; break;
3311 case NOT_SPACE_NL: /* \S, same as [^ \t\r\f\v] */
3312 if ((isspace ((int) *Reg_Input) && *Reg_Input != '\n') ||
3313 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3315 Reg_Input++; break;
3317 case ANY_OF: /* [...] character class. */
3318 if (AT_END_OF_STRING(Reg_Input))
3319 MATCH_RETURN (0); /* Needed because strchr ()
3320 considers \0 as a member
3321 of the character set. */
3323 if (strchr ((char *) OPERAND (scan), (int) *Reg_Input) == NULL) {
3324 MATCH_RETURN (0);
3327 Reg_Input++; break;
3329 case ANY_BUT: /* [^...] Negated character class-- does NOT normally
3330 match newline (\n added usually to operand at compile
3331 time.) */
3333 if (AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0); /* See comment for ANY_OF. */
3335 if (strchr ((char *) OPERAND (scan), (int) *Reg_Input) != NULL) {
3336 MATCH_RETURN (0);
3339 Reg_Input++; break;
3341 case NOTHING:
3342 case BACK:
3343 break;
3345 case STAR:
3346 case PLUS:
3347 case QUESTION:
3348 case BRACE:
3350 case LAZY_STAR:
3351 case LAZY_PLUS:
3352 case LAZY_QUESTION:
3353 case LAZY_BRACE:
3355 register unsigned long num_matched = REG_ZERO;
3356 register unsigned long min = ULONG_MAX, max = REG_ZERO;
3357 register unsigned char *save;
3358 register unsigned char next_char;
3359 unsigned char *next_op;
3360 int lazy = 0;
3362 /* Lookahead (when possible) to avoid useless match attempts
3363 when we know what character comes next. */
3365 if (GET_OP_CODE (next) == EXACTLY) {
3366 next_char = *OPERAND (next);
3367 } else {
3368 next_char = '\0';/* i.e. Don't know what next character is. */
3371 next_op = OPERAND (scan);
3373 switch (GET_OP_CODE (scan)) {
3374 case LAZY_STAR:
3375 lazy = 1;
3376 case STAR:
3377 min = REG_ZERO;
3378 max = ULONG_MAX;
3379 break;
3381 case LAZY_PLUS:
3382 lazy = 1;
3383 case PLUS:
3384 min = REG_ONE;
3385 max = ULONG_MAX;
3386 break;
3388 case LAZY_QUESTION:
3389 lazy = 1;
3390 case QUESTION:
3391 min = REG_ZERO;
3392 max = REG_ONE;
3393 break;
3395 case LAZY_BRACE:
3396 lazy = 1;
3397 case BRACE:
3398 min = (unsigned long)
3399 GET_OFFSET (scan + NEXT_PTR_SIZE);
3401 max = (unsigned long)
3402 GET_OFFSET (scan + (2 * NEXT_PTR_SIZE));
3404 if (max <= REG_INFINITY) max = ULONG_MAX;
3406 next_op = OPERAND (scan + (2 * NEXT_PTR_SIZE));
3409 save = Reg_Input;
3411 if (lazy) {
3412 if ( min > REG_ZERO) num_matched = greedy (next_op, min);
3413 } else {
3414 num_matched = greedy (next_op, max);
3417 while (min <= num_matched && num_matched <= max) {
3418 if (next_char == '\0' || next_char == *Reg_Input) {
3419 if (match (next, NULL)) MATCH_RETURN (1);
3421 CHECK_RECURSION_LIMIT
3424 /* Couldn't or didn't match. */
3426 if (lazy) {
3427 if (!greedy (next_op, 1)) MATCH_RETURN (0);
3429 num_matched++; /* Inch forward. */
3430 } else if (num_matched > REG_ZERO) {
3431 num_matched--; /* Back up. */
3432 } else if (min == REG_ZERO && num_matched == REG_ZERO) {
3433 break;
3436 Reg_Input = save + num_matched;
3439 MATCH_RETURN (0);
3442 break;
3444 case END:
3445 if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
3446 Extent_Ptr_FW = Reg_Input;
3449 MATCH_RETURN (1); /* Success! */
3451 break;
3453 case INIT_COUNT:
3454 Brace->count [*OPERAND (scan)] = REG_ZERO;
3456 break;
3458 case INC_COUNT:
3459 Brace->count [*OPERAND (scan)]++;
3461 break;
3463 case TEST_COUNT:
3464 if (Brace->count [*OPERAND (scan)] <
3465 (unsigned long) GET_OFFSET (scan + NEXT_PTR_SIZE + INDEX_SIZE)) {
3467 next = scan + NODE_SIZE + INDEX_SIZE + NEXT_PTR_SIZE;
3470 break;
3472 case BACK_REF:
3473 case BACK_REF_CI:
3474 /* case X_REGEX_BR: */
3475 /* case X_REGEX_BR_CI: *** IMPLEMENT LATER */
3477 register unsigned char *captured, *finish;
3478 int paren_no;
3480 paren_no = (int) *OPERAND (scan);
3482 /* if (GET_OP_CODE (scan) == X_REGEX_BR ||
3483 GET_OP_CODE (scan) == X_REGEX_BR_CI) {
3485 if (Cross_Regex_Backref == NULL) MATCH_RETURN (0);
3487 captured =
3488 (unsigned char *) Cross_Regex_Backref->startp [paren_no];
3490 finish =
3491 (unsigned char *) Cross_Regex_Backref->endp [paren_no];
3492 } else { */
3493 captured = Back_Ref_Start [paren_no];
3494 finish = Back_Ref_End [paren_no];
3495 /* } */
3497 if ((captured != NULL) && (finish != NULL)) {
3498 if (captured > finish) MATCH_RETURN (0);
3500 if (GET_OP_CODE (scan) == BACK_REF_CI /* ||
3501 GET_OP_CODE (scan) == X_REGEX_BR_CI*/ ) {
3503 while (captured < finish) {
3504 if (AT_END_OF_STRING(Reg_Input) ||
3505 tolower (*captured++) != tolower (*Reg_Input++)) {
3506 MATCH_RETURN (0);
3509 } else {
3510 while (captured < finish) {
3511 if (AT_END_OF_STRING(Reg_Input) ||
3512 *captured++ != *Reg_Input++) MATCH_RETURN (0);
3516 break;
3517 } else {
3518 MATCH_RETURN (0);
3522 case POS_AHEAD_OPEN:
3523 case NEG_AHEAD_OPEN:
3525 register unsigned char *save;
3526 register unsigned char *saved_end;
3527 int answer;
3529 save = Reg_Input;
3531 /* Temporarily ignore the logical end of the string, to allow
3532 lookahead past the end. */
3533 saved_end = End_Of_String;
3534 End_Of_String = NULL;
3536 answer = match (next, NULL); /* Does the look-ahead regex match? */
3538 CHECK_RECURSION_LIMIT
3540 if ((GET_OP_CODE (scan) == POS_AHEAD_OPEN) ? answer : !answer) {
3541 /* Remember the last (most to the right) character position
3542 that we consume in the input for a successful match. This
3543 is info that may be needed should an attempt be made to
3544 match the exact same text at the exact same place. Since
3545 look-aheads backtrack, a regex with a trailing look-ahead
3546 may need more text than it matches to accomplish a
3547 re-match. */
3549 if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
3550 Extent_Ptr_FW = Reg_Input;
3553 Reg_Input = save; /* Backtrack to look-ahead start. */
3554 End_Of_String = saved_end; /* Restore logical end. */
3556 /* Jump to the node just after the (?=...) or (?!...)
3557 Construct. */
3559 next = next_ptr (OPERAND (scan)); /* Skip 1st branch */
3560 /* Skip the chain of branches inside the look-ahead */
3561 while(GET_OP_CODE(next) == BRANCH)
3562 next = next_ptr (next);
3563 next = next_ptr (next); /* Skip the LOOK_AHEAD_CLOSE */
3564 } else {
3565 Reg_Input = save; /* Backtrack to look-ahead start. */
3566 End_Of_String = saved_end; /* Restore logical end. */
3568 MATCH_RETURN (0);
3572 break;
3574 case POS_BEHIND_OPEN:
3575 case NEG_BEHIND_OPEN:
3577 register unsigned char *save;
3578 int answer;
3579 register int offset, upper;
3580 int lower;
3581 int found = 0;
3582 unsigned char *saved_end;
3584 save = Reg_Input;
3585 saved_end = End_Of_String;
3587 /* Prevent overshoot (greedy matching could end past the
3588 current position) by tightening the matching boundary.
3589 Lookahead inside lookbehind can still cross that boundary. */
3590 End_Of_String = Reg_Input;
3592 lower = GET_LOWER (scan);
3593 upper = GET_UPPER (scan);
3595 /* Start with the shortest match first. This is the most
3596 efficient direction in general.
3597 Note! Negative look behind is _very_ tricky when the length
3598 is not constant: we have to make sure the expression doesn't
3599 match for _any_ of the starting positions. */
3600 for (offset = lower; offset <= upper; ++offset) {
3601 Reg_Input = save - offset;
3603 if (Reg_Input < Look_Behind_To) {
3604 /* No need to look any further */
3605 break;
3608 answer = match (next, NULL); /* Does the look-behind regex match? */
3610 CHECK_RECURSION_LIMIT
3612 /* The match must have ended at the current position;
3613 otherwise it is invalid */
3614 if (answer && Reg_Input == save) {
3615 /* It matched, exactly far enough */
3616 found = 1;
3618 /* Remember the last (most to the left) character position
3619 that we consume in the input for a successful match.
3620 This is info that may be needed should an attempt be
3621 made to match the exact same text at the exact same
3622 place. Since look-behind backtracks, a regex with a
3623 leading look-behind may need more text than it matches
3624 to accomplish a re-match. */
3626 if (Extent_Ptr_BW == NULL ||
3627 (Extent_Ptr_BW - (save - offset)) > 0) {
3628 Extent_Ptr_BW = save - offset;
3631 break;
3635 /* Always restore the position and the logical string end. */
3636 Reg_Input = save;
3637 End_Of_String = saved_end;
3639 if ((GET_OP_CODE (scan) == POS_BEHIND_OPEN) ? found : !found) {
3640 /* The look-behind matches, so we must jump to the next
3641 node. The look-behind node is followed by a chain of
3642 branches (contents of the look-behind expression), and
3643 terminated by a look-behind-close node. */
3644 next = next_ptr (OPERAND (scan) + LENGTH_SIZE); /* 1st branch */
3645 /* Skip the chained branches inside the look-ahead */
3646 while (GET_OP_CODE (next) == BRANCH)
3647 next = next_ptr (next);
3648 next = next_ptr (next); /* Skip LOOK_BEHIND_CLOSE */
3649 } else {
3650 /* Not a match */
3651 MATCH_RETURN (0);
3654 break;
3656 case LOOK_AHEAD_CLOSE:
3657 case LOOK_BEHIND_CLOSE:
3658 MATCH_RETURN (1); /* We have reached the end of the look-ahead or
3659 look-behind which implies that we matched it,
3660 so return TRUE. */
3661 default:
3662 if ((GET_OP_CODE (scan) > OPEN) &&
3663 (GET_OP_CODE (scan) < OPEN + NSUBEXP)) {
3665 register int no;
3666 register unsigned char *save;
3668 no = GET_OP_CODE (scan) - OPEN;
3669 save = Reg_Input;
3671 if (no < 10) {
3672 Back_Ref_Start [no] = save;
3673 Back_Ref_End [no] = NULL;
3676 if (match (next, NULL)) {
3677 /* Do not set `Start_Ptr_Ptr' if some later invocation (think
3678 recursion) of the same parentheses already has. */
3680 if (Start_Ptr_Ptr [no] == NULL) Start_Ptr_Ptr [no] = save;
3682 MATCH_RETURN (1);
3683 } else {
3684 MATCH_RETURN (0);
3686 } else if ((GET_OP_CODE (scan) > CLOSE) &&
3687 (GET_OP_CODE (scan) < CLOSE + NSUBEXP)) {
3689 register int no;
3690 register unsigned char *save;
3692 no = GET_OP_CODE (scan) - CLOSE;
3693 save = Reg_Input;
3695 if (no < 10) Back_Ref_End [no] = save;
3697 if (match (next, NULL)) {
3698 /* Do not set `End_Ptr_Ptr' if some later invocation of the
3699 same parentheses already has. */
3701 if (End_Ptr_Ptr [no] == NULL) End_Ptr_Ptr [no] = save;
3703 MATCH_RETURN (1);
3704 } else {
3705 MATCH_RETURN (0);
3707 } else {
3708 reg_error ("memory corruption, `match\'");
3710 MATCH_RETURN (0);
3713 break;
3716 scan = next;
3719 /* We get here only if there's trouble -- normally "case END" is
3720 the terminating point. */
3722 reg_error ("corrupted pointers, `match\'");
3724 MATCH_RETURN (0);
3727 /*----------------------------------------------------------------------*
3728 * greedy
3730 * Repeatedly match something simple up to "max" times. If max <= 0
3731 * then match as much as possible (max = infinity). Uses unsigned long
3732 * variables to maximize the amount of text matchable for unbounded
3733 * qualifiers like '*' and '+'. This will allow at least 4,294,967,295
3734 * matches (4 Gig!) for an ANSI C compliant compiler. If you are
3735 * applying a regex to something bigger than that, you shouldn't be
3736 * using NEdit!
3738 * Returns the actual number of matches.
3739 *----------------------------------------------------------------------*/
3741 static unsigned long greedy (unsigned char *p, long max) {
3743 register unsigned char *input_str;
3744 register unsigned char *operand;
3745 register unsigned long count = REG_ZERO;
3746 register unsigned long max_cmp;
3748 input_str = Reg_Input;
3749 operand = OPERAND (p); /* Literal char or start of class characters. */
3750 max_cmp = (max > 0) ? (unsigned long) max : ULONG_MAX;
3752 switch (GET_OP_CODE (p)) {
3753 case ANY:
3754 /* Race to the end of the line or string. Dot DOESN'T match
3755 newline. */
3757 while (count < max_cmp &&
3758 *input_str != '\n' &&
3759 !AT_END_OF_STRING(input_str)) {
3760 count++; input_str++;
3763 break;
3765 case EVERY:
3766 /* Race to the end of the line or string. Dot DOES match newline. */
3768 while (count < max_cmp &&
3769 !AT_END_OF_STRING(input_str)) {
3770 count++; input_str++;
3773 break;
3775 case EXACTLY: /* Count occurrences of single character operand. */
3776 while (count < max_cmp &&
3777 *operand == *input_str &&
3778 !AT_END_OF_STRING(input_str)) {
3779 count++; input_str++;
3782 break;
3784 case SIMILAR: /* Case insensitive version of EXACTLY */
3785 while (count < max_cmp &&
3786 *operand == tolower (*input_str) &&
3787 !AT_END_OF_STRING(input_str)) {
3788 count++; input_str++;
3791 break;
3793 case ANY_OF: /* [...] character class. */
3794 while (count < max_cmp &&
3795 strchr ((char *) operand, (int) *input_str) != NULL &&
3796 !AT_END_OF_STRING(input_str)) {
3798 count++; input_str++;
3801 break;
3803 case ANY_BUT: /* [^...] Negated character class- does NOT normally
3804 match newline (\n added usually to operand at compile
3805 time.) */
3807 while (count < max_cmp &&
3808 strchr ((char *) operand, (int) *input_str) == NULL &&
3809 !AT_END_OF_STRING(input_str)) {
3811 count++; input_str++;
3814 break;
3816 case IS_DELIM: /* \y (not a word delimiter char)
3817 NOTE: '\n' and '\0' are always word delimiters. */
3819 while (count < max_cmp &&
3820 Current_Delimiters [ *input_str ] &&
3821 !AT_END_OF_STRING(input_str)) {
3822 count++; input_str++;
3825 break;
3827 case NOT_DELIM: /* \Y (not a word delimiter char)
3828 NOTE: '\n' and '\0' are always word delimiters. */
3830 while (count < max_cmp &&
3831 !Current_Delimiters [ *input_str ] &&
3832 !AT_END_OF_STRING(input_str)) {
3833 count++; input_str++;
3836 break;
3838 case WORD_CHAR: /* \w (word character, alpha-numeric or underscore) */
3839 while (count < max_cmp &&
3840 (isalnum ((int) *input_str) ||
3841 *input_str == (unsigned char) '_') &&
3842 !AT_END_OF_STRING(input_str)) {
3844 count++; input_str++;
3847 break;
3849 case NOT_WORD_CHAR:/* \W (NOT a word character) */
3850 while (count < max_cmp &&
3851 !isalnum ((int) *input_str) &&
3852 *input_str != (unsigned char) '_' &&
3853 *input_str != (unsigned char) '\n' &&
3854 !AT_END_OF_STRING(input_str)) {
3856 count++; input_str++;
3859 break;
3861 case DIGIT: /* same as [0123456789] */
3862 while (count < max_cmp &&
3863 isdigit ((int) *input_str) &&
3864 !AT_END_OF_STRING(input_str)) {
3865 count++; input_str++;
3868 break;
3870 case NOT_DIGIT: /* same as [^0123456789] */
3871 while (count < max_cmp &&
3872 !isdigit ((int) *input_str) &&
3873 *input_str != '\n' &&
3874 !AT_END_OF_STRING(input_str)) {
3876 count++; input_str++;
3879 break;
3881 case SPACE: /* same as [ \t\r\f\v]-- doesn't match newline. */
3882 while (count < max_cmp &&
3883 isspace ((int) *input_str) &&
3884 *input_str != '\n' &&
3885 !AT_END_OF_STRING(input_str)) {
3887 count++; input_str++;
3890 break;
3892 case SPACE_NL: /* same as [\n \t\r\f\v]-- matches newline. */
3893 while (count < max_cmp &&
3894 isspace ((int) *input_str) &&
3895 !AT_END_OF_STRING(input_str)) {
3897 count++; input_str++;
3900 break;
3902 case NOT_SPACE: /* same as [^\n \t\r\f\v]-- doesn't match newline. */
3903 while (count < max_cmp &&
3904 !isspace ((int) *input_str) &&
3905 !AT_END_OF_STRING(input_str)) {
3907 count++; input_str++;
3910 break;
3912 case NOT_SPACE_NL: /* same as [^ \t\r\f\v]-- matches newline. */
3913 while (count < max_cmp &&
3914 (!isspace ((int) *input_str) || *input_str == '\n') &&
3915 !AT_END_OF_STRING(input_str)) {
3917 count++; input_str++;
3920 break;
3922 case LETTER: /* same as [a-zA-Z] */
3923 while (count < max_cmp &&
3924 isalpha ((int) *input_str) &&
3925 !AT_END_OF_STRING(input_str)) {
3927 count++; input_str++;
3930 break;
3932 case NOT_LETTER: /* same as [^a-zA-Z] */
3933 while (count < max_cmp &&
3934 !isalpha ((int) *input_str) &&
3935 *input_str != '\n' &&
3936 !AT_END_OF_STRING(input_str)) {
3938 count++; input_str++;
3941 break;
3943 default:
3944 /* Called inappropriately. Only atoms that are SIMPLE should
3945 generate a call to greedy. The above cases should cover
3946 all the atoms that are SIMPLE. */
3948 reg_error ("internal error #10 `greedy\'");
3949 count = 0U; /* Best we can do. */
3952 /* Point to character just after last matched character. */
3954 Reg_Input = input_str;
3956 return (count);
3959 /*----------------------------------------------------------------------*
3960 * next_ptr - compute the address of a node's "NEXT" pointer.
3961 * Note: a simplified inline version is available via the NEXT_PTR() macro,
3962 * but that one is only to be used at time-critical places (see the
3963 * description of the macro).
3964 *----------------------------------------------------------------------*/
3966 static unsigned char * next_ptr (unsigned char *ptr) {
3968 register int offset;
3970 if (ptr == &Compute_Size) return (NULL);
3972 offset = GET_OFFSET (ptr);
3974 if (offset == 0) return (NULL);
3976 if (GET_OP_CODE (ptr) == BACK) {
3977 return (ptr - offset);
3978 } else {
3979 return (ptr + offset);
3983 /*----------------------------------------------------------------------*
3984 * SubstituteRE - Perform substitutions after a `regexp' match.
3985 *----------------------------------------------------------------------*/
3987 void SubstituteRE (
3988 regexp *prog,
3989 const char *source,
3990 char *dest,
3991 int max) {
3993 register unsigned char *src;
3994 unsigned char *src_alias;
3995 register unsigned char *dst;
3996 register unsigned char c;
3997 register unsigned char test;
3998 register int paren_no;
3999 register int len;
4000 register unsigned char chgcase;
4002 if (prog == NULL || source == NULL || dest == NULL) {
4003 reg_error ("NULL parm to `SubstituteRE\'");
4005 return;
4008 if (U_CHAR_AT (prog->program) != MAGIC) {
4009 reg_error ("damaged regexp passed to `SubstituteRE\'");
4011 return;
4014 src = (unsigned char *) source;
4015 dst = (unsigned char *) dest;
4017 while ((c = *src++) != '\0') {
4018 chgcase = '\0';
4019 paren_no = -1;
4021 if (c == '\\') {
4022 /* Process any case altering tokens, i.e \u, \U, \l, \L. */
4024 if (*src == 'u' || *src == 'U' || *src == 'l' || *src == 'L') {
4025 chgcase = *src;
4026 src++;
4027 c = *src++;
4029 if (c == '\0') break;
4033 if (c == '&') {
4034 paren_no = 0;
4035 } else if (c == '\\') {
4036 /* Can not pass register variable `&src' to function `numeric_escape'
4037 so make a non-register copy that we can take the address of. */
4039 src_alias = src;
4041 if ('1' <= *src && *src <= '9') {
4042 paren_no = (int) *src++ - (int) '0';
4044 } else if ((test = literal_escape (*src)) != '\0') {
4045 c = test; src++;
4047 } else if ((test = numeric_escape (*src, &src_alias)) != '\0') {
4048 c = test;
4049 src = src_alias; src++;
4051 /* NOTE: if an octal escape for zero is attempted (e.g. \000), it
4052 will be treated as a literal string. */
4053 } else if (*src == '\0') {
4054 /* If '\' is the last character of the replacement string, it is
4055 interpreted as a literal backslash. */
4057 c = '\\';
4058 } else {
4059 c = *src++; /* Allow any escape sequence (This is */
4060 } /* INCONSISTENT with the `CompileRE' */
4061 } /* mind set of issuing an error! */
4063 if (paren_no < 0) { /* Ordinary character. */
4064 if (((char *) dst - (char *) dest) >= (max - 1)) {
4065 break;
4066 } else {
4067 *dst++ = c;
4069 } else if (prog->startp [paren_no] != NULL &&
4070 prog->endp [paren_no] != NULL) {
4072 len = prog->endp [paren_no] - prog->startp [paren_no];
4074 if (((char *) dst + len - (char *) dest) >= max-1) {
4075 len = max - ((char *) dst - (char *) dest) - 1;
4078 (void) strncpy ((char *) dst, (char *) prog->startp [paren_no], len);
4080 if (chgcase != '\0') adjustcase (dst, len, chgcase);
4082 dst += len;
4084 if (len != 0 && *(dst - 1) == '\0') { /* strncpy hit NUL. */
4085 reg_error ("damaged match string in `SubstituteRE\'");
4087 return;
4092 *dst = '\0';
4095 static void adjustcase (unsigned char *str, int len, unsigned char chgcase) {
4097 register unsigned char *string = str;
4098 int i;
4100 /* The tokens \u and \l only modify the first character while the tokens
4101 \U and \L modify the entire string. */
4103 if (islower (chgcase) && len > 0) len = 1;
4105 switch (chgcase) {
4106 case 'u':
4107 case 'U':
4108 for (i = 0; i < len; i++) {
4109 *(string + i) = toupper ((int) *(string + i));
4112 break;
4114 case 'l':
4115 case 'L':
4116 for (i = 0; i < len; i++) {
4117 *(string + i) = tolower ((int) *(string + i));
4120 break;
4124 /*----------------------------------------------------------------------*
4125 * reg_error
4126 *----------------------------------------------------------------------*/
4128 static void reg_error (char *str) {
4130 fprintf (
4131 stderr,
4132 "NEdit: Internal error processing regular expression (%s)\n",
4133 str);
4136 /*----------------------------------------------------------------------*
4137 * makeDelimiterTable
4139 * Translate a null-terminated string of delimiters into a 256 byte
4140 * lookup table for determining whether a character is a delimiter or
4141 * not.
4143 * Table must be allocated by the caller.
4145 * Return value is a pointer to the table.
4146 *----------------------------------------------------------------------*/
4148 static unsigned char * makeDelimiterTable (
4149 unsigned char *delimiters,
4150 unsigned char *table) {
4152 unsigned char *c;
4154 memset (table, 0, 256);
4156 for (c = (unsigned char *) delimiters; *c != '\0'; c++) {
4157 table [*c] = 1;
4160 table [(int) NULL] = 1; /* These */
4161 table [(int) '\t'] = 1; /* characters */
4162 table [(int) '\n'] = 1; /* are always */
4163 table [(int) ' ' ] = 1; /* delimiters. */
4165 return table;
4168 /*----------------------------------------------------------------------*
4169 * SetREDefaultWordDelimiters
4171 * Builds a default delimiter table that persists across `ExecRE' calls.
4172 *----------------------------------------------------------------------*/
4174 void SetREDefaultWordDelimiters (char *delimiters) {
4175 makeDelimiterTable ((unsigned char *) delimiters, Default_Delimiters);
4178 void EnableCountingQuantifier (int is_enabled) {
4179 Enable_Counting_Quantifier = is_enabled;