The prog parameter from ExecRE() is first dereferenced and than checked for
[nedit.git] / source / regularExp.c
blob12afaded3957b13508d2cd99aa81e3425dd3814c
1 static const char CVSID[] = "$Id: regularExp.c,v 1.33 2008/02/29 23:12:26 lebert Exp $";
2 /*------------------------------------------------------------------------*
3 * `CompileRE', `ExecRE', and `substituteRE' -- regular expression parsing
5 * This is a HIGHLY ALTERED VERSION of Henry Spencer's `regcomp' and
6 * `regexec' code adapted for NEdit.
8 * .-------------------------------------------------------------------.
9 * | ORIGINAL COPYRIGHT NOTICE: |
10 * | |
11 * | Copyright (c) 1986 by University of Toronto. |
12 * | Written by Henry Spencer. Not derived from licensed software. |
13 * | |
14 * | Permission is granted to anyone to use this software for any |
15 * | purpose on any computer system, and to redistribute it freely, |
16 * | subject to the following restrictions: |
17 * | |
18 * | 1. The author is not responsible for the consequences of use of |
19 * | this software, no matter how awful, even if they arise |
20 * | from defects in it. |
21 * | |
22 * | 2. The origin of this software must not be misrepresented, either |
23 * | by explicit claim or by omission. |
24 * | |
25 * | 3. Altered versions must be plainly marked as such, and must not |
26 * | be misrepresented as being the original software. |
27 * `-------------------------------------------------------------------'
29 * This is free software; you can redistribute it and/or modify it under the
30 * terms of the GNU General Public License as published by the Free Software
31 * Foundation; either version 2 of the License, or (at your option) any later
32 * version. In addition, you may distribute version of this program linked to
33 * Motif or Open Motif. See README for details.
35 * This software is distributed in the hope that it will be useful, but WITHOUT
36 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
37 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
38 * for more details.
40 * You should have received a copy of the GNU General Public License along with
41 * software; if not, write to the Free Software Foundation, Inc., 59 Temple
42 * Place, Suite 330, Boston, MA 02111-1307 USA
45 * BEWARE that some of this code is subtly aware of the way operator
46 * precedence is structured in regular expressions. Serious changes in
47 * regular-expression syntax might require a total rethink.
48 * -- Henry Spencer
49 * (Yes, it did!) -- Christopher Conrad, Dec. 1999
51 * January, 1994, Mark Edel
52 * Consolidated files, changed names of external functions to avoid
53 * potential conflicts with native regcomp and regexec functions, changed
54 * error reporting to NEdit form, added multi-line and reverse searching,
55 * and added \n \t \u \U \l \L.
57 * June, 1996, Mark Edel
58 * Bug in NEXT macro, didn't work for expressions which compiled to over
59 * 256 bytes.
61 * December, 1999, Christopher Conrad
62 * Reformatted code for readability, improved error output, added octal and
63 * hexadecimal escapes, added back-references (\1-\9), added positive look
64 * ahead: (?=...), added negative lookahead: (?!...), added non-capturing
65 * parentheses: (?:...), added case insensitive constructs (?i...) and
66 * (?I...), added newline matching constructs (?n...) and (?N...), added
67 * regex comments: (?#...), added shortcut escapes: \d\D\l\L\s\S\w\W\y\Y.
68 * Added "not a word boundary" anchor \B.
70 * July, 2002, Eddy De Greef
71 * Added look behind, both positive (?<=...) and negative (?<!...) for
72 * bounded-length patterns.
74 * November, 2004, Eddy De Greef
75 * Added constrained matching (allowing specification of the logical end
76 * of the string iso. matching till \0), and fixed several (probably
77 * very old) string overrun errors that could easily result in crashes,
78 * especially in client code.
81 #ifdef HAVE_CONFIG_H
82 #include "../config.h"
83 #endif
85 #include "regularExp.h"
87 #include <ctype.h>
88 #include <limits.h>
89 #include <stdio.h>
90 #include <stdlib.h>
91 #include <string.h>
93 #ifdef HAVE_DEBUG_H
94 #include "../debug.h"
95 #endif
98 /* The first byte of the regexp internal `program' is a magic number to help
99 gaurd against corrupted data; the compiled regex code really begins in the
100 second byte. */
102 #define MAGIC 0234
104 /* The "internal use only" fields in `regexp.h' are present to pass info from
105 * `CompileRE' to `ExecRE' which permits the execute phase to run lots faster on
106 * simple cases. They are:
108 * match_start Character that must begin a match; '\0' if none obvious.
109 * anchor Is the match anchored (at beginning-of-line only)?
111 * `match_start' and `anchor' permit very fast decisions on suitable starting
112 * points for a match, considerably reducing the work done by ExecRE. */
114 /* STRUCTURE FOR A REGULAR EXPRESSION (regex) `PROGRAM'.
116 * This is essentially a linear encoding of a nondeterministic finite-state
117 * machine or NFA (aka syntax charts or `railroad normal form' in parsing
118 * technology). Each node is an opcode plus a NEXT pointer, possibly
119 * followed by operands. NEXT pointers of all nodes except BRANCH implement
120 * concatenation; a NEXT pointer with a BRANCH on both ends of it is
121 * connecting two alternatives. (Here we have one of the subtle syntax
122 * dependencies: an individual BRANCH (as opposed to a collection of them) is
123 * never concatenated with anything because of operator precedence.) The
124 * operand of some types of nodes is a literal string; for others, it is a node
125 * leading into a sub-FSM. In particular, the operand of a BRANCH node is the
126 * first node of the branch. (NB this is _NOT_ a tree structure: the tail of
127 * the branch connects to the thing following the set of BRANCHes.)
129 * The opcodes are: */
131 /* DEFINITION VALUE MEANING */
133 #define END 1 /* End of program. */
135 /* Zero width positional assertions. */
137 #define BOL 2 /* Match position at beginning of line. */
138 #define EOL 3 /* Match position at end of line. */
139 #define BOWORD 4 /* Match "" representing word delimiter or BOL */
140 #define EOWORD 5 /* Match "" representing word delimiter or EOL */
141 #define NOT_BOUNDARY 6 /* Not word boundary (\B, opposite of < and >) */
143 /* Op codes with null terminated string operands. */
145 #define EXACTLY 7 /* Match this string. */
146 #define SIMILAR 8 /* Match this case insensitive string */
147 #define ANY_OF 9 /* Match any character in the set. */
148 #define ANY_BUT 10 /* Match any character not in the set. */
150 /* Op codes to match any character. */
152 #define ANY 11 /* Match any one character (implements '.') */
153 #define EVERY 12 /* Same as ANY but matches newline. */
155 /* Shortcut escapes, \d, \D, \l, \L, \s, \S, \w, \W, \y, \Y. */
157 #define DIGIT 13 /* Match any digit, i.e. [0123456789] */
158 #define NOT_DIGIT 14 /* Match any non-digit, i.e. [^0123456789] */
159 #define LETTER 15 /* Match any letter character [a-zA-Z] */
160 #define NOT_LETTER 16 /* Match any non-letter character [^a-zA-Z] */
161 #define SPACE 17 /* Match any whitespace character EXCEPT \n */
162 #define SPACE_NL 18 /* Match any whitespace character INCLUDING \n */
163 #define NOT_SPACE 19 /* Match any non-whitespace character */
164 #define NOT_SPACE_NL 20 /* Same as NOT_SPACE but matches newline. */
165 #define WORD_CHAR 21 /* Match any word character [a-zA-Z0-9_] */
166 #define NOT_WORD_CHAR 22 /* Match any non-word character [^a-zA-Z0-9_] */
167 #define IS_DELIM 23 /* Match any character that's a word delimiter */
168 #define NOT_DELIM 24 /* Match any character NOT a word delimiter */
170 /* Quantifier nodes. (Only applied to SIMPLE nodes. Quantifiers applied to non
171 SIMPLE nodes or larger atoms are implemented using complex constructs.)*/
173 #define STAR 25 /* Match this (simple) thing 0 or more times. */
174 #define LAZY_STAR 26 /* Minimal matching STAR */
175 #define QUESTION 27 /* Match this (simple) thing 0 or 1 times. */
176 #define LAZY_QUESTION 28 /* Minimal matching QUESTION */
177 #define PLUS 29 /* Match this (simple) thing 1 or more times. */
178 #define LAZY_PLUS 30 /* Minimal matching PLUS */
179 #define BRACE 31 /* Match this (simple) thing m to n times. */
180 #define LAZY_BRACE 32 /* Minimal matching BRACE */
182 /* Nodes used to build complex constructs. */
184 #define NOTHING 33 /* Match empty string (always matches) */
185 #define BRANCH 34 /* Match this alternative, or the next... */
186 #define BACK 35 /* Always matches, NEXT ptr points backward. */
187 #define INIT_COUNT 36 /* Initialize {m,n} counter to zero */
188 #define INC_COUNT 37 /* Increment {m,n} counter by one */
189 #define TEST_COUNT 38 /* Test {m,n} counter against operand */
191 /* Back Reference nodes. */
193 #define BACK_REF 39 /* Match latest matched parenthesized text */
194 #define BACK_REF_CI 40 /* Case insensitive version of BACK_REF */
195 #define X_REGEX_BR 41 /* Cross-Regex Back-Ref for syntax highlighting */
196 #define X_REGEX_BR_CI 42 /* Case insensitive version of X_REGEX_BR_CI */
198 /* Various nodes used to implement parenthetical constructs. */
200 #define POS_AHEAD_OPEN 43 /* Begin positive look ahead */
201 #define NEG_AHEAD_OPEN 44 /* Begin negative look ahead */
202 #define LOOK_AHEAD_CLOSE 45 /* End positive or negative look ahead */
204 #define POS_BEHIND_OPEN 46 /* Begin positive look behind */
205 #define NEG_BEHIND_OPEN 47 /* Begin negative look behind */
206 #define LOOK_BEHIND_CLOSE 48 /* Close look behind */
208 #define OPEN 49 /* Open for capturing parentheses. */
210 /* OPEN+1 is number 1, etc. */
211 #define CLOSE (OPEN + NSUBEXP) /* Close for capturing parentheses. */
213 #define LAST_PAREN (CLOSE + NSUBEXP)
215 #if (LAST_PAREN > UCHAR_MAX)
216 #error "Too many parentheses for storage in an unsigned char (LAST_PAREN too big.)"
217 #endif
219 /* The next_ptr () function can consume up to 30% of the time during matching
220 because it is called an immense number of times (an average of 25
221 next_ptr() calls per match() call was witnessed for Perl syntax
222 highlighting). Therefore it is well worth removing some of the function
223 call overhead by selectively inlining the next_ptr() calls. Moreover,
224 the inlined code can be simplified for matching because one of the tests,
225 only necessary during compilation, can be left out.
226 The net result of using this inlined version at two critical places is
227 a 25% speedup (again, witnesses on Perl syntax highlighting). */
229 #define NEXT_PTR(in_ptr, out_ptr)\
230 next_ptr_offset = GET_OFFSET (in_ptr);\
231 if (next_ptr_offset == 0)\
232 out_ptr = NULL;\
233 else {\
234 if (GET_OP_CODE (in_ptr) == BACK)\
235 out_ptr = in_ptr - next_ptr_offset;\
236 else \
237 out_ptr = in_ptr + next_ptr_offset;\
240 /* OPCODE NOTES:
241 ------------
243 All nodes consist of an 8 bit op code followed by 2 bytes that make up a 16
244 bit NEXT pointer. Some nodes have a null terminated character string operand
245 following the NEXT pointer. Other nodes may have an 8 bit index operand.
246 The TEST_COUNT node has an index operand followed by a 16 bit test value.
247 The BRACE and LAZY_BRACE nodes have two 16 bit values for min and max but no
248 index value.
250 SIMILAR
251 Operand(s): null terminated string
253 Implements a case insensitive match of a string. Mostly intended for use
254 in syntax highlighting patterns for keywords of languages like FORTRAN
255 and Ada that are case insensitive. The regex text in this node is
256 converted to lower case during regex compile.
258 DIGIT, NOT_DIGIT, LETTER, NOT_LETTER, SPACE, NOT_SPACE, WORD_CHAR,
259 NOT_WORD_CHAR
260 Operand(s): None
262 Implements shortcut escapes \d, \D, \l, \L, \s, \S, \w, \W. The locale
263 aware ANSI functions isdigit(), isalpha(), isalnun(), and isspace() are
264 used to implement these in the hopes of increasing portability.
266 NOT_BOUNDARY
267 Operand(s): None
269 Implements \B as a zero width assertion that the current character is
270 NOT on a word boundary. Word boundaries are defined to be the position
271 between two characters where one of those characters is one of the
272 dynamically defined word delimiters, and the other character is not.
274 IS_DELIM
275 Operand(s): None
277 Implements \y as any character that is one of the dynamically
278 specified word delimiters.
280 NOT_DELIM
281 Operand(s): None
283 Implements \Y as any character that is NOT one of the dynamically
284 specified word delimiters.
286 STAR, PLUS, QUESTION, and complex '*', '+', and '?'
287 Operand(s): None (Note: NEXT pointer is usually zero. The code that
288 processes this node skips over it.)
290 Complex (parenthesized) versions implemented as circular BRANCH
291 structures using BACK. SIMPLE versions (one character per match) are
292 implemented separately for speed and to minimize recursion.
294 BRACE, LAZY_BRACE
295 Operand(s): minimum value (2 bytes), maximum value (2 bytes)
297 Implements the {m,n} construct for atoms that are SIMPLE.
299 BRANCH
300 Operand(s): None
302 The set of branches constituting a single choice are hooked together
303 with their NEXT pointers, since precedence prevents anything being
304 concatenated to any individual branch. The NEXT pointer of the last
305 BRANCH in a choice points to the thing following the whole choice. This
306 is also where the final NEXT pointer of each individual branch points;
307 each branch starts with the operand node of a BRANCH node.
309 BACK
310 Operand(s): None
312 Normal NEXT pointers all implicitly point forward. Back implicitly
313 points backward. BACK exists to make loop structures possible.
315 INIT_COUNT
316 Operand(s): index (1 byte)
318 Initializes the count array element referenced by the index operand.
319 This node is used to build general (i.e. parenthesized) {m,n} constructs.
321 INC_COUNT
322 Operand(s): index (1 byte)
324 Increments the count array element referenced by the index operand.
325 This node is used to build general (i.e. parenthesized) {m,n} constructs.
327 TEST_COUNT
328 Operand(s): index (1 byte), test value (2 bytes)
330 Tests the current value of the count array element specified by the
331 index operand against the test value. If the current value is less than
332 the test value, control passes to the node after that TEST_COUNT node.
333 Otherwise control passes to the node referenced by the NEXT pointer for
334 the TEST_COUNT node. This node is used to build general (i.e.
335 parenthesized) {m,n} constructs.
337 BACK_REF, BACK_REF_CI
338 Operand(s): index (1 byte, value 1-9)
340 Implements back references. This node will attempt to match whatever text
341 was most recently captured by the index'th set of parentheses.
342 BACK_REF_CI is case insensitive version.
344 X_REGEX_BR, X_REGEX_BR_CI
345 (NOT IMPLEMENTED YET)
347 Operand(s): index (1 byte, value 1-9)
349 Implements back references into a previously matched but separate regular
350 expression. This is used by syntax highlighting patterns. This node will
351 attempt to match whatever text was most captured by the index'th set of
352 parentheses of the separate regex passed to ExecRE. X_REGEX_BR_CI is case
353 insensitive version.
355 POS_AHEAD_OPEN, NEG_AHEAD_OPEN, LOOK_AHEAD_CLOSE
357 Operand(s): None
359 Implements positive and negative look ahead. Look ahead is an assertion
360 that something is either there or not there. Once this is determined the
361 regex engine backtracks to where it was just before the look ahead was
362 encountered, i.e. look ahead is a zero width assertion.
364 POS_BEHIND_OPEN, NEG_BEHIND_OPEN, LOOK_BEHIND_CLOSE
366 Operand(s): 2x2 bytes for OPEN (match boundaries), None for CLOSE
368 Implements positive and negative look behind. Look behind is an assertion
369 that something is either there or not there in front of the current
370 position. Look behind is a zero width assertion, with the additional
371 constraint that it must have a bounded length (for complexity and
372 efficiency reasons; note that most other implementation even impose
373 fixed length).
375 OPEN, CLOSE
377 Operand(s): None
379 OPEN + n = Start of parenthesis 'n', CLOSE + n = Close of parenthesis
380 'n', and are numbered at compile time.
383 /* A node is one char of opcode followed by two chars of NEXT pointer plus
384 * any operands. NEXT pointers are stored as two 8-bit pieces, high order
385 * first. The value is a positive offset from the opcode of the node
386 * containing it. An operand, if any, simply follows the node. (Note that
387 * much of the code generation knows about this implicit relationship.)
389 * Using two bytes for NEXT_PTR_SIZE is vast overkill for most things,
390 * but allows patterns to get big without disasters. */
392 #define OP_CODE_SIZE 1
393 #define NEXT_PTR_SIZE 2
394 #define INDEX_SIZE 1
395 #define LENGTH_SIZE 4
396 #define NODE_SIZE (NEXT_PTR_SIZE + OP_CODE_SIZE)
398 #define GET_OP_CODE(p) (*(unsigned char *)(p))
399 #define OPERAND(p) ((p) + NODE_SIZE)
400 #define GET_OFFSET(p) ((( *((p) + 1) & 0377) << 8) + (( *((p) + 2)) & 0377))
401 #define PUT_OFFSET_L(v) (unsigned char)(((v) >> 8) & 0377)
402 #define PUT_OFFSET_R(v) (unsigned char) ((v) & 0377)
403 #define GET_LOWER(p) ((( *((p) + NODE_SIZE) & 0377) << 8) + \
404 (( *((p) + NODE_SIZE+1)) & 0377))
405 #define GET_UPPER(p) ((( *((p) + NODE_SIZE+2) & 0377) << 8) + \
406 (( *((p) + NODE_SIZE+3)) & 0377))
408 /* Utility definitions. */
410 #define REG_FAIL(m) {*Error_Ptr = (m); return (NULL);}
411 #define IS_QUANTIFIER(c) ((c) == '*' || (c) == '+' || \
412 (c) == '?' || (c) == Brace_Char)
413 #define SET_BIT(i,n) ((i) |= (1 << ((n) - 1)))
414 #define TEST_BIT(i,n) ((i) & (1 << ((n) - 1)))
415 #define U_CHAR_AT(p) ((unsigned int) *(unsigned char *)(p))
417 /* Flags to be passed up and down via function parameters during compile. */
419 #define WORST 0 /* Worst case. No assumptions can be made.*/
420 #define HAS_WIDTH 1 /* Known never to match null string. */
421 #define SIMPLE 2 /* Simple enough to be STAR/PLUS operand. */
423 #define NO_PAREN 0 /* Only set by initial call to "chunk". */
424 #define PAREN 1 /* Used for normal capturing parentheses. */
425 #define NO_CAPTURE 2 /* Non-capturing parentheses (grouping only). */
426 #define INSENSITIVE 3 /* Case insensitive parenthetical construct */
427 #define SENSITIVE 4 /* Case sensitive parenthetical construct */
428 #define NEWLINE 5 /* Construct to match newlines in most cases */
429 #define NO_NEWLINE 6 /* Construct to match newlines normally */
431 #define REG_INFINITY 0UL
432 #define REG_ZERO 0UL
433 #define REG_ONE 1UL
435 /* Flags for function shortcut_escape() */
437 #define CHECK_ESCAPE 0 /* Check an escape sequence for validity only. */
438 #define CHECK_CLASS_ESCAPE 1 /* Check the validity of an escape within a
439 character class */
440 #define EMIT_CLASS_BYTES 2 /* Emit equivalent character class bytes,
441 e.g \d=0123456789 */
442 #define EMIT_NODE 3 /* Emit the appropriate node. */
444 /* Array sizes for arrays used by function init_ansi_classes. */
446 #define WHITE_SPACE_SIZE 16
447 #define ALNUM_CHAR_SIZE 256
449 /* Number of bytes to offset from the beginning of the regex program to the start
450 of the actual compiled regex code, i.e. skipping over the MAGIC number and
451 the two counters at the front. */
453 #define REGEX_START_OFFSET 3
455 #define MAX_COMPILED_SIZE 32767UL /* Largest size a compiled regex can be.
456 Probably could be 65535UL. */
458 /* Global work variables for `CompileRE'. */
460 static unsigned char *Reg_Parse; /* Input scan ptr (scans user's regex) */
461 static int Total_Paren; /* Parentheses, (), counter. */
462 static int Num_Braces; /* Number of general {m,n} constructs.
463 {m,n} quantifiers of SIMPLE atoms are
464 not included in this count. */
465 static int Closed_Parens; /* Bit flags indicating () closure. */
466 static int Paren_Has_Width; /* Bit flags indicating ()'s that are
467 known to not match the empty string */
468 static unsigned char Compute_Size; /* Address of this used as flag. */
469 static unsigned char *Code_Emit_Ptr; /* When Code_Emit_Ptr is set to
470 &Compute_Size no code is emitted.
471 Instead, the size of code that WOULD
472 have been generated is accumulated in
473 Reg_Size. Otherwise, Code_Emit_Ptr
474 points to where compiled regex code is
475 to be written. */
476 static unsigned long Reg_Size; /* Size of compiled regex code. */
477 static char **Error_Ptr; /* Place to store error messages so
478 they can be returned by `CompileRE' */
479 static char Error_Text [128];/* Sting to build error messages in. */
481 static unsigned char White_Space [WHITE_SPACE_SIZE]; /* Arrays used by */
482 static unsigned char Word_Char [ALNUM_CHAR_SIZE]; /* functions */
483 static unsigned char Letter_Char [ALNUM_CHAR_SIZE]; /* init_ansi_classes () */
484 /* and
485 shortcut_escape (). */
487 static unsigned char ASCII_Digits [] = "0123456789"; /* Same for all */
488 /* locales. */
489 static int Is_Case_Insensitive;
490 static int Match_Newline;
492 static int Enable_Counting_Quantifier = 1;
493 static unsigned char Brace_Char;
494 static unsigned char Default_Meta_Char [] = "{.*+?[(|)^<>$";
495 static unsigned char *Meta_Char;
497 typedef struct { long lower; long upper; } len_range;
499 /* Forward declarations for functions used by `CompileRE'. */
501 static unsigned char * alternative (int *flag_param, len_range *range_param);
502 static unsigned char * back_ref (unsigned char *c, int *flag_param,
503 int emit);
504 static unsigned char * chunk (int paren, int *flag_param, len_range *range_param);
505 static void emit_byte (unsigned char c);
506 static void emit_class_byte (unsigned char c);
507 static unsigned char * emit_node (int op_code);
508 static unsigned char * emit_special (unsigned char op_code,
509 unsigned long test_val,
510 int index);
511 static unsigned char literal_escape (unsigned char c);
512 static unsigned char numeric_escape (unsigned char c, unsigned char **parse);
513 static unsigned char * atom (int *flag_param, len_range *range_param);
514 static void reg_error (char *str);
515 static unsigned char * insert (unsigned char op, unsigned char *opnd,
516 long min, long max, int index);
517 static unsigned char * next_ptr (unsigned char *ptr);
518 static void offset_tail (unsigned char *ptr, int offset,
519 unsigned char *val);
520 static void branch_tail (unsigned char *ptr, int offset,
521 unsigned char *val);
522 static unsigned char * piece (int *flag_param, len_range *range_param);
523 static void tail (unsigned char *search_from,
524 unsigned char *point_t);
525 static unsigned char * shortcut_escape (unsigned char c, int *flag_param,
526 int emit);
528 static int init_ansi_classes (void);
530 /*----------------------------------------------------------------------*
531 * CompileRE
533 * Compiles a regular expression into the internal format used by
534 * `ExecRE'.
536 * The default behaviour wrt. case sensitivity and newline matching can
537 * be controlled through the defaultFlags argument (Markus Schwarzenberg).
538 * Future extensions are possible by using other flag bits.
539 * Note that currently only the case sensitivity flag is effectively used.
541 * Beware that the optimization and preparation code in here knows about
542 * some of the structure of the compiled regexp.
543 *----------------------------------------------------------------------*/
545 regexp * CompileRE (const char *exp, char **errorText, int defaultFlags) {
547 register regexp *comp_regex = NULL;
548 register unsigned char *scan;
549 int flags_local, pass;
550 len_range range_local;
552 if (Enable_Counting_Quantifier) {
553 Brace_Char = '{';
554 Meta_Char = &Default_Meta_Char [0];
555 } else {
556 Brace_Char = '*'; /* Bypass the '{' in */
557 Meta_Char = &Default_Meta_Char [1]; /* Default_Meta_Char */
560 /* Set up errorText to receive failure reports. */
562 Error_Ptr = errorText;
563 *Error_Ptr = "";
565 if (exp == NULL) REG_FAIL ("NULL argument, `CompileRE\'");
567 /* Initialize arrays used by function `shortcut_escape'. */
569 if (!init_ansi_classes ()) REG_FAIL ("internal error #1, `CompileRE\'");
571 Code_Emit_Ptr = &Compute_Size;
572 Reg_Size = 0UL;
574 /* We can't allocate space until we know how big the compiled form will be,
575 but we can't compile it (and thus know how big it is) until we've got a
576 place to put the code. So we cheat: we compile it twice, once with code
577 generation turned off and size counting turned on, and once "for real".
578 This also means that we don't allocate space until we are sure that the
579 thing really will compile successfully, and we never have to move the
580 code and thus invalidate pointers into it. (Note that it has to be in
581 one piece because free() must be able to free it all.) */
583 for (pass = 1; pass <= 2; pass++) {
584 /*-------------------------------------------*
585 * FIRST PASS: Determine size and legality. *
586 * SECOND PASS: Emit code. *
587 *-------------------------------------------*/
589 /* Schwarzenberg:
590 * If defaultFlags = 0 use standard defaults:
591 * Is_Case_Insensitive: Case sensitive is the default
592 * Match_Newline: Newlines are NOT matched by default
593 * in character classes
595 Is_Case_Insensitive = ((defaultFlags & REDFLT_CASE_INSENSITIVE) ? 1 : 0);
596 Match_Newline = 0; /* ((defaultFlags & REDFLT_MATCH_NEWLINE) ? 1 : 0);
597 Currently not used. Uncomment if needed. */
599 Reg_Parse = (unsigned char *) exp;
600 Total_Paren = 1;
601 Num_Braces = 0;
602 Closed_Parens = 0;
603 Paren_Has_Width = 0;
605 emit_byte (MAGIC);
606 emit_byte ('%'); /* Placeholder for num of capturing parentheses. */
607 emit_byte ('%'); /* Placeholder for num of general {m,n} constructs. */
609 if (chunk (NO_PAREN, &flags_local, &range_local) == NULL)
610 return (NULL); /* Something went wrong */
611 if (pass == 1) {
612 if (Reg_Size >= MAX_COMPILED_SIZE) {
613 /* Too big for NEXT pointers NEXT_PTR_SIZE bytes long to span.
614 This is a real issue since the first BRANCH node usually points
615 to the end of the compiled regex code. */
617 sprintf (Error_Text, "regexp > %lu bytes", MAX_COMPILED_SIZE);
618 REG_FAIL (Error_Text);
621 /* Allocate memory. */
623 comp_regex = (regexp *) malloc (sizeof (regexp) + Reg_Size);
625 if (comp_regex == NULL) REG_FAIL ("out of memory in `CompileRE\'");
627 Code_Emit_Ptr = (unsigned char *) comp_regex->program;
631 comp_regex->program [1] = (unsigned char) Total_Paren - 1;
632 comp_regex->program [2] = (unsigned char) Num_Braces;
634 /*----------------------------------------*
635 * Dig out information for optimizations. *
636 *----------------------------------------*/
638 comp_regex->match_start = '\0'; /* Worst-case defaults. */
639 comp_regex->anchor = 0;
641 /* First BRANCH. */
643 scan = (unsigned char *) (comp_regex->program + REGEX_START_OFFSET);
645 if (GET_OP_CODE (next_ptr (scan)) == END) { /* Only one top-level choice. */
646 scan = OPERAND (scan);
648 /* Starting-point info. */
650 if (GET_OP_CODE (scan) == EXACTLY) {
651 comp_regex->match_start = *OPERAND (scan);
653 } else if (PLUS <= GET_OP_CODE (scan) &&
654 GET_OP_CODE (scan) <= LAZY_PLUS) {
656 /* Allow x+ or x+? at the start of the regex to be
657 optimized. */
659 if (GET_OP_CODE (scan + NODE_SIZE) == EXACTLY) {
660 comp_regex->match_start = *OPERAND (scan + NODE_SIZE);
662 } else if (GET_OP_CODE (scan) == BOL) {
663 comp_regex->anchor++;
667 return (comp_regex);
670 /*----------------------------------------------------------------------*
671 * chunk *
673 * Process main body of regex or process a parenthesized "thing". *
675 * Caller must absorb opening parenthesis. *
677 * Combining parenthesis handling with the base level of regular *
678 * expression is a trifle forced, but the need to tie the tails of the *
679 * branches to what follows makes it hard to avoid. *
680 *----------------------------------------------------------------------*/
682 static unsigned char * chunk (int paren, int *flag_param,
683 len_range *range_param) {
685 register unsigned char *ret_val = NULL;
686 register unsigned char *this_branch;
687 register unsigned char *ender = NULL;
688 register int this_paren = 0;
689 int flags_local, first = 1, zero_width, i;
690 int old_sensitive = Is_Case_Insensitive;
691 int old_newline = Match_Newline;
692 len_range range_local;
693 int look_only = 0;
694 unsigned char *emit_look_behind_bounds = NULL;
697 *flag_param = HAS_WIDTH; /* Tentatively. */
698 range_param->lower = 0; /* Idem */
699 range_param->upper = 0;
701 /* Make an OPEN node, if parenthesized. */
703 if (paren == PAREN) {
704 if (Total_Paren >= NSUBEXP) {
705 sprintf (Error_Text, "number of ()'s > %d", (int) NSUBEXP);
706 REG_FAIL (Error_Text);
709 this_paren = Total_Paren; Total_Paren++;
710 ret_val = emit_node (OPEN + this_paren);
711 } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
712 *flag_param = WORST; /* Look ahead is zero width. */
713 look_only = 1;
714 ret_val = emit_node (paren);
715 } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
716 *flag_param = WORST; /* Look behind is zero width. */
717 look_only = 1;
718 /* We'll overwrite the zero length later on, so we save the ptr */
719 ret_val = emit_special (paren, 0, 0);
720 emit_look_behind_bounds = ret_val + NODE_SIZE;
721 } else if (paren == INSENSITIVE) {
722 Is_Case_Insensitive = 1;
723 } else if (paren == SENSITIVE) {
724 Is_Case_Insensitive = 0;
725 } else if (paren == NEWLINE) {
726 Match_Newline = 1;
727 } else if (paren == NO_NEWLINE) {
728 Match_Newline = 0;
731 /* Pick up the branches, linking them together. */
733 do {
734 this_branch = alternative (&flags_local, &range_local);
736 if (this_branch == NULL) return (NULL);
738 if (first) {
739 first = 0;
740 *range_param = range_local;
741 if (ret_val == NULL) ret_val = this_branch;
742 } else if (range_param->lower >= 0) {
743 if (range_local.lower >= 0) {
744 if (range_local.lower < range_param->lower)
745 range_param->lower = range_local.lower;
746 if (range_local.upper > range_param->upper)
747 range_param->upper = range_local.upper;
748 } else {
749 range_param->lower = -1; /* Branches have different lengths */
750 range_param->upper = -1;
754 tail (ret_val, this_branch); /* Connect BRANCH -> BRANCH. */
756 /* If any alternative could be zero width, consider the whole
757 parenthisized thing to be zero width. */
759 if (!(flags_local & HAS_WIDTH)) *flag_param &= ~HAS_WIDTH;
761 /* Are there more alternatives to process? */
763 if (*Reg_Parse != '|') break;
765 Reg_Parse++;
766 } while (1);
768 /* Make a closing node, and hook it on the end. */
770 if (paren == PAREN) {
771 ender = emit_node (CLOSE + this_paren);
773 } else if (paren == NO_PAREN) {
774 ender = emit_node (END);
776 } else if (paren == POS_AHEAD_OPEN || paren == NEG_AHEAD_OPEN) {
777 ender = emit_node (LOOK_AHEAD_CLOSE);
779 } else if (paren == POS_BEHIND_OPEN || paren == NEG_BEHIND_OPEN) {
780 ender = emit_node (LOOK_BEHIND_CLOSE);
782 } else {
783 ender = emit_node (NOTHING);
786 tail (ret_val, ender);
788 /* Hook the tails of the branch alternatives to the closing node. */
790 for (this_branch = ret_val; this_branch != NULL; ) {
791 branch_tail (this_branch, NODE_SIZE, ender);
792 this_branch = next_ptr (this_branch);
795 /* Check for proper termination. */
797 if (paren != NO_PAREN && *Reg_Parse++ != ')') {
798 REG_FAIL ("missing right parenthesis \')\'");
799 } else if (paren == NO_PAREN && *Reg_Parse != '\0') {
800 if (*Reg_Parse == ')') {
801 REG_FAIL ("missing left parenthesis \'(\'");
802 } else {
803 REG_FAIL ("junk on end"); /* "Can't happen" - NOTREACHED */
807 /* Check whether look behind has a fixed size */
809 if (emit_look_behind_bounds) {
810 if (range_param->lower < 0) {
811 REG_FAIL ("look-behind does not have a bounded size");
813 if (range_param->upper > 65535L) {
814 REG_FAIL ("max. look-behind size is too large (>65535)")
816 if (Code_Emit_Ptr != &Compute_Size) {
817 *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->lower);
818 *emit_look_behind_bounds++ = PUT_OFFSET_R (range_param->lower);
819 *emit_look_behind_bounds++ = PUT_OFFSET_L (range_param->upper);
820 *emit_look_behind_bounds = PUT_OFFSET_R (range_param->upper);
824 /* For look ahead/behind, the length must be set to zero again */
825 if (look_only) {
826 range_param->lower = 0;
827 range_param->upper = 0;
830 zero_width = 0;
832 /* Set a bit in Closed_Parens to let future calls to function `back_ref'
833 know that we have closed this set of parentheses. */
835 if (paren == PAREN && this_paren <= (int)sizeof (Closed_Parens) * CHAR_BIT) {
836 SET_BIT (Closed_Parens, this_paren);
838 /* Determine if a parenthesized expression is modified by a quantifier
839 that can have zero width. */
841 if (*(Reg_Parse) == '?' || *(Reg_Parse) == '*') {
842 zero_width++;
843 } else if (*(Reg_Parse) == '{' && Brace_Char == '{') {
844 if (*(Reg_Parse + 1) == ',' || *(Reg_Parse + 1) == '}') {
845 zero_width++;
846 } else if (*(Reg_Parse + 1) == '0') {
847 i = 2;
849 while (*(Reg_Parse + i) == '0') i++;
851 if (*(Reg_Parse + i) == ',') zero_width++;
856 /* If this set of parentheses is known to never match the empty string, set
857 a bit in Paren_Has_Width to let future calls to function back_ref know
858 that this set of parentheses has non-zero width. This will allow star
859 (*) or question (?) quantifiers to be aplied to a back-reference that
860 refers to this set of parentheses. */
862 if ((*flag_param & HAS_WIDTH) &&
863 paren == PAREN &&
864 !zero_width &&
865 this_paren <= (int)(sizeof (Paren_Has_Width) * CHAR_BIT)) {
867 SET_BIT (Paren_Has_Width, this_paren);
870 Is_Case_Insensitive = old_sensitive;
871 Match_Newline = old_newline;
873 return (ret_val);
876 /*----------------------------------------------------------------------*
877 * alternative
879 * Processes one alternative of an '|' operator. Connects the NEXT
880 * pointers of each regex atom together sequentialy.
881 *----------------------------------------------------------------------*/
883 static unsigned char * alternative (int *flag_param, len_range *range_param) {
885 register unsigned char *ret_val;
886 register unsigned char *chain;
887 register unsigned char *latest;
888 int flags_local;
889 len_range range_local;
891 *flag_param = WORST; /* Tentatively. */
892 range_param->lower = 0; /* Idem */
893 range_param->upper = 0;
895 ret_val = emit_node (BRANCH);
896 chain = NULL;
898 /* Loop until we hit the start of the next alternative, the end of this set
899 of alternatives (end of parentheses), or the end of the regex. */
901 while (*Reg_Parse != '|' && *Reg_Parse != ')' && *Reg_Parse != '\0') {
902 latest = piece (&flags_local, &range_local);
904 if (latest == NULL) return (NULL); /* Something went wrong. */
906 *flag_param |= flags_local & HAS_WIDTH;
907 if (range_local.lower < 0) {
908 /* Not a fixed length */
909 range_param->lower = -1;
910 range_param->upper = -1;
911 } else if (range_param->lower >= 0) {
912 range_param->lower += range_local.lower;
913 range_param->upper += range_local.upper;
916 if (chain != NULL) { /* Connect the regex atoms together sequentialy. */
917 tail (chain, latest);
920 chain = latest;
923 if (chain == NULL) { /* Loop ran zero times. */
924 (void) emit_node (NOTHING);
927 return (ret_val);
930 /*----------------------------------------------------------------------*
931 * piece - something followed by possible '*', '+', '?', or "{m,n}"
933 * Note that the branching code sequences used for the general cases of
934 * *, +. ?, and {m,n} are somewhat optimized: they use the same
935 * NOTHING node as both the endmarker for their branch list and the
936 * body of the last branch. It might seem that this node could be
937 * dispensed with entirely, but the endmarker role is not redundant.
938 *----------------------------------------------------------------------*/
940 static unsigned char * piece (int *flag_param, len_range *range_param) {
942 register unsigned char *ret_val;
943 register unsigned char *next;
944 register unsigned char op_code;
945 unsigned long min_max [2] = {REG_ZERO, REG_INFINITY};
946 int flags_local, i, brace_present = 0;
947 int lazy = 0, comma_present = 0;
948 int digit_present [2] = {0,0};
949 len_range range_local;
951 ret_val = atom (&flags_local, &range_local);
953 if (ret_val == NULL) return (NULL); /* Something went wrong. */
955 op_code = *Reg_Parse;
957 if (!IS_QUANTIFIER (op_code)) {
958 *flag_param = flags_local;
959 *range_param = range_local;
960 return (ret_val);
961 } else if (op_code == '{') { /* {n,m} quantifier present */
962 brace_present++;
963 Reg_Parse++;
965 /* This code will allow specifying a counting range in any of the
966 following forms:
968 {m,n} between m and n.
969 {,n} same as {0,n} or between 0 and infinity.
970 {m,} same as {m,0} or between m and infinity.
971 {m} same as {m,m} or exactly m.
972 {,} same as {0,0} or between 0 and infinity or just '*'.
973 {} same as {0,0} or between 0 and infinity or just '*'.
975 Note that specifying a max of zero, {m,0} is not allowed in the regex
976 itself, but it is implemented internally that way to support '*', '+',
977 and {min,} constructs and signals an unlimited number. */
979 for (i = 0; i < 2; i++) {
980 /* Look for digits of number and convert as we go. The numeric maximum
981 value for max and min of 65,535 is due to using 2 bytes to store
982 each value in the compiled regex code. */
984 while (isdigit (*Reg_Parse)) {
985 /* (6553 * 10 + 6) > 65535 (16 bit max) */
987 if ((min_max [i] == 6553UL && (*Reg_Parse - '0') <= 5) ||
988 (min_max [i] <= 6552UL)) {
990 min_max [i] = (min_max [i] * 10UL) +
991 (unsigned long) (*Reg_Parse - '0');
992 Reg_Parse++;
994 digit_present [i]++;
995 } else {
996 if (i == 0) {
997 sprintf (Error_Text, "min operand of {%lu%c,???} > 65535",
998 min_max [0], *Reg_Parse);
999 } else {
1000 sprintf (Error_Text, "max operand of {%lu,%lu%c} > 65535",
1001 min_max [0], min_max [1], *Reg_Parse);
1004 REG_FAIL (Error_Text);
1008 if (!comma_present && *Reg_Parse == ',') {
1009 comma_present++;
1010 Reg_Parse++;
1014 /* A max of zero can not be specified directly in the regex since it would
1015 signal a max of infinity. This code specifically disallows `{0,0}',
1016 `{,0}', and `{0}' which really means nothing to humans but would be
1017 interpreted as `{0,infinity}' or `*' if we didn't make this check. */
1019 if (digit_present [0] && (min_max [0] == REG_ZERO) && !comma_present) {
1021 REG_FAIL ("{0} is an invalid range");
1022 } else if (digit_present [0] && (min_max [0] == REG_ZERO) &&
1023 digit_present [1] && (min_max [1] == REG_ZERO)) {
1025 REG_FAIL ("{0,0} is an invalid range");
1026 } else if (digit_present [1] && (min_max [1] == REG_ZERO)) {
1027 if (digit_present [0]) {
1028 sprintf (Error_Text, "{%lu,0} is an invalid range", min_max [0]);
1029 REG_FAIL (Error_Text);
1030 } else {
1031 REG_FAIL ("{,0} is an invalid range");
1035 if (!comma_present) min_max [1] = min_max [0]; /* {x} means {x,x} */
1037 if (*Reg_Parse != '}') {
1038 REG_FAIL ("{m,n} specification missing right \'}\'");
1040 } else if (min_max [1] != REG_INFINITY && min_max [0] > min_max [1]) {
1041 /* Disallow a backward range. */
1043 sprintf (Error_Text, "{%lu,%lu} is an invalid range",
1044 min_max [0], min_max [1]);
1045 REG_FAIL (Error_Text);
1049 Reg_Parse++;
1051 /* Check for a minimal matching (non-greedy or "lazy") specification. */
1053 if (*Reg_Parse == '?') {
1054 lazy = 1;
1055 Reg_Parse++;
1058 /* Avoid overhead of counting if possible */
1060 if (op_code == '{') {
1061 if (min_max [0] == REG_ZERO && min_max [1] == REG_INFINITY) {
1062 op_code = '*';
1063 } else if (min_max [0] == REG_ONE && min_max [1] == REG_INFINITY) {
1064 op_code = '+';
1065 } else if (min_max [0] == REG_ZERO && min_max [1] == REG_ONE) {
1066 op_code = '?';
1067 } else if (min_max [0] == REG_ONE && min_max [1] == REG_ONE) {
1068 /* "x{1,1}" is the same as "x". No need to pollute the compiled
1069 regex with such nonsense. */
1071 *flag_param = flags_local;
1072 *range_param = range_local;
1073 return (ret_val);
1074 } else if (Num_Braces > (int)UCHAR_MAX) {
1075 sprintf (Error_Text, "number of {m,n} constructs > %d", UCHAR_MAX);
1076 REG_FAIL (Error_Text);
1080 if (op_code == '+') min_max [0] = REG_ONE;
1081 if (op_code == '?') min_max [1] = REG_ONE;
1083 /* It is dangerous to apply certain quantifiers to a possibly zero width
1084 item. */
1086 if (!(flags_local & HAS_WIDTH)) {
1087 if (brace_present) {
1088 sprintf (Error_Text, "{%lu,%lu} operand could be empty",
1089 min_max [0], min_max [1]);
1090 } else {
1091 sprintf (Error_Text, "%c operand could be empty", op_code);
1094 REG_FAIL (Error_Text);
1097 *flag_param = (min_max [0] > REG_ZERO) ? (WORST | HAS_WIDTH) : WORST;
1098 if (range_local.lower >= 0) {
1099 if (min_max[1] != REG_INFINITY) {
1100 range_param->lower = range_local.lower * min_max[0];
1101 range_param->upper = range_local.upper * min_max[1];
1102 } else {
1103 range_param->lower = -1; /* Not a fixed-size length */
1104 range_param->upper = -1;
1106 } else {
1107 range_param->lower = -1; /* Not a fixed-size length */
1108 range_param->upper = -1;
1111 /*---------------------------------------------------------------------*
1112 * Symbol Legend For Node Structure Diagrams
1113 *---------------------------------------------------------------------*
1114 * (...) = general grouped thing
1115 * B = (B)ranch, K = bac(K), N = (N)othing
1116 * I = (I)nitialize count, C = Increment (C)ount
1117 * T~m = (T)est against mini(m)um- go to NEXT pointer if >= operand
1118 * T~x = (T)est against ma(x)imum- go to NEXT pointer if >= operand
1119 * '~' = NEXT pointer, \___| = forward pointer, |___/ = Backward pointer
1120 *---------------------------------------------------------------------*/
1122 if (op_code == '*' && (flags_local & SIMPLE)) {
1123 insert ((lazy ? LAZY_STAR : STAR), ret_val, 0UL, 0UL, 0);
1125 } else if (op_code == '+' && (flags_local & SIMPLE)) {
1126 insert (lazy ? LAZY_PLUS : PLUS, ret_val, 0UL, 0UL, 0);
1128 } else if (op_code == '?' && (flags_local & SIMPLE)) {
1129 insert (lazy ? LAZY_QUESTION : QUESTION, ret_val, 0UL, 0UL, 0);
1131 } else if (op_code == '{' && (flags_local & SIMPLE)) {
1132 insert (lazy ? LAZY_BRACE : BRACE, ret_val, min_max [0], min_max [1], 0);
1134 } else if ((op_code == '*' || op_code == '+') && lazy) {
1135 /* Node structure for (x)*? Node structure for (x)+? construct.
1136 * construct. (Same as (x)*? except for initial
1137 * forward jump into parenthesis.)
1139 * ___6____
1140 * _______5_______ /________|______
1141 * | _4__ 1_\ /| ____ | _\
1142 * |/ | / |\ / |/ | | / |\
1143 * B~ N~ B~ (...)~ K~ N~ N~ B~ N~ B~ (...)~ K~ N~
1144 * \ \___2_______| \ \___________|
1145 * \_____3_______| \_____________|
1149 tail (ret_val, emit_node (BACK)); /* 1 */
1150 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 2,4 */
1151 (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 3 */
1153 next = emit_node (NOTHING); /* 2,3 */
1155 offset_tail (ret_val, NODE_SIZE, next); /* 2 */
1156 tail (ret_val, next); /* 3 */
1157 insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 4,5 */
1158 tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 4 */
1159 offset_tail (ret_val, 3 * NODE_SIZE, ret_val); /* 5 */
1161 if (op_code == '+') {
1162 insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 6 */
1163 tail (ret_val, ret_val + (4 * NODE_SIZE)); /* 6 */
1165 } else if (op_code == '*') {
1166 /* Node structure for (x)* construct.
1167 * ____1_____
1168 * | \
1169 * B~ (...)~ K~ B~ N~
1170 * \ \_|2 |\_|
1171 * \__3_______| 4
1174 insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 1,3 */
1175 offset_tail (ret_val, NODE_SIZE, emit_node (BACK)); /* 2 */
1176 offset_tail (ret_val, NODE_SIZE, ret_val); /* 1 */
1177 tail (ret_val, emit_node (BRANCH)); /* 3 */
1178 tail (ret_val, emit_node (NOTHING)); /* 4 */
1179 } else if (op_code == '+') {
1180 /* Node structure for (x)+ construct.
1182 * ____2_____
1183 * | \
1184 * (...)~ B~ K~ B~ N~
1185 * \_|\____|\_|
1186 * 1 3 4
1189 next = emit_node (BRANCH); /* 1 */
1191 tail (ret_val, next); /* 1 */
1192 tail (emit_node (BACK), ret_val); /* 2 */
1193 tail (next, emit_node (BRANCH)); /* 3 */
1194 tail (ret_val, emit_node (NOTHING)); /* 4 */
1195 } else if (op_code == '?' && lazy) {
1196 /* Node structure for (x)?? construct.
1197 * _4__ 1_
1198 * / | / |
1199 * B~ N~ B~ (...)~ N~
1200 * \ \___2____|
1201 * \_____3____|
1204 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 2,4 */
1205 (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 3 */
1207 next = emit_node (NOTHING); /* 1,2,3 */
1209 offset_tail (ret_val, 2 * NODE_SIZE, next); /* 1 */
1210 offset_tail (ret_val, NODE_SIZE, next); /* 2 */
1211 tail (ret_val, next); /* 3 */
1212 insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 4 */
1213 tail (ret_val, (ret_val + (2 * NODE_SIZE))); /* 4 */
1215 } else if (op_code == '?') {
1216 /* Node structure for (x)? construct.
1217 * ___1____ _2
1218 * / |/ |
1219 * B~ (...)~ B~ N~
1220 * \__3_|
1223 insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 1 */
1224 tail (ret_val, emit_node (BRANCH)); /* 1 */
1226 next = emit_node (NOTHING); /* 2,3 */
1228 tail (ret_val, next); /* 2 */
1229 offset_tail (ret_val, NODE_SIZE, next); /* 3 */
1230 } else if (op_code == '{' && min_max [0] == min_max [1]) {
1231 /* Node structure for (x){m}, (x){m}?, (x){m,m}, or (x){m,m}? constructs.
1232 * Note that minimal and maximal matching mean the same thing when we
1233 * specify the minimum and maximum to be the same value.
1234 * _______3_____
1235 * | 1_ _2 \
1236 * | / |/ | \
1237 * I~ (...)~ C~ T~m K~ N~
1238 * \_| \_____|
1239 * 5 4
1242 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1243 tail (ret_val, emit_special (TEST_COUNT, min_max [0], Num_Braces));/* 2 */
1244 tail (emit_node (BACK), ret_val); /* 3 */
1245 tail (ret_val, emit_node (NOTHING)); /* 4 */
1247 next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 5 */
1249 tail (ret_val, next); /* 5 */
1251 Num_Braces++;
1252 } else if (op_code == '{' && lazy) {
1253 if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
1254 /* Node structure for (x){0,n}? or {,n}? construct.
1255 * _________3____________
1256 * 8_| _4__ 1_ _2 \
1257 * / |/ | / |/ | \
1258 * I~ B~ N~ B~ (...)~ C~ T~x K~ N~
1259 * \ \ \__7__|
1260 * \ \_________6_______|
1261 * \______5____________|
1264 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1266 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,7 */
1268 tail (ret_val, next); /* 2 */
1269 (void) insert (BRANCH, ret_val, 0UL, 0UL, Num_Braces); /* 4,6 */
1270 (void) insert (NOTHING, ret_val, 0UL, 0UL, Num_Braces); /* 5 */
1271 (void) insert (BRANCH, ret_val, 0UL, 0UL, Num_Braces); /* 3,4,8 */
1272 tail (emit_node (BACK), ret_val); /* 3 */
1273 tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 4 */
1275 next = emit_node (NOTHING); /* 5,6,7 */
1277 offset_tail (ret_val, NODE_SIZE, next); /* 5 */
1278 offset_tail (ret_val, 2 * NODE_SIZE, next); /* 6 */
1279 offset_tail (ret_val, 3 * NODE_SIZE, next); /* 7 */
1281 next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 8 */
1283 tail (ret_val, next); /* 8 */
1285 } else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
1286 /* Node structure for (x){m,}? construct.
1287 * ______8_________________
1288 * | _______3_____ \
1289 * | _7__ | 1_ _2 \ \
1290 * |/ | | / |/ | \ \
1291 * I~ B~ N~ B~ (...)~ C~ T~m K~ K~ N~
1292 * \_____\__\_| \_4___| |
1293 * 9 \ \_________5__________|
1294 * \_______6______________|
1297 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1299 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2,4 */
1301 tail (ret_val, next); /* 2 */
1302 tail (emit_node (BACK), ret_val); /* 3 */
1303 tail (ret_val, emit_node (BACK)); /* 4 */
1304 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 5,7 */
1305 (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 6 */
1307 next = emit_node (NOTHING); /* 5,6 */
1309 offset_tail (ret_val, NODE_SIZE, next); /* 5 */
1310 tail (ret_val, next); /* 6 */
1311 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 7,8 */
1312 tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 7 */
1313 offset_tail (ret_val, 3 * NODE_SIZE, ret_val); /* 8 */
1314 (void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
1315 tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE)); /* 9 */
1317 } else {
1318 /* Node structure for (x){m,n}? construct.
1319 * ______9_____________________
1320 * | _____________3___ \
1321 * | __8_ | 1_ _2 \ \
1322 * |/ | | / |/ | \ \
1323 * I~ B~ N~ B~ (...)~ C~ T~x T~m K~ K~ N~
1324 * \_____\__\_| \ \__4__| |
1325 * 10 \ \ \_7_________|
1326 * \ \_________6_____________|
1327 * \_______5_________________|
1330 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1332 next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,7 */
1334 tail (ret_val, next); /* 2 */
1336 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
1338 tail (emit_node (BACK), ret_val); /* 3 */
1339 tail (next, emit_node (BACK)); /* 4 */
1340 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 6,8 */
1341 (void) insert (NOTHING, ret_val, 0UL, 0UL, 0); /* 5 */
1342 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 8,9 */
1344 next = emit_node (NOTHING); /* 5,6,7 */
1346 offset_tail (ret_val, NODE_SIZE, next); /* 5 */
1347 offset_tail (ret_val, 2 * NODE_SIZE, next); /* 6 */
1348 offset_tail (ret_val, 3 * NODE_SIZE, next); /* 7 */
1349 tail (ret_val, ret_val + (2 * NODE_SIZE)); /* 8 */
1350 offset_tail (next, -NODE_SIZE, ret_val); /* 9 */
1351 insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 10 */
1352 tail (ret_val, ret_val + INDEX_SIZE + (4 * NODE_SIZE)); /* 10 */
1355 Num_Braces++;
1356 } else if (op_code == '{') {
1357 if (min_max [0] == REG_ZERO && min_max [1] != REG_INFINITY) {
1358 /* Node structure for (x){0,n} or (x){,n} construct.
1360 * ___3____________
1361 * | 1_ _2 \ 5_
1362 * | / |/ | \ / |
1363 * I~ B~ (...)~ C~ T~x K~ B~ N~
1364 * \_|\ \_6___|__|
1365 * 7 \________4________|
1368 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1370 next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,6 */
1372 tail (ret_val, next); /* 2 */
1373 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 3,4,7 */
1374 tail (emit_node (BACK), ret_val); /* 3 */
1376 next = emit_node (BRANCH); /* 4,5 */
1378 tail (ret_val, next); /* 4 */
1379 tail (next, emit_node (NOTHING)); /* 5,6 */
1380 offset_tail (ret_val, NODE_SIZE, next); /* 6 */
1382 next = insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 7 */
1384 tail (ret_val, next); /* 7 */
1386 } else if (min_max [0] > REG_ZERO && min_max [1] == REG_INFINITY) {
1387 /* Node structure for (x){m,} construct.
1388 * __________4________
1389 * | __3__________ \
1390 * _|___| 1_ _2 \ \ _7
1391 * / | 8 | / |/ | \ \ / |
1392 * I~ B~ (...)~ C~ T~m K~ K~ B~ N~
1393 * \ \_5___| |
1394 * \__________6__________|
1397 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1399 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 2 */
1401 tail (ret_val, next); /* 2 */
1402 tail (emit_node (BACK), ret_val); /* 3 */
1403 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 4,6 */
1405 next = emit_node (BACK); /* 4 */
1407 tail (next, ret_val); /* 4 */
1408 offset_tail (ret_val, NODE_SIZE, next); /* 5 */
1409 tail (ret_val, emit_node (BRANCH)); /* 6 */
1410 tail (ret_val, emit_node (NOTHING)); /* 7 */
1412 insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 8 */
1414 tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE)); /* 8 */
1416 } else {
1417 /* Node structure for (x){m,n} construct.
1418 * _____6________________
1419 * | _____________3___ \
1420 * 9_|__| 1_ _2 \ \ _8
1421 * / | | / |/ | \ \ / |
1422 * I~ B~ (...)~ C~ T~x T~m K~ K~ B~ N~
1423 * \ \ \__4__| | |
1424 * \ \_7_________|__|
1425 * \_________5_____________|
1428 tail (ret_val, emit_special (INC_COUNT, 0UL, Num_Braces)); /* 1 */
1430 next = emit_special (TEST_COUNT, min_max [1], Num_Braces); /* 2,4 */
1432 tail (ret_val, next); /* 2 */
1434 next = emit_special (TEST_COUNT, min_max [0], Num_Braces); /* 4 */
1436 tail (emit_node (BACK), ret_val); /* 3 */
1437 tail (next, emit_node (BACK)); /* 4 */
1438 (void) insert (BRANCH, ret_val, 0UL, 0UL, 0); /* 5,6 */
1440 next = emit_node (BRANCH); /* 5,8 */
1442 tail (ret_val, next); /* 5 */
1443 offset_tail (next, -NODE_SIZE, ret_val); /* 6 */
1445 next = emit_node (NOTHING); /* 7,8 */
1447 offset_tail (ret_val, NODE_SIZE, next); /* 7 */
1449 offset_tail (next, -NODE_SIZE, next); /* 8 */
1450 (void) insert (INIT_COUNT, ret_val, 0UL, 0UL, Num_Braces); /* 9 */
1451 tail (ret_val, ret_val + INDEX_SIZE + (2 * NODE_SIZE)); /* 9 */
1454 Num_Braces++;
1455 } else {
1456 /* We get here if the IS_QUANTIFIER macro is not coordinated properly
1457 with this function. */
1459 REG_FAIL ("internal error #2, `piece\'");
1462 if (IS_QUANTIFIER (*Reg_Parse)) {
1463 if (op_code == '{') {
1464 sprintf (Error_Text, "nested quantifiers, {m,n}%c", *Reg_Parse);
1465 } else {
1466 sprintf (Error_Text, "nested quantifiers, %c%c", op_code, *Reg_Parse);
1469 REG_FAIL (Error_Text);
1472 return (ret_val);
1475 /*----------------------------------------------------------------------*
1476 * atom
1478 * Process one regex item at the lowest level
1480 * OPTIMIZATION: Lumps a continuous sequence of ordinary characters
1481 * together so that it can turn them into a single EXACTLY node, which
1482 * is smaller to store and faster to run.
1483 *----------------------------------------------------------------------*/
1485 static unsigned char * atom (int *flag_param, len_range *range_param) {
1487 register unsigned char *ret_val;
1488 unsigned char test;
1489 int flags_local;
1490 len_range range_local;
1492 *flag_param = WORST; /* Tentatively. */
1493 range_param->lower = 0; /* Idem */
1494 range_param->upper = 0;
1496 /* Process any regex comments, e.g. `(?# match next token->)'. The
1497 terminating right parenthesis can not be escaped. The comment stops at
1498 the first right parenthesis encountered (or the end of the regex
1499 string)... period. Handles multiple sequential comments,
1500 e.g. `(?# one)(?# two)...' */
1502 while (*Reg_Parse == '(' &&
1503 *(Reg_Parse + 1) == '?' &&
1504 *(Reg_Parse + 2) == '#') {
1506 Reg_Parse += 3;
1508 while (*Reg_Parse != ')' && *Reg_Parse != '\0') {
1509 Reg_Parse++;
1512 if (*Reg_Parse == ')') {
1513 Reg_Parse++;
1516 if (*Reg_Parse == ')' || *Reg_Parse == '|' || *Reg_Parse == '\0') {
1517 /* Hit end of regex string or end of parenthesized regex; have to
1518 return "something" (i.e. a NOTHING node) to avoid generating an
1519 error. */
1521 ret_val = emit_node (NOTHING);
1523 return (ret_val);
1527 switch (*Reg_Parse++) {
1528 case '^':
1529 ret_val = emit_node (BOL);
1530 break;
1532 case '$':
1533 ret_val = emit_node (EOL);
1534 break;
1536 case '<':
1537 ret_val = emit_node (BOWORD);
1538 break;
1540 case '>':
1541 ret_val = emit_node (EOWORD);
1542 break;
1544 case '.':
1545 if (Match_Newline) {
1546 ret_val = emit_node (EVERY);
1547 } else {
1548 ret_val = emit_node (ANY);
1551 *flag_param |= (HAS_WIDTH | SIMPLE);
1552 range_param->lower = 1;
1553 range_param->upper = 1;
1554 break;
1556 case '(':
1557 if (*Reg_Parse == '?') { /* Special parenthetical expression */
1558 Reg_Parse++;
1559 range_local.lower = 0; /* Make sure it is always used */
1560 range_local.upper = 0;
1562 if (*Reg_Parse == ':') {
1563 Reg_Parse++;
1564 ret_val = chunk (NO_CAPTURE, &flags_local, &range_local);
1565 } else if (*Reg_Parse == '=') {
1566 Reg_Parse++;
1567 ret_val = chunk (POS_AHEAD_OPEN, &flags_local, &range_local);
1568 } else if (*Reg_Parse == '!') {
1569 Reg_Parse++;
1570 ret_val = chunk (NEG_AHEAD_OPEN, &flags_local, &range_local);
1571 } else if (*Reg_Parse == 'i') {
1572 Reg_Parse++;
1573 ret_val = chunk (INSENSITIVE, &flags_local, &range_local);
1574 } else if (*Reg_Parse == 'I') {
1575 Reg_Parse++;
1576 ret_val = chunk (SENSITIVE, &flags_local, &range_local);
1577 } else if (*Reg_Parse == 'n') {
1578 Reg_Parse++;
1579 ret_val = chunk (NEWLINE, &flags_local, &range_local);
1580 } else if (*Reg_Parse == 'N') {
1581 Reg_Parse++;
1582 ret_val = chunk (NO_NEWLINE, &flags_local, &range_local);
1583 } else if (*Reg_Parse == '<') {
1584 Reg_Parse++;
1585 if (*Reg_Parse == '=') {
1586 Reg_Parse++;
1587 ret_val = chunk (POS_BEHIND_OPEN, &flags_local, &range_local);
1588 } else if (*Reg_Parse == '!') {
1589 Reg_Parse++;
1590 ret_val = chunk (NEG_BEHIND_OPEN, &flags_local, &range_local);
1591 } else {
1592 sprintf (Error_Text,
1593 "invalid look-behind syntax, \"(?<%c...)\"",
1594 *Reg_Parse);
1596 REG_FAIL (Error_Text);
1598 } else {
1599 sprintf (Error_Text,
1600 "invalid grouping syntax, \"(?%c...)\"",
1601 *Reg_Parse);
1603 REG_FAIL (Error_Text);
1605 } else { /* Normal capturing parentheses */
1606 ret_val = chunk (PAREN, &flags_local, &range_local);
1609 if (ret_val == NULL) return (NULL); /* Something went wrong. */
1611 /* Add HAS_WIDTH flag if it was set by call to chunk. */
1613 *flag_param |= flags_local & HAS_WIDTH;
1614 *range_param = range_local;
1616 break;
1618 case '\0':
1619 case '|':
1620 case ')':
1621 REG_FAIL ("internal error #3, `atom\'"); /* Supposed to be */
1622 /* caught earlier. */
1623 case '?':
1624 case '+':
1625 case '*':
1626 sprintf (Error_Text, "%c follows nothing", *(Reg_Parse - 1));
1627 REG_FAIL (Error_Text);
1629 case '{':
1630 if (Enable_Counting_Quantifier) {
1631 REG_FAIL ("{m,n} follows nothing");
1632 } else {
1633 ret_val = emit_node (EXACTLY); /* Treat braces as literals. */
1634 emit_byte ('{');
1635 emit_byte ('\0');
1636 range_param->lower = 1;
1637 range_param->upper = 1;
1640 break;
1642 case '[':
1644 register unsigned int second_value;
1645 register unsigned int last_value;
1646 unsigned char last_emit = 0;
1648 /* Handle characters that can only occur at the start of a class. */
1650 if (*Reg_Parse == '^') { /* Complement of range. */
1651 ret_val = emit_node (ANY_BUT);
1652 Reg_Parse++;
1654 /* All negated classes include newline unless escaped with
1655 a "(?n)" switch. */
1657 if (!Match_Newline) emit_byte ('\n');
1658 } else {
1659 ret_val = emit_node (ANY_OF);
1662 if (*Reg_Parse == ']' || *Reg_Parse == '-') {
1663 /* If '-' or ']' is the first character in a class,
1664 it is a literal character in the class. */
1666 last_emit = *Reg_Parse;
1667 emit_byte (*Reg_Parse);
1668 Reg_Parse++;
1671 /* Handle the rest of the class characters. */
1673 while (*Reg_Parse != '\0' && *Reg_Parse != ']') {
1674 if (*Reg_Parse == '-') { /* Process a range, e.g [a-z]. */
1675 Reg_Parse++;
1677 if (*Reg_Parse == ']' || *Reg_Parse == '\0') {
1678 /* If '-' is the last character in a class it is a literal
1679 character. If `Reg_Parse' points to the end of the
1680 regex string, an error will be generated later. */
1682 emit_byte ('-');
1683 last_emit = '-';
1684 } else {
1685 /* We must get the range starting character value from the
1686 emitted code since it may have been an escaped
1687 character. `second_value' is set one larger than the
1688 just emitted character value. This is done since
1689 `second_value' is used as the start value for the loop
1690 that emits the values in the range. Since we have
1691 already emitted the first character of the class, we do
1692 not want to emit it again. */
1694 second_value = ((unsigned int) last_emit) + 1;
1696 if (*Reg_Parse == '\\') {
1697 /* Handle escaped characters within a class range.
1698 Specifically disallow shortcut escapes as the end of
1699 a class range. To allow this would be ambiguous
1700 since shortcut escapes represent a set of characters,
1701 and it would not be clear which character of the
1702 class should be treated as the "last" character. */
1704 Reg_Parse++;
1706 if ((test = numeric_escape (*Reg_Parse, &Reg_Parse))) {
1707 last_value = (unsigned int) test;
1708 } else if ((test = literal_escape (*Reg_Parse))) {
1709 last_value = (unsigned int) test;
1710 } else if (shortcut_escape (*Reg_Parse,
1711 NULL,
1712 CHECK_CLASS_ESCAPE)) {
1713 sprintf (Error_Text,
1714 "\\%c is not allowed as range operand",
1715 *Reg_Parse);
1717 REG_FAIL (Error_Text);
1718 } else {
1719 sprintf (
1720 Error_Text,
1721 "\\%c is an invalid char class escape sequence",
1722 *Reg_Parse);
1724 REG_FAIL (Error_Text);
1726 } else {
1727 last_value = U_CHAR_AT (Reg_Parse);
1730 if (Is_Case_Insensitive) {
1731 second_value =
1732 (unsigned int) tolower ((int) second_value);
1733 last_value =
1734 (unsigned int) tolower ((int) last_value);
1737 /* For case insensitive, something like [A-_] will
1738 generate an error here since ranges are converted to
1739 lower case. */
1741 if (second_value - 1 > last_value) {
1742 REG_FAIL ("invalid [] range");
1745 /* If only one character in range (e.g [a-a]) then this
1746 loop is not run since the first character of any range
1747 was emitted by the previous iteration of while loop. */
1749 for (; second_value <= last_value; second_value++) {
1750 emit_class_byte (second_value);
1753 last_emit = (unsigned char) last_value;
1755 Reg_Parse++;
1757 } /* End class character range code. */
1758 } else if (*Reg_Parse == '\\') {
1759 Reg_Parse++;
1761 if ((test = numeric_escape (*Reg_Parse, &Reg_Parse)) != '\0') {
1762 emit_class_byte (test);
1764 last_emit = test;
1765 } else if ((test = literal_escape (*Reg_Parse)) != '\0') {
1766 emit_byte (test);
1767 last_emit = test;
1768 } else if (shortcut_escape (*Reg_Parse,
1769 NULL,
1770 CHECK_CLASS_ESCAPE)) {
1772 if (*(Reg_Parse + 1) == '-') {
1773 /* Specifically disallow shortcut escapes as the start
1774 of a character class range (see comment above.) */
1776 sprintf (Error_Text,
1777 "\\%c not allowed as range operand",
1778 *Reg_Parse);
1780 REG_FAIL (Error_Text);
1781 } else {
1782 /* Emit the bytes that are part of the shortcut
1783 escape sequence's range (e.g. \d = 0123456789) */
1785 shortcut_escape (*Reg_Parse, NULL, EMIT_CLASS_BYTES);
1787 } else {
1788 sprintf (Error_Text,
1789 "\\%c is an invalid char class escape sequence",
1790 *Reg_Parse);
1792 REG_FAIL (Error_Text);
1795 Reg_Parse++;
1797 /* End of class escaped sequence code */
1798 } else {
1799 emit_class_byte (*Reg_Parse); /* Ordinary class character. */
1801 last_emit = *Reg_Parse;
1802 Reg_Parse++;
1804 } /* End of while (*Reg_Parse != '\0' && *Reg_Parse != ']') */
1806 if (*Reg_Parse != ']') REG_FAIL ("missing right \']\'");
1808 emit_byte('\0');
1810 /* NOTE: it is impossible to specify an empty class. This is
1811 because [] would be interpreted as "begin character class"
1812 followed by a literal ']' character and no "end character class"
1813 delimiter (']'). Because of this, it is always safe to assume
1814 that a class HAS_WIDTH. */
1816 Reg_Parse++;
1817 *flag_param |= HAS_WIDTH | SIMPLE;
1818 range_param->lower = 1;
1819 range_param->upper = 1;
1822 break; /* End of character class code. */
1824 case '\\':
1825 /* Force Error_Text to have a length of zero. This way we can tell if
1826 either of the calls to shortcut_escape() or back_ref() fill
1827 Error_Text with an error message. */
1829 Error_Text [0] = '\0';
1831 if ((ret_val = shortcut_escape (*Reg_Parse, flag_param, EMIT_NODE))) {
1833 Reg_Parse++;
1834 range_param->lower = 1;
1835 range_param->upper = 1;
1836 break;
1838 } else if ((ret_val = back_ref (Reg_Parse, flag_param, EMIT_NODE))) {
1839 /* Can't make any assumptions about a back-reference as to SIMPLE
1840 or HAS_WIDTH. For example (^|<) is neither simple nor has
1841 width. So we don't flip bits in flag_param here. */
1843 Reg_Parse++;
1844 /* Back-references always have an unknown length */
1845 range_param->lower = -1;
1846 range_param->upper = -1;
1847 break;
1850 if (strlen (Error_Text) > 0) REG_FAIL (Error_Text);
1852 /* At this point it is apparent that the escaped character is not a
1853 shortcut escape or back-reference. Back up one character to allow
1854 the default code to include it as an ordinary character. */
1856 /* Fall through to Default case to handle literal escapes and numeric
1857 escapes. */
1859 default:
1860 Reg_Parse--; /* If we fell through from the above code, we are now
1861 pointing at the back slash (\) character. */
1863 unsigned char *parse_save;
1864 int len = 0;
1866 if (Is_Case_Insensitive) {
1867 ret_val = emit_node (SIMILAR);
1868 } else {
1869 ret_val = emit_node (EXACTLY);
1872 /* Loop until we find a meta character, shortcut escape, back
1873 reference, or end of regex string. */
1875 for (; *Reg_Parse != '\0' &&
1876 !strchr ((char *) Meta_Char, (int) *Reg_Parse);
1877 len++) {
1879 /* Save where we are in case we have to back
1880 this character out. */
1882 parse_save = Reg_Parse;
1884 if (*Reg_Parse == '\\') {
1885 Reg_Parse++; /* Point to escaped character */
1887 Error_Text [0] = '\0'; /* See comment above. */
1889 if ((test = numeric_escape (*Reg_Parse, &Reg_Parse))) {
1890 if (Is_Case_Insensitive) {
1891 emit_byte (tolower (test));
1892 } else {
1893 emit_byte (test);
1895 } else if ((test = literal_escape (*Reg_Parse))) {
1896 emit_byte (test);
1897 } else if (back_ref (Reg_Parse, NULL, CHECK_ESCAPE)) {
1898 /* Leave back reference for next `atom' call */
1900 Reg_Parse--; break;
1901 } else if (shortcut_escape (*Reg_Parse, NULL, CHECK_ESCAPE)) {
1902 /* Leave shortcut escape for next `atom' call */
1904 Reg_Parse--; break;
1905 } else {
1906 if (strlen (Error_Text) == 0) {
1907 /* None of the above calls generated an error message
1908 so generate our own here. */
1910 sprintf (Error_Text,
1911 "\\%c is an invalid escape sequence",
1912 *Reg_Parse);
1915 REG_FAIL (Error_Text);
1918 Reg_Parse++;
1919 } else {
1920 /* Ordinary character */
1922 if (Is_Case_Insensitive) {
1923 emit_byte (tolower (*Reg_Parse));
1924 } else {
1925 emit_byte (*Reg_Parse);
1928 Reg_Parse++;
1931 /* If next regex token is a quantifier (?, +. *, or {m,n}) and
1932 our EXACTLY node so far is more than one character, leave the
1933 last character to be made into an EXACTLY node one character
1934 wide for the multiplier to act on. For example 'abcd* would
1935 have an EXACTLY node with an 'abc' operand followed by a STAR
1936 node followed by another EXACTLY node with a 'd' operand. */
1938 if (IS_QUANTIFIER (*Reg_Parse) && len > 0) {
1939 Reg_Parse = parse_save; /* Point to previous regex token. */
1941 if (Code_Emit_Ptr == &Compute_Size) {
1942 Reg_Size--;
1943 } else {
1944 Code_Emit_Ptr--; /* Write over previously emitted byte. */
1947 break;
1951 if (len <= 0) REG_FAIL ("internal error #4, `atom\'");
1953 *flag_param |= HAS_WIDTH;
1955 if (len == 1) *flag_param |= SIMPLE;
1957 range_param->lower = len;
1958 range_param->upper = len;
1960 emit_byte ('\0');
1962 } /* END switch (*Reg_Parse++) */
1964 return (ret_val);
1967 /*----------------------------------------------------------------------*
1968 * emit_node
1970 * Emit (if appropriate) the op code for a regex node atom.
1972 * The NEXT pointer is initialized to NULL.
1974 * Returns a pointer to the START of the emitted node.
1975 *----------------------------------------------------------------------*/
1977 static unsigned char * emit_node (int op_code) {
1979 register unsigned char *ret_val;
1980 register unsigned char *ptr;
1982 ret_val = Code_Emit_Ptr; /* Return address of start of node */
1984 if (ret_val == &Compute_Size) {
1985 Reg_Size += NODE_SIZE;
1986 } else {
1987 ptr = ret_val;
1988 *ptr++ = (unsigned char) op_code;
1989 *ptr++ = '\0'; /* Null "NEXT" pointer. */
1990 *ptr++ = '\0';
1992 Code_Emit_Ptr = ptr;
1995 return (ret_val);
1998 /*----------------------------------------------------------------------*
1999 * emit_byte
2001 * Emit (if appropriate) a byte of code (usually part of an operand.)
2002 *----------------------------------------------------------------------*/
2004 static void emit_byte (unsigned char c) {
2006 if (Code_Emit_Ptr == &Compute_Size) {
2007 Reg_Size++;
2008 } else {
2009 *Code_Emit_Ptr++ = c;
2013 /*----------------------------------------------------------------------*
2014 * emit_class_byte
2016 * Emit (if appropriate) a byte of code (usually part of a character
2017 * class operand.)
2018 *----------------------------------------------------------------------*/
2020 static void emit_class_byte (unsigned char c) {
2022 if (Code_Emit_Ptr == &Compute_Size) {
2023 Reg_Size++;
2025 if (Is_Case_Insensitive && isalpha (c)) Reg_Size++;
2026 } else if (Is_Case_Insensitive && isalpha (c)) {
2027 /* For case insensitive character classes, emit both upper and lower case
2028 versions of alphabetical characters. */
2030 *Code_Emit_Ptr++ = tolower (c);
2031 *Code_Emit_Ptr++ = toupper (c);
2032 } else {
2033 *Code_Emit_Ptr++ = c;
2037 /*----------------------------------------------------------------------*
2038 * emit_special
2040 * Emit nodes that need special processing.
2041 *----------------------------------------------------------------------*/
2043 static unsigned char * emit_special (
2044 unsigned char op_code,
2045 unsigned long test_val,
2046 int index) {
2048 register unsigned char *ret_val = &Compute_Size;
2049 register unsigned char *ptr;
2051 if (Code_Emit_Ptr == &Compute_Size) {
2052 switch (op_code) {
2053 case POS_BEHIND_OPEN:
2054 case NEG_BEHIND_OPEN:
2055 Reg_Size += LENGTH_SIZE; /* Length of the look-behind match */
2056 Reg_Size += NODE_SIZE; /* Make room for the node */
2057 break;
2059 case TEST_COUNT:
2060 Reg_Size += NEXT_PTR_SIZE; /* Make room for a test value. */
2062 case INC_COUNT:
2063 Reg_Size += INDEX_SIZE; /* Make room for an index value. */
2065 default:
2066 Reg_Size += NODE_SIZE; /* Make room for the node. */
2068 } else {
2069 ret_val = emit_node (op_code); /* Return the address for start of node. */
2070 ptr = Code_Emit_Ptr;
2072 if (op_code == INC_COUNT || op_code == TEST_COUNT) {
2073 *ptr++ = (unsigned char) index;
2075 if (op_code == TEST_COUNT) {
2076 *ptr++ = PUT_OFFSET_L (test_val);
2077 *ptr++ = PUT_OFFSET_R (test_val);
2079 } else if (op_code == POS_BEHIND_OPEN || op_code == NEG_BEHIND_OPEN) {
2080 *ptr++ = PUT_OFFSET_L (test_val);
2081 *ptr++ = PUT_OFFSET_R (test_val);
2082 *ptr++ = PUT_OFFSET_L (test_val);
2083 *ptr++ = PUT_OFFSET_R (test_val);
2086 Code_Emit_Ptr = ptr;
2089 return (ret_val);
2092 /*----------------------------------------------------------------------*
2093 * insert
2095 * Insert a node in front of already emitted node(s). Means relocating
2096 * the operand. Code_Emit_Ptr points one byte past the just emitted
2097 * node and operand. The parameter `insert_pos' points to the location
2098 * where the new node is to be inserted.
2099 *----------------------------------------------------------------------*/
2101 static unsigned char * insert (
2102 unsigned char op,
2103 unsigned char *insert_pos,
2104 long min,
2105 long max,
2106 int index) {
2108 register unsigned char *src;
2109 register unsigned char *dst;
2110 unsigned char *place;
2111 int insert_size = NODE_SIZE;
2113 if (op == BRACE || op == LAZY_BRACE) {
2114 /* Make room for the min and max values. */
2116 insert_size += (2 * NEXT_PTR_SIZE);
2117 } else if (op == INIT_COUNT) {
2118 /* Make room for an index value . */
2120 insert_size += INDEX_SIZE;
2123 if (Code_Emit_Ptr == &Compute_Size) {
2124 Reg_Size += insert_size;
2125 return &Compute_Size;
2128 src = Code_Emit_Ptr;
2129 Code_Emit_Ptr += insert_size;
2130 dst = Code_Emit_Ptr;
2132 /* Relocate the existing emitted code to make room for the new node. */
2134 while (src > insert_pos) *--dst = *--src;
2136 place = insert_pos; /* Where operand used to be. */
2137 *place++ = op; /* Inserted operand. */
2138 *place++ = '\0'; /* NEXT pointer for inserted operand. */
2139 *place++ = '\0';
2141 if (op == BRACE || op == LAZY_BRACE) {
2142 *place++ = PUT_OFFSET_L (min);
2143 *place++ = PUT_OFFSET_R (min);
2145 *place++ = PUT_OFFSET_L (max);
2146 *place++ = PUT_OFFSET_R (max);
2147 } else if (op == INIT_COUNT) {
2148 *place++ = (unsigned char) index;
2151 return place; /* Return a pointer to the start of the code moved. */
2154 /*----------------------------------------------------------------------*
2155 * tail - Set the next-pointer at the end of a node chain.
2156 *----------------------------------------------------------------------*/
2158 static void tail (unsigned char *search_from, unsigned char *point_to) {
2160 register unsigned char *scan;
2161 register unsigned char *next;
2162 register int offset;
2164 if (search_from == &Compute_Size) return;
2166 /* Find the last node in the chain (node with a null NEXT pointer) */
2168 scan = search_from;
2170 for (;;) {
2171 next = next_ptr (scan);
2173 if (!next) break;
2175 scan = next;
2178 if (GET_OP_CODE (scan) == BACK) {
2179 offset = scan - point_to;
2180 } else {
2181 offset = point_to - scan;
2184 /* Set NEXT pointer */
2186 *(scan + 1) = PUT_OFFSET_L (offset);
2187 *(scan + 2) = PUT_OFFSET_R (offset);
2190 /*--------------------------------------------------------------------*
2191 * offset_tail
2193 * Perform a tail operation on (ptr + offset).
2194 *--------------------------------------------------------------------*/
2196 static void offset_tail (unsigned char *ptr, int offset, unsigned char *val) {
2198 if (ptr == &Compute_Size || ptr == NULL) return;
2200 tail (ptr + offset, val);
2203 /*--------------------------------------------------------------------*
2204 * branch_tail
2206 * Perform a tail operation on (ptr + offset) but only if `ptr' is a
2207 * BRANCH node.
2208 *--------------------------------------------------------------------*/
2210 static void branch_tail (unsigned char *ptr, int offset, unsigned char *val) {
2212 if (ptr == &Compute_Size || ptr == NULL ||GET_OP_CODE (ptr) != BRANCH) {
2213 return;
2216 tail (ptr + offset, val);
2219 /*--------------------------------------------------------------------*
2220 * shortcut_escape
2222 * Implements convenient escape sequences that represent entire
2223 * character classes or special location assertions (similar to escapes
2224 * supported by Perl)
2226 * \d Digits [0-9] |
2227 * \D NOT a digit [^0-9] | (Examples
2228 * \l Letters [a-zA-Z] | at left
2229 * \L NOT a Letter [^a-zA-Z] | are
2230 * \s Whitespace [ \t\n\r\f\v] | for
2231 * \S NOT Whitespace [^ \t\n\r\f\v] | C
2232 * \w "Word" character [a-zA-Z0-9_] | Locale)
2233 * \W NOT a "Word" character [^a-zA-Z0-9_] _|
2235 * \B Matches any character that is NOT a word-delimiter
2237 * Codes for the "emit" parameter:
2239 * EMIT_NODE
2240 * Emit a shortcut node. Shortcut nodes have an implied set of
2241 * class characters. This helps keep the compiled regex string
2242 * small.
2244 * EMIT_CLASS_BYTES
2245 * Emit just the equivalent characters of the class. This makes
2246 * the escape usable from within a class, e.g. [a-fA-F\d]. Only
2247 * \d, \D, \s, \S, \w, and \W can be used within a class.
2249 * CHECK_ESCAPE
2250 * Only verify that this is a valid shortcut escape.
2252 * CHECK_CLASS_ESCAPE
2253 * Same as CHECK_ESCAPE but only allows characters valid within
2254 * a class.
2256 *--------------------------------------------------------------------*/
2258 static unsigned char * shortcut_escape (
2259 unsigned char c,
2260 int *flag_param,
2261 int emit) {
2263 register unsigned char *class = NULL;
2264 static unsigned char *codes = (unsigned char *) "ByYdDlLsSwW";
2265 unsigned char *ret_val = (unsigned char *) 1; /* Assume success. */
2266 unsigned char *valid_codes;
2268 if (emit == EMIT_CLASS_BYTES || emit == CHECK_CLASS_ESCAPE) {
2269 valid_codes = codes + 3; /* \B, \y and \Y are not allowed in classes */
2270 } else {
2271 valid_codes = codes;
2274 if (!strchr ((char *) valid_codes, (int) c)) {
2275 return NULL; /* Not a valid shortcut escape sequence */
2276 } else if (emit == CHECK_ESCAPE || emit == CHECK_CLASS_ESCAPE) {
2277 return ret_val; /* Just checking if this is a valid shortcut escape. */
2280 switch (c) {
2281 case 'd':
2282 case 'D':
2283 if (emit == EMIT_CLASS_BYTES) {
2284 class = ASCII_Digits;
2285 } else if (emit == EMIT_NODE) {
2286 ret_val = (islower (c) ? emit_node (DIGIT)
2287 : emit_node (NOT_DIGIT));
2290 break;
2292 case 'l':
2293 case 'L':
2294 if (emit == EMIT_CLASS_BYTES) {
2295 class = Letter_Char;
2296 } else if (emit == EMIT_NODE) {
2297 ret_val = (islower (c) ? emit_node (LETTER)
2298 : emit_node (NOT_LETTER));
2301 break;
2303 case 's':
2304 case 'S':
2305 if (emit == EMIT_CLASS_BYTES) {
2306 if (Match_Newline) emit_byte ('\n');
2308 class = White_Space;
2309 } else if (emit == EMIT_NODE) {
2310 if (Match_Newline) {
2311 ret_val = (islower (c) ? emit_node (SPACE_NL)
2312 : emit_node (NOT_SPACE_NL));
2313 } else {
2314 ret_val = (islower (c) ? emit_node (SPACE)
2315 : emit_node (NOT_SPACE));
2319 break;
2321 case 'w':
2322 case 'W':
2323 if (emit == EMIT_CLASS_BYTES) {
2324 class = Word_Char;
2325 } else if (emit == EMIT_NODE) {
2326 ret_val = (islower (c) ? emit_node (WORD_CHAR)
2327 : emit_node (NOT_WORD_CHAR));
2330 break;
2332 /* Since the delimiter table is not available at regex compile time \B,
2333 \Y and \Y can only generate a node. At run time, the delimiter table
2334 will be available for these nodes to use. */
2336 case 'y':
2338 if (emit == EMIT_NODE) {
2339 ret_val = emit_node (IS_DELIM);
2340 } else {
2341 REG_FAIL ("internal error #5 `shortcut_escape\'");
2344 break;
2346 case 'Y':
2348 if (emit == EMIT_NODE) {
2349 ret_val = emit_node (NOT_DELIM);
2350 } else {
2351 REG_FAIL ("internal error #6 `shortcut_escape\'");
2354 break;
2356 case 'B':
2358 if (emit == EMIT_NODE) {
2359 ret_val = emit_node (NOT_BOUNDARY);
2360 } else {
2361 REG_FAIL ("internal error #7 `shortcut_escape\'");
2364 break;
2366 default:
2367 /* We get here if there isn't a case for every character in
2368 the string "codes" */
2370 REG_FAIL ("internal error #8 `shortcut_escape\'");
2373 if (emit == EMIT_NODE && c != 'B') {
2374 *flag_param |= (HAS_WIDTH | SIMPLE);
2377 if (class) {
2378 /* Emit bytes within a character class operand. */
2380 while (*class != '\0') {
2381 emit_byte (*class++);
2385 return ret_val;
2388 /*--------------------------------------------------------------------*
2389 * numeric_escape
2391 * Implements hex and octal numeric escape sequence syntax.
2393 * Hexadecimal Escape: \x## Max of two digits Must have leading 'x'.
2394 * Octal Escape: \0### Max of three digits and not greater
2395 * than 377 octal. Must have leading zero.
2397 * Returns the actual character value or NULL if not a valid hex or
2398 * octal escape. REG_FAIL is called if \x0, \x00, \0, \00, \000, or
2399 * \0000 is specified.
2400 *--------------------------------------------------------------------*/
2402 static unsigned char numeric_escape (
2403 unsigned char c,
2404 unsigned char **parse) {
2406 static unsigned char digits [] = "fedcbaFEDCBA9876543210";
2408 static unsigned int digit_val [] = {
2409 15, 14, 13, 12, 11, 10, /* Lower case Hex digits */
2410 15, 14, 13, 12, 11, 10, /* Upper case Hex digits */
2411 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; /* Decimal Digits */
2413 register unsigned char *scan;
2414 register unsigned char *pos_ptr;
2415 register unsigned char *digit_str;
2416 unsigned int value = 0;
2417 unsigned int radix = 8;
2418 int width = 3; /* Can not be bigger than \0377 */
2419 int pos_delta = 14;
2420 int i, pos;
2422 switch (c) {
2423 case '0':
2424 digit_str = digits + pos_delta; /* Only use Octal digits, i.e. 0-7. */
2426 break;
2428 case 'x':
2429 case 'X':
2430 width = 2; /* Can not be bigger than \0377 */
2431 radix = 16;
2432 pos_delta = 0;
2433 digit_str = digits; /* Use all of the digit characters. */
2435 break;
2437 default:
2438 return ('\0'); /* Not a numeric escape */
2441 scan = *parse; scan++; /* Only change *parse on success. */
2443 pos_ptr = (unsigned char *) strchr ((char *) digit_str, (int) *scan);
2445 for (i = 0; pos_ptr != NULL && (i < width); i++) {
2446 pos = (pos_ptr - digit_str) + pos_delta;
2447 value = (value * radix) + digit_val [pos];
2449 /* If this digit makes the value over 255, treat this digit as a literal
2450 character instead of part of the numeric escape. For example, \0777
2451 will be processed as \077 (an 'M') and a literal '7' character, NOT
2452 511 decimal which is > 255. */
2454 if (value > 255) {
2455 /* Back out calculations for last digit processed. */
2457 value -= digit_val [pos];
2458 value /= radix;
2460 break; /* Note that scan will not be incremented and still points to
2461 the digit that caused overflow. It will be decremented by
2462 the "else" below to point to the last character that is
2463 considered to be part of the octal escape. */
2466 scan++;
2467 pos_ptr = (unsigned char *) strchr ((char *) digit_str, (int) *scan);
2470 /* Handle the case of "\0" i.e. trying to specify a NULL character. */
2472 if (value == 0) {
2473 if (c == '0') {
2474 sprintf (Error_Text, "\\00 is an invalid octal escape");
2475 } else {
2476 sprintf (Error_Text, "\\%c0 is an invalid hexadecimal escape", c);
2478 } else {
2479 /* Point to the last character of the number on success. */
2481 scan--;
2482 *parse = scan;
2485 return (unsigned char) value;
2488 /*--------------------------------------------------------------------*
2489 * literal_escape
2491 * Recognize escaped literal characters (prefixed with backslash),
2492 * and translate them into the corresponding character.
2494 * Returns the proper character value or NULL if not a valid literal
2495 * escape.
2496 *--------------------------------------------------------------------*/
2498 static unsigned char literal_escape (unsigned char c) {
2500 static unsigned char valid_escape [] = {
2501 'a', 'b',
2502 'e',
2503 'f', 'n', 'r', 't', 'v', '(', ')', '-', '[', ']',
2504 '<', '>', '{', '}', '.', '\\', '|', '^', '$', '*',
2505 '+', '?', '&', '\0'
2508 static unsigned char value [] = {
2509 '\a', '\b',
2510 #ifdef EBCDIC_CHARSET
2511 0x27, /* Escape character in IBM's EBCDIC character set. */
2512 #else
2513 0x1B, /* Escape character in ASCII character set. */
2514 #endif
2515 '\f', '\n', '\r', '\t', '\v', '(', ')', '-', '[', ']',
2516 '<', '>', '{', '}', '.', '\\', '|', '^', '$', '*',
2517 '+', '?', '&', '\0'
2520 int i;
2522 for (i = 0; valid_escape [i] != '\0'; i++) {
2523 if (c == valid_escape [i]) return value [i];
2526 return '\0';
2529 /*--------------------------------------------------------------------*
2530 * back_ref
2532 * Process a request to match a previous parenthesized thing.
2533 * Parenthetical entities are numbered beginning at 1 by counting
2534 * opening parentheses from left to to right. \0 would represent
2535 * whole match, but would confuse numeric_escape as an octal escape,
2536 * so it is forbidden.
2538 * Constructs of the form \~1, \~2, etc. are cross-regex back
2539 * references and are used in syntax highlighting patterns to match
2540 * text previously matched by another regex. *** IMPLEMENT LATER ***
2541 *--------------------------------------------------------------------*/
2543 static unsigned char * back_ref (
2544 unsigned char *c,
2545 int *flag_param,
2546 int emit) {
2548 int paren_no, c_offset = 0, is_cross_regex = 0;
2550 unsigned char *ret_val;
2552 /* Implement cross regex backreferences later. */
2554 /* if (*c == (unsigned char) ('~')) {
2555 c_offset++;
2556 is_cross_regex++;
2557 } */
2559 paren_no = (int) (*(c + c_offset) - (unsigned char) ('0'));
2561 if (!isdigit (*(c + c_offset)) || /* Only \1, \2, ... \9 are supported. */
2562 paren_no == 0) { /* Should be caught by numeric_escape. */
2564 return NULL;
2567 /* Make sure parentheses for requested back-reference are complete. */
2569 if (!is_cross_regex && !TEST_BIT (Closed_Parens, paren_no)) {
2570 sprintf (Error_Text, "\\%d is an illegal back reference", paren_no);
2571 return NULL;
2574 if (emit == EMIT_NODE) {
2575 if (is_cross_regex) {
2576 Reg_Parse++; /* Skip past the '~' in a cross regex back reference.
2577 We only do this if we are emitting code. */
2579 if (Is_Case_Insensitive) {
2580 ret_val = emit_node (X_REGEX_BR_CI);
2581 } else {
2582 ret_val = emit_node (X_REGEX_BR);
2584 } else {
2585 if (Is_Case_Insensitive) {
2586 ret_val = emit_node (BACK_REF_CI);
2587 } else {
2588 ret_val = emit_node (BACK_REF);
2592 emit_byte ((unsigned char) paren_no);
2594 if (is_cross_regex || TEST_BIT (Paren_Has_Width, paren_no)) {
2595 *flag_param |= HAS_WIDTH;
2597 } else if (emit == CHECK_ESCAPE) {
2598 ret_val = (unsigned char *) 1;
2599 } else {
2600 ret_val = NULL;
2603 return ret_val;
2606 /*======================================================================*
2607 * Regex execution related code
2608 *======================================================================*/
2610 /* Global work variables for `ExecRE'. */
2612 static unsigned char *Reg_Input; /* String-input pointer. */
2613 static unsigned char *Start_Of_String; /* Beginning of input, for ^ */
2614 /* and < checks. */
2615 static unsigned char *End_Of_String; /* Logical end of input (if
2616 supplied, till \0 otherwise) */
2617 static unsigned char *Look_Behind_To; /* Position till were look behind
2618 can safely check back */
2619 static unsigned char **Start_Ptr_Ptr; /* Pointer to `startp' array. */
2620 static unsigned char **End_Ptr_Ptr; /* Ditto for `endp'. */
2621 static unsigned char *Extent_Ptr_FW; /* Forward extent pointer */
2622 static unsigned char *Extent_Ptr_BW; /* Backward extent pointer */
2623 static unsigned char *Back_Ref_Start [10]; /* Back_Ref_Start [0] and */
2624 static unsigned char *Back_Ref_End [10]; /* Back_Ref_End [0] are not */
2625 /* used. This simplifies */
2626 /* indexing. */
2628 * Measured recursion limits:
2629 * Linux: +/- 40 000 (up to 110 000)
2630 * Solaris: +/- 85 000
2631 * HP-UX 11: +/- 325 000
2633 * So 10 000 ought to be safe.
2635 #define REGEX_RECURSION_LIMIT 10000
2636 static int Recursion_Count; /* Recursion counter */
2637 static int Recursion_Limit_Exceeded; /* Recursion limit exceeded flag */
2639 #define AT_END_OF_STRING(X) (*(X) == (unsigned char)'\0' ||\
2640 (End_Of_String != NULL && (X) >= End_Of_String))
2642 /* static regexp *Cross_Regex_Backref; */
2644 static int Prev_Is_BOL;
2645 static int Succ_Is_EOL;
2646 static int Prev_Is_Delim;
2647 static int Succ_Is_Delim;
2649 /* Define a pointer to an array to hold general (...){m,n} counts. */
2651 typedef struct brace_counts {
2652 unsigned long count [1]; /* More unwarranted chumminess with compiler. */
2653 } brace_counts;
2655 static struct brace_counts *Brace;
2657 /* Default table for determining whether a character is a word delimiter. */
2659 static unsigned char Default_Delimiters [UCHAR_MAX] = {0};
2661 static unsigned char *Current_Delimiters; /* Current delimiter table */
2663 /* Forward declarations of functions used by `ExecRE' */
2665 static int attempt (regexp *, unsigned char *);
2666 static int match (unsigned char *, int *);
2667 static unsigned long greedy (unsigned char *, long);
2668 static void adjustcase (unsigned char *, int, unsigned char);
2669 static unsigned char * makeDelimiterTable (unsigned char *, unsigned char *);
2672 * ExecRE - match a `regexp' structure against a string
2674 * If `end' is non-NULL, matches may not BEGIN past end, but may extend past
2675 * it. If reverse is true, `end' must be specified, and searching begins at
2676 * `end'. "isbol" should be set to true if the beginning of the string is the
2677 * actual beginning of a line (since `ExecRE' can't look backwards from the
2678 * beginning to find whether there was a newline before). Likewise, "isbow"
2679 * asks whether the string is preceded by a word delimiter. End of string is
2680 * always treated as a word and line boundary (there may be cases where it
2681 * shouldn't be, in which case, this should be changed). "delimit" (if
2682 * non-null) specifies a null-terminated string of characters to be considered
2683 * word delimiters matching "<" and ">". if "delimit" is NULL, the default
2684 * delimiters (as set in SetREDefaultWordDelimiters) are used.
2685 * Look_behind_to indicates the position till where it is safe to
2686 * perform look-behind matches. If set, it should be smaller than or equal
2687 * to the start position of the search (pointed at by string). If it is NULL,
2688 * it defaults to the start position.
2689 * Finally, match_to indicates the logical end of the string, till where
2690 * matches are allowed to extend. Note that look-ahead patterns may look
2691 * past that boundary. If match_to is set to NULL, the terminating \0 is
2692 * assumed to correspond to the logical boundary. Match_to, if set, must be
2693 * larger than or equal to end, if set.
2696 int ExecRE(regexp *prog, const char* string, const char* end, int reverse,
2697 char prev_char, char succ_char, const char* delimiters,
2698 const char* look_behind_to, const char* match_to)
2701 register unsigned char *str;
2702 unsigned char **s_ptr;
2703 unsigned char **e_ptr;
2704 int ret_val = 0;
2705 unsigned char tempDelimitTable [256];
2706 int i;
2708 /* Check for valid parameters. */
2710 if (prog == NULL || string == NULL) {
2711 reg_error ("NULL parameter to `ExecRE\'");
2712 goto SINGLE_RETURN;
2715 /* Check validity of program. */
2717 if (U_CHAR_AT (prog->program) != MAGIC) {
2718 reg_error ("corrupted program");
2719 goto SINGLE_RETURN;
2722 s_ptr = (unsigned char **) prog->startp;
2723 e_ptr = (unsigned char **) prog->endp;
2725 /* If caller has supplied delimiters, make a delimiter table */
2727 if (delimiters == NULL) {
2728 Current_Delimiters = Default_Delimiters;
2729 } else {
2730 Current_Delimiters = makeDelimiterTable (
2731 (unsigned char *) delimiters,
2732 (unsigned char *) tempDelimitTable);
2735 /* Remember the logical end of the string. */
2737 End_Of_String = (unsigned char *) match_to;
2739 if (end == NULL && reverse) {
2740 for (end = string; !AT_END_OF_STRING((unsigned char*)end); end++) ;
2741 succ_char = '\n';
2742 } else if (end == NULL) {
2743 succ_char = '\n';
2746 /* Initialize arrays used by shortcut_escape. */
2748 if (!init_ansi_classes ()) goto SINGLE_RETURN;
2750 /* Remember the beginning of the string for matching BOL */
2752 Start_Of_String = (unsigned char *) string;
2753 Look_Behind_To = (unsigned char *) (look_behind_to?look_behind_to:string);
2755 Prev_Is_BOL = ((prev_char == '\n') || (prev_char == '\0') ? 1 : 0);
2756 Succ_Is_EOL = ((succ_char == '\n') || (succ_char == '\0') ? 1 : 0);
2757 Prev_Is_Delim = (Current_Delimiters [(unsigned char)prev_char] ? 1 : 0);
2758 Succ_Is_Delim = (Current_Delimiters [(unsigned char)succ_char] ? 1 : 0);
2760 Total_Paren = (int) (prog->program [1]);
2761 Num_Braces = (int) (prog->program [2]);
2763 /* Reset the recursion detection flag */
2764 Recursion_Limit_Exceeded = 0;
2766 /* Allocate memory for {m,n} construct counting variables if need be. */
2768 if (Num_Braces > 0) {
2769 Brace =
2770 (brace_counts *) malloc (sizeof (brace_counts) * (size_t) Num_Braces);
2772 if (Brace == NULL) {
2773 reg_error ("out of memory in `ExecRE\'");
2774 goto SINGLE_RETURN;
2776 } else {
2777 Brace = NULL;
2780 /* Initialize the first nine (9) capturing parentheses start and end
2781 pointers to point to the start of the search string. This is to prevent
2782 crashes when later trying to reference captured parens that do not exist
2783 in the compiled regex. We only need to do the first nine since users
2784 can only specify \1, \2, ... \9. */
2786 for (i = 9; i > 0; i--) {
2787 *s_ptr++ = (unsigned char *) string;
2788 *e_ptr++ = (unsigned char *) string;
2791 if (!reverse) { /* Forward Search */
2792 if (prog->anchor) {
2793 /* Search is anchored at BOL */
2795 if (attempt (prog, (unsigned char *) string)) {
2796 ret_val = 1;
2797 goto SINGLE_RETURN;
2800 for (str = (unsigned char *) string;
2801 !AT_END_OF_STRING(str) && str != (unsigned char *) end && !Recursion_Limit_Exceeded;
2802 str++) {
2804 if (*str == '\n') {
2805 if (attempt (prog, str + 1)) {
2806 ret_val = 1;
2807 break;
2812 goto SINGLE_RETURN;
2814 } else if (prog->match_start != '\0') {
2815 /* We know what char match must start with. */
2817 for (str = (unsigned char *) string;
2818 !AT_END_OF_STRING(str) && str != (unsigned char *) end && !Recursion_Limit_Exceeded;
2819 str++) {
2821 if (*str == (unsigned char)prog->match_start) {
2822 if (attempt (prog, str)) {
2823 ret_val = 1;
2824 break;
2829 goto SINGLE_RETURN;
2830 } else {
2831 /* General case */
2833 for (str = (unsigned char *) string;
2834 !AT_END_OF_STRING(str) && str != (unsigned char *) end && !Recursion_Limit_Exceeded;
2835 str++) {
2837 if (attempt (prog, str)) {
2838 ret_val = 1;
2839 break;
2843 /* Beware of a single $ matching \0 */
2844 if (!Recursion_Limit_Exceeded && !ret_val && AT_END_OF_STRING(str) && str != (unsigned char *) end) {
2845 if (attempt (prog, str)) {
2846 ret_val = 1;
2850 goto SINGLE_RETURN;
2852 } else { /* Search reverse, same as forward, but loops run backward */
2854 /* Make sure that we don't start matching beyond the logical end */
2855 if (End_Of_String != NULL && (unsigned char*)end > End_Of_String) {
2856 end = (const char*)End_Of_String;
2859 if (prog->anchor) {
2860 /* Search is anchored at BOL */
2862 for (str = (unsigned char *)(end - 1);
2863 str >= (unsigned char *) string && !Recursion_Limit_Exceeded;
2864 str--) {
2866 if (*str == '\n') {
2867 if (attempt (prog, str + 1)) {
2868 ret_val = 1;
2869 goto SINGLE_RETURN;
2874 if (!Recursion_Limit_Exceeded && attempt (prog, (unsigned char *) string)) {
2875 ret_val = 1;
2876 goto SINGLE_RETURN;
2879 goto SINGLE_RETURN;
2880 } else if (prog->match_start != '\0') {
2881 /* We know what char match must start with. */
2883 for (str = (unsigned char *) end;
2884 str >= (unsigned char *) string && !Recursion_Limit_Exceeded;
2885 str--) {
2887 if (*str == (unsigned char)prog->match_start) {
2888 if (attempt (prog, str)) {
2889 ret_val = 1;
2890 break;
2895 goto SINGLE_RETURN;
2896 } else {
2897 /* General case */
2899 for (str = (unsigned char *) end;
2900 str >= (unsigned char *) string && !Recursion_Limit_Exceeded;
2901 str--) {
2903 if (attempt (prog, str)) {
2904 ret_val = 1;
2905 break;
2911 SINGLE_RETURN: if (Brace) free (Brace);
2913 if (Recursion_Limit_Exceeded) return (0);
2915 return (ret_val);
2918 /*--------------------------------------------------------------------*
2919 * init_ansi_classes
2921 * Generate character class sets using locale aware ANSI C functions.
2923 *--------------------------------------------------------------------*/
2925 static int init_ansi_classes (void) {
2927 static int initialized = 0;
2928 static int underscore = (int) '_';
2929 int i, word_count, letter_count, space_count;
2931 if (!initialized) {
2932 initialized = 1; /* Only need to generate character sets once. */
2933 word_count = 0;
2934 letter_count = 0;
2935 space_count = 0;
2937 for (i = 1; i < (int)UCHAR_MAX; i++) {
2938 if (isalnum (i) || i == underscore) {
2939 Word_Char [word_count++] = (unsigned char) i;
2942 if (isalpha (i)) {
2943 Letter_Char [letter_count++] = (unsigned char) i;
2946 /* Note: Whether or not newline is considered to be whitespace is
2947 handled by switches within the original regex and is thus omitted
2948 here. */
2950 if (isspace (i) && (i != (int) '\n')) {
2951 White_Space [space_count++] = (unsigned char) i;
2954 /* Make sure arrays are big enough. ("- 2" because of zero array
2955 origin and we need to leave room for the NULL terminator.) */
2957 if (word_count > (ALNUM_CHAR_SIZE - 2) ||
2958 space_count > (WHITE_SPACE_SIZE - 2) ||
2959 letter_count > (ALNUM_CHAR_SIZE - 2)) {
2961 reg_error ("internal error #9 `init_ansi_classes\'");
2962 return (0);
2966 Word_Char [word_count] = '\0';
2967 Letter_Char [word_count] = '\0';
2968 White_Space [space_count] = '\0';
2971 return (1);
2974 /*----------------------------------------------------------------------*
2975 * attempt - try match at specific point, returns: 0 failure, 1 success
2976 *----------------------------------------------------------------------*/
2978 static int attempt (regexp *prog, unsigned char *string) {
2980 register int i;
2981 register unsigned char **s_ptr;
2982 register unsigned char **e_ptr;
2983 int branch_index = 0; /* Must be set to zero ! */
2985 Reg_Input = string;
2986 Start_Ptr_Ptr = (unsigned char **) prog->startp;
2987 End_Ptr_Ptr = (unsigned char **) prog->endp;
2988 s_ptr = (unsigned char **) prog->startp;
2989 e_ptr = (unsigned char **) prog->endp;
2991 /* Reset the recursion counter. */
2992 Recursion_Count = 0;
2994 /* Overhead due to capturing parentheses. */
2996 Extent_Ptr_BW = string;
2997 Extent_Ptr_FW = NULL;
2999 for (i = Total_Paren + 1; i > 0; i--) {
3000 *s_ptr++ = NULL;
3001 *e_ptr++ = NULL;
3004 if (match ((unsigned char *) (prog->program + REGEX_START_OFFSET),
3005 &branch_index)) {
3006 prog->startp [0] = (char *) string;
3007 prog->endp [0] = (char *) Reg_Input; /* <-- One char AFTER */
3008 prog->extentpBW = (char *) Extent_Ptr_BW; /* matched string! */
3009 prog->extentpFW = (char *) Extent_Ptr_FW;
3010 prog->top_branch = branch_index;
3012 return (1);
3013 } else {
3014 return (0);
3018 /*----------------------------------------------------------------------*
3019 * match - main matching routine
3021 * Conceptually the strategy is simple: check to see whether the
3022 * current node matches, call self recursively to see whether the rest
3023 * matches, and then act accordingly. In practice we make some effort
3024 * to avoid recursion, in particular by going through "ordinary" nodes
3025 * (that don't need to know whether the rest of the match failed) by a
3026 * loop instead of by recursion. Returns 0 failure, 1 success.
3027 *----------------------------------------------------------------------*/
3028 #define MATCH_RETURN(X)\
3029 { --Recursion_Count; return (X); }
3030 #define CHECK_RECURSION_LIMIT\
3031 if (Recursion_Limit_Exceeded) MATCH_RETURN (0);
3033 static int match (unsigned char *prog, int *branch_index_param) {
3035 register unsigned char *scan; /* Current node. */
3036 unsigned char *next; /* Next node. */
3037 register int next_ptr_offset; /* Used by the NEXT_PTR () macro */
3039 if (++Recursion_Count > REGEX_RECURSION_LIMIT) {
3040 if (!Recursion_Limit_Exceeded) /* Prevent duplicate errors */
3041 reg_error("recursion limit exceeded, please respecify expression");
3042 Recursion_Limit_Exceeded = 1;
3043 MATCH_RETURN (0);
3047 scan = prog;
3049 while (scan != NULL) {
3050 NEXT_PTR (scan, next);
3052 switch (GET_OP_CODE (scan)) {
3053 case BRANCH:
3055 register unsigned char *save;
3056 register int branch_index_local = 0;
3058 if (GET_OP_CODE (next) != BRANCH) { /* No choice. */
3059 next = OPERAND (scan); /* Avoid recursion. */
3060 } else {
3061 do {
3062 save = Reg_Input;
3064 if (match (OPERAND (scan), NULL))
3066 if (branch_index_param)
3067 *branch_index_param = branch_index_local;
3068 MATCH_RETURN (1);
3071 CHECK_RECURSION_LIMIT
3073 ++branch_index_local;
3075 Reg_Input = save; /* Backtrack. */
3076 NEXT_PTR (scan, scan);
3077 } while (scan != NULL && GET_OP_CODE (scan) == BRANCH);
3079 MATCH_RETURN (0); /* NOT REACHED */
3083 break;
3085 case EXACTLY:
3087 register int len;
3088 register unsigned char *opnd;
3090 opnd = OPERAND (scan);
3092 /* Inline the first character, for speed. */
3094 if (*opnd != *Reg_Input) MATCH_RETURN (0);
3096 len = strlen ((char *) opnd);
3098 if (End_Of_String != NULL && Reg_Input + len > End_Of_String) {
3099 MATCH_RETURN (0);
3102 if (len > 1 &&
3103 strncmp ((char *) opnd, (char *) Reg_Input, len) != 0) {
3105 MATCH_RETURN (0);
3108 Reg_Input += len;
3111 break;
3113 case SIMILAR:
3115 register unsigned char *opnd;
3116 register unsigned char test;
3118 opnd = OPERAND (scan);
3120 /* Note: the SIMILAR operand was converted to lower case during
3121 regex compile. */
3123 while ((test = *opnd++) != '\0') {
3124 if (AT_END_OF_STRING(Reg_Input) ||
3125 tolower (*Reg_Input++) != test) {
3127 MATCH_RETURN (0);
3132 break;
3134 case BOL: /* `^' (beginning of line anchor) */
3135 if (Reg_Input == Start_Of_String) {
3136 if (Prev_Is_BOL) break;
3137 } else if (*(Reg_Input - 1) == '\n') {
3138 break;
3141 MATCH_RETURN (0);
3143 case EOL: /* `$' anchor matches end of line and end of string */
3144 if (*Reg_Input == '\n' || (AT_END_OF_STRING(Reg_Input) && Succ_Is_EOL)) {
3145 break;
3148 MATCH_RETURN (0);
3150 case BOWORD: /* `<' (beginning of word anchor) */
3151 /* Check to see if the current character is not a delimiter
3152 and the preceding character is. */
3154 int prev_is_delim;
3155 if (Reg_Input == Start_Of_String) {
3156 prev_is_delim = Prev_Is_Delim;
3157 } else {
3158 prev_is_delim = Current_Delimiters [ *(Reg_Input - 1) ];
3160 if (prev_is_delim) {
3161 int current_is_delim;
3162 if (AT_END_OF_STRING(Reg_Input)) {
3163 current_is_delim = Succ_Is_Delim;
3164 } else {
3165 current_is_delim = Current_Delimiters [ *Reg_Input ];
3167 if (!current_is_delim) break;
3171 MATCH_RETURN (0);
3173 case EOWORD: /* `>' (end of word anchor) */
3174 /* Check to see if the current character is a delimiter
3175 and the preceding character is not. */
3177 int prev_is_delim;
3178 if (Reg_Input == Start_Of_String) {
3179 prev_is_delim = Prev_Is_Delim;
3180 } else {
3181 prev_is_delim = Current_Delimiters [ *(Reg_Input-1) ];
3183 if (!prev_is_delim) {
3184 int current_is_delim;
3185 if (AT_END_OF_STRING(Reg_Input)) {
3186 current_is_delim = Succ_Is_Delim;
3187 } else {
3188 current_is_delim = Current_Delimiters [ *Reg_Input ];
3190 if (current_is_delim) break;
3194 MATCH_RETURN (0);
3196 case NOT_BOUNDARY: /* \B (NOT a word boundary) */
3198 int prev_is_delim;
3199 int current_is_delim;
3200 if (Reg_Input == Start_Of_String) {
3201 prev_is_delim = Prev_Is_Delim;
3202 } else {
3203 prev_is_delim = Current_Delimiters [ *(Reg_Input-1) ];
3205 if (AT_END_OF_STRING(Reg_Input)) {
3206 current_is_delim = Succ_Is_Delim;
3207 } else {
3208 current_is_delim = Current_Delimiters [ *Reg_Input ];
3210 if (!(prev_is_delim ^ current_is_delim)) break;
3213 MATCH_RETURN (0);
3215 case IS_DELIM: /* \y (A word delimiter character.) */
3216 if (Current_Delimiters [ *Reg_Input ] &&
3217 !AT_END_OF_STRING(Reg_Input)) {
3218 Reg_Input++; break;
3221 MATCH_RETURN (0);
3223 case NOT_DELIM: /* \Y (NOT a word delimiter character.) */
3224 if (!Current_Delimiters [ *Reg_Input ] &&
3225 !AT_END_OF_STRING(Reg_Input)) {
3226 Reg_Input++; break;
3229 MATCH_RETURN (0);
3231 case WORD_CHAR: /* \w (word character; alpha-numeric or underscore) */
3232 if ((isalnum ((int) *Reg_Input) || *Reg_Input == '_') &&
3233 !AT_END_OF_STRING(Reg_Input)) {
3234 Reg_Input++; break;
3237 MATCH_RETURN (0);
3239 case NOT_WORD_CHAR:/* \W (NOT a word character) */
3240 if (isalnum ((int) *Reg_Input) ||
3241 *Reg_Input == '_' ||
3242 *Reg_Input == '\n' ||
3243 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3245 Reg_Input++; break;
3247 case ANY: /* `.' (matches any character EXCEPT newline) */
3248 if (AT_END_OF_STRING(Reg_Input) || *Reg_Input == '\n') MATCH_RETURN (0);
3250 Reg_Input++; break;
3252 case EVERY: /* `.' (matches any character INCLUDING newline) */
3253 if (AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3255 Reg_Input++; break;
3257 case DIGIT: /* \d, same as [0123456789] */
3258 if (!isdigit ((int) *Reg_Input) ||
3259 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3261 Reg_Input++; break;
3263 case NOT_DIGIT: /* \D, same as [^0123456789] */
3264 if (isdigit ((int) *Reg_Input) ||
3265 *Reg_Input == '\n' ||
3266 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3268 Reg_Input++; break;
3270 case LETTER: /* \l, same as [a-zA-Z] */
3271 if (!isalpha ((int) *Reg_Input) ||
3272 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3274 Reg_Input++; break;
3276 case NOT_LETTER: /* \L, same as [^0123456789] */
3277 if (isalpha ((int) *Reg_Input) ||
3278 *Reg_Input == '\n' ||
3279 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3281 Reg_Input++; break;
3283 case SPACE: /* \s, same as [ \t\r\f\v] */
3284 if (!isspace ((int) *Reg_Input) ||
3285 *Reg_Input == '\n' ||
3286 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3288 Reg_Input++; break;
3290 case SPACE_NL: /* \s, same as [\n \t\r\f\v] */
3291 if (!isspace ((int) *Reg_Input) ||
3292 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3294 Reg_Input++; break;
3296 case NOT_SPACE: /* \S, same as [^\n \t\r\f\v] */
3297 if (isspace ((int) *Reg_Input) ||
3298 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3300 Reg_Input++; break;
3302 case NOT_SPACE_NL: /* \S, same as [^ \t\r\f\v] */
3303 if ((isspace ((int) *Reg_Input) && *Reg_Input != '\n') ||
3304 AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0);
3306 Reg_Input++; break;
3308 case ANY_OF: /* [...] character class. */
3309 if (AT_END_OF_STRING(Reg_Input))
3310 MATCH_RETURN (0); /* Needed because strchr ()
3311 considers \0 as a member
3312 of the character set. */
3314 if (strchr ((char *) OPERAND (scan), (int) *Reg_Input) == NULL) {
3315 MATCH_RETURN (0);
3318 Reg_Input++; break;
3320 case ANY_BUT: /* [^...] Negated character class-- does NOT normally
3321 match newline (\n added usually to operand at compile
3322 time.) */
3324 if (AT_END_OF_STRING(Reg_Input)) MATCH_RETURN (0); /* See comment for ANY_OF. */
3326 if (strchr ((char *) OPERAND (scan), (int) *Reg_Input) != NULL) {
3327 MATCH_RETURN (0);
3330 Reg_Input++; break;
3332 case NOTHING:
3333 case BACK:
3334 break;
3336 case STAR:
3337 case PLUS:
3338 case QUESTION:
3339 case BRACE:
3341 case LAZY_STAR:
3342 case LAZY_PLUS:
3343 case LAZY_QUESTION:
3344 case LAZY_BRACE:
3346 register unsigned long num_matched = REG_ZERO;
3347 register unsigned long min = ULONG_MAX, max = REG_ZERO;
3348 register unsigned char *save;
3349 register unsigned char next_char;
3350 unsigned char *next_op;
3351 int lazy = 0;
3353 /* Lookahead (when possible) to avoid useless match attempts
3354 when we know what character comes next. */
3356 if (GET_OP_CODE (next) == EXACTLY) {
3357 next_char = *OPERAND (next);
3358 } else {
3359 next_char = '\0';/* i.e. Don't know what next character is. */
3362 next_op = OPERAND (scan);
3364 switch (GET_OP_CODE (scan)) {
3365 case LAZY_STAR:
3366 lazy = 1;
3367 case STAR:
3368 min = REG_ZERO;
3369 max = ULONG_MAX;
3370 break;
3372 case LAZY_PLUS:
3373 lazy = 1;
3374 case PLUS:
3375 min = REG_ONE;
3376 max = ULONG_MAX;
3377 break;
3379 case LAZY_QUESTION:
3380 lazy = 1;
3381 case QUESTION:
3382 min = REG_ZERO;
3383 max = REG_ONE;
3384 break;
3386 case LAZY_BRACE:
3387 lazy = 1;
3388 case BRACE:
3389 min = (unsigned long)
3390 GET_OFFSET (scan + NEXT_PTR_SIZE);
3392 max = (unsigned long)
3393 GET_OFFSET (scan + (2 * NEXT_PTR_SIZE));
3395 if (max <= REG_INFINITY) max = ULONG_MAX;
3397 next_op = OPERAND (scan + (2 * NEXT_PTR_SIZE));
3400 save = Reg_Input;
3402 if (lazy) {
3403 if ( min > REG_ZERO) num_matched = greedy (next_op, min);
3404 } else {
3405 num_matched = greedy (next_op, max);
3408 while (min <= num_matched && num_matched <= max) {
3409 if (next_char == '\0' || next_char == *Reg_Input) {
3410 if (match (next, NULL)) MATCH_RETURN (1);
3412 CHECK_RECURSION_LIMIT
3415 /* Couldn't or didn't match. */
3417 if (lazy) {
3418 if (!greedy (next_op, 1)) MATCH_RETURN (0);
3420 num_matched++; /* Inch forward. */
3421 } else if (num_matched > REG_ZERO) {
3422 num_matched--; /* Back up. */
3423 } else if (min == REG_ZERO && num_matched == REG_ZERO) {
3424 break;
3427 Reg_Input = save + num_matched;
3430 MATCH_RETURN (0);
3433 break;
3435 case END:
3436 if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
3437 Extent_Ptr_FW = Reg_Input;
3440 MATCH_RETURN (1); /* Success! */
3442 break;
3444 case INIT_COUNT:
3445 Brace->count [*OPERAND (scan)] = REG_ZERO;
3447 break;
3449 case INC_COUNT:
3450 Brace->count [*OPERAND (scan)]++;
3452 break;
3454 case TEST_COUNT:
3455 if (Brace->count [*OPERAND (scan)] <
3456 (unsigned long) GET_OFFSET (scan + NEXT_PTR_SIZE + INDEX_SIZE)) {
3458 next = scan + NODE_SIZE + INDEX_SIZE + NEXT_PTR_SIZE;
3461 break;
3463 case BACK_REF:
3464 case BACK_REF_CI:
3465 /* case X_REGEX_BR: */
3466 /* case X_REGEX_BR_CI: *** IMPLEMENT LATER */
3468 register unsigned char *captured, *finish;
3469 int paren_no;
3471 paren_no = (int) *OPERAND (scan);
3473 /* if (GET_OP_CODE (scan) == X_REGEX_BR ||
3474 GET_OP_CODE (scan) == X_REGEX_BR_CI) {
3476 if (Cross_Regex_Backref == NULL) MATCH_RETURN (0);
3478 captured =
3479 (unsigned char *) Cross_Regex_Backref->startp [paren_no];
3481 finish =
3482 (unsigned char *) Cross_Regex_Backref->endp [paren_no];
3483 } else { */
3484 captured = Back_Ref_Start [paren_no];
3485 finish = Back_Ref_End [paren_no];
3486 /* } */
3488 if ((captured != NULL) && (finish != NULL)) {
3489 if (captured > finish) MATCH_RETURN (0);
3491 if (GET_OP_CODE (scan) == BACK_REF_CI /* ||
3492 GET_OP_CODE (scan) == X_REGEX_BR_CI*/ ) {
3494 while (captured < finish) {
3495 if (AT_END_OF_STRING(Reg_Input) ||
3496 tolower (*captured++) != tolower (*Reg_Input++)) {
3497 MATCH_RETURN (0);
3500 } else {
3501 while (captured < finish) {
3502 if (AT_END_OF_STRING(Reg_Input) ||
3503 *captured++ != *Reg_Input++) MATCH_RETURN (0);
3507 break;
3508 } else {
3509 MATCH_RETURN (0);
3513 case POS_AHEAD_OPEN:
3514 case NEG_AHEAD_OPEN:
3516 register unsigned char *save;
3517 register unsigned char *saved_end;
3518 int answer;
3520 save = Reg_Input;
3522 /* Temporarily ignore the logical end of the string, to allow
3523 lookahead past the end. */
3524 saved_end = End_Of_String;
3525 End_Of_String = NULL;
3527 answer = match (next, NULL); /* Does the look-ahead regex match? */
3529 CHECK_RECURSION_LIMIT
3531 if ((GET_OP_CODE (scan) == POS_AHEAD_OPEN) ? answer : !answer) {
3532 /* Remember the last (most to the right) character position
3533 that we consume in the input for a successful match. This
3534 is info that may be needed should an attempt be made to
3535 match the exact same text at the exact same place. Since
3536 look-aheads backtrack, a regex with a trailing look-ahead
3537 may need more text than it matches to accomplish a
3538 re-match. */
3540 if (Extent_Ptr_FW == NULL || (Reg_Input - Extent_Ptr_FW) > 0) {
3541 Extent_Ptr_FW = Reg_Input;
3544 Reg_Input = save; /* Backtrack to look-ahead start. */
3545 End_Of_String = saved_end; /* Restore logical end. */
3547 /* Jump to the node just after the (?=...) or (?!...)
3548 Construct. */
3550 next = next_ptr (OPERAND (scan)); /* Skip 1st branch */
3551 /* Skip the chain of branches inside the look-ahead */
3552 while(GET_OP_CODE(next) == BRANCH)
3553 next = next_ptr (next);
3554 next = next_ptr (next); /* Skip the LOOK_AHEAD_CLOSE */
3555 } else {
3556 Reg_Input = save; /* Backtrack to look-ahead start. */
3557 End_Of_String = saved_end; /* Restore logical end. */
3559 MATCH_RETURN (0);
3563 break;
3565 case POS_BEHIND_OPEN:
3566 case NEG_BEHIND_OPEN:
3568 register unsigned char *save;
3569 int answer;
3570 register int offset, upper;
3571 int lower;
3572 int found = 0;
3573 unsigned char *saved_end;
3575 save = Reg_Input;
3576 saved_end = End_Of_String;
3578 /* Prevent overshoot (greedy matching could end past the
3579 current position) by tightening the matching boundary.
3580 Lookahead inside lookbehind can still cross that boundary. */
3581 End_Of_String = Reg_Input;
3583 lower = GET_LOWER (scan);
3584 upper = GET_UPPER (scan);
3586 /* Start with the shortest match first. This is the most
3587 efficient direction in general.
3588 Note! Negative look behind is _very_ tricky when the length
3589 is not constant: we have to make sure the expression doesn't
3590 match for _any_ of the starting positions. */
3591 for (offset = lower; offset <= upper; ++offset) {
3592 Reg_Input = save - offset;
3594 if (Reg_Input < Look_Behind_To) {
3595 /* No need to look any further */
3596 break;
3599 answer = match (next, NULL); /* Does the look-behind regex match? */
3601 CHECK_RECURSION_LIMIT
3603 /* The match must have ended at the current position;
3604 otherwise it is invalid */
3605 if (answer && Reg_Input == save) {
3606 /* It matched, exactly far enough */
3607 found = 1;
3609 /* Remember the last (most to the left) character position
3610 that we consume in the input for a successful match.
3611 This is info that may be needed should an attempt be
3612 made to match the exact same text at the exact same
3613 place. Since look-behind backtracks, a regex with a
3614 leading look-behind may need more text than it matches
3615 to accomplish a re-match. */
3617 if (Extent_Ptr_BW == NULL ||
3618 (Extent_Ptr_BW - (save - offset)) > 0) {
3619 Extent_Ptr_BW = save - offset;
3622 break;
3626 /* Always restore the position and the logical string end. */
3627 Reg_Input = save;
3628 End_Of_String = saved_end;
3630 if ((GET_OP_CODE (scan) == POS_BEHIND_OPEN) ? found : !found) {
3631 /* The look-behind matches, so we must jump to the next
3632 node. The look-behind node is followed by a chain of
3633 branches (contents of the look-behind expression), and
3634 terminated by a look-behind-close node. */
3635 next = next_ptr (OPERAND (scan) + LENGTH_SIZE); /* 1st branch */
3636 /* Skip the chained branches inside the look-ahead */
3637 while (GET_OP_CODE (next) == BRANCH)
3638 next = next_ptr (next);
3639 next = next_ptr (next); /* Skip LOOK_BEHIND_CLOSE */
3640 } else {
3641 /* Not a match */
3642 MATCH_RETURN (0);
3645 break;
3647 case LOOK_AHEAD_CLOSE:
3648 case LOOK_BEHIND_CLOSE:
3649 MATCH_RETURN (1); /* We have reached the end of the look-ahead or
3650 look-behind which implies that we matched it,
3651 so return TRUE. */
3652 default:
3653 if ((GET_OP_CODE (scan) > OPEN) &&
3654 (GET_OP_CODE (scan) < OPEN + NSUBEXP)) {
3656 register int no;
3657 register unsigned char *save;
3659 no = GET_OP_CODE (scan) - OPEN;
3660 save = Reg_Input;
3662 if (no < 10) {
3663 Back_Ref_Start [no] = save;
3664 Back_Ref_End [no] = NULL;
3667 if (match (next, NULL)) {
3668 /* Do not set `Start_Ptr_Ptr' if some later invocation (think
3669 recursion) of the same parentheses already has. */
3671 if (Start_Ptr_Ptr [no] == NULL) Start_Ptr_Ptr [no] = save;
3673 MATCH_RETURN (1);
3674 } else {
3675 MATCH_RETURN (0);
3677 } else if ((GET_OP_CODE (scan) > CLOSE) &&
3678 (GET_OP_CODE (scan) < CLOSE + NSUBEXP)) {
3680 register int no;
3681 register unsigned char *save;
3683 no = GET_OP_CODE (scan) - CLOSE;
3684 save = Reg_Input;
3686 if (no < 10) Back_Ref_End [no] = save;
3688 if (match (next, NULL)) {
3689 /* Do not set `End_Ptr_Ptr' if some later invocation of the
3690 same parentheses already has. */
3692 if (End_Ptr_Ptr [no] == NULL) End_Ptr_Ptr [no] = save;
3694 MATCH_RETURN (1);
3695 } else {
3696 MATCH_RETURN (0);
3698 } else {
3699 reg_error ("memory corruption, `match\'");
3701 MATCH_RETURN (0);
3704 break;
3707 scan = next;
3710 /* We get here only if there's trouble -- normally "case END" is
3711 the terminating point. */
3713 reg_error ("corrupted pointers, `match\'");
3715 MATCH_RETURN (0);
3718 /*----------------------------------------------------------------------*
3719 * greedy
3721 * Repeatedly match something simple up to "max" times. If max <= 0
3722 * then match as much as possible (max = infinity). Uses unsigned long
3723 * variables to maximize the amount of text matchable for unbounded
3724 * qualifiers like '*' and '+'. This will allow at least 4,294,967,295
3725 * matches (4 Gig!) for an ANSI C compliant compiler. If you are
3726 * applying a regex to something bigger than that, you shouldn't be
3727 * using NEdit!
3729 * Returns the actual number of matches.
3730 *----------------------------------------------------------------------*/
3732 static unsigned long greedy (unsigned char *p, long max) {
3734 register unsigned char *input_str;
3735 register unsigned char *operand;
3736 register unsigned long count = REG_ZERO;
3737 register unsigned long max_cmp;
3739 input_str = Reg_Input;
3740 operand = OPERAND (p); /* Literal char or start of class characters. */
3741 max_cmp = (max > 0) ? (unsigned long) max : ULONG_MAX;
3743 switch (GET_OP_CODE (p)) {
3744 case ANY:
3745 /* Race to the end of the line or string. Dot DOESN'T match
3746 newline. */
3748 while (count < max_cmp &&
3749 *input_str != '\n' &&
3750 !AT_END_OF_STRING(input_str)) {
3751 count++; input_str++;
3754 break;
3756 case EVERY:
3757 /* Race to the end of the line or string. Dot DOES match newline. */
3759 while (count < max_cmp &&
3760 !AT_END_OF_STRING(input_str)) {
3761 count++; input_str++;
3764 break;
3766 case EXACTLY: /* Count occurrences of single character operand. */
3767 while (count < max_cmp &&
3768 *operand == *input_str &&
3769 !AT_END_OF_STRING(input_str)) {
3770 count++; input_str++;
3773 break;
3775 case SIMILAR: /* Case insensitive version of EXACTLY */
3776 while (count < max_cmp &&
3777 *operand == tolower (*input_str) &&
3778 !AT_END_OF_STRING(input_str)) {
3779 count++; input_str++;
3782 break;
3784 case ANY_OF: /* [...] character class. */
3785 while (count < max_cmp &&
3786 strchr ((char *) operand, (int) *input_str) != NULL &&
3787 !AT_END_OF_STRING(input_str)) {
3789 count++; input_str++;
3792 break;
3794 case ANY_BUT: /* [^...] Negated character class- does NOT normally
3795 match newline (\n added usually to operand at compile
3796 time.) */
3798 while (count < max_cmp &&
3799 strchr ((char *) operand, (int) *input_str) == NULL &&
3800 !AT_END_OF_STRING(input_str)) {
3802 count++; input_str++;
3805 break;
3807 case IS_DELIM: /* \y (not a word delimiter char)
3808 NOTE: '\n' and '\0' are always word delimiters. */
3810 while (count < max_cmp &&
3811 Current_Delimiters [ *input_str ] &&
3812 !AT_END_OF_STRING(input_str)) {
3813 count++; input_str++;
3816 break;
3818 case NOT_DELIM: /* \Y (not a word delimiter char)
3819 NOTE: '\n' and '\0' are always word delimiters. */
3821 while (count < max_cmp &&
3822 !Current_Delimiters [ *input_str ] &&
3823 !AT_END_OF_STRING(input_str)) {
3824 count++; input_str++;
3827 break;
3829 case WORD_CHAR: /* \w (word character, alpha-numeric or underscore) */
3830 while (count < max_cmp &&
3831 (isalnum ((int) *input_str) ||
3832 *input_str == (unsigned char) '_') &&
3833 !AT_END_OF_STRING(input_str)) {
3835 count++; input_str++;
3838 break;
3840 case NOT_WORD_CHAR:/* \W (NOT a word character) */
3841 while (count < max_cmp &&
3842 !isalnum ((int) *input_str) &&
3843 *input_str != (unsigned char) '_' &&
3844 *input_str != (unsigned char) '\n' &&
3845 !AT_END_OF_STRING(input_str)) {
3847 count++; input_str++;
3850 break;
3852 case DIGIT: /* same as [0123456789] */
3853 while (count < max_cmp &&
3854 isdigit ((int) *input_str) &&
3855 !AT_END_OF_STRING(input_str)) {
3856 count++; input_str++;
3859 break;
3861 case NOT_DIGIT: /* same as [^0123456789] */
3862 while (count < max_cmp &&
3863 !isdigit ((int) *input_str) &&
3864 *input_str != '\n' &&
3865 !AT_END_OF_STRING(input_str)) {
3867 count++; input_str++;
3870 break;
3872 case SPACE: /* same as [ \t\r\f\v]-- doesn't match newline. */
3873 while (count < max_cmp &&
3874 isspace ((int) *input_str) &&
3875 *input_str != '\n' &&
3876 !AT_END_OF_STRING(input_str)) {
3878 count++; input_str++;
3881 break;
3883 case SPACE_NL: /* same as [\n \t\r\f\v]-- matches newline. */
3884 while (count < max_cmp &&
3885 isspace ((int) *input_str) &&
3886 !AT_END_OF_STRING(input_str)) {
3888 count++; input_str++;
3891 break;
3893 case NOT_SPACE: /* same as [^\n \t\r\f\v]-- doesn't match newline. */
3894 while (count < max_cmp &&
3895 !isspace ((int) *input_str) &&
3896 !AT_END_OF_STRING(input_str)) {
3898 count++; input_str++;
3901 break;
3903 case NOT_SPACE_NL: /* same as [^ \t\r\f\v]-- matches newline. */
3904 while (count < max_cmp &&
3905 (!isspace ((int) *input_str) || *input_str == '\n') &&
3906 !AT_END_OF_STRING(input_str)) {
3908 count++; input_str++;
3911 break;
3913 case LETTER: /* same as [a-zA-Z] */
3914 while (count < max_cmp &&
3915 isalpha ((int) *input_str) &&
3916 !AT_END_OF_STRING(input_str)) {
3918 count++; input_str++;
3921 break;
3923 case NOT_LETTER: /* same as [^a-zA-Z] */
3924 while (count < max_cmp &&
3925 !isalpha ((int) *input_str) &&
3926 *input_str != '\n' &&
3927 !AT_END_OF_STRING(input_str)) {
3929 count++; input_str++;
3932 break;
3934 default:
3935 /* Called inappropriately. Only atoms that are SIMPLE should
3936 generate a call to greedy. The above cases should cover
3937 all the atoms that are SIMPLE. */
3939 reg_error ("internal error #10 `greedy\'");
3940 count = 0U; /* Best we can do. */
3943 /* Point to character just after last matched character. */
3945 Reg_Input = input_str;
3947 return (count);
3950 /*----------------------------------------------------------------------*
3951 * next_ptr - compute the address of a node's "NEXT" pointer.
3952 * Note: a simplified inline version is available via the NEXT_PTR() macro,
3953 * but that one is only to be used at time-critical places (see the
3954 * description of the macro).
3955 *----------------------------------------------------------------------*/
3957 static unsigned char * next_ptr (unsigned char *ptr) {
3959 register int offset;
3961 if (ptr == &Compute_Size) return (NULL);
3963 offset = GET_OFFSET (ptr);
3965 if (offset == 0) return (NULL);
3967 if (GET_OP_CODE (ptr) == BACK) {
3968 return (ptr - offset);
3969 } else {
3970 return (ptr + offset);
3975 ** SubstituteRE - Perform substitutions after a `regexp' match.
3977 ** This function cleanly shortens results of more than max length to max.
3978 ** To give the caller a chance to react to this the function returns False
3979 ** on any error. The substitution will still be executed.
3981 Boolean SubstituteRE(const regexp* prog, const char* source, char* dest,
3982 const int max)
3985 register unsigned char *src;
3986 unsigned char *src_alias;
3987 register unsigned char *dst;
3988 register unsigned char c;
3989 register unsigned char test;
3990 register int paren_no;
3991 register int len;
3992 register unsigned char chgcase;
3993 Boolean anyWarnings = False;
3995 if (prog == NULL || source == NULL || dest == NULL) {
3996 reg_error ("NULL parm to `SubstituteRE\'");
3998 return False;
4001 if (U_CHAR_AT (prog->program) != MAGIC) {
4002 reg_error ("damaged regexp passed to `SubstituteRE\'");
4004 return False;
4007 src = (unsigned char *) source;
4008 dst = (unsigned char *) dest;
4010 while ((c = *src++) != '\0') {
4011 chgcase = '\0';
4012 paren_no = -1;
4014 if (c == '\\') {
4015 /* Process any case altering tokens, i.e \u, \U, \l, \L. */
4017 if (*src == 'u' || *src == 'U' || *src == 'l' || *src == 'L') {
4018 chgcase = *src;
4019 src++;
4020 c = *src++;
4022 if (c == '\0') break;
4026 if (c == '&') {
4027 paren_no = 0;
4028 } else if (c == '\\') {
4029 /* Can not pass register variable `&src' to function `numeric_escape'
4030 so make a non-register copy that we can take the address of. */
4032 src_alias = src;
4034 if ('1' <= *src && *src <= '9') {
4035 paren_no = (int) *src++ - (int) '0';
4037 } else if ((test = literal_escape (*src)) != '\0') {
4038 c = test; src++;
4040 } else if ((test = numeric_escape (*src, &src_alias)) != '\0') {
4041 c = test;
4042 src = src_alias; src++;
4044 /* NOTE: if an octal escape for zero is attempted (e.g. \000), it
4045 will be treated as a literal string. */
4046 } else if (*src == '\0') {
4047 /* If '\' is the last character of the replacement string, it is
4048 interpreted as a literal backslash. */
4050 c = '\\';
4051 } else {
4052 c = *src++; /* Allow any escape sequence (This is */
4053 } /* INCONSISTENT with the `CompileRE' */
4054 } /* mind set of issuing an error! */
4056 if (paren_no < 0) { /* Ordinary character. */
4057 if (((char *) dst - (char *) dest) >= (max - 1)) {
4058 reg_error("replacing expression in `SubstituteRE\' too long; "
4059 "truncating");
4060 anyWarnings = True;
4061 break;
4062 } else {
4063 *dst++ = c;
4065 } else if (prog->startp [paren_no] != NULL &&
4066 prog->endp [paren_no] != NULL) {
4068 len = prog->endp [paren_no] - prog->startp [paren_no];
4070 if (((char *) dst + len - (char *) dest) >= max-1) {
4071 reg_error("replacing expression in `SubstituteRE\' too long; "
4072 "truncating");
4073 anyWarnings = True;
4074 len = max - ((char *) dst - (char *) dest) - 1;
4077 (void) strncpy ((char *) dst, (char *) prog->startp [paren_no], len);
4079 if (chgcase != '\0') adjustcase (dst, len, chgcase);
4081 dst += len;
4083 if (len != 0 && *(dst - 1) == '\0') { /* strncpy hit NUL. */
4084 reg_error ("damaged match string in `SubstituteRE\'");
4085 anyWarnings = True;
4090 *dst = '\0';
4092 return !anyWarnings;
4095 static void adjustcase (unsigned char *str, int len, unsigned char chgcase) {
4097 register unsigned char *string = str;
4098 int i;
4100 /* The tokens \u and \l only modify the first character while the tokens
4101 \U and \L modify the entire string. */
4103 if (islower (chgcase) && len > 0) len = 1;
4105 switch (chgcase) {
4106 case 'u':
4107 case 'U':
4108 for (i = 0; i < len; i++) {
4109 *(string + i) = toupper ((int) *(string + i));
4112 break;
4114 case 'l':
4115 case 'L':
4116 for (i = 0; i < len; i++) {
4117 *(string + i) = tolower ((int) *(string + i));
4120 break;
4124 /*----------------------------------------------------------------------*
4125 * reg_error
4126 *----------------------------------------------------------------------*/
4128 static void reg_error (char *str) {
4130 fprintf (
4131 stderr,
4132 "nedit: Internal error processing regular expression (%s)\n",
4133 str);
4136 /*----------------------------------------------------------------------*
4137 * makeDelimiterTable
4139 * Translate a null-terminated string of delimiters into a 256 byte
4140 * lookup table for determining whether a character is a delimiter or
4141 * not.
4143 * Table must be allocated by the caller.
4145 * Return value is a pointer to the table.
4146 *----------------------------------------------------------------------*/
4148 static unsigned char * makeDelimiterTable (
4149 unsigned char *delimiters,
4150 unsigned char *table) {
4152 unsigned char *c;
4154 memset (table, 0, 256);
4156 for (c = (unsigned char *) delimiters; *c != '\0'; c++) {
4157 table [*c] = 1;
4160 table [(int) NULL] = 1; /* These */
4161 table [(int) '\t'] = 1; /* characters */
4162 table [(int) '\n'] = 1; /* are always */
4163 table [(int) ' ' ] = 1; /* delimiters. */
4165 return table;
4168 /*----------------------------------------------------------------------*
4169 * SetREDefaultWordDelimiters
4171 * Builds a default delimiter table that persists across `ExecRE' calls.
4172 *----------------------------------------------------------------------*/
4174 void SetREDefaultWordDelimiters (char *delimiters) {
4175 makeDelimiterTable ((unsigned char *) delimiters, Default_Delimiters);