4 * regcomp and regexec -- regsub and regerror are elsewhere
6 * Copyright (c) 1986 by University of Toronto.
7 * Written by Henry Spencer. Not derived from licensed software.
9 * Permission is granted to anyone to use this software for any
10 * purpose on any computer system, and to redistribute it freely,
11 * subject to the following restrictions:
13 * 1. The author is not responsible for the consequences of use of
14 * this software, no matter how awful, even if they arise
17 * 2. The origin of this software must not be misrepresented, either
18 * by explicit claim or by omission.
20 * 3. Altered versions must be plainly marked as such, and must not
21 * be misrepresented as being the original software.
22 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
23 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
24 *** to assist in implementing egrep.
25 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
26 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
27 *** as in BSD grep and ex.
28 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
29 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
30 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
31 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
32 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
33 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
34 *** was moved into regexp.h, and the include of regexp.h now uses "'s
35 *** to avoid conflicting with the system regexp.h. Const, bless its
36 *** soul, was removed so it can compile everywhere. The declaration
37 *** of strchr() was in conflict on AIX, so it was removed (as it is
38 *** happily defined in string.h).
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
41 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
42 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
44 * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
45 * on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
46 * This includes counted repetitions, UTF-8 support, character classes,
47 * shorthand character classes, increased number of parentheses to 100,
48 * backslash escape sequences. It also removes \n as an alternative to |.
50 * Beware that some of this code is subtly aware of the way operator
51 * precedence is structured in regular expressions. Serious changes in
52 * regular-expression syntax might require a total rethink.
60 #include "jimautoconf.h"
61 #include "jimregexp.h"
64 #if !defined(HAVE_REGCOMP) || defined(JIM_REGEXP)
66 /* An arbitrary limit, but this seems enough. Must be less than 1000. */
67 #define REG_MAX_PAREN 100
70 * Structure for regexp "program". This is essentially a linear encoding
71 * of a nondeterministic finite-state machine (aka syntax charts or
72 * "railroad normal form" in parsing technology). Each node is an opcode
73 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
74 * all nodes except BRANCH implement concatenation; a "next" pointer with
75 * a BRANCH on both ends of it is connecting two alternatives. (Here we
76 * have one of the subtle syntax dependencies: an individual BRANCH (as
77 * opposed to a collection of them) is never concatenated with anything
78 * because of operator precedence.) The operand of some types of node is
79 * a literal string; for others, it is a node leading into a sub-FSM. In
80 * particular, the operand of a BRANCH node is the first node of the branch.
81 * (NB this is *not* a tree structure: the tail of the branch connects
82 * to the thing following the set of BRANCHes.) The opcodes are:
85 /* definition number opnd? meaning */
86 #define END 0 /* no End of program. */
87 #define BOL 1 /* no Match "" at beginning of line. */
88 #define EOL 2 /* no Match "" at end of line. */
89 #define ANY 3 /* no Match any one character. */
90 #define ANYOF 4 /* str Match any character in this string. */
91 #define ANYBUT 5 /* str Match any character not in this string. */
92 #define BRANCH 6 /* node Match this alternative, or the next... */
93 #define BACK 7 /* no Match "", "next" ptr points backward. */
94 #define EXACTLY 8 /* str Match this string. */
95 #define NOTHING 9 /* no Match empty string. */
96 #define REP 10 /* max,min Match this (simple) thing [min,max] times. */
97 #define REPMIN 11 /* max,min Match this (simple) thing [min,max] times, minimal match. */
98 #define REPX 12 /* max,min Match this (complex) thing [min,max] times. */
99 #define REPXMIN 13 /* max,min Match this (complex) thing [min,max] times, minimal match. */
101 #define WORDA 15 /* no Match "" at wordchar, where prev is nonword */
102 #define WORDZ 16 /* no Match "" at nonwordchar, where prev is word */
104 #define OPENNC 1000 /* no Non-capturing parentheses - must be OPEN-1 */
105 #define OPEN 1001 /* no Mark this point in input as start of #n. */
106 /* OPEN+1 is number 1, etc. */
108 /* must not be any other opts between OPEN and CLOSE */
110 #define CLOSENC 2000 /* no Non-capturing parentheses - must be CLOSE-1 */
111 #define CLOSE 2001 /* no Analogous to OPEN. */
112 #define CLOSE_END (CLOSE+REG_MAX_PAREN)
115 * The first word of the regexp internal "program" is actually this magic
116 * number; the start node begins in the second word.
118 #define REG_MAGIC 0xFADED00D
123 * BRANCH The set of branches constituting a single choice are hooked
124 * together with their "next" pointers, since precedence prevents
125 * anything being concatenated to any individual branch. The
126 * "next" pointer of the last BRANCH in a choice points to the
127 * thing following the whole choice. This is also where the
128 * final "next" pointer of each individual branch points; each
129 * branch starts with the operand node of a BRANCH node.
131 * BACK Normal "next" pointers all implicitly point forward; BACK
132 * exists to make loop structures possible.
134 * REP,REPX Repeated matches ('?', '*', '+' and {min,max}) are implemented
135 * as either simple repeats (REP) or complex repeats (REPX).
136 * These opcodes include a "min" and "max" count after the opcode.
137 * This is followed by a fourth "current count" word that is
138 * only used by REPX, as it implements a recursive match.
139 * REPMIN and REPXMIN are identical except they implement minimal repeats.
141 * OPEN,CLOSE ...are numbered at compile time.
145 * A node is one word of opcode followed by one word of "next" pointer.
146 * The "next" pointer value is a positive offset from the opcode of the node
148 * An operand, if any, simply follows the node. (Note that much of the
149 * code generation knows about this implicit relationship.)
151 #define OP(preg, p) (preg->program[p])
152 #define NEXT(preg, p) (preg->program[p + 1])
153 #define OPERAND(p) ((p) + 2)
156 * See regmagic.h for one further detail of program structure.
161 * Utility definitions.
164 #define FAIL(R,M) { (R)->err = (M); return (M); }
165 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
166 #define META "^$.[()|?{+*"
169 * Flags to be passed up and down.
171 #define HASWIDTH 1 /* Known never to match null string. */
172 #define SIMPLE 2 /* Simple enough to be STAR/PLUS operand. */
173 #define SPSTART 4 /* Starts with * or +. */
174 #define WORST 0 /* Worst case. */
176 #define MAX_REP_COUNT 1000000
179 * Forward declarations for regcomp()'s friends.
181 static int reg(regex_t
*preg
, int paren
/* Parenthesized? */, int *flagp
);
182 static int regpiece(regex_t
*preg
, int *flagp
);
183 static int regbranch(regex_t
*preg
, int *flagp
);
184 static int regatom(regex_t
*preg
, int *flagp
);
185 static int regnode(regex_t
*preg
, int op
);
186 static int regnext(regex_t
*preg
, int p
);
187 static void regc(regex_t
*preg
, int b
);
188 static int reginsert(regex_t
*preg
, int op
, int size
, int opnd
);
189 static void regtail(regex_t
*preg
, int p
, int val
);
190 static void regoptail(regex_t
*preg
, int p
, int val
);
191 static int regopsize(regex_t
*preg
, int p
);
193 static int reg_range_find(const int *string
, int c
);
194 static const char *str_find(const char *string
, int c
, int nocase
);
195 static int prefix_cmp(const int *prog
, int proglen
, const char *string
, int nocase
);
199 static int regnarrate
= 0;
200 static void regdump(regex_t
*preg
);
201 static const char *regprop( int op
);
206 * Returns the length of the null-terminated integer sequence.
208 static int str_int_len(const int *seq
)
218 - regcomp - compile a regular expression into internal code
220 * We can't allocate space until we know how big the compiled form will be,
221 * but we can't compile it (and thus know how big it is) until we've got a
222 * place to put the code. So we cheat: we compile it twice, once with code
223 * generation turned off and size counting turned on, and once "for real".
224 * This also means that we don't allocate space until we are sure that the
225 * thing really will compile successfully, and we never have to move the
226 * code and thus invalidate pointers into it. (Note that it has to be in
227 * one piece because free() must be able to free it all.)
229 * Beware that the optimization-preparation code in here knows about some
230 * of the structure of the compiled regexp.
232 int regcomp(regex_t
*preg
, const char *exp
, int cflags
)
240 fprintf(stderr
, "Compiling: '%s'\n", exp
);
242 memset(preg
, 0, sizeof(*preg
));
245 FAIL(preg
, REG_ERR_NULL_ARGUMENT
);
247 /* First pass: determine size, legality. */
248 preg
->cflags
= cflags
;
249 preg
->regparse
= exp
;
251 /* Allocate space. */
252 preg
->proglen
= (strlen(exp
) + 1) * 5;
253 preg
->program
= malloc(preg
->proglen
* sizeof(int));
254 if (preg
->program
== NULL
)
255 FAIL(preg
, REG_ERR_NOMEM
);
257 /* Note that since we store a magic value as the first item in the program,
258 * program offsets will never be 0
260 regc(preg
, REG_MAGIC
);
261 if (reg(preg
, 0, &flags
) == 0) {
265 /* Small enough for pointer-storage convention? */
266 if (preg
->re_nsub
>= REG_MAX_PAREN
) /* Probably could be 65535L. */
267 FAIL(preg
,REG_ERR_TOO_BIG
);
269 /* Dig out information for optimizations. */
270 preg
->regstart
= 0; /* Worst-case defaults. */
274 scan
= 1; /* First BRANCH. */
275 if (OP(preg
, regnext(preg
, scan
)) == END
) { /* Only one top-level choice. */
276 scan
= OPERAND(scan
);
278 /* Starting-point info. */
279 if (OP(preg
, scan
) == EXACTLY
) {
280 preg
->regstart
= preg
->program
[OPERAND(scan
)];
282 else if (OP(preg
, scan
) == BOL
)
286 * If there's something expensive in the r.e., find the
287 * longest literal string that must appear and make it the
288 * regmust. Resolve ties in favor of later strings, since
289 * the regstart check works with the beginning of the r.e.
290 * and avoiding duplication strengthens checking. Not a
291 * strong reason, but sufficient in the absence of others.
296 for (; scan
!= 0; scan
= regnext(preg
, scan
)) {
297 if (OP(preg
, scan
) == EXACTLY
) {
298 int plen
= str_int_len(preg
->program
+ OPERAND(scan
));
300 longest
= OPERAND(scan
);
305 preg
->regmust
= longest
;
318 - reg - regular expression, i.e. main body or parenthesized thing
320 * Caller must absorb opening parenthesis.
322 * Combining parenthesis handling with the base level of regular expression
323 * is a trifle forced, but the need to tie the tails of the branches to what
324 * follows makes it hard to avoid.
326 static int reg(regex_t
*preg
, int paren
/* Parenthesized? */, int *flagp
)
334 *flagp
= HASWIDTH
; /* Tentatively. */
336 /* Make an OPEN node, if parenthesized. */
338 if (preg
->regparse
[0] == '?' && preg
->regparse
[1] == ':') {
339 /* non-capturing paren */
344 parno
= ++preg
->re_nsub
;
346 ret
= regnode(preg
, OPEN
+parno
);
350 /* Pick up the branches, linking them together. */
351 br
= regbranch(preg
, &flags
);
355 regtail(preg
, ret
, br
); /* OPEN -> first. */
358 if (!(flags
&HASWIDTH
))
360 *flagp
|= flags
&SPSTART
;
361 while (*preg
->regparse
== '|') {
363 br
= regbranch(preg
, &flags
);
366 regtail(preg
, ret
, br
); /* BRANCH -> BRANCH. */
367 if (!(flags
&HASWIDTH
))
369 *flagp
|= flags
&SPSTART
;
372 /* Make a closing node, and hook it on the end. */
373 ender
= regnode(preg
, (paren
) ? CLOSE
+parno
: END
);
374 regtail(preg
, ret
, ender
);
376 /* Hook the tails of the branches to the closing node. */
377 for (br
= ret
; br
!= 0; br
= regnext(preg
, br
))
378 regoptail(preg
, br
, ender
);
380 /* Check for proper termination. */
381 if (paren
&& *preg
->regparse
++ != ')') {
382 preg
->err
= REG_ERR_UNMATCHED_PAREN
;
384 } else if (!paren
&& *preg
->regparse
!= '\0') {
385 if (*preg
->regparse
== ')') {
386 preg
->err
= REG_ERR_UNMATCHED_PAREN
;
389 preg
->err
= REG_ERR_JUNK_ON_END
;
398 - regbranch - one alternative of an | operator
400 * Implements the concatenation operator.
402 static int regbranch(regex_t
*preg
, int *flagp
)
409 *flagp
= WORST
; /* Tentatively. */
411 ret
= regnode(preg
, BRANCH
);
413 while (*preg
->regparse
!= '\0' && *preg
->regparse
!= ')' &&
414 *preg
->regparse
!= '|') {
415 latest
= regpiece(preg
, &flags
);
418 *flagp
|= flags
&HASWIDTH
;
419 if (chain
== 0) {/* First piece. */
420 *flagp
|= flags
&SPSTART
;
423 regtail(preg
, chain
, latest
);
427 if (chain
== 0) /* Loop ran zero times. */
428 (void) regnode(preg
, NOTHING
);
434 - regpiece - something followed by possible [*+?]
436 * Note that the branching code sequences used for ? and the general cases
437 * of * and + are somewhat optimized: they use the same NOTHING node as
438 * both the endmarker for their branch list and the body of the last branch.
439 * It might seem that this node could be dispensed with entirely, but the
440 * endmarker role is not redundant.
442 static int regpiece(regex_t
*preg
, int *flagp
)
452 ret
= regatom(preg
, &flags
);
456 op
= *preg
->regparse
;
462 if (!(flags
&HASWIDTH
) && op
!= '?') {
463 preg
->err
= REG_ERR_OPERAND_COULD_BE_EMPTY
;
467 /* Handle braces (counted repetition) by expansion */
471 min
= strtoul(preg
->regparse
+ 1, &end
, 10);
472 if (end
== preg
->regparse
+ 1) {
473 preg
->err
= REG_ERR_BAD_COUNT
;
480 preg
->regparse
= end
;
481 max
= strtoul(preg
->regparse
+ 1, &end
, 10);
483 preg
->err
= REG_ERR_UNMATCHED_BRACES
;
487 if (end
== preg
->regparse
+ 1) {
490 else if (max
< min
|| max
>= 100) {
491 preg
->err
= REG_ERR_BAD_COUNT
;
495 preg
->err
= REG_ERR_BAD_COUNT
;
499 preg
->regparse
= strchr(preg
->regparse
, '}');
503 max
= (op
== '?' ? 1 : MAX_REP_COUNT
);
506 if (preg
->regparse
[1] == '?') {
508 next
= reginsert(preg
, flags
& SIMPLE
? REPMIN
: REPXMIN
, 5, ret
);
511 next
= reginsert(preg
, flags
& SIMPLE
? REP
: REPX
, 5, ret
);
513 preg
->program
[ret
+ 2] = max
;
514 preg
->program
[ret
+ 3] = min
;
515 preg
->program
[ret
+ 4] = 0;
517 *flagp
= (min
) ? (WORST
|HASWIDTH
) : (WORST
|SPSTART
);
519 if (!(flags
& SIMPLE
)) {
520 int back
= regnode(preg
, BACK
);
521 regtail(preg
, back
, ret
);
522 regtail(preg
, next
, back
);
526 if (ISMULT(*preg
->regparse
)) {
527 preg
->err
= REG_ERR_NESTED_COUNT
;
531 return chain
? chain
: ret
;
535 * Add all characters in the inclusive range between lower and upper.
537 * Handles a swapped range (upper < lower).
539 static void reg_addrange(regex_t
*preg
, int lower
, int upper
)
542 reg_addrange(preg
, upper
, lower
);
544 /* Add a range as length, start */
545 regc(preg
, upper
- lower
+ 1);
550 * Add a null-terminated literal string as a set of ranges.
552 static void reg_addrange_str(regex_t
*preg
, const char *str
)
555 reg_addrange(preg
, *str
, *str
);
561 * Extracts the next unicode char from utf8.
563 * If 'upper' is set, converts the char to uppercase.
565 static int reg_utf8_tounicode_case(const char *s
, int *uc
, int upper
)
567 int l
= utf8_tounicode(s
, uc
);
569 *uc
= utf8_upper(*uc
);
575 * Converts a hex digit to decimal.
577 * Returns -1 for an invalid hex digit.
579 static int hexdigitval(int c
)
581 if (c
>= '0' && c
<= '9')
583 if (c
>= 'a' && c
<= 'f')
585 if (c
>= 'A' && c
<= 'F')
591 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
593 * Returns the number of hex digits parsed.
594 * If there are no hex digits, returns 0 and stores nothing.
596 static int parse_hex(const char *s
, int n
, int *uc
)
601 for (k
= 0; k
< n
; k
++) {
602 int c
= hexdigitval(*s
++);
606 val
= (val
<< 4) | c
;
615 * Call for chars after a backlash to decode the escape sequence.
617 * Stores the result in *ch.
619 * Returns the number of bytes consumed.
621 static int reg_decode_escape(const char *s
, int *ch
)
629 case 'b': *ch
= '\b'; break;
630 case 'e': *ch
= 27; break;
631 case 'f': *ch
= '\f'; break;
632 case 'n': *ch
= '\n'; break;
633 case 'r': *ch
= '\r'; break;
634 case 't': *ch
= '\t'; break;
635 case 'v': *ch
= '\v'; break;
638 /* Expect \u{NNNN} */
639 n
= parse_hex(s
+ 1, 6, ch
);
640 if (n
> 0 && s
[n
+ 1] == '}' && *ch
>= 0 && *ch
<= 0x1fffff) {
644 /* Invalid, so just treat as an escaped 'u' */
648 else if ((n
= parse_hex(s
, 4, ch
)) > 0) {
653 if ((n
= parse_hex(s
, 8, ch
)) > 0) {
657 if ((n
= parse_hex(s
, 2, ch
)) > 0) {
670 - regatom - the lowest level
672 * Optimization: gobbles an entire sequence of ordinary characters so that
673 * it can turn them into a single node, which is smaller to store and
674 * faster to run. Backslashed characters are exceptions, each becoming a
675 * separate node; the code is simpler that way and it's not worth fixing.
677 static int regatom(regex_t
*preg
, int *flagp
)
681 int nocase
= (preg
->cflags
& REG_ICASE
);
684 int n
= reg_utf8_tounicode_case(preg
->regparse
, &ch
, nocase
);
686 *flagp
= WORST
; /* Tentatively. */
690 /* FIXME: these chars only have meaning at beg/end of pat? */
692 ret
= regnode(preg
, BOL
);
695 ret
= regnode(preg
, EOL
);
698 ret
= regnode(preg
, ANY
);
699 *flagp
|= HASWIDTH
|SIMPLE
;
702 const char *pattern
= preg
->regparse
;
704 if (*pattern
== '^') { /* Complement of range. */
705 ret
= regnode(preg
, ANYBUT
);
708 ret
= regnode(preg
, ANYOF
);
710 /* Special case. If the first char is ']' or '-', it is part of the set */
711 if (*pattern
== ']' || *pattern
== '-') {
712 reg_addrange(preg
, *pattern
, *pattern
);
716 while (*pattern
&& *pattern
!= ']') {
717 /* Is this a range? a-z */
721 pattern
+= reg_utf8_tounicode_case(pattern
, &start
, nocase
);
723 pattern
+= reg_decode_escape(pattern
, &start
);
725 preg
->err
= REG_ERR_NULL_CHAR
;
729 if (pattern
[0] == '-' && pattern
[1] && pattern
[1] != ']') {
731 pattern
+= utf8_tounicode(pattern
, &end
);
732 pattern
+= reg_utf8_tounicode_case(pattern
, &end
, nocase
);
734 pattern
+= reg_decode_escape(pattern
, &end
);
736 preg
->err
= REG_ERR_NULL_CHAR
;
741 reg_addrange(preg
, start
, end
);
745 if (strncmp(pattern
, ":alpha:]", 8) == 0) {
746 if ((preg
->cflags
& REG_ICASE
) == 0) {
747 reg_addrange(preg
, 'a', 'z');
749 reg_addrange(preg
, 'A', 'Z');
753 if (strncmp(pattern
, ":alnum:]", 8) == 0) {
754 if ((preg
->cflags
& REG_ICASE
) == 0) {
755 reg_addrange(preg
, 'a', 'z');
757 reg_addrange(preg
, 'A', 'Z');
758 reg_addrange(preg
, '0', '9');
762 if (strncmp(pattern
, ":space:]", 8) == 0) {
763 reg_addrange_str(preg
, " \t\r\n\f\v");
768 /* Not a range, so just add the char */
769 reg_addrange(preg
, start
, start
);
776 preg
->regparse
= pattern
;
778 *flagp
|= HASWIDTH
|SIMPLE
;
782 ret
= reg(preg
, 1, &flags
);
785 *flagp
|= flags
&(HASWIDTH
|SPSTART
);
790 preg
->err
= REG_ERR_INTERNAL
;
791 return 0; /* Supposed to be caught earlier. */
796 preg
->err
= REG_ERR_COUNT_FOLLOWS_NOTHING
;
799 switch (*preg
->regparse
++) {
801 preg
->err
= REG_ERR_TRAILING_BACKSLASH
;
805 ret
= regnode(preg
, WORDA
);
809 ret
= regnode(preg
, WORDZ
);
812 ret
= regnode(preg
, ANYOF
);
813 reg_addrange(preg
, '0', '9');
815 *flagp
|= HASWIDTH
|SIMPLE
;
818 ret
= regnode(preg
, ANYOF
);
819 if ((preg
->cflags
& REG_ICASE
) == 0) {
820 reg_addrange(preg
, 'a', 'z');
822 reg_addrange(preg
, 'A', 'Z');
823 reg_addrange(preg
, '0', '9');
824 reg_addrange(preg
, '_', '_');
826 *flagp
|= HASWIDTH
|SIMPLE
;
829 ret
= regnode(preg
, ANYOF
);
830 reg_addrange_str(preg
," \t\r\n\f\v");
832 *flagp
|= HASWIDTH
|SIMPLE
;
834 /* FIXME: Someday handle \1, \2, ... */
836 /* Handle general quoted chars in exact-match routine */
837 /* Back up to include the backslash */
845 * Encode a string of characters to be matched exactly.
849 /* Back up to pick up the first char of interest */
852 ret
= regnode(preg
, EXACTLY
);
854 /* Note that a META operator such as ? or * consumes the
856 * Thus we must be careful to look ahead by 2 and add the
857 * last char as it's own EXACTLY if necessary
860 /* Until end of string or a META char is reached */
861 while (*preg
->regparse
&& strchr(META
, *preg
->regparse
) == NULL
) {
862 n
= reg_utf8_tounicode_case(preg
->regparse
, &ch
, (preg
->cflags
& REG_ICASE
));
863 if (ch
== '\\' && preg
->regparse
[n
]) {
864 /* Non-trailing backslash.
865 * Is this a special escape, or a regular escape?
867 if (strchr("<>mMwds", preg
->regparse
[n
])) {
868 /* A special escape. All done with EXACTLY */
871 /* Decode it. Note that we add the length for the escape
872 * sequence to the length for the backlash so we can skip
873 * the entire sequence, or not as required.
875 n
+= reg_decode_escape(preg
->regparse
+ n
, &ch
);
877 preg
->err
= REG_ERR_NULL_CHAR
;
882 /* Now we have one char 'ch' of length 'n'.
883 * Check to see if the following char is a MULT
886 if (ISMULT(preg
->regparse
[n
])) {
887 /* Yes. But do we already have some EXACTLY chars? */
889 /* Yes, so return what we have and pick up the current char next time around */
892 /* No, so add this single char and finish */
899 /* No, so just add this char normally */
917 static void reg_grow(regex_t
*preg
, int n
)
919 if (preg
->p
+ n
>= preg
->proglen
) {
920 preg
->proglen
= (preg
->p
+ n
) * 2;
921 preg
->program
= realloc(preg
->program
, preg
->proglen
* sizeof(int));
926 - regnode - emit a node
929 static int regnode(regex_t
*preg
, int op
)
933 /* The OP followed by a next pointer */
934 preg
->program
[preg
->p
++] = op
;
935 preg
->program
[preg
->p
++] = 0;
937 /* Return the start of the node */
942 - regc - emit (if appropriate) a byte of code
944 static void regc(regex_t
*preg
, int b
)
947 preg
->program
[preg
->p
++] = b
;
951 - reginsert - insert an operator in front of already-emitted operand
953 * Means relocating the operand.
954 * Returns the new location of the original operand.
956 static int reginsert(regex_t
*preg
, int op
, int size
, int opnd
)
958 reg_grow(preg
, size
);
960 /* Move everything from opnd up */
961 memmove(preg
->program
+ opnd
+ size
, preg
->program
+ opnd
, sizeof(int) * (preg
->p
- opnd
));
962 /* Zero out the new space */
963 memset(preg
->program
+ opnd
, 0, sizeof(int) * size
);
965 preg
->program
[opnd
] = op
;
973 - regtail - set the next-pointer at the end of a node chain
975 static void regtail(regex_t
*preg
, int p
, int val
)
981 /* Find last node. */
984 temp
= regnext(preg
, scan
);
990 if (OP(preg
, scan
) == BACK
)
995 preg
->program
[scan
+ 1] = offset
;
999 - regoptail - regtail on operand of first argument; nop if operandless
1002 static void regoptail(regex_t
*preg
, int p
, int val
)
1004 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1005 if (p
!= 0 && OP(preg
, p
) == BRANCH
) {
1006 regtail(preg
, OPERAND(p
), val
);
1011 * regexec and friends
1017 static int regtry(regex_t
*preg
, const char *string
);
1018 static int regmatch(regex_t
*preg
, int prog
);
1019 static int regrepeat(regex_t
*preg
, int p
, int max
);
1022 - regexec - match a regexp against a string
1024 int regexec(regex_t
*preg
, const char *string
, size_t nmatch
, regmatch_t pmatch
[], int eflags
)
1029 /* Be paranoid... */
1030 if (preg
== NULL
|| preg
->program
== NULL
|| string
== NULL
) {
1031 return REG_ERR_NULL_ARGUMENT
;
1034 /* Check validity of program. */
1035 if (*preg
->program
!= REG_MAGIC
) {
1036 return REG_ERR_CORRUPTED
;
1040 fprintf(stderr
, "regexec: %s\n", string
);
1044 preg
->eflags
= eflags
;
1045 preg
->pmatch
= pmatch
;
1046 preg
->nmatch
= nmatch
;
1047 preg
->start
= string
; /* All offsets are computed from here */
1049 /* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */
1050 for (scan
= OPERAND(1); scan
!= 0; scan
+= regopsize(preg
, scan
)) {
1051 int op
= OP(preg
, scan
);
1054 if (op
== REPX
|| op
== REPXMIN
)
1055 preg
->program
[scan
+ 4] = 0;
1058 /* If there is a "must appear" string, look for it. */
1059 if (preg
->regmust
!= 0) {
1061 while ((s
= str_find(s
, preg
->program
[preg
->regmust
], preg
->cflags
& REG_ICASE
)) != NULL
) {
1062 if (prefix_cmp(preg
->program
+ preg
->regmust
, preg
->regmlen
, s
, preg
->cflags
& REG_ICASE
) >= 0) {
1067 if (s
== NULL
) /* Not present. */
1071 /* Mark beginning of line for ^ . */
1072 preg
->regbol
= string
;
1074 /* Simplest case: anchored match need be tried only once (maybe per line). */
1075 if (preg
->reganch
) {
1076 if (eflags
& REG_NOTBOL
) {
1077 /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1081 if (regtry(preg
, string
)) {
1086 if (preg
->cflags
& REG_NEWLINE
) {
1087 /* Try the next anchor? */
1088 string
= strchr(string
, '\n');
1090 preg
->regbol
= ++string
;
1099 /* Messy cases: unanchored match. */
1101 if (preg
->regstart
!= '\0') {
1102 /* We know what char it must start with. */
1103 while ((s
= str_find(s
, preg
->regstart
, preg
->cflags
& REG_ICASE
)) != NULL
) {
1104 if (regtry(preg
, s
))
1110 /* We don't -- general case. */
1112 if (regtry(preg
, s
))
1119 s
+= utf8_tounicode(s
, &c
);
1128 - regtry - try match at specific point
1130 /* 0 failure, 1 success */
1131 static int regtry( regex_t
*preg
, const char *string
)
1135 preg
->reginput
= string
;
1137 for (i
= 0; i
< preg
->nmatch
; i
++) {
1138 preg
->pmatch
[i
].rm_so
= -1;
1139 preg
->pmatch
[i
].rm_eo
= -1;
1141 if (regmatch(preg
, 1)) {
1142 preg
->pmatch
[0].rm_so
= string
- preg
->start
;
1143 preg
->pmatch
[0].rm_eo
= preg
->reginput
- preg
->start
;
1150 * Returns bytes matched if 'pattern' is a prefix of 'string'.
1152 * If 'nocase' is non-zero, does a case-insensitive match.
1154 * Returns -1 on not found.
1156 static int prefix_cmp(const int *prog
, int proglen
, const char *string
, int nocase
)
1158 const char *s
= string
;
1159 while (proglen
&& *s
) {
1161 int n
= reg_utf8_tounicode_case(s
, &ch
, nocase
);
1176 * Searchs for 'c' in the range 'range'.
1178 * Returns 1 if found, or 0 if not.
1180 static int reg_range_find(const int *range
, int c
)
1183 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1184 if (c
>= range
[1] && c
<= (range
[0] + range
[1] - 1)) {
1193 * Search for the character 'c' in the utf-8 string 'string'.
1195 * If 'nocase' is set, the 'string' is assumed to be uppercase
1196 * and 'c' is converted to uppercase before matching.
1198 * Returns the byte position in the string where the 'c' was found, or
1199 * NULL if not found.
1201 static const char *str_find(const char *string
, int c
, int nocase
)
1204 /* The "string" should already be converted to uppercase */
1209 int n
= reg_utf8_tounicode_case(string
, &ch
, nocase
);
1219 * Returns true if 'ch' is an end-of-line char.
1221 * In REG_NEWLINE mode, \n is considered EOL in
1224 static int reg_iseol(regex_t
*preg
, int ch
)
1226 if (preg
->cflags
& REG_NEWLINE
) {
1227 return ch
== '\0' || ch
== '\n';
1234 static int regmatchsimplerepeat(regex_t
*preg
, int scan
, int matchmin
)
1241 int max
= preg
->program
[scan
+ 2];
1242 int min
= preg
->program
[scan
+ 3];
1243 int next
= regnext(preg
, scan
);
1246 * Lookahead to avoid useless match attempts
1247 * when we know what character comes next.
1249 if (OP(preg
, next
) == EXACTLY
) {
1250 nextch
= preg
->program
[OPERAND(next
)];
1252 save
= preg
->reginput
;
1253 no
= regrepeat(preg
, scan
+ 5, max
);
1258 /* from min up to no */
1262 /* else from no down to min */
1274 preg
->reginput
= save
+ utf8_index(save
, no
);
1275 reg_utf8_tounicode_case(preg
->reginput
, &c
, (preg
->cflags
& REG_ICASE
));
1276 /* If it could work, try it. */
1277 if (reg_iseol(preg
, nextch
) || c
== nextch
) {
1278 if (regmatch(preg
, next
)) {
1283 /* Couldn't or didn't, add one more */
1287 /* Couldn't or didn't -- back up. */
1294 static int regmatchrepeat(regex_t
*preg
, int scan
, int matchmin
)
1296 int *scanpt
= preg
->program
+ scan
;
1298 int max
= scanpt
[2];
1299 int min
= scanpt
[3];
1301 /* Have we reached min? */
1302 if (scanpt
[4] < min
) {
1303 /* No, so get another one */
1305 if (regmatch(preg
, scan
+ 5)) {
1311 if (scanpt
[4] > max
) {
1316 /* minimal, so try other branch first */
1317 if (regmatch(preg
, regnext(preg
, scan
))) {
1320 /* No, so try one more */
1322 if (regmatch(preg
, scan
+ 5)) {
1328 /* maximal, so try this branch again */
1329 if (scanpt
[4] < max
) {
1331 if (regmatch(preg
, scan
+ 5)) {
1336 /* At this point we are at max with no match. Try the other branch */
1337 return regmatch(preg
, regnext(preg
, scan
));
1341 - regmatch - main matching routine
1343 * Conceptually the strategy is simple: check to see whether the current
1344 * node matches, call self recursively to see whether the rest matches,
1345 * and then act accordingly. In practice we make some effort to avoid
1346 * recursion, in particular by going through "ordinary" nodes (that don't
1347 * need to know whether the rest of the match failed) by a loop instead of
1350 /* 0 failure, 1 success */
1351 static int regmatch(regex_t
*preg
, int prog
)
1353 int scan
; /* Current node. */
1354 int next
; /* Next node. */
1360 if (scan
!= 0 && regnarrate
)
1361 fprintf(stderr
, "%s(\n", regprop(scan
));
1368 fprintf(stderr
, "%3d: %s...\n", scan
, regprop(OP(preg
, scan
))); /* Where, what. */
1371 next
= regnext(preg
, scan
);
1372 n
= reg_utf8_tounicode_case(preg
->reginput
, &c
, (preg
->cflags
& REG_ICASE
));
1374 switch (OP(preg
, scan
)) {
1376 if (preg
->reginput
!= preg
->regbol
)
1380 if (!reg_iseol(preg
, c
)) {
1385 /* Must be looking at a letter, digit, or _ */
1386 if ((!isalnum(UCHAR(c
))) && c
!= '_')
1388 /* Prev must be BOL or nonword */
1389 if (preg
->reginput
> preg
->regbol
&&
1390 (isalnum(UCHAR(preg
->reginput
[-1])) || preg
->reginput
[-1] == '_'))
1394 /* Can't match at BOL */
1395 if (preg
->reginput
> preg
->regbol
) {
1396 /* Current must be EOL or nonword */
1397 if (reg_iseol(preg
, c
) || !isalnum(UCHAR(c
)) || c
!= '_') {
1398 c
= preg
->reginput
[-1];
1399 /* Previous must be word */
1400 if (isalnum(UCHAR(c
)) || c
== '_') {
1409 if (reg_iseol(preg
, c
))
1411 preg
->reginput
+= n
;
1418 opnd
= OPERAND(scan
);
1419 len
= str_int_len(preg
->program
+ opnd
);
1421 slen
= prefix_cmp(preg
->program
+ opnd
, len
, preg
->reginput
, preg
->cflags
& REG_ICASE
);
1425 preg
->reginput
+= slen
;
1429 if (reg_iseol(preg
, c
) || reg_range_find(preg
->program
+ OPERAND(scan
), c
) == 0) {
1432 preg
->reginput
+= n
;
1435 if (reg_iseol(preg
, c
) || reg_range_find(preg
->program
+ OPERAND(scan
), c
) != 0) {
1438 preg
->reginput
+= n
;
1445 if (OP(preg
, next
) != BRANCH
) /* No choice. */
1446 next
= OPERAND(scan
); /* Avoid recursion. */
1449 save
= preg
->reginput
;
1450 if (regmatch(preg
, OPERAND(scan
))) {
1453 preg
->reginput
= save
;
1454 scan
= regnext(preg
, scan
);
1455 } while (scan
!= 0 && OP(preg
, scan
) == BRANCH
);
1462 return regmatchsimplerepeat(preg
, scan
, OP(preg
, scan
) == REPMIN
);
1466 return regmatchrepeat(preg
, scan
, OP(preg
, scan
) == REPXMIN
);
1469 return 1; /* Success! */
1473 return regmatch(preg
, next
);
1476 if (OP(preg
, scan
) >= OPEN
+1 && OP(preg
, scan
) < CLOSE_END
) {
1477 save
= preg
->reginput
;
1478 if (regmatch(preg
, next
)) {
1479 if (OP(preg
, scan
) < CLOSE
) {
1480 int no
= OP(preg
, scan
) - OPEN
;
1481 if (no
< preg
->nmatch
&& preg
->pmatch
[no
].rm_so
== -1) {
1482 preg
->pmatch
[no
].rm_so
= save
- preg
->start
;
1486 int no
= OP(preg
, scan
) - CLOSE
;
1487 if (no
< preg
->nmatch
&& preg
->pmatch
[no
].rm_eo
== -1) {
1488 preg
->pmatch
[no
].rm_eo
= save
- preg
->start
;
1495 return REG_ERR_INTERNAL
;
1502 * We get here only if there's trouble -- normally "case END" is
1503 * the terminating point.
1505 return REG_ERR_INTERNAL
;
1509 - regrepeat - repeatedly match something simple, report how many
1511 static int regrepeat(regex_t
*preg
, int p
, int max
)
1519 scan
= preg
->reginput
;
1521 switch (OP(preg
, p
)) {
1523 /* No need to handle utf8 specially here */
1524 while (!reg_iseol(preg
, *scan
) && count
< max
) {
1530 while (count
< max
) {
1531 n
= reg_utf8_tounicode_case(scan
, &ch
, preg
->cflags
& REG_ICASE
);
1532 if (preg
->program
[opnd
] != ch
) {
1540 while (count
< max
) {
1541 n
= reg_utf8_tounicode_case(scan
, &ch
, preg
->cflags
& REG_ICASE
);
1542 if (reg_iseol(preg
, ch
) || reg_range_find(preg
->program
+ opnd
, ch
) == 0) {
1550 while (count
< max
) {
1551 n
= reg_utf8_tounicode_case(scan
, &ch
, preg
->cflags
& REG_ICASE
);
1552 if (reg_iseol(preg
, ch
) || reg_range_find(preg
->program
+ opnd
, ch
) != 0) {
1559 default: /* Oh dear. Called inappropriately. */
1560 preg
->err
= REG_ERR_INTERNAL
;
1561 count
= 0; /* Best compromise. */
1564 preg
->reginput
= scan
;
1570 - regnext - dig the "next" pointer out of a node
1572 static int regnext(regex_t
*preg
, int p
)
1576 offset
= NEXT(preg
, p
);
1581 if (OP(preg
, p
) == BACK
)
1588 - regopsize - returns the size of opcode + operands at 'p' in words
1590 static int regopsize(regex_t
*preg
, int p
)
1592 /* Almost all opcodes are 2 words, but some are more */
1593 switch (OP(preg
, p
)) {
1604 while (preg
->program
[s
++]) {
1612 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1615 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1617 static void regdump(regex_t
*preg
)
1620 int op
= EXACTLY
; /* Arbitrary non-END op. */
1622 char buf
[MAX_UTF8_LEN
+ 1];
1625 for (i
= 1; i
< preg
->p
; i
++) {
1626 printf("%02x ", (unsigned char)preg
->program
[i
]);
1634 while (op
!= END
&& s
< preg
->p
) { /* While that wasn't END last time... */
1636 printf("%3d: %s", s
, regprop(op
)); /* Where, what. */
1637 next
= regnext(preg
, s
);
1638 if (next
== 0) /* Next ptr. */
1641 printf("(%d)", next
);
1643 if (op
== REP
|| op
== REPMIN
|| op
== REPX
|| op
== REPXMIN
) {
1644 int max
= preg
->program
[s
];
1645 int min
= preg
->program
[s
+ 1];
1647 printf("{%d,*}", min
);
1650 printf("{%d,%d}", min
, max
);
1652 printf(" %d", preg
->program
[s
+ 2]);
1655 else if (op
== ANYOF
|| op
== ANYBUT
) {
1658 while (preg
->program
[s
]) {
1659 int len
= preg
->program
[s
++];
1660 int first
= preg
->program
[s
++];
1661 buf
[utf8_getchars(buf
, first
)] = 0;
1664 buf
[utf8_getchars(buf
, first
+ len
- 1)] = 0;
1670 else if (op
== EXACTLY
) {
1671 /* Literal string, where present. */
1673 while (preg
->program
[s
]) {
1674 buf
[utf8_getchars(buf
, preg
->program
[s
])] = 0;
1684 /* Header fields of interest. */
1685 if (preg
->regstart
) {
1686 buf
[utf8_getchars(buf
, preg
->regstart
)] = 0;
1687 printf("start '%s' ", buf
);
1690 printf("anchored ");
1691 if (preg
->regmust
!= 0) {
1693 printf("must have:");
1694 for (i
= 0; i
< preg
->regmlen
; i
++) {
1695 putchar(preg
->program
[preg
->regmust
+ i
]);
1704 - regprop - printable representation of opcode
1706 static const char *regprop( int op
)
1708 static char buf
[50];
1748 if (op
>= OPEN
&& op
< CLOSE
) {
1749 snprintf(buf
, sizeof(buf
), "OPEN%d", op
-OPEN
);
1751 else if (op
>= CLOSE
&& op
< CLOSE_END
) {
1752 snprintf(buf
, sizeof(buf
), "CLOSE%d", op
-CLOSE
);
1755 snprintf(buf
, sizeof(buf
), "?%d?\n", op
);
1760 #endif /* JIM_BOOTSTRAP */
1762 size_t regerror(int errcode
, const regex_t
*preg
, char *errbuf
, size_t errbuf_size
)
1764 static const char *error_strings
[] = {
1773 "parentheses () not balanced",
1774 "braces {} not balanced",
1775 "invalid repetition count(s)",
1780 "count follows nothing",
1781 "trailing backslash",
1782 "corrupted program",
1783 "contains null char",
1787 if (errcode
< 0 || errcode
>= REG_ERR_NUM
) {
1788 err
= "Bad error code";
1791 err
= error_strings
[errcode
];
1794 return snprintf(errbuf
, errbuf_size
, "%s", err
);
1797 void regfree(regex_t
*preg
)
1799 free(preg
->program
);