2 * regcomp and regexec -- regsub and regerror are elsewhere
4 * Copyright (c) 1986 by University of Toronto.
5 * Written by Henry Spencer. Not derived from licensed software.
7 * Permission is granted to anyone to use this software for any
8 * purpose on any computer system, and to redistribute it freely,
9 * subject to the following restrictions:
11 * 1. The author is not responsible for the consequences of use of
12 * this software, no matter how awful, even if they arise
15 * 2. The origin of this software must not be misrepresented, either
16 * by explicit claim or by omission.
18 * 3. Altered versions must be plainly marked as such, and must not
19 * be misrepresented as being the original software.
20 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
21 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22 *** to assist in implementing egrep.
23 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
24 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25 *** as in BSD grep and ex.
26 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
27 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
29 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
30 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
31 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
32 *** was moved into regexp.h, and the include of regexp.h now uses "'s
33 *** to avoid conflicting with the system regexp.h. Const, bless its
34 *** soul, was removed so it can compile everywhere. The declaration
35 *** of strchr() was in conflict on AIX, so it was removed (as it is
36 *** happily defined in string.h).
37 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
38 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
42 * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
43 * on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
44 * This includes counted repetitions, UTF-8 support, character classes,
45 * shorthand character classes, increased number of parentheses to 100,
46 * backslash escape sequences. It also removes \n as an alternative to |.
48 * Beware that some of this code is subtly aware of the way operator
49 * precedence is structured in regular expressions. Serious changes in
50 * regular-expression syntax might require a total rethink.
58 #include "jimautoconf.h"
59 #include "jimregexp.h"
62 #if !defined(HAVE_REGCOMP) || defined(JIM_REGEXP)
65 * Structure for regexp "program". This is essentially a linear encoding
66 * of a nondeterministic finite-state machine (aka syntax charts or
67 * "railroad normal form" in parsing technology). Each node is an opcode
68 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
69 * all nodes except BRANCH implement concatenation; a "next" pointer with
70 * a BRANCH on both ends of it is connecting two alternatives. (Here we
71 * have one of the subtle syntax dependencies: an individual BRANCH (as
72 * opposed to a collection of them) is never concatenated with anything
73 * because of operator precedence.) The operand of some types of node is
74 * a literal string; for others, it is a node leading into a sub-FSM. In
75 * particular, the operand of a BRANCH node is the first node of the branch.
76 * (NB this is *not* a tree structure: the tail of the branch connects
77 * to the thing following the set of BRANCHes.) The opcodes are:
80 /* This *MUST* be less than (255-20-1)/2=117 */
81 #define REG_MAX_PAREN 100
83 /* definition number opnd? meaning */
84 #define END 0 /* no End of program. */
85 #define BOL 1 /* no Match "" at beginning of line. */
86 #define EOL 2 /* no Match "" at end of line. */
87 #define ANY 3 /* no Match any one character. */
88 #define ANYOF 4 /* str Match any character in this string. */
89 #define ANYBUT 5 /* str Match any character not in this string. */
90 #define BRANCH 6 /* node Match this alternative, or the next... */
91 #define BACK 7 /* no Match "", "next" ptr points backward. */
92 #define EXACTLY 8 /* str Match this string. */
93 #define NOTHING 9 /* no Match empty string. */
94 #define REP 10 /* max,min Match this (simple) thing [min,max] times. */
95 #define REPMIN 11 /* max,min Match this (simple) thing [min,max] times, mininal match. */
96 #define REPX 12 /* max,min Match this (complex) thing [min,max] times. */
97 #define REPXMIN 13 /* max,min Match this (complex) thing [min,max] times, minimal match. */
99 #define WORDA 15 /* no Match "" at wordchar, where prev is nonword */
100 #define WORDZ 16 /* no Match "" at nonwordchar, where prev is word */
101 #define OPENNC 19 /* no Non-capturing parentheses - must be OPEN-1 */
102 #define OPEN 20 /* no Mark this point in input as start of #n. */
103 /* OPEN+1 is number 1, etc. */
104 #define CLOSE (OPEN+REG_MAX_PAREN+1) /* no Analogous to OPEN. */
105 #define CLOSE_END (CLOSE+REG_MAX_PAREN)
106 #define CLOSENC (CLOSE-1) /* no Non-capturing parentheses - must be CLOSE-1 */
109 * The first byte of the regexp internal "program" is actually this magic
110 * number; the start node begins in the second byte.
112 #define REG_MAGIC 0xFADED00D
117 * BRANCH The set of branches constituting a single choice are hooked
118 * together with their "next" pointers, since precedence prevents
119 * anything being concatenated to any individual branch. The
120 * "next" pointer of the last BRANCH in a choice points to the
121 * thing following the whole choice. This is also where the
122 * final "next" pointer of each individual branch points; each
123 * branch starts with the operand node of a BRANCH node.
125 * BACK Normal "next" pointers all implicitly point forward; BACK
126 * exists to make loop structures possible.
128 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
129 * BRANCH structures using BACK. Simple cases (one character
130 * per match) are implemented with STAR and PLUS for speed
131 * and to minimize recursive plunges.
133 * OPEN,CLOSE ...are numbered at compile time.
137 * A node is one char of opcode followed by two chars of "next" pointer.
138 * "Next" pointers are stored as two 8-bit pieces, high order first. The
139 * value is a positive offset from the opcode of the node containing it.
140 * An operand, if any, simply follows the node. (Note that much of the
141 * code generation knows about this implicit relationship.)
143 * Using two bytes for the "next" pointer is vast overkill for most things,
144 * but allows patterns to get big without disasters.
146 #define OP(preg, p) (preg->program[p])
147 #define NEXT(preg, p) (preg->program[p + 1])
148 #define OPERAND(p) ((p) + 2)
151 * See regmagic.h for one further detail of program structure.
156 * Utility definitions.
159 #define FAIL(R,M) { (R)->err = (M); return (M); }
160 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
161 #define META "^$.[()|?{+*"
164 * Flags to be passed up and down.
166 #define HASWIDTH 01 /* Known never to match null string. */
167 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART 04 /* Starts with * or +. */
169 #define WORST 0 /* Worst case. */
171 #define MAX_REP_COUNT 1000000
174 * Forward declarations for regcomp()'s friends.
176 static int reg(regex_t
*preg
, int paren
/* Parenthesized? */, int *flagp
);
177 static int regpiece(regex_t
*preg
, int *flagp
);
178 static int regbranch(regex_t
*preg
, int *flagp
);
179 static int regatom(regex_t
*preg
, int *flagp
);
180 static int regnode(regex_t
*preg
, int op
);
181 static int regnext(regex_t
*preg
, int p
);
182 static void regc(regex_t
*preg
, int b
);
183 static int reginsert(regex_t
*preg
, int op
, int size
, int opnd
);
184 static void regtail_(regex_t
*preg
, int p
, int val
, int line
);
185 static void regoptail(regex_t
*preg
, int p
, int val
);
186 #define regtail(PREG, P, VAL) regtail_(PREG, P, VAL, __LINE__)
188 static int reg_range_find(const int *string
, int c
);
189 static const char *str_find(const char *string
, int c
, int nocase
);
190 static int prefix_cmp(const int *prog
, int proglen
, const char *string
, int nocase
);
194 static int regnarrate
= 0;
195 static void regdump(regex_t
*preg
);
196 static const char *regprop( int op
);
201 * Returns the length of the null-terminated integer sequence.
203 static int str_int_len(const int *seq
)
213 - regcomp - compile a regular expression into internal code
215 * We can't allocate space until we know how big the compiled form will be,
216 * but we can't compile it (and thus know how big it is) until we've got a
217 * place to put the code. So we cheat: we compile it twice, once with code
218 * generation turned off and size counting turned on, and once "for real".
219 * This also means that we don't allocate space until we are sure that the
220 * thing really will compile successfully, and we never have to move the
221 * code and thus invalidate pointers into it. (Note that it has to be in
222 * one piece because free() must be able to free it all.)
224 * Beware that the optimization-preparation code in here knows about some
225 * of the structure of the compiled regexp.
227 int regcomp(regex_t
*preg
, const char *exp
, int cflags
)
235 fprintf(stderr
, "Compiling: '%s'\n", exp
);
237 memset(preg
, 0, sizeof(*preg
));
240 FAIL(preg
, REG_ERR_NULL_ARGUMENT
);
242 /* First pass: determine size, legality. */
243 preg
->cflags
= cflags
;
244 preg
->regparse
= exp
;
245 /* XXX: For now, start unallocated */
246 preg
->program
= NULL
;
249 /* Allocate space. */
250 preg
->proglen
= (strlen(exp
) + 1) * 5;
251 preg
->program
= malloc(preg
->proglen
* sizeof(int));
252 if (preg
->program
== NULL
)
253 FAIL(preg
, REG_ERR_NOMEM
);
255 /* Note that since we store a magic value as the first item in the program,
256 * program offsets will never be 0
258 regc(preg
, REG_MAGIC
);
259 if (reg(preg
, 0, &flags
) == 0) {
263 /* Small enough for pointer-storage convention? */
264 if (preg
->re_nsub
>= REG_MAX_PAREN
) /* Probably could be 65535L. */
265 FAIL(preg
,REG_ERR_TOO_BIG
);
267 /* Dig out information for optimizations. */
268 preg
->regstart
= 0; /* Worst-case defaults. */
272 scan
= 1; /* First BRANCH. */
273 if (OP(preg
, regnext(preg
, scan
)) == END
) { /* Only one top-level choice. */
274 scan
= OPERAND(scan
);
276 /* Starting-point info. */
277 if (OP(preg
, scan
) == EXACTLY
) {
278 preg
->regstart
= preg
->program
[OPERAND(scan
)];
280 else if (OP(preg
, scan
) == BOL
)
284 * If there's something expensive in the r.e., find the
285 * longest literal string that must appear and make it the
286 * regmust. Resolve ties in favor of later strings, since
287 * the regstart check works with the beginning of the r.e.
288 * and avoiding duplication strengthens checking. Not a
289 * strong reason, but sufficient in the absence of others.
294 for (; scan
!= 0; scan
= regnext(preg
, scan
)) {
295 if (OP(preg
, scan
) == EXACTLY
) {
296 int plen
= str_int_len(preg
->program
+ OPERAND(scan
));
298 longest
= OPERAND(scan
);
303 preg
->regmust
= longest
;
316 - reg - regular expression, i.e. main body or parenthesized thing
318 * Caller must absorb opening parenthesis.
320 * Combining parenthesis handling with the base level of regular expression
321 * is a trifle forced, but the need to tie the tails of the branches to what
322 * follows makes it hard to avoid.
324 static int reg(regex_t
*preg
, int paren
/* Parenthesized? */, int *flagp
)
332 *flagp
= HASWIDTH
; /* Tentatively. */
334 /* Make an OPEN node, if parenthesized. */
336 if (preg
->regparse
[0] == '?' && preg
->regparse
[1] == ':') {
337 /* non-capturing paren */
342 parno
= ++preg
->re_nsub
;
344 ret
= regnode(preg
, OPEN
+parno
);
348 /* Pick up the branches, linking them together. */
349 br
= regbranch(preg
, &flags
);
353 regtail(preg
, ret
, br
); /* OPEN -> first. */
356 if (!(flags
&HASWIDTH
))
358 *flagp
|= flags
&SPSTART
;
359 while (*preg
->regparse
== '|') {
361 br
= regbranch(preg
, &flags
);
364 regtail(preg
, ret
, br
); /* BRANCH -> BRANCH. */
365 if (!(flags
&HASWIDTH
))
367 *flagp
|= flags
&SPSTART
;
370 /* Make a closing node, and hook it on the end. */
371 ender
= regnode(preg
, (paren
) ? CLOSE
+parno
: END
);
372 regtail(preg
, ret
, ender
);
374 /* Hook the tails of the branches to the closing node. */
375 for (br
= ret
; br
!= 0; br
= regnext(preg
, br
))
376 regoptail(preg
, br
, ender
);
378 /* Check for proper termination. */
379 if (paren
&& *preg
->regparse
++ != ')') {
380 preg
->err
= REG_ERR_UNMATCHED_PAREN
;
382 } else if (!paren
&& *preg
->regparse
!= '\0') {
383 if (*preg
->regparse
== ')') {
384 preg
->err
= REG_ERR_UNMATCHED_PAREN
;
387 preg
->err
= REG_ERR_JUNK_ON_END
;
396 - regbranch - one alternative of an | operator
398 * Implements the concatenation operator.
400 static int regbranch(regex_t
*preg
, int *flagp
)
407 *flagp
= WORST
; /* Tentatively. */
409 ret
= regnode(preg
, BRANCH
);
411 while (*preg
->regparse
!= '\0' && *preg
->regparse
!= ')' &&
412 *preg
->regparse
!= '|') {
413 latest
= regpiece(preg
, &flags
);
416 *flagp
|= flags
&HASWIDTH
;
417 if (chain
== 0) {/* First piece. */
418 *flagp
|= flags
&SPSTART
;
421 regtail(preg
, chain
, latest
);
425 if (chain
== 0) /* Loop ran zero times. */
426 (void) regnode(preg
, NOTHING
);
432 - regpiece - something followed by possible [*+?]
434 * Note that the branching code sequences used for ? and the general cases
435 * of * and + are somewhat optimized: they use the same NOTHING node as
436 * both the endmarker for their branch list and the body of the last branch.
437 * It might seem that this node could be dispensed with entirely, but the
438 * endmarker role is not redundant.
440 static int regpiece(regex_t
*preg
, int *flagp
)
450 ret
= regatom(preg
, &flags
);
454 op
= *preg
->regparse
;
460 if (!(flags
&HASWIDTH
) && op
!= '?') {
461 preg
->err
= REG_ERR_OPERAND_COULD_BE_EMPTY
;
465 /* Handle braces (counted repetition) by expansion */
469 min
= strtoul(preg
->regparse
+ 1, &end
, 10);
470 if (end
== preg
->regparse
+ 1) {
471 preg
->err
= REG_ERR_BAD_COUNT
;
478 preg
->regparse
= end
;
479 max
= strtoul(preg
->regparse
+ 1, &end
, 10);
481 preg
->err
= REG_ERR_UNMATCHED_BRACES
;
485 if (end
== preg
->regparse
+ 1) {
488 else if (max
< min
|| max
>= 100) {
489 preg
->err
= REG_ERR_BAD_COUNT
;
493 preg
->err
= REG_ERR_BAD_COUNT
;
497 preg
->regparse
= strchr(preg
->regparse
, '}');
501 max
= (op
== '?' ? 1 : MAX_REP_COUNT
);
504 if (preg
->regparse
[1] == '?') {
506 next
= reginsert(preg
, flags
& SIMPLE
? REPMIN
: REPXMIN
, 5, ret
);
509 next
= reginsert(preg
, flags
& SIMPLE
? REP
: REPX
, 5, ret
);
511 preg
->program
[ret
+ 2] = max
;
512 preg
->program
[ret
+ 3] = min
;
513 preg
->program
[ret
+ 4] = 0;
515 *flagp
= (min
) ? (WORST
|HASWIDTH
) : (WORST
|SPSTART
);
517 if (!(flags
& SIMPLE
)) {
518 int back
= regnode(preg
, BACK
);
519 regtail(preg
, back
, ret
);
520 regtail(preg
, next
, back
);
524 if (ISMULT(*preg
->regparse
)) {
525 preg
->err
= REG_ERR_NESTED_COUNT
;
529 return chain
? chain
: ret
;
533 * Add all characters in the inclusive range between lower and upper.
535 * Handles a swapped range (upper < lower).
537 static void reg_addrange(regex_t
*preg
, int lower
, int upper
)
540 reg_addrange(preg
, upper
, lower
);
542 /* Add a range as length, start */
543 regc(preg
, upper
- lower
+ 1);
548 * Add a null-terminated literal string as a set of ranges.
550 static void reg_addrange_str(regex_t
*preg
, const char *str
)
553 reg_addrange(preg
, *str
, *str
);
559 * Extracts the next unicode char from utf8.
561 * If 'upper' is set, converts the char to uppercase.
563 static int reg_utf8_tounicode_case(const char *s
, int *uc
, int upper
)
565 int l
= utf8_tounicode(s
, uc
);
567 *uc
= utf8_upper(*uc
);
573 * Converts a hex digit to decimal.
575 * Returns -1 for an invalid hex digit.
577 static int hexdigitval(int c
)
579 if (c
>= '0' && c
<= '9')
581 if (c
>= 'a' && c
<= 'f')
583 if (c
>= 'A' && c
<= 'F')
589 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
591 * Returns the number of hex digits parsed.
592 * If there are no hex digits, returns 0 and stores nothing.
594 static int parse_hex(const char *s
, int n
, int *uc
)
599 for (k
= 0; k
< n
; k
++) {
600 int c
= hexdigitval(*s
++);
604 val
= (val
<< 4) | c
;
613 * Call for chars after a backlash to decode the escape sequence.
615 * Stores the result in *ch.
617 * Returns the number of bytes consumed.
619 static int reg_decode_escape(const char *s
, int *ch
)
627 case 'b': *ch
= '\b'; break;
628 case 'e': *ch
= 27; break;
629 case 'f': *ch
= '\f'; break;
630 case 'n': *ch
= '\n'; break;
631 case 'r': *ch
= '\r'; break;
632 case 't': *ch
= '\t'; break;
633 case 'v': *ch
= '\v'; break;
636 /* Expect \u{NNNN} */
637 n
= parse_hex(s
+ 1, 6, ch
);
638 if (n
> 0 && s
[n
+ 1] == '}' && *ch
>= 0 && *ch
<= 0x1fffff) {
642 /* Invalid, so just treat as an escaped 'u' */
646 else if ((n
= parse_hex(s
, 4, ch
)) > 0) {
651 if ((n
= parse_hex(s
, 8, ch
)) > 0) {
655 if ((n
= parse_hex(s
, 2, ch
)) > 0) {
668 - regatom - the lowest level
670 * Optimization: gobbles an entire sequence of ordinary characters so that
671 * it can turn them into a single node, which is smaller to store and
672 * faster to run. Backslashed characters are exceptions, each becoming a
673 * separate node; the code is simpler that way and it's not worth fixing.
675 static int regatom(regex_t
*preg
, int *flagp
)
679 int nocase
= (preg
->cflags
& REG_ICASE
);
682 int n
= reg_utf8_tounicode_case(preg
->regparse
, &ch
, nocase
);
684 *flagp
= WORST
; /* Tentatively. */
688 /* FIXME: these chars only have meaning at beg/end of pat? */
690 ret
= regnode(preg
, BOL
);
693 ret
= regnode(preg
, EOL
);
696 ret
= regnode(preg
, ANY
);
697 *flagp
|= HASWIDTH
|SIMPLE
;
700 const char *pattern
= preg
->regparse
;
702 if (*pattern
== '^') { /* Complement of range. */
703 ret
= regnode(preg
, ANYBUT
);
706 ret
= regnode(preg
, ANYOF
);
708 /* Special case. If the first char is ']' or '-', it is part of the set */
709 if (*pattern
== ']' || *pattern
== '-') {
710 reg_addrange(preg
, *pattern
, *pattern
);
714 while (*pattern
&& *pattern
!= ']') {
715 /* Is this a range? a-z */
719 pattern
+= reg_utf8_tounicode_case(pattern
, &start
, nocase
);
721 pattern
+= reg_decode_escape(pattern
, &start
);
723 preg
->err
= REG_ERR_NULL_CHAR
;
727 if (pattern
[0] == '-' && pattern
[1] && pattern
[1] != ']') {
729 pattern
+= utf8_tounicode(pattern
, &end
);
730 pattern
+= reg_utf8_tounicode_case(pattern
, &end
, nocase
);
732 pattern
+= reg_decode_escape(pattern
, &end
);
734 preg
->err
= REG_ERR_NULL_CHAR
;
739 reg_addrange(preg
, start
, end
);
743 if (strncmp(pattern
, ":alpha:]", 8) == 0) {
744 if ((preg
->cflags
& REG_ICASE
) == 0) {
745 reg_addrange(preg
, 'a', 'z');
747 reg_addrange(preg
, 'A', 'Z');
751 if (strncmp(pattern
, ":alnum:]", 8) == 0) {
752 if ((preg
->cflags
& REG_ICASE
) == 0) {
753 reg_addrange(preg
, 'a', 'z');
755 reg_addrange(preg
, 'A', 'Z');
756 reg_addrange(preg
, '0', '9');
760 if (strncmp(pattern
, ":space:]", 8) == 0) {
761 reg_addrange_str(preg
, " \t\r\n\f\v");
766 /* Not a range, so just add the char */
767 reg_addrange(preg
, start
, start
);
774 preg
->regparse
= pattern
;
776 *flagp
|= HASWIDTH
|SIMPLE
;
780 ret
= reg(preg
, 1, &flags
);
783 *flagp
|= flags
&(HASWIDTH
|SPSTART
);
788 preg
->err
= REG_ERR_INTERNAL
;
789 return 0; /* Supposed to be caught earlier. */
794 preg
->err
= REG_ERR_COUNT_FOLLOWS_NOTHING
;
797 switch (*preg
->regparse
++) {
799 preg
->err
= REG_ERR_TRAILING_BACKSLASH
;
803 ret
= regnode(preg
, WORDA
);
807 ret
= regnode(preg
, WORDZ
);
810 ret
= regnode(preg
, ANYOF
);
811 reg_addrange(preg
, '0', '9');
813 *flagp
|= HASWIDTH
|SIMPLE
;
816 ret
= regnode(preg
, ANYOF
);
817 if ((preg
->cflags
& REG_ICASE
) == 0) {
818 reg_addrange(preg
, 'a', 'z');
820 reg_addrange(preg
, 'A', 'Z');
821 reg_addrange(preg
, '0', '9');
822 reg_addrange(preg
, '_', '_');
824 *flagp
|= HASWIDTH
|SIMPLE
;
827 ret
= regnode(preg
, ANYOF
);
828 reg_addrange_str(preg
," \t\r\n\f\v");
830 *flagp
|= HASWIDTH
|SIMPLE
;
832 /* FIXME: Someday handle \1, \2, ... */
834 /* Handle general quoted chars in exact-match routine */
835 /* Back up to include the backslash */
843 * Encode a string of characters to be matched exactly.
847 /* Back up to pick up the first char of interest */
850 ret
= regnode(preg
, EXACTLY
);
852 /* Note that a META operator such as ? or * consumes the
854 * Thus we must be careful to look ahead by 2 and add the
855 * last char as it's own EXACTLY if necessary
858 /* Until end of string or a META char is reached */
859 while (*preg
->regparse
&& strchr(META
, *preg
->regparse
) == NULL
) {
860 n
= reg_utf8_tounicode_case(preg
->regparse
, &ch
, (preg
->cflags
& REG_ICASE
));
861 if (ch
== '\\' && preg
->regparse
[n
]) {
862 /* Non-trailing backslash.
863 * Is this a special escape, or a regular escape?
865 if (strchr("<>mMwds", preg
->regparse
[n
])) {
866 /* A special escape. All done with EXACTLY */
869 /* Decode it. Note that we add the length for the escape
870 * sequence to the length for the backlash so we can skip
871 * the entire sequence, or not as required.
873 n
+= reg_decode_escape(preg
->regparse
+ n
, &ch
);
875 preg
->err
= REG_ERR_NULL_CHAR
;
880 /* Now we have one char 'ch' of length 'n'.
881 * Check to see if the following char is a MULT
884 if (ISMULT(preg
->regparse
[n
])) {
885 /* Yes. But do we already have some EXACTLY chars? */
887 /* Yes, so return what we have and pick up the current char next time around */
890 /* No, so add this single char and finish */
897 /* No, so just add this char normally */
915 static void reg_grow(regex_t
*preg
, int n
)
917 if (preg
->p
+ n
>= preg
->proglen
) {
918 preg
->proglen
= (preg
->p
+ n
) * 2;
919 preg
->program
= realloc(preg
->program
, preg
->proglen
* sizeof(int));
924 - regnode - emit a node
927 static int regnode(regex_t
*preg
, int op
)
931 preg
->program
[preg
->p
++] = op
;
932 preg
->program
[preg
->p
++] = 0;
934 /* Return the start of the node */
939 - regc - emit (if appropriate) a byte of code
941 static void regc(regex_t
*preg
, int b
)
944 preg
->program
[preg
->p
++] = b
;
948 - reginsert - insert an operator in front of already-emitted operand
950 * Means relocating the operand.
951 * Returns the new location of the original operand.
953 static int reginsert(regex_t
*preg
, int op
, int size
, int opnd
)
955 reg_grow(preg
, size
);
957 /* Move everything from opnd up */
958 memmove(preg
->program
+ opnd
+ size
, preg
->program
+ opnd
, sizeof(int) * (preg
->p
- opnd
));
959 /* Zero out the new space */
960 memset(preg
->program
+ opnd
, 0, sizeof(int) * size
);
962 preg
->program
[opnd
] = op
;
970 - regtail - set the next-pointer at the end of a node chain
972 static void regtail_(regex_t
*preg
, int p
, int val
, int line
)
978 /* Find last node. */
981 temp
= regnext(preg
, scan
);
987 if (OP(preg
, scan
) == BACK
)
992 preg
->program
[scan
+ 1] = offset
;
996 - regoptail - regtail on operand of first argument; nop if operandless
999 static void regoptail(regex_t
*preg
, int p
, int val
)
1001 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1002 if (p
!= 0 && OP(preg
, p
) == BRANCH
) {
1003 regtail(preg
, OPERAND(p
), val
);
1008 * regexec and friends
1014 static int regtry(regex_t
*preg
, const char *string
);
1015 static int regmatch(regex_t
*preg
, int prog
);
1016 static int regrepeat(regex_t
*preg
, int p
, int max
);
1019 - regexec - match a regexp against a string
1021 int regexec(regex_t
*preg
, const char *string
, size_t nmatch
, regmatch_t pmatch
[], int eflags
)
1026 /* Be paranoid... */
1027 if (preg
== NULL
|| preg
->program
== NULL
|| string
== NULL
) {
1028 return REG_ERR_NULL_ARGUMENT
;
1031 /* Check validity of program. */
1032 if (*preg
->program
!= REG_MAGIC
) {
1033 return REG_ERR_CORRUPTED
;
1037 fprintf(stderr
, "regexec: %s\n", string
);
1041 preg
->eflags
= eflags
;
1042 preg
->pmatch
= pmatch
;
1043 preg
->nmatch
= nmatch
;
1044 preg
->start
= string
; /* All offsets are computed from here */
1046 /* Must clear out the embedded repeat counts */
1047 for (scan
= OPERAND(1); scan
!= 0; ) {
1048 switch (OP(preg
, scan
)) {
1053 preg
->program
[scan
+ 4] = 0;
1061 while (preg
->program
[scan
++]) {
1075 /* If there is a "must appear" string, look for it. */
1076 if (preg
->regmust
!= 0) {
1078 while ((s
= str_find(s
, preg
->program
[preg
->regmust
], preg
->cflags
& REG_ICASE
)) != NULL
) {
1079 if (prefix_cmp(preg
->program
+ preg
->regmust
, preg
->regmlen
, s
, preg
->cflags
& REG_ICASE
) >= 0) {
1084 if (s
== NULL
) /* Not present. */
1088 /* Mark beginning of line for ^ . */
1089 preg
->regbol
= string
;
1091 /* Simplest case: anchored match need be tried only once (maybe per line). */
1092 if (preg
->reganch
) {
1093 if (eflags
& REG_NOTBOL
) {
1094 /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1098 if (regtry(preg
, string
)) {
1103 if (preg
->cflags
& REG_NEWLINE
) {
1104 /* Try the next anchor? */
1105 string
= strchr(string
, '\n');
1107 preg
->regbol
= ++string
;
1116 /* Messy cases: unanchored match. */
1118 if (preg
->regstart
!= '\0') {
1119 /* We know what char it must start with. */
1120 while ((s
= str_find(s
, preg
->regstart
, preg
->cflags
& REG_ICASE
)) != NULL
) {
1121 if (regtry(preg
, s
))
1127 /* We don't -- general case. */
1129 if (regtry(preg
, s
))
1136 s
+= utf8_tounicode(s
, &c
);
1145 - regtry - try match at specific point
1147 /* 0 failure, 1 success */
1148 static int regtry( regex_t
*preg
, const char *string
)
1152 preg
->reginput
= string
;
1154 for (i
= 0; i
< preg
->nmatch
; i
++) {
1155 preg
->pmatch
[i
].rm_so
= -1;
1156 preg
->pmatch
[i
].rm_eo
= -1;
1158 if (regmatch(preg
, 1)) {
1159 preg
->pmatch
[0].rm_so
= string
- preg
->start
;
1160 preg
->pmatch
[0].rm_eo
= preg
->reginput
- preg
->start
;
1167 * Returns bytes matched if 'pattern' is a prefix of 'string'.
1169 * If 'nocase' is non-zero, does a case-insensitive match.
1171 * Returns -1 on not found.
1173 static int prefix_cmp(const int *prog
, int proglen
, const char *string
, int nocase
)
1175 const char *s
= string
;
1176 while (proglen
&& *s
) {
1178 int n
= reg_utf8_tounicode_case(s
, &ch
, nocase
);
1193 * Searchs for 'c' in the range 'range'.
1195 * Returns 1 if found, or 0 if not.
1197 static int reg_range_find(const int *range
, int c
)
1200 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1201 if (c
>= range
[1] && c
<= (range
[0] + range
[1] - 1)) {
1210 * Search for the character 'c' in the utf-8 string 'string'.
1212 * If 'nocase' is set, the 'string' is assumed to be uppercase
1213 * and 'c' is converted to uppercase before matching.
1215 * Returns the byte position in the string where the 'c' was found, or
1216 * NULL if not found.
1218 static const char *str_find(const char *string
, int c
, int nocase
)
1221 /* The "string" should already be converted to uppercase */
1226 int n
= reg_utf8_tounicode_case(string
, &ch
, nocase
);
1236 * Returns true if 'ch' is an end-of-line char.
1238 * In REG_NEWLINE mode, \n is considered EOL in
1241 static int reg_iseol(regex_t
*preg
, int ch
)
1243 if (preg
->cflags
& REG_NEWLINE
) {
1244 return ch
== '\0' || ch
== '\n';
1251 static int regmatchsimplerepeat(regex_t
*preg
, int scan
, int matchmin
)
1258 int max
= preg
->program
[scan
+ 2];
1259 int min
= preg
->program
[scan
+ 3];
1260 int next
= regnext(preg
, scan
);
1263 * Lookahead to avoid useless match attempts
1264 * when we know what character comes next.
1266 if (OP(preg
, next
) == EXACTLY
) {
1267 nextch
= preg
->program
[OPERAND(next
)];
1269 save
= preg
->reginput
;
1270 no
= regrepeat(preg
, scan
+ 5, max
);
1275 /* from min up to no */
1279 /* else from no down to min */
1291 preg
->reginput
= save
+ utf8_index(save
, no
);
1292 reg_utf8_tounicode_case(preg
->reginput
, &c
, (preg
->cflags
& REG_ICASE
));
1293 /* If it could work, try it. */
1294 if (reg_iseol(preg
, nextch
) || c
== nextch
) {
1295 if (regmatch(preg
, next
)) {
1300 /* Couldn't or didn't, add one more */
1304 /* Couldn't or didn't -- back up. */
1311 static int regmatchrepeat(regex_t
*preg
, int scan
, int matchmin
)
1313 int *scanpt
= preg
->program
+ scan
;
1315 int max
= scanpt
[2];
1316 int min
= scanpt
[3];
1318 /* Have we reached min? */
1319 if (scanpt
[4] < min
) {
1320 /* No, so get another one */
1322 if (regmatch(preg
, scan
+ 5)) {
1328 if (scanpt
[4] > max
) {
1333 /* minimal, so try other branch first */
1334 if (regmatch(preg
, regnext(preg
, scan
))) {
1337 /* No, so try one more */
1339 if (regmatch(preg
, scan
+ 5)) {
1345 /* maximal, so try this branch again */
1346 if (scanpt
[4] < max
) {
1348 if (regmatch(preg
, scan
+ 5)) {
1353 /* At this point we are at max with no match. Try the other branch */
1354 return regmatch(preg
, regnext(preg
, scan
));
1358 - regmatch - main matching routine
1360 * Conceptually the strategy is simple: check to see whether the current
1361 * node matches, call self recursively to see whether the rest matches,
1362 * and then act accordingly. In practice we make some effort to avoid
1363 * recursion, in particular by going through "ordinary" nodes (that don't
1364 * need to know whether the rest of the match failed) by a loop instead of
1367 /* 0 failure, 1 success */
1368 static int regmatch(regex_t
*preg
, int prog
)
1370 int scan
; /* Current node. */
1371 int next
; /* Next node. */
1376 if (scan
!= 0 && regnarrate
)
1377 fprintf(stderr
, "%s(\n", regprop(scan
));
1384 fprintf(stderr
, "%3d: %s...\n", scan
, regprop(OP(preg
, scan
))); /* Where, what. */
1387 next
= regnext(preg
, scan
);
1388 n
= reg_utf8_tounicode_case(preg
->reginput
, &c
, (preg
->cflags
& REG_ICASE
));
1390 switch (OP(preg
, scan
)) {
1392 if (preg
->reginput
!= preg
->regbol
)
1396 if (!reg_iseol(preg
, c
)) {
1401 /* Must be looking at a letter, digit, or _ */
1402 if ((!isalnum(UCHAR(c
))) && c
!= '_')
1404 /* Prev must be BOL or nonword */
1405 if (preg
->reginput
> preg
->regbol
&&
1406 (isalnum(UCHAR(preg
->reginput
[-1])) || preg
->reginput
[-1] == '_'))
1410 /* Can't match at BOL */
1411 if (preg
->reginput
> preg
->regbol
) {
1412 /* Current must be EOL or nonword */
1413 if (reg_iseol(preg
, c
) || !isalnum(UCHAR(c
)) || c
!= '_') {
1414 c
= preg
->reginput
[-1];
1415 /* Previous must be word */
1416 if (isalnum(UCHAR(c
)) || c
== '_') {
1425 if (reg_iseol(preg
, c
))
1427 preg
->reginput
+= n
;
1434 opnd
= OPERAND(scan
);
1435 len
= str_int_len(preg
->program
+ opnd
);
1437 slen
= prefix_cmp(preg
->program
+ opnd
, len
, preg
->reginput
, preg
->cflags
& REG_ICASE
);
1441 preg
->reginput
+= slen
;
1445 if (reg_iseol(preg
, c
) || reg_range_find(preg
->program
+ OPERAND(scan
), c
) == 0) {
1448 preg
->reginput
+= n
;
1451 if (reg_iseol(preg
, c
) || reg_range_find(preg
->program
+ OPERAND(scan
), c
) != 0) {
1454 preg
->reginput
+= n
;
1463 if (OP(preg
, next
) != BRANCH
) /* No choice. */
1464 next
= OPERAND(scan
); /* Avoid recursion. */
1467 save
= preg
->reginput
;
1468 if (regmatch(preg
, OPERAND(scan
))) {
1471 preg
->reginput
= save
;
1472 scan
= regnext(preg
, scan
);
1473 } while (scan
!= 0 && OP(preg
, scan
) == BRANCH
);
1481 return regmatchsimplerepeat(preg
, scan
, OP(preg
, scan
) == REPMIN
);
1485 return regmatchrepeat(preg
, scan
, OP(preg
, scan
) == REPXMIN
);
1488 return(1); /* Success! */
1493 if (regmatch(preg
, next
)) {
1499 if (OP(preg
, scan
) >= OPEN
+1 && OP(preg
, scan
) < CLOSE_END
) {
1502 save
= preg
->reginput
;
1504 if (regmatch(preg
, next
)) {
1507 * Don't set startp if some later
1508 * invocation of the same parentheses
1511 if (OP(preg
, scan
) < CLOSE
) {
1512 no
= OP(preg
, scan
) - OPEN
;
1513 if (no
< preg
->nmatch
&& preg
->pmatch
[no
].rm_so
== -1) {
1514 preg
->pmatch
[no
].rm_so
= save
- preg
->start
;
1518 no
= OP(preg
, scan
) - CLOSE
;
1519 if (no
< preg
->nmatch
&& preg
->pmatch
[no
].rm_eo
== -1) {
1520 preg
->pmatch
[no
].rm_eo
= save
- preg
->start
;
1527 return REG_ERR_INTERNAL
;
1534 * We get here only if there's trouble -- normally "case END" is
1535 * the terminating point.
1537 return REG_ERR_INTERNAL
;
1541 - regrepeat - repeatedly match something simple, report how many
1543 static int regrepeat(regex_t
*preg
, int p
, int max
)
1551 scan
= preg
->reginput
;
1553 switch (OP(preg
, p
)) {
1555 /* No need to handle utf8 specially here */
1556 while (!reg_iseol(preg
, *scan
) && count
< max
) {
1562 while (count
< max
) {
1563 n
= reg_utf8_tounicode_case(scan
, &ch
, preg
->cflags
& REG_ICASE
);
1564 if (preg
->program
[opnd
] != ch
) {
1572 while (count
< max
) {
1573 n
= reg_utf8_tounicode_case(scan
, &ch
, preg
->cflags
& REG_ICASE
);
1574 if (reg_iseol(preg
, ch
) || reg_range_find(preg
->program
+ opnd
, ch
) == 0) {
1582 while (count
< max
) {
1583 n
= reg_utf8_tounicode_case(scan
, &ch
, preg
->cflags
& REG_ICASE
);
1584 if (reg_iseol(preg
, ch
) || reg_range_find(preg
->program
+ opnd
, ch
) != 0) {
1591 default: /* Oh dear. Called inappropriately. */
1592 preg
->err
= REG_ERR_INTERNAL
;
1593 count
= 0; /* Best compromise. */
1596 preg
->reginput
= scan
;
1602 - regnext - dig the "next" pointer out of a node
1604 static int regnext(regex_t
*preg
, int p
)
1608 offset
= NEXT(preg
, p
);
1613 if (OP(preg
, p
) == BACK
)
1619 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1622 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1624 static void regdump(regex_t
*preg
)
1627 int op
= EXACTLY
; /* Arbitrary non-END op. */
1629 char buf
[MAX_UTF8_LEN
+ 1];
1632 for (i
= 1; i
< preg
->p
; i
++) {
1633 printf("%02x ", (unsigned char)preg
->program
[i
]);
1641 while (op
!= END
&& s
< preg
->p
) { /* While that wasn't END last time... */
1643 printf("%3d: %s", s
, regprop(op
)); /* Where, what. */
1644 next
= regnext(preg
, s
);
1645 if (next
== 0) /* Next ptr. */
1648 printf("(%d)", next
);
1650 if (op
== REP
|| op
== REPMIN
|| op
== REPX
|| op
== REPXMIN
) {
1651 int max
= preg
->program
[s
];
1652 int min
= preg
->program
[s
+ 1];
1654 printf("{%d,*}", min
);
1657 printf("{%d,%d}", min
, max
);
1659 printf(" %d", preg
->program
[s
+ 2]);
1662 else if (op
== ANYOF
|| op
== ANYBUT
) {
1665 while (preg
->program
[s
]) {
1666 int len
= preg
->program
[s
++];
1667 int first
= preg
->program
[s
++];
1668 buf
[utf8_getchars(buf
, first
)] = 0;
1671 buf
[utf8_getchars(buf
, first
+ len
- 1)] = 0;
1677 else if (op
== EXACTLY
) {
1678 /* Literal string, where present. */
1680 while (preg
->program
[s
]) {
1681 buf
[utf8_getchars(buf
, preg
->program
[s
])] = 0;
1691 /* Header fields of interest. */
1692 if (preg
->regstart
) {
1693 buf
[utf8_getchars(buf
, preg
->regstart
)] = 0;
1694 printf("start '%s' ", buf
);
1697 printf("anchored ");
1698 if (preg
->regmust
!= 0) {
1700 printf("must have:");
1701 for (i
= 0; i
< preg
->regmlen
; i
++) {
1702 putchar(preg
->program
[preg
->regmust
+ i
]);
1711 - regprop - printable representation of opcode
1713 static const char *regprop( int op
)
1715 static char buf
[50];
1755 if (op
>= OPEN
&& op
< CLOSE
) {
1756 snprintf(buf
, sizeof(buf
), "OPEN%d", op
-OPEN
);
1758 else if (op
>= CLOSE
&& op
< CLOSE_END
) {
1759 snprintf(buf
, sizeof(buf
), "CLOSE%d", op
-CLOSE
);
1762 snprintf(buf
, sizeof(buf
), "?%d?\n", op
);
1767 #endif /* JIM_BOOTSTRAP */
1769 size_t regerror(int errcode
, const regex_t
*preg
, char *errbuf
, size_t errbuf_size
)
1771 static const char *error_strings
[] = {
1780 "parentheses () not balanced",
1781 "braces {} not balanced",
1782 "invalid repetition count(s)",
1787 "count follows nothing",
1788 "trailing backslash",
1789 "corrupted program",
1790 "contains null char",
1794 if (errcode
< 0 || errcode
>= REG_ERR_NUM
) {
1795 err
= "Bad error code";
1798 err
= error_strings
[errcode
];
1801 return snprintf(errbuf
, errbuf_size
, "%s", err
);
1804 void regfree(regex_t
*preg
)
1806 free(preg
->program
);