4 * regcomp and regexec -- regsub and regerror are elsewhere
6 * Copyright (c) 1986 by University of Toronto.
7 * Written by Henry Spencer. Not derived from licensed software.
9 * Permission is granted to anyone to use this software for any
10 * purpose on any computer system, and to redistribute it freely,
11 * subject to the following restrictions:
13 * 1. The author is not responsible for the consequences of use of
14 * this software, no matter how awful, even if they arise
17 * 2. The origin of this software must not be misrepresented, either
18 * by explicit claim or by omission.
20 * 3. Altered versions must be plainly marked as such, and must not
21 * be misrepresented as being the original software.
22 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
23 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
24 *** to assist in implementing egrep.
25 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
26 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
27 *** as in BSD grep and ex.
28 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
29 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
30 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
31 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
32 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
33 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
34 *** was moved into regexp.h, and the include of regexp.h now uses "'s
35 *** to avoid conflicting with the system regexp.h. Const, bless its
36 *** soul, was removed so it can compile everywhere. The declaration
37 *** of strchr() was in conflict on AIX, so it was removed (as it is
38 *** happily defined in string.h).
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
41 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
42 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
44 * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
45 * on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
46 * This includes counted repetitions, UTF-8 support, character classes,
47 * shorthand character classes, increased number of parentheses to 100,
48 * backslash escape sequences. It also removes \n as an alternative to |.
50 * Beware that some of this code is subtly aware of the way operator
51 * precedence is structured in regular expressions. Serious changes in
52 * regular-expression syntax might require a total rethink.
55 #include "jimautoconf.h"
57 #if defined(JIM_REGEXP)
64 #include "jimregexp.h"
67 /* An arbitrary limit, but this seems enough. Must be less than 1000. */
68 #define REG_MAX_PAREN 100
71 * Structure for regexp "program". This is essentially a linear encoding
72 * of a nondeterministic finite-state machine (aka syntax charts or
73 * "railroad normal form" in parsing technology). Each node is an opcode
74 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
75 * all nodes except BRANCH implement concatenation; a "next" pointer with
76 * a BRANCH on both ends of it is connecting two alternatives. (Here we
77 * have one of the subtle syntax dependencies: an individual BRANCH (as
78 * opposed to a collection of them) is never concatenated with anything
79 * because of operator precedence.) The operand of some types of node is
80 * a literal string; for others, it is a node leading into a sub-FSM. In
81 * particular, the operand of a BRANCH node is the first node of the branch.
82 * (NB this is *not* a tree structure: the tail of the branch connects
83 * to the thing following the set of BRANCHes.) The opcodes are:
86 /* definition number opnd? meaning */
87 #define END 0 /* no End of program. */
88 #define BOL 1 /* no Match "" at beginning of line. */
89 #define EOL 2 /* no Match "" at end of line. */
90 #define ANY 3 /* no Match any one character. */
91 #define ANYOF 4 /* str Match any character in this string. */
92 #define ANYBUT 5 /* str Match any character not in this string. */
93 #define BRANCH 6 /* node Match this alternative, or the next... */
94 #define BACK 7 /* no Match "", "next" ptr points backward. */
95 #define EXACTLY 8 /* str Match this string. */
96 #define NOTHING 9 /* no Match empty string. */
97 #define REP 10 /* max,min Match this (simple) thing [min,max] times. */
98 #define REPMIN 11 /* max,min Match this (simple) thing [min,max] times, minimal match. */
99 #define REPX 12 /* max,min Match this (complex) thing [min,max] times. */
100 #define REPXMIN 13 /* max,min Match this (complex) thing [min,max] times, minimal match. */
101 #define BOLX 14 /* no Match "" at beginning of input. */
102 #define EOLX 15 /* no Match "" at end of input. */
103 #define WORDA 16 /* no Match "" at wordchar, where prev is nonword */
104 #define WORDZ 17 /* no Match "" at nonwordchar, where prev is word */
106 #define OPENNC 1000 /* no Non-capturing parentheses - must be OPEN-1 */
107 #define OPEN 1001 /* no Mark this point in input as start of #n. */
108 /* OPEN+1 is number 1, etc. */
110 /* must not be any other opts between OPEN and CLOSE */
112 #define CLOSENC 2000 /* no Non-capturing parentheses - must be CLOSE-1 */
113 #define CLOSE 2001 /* no Analogous to OPEN. */
114 #define CLOSE_END (CLOSE+REG_MAX_PAREN)
117 * The first word of the regexp internal "program" is actually this magic
118 * number; the start node begins in the second word.
120 #define REG_MAGIC 0xFADED00D
125 * BRANCH The set of branches constituting a single choice are hooked
126 * together with their "next" pointers, since precedence prevents
127 * anything being concatenated to any individual branch. The
128 * "next" pointer of the last BRANCH in a choice points to the
129 * thing following the whole choice. This is also where the
130 * final "next" pointer of each individual branch points; each
131 * branch starts with the operand node of a BRANCH node.
133 * BACK Normal "next" pointers all implicitly point forward; BACK
134 * exists to make loop structures possible.
136 * REP,REPX Repeated matches ('?', '*', '+' and {min,max}) are implemented
137 * as either simple repeats (REP) or complex repeats (REPX).
138 * These opcodes include a "min" and "max" count after the opcode.
139 * This is followed by a fourth "current count" word that is
140 * only used by REPX, as it implements a recursive match.
141 * REPMIN and REPXMIN are identical except they implement minimal repeats.
143 * OPEN,CLOSE ...are numbered at compile time.
147 * A node is one word of opcode followed by one word of "next" pointer.
148 * The "next" pointer value is a positive offset from the opcode of the node
150 * An operand, if any, simply follows the node. (Note that much of the
151 * code generation knows about this implicit relationship.)
153 #define OP(preg, p) (preg->program[p])
154 #define NEXT(preg, p) (preg->program[p + 1])
155 #define OPERAND(p) ((p) + 2)
158 * See regmagic.h for one further detail of program structure.
163 * Utility definitions.
166 #define FAIL(R,M) { (R)->err = (M); return (M); }
167 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
168 #define META "^$.[()|?{+*"
171 * Flags to be passed up and down.
173 #define HASWIDTH 1 /* Known never to match null string. */
174 #define SIMPLE 2 /* Simple enough to be STAR/PLUS operand. */
175 #define SPSTART 4 /* Starts with * or +. */
176 #define WORST 0 /* Worst case. */
178 #define MAX_REP_COUNT 1000000
181 * Forward declarations for regcomp()'s friends.
183 static int reg(regex_t
*preg
, int paren
/* Parenthesized? */, int *flagp
);
184 static int regpiece(regex_t
*preg
, int *flagp
);
185 static int regbranch(regex_t
*preg
, int *flagp
);
186 static int regatom(regex_t
*preg
, int *flagp
);
187 static int regnode(regex_t
*preg
, int op
);
188 static int regnext(regex_t
*preg
, int p
);
189 static void regc(regex_t
*preg
, int b
);
190 static int reginsert(regex_t
*preg
, int op
, int size
, int opnd
);
191 static void regtail(regex_t
*preg
, int p
, int val
);
192 static void regoptail(regex_t
*preg
, int p
, int val
);
193 static int regopsize(regex_t
*preg
, int p
);
195 static int reg_range_find(const int *string
, int c
);
196 static const char *str_find(const char *string
, int c
, int nocase
);
197 static int prefix_cmp(const int *prog
, int proglen
, const char *string
, int nocase
);
201 static int regnarrate
= 0;
202 static void regdump(regex_t
*preg
);
203 static const char *regprop( int op
);
208 * Returns the length of the null-terminated integer sequence.
210 static int str_int_len(const int *seq
)
220 - regcomp - compile a regular expression into internal code
222 * We can't allocate space until we know how big the compiled form will be,
223 * but we can't compile it (and thus know how big it is) until we've got a
224 * place to put the code. So we cheat: we compile it twice, once with code
225 * generation turned off and size counting turned on, and once "for real".
226 * This also means that we don't allocate space until we are sure that the
227 * thing really will compile successfully, and we never have to move the
228 * code and thus invalidate pointers into it. (Note that it has to be in
229 * one piece because free() must be able to free it all.)
231 * Beware that the optimization-preparation code in here knows about some
232 * of the structure of the compiled regexp.
234 int regcomp(regex_t
*preg
, const char *exp
, int cflags
)
242 fprintf(stderr
, "Compiling: '%s'\n", exp
);
244 memset(preg
, 0, sizeof(*preg
));
247 FAIL(preg
, REG_ERR_NULL_ARGUMENT
);
249 /* First pass: determine size, legality. */
250 preg
->cflags
= cflags
;
251 preg
->regparse
= exp
;
253 /* Allocate space. */
254 preg
->proglen
= (strlen(exp
) + 1) * 5;
255 preg
->program
= malloc(preg
->proglen
* sizeof(int));
256 if (preg
->program
== NULL
)
257 FAIL(preg
, REG_ERR_NOMEM
);
259 /* Note that since we store a magic value as the first item in the program,
260 * program offsets will never be 0
262 regc(preg
, REG_MAGIC
);
263 if (reg(preg
, 0, &flags
) == 0) {
267 /* Small enough for pointer-storage convention? */
268 if (preg
->re_nsub
>= REG_MAX_PAREN
) /* Probably could be 65535L. */
269 FAIL(preg
,REG_ERR_TOO_BIG
);
271 /* Dig out information for optimizations. */
272 preg
->regstart
= 0; /* Worst-case defaults. */
276 scan
= 1; /* First BRANCH. */
277 if (OP(preg
, regnext(preg
, scan
)) == END
) { /* Only one top-level choice. */
278 scan
= OPERAND(scan
);
280 /* Starting-point info. */
281 if (OP(preg
, scan
) == EXACTLY
) {
282 preg
->regstart
= preg
->program
[OPERAND(scan
)];
284 else if (OP(preg
, scan
) == BOL
)
288 * If there's something expensive in the r.e., find the
289 * longest literal string that must appear and make it the
290 * regmust. Resolve ties in favor of later strings, since
291 * the regstart check works with the beginning of the r.e.
292 * and avoiding duplication strengthens checking. Not a
293 * strong reason, but sufficient in the absence of others.
298 for (; scan
!= 0; scan
= regnext(preg
, scan
)) {
299 if (OP(preg
, scan
) == EXACTLY
) {
300 int plen
= str_int_len(preg
->program
+ OPERAND(scan
));
302 longest
= OPERAND(scan
);
307 preg
->regmust
= longest
;
320 - reg - regular expression, i.e. main body or parenthesized thing
322 * Caller must absorb opening parenthesis.
324 * Combining parenthesis handling with the base level of regular expression
325 * is a trifle forced, but the need to tie the tails of the branches to what
326 * follows makes it hard to avoid.
328 static int reg(regex_t
*preg
, int paren
/* Parenthesized? */, int *flagp
)
336 *flagp
= HASWIDTH
; /* Tentatively. */
338 /* Make an OPEN node, if parenthesized. */
340 if (preg
->regparse
[0] == '?' && preg
->regparse
[1] == ':') {
341 /* non-capturing paren */
346 parno
= ++preg
->re_nsub
;
348 ret
= regnode(preg
, OPEN
+parno
);
352 /* Pick up the branches, linking them together. */
353 br
= regbranch(preg
, &flags
);
357 regtail(preg
, ret
, br
); /* OPEN -> first. */
360 if (!(flags
&HASWIDTH
))
362 *flagp
|= flags
&SPSTART
;
363 while (*preg
->regparse
== '|') {
365 br
= regbranch(preg
, &flags
);
368 regtail(preg
, ret
, br
); /* BRANCH -> BRANCH. */
369 if (!(flags
&HASWIDTH
))
371 *flagp
|= flags
&SPSTART
;
374 /* Make a closing node, and hook it on the end. */
375 ender
= regnode(preg
, (paren
) ? CLOSE
+parno
: END
);
376 regtail(preg
, ret
, ender
);
378 /* Hook the tails of the branches to the closing node. */
379 for (br
= ret
; br
!= 0; br
= regnext(preg
, br
))
380 regoptail(preg
, br
, ender
);
382 /* Check for proper termination. */
383 if (paren
&& *preg
->regparse
++ != ')') {
384 preg
->err
= REG_ERR_UNMATCHED_PAREN
;
386 } else if (!paren
&& *preg
->regparse
!= '\0') {
387 if (*preg
->regparse
== ')') {
388 preg
->err
= REG_ERR_UNMATCHED_PAREN
;
391 preg
->err
= REG_ERR_JUNK_ON_END
;
400 - regbranch - one alternative of an | operator
402 * Implements the concatenation operator.
404 static int regbranch(regex_t
*preg
, int *flagp
)
411 *flagp
= WORST
; /* Tentatively. */
413 ret
= regnode(preg
, BRANCH
);
415 while (*preg
->regparse
!= '\0' && *preg
->regparse
!= ')' &&
416 *preg
->regparse
!= '|') {
417 latest
= regpiece(preg
, &flags
);
420 *flagp
|= flags
&HASWIDTH
;
421 if (chain
== 0) {/* First piece. */
422 *flagp
|= flags
&SPSTART
;
425 regtail(preg
, chain
, latest
);
429 if (chain
== 0) /* Loop ran zero times. */
430 (void) regnode(preg
, NOTHING
);
436 - regpiece - something followed by possible [*+?]
438 * Note that the branching code sequences used for ? and the general cases
439 * of * and + are somewhat optimized: they use the same NOTHING node as
440 * both the endmarker for their branch list and the body of the last branch.
441 * It might seem that this node could be dispensed with entirely, but the
442 * endmarker role is not redundant.
444 static int regpiece(regex_t
*preg
, int *flagp
)
453 ret
= regatom(preg
, &flags
);
457 op
= *preg
->regparse
;
463 if (!(flags
&HASWIDTH
) && op
!= '?') {
464 preg
->err
= REG_ERR_OPERAND_COULD_BE_EMPTY
;
468 /* Handle braces (counted repetition) by expansion */
472 min
= strtoul(preg
->regparse
+ 1, &end
, 10);
473 if (end
== preg
->regparse
+ 1) {
474 preg
->err
= REG_ERR_BAD_COUNT
;
481 preg
->regparse
= end
;
482 max
= strtoul(preg
->regparse
+ 1, &end
, 10);
484 preg
->err
= REG_ERR_UNMATCHED_BRACES
;
488 if (end
== preg
->regparse
+ 1) {
491 else if (max
< min
|| max
>= 100) {
492 preg
->err
= REG_ERR_BAD_COUNT
;
496 preg
->err
= REG_ERR_BAD_COUNT
;
500 preg
->regparse
= strchr(preg
->regparse
, '}');
504 max
= (op
== '?' ? 1 : MAX_REP_COUNT
);
507 if (preg
->regparse
[1] == '?') {
509 next
= reginsert(preg
, flags
& SIMPLE
? REPMIN
: REPXMIN
, 5, ret
);
512 next
= reginsert(preg
, flags
& SIMPLE
? REP
: REPX
, 5, ret
);
514 preg
->program
[ret
+ 2] = max
;
515 preg
->program
[ret
+ 3] = min
;
516 preg
->program
[ret
+ 4] = 0;
518 *flagp
= (min
) ? (WORST
|HASWIDTH
) : (WORST
|SPSTART
);
520 if (!(flags
& SIMPLE
)) {
521 int back
= regnode(preg
, BACK
);
522 regtail(preg
, back
, ret
);
523 regtail(preg
, next
, back
);
527 if (ISMULT(*preg
->regparse
)) {
528 preg
->err
= REG_ERR_NESTED_COUNT
;
536 * Add all characters in the inclusive range between lower and upper.
538 * Handles a swapped range (upper < lower).
540 static void reg_addrange(regex_t
*preg
, int lower
, int upper
)
543 reg_addrange(preg
, upper
, lower
);
545 /* Add a range as length, start */
546 regc(preg
, upper
- lower
+ 1);
551 * Add a null-terminated literal string as a set of ranges.
553 static void reg_addrange_str(regex_t
*preg
, const char *str
)
556 reg_addrange(preg
, *str
, *str
);
562 * Extracts the next unicode char from utf8.
564 * If 'upper' is set, converts the char to uppercase.
566 static int reg_utf8_tounicode_case(const char *s
, int *uc
, int upper
)
568 int l
= utf8_tounicode(s
, uc
);
570 *uc
= utf8_upper(*uc
);
576 * Converts a hex digit to decimal.
578 * Returns -1 for an invalid hex digit.
580 static int hexdigitval(int c
)
582 if (c
>= '0' && c
<= '9')
584 if (c
>= 'a' && c
<= 'f')
586 if (c
>= 'A' && c
<= 'F')
592 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
594 * Returns the number of hex digits parsed.
595 * If there are no hex digits, returns 0 and stores nothing.
597 static int parse_hex(const char *s
, int n
, int *uc
)
602 for (k
= 0; k
< n
; k
++) {
603 int c
= hexdigitval(*s
++);
607 val
= (val
<< 4) | c
;
616 * Call for chars after a backlash to decode the escape sequence.
618 * Stores the result in *ch.
620 * Returns the number of bytes consumed.
622 static int reg_decode_escape(const char *s
, int *ch
)
630 case 'b': *ch
= '\b'; break;
631 case 'e': *ch
= 27; break;
632 case 'f': *ch
= '\f'; break;
633 case 'n': *ch
= '\n'; break;
634 case 'r': *ch
= '\r'; break;
635 case 't': *ch
= '\t'; break;
636 case 'v': *ch
= '\v'; break;
639 /* Expect \u{NNNN} */
640 n
= parse_hex(s
+ 1, 6, ch
);
641 if (n
> 0 && s
[n
+ 1] == '}' && *ch
>= 0 && *ch
<= 0x1fffff) {
645 /* Invalid, so just treat as an escaped 'u' */
649 else if ((n
= parse_hex(s
, 4, ch
)) > 0) {
654 if ((n
= parse_hex(s
, 8, ch
)) > 0) {
659 if ((n
= parse_hex(s
, 2, ch
)) > 0) {
672 - regatom - the lowest level
674 * Optimization: gobbles an entire sequence of ordinary characters so that
675 * it can turn them into a single node, which is smaller to store and
676 * faster to run. Backslashed characters are exceptions, each becoming a
677 * separate node; the code is simpler that way and it's not worth fixing.
679 static int regatom(regex_t
*preg
, int *flagp
)
683 int nocase
= (preg
->cflags
& REG_ICASE
);
686 int n
= reg_utf8_tounicode_case(preg
->regparse
, &ch
, nocase
);
688 *flagp
= WORST
; /* Tentatively. */
692 /* FIXME: these chars only have meaning at beg/end of pat? */
694 ret
= regnode(preg
, BOL
);
697 ret
= regnode(preg
, EOL
);
700 ret
= regnode(preg
, ANY
);
701 *flagp
|= HASWIDTH
|SIMPLE
;
704 const char *pattern
= preg
->regparse
;
706 if (*pattern
== '^') { /* Complement of range. */
707 ret
= regnode(preg
, ANYBUT
);
710 ret
= regnode(preg
, ANYOF
);
712 /* Special case. If the first char is ']' or '-', it is part of the set */
713 if (*pattern
== ']' || *pattern
== '-') {
714 reg_addrange(preg
, *pattern
, *pattern
);
718 while (*pattern
&& *pattern
!= ']') {
719 /* Is this a range? a-z */
723 pattern
+= reg_utf8_tounicode_case(pattern
, &start
, nocase
);
725 pattern
+= reg_decode_escape(pattern
, &start
);
727 preg
->err
= REG_ERR_NULL_CHAR
;
731 if (pattern
[0] == '-' && pattern
[1] && pattern
[1] != ']') {
733 pattern
+= utf8_tounicode(pattern
, &end
);
734 pattern
+= reg_utf8_tounicode_case(pattern
, &end
, nocase
);
736 pattern
+= reg_decode_escape(pattern
, &end
);
738 preg
->err
= REG_ERR_NULL_CHAR
;
743 reg_addrange(preg
, start
, end
);
746 if (start
== '[' && pattern
[0] == ':') {
747 static const char *character_class
[] = {
748 ":alpha:", ":alnum:", ":space:", ":blank:", ":upper:", ":lower:",
749 ":digit:", ":xdigit:", ":cntrl:", ":graph:", ":print:", ":punct:",
752 CC_ALPHA
, CC_ALNUM
, CC_SPACE
, CC_BLANK
, CC_UPPER
, CC_LOWER
,
753 CC_DIGIT
, CC_XDIGIT
, CC_CNTRL
, CC_GRAPH
, CC_PRINT
, CC_PUNCT
,
758 for (i
= 0; i
< CC_NUM
; i
++) {
759 n
= strlen(character_class
[i
]);
760 if (strncmp(pattern
, character_class
[i
], n
) == 0) {
761 /* Found a character class */
769 reg_addrange(preg
, '0', '9');
772 if ((preg
->cflags
& REG_ICASE
) == 0) {
773 reg_addrange(preg
, 'a', 'z');
775 reg_addrange(preg
, 'A', 'Z');
778 reg_addrange_str(preg
, " \t\r\n\f\v");
781 reg_addrange_str(preg
, " \t");
784 reg_addrange(preg
, 'A', 'Z');
787 reg_addrange(preg
, 'a', 'z');
790 reg_addrange(preg
, 'a', 'f');
791 reg_addrange(preg
, 'A', 'F');
794 reg_addrange(preg
, '0', '9');
797 reg_addrange(preg
, 0, 31);
798 reg_addrange(preg
, 127, 127);
801 reg_addrange(preg
, ' ', '~');
804 reg_addrange(preg
, '!', '~');
807 reg_addrange(preg
, '!', '/');
808 reg_addrange(preg
, ':', '@');
809 reg_addrange(preg
, '[', '`');
810 reg_addrange(preg
, '{', '~');
816 /* Not a range, so just add the char */
817 reg_addrange(preg
, start
, start
);
824 preg
->regparse
= pattern
;
826 *flagp
|= HASWIDTH
|SIMPLE
;
830 ret
= reg(preg
, 1, &flags
);
833 *flagp
|= flags
&(HASWIDTH
|SPSTART
);
838 preg
->err
= REG_ERR_INTERNAL
;
839 return 0; /* Supposed to be caught earlier. */
844 preg
->err
= REG_ERR_COUNT_FOLLOWS_NOTHING
;
847 ch
= *preg
->regparse
++;
850 preg
->err
= REG_ERR_TRAILING_BACKSLASH
;
853 ret
= regnode(preg
, BOLX
);
856 ret
= regnode(preg
, EOLX
);
860 ret
= regnode(preg
, WORDA
);
864 ret
= regnode(preg
, WORDZ
);
868 ret
= regnode(preg
, ch
== 'd' ? ANYOF
: ANYBUT
);
869 reg_addrange(preg
, '0', '9');
871 *flagp
|= HASWIDTH
|SIMPLE
;
875 ret
= regnode(preg
, ch
== 'w' ? ANYOF
: ANYBUT
);
876 if ((preg
->cflags
& REG_ICASE
) == 0) {
877 reg_addrange(preg
, 'a', 'z');
879 reg_addrange(preg
, 'A', 'Z');
880 reg_addrange(preg
, '0', '9');
881 reg_addrange(preg
, '_', '_');
883 *flagp
|= HASWIDTH
|SIMPLE
;
887 ret
= regnode(preg
, ch
== 's' ? ANYOF
: ANYBUT
);
888 reg_addrange_str(preg
," \t\r\n\f\v");
890 *flagp
|= HASWIDTH
|SIMPLE
;
892 /* FIXME: Someday handle \1, \2, ... */
894 /* Handle general quoted chars in exact-match routine */
895 /* Back up to include the backslash */
903 * Encode a string of characters to be matched exactly.
907 /* Back up to pick up the first char of interest */
910 ret
= regnode(preg
, EXACTLY
);
912 /* Note that a META operator such as ? or * consumes the
914 * Thus we must be careful to look ahead by 2 and add the
915 * last char as it's own EXACTLY if necessary
918 /* Until end of string or a META char is reached */
919 while (*preg
->regparse
&& strchr(META
, *preg
->regparse
) == NULL
) {
920 n
= reg_utf8_tounicode_case(preg
->regparse
, &ch
, (preg
->cflags
& REG_ICASE
));
921 if (ch
== '\\' && preg
->regparse
[n
]) {
922 /* Non-trailing backslash.
923 * Is this a special escape, or a regular escape?
925 if (strchr("<>mMwWdDsSAZ", preg
->regparse
[n
])) {
926 /* A special escape. All done with EXACTLY */
929 /* Decode it. Note that we add the length for the escape
930 * sequence to the length for the backlash so we can skip
931 * the entire sequence, or not as required.
933 n
+= reg_decode_escape(preg
->regparse
+ n
, &ch
);
935 preg
->err
= REG_ERR_NULL_CHAR
;
940 /* Now we have one char 'ch' of length 'n'.
941 * Check to see if the following char is a MULT
944 if (ISMULT(preg
->regparse
[n
])) {
945 /* Yes. But do we already have some EXACTLY chars? */
947 /* Yes, so return what we have and pick up the current char next time around */
950 /* No, so add this single char and finish */
957 /* No, so just add this char normally */
975 static void reg_grow(regex_t
*preg
, int n
)
977 if (preg
->p
+ n
>= preg
->proglen
) {
978 preg
->proglen
= (preg
->p
+ n
) * 2;
979 preg
->program
= realloc(preg
->program
, preg
->proglen
* sizeof(int));
984 - regnode - emit a node
987 static int regnode(regex_t
*preg
, int op
)
991 /* The OP followed by a next pointer */
992 preg
->program
[preg
->p
++] = op
;
993 preg
->program
[preg
->p
++] = 0;
995 /* Return the start of the node */
1000 - regc - emit (if appropriate) a byte of code
1002 static void regc(regex_t
*preg
, int b
)
1005 preg
->program
[preg
->p
++] = b
;
1009 - reginsert - insert an operator in front of already-emitted operand
1011 * Means relocating the operand.
1012 * Returns the new location of the original operand.
1014 static int reginsert(regex_t
*preg
, int op
, int size
, int opnd
)
1016 reg_grow(preg
, size
);
1018 /* Move everything from opnd up */
1019 memmove(preg
->program
+ opnd
+ size
, preg
->program
+ opnd
, sizeof(int) * (preg
->p
- opnd
));
1020 /* Zero out the new space */
1021 memset(preg
->program
+ opnd
, 0, sizeof(int) * size
);
1023 preg
->program
[opnd
] = op
;
1031 - regtail - set the next-pointer at the end of a node chain
1033 static void regtail(regex_t
*preg
, int p
, int val
)
1039 /* Find last node. */
1042 temp
= regnext(preg
, scan
);
1048 if (OP(preg
, scan
) == BACK
)
1049 offset
= scan
- val
;
1051 offset
= val
- scan
;
1053 preg
->program
[scan
+ 1] = offset
;
1057 - regoptail - regtail on operand of first argument; nop if operandless
1060 static void regoptail(regex_t
*preg
, int p
, int val
)
1062 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1063 if (p
!= 0 && OP(preg
, p
) == BRANCH
) {
1064 regtail(preg
, OPERAND(p
), val
);
1069 * regexec and friends
1075 static int regtry(regex_t
*preg
, const char *string
);
1076 static int regmatch(regex_t
*preg
, int prog
);
1077 static int regrepeat(regex_t
*preg
, int p
, int max
);
1080 - regexec - match a regexp against a string
1082 int regexec(regex_t
*preg
, const char *string
, size_t nmatch
, regmatch_t pmatch
[], int eflags
)
1087 /* Be paranoid... */
1088 if (preg
== NULL
|| preg
->program
== NULL
|| string
== NULL
) {
1089 return REG_ERR_NULL_ARGUMENT
;
1092 /* Check validity of program. */
1093 if (*preg
->program
!= REG_MAGIC
) {
1094 return REG_ERR_CORRUPTED
;
1098 fprintf(stderr
, "regexec: %s\n", string
);
1102 preg
->eflags
= eflags
;
1103 preg
->pmatch
= pmatch
;
1104 preg
->nmatch
= nmatch
;
1105 preg
->start
= string
; /* All offsets are computed from here */
1107 /* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */
1108 for (scan
= OPERAND(1); scan
!= 0; scan
+= regopsize(preg
, scan
)) {
1109 int op
= OP(preg
, scan
);
1112 if (op
== REPX
|| op
== REPXMIN
)
1113 preg
->program
[scan
+ 4] = 0;
1116 /* If there is a "must appear" string, look for it. */
1117 if (preg
->regmust
!= 0) {
1119 while ((s
= str_find(s
, preg
->program
[preg
->regmust
], preg
->cflags
& REG_ICASE
)) != NULL
) {
1120 if (prefix_cmp(preg
->program
+ preg
->regmust
, preg
->regmlen
, s
, preg
->cflags
& REG_ICASE
) >= 0) {
1125 if (s
== NULL
) /* Not present. */
1129 /* Mark beginning of line for ^ . */
1130 preg
->regbol
= string
;
1132 /* Simplest case: anchored match need be tried only once (maybe per line). */
1133 if (preg
->reganch
) {
1134 if (eflags
& REG_NOTBOL
) {
1135 /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1139 if (regtry(preg
, string
)) {
1144 if (preg
->cflags
& REG_NEWLINE
) {
1145 /* Try the next anchor? */
1146 string
= strchr(string
, '\n');
1148 preg
->regbol
= ++string
;
1157 /* Messy cases: unanchored match. */
1159 if (preg
->regstart
!= '\0') {
1160 /* We know what char it must start with. */
1161 while ((s
= str_find(s
, preg
->regstart
, preg
->cflags
& REG_ICASE
)) != NULL
) {
1162 if (regtry(preg
, s
))
1168 /* We don't -- general case. */
1170 if (regtry(preg
, s
))
1177 s
+= utf8_tounicode(s
, &c
);
1186 - regtry - try match at specific point
1188 /* 0 failure, 1 success */
1189 static int regtry( regex_t
*preg
, const char *string
)
1193 preg
->reginput
= string
;
1195 for (i
= 0; i
< preg
->nmatch
; i
++) {
1196 preg
->pmatch
[i
].rm_so
= -1;
1197 preg
->pmatch
[i
].rm_eo
= -1;
1199 if (regmatch(preg
, 1)) {
1200 preg
->pmatch
[0].rm_so
= string
- preg
->start
;
1201 preg
->pmatch
[0].rm_eo
= preg
->reginput
- preg
->start
;
1208 * Returns bytes matched if 'pattern' is a prefix of 'string'.
1210 * If 'nocase' is non-zero, does a case-insensitive match.
1212 * Returns -1 on not found.
1214 static int prefix_cmp(const int *prog
, int proglen
, const char *string
, int nocase
)
1216 const char *s
= string
;
1217 while (proglen
&& *s
) {
1219 int n
= reg_utf8_tounicode_case(s
, &ch
, nocase
);
1234 * Searchs for 'c' in the range 'range'.
1236 * Returns 1 if found, or 0 if not.
1238 static int reg_range_find(const int *range
, int c
)
1241 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1242 if (c
>= range
[1] && c
<= (range
[0] + range
[1] - 1)) {
1251 * Search for the character 'c' in the utf-8 string 'string'.
1253 * If 'nocase' is set, the 'string' is assumed to be uppercase
1254 * and 'c' is converted to uppercase before matching.
1256 * Returns the byte position in the string where the 'c' was found, or
1257 * NULL if not found.
1259 static const char *str_find(const char *string
, int c
, int nocase
)
1262 /* The "string" should already be converted to uppercase */
1267 int n
= reg_utf8_tounicode_case(string
, &ch
, nocase
);
1277 * Returns true if 'ch' is an end-of-line char.
1279 * In REG_NEWLINE mode, \n is considered EOL in
1282 static int reg_iseol(regex_t
*preg
, int ch
)
1284 if (preg
->cflags
& REG_NEWLINE
) {
1285 return ch
== '\0' || ch
== '\n';
1292 static int regmatchsimplerepeat(regex_t
*preg
, int scan
, int matchmin
)
1299 int max
= preg
->program
[scan
+ 2];
1300 int min
= preg
->program
[scan
+ 3];
1301 int next
= regnext(preg
, scan
);
1304 * Lookahead to avoid useless match attempts
1305 * when we know what character comes next.
1307 if (OP(preg
, next
) == EXACTLY
) {
1308 nextch
= preg
->program
[OPERAND(next
)];
1310 save
= preg
->reginput
;
1311 no
= regrepeat(preg
, scan
+ 5, max
);
1316 /* from min up to no */
1320 /* else from no down to min */
1332 preg
->reginput
= save
+ utf8_index(save
, no
);
1333 reg_utf8_tounicode_case(preg
->reginput
, &c
, (preg
->cflags
& REG_ICASE
));
1334 /* If it could work, try it. */
1335 if (reg_iseol(preg
, nextch
) || c
== nextch
) {
1336 if (regmatch(preg
, next
)) {
1341 /* Couldn't or didn't, add one more */
1345 /* Couldn't or didn't -- back up. */
1352 static int regmatchrepeat(regex_t
*preg
, int scan
, int matchmin
)
1354 int *scanpt
= preg
->program
+ scan
;
1356 int max
= scanpt
[2];
1357 int min
= scanpt
[3];
1359 /* Have we reached min? */
1360 if (scanpt
[4] < min
) {
1361 /* No, so get another one */
1363 if (regmatch(preg
, scan
+ 5)) {
1369 if (scanpt
[4] > max
) {
1374 /* minimal, so try other branch first */
1375 if (regmatch(preg
, regnext(preg
, scan
))) {
1378 /* No, so try one more */
1380 if (regmatch(preg
, scan
+ 5)) {
1386 /* maximal, so try this branch again */
1387 if (scanpt
[4] < max
) {
1389 if (regmatch(preg
, scan
+ 5)) {
1394 /* At this point we are at max with no match. Try the other branch */
1395 return regmatch(preg
, regnext(preg
, scan
));
1399 - regmatch - main matching routine
1401 * Conceptually the strategy is simple: check to see whether the current
1402 * node matches, call self recursively to see whether the rest matches,
1403 * and then act accordingly. In practice we make some effort to avoid
1404 * recursion, in particular by going through "ordinary" nodes (that don't
1405 * need to know whether the rest of the match failed) by a loop instead of
1408 /* 0 failure, 1 success */
1409 static int regmatch(regex_t
*preg
, int prog
)
1411 int scan
; /* Current node. */
1412 int next
; /* Next node. */
1418 if (scan
!= 0 && regnarrate
)
1419 fprintf(stderr
, "%s(\n", regprop(scan
));
1426 fprintf(stderr
, "%3d: %s...\n", scan
, regprop(OP(preg
, scan
))); /* Where, what. */
1429 next
= regnext(preg
, scan
);
1430 n
= reg_utf8_tounicode_case(preg
->reginput
, &c
, (preg
->cflags
& REG_ICASE
));
1432 switch (OP(preg
, scan
)) {
1434 if ((preg
->eflags
& REG_NOTBOL
)) {
1439 if (preg
->reginput
!= preg
->regbol
) {
1445 /* For EOLX, only match real end of line, not newline */
1450 if (!reg_iseol(preg
, c
)) {
1455 /* Must be looking at a letter, digit, or _ */
1456 if ((!isalnum(UCHAR(c
))) && c
!= '_')
1458 /* Prev must be BOL or nonword */
1459 if (preg
->reginput
> preg
->regbol
&&
1460 (isalnum(UCHAR(preg
->reginput
[-1])) || preg
->reginput
[-1] == '_'))
1464 /* Can't match at BOL */
1465 if (preg
->reginput
> preg
->regbol
) {
1466 /* Current must be EOL or nonword */
1467 if (reg_iseol(preg
, c
) || !isalnum(UCHAR(c
)) || c
!= '_') {
1468 c
= preg
->reginput
[-1];
1469 /* Previous must be word */
1470 if (isalnum(UCHAR(c
)) || c
== '_') {
1479 if (reg_iseol(preg
, c
))
1481 preg
->reginput
+= n
;
1488 opnd
= OPERAND(scan
);
1489 len
= str_int_len(preg
->program
+ opnd
);
1491 slen
= prefix_cmp(preg
->program
+ opnd
, len
, preg
->reginput
, preg
->cflags
& REG_ICASE
);
1495 preg
->reginput
+= slen
;
1499 if (reg_iseol(preg
, c
) || reg_range_find(preg
->program
+ OPERAND(scan
), c
) == 0) {
1502 preg
->reginput
+= n
;
1505 if (reg_iseol(preg
, c
) || reg_range_find(preg
->program
+ OPERAND(scan
), c
) != 0) {
1508 preg
->reginput
+= n
;
1515 if (OP(preg
, next
) != BRANCH
) /* No choice. */
1516 next
= OPERAND(scan
); /* Avoid recursion. */
1519 save
= preg
->reginput
;
1520 if (regmatch(preg
, OPERAND(scan
))) {
1523 preg
->reginput
= save
;
1524 scan
= regnext(preg
, scan
);
1525 } while (scan
!= 0 && OP(preg
, scan
) == BRANCH
);
1532 return regmatchsimplerepeat(preg
, scan
, OP(preg
, scan
) == REPMIN
);
1536 return regmatchrepeat(preg
, scan
, OP(preg
, scan
) == REPXMIN
);
1539 return 1; /* Success! */
1543 return regmatch(preg
, next
);
1546 if (OP(preg
, scan
) >= OPEN
+1 && OP(preg
, scan
) < CLOSE_END
) {
1547 save
= preg
->reginput
;
1548 if (regmatch(preg
, next
)) {
1549 if (OP(preg
, scan
) < CLOSE
) {
1550 int no
= OP(preg
, scan
) - OPEN
;
1551 if (no
< preg
->nmatch
&& preg
->pmatch
[no
].rm_so
== -1) {
1552 preg
->pmatch
[no
].rm_so
= save
- preg
->start
;
1556 int no
= OP(preg
, scan
) - CLOSE
;
1557 if (no
< preg
->nmatch
&& preg
->pmatch
[no
].rm_eo
== -1) {
1558 preg
->pmatch
[no
].rm_eo
= save
- preg
->start
;
1565 return REG_ERR_INTERNAL
;
1572 * We get here only if there's trouble -- normally "case END" is
1573 * the terminating point.
1575 return REG_ERR_INTERNAL
;
1579 - regrepeat - repeatedly match something simple, report how many
1581 static int regrepeat(regex_t
*preg
, int p
, int max
)
1589 scan
= preg
->reginput
;
1591 switch (OP(preg
, p
)) {
1593 /* No need to handle utf8 specially here */
1594 while (!reg_iseol(preg
, *scan
) && count
< max
) {
1600 while (count
< max
) {
1601 n
= reg_utf8_tounicode_case(scan
, &ch
, preg
->cflags
& REG_ICASE
);
1602 if (preg
->program
[opnd
] != ch
) {
1610 while (count
< max
) {
1611 n
= reg_utf8_tounicode_case(scan
, &ch
, preg
->cflags
& REG_ICASE
);
1612 if (reg_iseol(preg
, ch
) || reg_range_find(preg
->program
+ opnd
, ch
) == 0) {
1620 while (count
< max
) {
1621 n
= reg_utf8_tounicode_case(scan
, &ch
, preg
->cflags
& REG_ICASE
);
1622 if (reg_iseol(preg
, ch
) || reg_range_find(preg
->program
+ opnd
, ch
) != 0) {
1629 default: /* Oh dear. Called inappropriately. */
1630 preg
->err
= REG_ERR_INTERNAL
;
1631 count
= 0; /* Best compromise. */
1634 preg
->reginput
= scan
;
1640 - regnext - dig the "next" pointer out of a node
1642 static int regnext(regex_t
*preg
, int p
)
1646 offset
= NEXT(preg
, p
);
1651 if (OP(preg
, p
) == BACK
)
1658 - regopsize - returns the size of opcode + operands at 'p' in words
1660 static int regopsize(regex_t
*preg
, int p
)
1662 /* Almost all opcodes are 2 words, but some are more */
1663 switch (OP(preg
, p
)) {
1674 while (preg
->program
[s
++]) {
1682 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1685 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1687 static void regdump(regex_t
*preg
)
1690 int op
= EXACTLY
; /* Arbitrary non-END op. */
1692 char buf
[MAX_UTF8_LEN
+ 1];
1695 for (i
= 1; i
< preg
->p
; i
++) {
1696 printf("%02x ", (unsigned char)preg
->program
[i
]);
1704 while (op
!= END
&& s
< preg
->p
) { /* While that wasn't END last time... */
1706 printf("%3d: %s", s
, regprop(op
)); /* Where, what. */
1707 next
= regnext(preg
, s
);
1708 if (next
== 0) /* Next ptr. */
1711 printf("(%d)", next
);
1713 if (op
== REP
|| op
== REPMIN
|| op
== REPX
|| op
== REPXMIN
) {
1714 int max
= preg
->program
[s
];
1715 int min
= preg
->program
[s
+ 1];
1717 printf("{%d,*}", min
);
1720 printf("{%d,%d}", min
, max
);
1722 printf(" %d", preg
->program
[s
+ 2]);
1725 else if (op
== ANYOF
|| op
== ANYBUT
) {
1728 while (preg
->program
[s
]) {
1729 int len
= preg
->program
[s
++];
1730 int first
= preg
->program
[s
++];
1731 buf
[utf8_getchars(buf
, first
)] = 0;
1734 buf
[utf8_getchars(buf
, first
+ len
- 1)] = 0;
1740 else if (op
== EXACTLY
) {
1741 /* Literal string, where present. */
1743 while (preg
->program
[s
]) {
1744 buf
[utf8_getchars(buf
, preg
->program
[s
])] = 0;
1754 /* Header fields of interest. */
1755 if (preg
->regstart
) {
1756 buf
[utf8_getchars(buf
, preg
->regstart
)] = 0;
1757 printf("start '%s' ", buf
);
1760 printf("anchored ");
1761 if (preg
->regmust
!= 0) {
1763 printf("must have:");
1764 for (i
= 0; i
< preg
->regmlen
; i
++) {
1765 putchar(preg
->program
[preg
->regmust
+ i
]);
1774 - regprop - printable representation of opcode
1776 static const char *regprop( int op
)
1778 static char buf
[50];
1822 if (op
>= OPEN
&& op
< CLOSE
) {
1823 snprintf(buf
, sizeof(buf
), "OPEN%d", op
-OPEN
);
1825 else if (op
>= CLOSE
&& op
< CLOSE_END
) {
1826 snprintf(buf
, sizeof(buf
), "CLOSE%d", op
-CLOSE
);
1829 snprintf(buf
, sizeof(buf
), "?%d?\n", op
);
1834 #endif /* JIM_BOOTSTRAP */
1836 size_t regerror(int errcode
, const regex_t
*preg
, char *errbuf
, size_t errbuf_size
)
1838 static const char *error_strings
[] = {
1847 "parentheses () not balanced",
1848 "braces {} not balanced",
1849 "invalid repetition count(s)",
1854 "count follows nothing",
1855 "trailing backslash",
1856 "corrupted program",
1857 "contains null char",
1861 if (errcode
< 0 || errcode
>= REG_ERR_NUM
) {
1862 err
= "Bad error code";
1865 err
= error_strings
[errcode
];
1868 return snprintf(errbuf
, errbuf_size
, "%s", err
);
1871 void regfree(regex_t
*preg
)
1873 free(preg
->program
);