4 * regcomp and regexec -- regsub and regerror are elsewhere
6 * Copyright (c) 1986 by University of Toronto.
7 * Written by Henry Spencer. Not derived from licensed software.
9 * Permission is granted to anyone to use this software for any
10 * purpose on any computer system, and to redistribute it freely,
11 * subject to the following restrictions:
13 * 1. The author is not responsible for the consequences of use of
14 * this software, no matter how awful, even if they arise
17 * 2. The origin of this software must not be misrepresented, either
18 * by explicit claim or by omission.
20 * 3. Altered versions must be plainly marked as such, and must not
21 * be misrepresented as being the original software.
22 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
23 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
24 *** to assist in implementing egrep.
25 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
26 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
27 *** as in BSD grep and ex.
28 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
29 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
30 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
31 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
32 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
33 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
34 *** was moved into regexp.h, and the include of regexp.h now uses "'s
35 *** to avoid conflicting with the system regexp.h. Const, bless its
36 *** soul, was removed so it can compile everywhere. The declaration
37 *** of strchr() was in conflict on AIX, so it was removed (as it is
38 *** happily defined in string.h).
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
41 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
42 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
44 * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
45 * on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
46 * This includes counted repetitions, UTF-8 support, character classes,
47 * shorthand character classes, increased number of parentheses to 100,
48 * backslash escape sequences. It also removes \n as an alternative to |.
50 * Beware that some of this code is subtly aware of the way operator
51 * precedence is structured in regular expressions. Serious changes in
52 * regular-expression syntax might require a total rethink.
55 #include "jimautoconf.h"
57 #if defined(JIM_REGEXP)
64 #include "jimregexp.h"
67 /* An arbitrary limit, but this seems enough. Must be less than 1000. */
68 #define REG_MAX_PAREN 100
71 * Structure for regexp "program". This is essentially a linear encoding
72 * of a nondeterministic finite-state machine (aka syntax charts or
73 * "railroad normal form" in parsing technology). Each node is an opcode
74 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
75 * all nodes except BRANCH implement concatenation; a "next" pointer with
76 * a BRANCH on both ends of it is connecting two alternatives. (Here we
77 * have one of the subtle syntax dependencies: an individual BRANCH (as
78 * opposed to a collection of them) is never concatenated with anything
79 * because of operator precedence.) The operand of some types of node is
80 * a literal string; for others, it is a node leading into a sub-FSM. In
81 * particular, the operand of a BRANCH node is the first node of the branch.
82 * (NB this is *not* a tree structure: the tail of the branch connects
83 * to the thing following the set of BRANCHes.) The opcodes are:
86 /* definition number opnd? meaning */
87 #define END 0 /* no End of program. */
88 #define BOL 1 /* no Match "" at beginning of line. */
89 #define EOL 2 /* no Match "" at end of line. */
90 #define ANY 3 /* no Match any one character. */
91 #define ANYOF 4 /* str Match any character in this string. */
92 #define ANYBUT 5 /* str Match any character not in this string. */
93 #define BRANCH 6 /* node Match this alternative, or the next... */
94 #define BACK 7 /* no Match "", "next" ptr points backward. */
95 #define EXACTLY 8 /* str Match this string. */
96 #define NOTHING 9 /* no Match empty string. */
97 #define REP 10 /* max,min Match this (simple) thing [min,max] times. */
98 #define REPMIN 11 /* max,min Match this (simple) thing [min,max] times, minimal match. */
99 #define REPX 12 /* max,min Match this (complex) thing [min,max] times. */
100 #define REPXMIN 13 /* max,min Match this (complex) thing [min,max] times, minimal match. */
101 #define BOLX 14 /* no Match "" at beginning of input. */
102 #define EOLX 15 /* no Match "" at end of input. */
103 #define WORDA 16 /* no Match "" at wordchar, where prev is nonword */
104 #define WORDZ 17 /* no Match "" at nonwordchar, where prev is word */
106 #define OPENNC 1000 /* no Non-capturing parentheses - must be OPEN-1 */
107 #define OPEN 1001 /* no Mark this point in input as start of #n. */
108 /* OPEN+1 is number 1, etc. */
110 /* must not be any other opts between OPEN and CLOSE */
112 #define CLOSENC 2000 /* no Non-capturing parentheses - must be CLOSE-1 */
113 #define CLOSE 2001 /* no Analogous to OPEN. */
114 #define CLOSE_END (CLOSE+REG_MAX_PAREN)
117 * The first word of the regexp internal "program" is actually this magic
118 * number; the start node begins in the second word.
120 #define REG_MAGIC 0xFADED00D
125 * BRANCH The set of branches constituting a single choice are hooked
126 * together with their "next" pointers, since precedence prevents
127 * anything being concatenated to any individual branch. The
128 * "next" pointer of the last BRANCH in a choice points to the
129 * thing following the whole choice. This is also where the
130 * final "next" pointer of each individual branch points; each
131 * branch starts with the operand node of a BRANCH node.
133 * BACK Normal "next" pointers all implicitly point forward; BACK
134 * exists to make loop structures possible.
136 * REP,REPX Repeated matches ('?', '*', '+' and {min,max}) are implemented
137 * as either simple repeats (REP) or complex repeats (REPX).
138 * These opcodes include a "min" and "max" count after the opcode.
139 * This is followed by a fourth "current count" word that is
140 * only used by REPX, as it implements a recursive match.
141 * REPMIN and REPXMIN are identical except they implement minimal repeats.
143 * OPEN,CLOSE ...are numbered at compile time.
147 * A node is one word of opcode followed by one word of "next" pointer.
148 * The "next" pointer value is a positive offset from the opcode of the node
150 * An operand, if any, simply follows the node. (Note that much of the
151 * code generation knows about this implicit relationship.)
153 #define OP(preg, p) (preg->program[p])
154 #define NEXT(preg, p) (preg->program[p + 1])
155 #define OPERAND(p) ((p) + 2)
158 * See regmagic.h for one further detail of program structure.
163 * Utility definitions.
166 #define FAIL(R,M) { (R)->err = (M); return (M); }
167 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
168 #define META "^$.[()|?{+*"
171 * Flags to be passed up and down.
173 #define HASWIDTH 1 /* Known never to match null string. */
174 #define SIMPLE 2 /* Simple enough to be STAR/PLUS operand. */
175 #define SPSTART 4 /* Starts with * or +. */
176 #define WORST 0 /* Worst case. */
178 #define MAX_REP_COUNT 1000000
181 * Forward declarations for regcomp()'s friends.
183 static int reg(regex_t
*preg
, int paren
/* Parenthesized? */, int *flagp
);
184 static int regpiece(regex_t
*preg
, int *flagp
);
185 static int regbranch(regex_t
*preg
, int *flagp
);
186 static int regatom(regex_t
*preg
, int *flagp
);
187 static int regnode(regex_t
*preg
, int op
);
188 static int regnext(regex_t
*preg
, int p
);
189 static void regc(regex_t
*preg
, int b
);
190 static int reginsert(regex_t
*preg
, int op
, int size
, int opnd
);
191 static void regtail(regex_t
*preg
, int p
, int val
);
192 static void regoptail(regex_t
*preg
, int p
, int val
);
193 static int regopsize(regex_t
*preg
, int p
);
195 static int reg_range_find(const int *string
, int c
);
196 static const char *str_find(const char *string
, int c
, int nocase
);
197 static int prefix_cmp(const int *prog
, int proglen
, const char *string
, int nocase
);
201 static int regnarrate
= 0;
202 static void regdump(regex_t
*preg
);
203 static const char *regprop( int op
);
208 * Returns the length of the null-terminated integer sequence.
210 static int str_int_len(const int *seq
)
220 - regcomp - compile a regular expression into internal code
222 * We can't allocate space until we know how big the compiled form will be,
223 * but we can't compile it (and thus know how big it is) until we've got a
224 * place to put the code. So we cheat: we compile it twice, once with code
225 * generation turned off and size counting turned on, and once "for real".
226 * This also means that we don't allocate space until we are sure that the
227 * thing really will compile successfully, and we never have to move the
228 * code and thus invalidate pointers into it. (Note that it has to be in
229 * one piece because free() must be able to free it all.)
231 * Beware that the optimization-preparation code in here knows about some
232 * of the structure of the compiled regexp.
234 int regcomp(regex_t
*preg
, const char *exp
, int cflags
)
242 fprintf(stderr
, "Compiling: '%s'\n", exp
);
244 memset(preg
, 0, sizeof(*preg
));
247 FAIL(preg
, REG_ERR_NULL_ARGUMENT
);
249 /* First pass: determine size, legality. */
250 preg
->cflags
= cflags
;
251 preg
->regparse
= exp
;
253 /* Allocate space. */
254 preg
->proglen
= (strlen(exp
) + 1) * 5;
255 preg
->program
= malloc(preg
->proglen
* sizeof(int));
256 if (preg
->program
== NULL
)
257 FAIL(preg
, REG_ERR_NOMEM
);
259 /* Note that since we store a magic value as the first item in the program,
260 * program offsets will never be 0
262 regc(preg
, REG_MAGIC
);
263 if (reg(preg
, 0, &flags
) == 0) {
267 /* Small enough for pointer-storage convention? */
268 if (preg
->re_nsub
>= REG_MAX_PAREN
) /* Probably could be 65535L. */
269 FAIL(preg
,REG_ERR_TOO_BIG
);
271 /* Dig out information for optimizations. */
272 preg
->regstart
= 0; /* Worst-case defaults. */
276 scan
= 1; /* First BRANCH. */
277 if (OP(preg
, regnext(preg
, scan
)) == END
) { /* Only one top-level choice. */
278 scan
= OPERAND(scan
);
280 /* Starting-point info. */
281 if (OP(preg
, scan
) == EXACTLY
) {
282 preg
->regstart
= preg
->program
[OPERAND(scan
)];
284 else if (OP(preg
, scan
) == BOL
)
288 * If there's something expensive in the r.e., find the
289 * longest literal string that must appear and make it the
290 * regmust. Resolve ties in favor of later strings, since
291 * the regstart check works with the beginning of the r.e.
292 * and avoiding duplication strengthens checking. Not a
293 * strong reason, but sufficient in the absence of others.
298 for (; scan
!= 0; scan
= regnext(preg
, scan
)) {
299 if (OP(preg
, scan
) == EXACTLY
) {
300 int plen
= str_int_len(preg
->program
+ OPERAND(scan
));
302 longest
= OPERAND(scan
);
307 preg
->regmust
= longest
;
320 - reg - regular expression, i.e. main body or parenthesized thing
322 * Caller must absorb opening parenthesis.
324 * Combining parenthesis handling with the base level of regular expression
325 * is a trifle forced, but the need to tie the tails of the branches to what
326 * follows makes it hard to avoid.
328 static int reg(regex_t
*preg
, int paren
/* Parenthesized? */, int *flagp
)
336 *flagp
= HASWIDTH
; /* Tentatively. */
338 /* Make an OPEN node, if parenthesized. */
340 if (preg
->regparse
[0] == '?' && preg
->regparse
[1] == ':') {
341 /* non-capturing paren */
346 parno
= ++preg
->re_nsub
;
348 ret
= regnode(preg
, OPEN
+parno
);
352 /* Pick up the branches, linking them together. */
353 br
= regbranch(preg
, &flags
);
357 regtail(preg
, ret
, br
); /* OPEN -> first. */
360 if (!(flags
&HASWIDTH
))
362 *flagp
|= flags
&SPSTART
;
363 while (*preg
->regparse
== '|') {
365 br
= regbranch(preg
, &flags
);
368 regtail(preg
, ret
, br
); /* BRANCH -> BRANCH. */
369 if (!(flags
&HASWIDTH
))
371 *flagp
|= flags
&SPSTART
;
374 /* Make a closing node, and hook it on the end. */
375 ender
= regnode(preg
, (paren
) ? CLOSE
+parno
: END
);
376 regtail(preg
, ret
, ender
);
378 /* Hook the tails of the branches to the closing node. */
379 for (br
= ret
; br
!= 0; br
= regnext(preg
, br
))
380 regoptail(preg
, br
, ender
);
382 /* Check for proper termination. */
383 if (paren
&& *preg
->regparse
++ != ')') {
384 preg
->err
= REG_ERR_UNMATCHED_PAREN
;
386 } else if (!paren
&& *preg
->regparse
!= '\0') {
387 if (*preg
->regparse
== ')') {
388 preg
->err
= REG_ERR_UNMATCHED_PAREN
;
391 preg
->err
= REG_ERR_JUNK_ON_END
;
400 - regbranch - one alternative of an | operator
402 * Implements the concatenation operator.
404 static int regbranch(regex_t
*preg
, int *flagp
)
411 *flagp
= WORST
; /* Tentatively. */
413 ret
= regnode(preg
, BRANCH
);
415 while (*preg
->regparse
!= '\0' && *preg
->regparse
!= ')' &&
416 *preg
->regparse
!= '|') {
417 latest
= regpiece(preg
, &flags
);
420 *flagp
|= flags
&HASWIDTH
;
421 if (chain
== 0) {/* First piece. */
422 *flagp
|= flags
&SPSTART
;
425 regtail(preg
, chain
, latest
);
429 if (chain
== 0) /* Loop ran zero times. */
430 (void) regnode(preg
, NOTHING
);
436 - regpiece - something followed by possible [*+?]
438 * Note that the branching code sequences used for ? and the general cases
439 * of * and + are somewhat optimized: they use the same NOTHING node as
440 * both the endmarker for their branch list and the body of the last branch.
441 * It might seem that this node could be dispensed with entirely, but the
442 * endmarker role is not redundant.
444 static int regpiece(regex_t
*preg
, int *flagp
)
453 ret
= regatom(preg
, &flags
);
457 op
= *preg
->regparse
;
463 if (!(flags
&HASWIDTH
) && op
!= '?') {
464 preg
->err
= REG_ERR_OPERAND_COULD_BE_EMPTY
;
468 /* Handle braces (counted repetition) by expansion */
472 min
= strtoul(preg
->regparse
+ 1, &end
, 10);
473 if (end
== preg
->regparse
+ 1) {
474 preg
->err
= REG_ERR_BAD_COUNT
;
480 else if (*end
== '\0') {
481 preg
->err
= REG_ERR_UNMATCHED_BRACES
;
485 preg
->regparse
= end
;
486 max
= strtoul(preg
->regparse
+ 1, &end
, 10);
488 preg
->err
= REG_ERR_UNMATCHED_BRACES
;
492 if (end
== preg
->regparse
+ 1) {
495 else if (max
< min
|| max
>= 100) {
496 preg
->err
= REG_ERR_BAD_COUNT
;
500 preg
->err
= REG_ERR_BAD_COUNT
;
504 preg
->regparse
= strchr(preg
->regparse
, '}');
508 max
= (op
== '?' ? 1 : MAX_REP_COUNT
);
511 if (preg
->regparse
[1] == '?') {
513 next
= reginsert(preg
, flags
& SIMPLE
? REPMIN
: REPXMIN
, 5, ret
);
516 next
= reginsert(preg
, flags
& SIMPLE
? REP
: REPX
, 5, ret
);
518 preg
->program
[ret
+ 2] = max
;
519 preg
->program
[ret
+ 3] = min
;
520 preg
->program
[ret
+ 4] = 0;
522 *flagp
= (min
) ? (WORST
|HASWIDTH
) : (WORST
|SPSTART
);
524 if (!(flags
& SIMPLE
)) {
525 int back
= regnode(preg
, BACK
);
526 regtail(preg
, back
, ret
);
527 regtail(preg
, next
, back
);
531 if (ISMULT(*preg
->regparse
)) {
532 preg
->err
= REG_ERR_NESTED_COUNT
;
540 * Add all characters in the inclusive range between lower and upper.
542 * Handles a swapped range (upper < lower).
544 static void reg_addrange(regex_t
*preg
, int lower
, int upper
)
547 reg_addrange(preg
, upper
, lower
);
549 /* Add a range as length, start */
550 regc(preg
, upper
- lower
+ 1);
555 * Add a null-terminated literal string as a set of ranges.
557 static void reg_addrange_str(regex_t
*preg
, const char *str
)
560 reg_addrange(preg
, *str
, *str
);
566 * Extracts the next unicode char from utf8.
568 * If 'upper' is set, converts the char to uppercase.
570 static int reg_utf8_tounicode_case(const char *s
, int *uc
, int upper
)
572 int l
= utf8_tounicode(s
, uc
);
574 *uc
= utf8_upper(*uc
);
580 * Converts a hex digit to decimal.
582 * Returns -1 for an invalid hex digit.
584 static int hexdigitval(int c
)
586 if (c
>= '0' && c
<= '9')
588 if (c
>= 'a' && c
<= 'f')
590 if (c
>= 'A' && c
<= 'F')
596 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
598 * Returns the number of hex digits parsed.
599 * If there are no hex digits, returns 0 and stores nothing.
601 static int parse_hex(const char *s
, int n
, int *uc
)
606 for (k
= 0; k
< n
; k
++) {
607 int c
= hexdigitval(*s
++);
611 val
= (val
<< 4) | c
;
620 * Call for chars after a backlash to decode the escape sequence.
622 * Stores the result in *ch.
624 * Returns the number of bytes consumed.
626 static int reg_decode_escape(const char *s
, int *ch
)
634 case 'b': *ch
= '\b'; break;
635 case 'e': *ch
= 27; break;
636 case 'f': *ch
= '\f'; break;
637 case 'n': *ch
= '\n'; break;
638 case 'r': *ch
= '\r'; break;
639 case 't': *ch
= '\t'; break;
640 case 'v': *ch
= '\v'; break;
643 /* Expect \u{NNNN} */
644 n
= parse_hex(s
+ 1, 6, ch
);
645 if (n
> 0 && s
[n
+ 1] == '}' && *ch
>= 0 && *ch
<= 0x1fffff) {
649 /* Invalid, so just treat as an escaped 'u' */
653 else if ((n
= parse_hex(s
, 4, ch
)) > 0) {
658 if ((n
= parse_hex(s
, 8, ch
)) > 0) {
663 if ((n
= parse_hex(s
, 2, ch
)) > 0) {
676 - regatom - the lowest level
678 * Optimization: gobbles an entire sequence of ordinary characters so that
679 * it can turn them into a single node, which is smaller to store and
680 * faster to run. Backslashed characters are exceptions, each becoming a
681 * separate node; the code is simpler that way and it's not worth fixing.
683 static int regatom(regex_t
*preg
, int *flagp
)
687 int nocase
= (preg
->cflags
& REG_ICASE
);
690 int n
= reg_utf8_tounicode_case(preg
->regparse
, &ch
, nocase
);
692 *flagp
= WORST
; /* Tentatively. */
696 /* FIXME: these chars only have meaning at beg/end of pat? */
698 ret
= regnode(preg
, BOL
);
701 ret
= regnode(preg
, EOL
);
704 ret
= regnode(preg
, ANY
);
705 *flagp
|= HASWIDTH
|SIMPLE
;
708 const char *pattern
= preg
->regparse
;
710 if (*pattern
== '^') { /* Complement of range. */
711 ret
= regnode(preg
, ANYBUT
);
714 ret
= regnode(preg
, ANYOF
);
716 /* Special case. If the first char is ']' or '-', it is part of the set */
717 if (*pattern
== ']' || *pattern
== '-') {
718 reg_addrange(preg
, *pattern
, *pattern
);
722 while (*pattern
&& *pattern
!= ']') {
723 /* Is this a range? a-z */
728 CC_ALPHA
, CC_ALNUM
, CC_SPACE
, CC_BLANK
, CC_UPPER
, CC_LOWER
,
729 CC_DIGIT
, CC_XDIGIT
, CC_CNTRL
, CC_GRAPH
, CC_PRINT
, CC_PUNCT
,
734 pattern
+= reg_utf8_tounicode_case(pattern
, &start
, nocase
);
736 /* First check for class shorthand escapes */
748 reg_addrange(preg
, '_', '_');
752 pattern
+= reg_decode_escape(pattern
, &start
);
754 preg
->err
= REG_ERR_NULL_CHAR
;
758 if (pattern
[0] == '-' && pattern
[1] && pattern
[1] != ']') {
760 pattern
+= utf8_tounicode(pattern
, &end
);
761 pattern
+= reg_utf8_tounicode_case(pattern
, &end
, nocase
);
763 pattern
+= reg_decode_escape(pattern
, &end
);
765 preg
->err
= REG_ERR_NULL_CHAR
;
770 reg_addrange(preg
, start
, end
);
773 if (start
== '[' && pattern
[0] == ':') {
774 static const char *character_class
[] = {
775 ":alpha:", ":alnum:", ":space:", ":blank:", ":upper:", ":lower:",
776 ":digit:", ":xdigit:", ":cntrl:", ":graph:", ":print:", ":punct:",
779 for (cc
= 0; cc
< CC_NUM
; cc
++) {
780 n
= strlen(character_class
[cc
]);
781 if (strncmp(pattern
, character_class
[cc
], n
) == 0) {
782 /* Found a character class */
791 reg_addrange(preg
, '0', '9');
794 if ((preg
->cflags
& REG_ICASE
) == 0) {
795 reg_addrange(preg
, 'a', 'z');
797 reg_addrange(preg
, 'A', 'Z');
800 reg_addrange_str(preg
, " \t\r\n\f\v");
803 reg_addrange_str(preg
, " \t");
806 reg_addrange(preg
, 'A', 'Z');
809 reg_addrange(preg
, 'a', 'z');
812 reg_addrange(preg
, 'a', 'f');
813 reg_addrange(preg
, 'A', 'F');
816 reg_addrange(preg
, '0', '9');
819 reg_addrange(preg
, 0, 31);
820 reg_addrange(preg
, 127, 127);
823 reg_addrange(preg
, ' ', '~');
826 reg_addrange(preg
, '!', '~');
829 reg_addrange(preg
, '!', '/');
830 reg_addrange(preg
, ':', '@');
831 reg_addrange(preg
, '[', '`');
832 reg_addrange(preg
, '{', '~');
838 /* Not a range, so just add the char */
839 reg_addrange(preg
, start
, start
);
846 preg
->regparse
= pattern
;
848 *flagp
|= HASWIDTH
|SIMPLE
;
852 ret
= reg(preg
, 1, &flags
);
855 *flagp
|= flags
&(HASWIDTH
|SPSTART
);
860 preg
->err
= REG_ERR_INTERNAL
;
861 return 0; /* Supposed to be caught earlier. */
866 preg
->err
= REG_ERR_COUNT_FOLLOWS_NOTHING
;
869 ch
= *preg
->regparse
++;
872 preg
->err
= REG_ERR_TRAILING_BACKSLASH
;
875 ret
= regnode(preg
, BOLX
);
878 ret
= regnode(preg
, EOLX
);
882 ret
= regnode(preg
, WORDA
);
886 ret
= regnode(preg
, WORDZ
);
890 ret
= regnode(preg
, ch
== 'd' ? ANYOF
: ANYBUT
);
891 reg_addrange(preg
, '0', '9');
893 *flagp
|= HASWIDTH
|SIMPLE
;
897 ret
= regnode(preg
, ch
== 'w' ? ANYOF
: ANYBUT
);
898 if ((preg
->cflags
& REG_ICASE
) == 0) {
899 reg_addrange(preg
, 'a', 'z');
901 reg_addrange(preg
, 'A', 'Z');
902 reg_addrange(preg
, '0', '9');
903 reg_addrange(preg
, '_', '_');
905 *flagp
|= HASWIDTH
|SIMPLE
;
909 ret
= regnode(preg
, ch
== 's' ? ANYOF
: ANYBUT
);
910 reg_addrange_str(preg
," \t\r\n\f\v");
912 *flagp
|= HASWIDTH
|SIMPLE
;
914 /* FIXME: Someday handle \1, \2, ... */
916 /* Handle general quoted chars in exact-match routine */
917 /* Back up to include the backslash */
925 * Encode a string of characters to be matched exactly.
929 /* Back up to pick up the first char of interest */
932 ret
= regnode(preg
, EXACTLY
);
934 /* Note that a META operator such as ? or * consumes the
936 * Thus we must be careful to look ahead by 2 and add the
937 * last char as it's own EXACTLY if necessary
940 /* Until end of string or a META char is reached */
941 while (*preg
->regparse
&& strchr(META
, *preg
->regparse
) == NULL
) {
942 n
= reg_utf8_tounicode_case(preg
->regparse
, &ch
, (preg
->cflags
& REG_ICASE
));
943 if (ch
== '\\' && preg
->regparse
[n
]) {
944 /* Non-trailing backslash.
945 * Is this a special escape, or a regular escape?
947 if (strchr("<>mMwWdDsSAZ", preg
->regparse
[n
])) {
948 /* A special escape. All done with EXACTLY */
951 /* Decode it. Note that we add the length for the escape
952 * sequence to the length for the backlash so we can skip
953 * the entire sequence, or not as required.
955 n
+= reg_decode_escape(preg
->regparse
+ n
, &ch
);
957 preg
->err
= REG_ERR_NULL_CHAR
;
962 /* Now we have one char 'ch' of length 'n'.
963 * Check to see if the following char is a MULT
966 if (ISMULT(preg
->regparse
[n
])) {
967 /* Yes. But do we already have some EXACTLY chars? */
969 /* Yes, so return what we have and pick up the current char next time around */
972 /* No, so add this single char and finish */
979 /* No, so just add this char normally */
997 static void reg_grow(regex_t
*preg
, int n
)
999 if (preg
->p
+ n
>= preg
->proglen
) {
1000 preg
->proglen
= (preg
->p
+ n
) * 2;
1001 preg
->program
= realloc(preg
->program
, preg
->proglen
* sizeof(int));
1006 - regnode - emit a node
1009 static int regnode(regex_t
*preg
, int op
)
1013 /* The OP followed by a next pointer */
1014 preg
->program
[preg
->p
++] = op
;
1015 preg
->program
[preg
->p
++] = 0;
1017 /* Return the start of the node */
1022 - regc - emit (if appropriate) a byte of code
1024 static void regc(regex_t
*preg
, int b
)
1027 preg
->program
[preg
->p
++] = b
;
1031 - reginsert - insert an operator in front of already-emitted operand
1033 * Means relocating the operand.
1034 * Returns the new location of the original operand.
1036 static int reginsert(regex_t
*preg
, int op
, int size
, int opnd
)
1038 reg_grow(preg
, size
);
1040 /* Move everything from opnd up */
1041 memmove(preg
->program
+ opnd
+ size
, preg
->program
+ opnd
, sizeof(int) * (preg
->p
- opnd
));
1042 /* Zero out the new space */
1043 memset(preg
->program
+ opnd
, 0, sizeof(int) * size
);
1045 preg
->program
[opnd
] = op
;
1053 - regtail - set the next-pointer at the end of a node chain
1055 static void regtail(regex_t
*preg
, int p
, int val
)
1061 /* Find last node. */
1064 temp
= regnext(preg
, scan
);
1070 if (OP(preg
, scan
) == BACK
)
1071 offset
= scan
- val
;
1073 offset
= val
- scan
;
1075 preg
->program
[scan
+ 1] = offset
;
1079 - regoptail - regtail on operand of first argument; nop if operandless
1082 static void regoptail(regex_t
*preg
, int p
, int val
)
1084 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1085 if (p
!= 0 && OP(preg
, p
) == BRANCH
) {
1086 regtail(preg
, OPERAND(p
), val
);
1091 * regexec and friends
1097 static int regtry(regex_t
*preg
, const char *string
);
1098 static int regmatch(regex_t
*preg
, int prog
);
1099 static int regrepeat(regex_t
*preg
, int p
, int max
);
1102 - regexec - match a regexp against a string
1104 int regexec(regex_t
*preg
, const char *string
, size_t nmatch
, regmatch_t pmatch
[], int eflags
)
1109 /* Be paranoid... */
1110 if (preg
== NULL
|| preg
->program
== NULL
|| string
== NULL
) {
1111 return REG_ERR_NULL_ARGUMENT
;
1114 /* Check validity of program. */
1115 if (*preg
->program
!= REG_MAGIC
) {
1116 return REG_ERR_CORRUPTED
;
1120 fprintf(stderr
, "regexec: %s\n", string
);
1124 preg
->eflags
= eflags
;
1125 preg
->pmatch
= pmatch
;
1126 preg
->nmatch
= nmatch
;
1127 preg
->start
= string
; /* All offsets are computed from here */
1129 /* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */
1130 for (scan
= OPERAND(1); scan
!= 0; scan
+= regopsize(preg
, scan
)) {
1131 int op
= OP(preg
, scan
);
1134 if (op
== REPX
|| op
== REPXMIN
)
1135 preg
->program
[scan
+ 4] = 0;
1138 /* If there is a "must appear" string, look for it. */
1139 if (preg
->regmust
!= 0) {
1141 while ((s
= str_find(s
, preg
->program
[preg
->regmust
], preg
->cflags
& REG_ICASE
)) != NULL
) {
1142 if (prefix_cmp(preg
->program
+ preg
->regmust
, preg
->regmlen
, s
, preg
->cflags
& REG_ICASE
) >= 0) {
1147 if (s
== NULL
) /* Not present. */
1151 /* Mark beginning of line for ^ . */
1152 preg
->regbol
= string
;
1154 /* Simplest case: anchored match need be tried only once (maybe per line). */
1155 if (preg
->reganch
) {
1156 if (eflags
& REG_NOTBOL
) {
1157 /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1161 if (regtry(preg
, string
)) {
1166 if (preg
->cflags
& REG_NEWLINE
) {
1167 /* Try the next anchor? */
1168 string
= strchr(string
, '\n');
1170 preg
->regbol
= ++string
;
1179 /* Messy cases: unanchored match. */
1181 if (preg
->regstart
!= '\0') {
1182 /* We know what char it must start with. */
1183 while ((s
= str_find(s
, preg
->regstart
, preg
->cflags
& REG_ICASE
)) != NULL
) {
1184 if (regtry(preg
, s
))
1190 /* We don't -- general case. */
1192 if (regtry(preg
, s
))
1199 s
+= utf8_tounicode(s
, &c
);
1208 - regtry - try match at specific point
1210 /* 0 failure, 1 success */
1211 static int regtry( regex_t
*preg
, const char *string
)
1215 preg
->reginput
= string
;
1217 for (i
= 0; i
< preg
->nmatch
; i
++) {
1218 preg
->pmatch
[i
].rm_so
= -1;
1219 preg
->pmatch
[i
].rm_eo
= -1;
1221 if (regmatch(preg
, 1)) {
1222 preg
->pmatch
[0].rm_so
= string
- preg
->start
;
1223 preg
->pmatch
[0].rm_eo
= preg
->reginput
- preg
->start
;
1230 * Returns bytes matched if 'pattern' is a prefix of 'string'.
1232 * If 'nocase' is non-zero, does a case-insensitive match.
1234 * Returns -1 on not found.
1236 static int prefix_cmp(const int *prog
, int proglen
, const char *string
, int nocase
)
1238 const char *s
= string
;
1239 while (proglen
&& *s
) {
1241 int n
= reg_utf8_tounicode_case(s
, &ch
, nocase
);
1256 * Searchs for 'c' in the range 'range'.
1258 * Returns 1 if found, or 0 if not.
1260 static int reg_range_find(const int *range
, int c
)
1263 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1264 if (c
>= range
[1] && c
<= (range
[0] + range
[1] - 1)) {
1273 * Search for the character 'c' in the utf-8 string 'string'.
1275 * If 'nocase' is set, the 'string' is assumed to be uppercase
1276 * and 'c' is converted to uppercase before matching.
1278 * Returns the byte position in the string where the 'c' was found, or
1279 * NULL if not found.
1281 static const char *str_find(const char *string
, int c
, int nocase
)
1284 /* The "string" should already be converted to uppercase */
1289 int n
= reg_utf8_tounicode_case(string
, &ch
, nocase
);
1299 * Returns true if 'ch' is an end-of-line char.
1301 * In REG_NEWLINE mode, \n is considered EOL in
1304 static int reg_iseol(regex_t
*preg
, int ch
)
1306 if (preg
->cflags
& REG_NEWLINE
) {
1307 return ch
== '\0' || ch
== '\n';
1314 static int regmatchsimplerepeat(regex_t
*preg
, int scan
, int matchmin
)
1321 int max
= preg
->program
[scan
+ 2];
1322 int min
= preg
->program
[scan
+ 3];
1323 int next
= regnext(preg
, scan
);
1326 * Lookahead to avoid useless match attempts
1327 * when we know what character comes next.
1329 if (OP(preg
, next
) == EXACTLY
) {
1330 nextch
= preg
->program
[OPERAND(next
)];
1332 save
= preg
->reginput
;
1333 no
= regrepeat(preg
, scan
+ 5, max
);
1338 /* from min up to no */
1342 /* else from no down to min */
1354 preg
->reginput
= save
+ utf8_index(save
, no
);
1355 reg_utf8_tounicode_case(preg
->reginput
, &c
, (preg
->cflags
& REG_ICASE
));
1356 /* If it could work, try it. */
1357 if (reg_iseol(preg
, nextch
) || c
== nextch
) {
1358 if (regmatch(preg
, next
)) {
1363 /* Couldn't or didn't, add one more */
1367 /* Couldn't or didn't -- back up. */
1374 static int regmatchrepeat(regex_t
*preg
, int scan
, int matchmin
)
1376 int *scanpt
= preg
->program
+ scan
;
1378 int max
= scanpt
[2];
1379 int min
= scanpt
[3];
1381 /* Have we reached min? */
1382 if (scanpt
[4] < min
) {
1383 /* No, so get another one */
1385 if (regmatch(preg
, scan
+ 5)) {
1391 if (scanpt
[4] > max
) {
1396 /* minimal, so try other branch first */
1397 if (regmatch(preg
, regnext(preg
, scan
))) {
1400 /* No, so try one more */
1402 if (regmatch(preg
, scan
+ 5)) {
1408 /* maximal, so try this branch again */
1409 if (scanpt
[4] < max
) {
1411 if (regmatch(preg
, scan
+ 5)) {
1416 /* At this point we are at max with no match. Try the other branch */
1417 return regmatch(preg
, regnext(preg
, scan
));
1421 - regmatch - main matching routine
1423 * Conceptually the strategy is simple: check to see whether the current
1424 * node matches, call self recursively to see whether the rest matches,
1425 * and then act accordingly. In practice we make some effort to avoid
1426 * recursion, in particular by going through "ordinary" nodes (that don't
1427 * need to know whether the rest of the match failed) by a loop instead of
1430 /* 0 failure, 1 success */
1431 static int regmatch(regex_t
*preg
, int prog
)
1433 int scan
; /* Current node. */
1434 int next
; /* Next node. */
1440 if (scan
!= 0 && regnarrate
)
1441 fprintf(stderr
, "%s(\n", regprop(scan
));
1448 fprintf(stderr
, "%3d: %s...\n", scan
, regprop(OP(preg
, scan
))); /* Where, what. */
1451 next
= regnext(preg
, scan
);
1452 n
= reg_utf8_tounicode_case(preg
->reginput
, &c
, (preg
->cflags
& REG_ICASE
));
1454 switch (OP(preg
, scan
)) {
1456 if ((preg
->eflags
& REG_NOTBOL
)) {
1461 if (preg
->reginput
!= preg
->regbol
) {
1467 /* For EOLX, only match real end of line, not newline */
1472 if (!reg_iseol(preg
, c
)) {
1477 /* Must be looking at a letter, digit, or _ */
1478 if ((!isalnum(UCHAR(c
))) && c
!= '_')
1480 /* Prev must be BOL or nonword */
1481 if (preg
->reginput
> preg
->regbol
&&
1482 (isalnum(UCHAR(preg
->reginput
[-1])) || preg
->reginput
[-1] == '_'))
1486 /* Can't match at BOL */
1487 if (preg
->reginput
> preg
->regbol
) {
1488 /* Current must be EOL or nonword */
1489 if (reg_iseol(preg
, c
) || !isalnum(UCHAR(c
)) || c
!= '_') {
1490 c
= preg
->reginput
[-1];
1491 /* Previous must be word */
1492 if (isalnum(UCHAR(c
)) || c
== '_') {
1501 if (reg_iseol(preg
, c
))
1503 preg
->reginput
+= n
;
1510 opnd
= OPERAND(scan
);
1511 len
= str_int_len(preg
->program
+ opnd
);
1513 slen
= prefix_cmp(preg
->program
+ opnd
, len
, preg
->reginput
, preg
->cflags
& REG_ICASE
);
1517 preg
->reginput
+= slen
;
1521 if (reg_iseol(preg
, c
) || reg_range_find(preg
->program
+ OPERAND(scan
), c
) == 0) {
1524 preg
->reginput
+= n
;
1527 if (reg_iseol(preg
, c
) || reg_range_find(preg
->program
+ OPERAND(scan
), c
) != 0) {
1530 preg
->reginput
+= n
;
1537 if (OP(preg
, next
) != BRANCH
) /* No choice. */
1538 next
= OPERAND(scan
); /* Avoid recursion. */
1541 save
= preg
->reginput
;
1542 if (regmatch(preg
, OPERAND(scan
))) {
1545 preg
->reginput
= save
;
1546 scan
= regnext(preg
, scan
);
1547 } while (scan
!= 0 && OP(preg
, scan
) == BRANCH
);
1554 return regmatchsimplerepeat(preg
, scan
, OP(preg
, scan
) == REPMIN
);
1558 return regmatchrepeat(preg
, scan
, OP(preg
, scan
) == REPXMIN
);
1561 return 1; /* Success! */
1565 return regmatch(preg
, next
);
1568 if (OP(preg
, scan
) >= OPEN
+1 && OP(preg
, scan
) < CLOSE_END
) {
1569 save
= preg
->reginput
;
1570 if (regmatch(preg
, next
)) {
1571 if (OP(preg
, scan
) < CLOSE
) {
1572 int no
= OP(preg
, scan
) - OPEN
;
1573 if (no
< preg
->nmatch
&& preg
->pmatch
[no
].rm_so
== -1) {
1574 preg
->pmatch
[no
].rm_so
= save
- preg
->start
;
1578 int no
= OP(preg
, scan
) - CLOSE
;
1579 if (no
< preg
->nmatch
&& preg
->pmatch
[no
].rm_eo
== -1) {
1580 preg
->pmatch
[no
].rm_eo
= save
- preg
->start
;
1587 return REG_ERR_INTERNAL
;
1594 * We get here only if there's trouble -- normally "case END" is
1595 * the terminating point.
1597 return REG_ERR_INTERNAL
;
1601 - regrepeat - repeatedly match something simple, report how many
1603 static int regrepeat(regex_t
*preg
, int p
, int max
)
1611 scan
= preg
->reginput
;
1613 switch (OP(preg
, p
)) {
1615 /* No need to handle utf8 specially here */
1616 while (!reg_iseol(preg
, *scan
) && count
< max
) {
1622 while (count
< max
) {
1623 n
= reg_utf8_tounicode_case(scan
, &ch
, preg
->cflags
& REG_ICASE
);
1624 if (preg
->program
[opnd
] != ch
) {
1632 while (count
< max
) {
1633 n
= reg_utf8_tounicode_case(scan
, &ch
, preg
->cflags
& REG_ICASE
);
1634 if (reg_iseol(preg
, ch
) || reg_range_find(preg
->program
+ opnd
, ch
) == 0) {
1642 while (count
< max
) {
1643 n
= reg_utf8_tounicode_case(scan
, &ch
, preg
->cflags
& REG_ICASE
);
1644 if (reg_iseol(preg
, ch
) || reg_range_find(preg
->program
+ opnd
, ch
) != 0) {
1651 default: /* Oh dear. Called inappropriately. */
1652 preg
->err
= REG_ERR_INTERNAL
;
1653 count
= 0; /* Best compromise. */
1656 preg
->reginput
= scan
;
1662 - regnext - dig the "next" pointer out of a node
1664 static int regnext(regex_t
*preg
, int p
)
1668 offset
= NEXT(preg
, p
);
1673 if (OP(preg
, p
) == BACK
)
1680 - regopsize - returns the size of opcode + operands at 'p' in words
1682 static int regopsize(regex_t
*preg
, int p
)
1684 /* Almost all opcodes are 2 words, but some are more */
1685 switch (OP(preg
, p
)) {
1696 while (preg
->program
[s
++]) {
1704 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1707 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1709 static void regdump(regex_t
*preg
)
1712 int op
= EXACTLY
; /* Arbitrary non-END op. */
1714 char buf
[MAX_UTF8_LEN
+ 1];
1717 for (i
= 1; i
< preg
->p
; i
++) {
1718 printf("%02x ", (unsigned char)preg
->program
[i
]);
1726 while (op
!= END
&& s
< preg
->p
) { /* While that wasn't END last time... */
1728 printf("%3d: %s", s
, regprop(op
)); /* Where, what. */
1729 next
= regnext(preg
, s
);
1730 if (next
== 0) /* Next ptr. */
1733 printf("(%d)", next
);
1735 if (op
== REP
|| op
== REPMIN
|| op
== REPX
|| op
== REPXMIN
) {
1736 int max
= preg
->program
[s
];
1737 int min
= preg
->program
[s
+ 1];
1739 printf("{%d,*}", min
);
1742 printf("{%d,%d}", min
, max
);
1744 printf(" %d", preg
->program
[s
+ 2]);
1747 else if (op
== ANYOF
|| op
== ANYBUT
) {
1750 while (preg
->program
[s
]) {
1751 int len
= preg
->program
[s
++];
1752 int first
= preg
->program
[s
++];
1753 buf
[utf8_getchars(buf
, first
)] = 0;
1756 buf
[utf8_getchars(buf
, first
+ len
- 1)] = 0;
1762 else if (op
== EXACTLY
) {
1763 /* Literal string, where present. */
1765 while (preg
->program
[s
]) {
1766 buf
[utf8_getchars(buf
, preg
->program
[s
])] = 0;
1776 /* Header fields of interest. */
1777 if (preg
->regstart
) {
1778 buf
[utf8_getchars(buf
, preg
->regstart
)] = 0;
1779 printf("start '%s' ", buf
);
1782 printf("anchored ");
1783 if (preg
->regmust
!= 0) {
1785 printf("must have:");
1786 for (i
= 0; i
< preg
->regmlen
; i
++) {
1787 putchar(preg
->program
[preg
->regmust
+ i
]);
1796 - regprop - printable representation of opcode
1798 static const char *regprop( int op
)
1800 static char buf
[50];
1844 if (op
>= OPEN
&& op
< CLOSE
) {
1845 snprintf(buf
, sizeof(buf
), "OPEN%d", op
-OPEN
);
1847 else if (op
>= CLOSE
&& op
< CLOSE_END
) {
1848 snprintf(buf
, sizeof(buf
), "CLOSE%d", op
-CLOSE
);
1851 snprintf(buf
, sizeof(buf
), "?%d?\n", op
);
1856 #endif /* JIM_BOOTSTRAP */
1858 size_t regerror(int errcode
, const regex_t
*preg
, char *errbuf
, size_t errbuf_size
)
1860 static const char *error_strings
[] = {
1869 "parentheses () not balanced",
1870 "braces {} not balanced",
1871 "invalid repetition count(s)",
1876 "count follows nothing",
1877 "trailing backslash",
1878 "corrupted program",
1879 "contains null char",
1883 if (errcode
< 0 || errcode
>= REG_ERR_NUM
) {
1884 err
= "Bad error code";
1887 err
= error_strings
[errcode
];
1890 return snprintf(errbuf
, errbuf_size
, "%s", err
);
1893 void regfree(regex_t
*preg
)
1895 free(preg
->program
);