2 * regcomp and regexec -- regsub and regerror are elsewhere
4 * Copyright (c) 1986 by University of Toronto.
5 * Written by Henry Spencer. Not derived from licensed software.
7 * Permission is granted to anyone to use this software for any
8 * purpose on any computer system, and to redistribute it freely,
9 * subject to the following restrictions:
11 * 1. The author is not responsible for the consequences of use of
12 * this software, no matter how awful, even if they arise
15 * 2. The origin of this software must not be misrepresented, either
16 * by explicit claim or by omission.
18 * 3. Altered versions must be plainly marked as such, and must not
19 * be misrepresented as being the original software.
20 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
21 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22 *** to assist in implementing egrep.
23 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
24 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25 *** as in BSD grep and ex.
26 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
27 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
29 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
30 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
31 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
32 *** was moved into regexp.h, and the include of regexp.h now uses "'s
33 *** to avoid conflicting with the system regexp.h. Const, bless its
34 *** soul, was removed so it can compile everywhere. The declaration
35 *** of strchr() was in conflict on AIX, so it was removed (as it is
36 *** happily defined in string.h).
37 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
38 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
42 * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
43 * on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
44 * This includes counted repetitions, UTF-8 support, character classes,
45 * shorthand character classes, increased number of parentheses to 100,
46 * backslash escape sequences. It also removes \n as an alternative to |.
48 * Beware that some of this code is subtly aware of the way operator
49 * precedence is structured in regular expressions. Serious changes in
50 * regular-expression syntax might require a total rethink.
58 #include "jimautoconf.h"
59 #include "jimregexp.h"
62 #if !defined(HAVE_REGCOMP) || defined(JIM_REGEXP)
65 * Structure for regexp "program". This is essentially a linear encoding
66 * of a nondeterministic finite-state machine (aka syntax charts or
67 * "railroad normal form" in parsing technology). Each node is an opcode
68 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
69 * all nodes except BRANCH implement concatenation; a "next" pointer with
70 * a BRANCH on both ends of it is connecting two alternatives. (Here we
71 * have one of the subtle syntax dependencies: an individual BRANCH (as
72 * opposed to a collection of them) is never concatenated with anything
73 * because of operator precedence.) The operand of some types of node is
74 * a literal string; for others, it is a node leading into a sub-FSM. In
75 * particular, the operand of a BRANCH node is the first node of the branch.
76 * (NB this is *not* a tree structure: the tail of the branch connects
77 * to the thing following the set of BRANCHes.) The opcodes are:
80 /* This *MUST* be less than (255-20)/2=117 */
81 #define REG_MAX_PAREN 100
83 /* definition number opnd? meaning */
84 #define END 0 /* no End of program. */
85 #define BOL 1 /* no Match "" at beginning of line. */
86 #define EOL 2 /* no Match "" at end of line. */
87 #define ANY 3 /* no Match any one character. */
88 #define ANYOF 4 /* str Match any character in this string. */
89 #define ANYBUT 5 /* str Match any character not in this string. */
90 #define BRANCH 6 /* node Match this alternative, or the next... */
91 #define BACK 7 /* no Match "", "next" ptr points backward. */
92 #define EXACTLY 8 /* str Match this string. */
93 #define NOTHING 9 /* no Match empty string. */
94 #define REP 10 /* max,min Match this (simple) thing [min,max] times. */
95 #define REPMIN 11 /* max,min Match this (simple) thing [min,max] times, mininal match. */
96 #define REPX 12 /* max,min Match this (complex) thing [min,max] times. */
97 #define REPXMIN 13 /* max,min Match this (complex) thing [min,max] times, minimal match. */
99 #define WORDA 15 /* no Match "" at wordchar, where prev is nonword */
100 #define WORDZ 16 /* no Match "" at nonwordchar, where prev is word */
101 #define OPEN 20 /* no Mark this point in input as start of #n. */
102 /* OPEN+1 is number 1, etc. */
103 #define CLOSE (OPEN+REG_MAX_PAREN) /* no Analogous to OPEN. */
104 #define CLOSE_END (CLOSE+REG_MAX_PAREN)
107 * The first byte of the regexp internal "program" is actually this magic
108 * number; the start node begins in the second byte.
110 #define REG_MAGIC 0xFADED00D
115 * BRANCH The set of branches constituting a single choice are hooked
116 * together with their "next" pointers, since precedence prevents
117 * anything being concatenated to any individual branch. The
118 * "next" pointer of the last BRANCH in a choice points to the
119 * thing following the whole choice. This is also where the
120 * final "next" pointer of each individual branch points; each
121 * branch starts with the operand node of a BRANCH node.
123 * BACK Normal "next" pointers all implicitly point forward; BACK
124 * exists to make loop structures possible.
126 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
127 * BRANCH structures using BACK. Simple cases (one character
128 * per match) are implemented with STAR and PLUS for speed
129 * and to minimize recursive plunges.
131 * OPEN,CLOSE ...are numbered at compile time.
135 * A node is one char of opcode followed by two chars of "next" pointer.
136 * "Next" pointers are stored as two 8-bit pieces, high order first. The
137 * value is a positive offset from the opcode of the node containing it.
138 * An operand, if any, simply follows the node. (Note that much of the
139 * code generation knows about this implicit relationship.)
141 * Using two bytes for the "next" pointer is vast overkill for most things,
142 * but allows patterns to get big without disasters.
144 #define OP(p) ((p)[0])
145 #define NEXT(p) ((p)[1])
146 #define OPERAND(p) ((p) + 2)
149 * See regmagic.h for one further detail of program structure.
154 * Utility definitions.
157 #define FAIL(R,M) { (R)->err = (M); return (M); }
158 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
159 #define META "^$.[()|?{+*"
162 * Flags to be passed up and down.
164 #define HASWIDTH 01 /* Known never to match null string. */
165 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
166 #define SPSTART 04 /* Starts with * or +. */
167 #define WORST 0 /* Worst case. */
169 #define MAX_REP_COUNT 1000000
172 * Forward declarations for regcomp()'s friends.
174 static int *reg(regex_t
*preg
, int paren
/* Parenthesized? */, int *flagp
);
175 static int *regpiece(regex_t
*preg
, int *flagp
);
176 static int *regbranch(regex_t
*preg
, int *flagp
);
177 static int *regatom(regex_t
*preg
, int *flagp
);
178 static int *regnode(regex_t
*preg
, int op
);
179 static const int *regnext(regex_t
*preg
, const int *p
);
180 static void regc(regex_t
*preg
, int b
);
181 static int *reginsert(regex_t
*preg
, int op
, int size
, int *opnd
);
182 static void regtail(regex_t
*preg
, int *p
, const int *val
);
183 static void regoptail(regex_t
*preg
, int *p
, const int *val
);
185 static int reg_range_find(const int *string
, int c
);
186 static const char *str_find(const char *string
, int c
, int nocase
);
187 static int prefix_cmp(const int *prog
, int proglen
, const char *string
, int nocase
);
192 static void regdump(regex_t
*preg
);
193 static const char *regprop( const int *op
);
200 * Returns the length of the null-terminated integer sequence.
202 static int str_int_len(const int *seq
)
212 - regcomp - compile a regular expression into internal code
214 * We can't allocate space until we know how big the compiled form will be,
215 * but we can't compile it (and thus know how big it is) until we've got a
216 * place to put the code. So we cheat: we compile it twice, once with code
217 * generation turned off and size counting turned on, and once "for real".
218 * This also means that we don't allocate space until we are sure that the
219 * thing really will compile successfully, and we never have to move the
220 * code and thus invalidate pointers into it. (Note that it has to be in
221 * one piece because free() must be able to free it all.)
223 * Beware that the optimization-preparation code in here knows about some
224 * of the structure of the compiled regexp.
226 int regcomp(regex_t
*preg
, const char *exp
, int cflags
)
234 fprintf(stderr
, "Compiling: '%s'\n", exp
);
236 memset(preg
, 0, sizeof(*preg
));
239 FAIL(preg
, REG_ERR_NULL_ARGUMENT
);
241 /* First pass: determine size, legality. */
242 preg
->cflags
= cflags
;
243 preg
->regparse
= exp
;
246 preg
->regcode
= ®dummy
;
247 regc(preg
, REG_MAGIC
);
248 if (reg(preg
, 0, &flags
) == NULL
)
251 /* Small enough for pointer-storage convention? */
252 if (preg
->regsize
>= 32767L || preg
->re_nsub
>= REG_MAX_PAREN
) /* Probably could be 65535L. */
253 FAIL(preg
,REG_ERR_TOO_BIG
);
255 /* Allocate space. */
256 preg
->program
= malloc(preg
->regsize
* sizeof(*preg
->program
));
257 if (preg
->program
== NULL
)
258 FAIL(preg
, REG_ERR_NOMEM
);
260 /* Second pass: emit code. */
261 preg
->regparse
= exp
;
264 preg
->regcode
= preg
->program
;
265 regc(preg
, REG_MAGIC
);
266 if (reg(preg
, 0, &flags
) == NULL
)
269 /* Dig out information for optimizations. */
270 preg
->regstart
= 0; /* Worst-case defaults. */
272 preg
->regmust
= NULL
;
274 scan
= preg
->program
+1; /* First BRANCH. */
275 if (OP(regnext(preg
, scan
)) == END
) { /* Only one top-level choice. */
276 scan
= OPERAND(scan
);
278 /* Starting-point info. */
279 if (OP(scan
) == EXACTLY
) {
280 preg
->regstart
= *OPERAND(scan
);
282 else if (OP(scan
) == BOL
)
286 * If there's something expensive in the r.e., find the
287 * longest literal string that must appear and make it the
288 * regmust. Resolve ties in favor of later strings, since
289 * the regstart check works with the beginning of the r.e.
290 * and avoiding duplication strengthens checking. Not a
291 * strong reason, but sufficient in the absence of others.
296 for (; scan
!= NULL
; scan
= regnext(preg
, scan
)) {
297 if (OP(scan
) == EXACTLY
) {
298 int plen
= str_int_len(OPERAND(scan
));
300 longest
= OPERAND(scan
);
305 preg
->regmust
= longest
;
318 - reg - regular expression, i.e. main body or parenthesized thing
320 * Caller must absorb opening parenthesis.
322 * Combining parenthesis handling with the base level of regular expression
323 * is a trifle forced, but the need to tie the tails of the branches to what
324 * follows makes it hard to avoid.
326 static int *reg(regex_t
*preg
, int paren
/* Parenthesized? */, int *flagp
)
334 *flagp
= HASWIDTH
; /* Tentatively. */
336 /* Make an OPEN node, if parenthesized. */
338 parno
= ++preg
->re_nsub
;
339 ret
= regnode(preg
, OPEN
+parno
);
343 /* Pick up the branches, linking them together. */
344 br
= regbranch(preg
, &flags
);
348 regtail(preg
, ret
, br
); /* OPEN -> first. */
351 if (!(flags
&HASWIDTH
))
353 *flagp
|= flags
&SPSTART
;
354 while (*preg
->regparse
== '|') {
356 br
= regbranch(preg
, &flags
);
359 regtail(preg
, ret
, br
); /* BRANCH -> BRANCH. */
360 if (!(flags
&HASWIDTH
))
362 *flagp
|= flags
&SPSTART
;
365 /* Make a closing node, and hook it on the end. */
366 ender
= regnode(preg
, (paren
) ? CLOSE
+parno
: END
);
367 regtail(preg
, ret
, ender
);
369 /* Hook the tails of the branches to the closing node. */
370 for (br
= ret
; br
!= NULL
; br
= (int *)regnext(preg
, br
))
371 regoptail(preg
, br
, ender
);
373 /* Check for proper termination. */
374 if (paren
&& *preg
->regparse
++ != ')') {
375 preg
->err
= REG_ERR_UNMATCHED_PAREN
;
377 } else if (!paren
&& *preg
->regparse
!= '\0') {
378 if (*preg
->regparse
== ')') {
379 preg
->err
= REG_ERR_UNMATCHED_PAREN
;
382 preg
->err
= REG_ERR_JUNK_ON_END
;
391 - regbranch - one alternative of an | operator
393 * Implements the concatenation operator.
395 static int *regbranch(regex_t
*preg
, int *flagp
)
402 *flagp
= WORST
; /* Tentatively. */
404 ret
= regnode(preg
, BRANCH
);
406 while (*preg
->regparse
!= '\0' && *preg
->regparse
!= ')' &&
407 *preg
->regparse
!= '|') {
408 latest
= regpiece(preg
, &flags
);
411 *flagp
|= flags
&HASWIDTH
;
412 if (chain
== NULL
) {/* First piece. */
413 *flagp
|= flags
&SPSTART
;
416 regtail(preg
, chain
, latest
);
420 if (chain
== NULL
) /* Loop ran zero times. */
421 (void) regnode(preg
, NOTHING
);
427 - regpiece - something followed by possible [*+?]
429 * Note that the branching code sequences used for ? and the general cases
430 * of * and + are somewhat optimized: they use the same NOTHING node as
431 * both the endmarker for their branch list and the body of the last branch.
432 * It might seem that this node could be dispensed with entirely, but the
433 * endmarker role is not redundant.
435 static int *regpiece(regex_t
*preg
, int *flagp
)
441 int size
= preg
->regsize
;
446 ret
= regatom(preg
, &flags
);
450 size
= preg
->regsize
- size
;
452 op
= *preg
->regparse
;
458 if (!(flags
&HASWIDTH
) && op
!= '?') {
459 preg
->err
= REG_ERR_OPERAND_COULD_BE_EMPTY
;
463 /* Handle braces (counted repetition) by expansion */
467 min
= strtoul(preg
->regparse
+ 1, &end
, 10);
468 if (end
== preg
->regparse
+ 1) {
469 preg
->err
= REG_ERR_BAD_COUNT
;
476 preg
->regparse
= end
;
477 max
= strtoul(preg
->regparse
+ 1, &end
, 10);
479 preg
->err
= REG_ERR_UNMATCHED_BRACES
;
483 if (end
== preg
->regparse
+ 1) {
486 else if (max
< min
|| max
>= 100) {
487 preg
->err
= REG_ERR_BAD_COUNT
;
491 preg
->err
= REG_ERR_BAD_COUNT
;
495 preg
->regparse
= strchr(preg
->regparse
, '}');
499 max
= (op
== '?' ? 1 : MAX_REP_COUNT
);
502 if (preg
->regparse
[1] == '?') {
504 next
= reginsert(preg
, flags
& SIMPLE
? REPMIN
: REPXMIN
, 5, ret
);
507 next
= reginsert(preg
, flags
& SIMPLE
? REP
: REPX
, 5, ret
);
513 *flagp
= (min
) ? (WORST
|HASWIDTH
) : (WORST
|SPSTART
);
515 if (!(flags
& SIMPLE
)) {
516 int *back
= regnode(preg
, BACK
);
517 regtail(preg
, back
, ret
);
518 regtail(preg
, next
, back
);
522 if (ISMULT(*preg
->regparse
)) {
523 preg
->err
= REG_ERR_NESTED_COUNT
;
527 return chain
? chain
: ret
;
531 * Add all characters in the inclusive range between lower and upper.
533 * Handles a swapped range (upper < lower).
535 static void reg_addrange(regex_t
*preg
, int lower
, int upper
)
538 reg_addrange(preg
, upper
, lower
);
540 /* Add a range as length, start */
541 regc(preg
, upper
- lower
+ 1);
546 * Add a null-terminated literal string as a set of ranges.
548 static void reg_addrange_str(regex_t
*preg
, const char *str
)
551 reg_addrange(preg
, *str
, *str
);
557 * Extracts the next unicode char from utf8.
559 * If 'upper' is set, converts the char to uppercase.
561 static int reg_utf8_tounicode_case(const char *s
, int *uc
, int upper
)
563 int l
= utf8_tounicode(s
, uc
);
565 *uc
= utf8_upper(*uc
);
571 * Converts a hex digit to decimal.
573 * Returns -1 for an invalid hex digit.
575 static int hexdigitval(int c
)
577 if (c
>= '0' && c
<= '9')
579 if (c
>= 'a' && c
<= 'f')
581 if (c
>= 'A' && c
<= 'F')
587 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
589 * Returns the number of hex digits parsed.
590 * If there are no hex digits, returns 0 and stores nothing.
592 static int parse_hex(const char *s
, int n
, int *uc
)
597 for (k
= 0; k
< n
; k
++) {
598 int c
= hexdigitval(*s
++);
602 val
= (val
<< 4) | c
;
611 * Call for chars after a backlash to decode the escape sequence.
613 * Stores the result in *ch.
615 * Returns the number of bytes consumed.
617 static int reg_decode_escape(const char *s
, int *ch
)
625 case 'b': *ch
= '\b'; break;
626 case 'e': *ch
= 27; break;
627 case 'f': *ch
= '\f'; break;
628 case 'n': *ch
= '\n'; break;
629 case 'r': *ch
= '\r'; break;
630 case 't': *ch
= '\t'; break;
631 case 'v': *ch
= '\v'; break;
633 if ((n
= parse_hex(s
, 4, ch
)) > 0) {
638 if ((n
= parse_hex(s
, 2, ch
)) > 0) {
651 - regatom - the lowest level
653 * Optimization: gobbles an entire sequence of ordinary characters so that
654 * it can turn them into a single node, which is smaller to store and
655 * faster to run. Backslashed characters are exceptions, each becoming a
656 * separate node; the code is simpler that way and it's not worth fixing.
658 static int *regatom(regex_t
*preg
, int *flagp
)
662 int nocase
= (preg
->cflags
& REG_ICASE
);
665 int n
= reg_utf8_tounicode_case(preg
->regparse
, &ch
, nocase
);
667 *flagp
= WORST
; /* Tentatively. */
671 /* FIXME: these chars only have meaning at beg/end of pat? */
673 ret
= regnode(preg
, BOL
);
676 ret
= regnode(preg
, EOL
);
679 ret
= regnode(preg
, ANY
);
680 *flagp
|= HASWIDTH
|SIMPLE
;
683 const char *pattern
= preg
->regparse
;
685 if (*pattern
== '^') { /* Complement of range. */
686 ret
= regnode(preg
, ANYBUT
);
689 ret
= regnode(preg
, ANYOF
);
691 /* Special case. If the first char is ']' or '-', it is part of the set */
692 if (*pattern
== ']' || *pattern
== '-') {
693 reg_addrange(preg
, *pattern
, *pattern
);
697 while (*pattern
&& *pattern
!= ']') {
698 /* Is this a range? a-z */
702 pattern
+= reg_utf8_tounicode_case(pattern
, &start
, nocase
);
704 pattern
+= reg_decode_escape(pattern
, &start
);
706 preg
->err
= REG_ERR_NULL_CHAR
;
710 if (pattern
[0] == '-' && pattern
[1]) {
712 pattern
+= utf8_tounicode(pattern
, &end
);
713 pattern
+= reg_utf8_tounicode_case(pattern
, &end
, nocase
);
715 pattern
+= reg_decode_escape(pattern
, &end
);
717 preg
->err
= REG_ERR_NULL_CHAR
;
722 reg_addrange(preg
, start
, end
);
726 if (strncmp(pattern
, ":alpha:]", 8) == 0) {
727 if ((preg
->cflags
& REG_ICASE
) == 0) {
728 reg_addrange(preg
, 'a', 'z');
730 reg_addrange(preg
, 'A', 'Z');
734 if (strncmp(pattern
, ":alnum:]", 8) == 0) {
735 if ((preg
->cflags
& REG_ICASE
) == 0) {
736 reg_addrange(preg
, 'a', 'z');
738 reg_addrange(preg
, 'A', 'Z');
739 reg_addrange(preg
, '0', '9');
743 if (strncmp(pattern
, ":space:]", 8) == 0) {
744 reg_addrange_str(preg
, " \t\r\n\f\v");
749 /* Not a range, so just add the char */
750 reg_addrange(preg
, start
, start
);
757 preg
->regparse
= pattern
;
759 *flagp
|= HASWIDTH
|SIMPLE
;
763 ret
= reg(preg
, 1, &flags
);
766 *flagp
|= flags
&(HASWIDTH
|SPSTART
);
771 preg
->err
= REG_ERR_INTERNAL
;
772 return NULL
; /* Supposed to be caught earlier. */
777 preg
->err
= REG_ERR_COUNT_FOLLOWS_NOTHING
;
780 switch (*preg
->regparse
++) {
782 preg
->err
= REG_ERR_TRAILING_BACKSLASH
;
787 ret
= regnode(preg
, WORDA
);
791 ret
= regnode(preg
, WORDZ
);
794 ret
= regnode(preg
, ANYOF
);
795 reg_addrange(preg
, '0', '9');
797 *flagp
|= HASWIDTH
|SIMPLE
;
800 ret
= regnode(preg
, ANYOF
);
801 if ((preg
->cflags
& REG_ICASE
) == 0) {
802 reg_addrange(preg
, 'a', 'z');
804 reg_addrange(preg
, 'A', 'Z');
805 reg_addrange(preg
, '0', '9');
806 reg_addrange(preg
, '_', '_');
808 *flagp
|= HASWIDTH
|SIMPLE
;
811 ret
= regnode(preg
, ANYOF
);
812 reg_addrange_str(preg
," \t\r\n\f\v");
814 *flagp
|= HASWIDTH
|SIMPLE
;
816 /* FIXME: Someday handle \1, \2, ... */
818 /* Handle general quoted chars in exact-match routine */
819 /* Back up to include the backslash */
827 * Encode a string of characters to be matched exactly.
831 /* Back up to pick up the first char of interest */
834 ret
= regnode(preg
, EXACTLY
);
836 /* Note that a META operator such as ? or * consumes the
838 * Thus we must be careful to look ahead by 2 and add the
839 * last char as it's own EXACTLY if necessary
842 /* Until end of string or a META char is reached */
843 while (*preg
->regparse
&& strchr(META
, *preg
->regparse
) == NULL
) {
844 n
= reg_utf8_tounicode_case(preg
->regparse
, &ch
, (preg
->cflags
& REG_ICASE
));
845 if (ch
== '\\' && preg
->regparse
[n
]) {
846 /* Non-trailing backslash.
847 * Is this a special escape, or a regular escape?
849 if (strchr("<>mMwds", preg
->regparse
[n
])) {
850 /* A special escape. All done with EXACTLY */
853 /* Decode it. Note that we add the length for the escape
854 * sequence to the length for the backlash so we can skip
855 * the entire sequence, or not as required.
857 n
+= reg_decode_escape(preg
->regparse
+ n
, &ch
);
859 preg
->err
= REG_ERR_NULL_CHAR
;
864 /* Now we have one char 'ch' of length 'n'.
865 * Check to see if the following char is a MULT
868 if (ISMULT(preg
->regparse
[n
])) {
869 /* Yes. But do we already have some EXACTLY chars? */
871 /* Yes, so return what we have and pick up the current char next time around */
874 /* No, so add this single char and finish */
881 /* No, so just add this char normally */
900 - regnode - emit a node
903 static int *regnode(regex_t
*preg
, int op
)
910 if (ret
== ®dummy
) {
916 *ptr
++ = 0; /* Null "next" pointer. */
923 - regc - emit (if appropriate) a byte of code
925 static void regc(regex_t
*preg
, int b
)
928 if (preg
->regcode
!= ®dummy
)
929 *preg
->regcode
++ = b
;
933 - reginsert - insert an operator in front of already-emitted operand
935 * Means relocating the operand.
936 * Returns the new location of the original operand.
938 static int *reginsert(regex_t
*preg
, int op
, int size
, int *opnd
)
944 preg
->regsize
+= size
;
946 if (preg
->regcode
== ®dummy
) {
951 preg
->regcode
+= size
;
956 place
= opnd
; /* Op node, where operand used to be. */
966 - regtail - set the next-pointer at the end of a node chain
968 static void regtail(regex_t
*preg
, int *p
, const int *val
)
977 /* Find last node. */
980 temp
= (int *)regnext(preg
, scan
);
986 if (OP(scan
) == BACK
)
995 - regoptail - regtail on operand of first argument; nop if operandless
998 static void regoptail(regex_t
*preg
, int *p
, const int *val
)
1000 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1001 if (p
== NULL
|| p
== ®dummy
|| OP(p
) != BRANCH
)
1003 regtail(preg
, OPERAND(p
), val
);
1007 * regexec and friends
1013 static int regtry(regex_t
*preg
, const char *string
);
1014 static int regmatch(regex_t
*preg
, const int *prog
);
1015 static int regrepeat(regex_t
*preg
, const int *p
, int max
);
1018 - regexec - match a regexp against a string
1020 int regexec(regex_t
*preg
, const char *string
, size_t nmatch
, regmatch_t pmatch
[], int eflags
)
1025 /* Be paranoid... */
1026 if (preg
== NULL
|| preg
->program
== NULL
|| string
== NULL
) {
1027 return REG_ERR_NULL_ARGUMENT
;
1030 /* Check validity of program. */
1031 if (*preg
->program
!= REG_MAGIC
) {
1032 return REG_ERR_CORRUPTED
;
1036 fprintf(stderr
, "regexec: %s\n", string
);
1040 preg
->eflags
= eflags
;
1041 preg
->pmatch
= pmatch
;
1042 preg
->nmatch
= nmatch
;
1043 preg
->start
= string
; /* All offsets are computed from here */
1045 /* Must clear out the embedded repeat counts */
1046 for (scan
= OPERAND(preg
->program
+ 1); scan
!= NULL
; scan
= regnext(preg
, scan
)) {
1052 *(int *)(scan
+ 4) = 0;
1057 /* If there is a "must appear" string, look for it. */
1058 if (preg
->regmust
!= NULL
) {
1060 while ((s
= str_find(s
, preg
->regmust
[0], preg
->cflags
& REG_ICASE
)) != NULL
) {
1061 if (prefix_cmp(preg
->regmust
, preg
->regmlen
, s
, preg
->cflags
& REG_ICASE
) >= 0) {
1066 if (s
== NULL
) /* Not present. */
1070 /* Mark beginning of line for ^ . */
1071 preg
->regbol
= string
;
1073 /* Simplest case: anchored match need be tried only once (maybe per line). */
1074 if (preg
->reganch
) {
1075 if (eflags
& REG_NOTBOL
) {
1076 /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1080 int ret
= regtry(preg
, string
);
1086 if (preg
->cflags
& REG_NEWLINE
) {
1087 /* Try the next anchor? */
1088 string
= strchr(string
, '\n');
1090 preg
->regbol
= ++string
;
1099 /* Messy cases: unanchored match. */
1101 if (preg
->regstart
!= '\0') {
1102 /* We know what char it must start with. */
1103 while ((s
= str_find(s
, preg
->regstart
, preg
->cflags
& REG_ICASE
)) != NULL
) {
1104 if (regtry(preg
, s
))
1110 /* We don't -- general case. */
1112 if (regtry(preg
, s
))
1117 s
+= utf8_charlen(*s
);
1125 - regtry - try match at specific point
1127 /* 0 failure, 1 success */
1128 static int regtry( regex_t
*preg
, const char *string
)
1132 preg
->reginput
= string
;
1134 for (i
= 0; i
< preg
->nmatch
; i
++) {
1135 preg
->pmatch
[i
].rm_so
= -1;
1136 preg
->pmatch
[i
].rm_eo
= -1;
1138 if (regmatch(preg
, preg
->program
+ 1)) {
1139 preg
->pmatch
[0].rm_so
= string
- preg
->start
;
1140 preg
->pmatch
[0].rm_eo
= preg
->reginput
- preg
->start
;
1147 * Returns bytes matched if 'pattern' is a prefix of 'string'.
1149 * If 'nocase' is non-zero, does a case-insensitive match.
1151 * Returns -1 on not found.
1153 static int prefix_cmp(const int *prog
, int proglen
, const char *string
, int nocase
)
1155 const char *s
= string
;
1156 while (proglen
&& *s
) {
1158 int n
= reg_utf8_tounicode_case(s
, &ch
, nocase
);
1173 * Searchs for 'c' in the range 'range'.
1175 * Returns 1 if found, or 0 if not.
1177 static int reg_range_find(const int *range
, int c
)
1180 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1181 if (c
>= range
[1] && c
<= (range
[0] + range
[1] - 1)) {
1190 * Search for the character 'c' in the utf-8 string 'string'.
1192 * If 'nocase' is set, the 'string' is assumed to be uppercase
1193 * and 'c' is converted to uppercase before matching.
1195 * Returns the byte position in the string where the 'c' was found, or
1196 * NULL if not found.
1198 static const char *str_find(const char *string
, int c
, int nocase
)
1201 /* The "string" should already be converted to uppercase */
1206 int n
= reg_utf8_tounicode_case(string
, &ch
, nocase
);
1216 * Returns true if 'ch' is an end-of-line char.
1218 * In REG_NEWLINE mode, \n is considered EOL in
1221 static int reg_iseol(regex_t
*preg
, int ch
)
1223 if (preg
->cflags
& REG_NEWLINE
) {
1224 return ch
== '\0' || ch
== '\n';
1231 static int regmatchsimplerepeat(regex_t
*preg
, const int *scan
, int matchmin
)
1240 const int *next
= regnext(preg
, scan
);
1243 * Lookahead to avoid useless match attempts
1244 * when we know what character comes next.
1246 if (OP(next
) == EXACTLY
) {
1247 nextch
= *OPERAND(next
);
1249 save
= preg
->reginput
;
1250 no
= regrepeat(preg
, scan
+ 5, max
);
1255 /* from min up to no */
1259 /* else from no down to min */
1271 preg
->reginput
= save
+ utf8_index(save
, no
);
1272 reg_utf8_tounicode_case(preg
->reginput
, &c
, (preg
->cflags
& REG_ICASE
));
1273 /* If it could work, try it. */
1274 if (reg_iseol(preg
, nextch
) || c
== nextch
) {
1275 if (regmatch(preg
, next
)) {
1280 /* Couldn't or didn't, add one more */
1284 /* Couldn't or didn't -- back up. */
1291 static int regmatchrepeat(regex_t
*preg
, int *scan
, int matchmin
)
1298 save
= preg
->reginput
;
1300 /* Have we reached min? */
1301 if (scan
[4] < min
) {
1302 /* No, so get another one */
1304 if (regmatch(preg
, scan
+ 5)) {
1310 if (scan
[4] > max
) {
1315 /* minimal, so try other branch first */
1316 if (regmatch(preg
, regnext(preg
, scan
))) {
1319 /* No, so try one more */
1321 if (regmatch(preg
, scan
+ 5)) {
1327 /* maximal, so try this branch again */
1328 save
= preg
->reginput
;
1329 if (scan
[4] < max
) {
1331 if (regmatch(preg
, scan
+ 5)) {
1336 /* At this point we are at max with no match. Back up by one and try the other branch */
1337 preg
->reginput
= save
;
1338 int ret
= regmatch(preg
, regnext(preg
, scan
));
1343 - regmatch - main matching routine
1345 * Conceptually the strategy is simple: check to see whether the current
1346 * node matches, call self recursively to see whether the rest matches,
1347 * and then act accordingly. In practice we make some effort to avoid
1348 * recursion, in particular by going through "ordinary" nodes (that don't
1349 * need to know whether the rest of the match failed) by a loop instead of
1352 /* 0 failure, 1 success */
1353 static int regmatch(regex_t
*preg
, const int *prog
)
1355 const int *scan
; /* Current node. */
1356 const int *next
; /* Next node. */
1361 if (scan
!= NULL
&& regnarrate
)
1362 fprintf(stderr
, "%s(\n", regprop(scan
));
1364 while (scan
!= NULL
) {
1369 //fprintf(stderr, "%s...\n", regprop(scan));
1371 fprintf(stderr
, "%2d{%02x}%s...\n", (int)(scan
-preg
->program
), op
, regprop(scan
)); /* Where, what. */
1374 next
= regnext(preg
, scan
);
1375 n
= reg_utf8_tounicode_case(preg
->reginput
, &c
, (preg
->cflags
& REG_ICASE
));
1379 if (preg
->reginput
!= preg
->regbol
)
1383 if (!reg_iseol(preg
, c
)) {
1388 /* Must be looking at a letter, digit, or _ */
1389 if ((!isalnum(UCHAR(c
))) && c
!= '_')
1391 /* Prev must be BOL or nonword */
1392 if (preg
->reginput
> preg
->regbol
&&
1393 (isalnum(UCHAR(preg
->reginput
[-1])) || preg
->reginput
[-1] == '_'))
1397 /* Can't match at BOL */
1398 if (preg
->reginput
> preg
->regbol
) {
1399 /* Current must be EOL or nonword */
1400 if (reg_iseol(preg
, c
) || !isalnum(UCHAR(c
)) || c
!= '_') {
1401 c
= preg
->reginput
[-1];
1402 /* Previous must be word */
1403 if (isalnum(UCHAR(c
)) || c
== '_') {
1412 if (reg_iseol(preg
, c
))
1414 preg
->reginput
+= n
;
1421 opnd
= OPERAND(scan
);
1422 len
= str_int_len(opnd
);
1424 slen
= prefix_cmp(opnd
, len
, preg
->reginput
, preg
->cflags
& REG_ICASE
);
1428 preg
->reginput
+= slen
;
1432 if (reg_iseol(preg
, c
) || reg_range_find(OPERAND(scan
), c
) == 0) {
1435 preg
->reginput
+= n
;
1438 if (reg_iseol(preg
, c
) || reg_range_find(OPERAND(scan
), c
) != 0) {
1441 preg
->reginput
+= n
;
1450 if (OP(next
) != BRANCH
) /* No choice. */
1451 next
= OPERAND(scan
); /* Avoid recursion. */
1454 save
= preg
->reginput
;
1455 if (regmatch(preg
, OPERAND(scan
))) {
1458 preg
->reginput
= save
;
1459 scan
= regnext(preg
, scan
);
1460 } while (scan
!= NULL
&& OP(scan
) == BRANCH
);
1468 return regmatchsimplerepeat(preg
, scan
, OP(scan
) == REPMIN
);
1472 return regmatchrepeat(preg
, (int *)scan
, OP(scan
) == REPXMIN
);
1475 return(1); /* Success! */
1478 if (OP(scan
) >= OPEN
+1 && OP(scan
) < CLOSE_END
) {
1481 save
= preg
->reginput
;
1483 if (regmatch(preg
, next
)) {
1486 * Don't set startp if some later
1487 * invocation of the same parentheses
1490 if (OP(scan
) < CLOSE
) {
1491 no
= OP(scan
) - OPEN
;
1492 if (no
< preg
->nmatch
&& preg
->pmatch
[no
].rm_so
== -1) {
1493 preg
->pmatch
[no
].rm_so
= save
- preg
->start
;
1497 no
= OP(scan
) - CLOSE
;
1498 if (no
< preg
->nmatch
&& preg
->pmatch
[no
].rm_eo
== -1) {
1499 preg
->pmatch
[no
].rm_eo
= save
- preg
->start
;
1506 return REG_ERR_INTERNAL
;
1513 * We get here only if there's trouble -- normally "case END" is
1514 * the terminating point.
1516 return REG_ERR_INTERNAL
;
1520 - regrepeat - repeatedly match something simple, report how many
1522 static int regrepeat(regex_t
*preg
, const int *p
, int max
)
1530 scan
= preg
->reginput
;
1534 /* No need to handle utf8 specially here */
1535 while (!reg_iseol(preg
, *scan
) && count
< max
) {
1541 while (count
< max
) {
1542 n
= reg_utf8_tounicode_case(scan
, &ch
, preg
->cflags
& REG_ICASE
);
1551 while (count
< max
) {
1552 n
= reg_utf8_tounicode_case(scan
, &ch
, preg
->cflags
& REG_ICASE
);
1553 if (reg_iseol(preg
, ch
) || reg_range_find(opnd
, ch
) == 0) {
1561 while (count
< max
) {
1562 n
= reg_utf8_tounicode_case(scan
, &ch
, preg
->cflags
& REG_ICASE
);
1563 if (reg_iseol(preg
, ch
) || reg_range_find(opnd
, ch
) != 0) {
1570 default: /* Oh dear. Called inappropriately. */
1571 preg
->err
= REG_ERR_INTERNAL
;
1572 count
= 0; /* Best compromise. */
1575 preg
->reginput
= scan
;
1581 - regnext - dig the "next" pointer out of a node
1583 static const int *regnext(regex_t
*preg
, const int *p
)
1604 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1606 static void regdump(regex_t
*preg
)
1609 char op
= EXACTLY
; /* Arbitrary non-END op. */
1613 if (preg
->regcode
== ®dummy
)
1616 s
= preg
->program
+ 1;
1617 while (op
!= END
&& s
< preg
->regcode
) { /* While that wasn't END last time... */
1619 printf("%2d{%02x}%s", (int)(s
-preg
->program
), op
, regprop(s
)); /* Where, what. */
1620 next
= regnext(preg
, s
);
1621 if (next
== NULL
) /* Next ptr. */
1624 printf("(%d)", (int)((s
-preg
->program
)+(next
-s
)));
1626 if (op
== REP
|| op
== REPMIN
|| op
== REPX
|| op
== REPXMIN
) {
1630 printf("{%d,*}", min
);
1633 printf("{%d,%d}", min
, max
);
1635 printf(" %d", s
[2]);
1638 else if (op
== ANYOF
|| op
== ANYBUT
) {
1644 buf
[utf8_fromunicode(buf
, first
)] = 0;
1647 buf
[utf8_fromunicode(buf
, first
+ len
- 1)] = 0;
1653 else if (op
== EXACTLY
) {
1654 /* Literal string, where present. */
1657 buf
[utf8_fromunicode(buf
, *s
)] = 0;
1667 /* Header fields of interest. */
1668 if (preg
->regstart
) {
1669 buf
[utf8_fromunicode(buf
, preg
->regstart
)] = 0;
1670 printf("start '%s' ", buf
);
1673 printf("anchored ");
1674 if (preg
->regmust
!= NULL
) {
1676 printf("must have:");
1677 for (i
= 0; i
< preg
->regmlen
; i
++) {
1678 putchar(preg
->regmust
[i
]);
1687 - regprop - printable representation of opcode
1689 static const char *regprop( const int *op
)
1692 static char buf
[50];
1694 (void) strcpy(buf
, ":");
1746 if (OP(op
) >= OPEN
&& OP(op
) < CLOSE
) {
1747 sprintf(buf
+strlen(buf
), "OPEN%d", OP(op
)-OPEN
);
1749 else if (OP(op
) >= CLOSE
&& OP(op
) < CLOSE_END
) {
1750 sprintf(buf
+strlen(buf
), "CLOSE%d", OP(op
)-CLOSE
);
1759 (void) strcat(buf
, p
);
1764 size_t regerror(int errcode
, const regex_t
*preg
, char *errbuf
, size_t errbuf_size
)
1766 static const char *error_strings
[] = {
1775 "parentheses () not balanced",
1776 "braces {} not balanced",
1777 "invalid repetition count(s)",
1782 "count follows nothing",
1783 "trailing backslash",
1784 "corrupted program",
1785 "contains null char",
1789 if (errcode
< 0 || errcode
>= REG_ERR_NUM
) {
1790 err
= "Bad error code";
1793 err
= error_strings
[errcode
];
1796 return snprintf(errbuf
, errbuf_size
, "%s", err
);
1799 void regfree(regex_t
*preg
)
1801 free(preg
->program
);