core: improvements to garbage collection
[jimtcl.git] / jimregexp.c
blobe04d3a44fecf1a7bc4e2c88a49ca98554651859d
1 /*
2 * vi:se ts=8:
4 * regcomp and regexec -- regsub and regerror are elsewhere
6 * Copyright (c) 1986 by University of Toronto.
7 * Written by Henry Spencer. Not derived from licensed software.
9 * Permission is granted to anyone to use this software for any
10 * purpose on any computer system, and to redistribute it freely,
11 * subject to the following restrictions:
13 * 1. The author is not responsible for the consequences of use of
14 * this software, no matter how awful, even if they arise
15 * from defects in it.
17 * 2. The origin of this software must not be misrepresented, either
18 * by explicit claim or by omission.
20 * 3. Altered versions must be plainly marked as such, and must not
21 * be misrepresented as being the original software.
22 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
23 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
24 *** to assist in implementing egrep.
25 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
26 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
27 *** as in BSD grep and ex.
28 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
29 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
30 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
31 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
32 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
33 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
34 *** was moved into regexp.h, and the include of regexp.h now uses "'s
35 *** to avoid conflicting with the system regexp.h. Const, bless its
36 *** soul, was removed so it can compile everywhere. The declaration
37 *** of strchr() was in conflict on AIX, so it was removed (as it is
38 *** happily defined in string.h).
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
41 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
42 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
44 * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
45 * on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
46 * This includes counted repetitions, UTF-8 support, character classes,
47 * shorthand character classes, increased number of parentheses to 100,
48 * backslash escape sequences. It also removes \n as an alternative to |.
50 * Beware that some of this code is subtly aware of the way operator
51 * precedence is structured in regular expressions. Serious changes in
52 * regular-expression syntax might require a total rethink.
55 #include "jimautoconf.h"
57 #if defined(JIM_REGEXP)
58 #include <stdio.h>
59 #include <ctype.h>
60 #include <stdlib.h>
61 #include <string.h>
63 #include "jim.h"
64 #include "jimregexp.h"
65 #include "utf8.h"
67 /* An arbitrary limit, but this seems enough. Must be less than 1000. */
68 #define REG_MAX_PAREN 100
71 * Structure for regexp "program". This is essentially a linear encoding
72 * of a nondeterministic finite-state machine (aka syntax charts or
73 * "railroad normal form" in parsing technology). Each node is an opcode
74 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
75 * all nodes except BRANCH implement concatenation; a "next" pointer with
76 * a BRANCH on both ends of it is connecting two alternatives. (Here we
77 * have one of the subtle syntax dependencies: an individual BRANCH (as
78 * opposed to a collection of them) is never concatenated with anything
79 * because of operator precedence.) The operand of some types of node is
80 * a literal string; for others, it is a node leading into a sub-FSM. In
81 * particular, the operand of a BRANCH node is the first node of the branch.
82 * (NB this is *not* a tree structure: the tail of the branch connects
83 * to the thing following the set of BRANCHes.) The opcodes are:
86 /* definition number opnd? meaning */
87 #define END 0 /* no End of program. */
88 #define BOL 1 /* no Match "" at beginning of line. */
89 #define EOL 2 /* no Match "" at end of line. */
90 #define ANY 3 /* no Match any one character. */
91 #define ANYOF 4 /* str Match any character in this string. */
92 #define ANYBUT 5 /* str Match any character not in this string. */
93 #define BRANCH 6 /* node Match this alternative, or the next... */
94 #define BACK 7 /* no Match "", "next" ptr points backward. */
95 #define EXACTLY 8 /* str Match this string. */
96 #define NOTHING 9 /* no Match empty string. */
97 #define REP 10 /* max,min Match this (simple) thing [min,max] times. */
98 #define REPMIN 11 /* max,min Match this (simple) thing [min,max] times, minimal match. */
99 #define REPX 12 /* max,min Match this (complex) thing [min,max] times. */
100 #define REPXMIN 13 /* max,min Match this (complex) thing [min,max] times, minimal match. */
101 #define BOLX 14 /* no Match "" at beginning of input. */
102 #define EOLX 15 /* no Match "" at end of input. */
103 #define WORDA 16 /* no Match "" at wordchar, where prev is nonword */
104 #define WORDZ 17 /* no Match "" at nonwordchar, where prev is word */
106 #define OPENNC 1000 /* no Non-capturing parentheses - must be OPEN-1 */
107 #define OPEN 1001 /* no Mark this point in input as start of #n. */
108 /* OPEN+1 is number 1, etc. */
110 /* must not be any other opts between OPEN and CLOSE */
112 #define CLOSENC 2000 /* no Non-capturing parentheses - must be CLOSE-1 */
113 #define CLOSE 2001 /* no Analogous to OPEN. */
114 #define CLOSE_END (CLOSE+REG_MAX_PAREN)
117 * The first word of the regexp internal "program" is actually this magic
118 * number; the start node begins in the second word.
120 #define REG_MAGIC 0xFADED00D
123 * Opcode notes:
125 * BRANCH The set of branches constituting a single choice are hooked
126 * together with their "next" pointers, since precedence prevents
127 * anything being concatenated to any individual branch. The
128 * "next" pointer of the last BRANCH in a choice points to the
129 * thing following the whole choice. This is also where the
130 * final "next" pointer of each individual branch points; each
131 * branch starts with the operand node of a BRANCH node.
133 * BACK Normal "next" pointers all implicitly point forward; BACK
134 * exists to make loop structures possible.
136 * REP,REPX Repeated matches ('?', '*', '+' and {min,max}) are implemented
137 * as either simple repeats (REP) or complex repeats (REPX).
138 * These opcodes include a "min" and "max" count after the opcode.
139 * This is followed by a fourth "current count" word that is
140 * only used by REPX, as it implements a recursive match.
141 * REPMIN and REPXMIN are identical except they implement minimal repeats.
143 * OPEN,CLOSE ...are numbered at compile time.
147 * A node is one word of opcode followed by one word of "next" pointer.
148 * The "next" pointer value is a positive offset from the opcode of the node
149 * containing it.
150 * An operand, if any, simply follows the node. (Note that much of the
151 * code generation knows about this implicit relationship.)
153 #define OP(preg, p) (preg->program[p])
154 #define NEXT(preg, p) (preg->program[p + 1])
155 #define OPERAND(p) ((p) + 2)
158 * See regmagic.h for one further detail of program structure.
163 * Utility definitions.
166 #define FAIL(R,M) { (R)->err = (M); return (M); }
167 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
168 #define META "^$.[()|?{+*"
171 * Flags to be passed up and down.
173 #define HASWIDTH 1 /* Known never to match null string. */
174 #define SIMPLE 2 /* Simple enough to be STAR/PLUS operand. */
175 #define SPSTART 4 /* Starts with * or +. */
176 #define WORST 0 /* Worst case. */
178 #define MAX_REP_COUNT 1000000
181 * Forward declarations for regcomp()'s friends.
183 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
184 static int regpiece(regex_t *preg, int *flagp );
185 static int regbranch(regex_t *preg, int *flagp );
186 static int regatom(regex_t *preg, int *flagp );
187 static int regnode(regex_t *preg, int op );
188 static int regnext(regex_t *preg, int p );
189 static void regc(regex_t *preg, int b );
190 static int reginsert(regex_t *preg, int op, int size, int opnd );
191 static void regtail(regex_t *preg, int p, int val);
192 static void regoptail(regex_t *preg, int p, int val );
193 static int regopsize(regex_t *preg, int p );
195 static int reg_range_find(const int *string, int c);
196 static const char *str_find(const char *string, int c, int nocase);
197 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
199 /*#define DEBUG*/
200 #ifdef DEBUG
201 static int regnarrate = 0;
202 static void regdump(regex_t *preg);
203 static const char *regprop( int op );
204 #endif
208 * Returns the length of the null-terminated integer sequence.
210 static int str_int_len(const int *seq)
212 int n = 0;
213 while (*seq++) {
214 n++;
216 return n;
220 - regcomp - compile a regular expression into internal code
222 * We can't allocate space until we know how big the compiled form will be,
223 * but we can't compile it (and thus know how big it is) until we've got a
224 * place to put the code. So we cheat: we compile it twice, once with code
225 * generation turned off and size counting turned on, and once "for real".
226 * This also means that we don't allocate space until we are sure that the
227 * thing really will compile successfully, and we never have to move the
228 * code and thus invalidate pointers into it. (Note that it has to be in
229 * one piece because free() must be able to free it all.)
231 * Beware that the optimization-preparation code in here knows about some
232 * of the structure of the compiled regexp.
234 int regcomp(regex_t *preg, const char *exp, int cflags)
236 int scan;
237 int longest;
238 unsigned len;
239 int flags;
241 #ifdef DEBUG
242 fprintf(stderr, "Compiling: '%s'\n", exp);
243 #endif
244 memset(preg, 0, sizeof(*preg));
246 if (exp == NULL)
247 FAIL(preg, REG_ERR_NULL_ARGUMENT);
249 /* First pass: determine size, legality. */
250 preg->cflags = cflags;
251 preg->regparse = exp;
253 /* Allocate space. */
254 preg->proglen = (strlen(exp) + 1) * 5;
255 preg->program = malloc(preg->proglen * sizeof(int));
256 if (preg->program == NULL)
257 FAIL(preg, REG_ERR_NOMEM);
259 /* Note that since we store a magic value as the first item in the program,
260 * program offsets will never be 0
262 regc(preg, REG_MAGIC);
263 if (reg(preg, 0, &flags) == 0) {
264 return preg->err;
267 /* Small enough for pointer-storage convention? */
268 if (preg->re_nsub >= REG_MAX_PAREN) /* Probably could be 65535L. */
269 FAIL(preg,REG_ERR_TOO_BIG);
271 /* Dig out information for optimizations. */
272 preg->regstart = 0; /* Worst-case defaults. */
273 preg->reganch = 0;
274 preg->regmust = 0;
275 preg->regmlen = 0;
276 scan = 1; /* First BRANCH. */
277 if (OP(preg, regnext(preg, scan)) == END) { /* Only one top-level choice. */
278 scan = OPERAND(scan);
280 /* Starting-point info. */
281 if (OP(preg, scan) == EXACTLY) {
282 preg->regstart = preg->program[OPERAND(scan)];
284 else if (OP(preg, scan) == BOL)
285 preg->reganch++;
288 * If there's something expensive in the r.e., find the
289 * longest literal string that must appear and make it the
290 * regmust. Resolve ties in favor of later strings, since
291 * the regstart check works with the beginning of the r.e.
292 * and avoiding duplication strengthens checking. Not a
293 * strong reason, but sufficient in the absence of others.
295 if (flags&SPSTART) {
296 longest = 0;
297 len = 0;
298 for (; scan != 0; scan = regnext(preg, scan)) {
299 if (OP(preg, scan) == EXACTLY) {
300 int plen = str_int_len(preg->program + OPERAND(scan));
301 if (plen >= len) {
302 longest = OPERAND(scan);
303 len = plen;
307 preg->regmust = longest;
308 preg->regmlen = len;
312 #ifdef DEBUG
313 regdump(preg);
314 #endif
316 return 0;
320 - reg - regular expression, i.e. main body or parenthesized thing
322 * Caller must absorb opening parenthesis.
324 * Combining parenthesis handling with the base level of regular expression
325 * is a trifle forced, but the need to tie the tails of the branches to what
326 * follows makes it hard to avoid.
328 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
330 int ret;
331 int br;
332 int ender;
333 int parno = 0;
334 int flags;
336 *flagp = HASWIDTH; /* Tentatively. */
338 /* Make an OPEN node, if parenthesized. */
339 if (paren) {
340 if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
341 /* non-capturing paren */
342 preg->regparse += 2;
343 parno = -1;
345 else {
346 parno = ++preg->re_nsub;
348 ret = regnode(preg, OPEN+parno);
349 } else
350 ret = 0;
352 /* Pick up the branches, linking them together. */
353 br = regbranch(preg, &flags);
354 if (br == 0)
355 return 0;
356 if (ret != 0)
357 regtail(preg, ret, br); /* OPEN -> first. */
358 else
359 ret = br;
360 if (!(flags&HASWIDTH))
361 *flagp &= ~HASWIDTH;
362 *flagp |= flags&SPSTART;
363 while (*preg->regparse == '|') {
364 preg->regparse++;
365 br = regbranch(preg, &flags);
366 if (br == 0)
367 return 0;
368 regtail(preg, ret, br); /* BRANCH -> BRANCH. */
369 if (!(flags&HASWIDTH))
370 *flagp &= ~HASWIDTH;
371 *flagp |= flags&SPSTART;
374 /* Make a closing node, and hook it on the end. */
375 ender = regnode(preg, (paren) ? CLOSE+parno : END);
376 regtail(preg, ret, ender);
378 /* Hook the tails of the branches to the closing node. */
379 for (br = ret; br != 0; br = regnext(preg, br))
380 regoptail(preg, br, ender);
382 /* Check for proper termination. */
383 if (paren && *preg->regparse++ != ')') {
384 preg->err = REG_ERR_UNMATCHED_PAREN;
385 return 0;
386 } else if (!paren && *preg->regparse != '\0') {
387 if (*preg->regparse == ')') {
388 preg->err = REG_ERR_UNMATCHED_PAREN;
389 return 0;
390 } else {
391 preg->err = REG_ERR_JUNK_ON_END;
392 return 0;
396 return(ret);
400 - regbranch - one alternative of an | operator
402 * Implements the concatenation operator.
404 static int regbranch(regex_t *preg, int *flagp )
406 int ret;
407 int chain;
408 int latest;
409 int flags;
411 *flagp = WORST; /* Tentatively. */
413 ret = regnode(preg, BRANCH);
414 chain = 0;
415 while (*preg->regparse != '\0' && *preg->regparse != ')' &&
416 *preg->regparse != '|') {
417 latest = regpiece(preg, &flags);
418 if (latest == 0)
419 return 0;
420 *flagp |= flags&HASWIDTH;
421 if (chain == 0) {/* First piece. */
422 *flagp |= flags&SPSTART;
424 else {
425 regtail(preg, chain, latest);
427 chain = latest;
429 if (chain == 0) /* Loop ran zero times. */
430 (void) regnode(preg, NOTHING);
432 return(ret);
436 - regpiece - something followed by possible [*+?]
438 * Note that the branching code sequences used for ? and the general cases
439 * of * and + are somewhat optimized: they use the same NOTHING node as
440 * both the endmarker for their branch list and the body of the last branch.
441 * It might seem that this node could be dispensed with entirely, but the
442 * endmarker role is not redundant.
444 static int regpiece(regex_t *preg, int *flagp)
446 int ret;
447 char op;
448 int next;
449 int flags;
450 int min;
451 int max;
453 ret = regatom(preg, &flags);
454 if (ret == 0)
455 return 0;
457 op = *preg->regparse;
458 if (!ISMULT(op)) {
459 *flagp = flags;
460 return(ret);
463 if (!(flags&HASWIDTH) && op != '?') {
464 preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
465 return 0;
468 /* Handle braces (counted repetition) by expansion */
469 if (op == '{') {
470 char *end;
472 min = strtoul(preg->regparse + 1, &end, 10);
473 if (end == preg->regparse + 1) {
474 preg->err = REG_ERR_BAD_COUNT;
475 return 0;
477 if (*end == '}') {
478 max = min;
480 else if (*end == '\0') {
481 preg->err = REG_ERR_UNMATCHED_BRACES;
482 return 0;
484 else {
485 preg->regparse = end;
486 max = strtoul(preg->regparse + 1, &end, 10);
487 if (*end != '}') {
488 preg->err = REG_ERR_UNMATCHED_BRACES;
489 return 0;
492 if (end == preg->regparse + 1) {
493 max = MAX_REP_COUNT;
495 else if (max < min || max >= 100) {
496 preg->err = REG_ERR_BAD_COUNT;
497 return 0;
499 if (min >= 100) {
500 preg->err = REG_ERR_BAD_COUNT;
501 return 0;
504 preg->regparse = strchr(preg->regparse, '}');
506 else {
507 min = (op == '+');
508 max = (op == '?' ? 1 : MAX_REP_COUNT);
511 if (preg->regparse[1] == '?') {
512 preg->regparse++;
513 next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
515 else {
516 next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
518 preg->program[ret + 2] = max;
519 preg->program[ret + 3] = min;
520 preg->program[ret + 4] = 0;
522 *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
524 if (!(flags & SIMPLE)) {
525 int back = regnode(preg, BACK);
526 regtail(preg, back, ret);
527 regtail(preg, next, back);
530 preg->regparse++;
531 if (ISMULT(*preg->regparse)) {
532 preg->err = REG_ERR_NESTED_COUNT;
533 return 0;
536 return ret;
540 * Add all characters in the inclusive range between lower and upper.
542 * Handles a swapped range (upper < lower).
544 static void reg_addrange(regex_t *preg, int lower, int upper)
546 if (lower > upper) {
547 reg_addrange(preg, upper, lower);
549 /* Add a range as length, start */
550 regc(preg, upper - lower + 1);
551 regc(preg, lower);
555 * Add a null-terminated literal string as a set of ranges.
557 static void reg_addrange_str(regex_t *preg, const char *str)
559 while (*str) {
560 reg_addrange(preg, *str, *str);
561 str++;
566 * Extracts the next unicode char from utf8.
568 * If 'upper' is set, converts the char to uppercase.
570 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
572 int l = utf8_tounicode(s, uc);
573 if (upper) {
574 *uc = utf8_upper(*uc);
576 return l;
580 * Converts a hex digit to decimal.
582 * Returns -1 for an invalid hex digit.
584 static int hexdigitval(int c)
586 if (c >= '0' && c <= '9')
587 return c - '0';
588 if (c >= 'a' && c <= 'f')
589 return c - 'a' + 10;
590 if (c >= 'A' && c <= 'F')
591 return c - 'A' + 10;
592 return -1;
596 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
598 * Returns the number of hex digits parsed.
599 * If there are no hex digits, returns 0 and stores nothing.
601 static int parse_hex(const char *s, int n, int *uc)
603 int val = 0;
604 int k;
606 for (k = 0; k < n; k++) {
607 int c = hexdigitval(*s++);
608 if (c == -1) {
609 break;
611 val = (val << 4) | c;
613 if (k) {
614 *uc = val;
616 return k;
620 * Call for chars after a backlash to decode the escape sequence.
622 * Stores the result in *ch.
624 * Returns the number of bytes consumed.
626 static int reg_decode_escape(const char *s, int *ch)
628 int n;
629 const char *s0 = s;
631 *ch = *s++;
633 switch (*ch) {
634 case 'b': *ch = '\b'; break;
635 case 'e': *ch = 27; break;
636 case 'f': *ch = '\f'; break;
637 case 'n': *ch = '\n'; break;
638 case 'r': *ch = '\r'; break;
639 case 't': *ch = '\t'; break;
640 case 'v': *ch = '\v'; break;
641 case 'u':
642 if (*s == '{') {
643 /* Expect \u{NNNN} */
644 n = parse_hex(s + 1, 6, ch);
645 if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
646 s += n + 2;
648 else {
649 /* Invalid, so just treat as an escaped 'u' */
650 *ch = 'u';
653 else if ((n = parse_hex(s, 4, ch)) > 0) {
654 s += n;
656 break;
657 case 'U':
658 if ((n = parse_hex(s, 8, ch)) > 0) {
659 s += n;
661 break;
662 case 'x':
663 if ((n = parse_hex(s, 2, ch)) > 0) {
664 s += n;
666 break;
667 case '\0':
668 s--;
669 *ch = '\\';
670 break;
672 return s - s0;
676 - regatom - the lowest level
678 * Optimization: gobbles an entire sequence of ordinary characters so that
679 * it can turn them into a single node, which is smaller to store and
680 * faster to run. Backslashed characters are exceptions, each becoming a
681 * separate node; the code is simpler that way and it's not worth fixing.
683 static int regatom(regex_t *preg, int *flagp)
685 int ret;
686 int flags;
687 int nocase = (preg->cflags & REG_ICASE);
689 int ch;
690 int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
692 *flagp = WORST; /* Tentatively. */
694 preg->regparse += n;
695 switch (ch) {
696 /* FIXME: these chars only have meaning at beg/end of pat? */
697 case '^':
698 ret = regnode(preg, BOL);
699 break;
700 case '$':
701 ret = regnode(preg, EOL);
702 break;
703 case '.':
704 ret = regnode(preg, ANY);
705 *flagp |= HASWIDTH|SIMPLE;
706 break;
707 case '[': {
708 const char *pattern = preg->regparse;
710 if (*pattern == '^') { /* Complement of range. */
711 ret = regnode(preg, ANYBUT);
712 pattern++;
713 } else
714 ret = regnode(preg, ANYOF);
716 /* Special case. If the first char is ']' or '-', it is part of the set */
717 if (*pattern == ']' || *pattern == '-') {
718 reg_addrange(preg, *pattern, *pattern);
719 pattern++;
722 while (*pattern != ']') {
723 /* Is this a range? a-z */
724 int start;
725 int end;
727 enum {
728 CC_ALPHA, CC_ALNUM, CC_SPACE, CC_BLANK, CC_UPPER, CC_LOWER,
729 CC_DIGIT, CC_XDIGIT, CC_CNTRL, CC_GRAPH, CC_PRINT, CC_PUNCT,
730 CC_NUM
732 int cc;
734 if (!*pattern) {
735 preg->err = REG_ERR_UNMATCHED_BRACKET;
736 return 0;
739 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
740 if (start == '\\') {
741 /* First check for class shorthand escapes */
742 switch (*pattern) {
743 case 's':
744 pattern++;
745 cc = CC_SPACE;
746 goto cc_switch;
747 case 'd':
748 pattern++;
749 cc = CC_DIGIT;
750 goto cc_switch;
751 case 'w':
752 pattern++;
753 reg_addrange(preg, '_', '_');
754 cc = CC_ALNUM;
755 goto cc_switch;
757 pattern += reg_decode_escape(pattern, &start);
758 if (start == 0) {
759 preg->err = REG_ERR_NULL_CHAR;
760 return 0;
762 if (start == '\\' && *pattern == 0) {
763 preg->err = REG_ERR_INVALID_ESCAPE;
764 return 0;
767 if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
768 /* skip '-' */
769 pattern += utf8_tounicode(pattern, &end);
770 pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
771 if (end == '\\') {
772 pattern += reg_decode_escape(pattern, &end);
773 if (end == 0) {
774 preg->err = REG_ERR_NULL_CHAR;
775 return 0;
777 if (start == '\\' && *pattern == 0) {
778 preg->err = REG_ERR_INVALID_ESCAPE;
779 return 0;
783 reg_addrange(preg, start, end);
784 continue;
786 if (start == '[' && pattern[0] == ':') {
787 static const char *character_class[] = {
788 ":alpha:", ":alnum:", ":space:", ":blank:", ":upper:", ":lower:",
789 ":digit:", ":xdigit:", ":cntrl:", ":graph:", ":print:", ":punct:",
792 for (cc = 0; cc < CC_NUM; cc++) {
793 n = strlen(character_class[cc]);
794 if (strncmp(pattern, character_class[cc], n) == 0) {
795 /* Found a character class */
796 pattern += n + 1;
797 break;
800 if (cc != CC_NUM) {
801 cc_switch:
802 switch (cc) {
803 case CC_ALNUM:
804 reg_addrange(preg, '0', '9');
805 /* Fall through */
806 case CC_ALPHA:
807 if ((preg->cflags & REG_ICASE) == 0) {
808 reg_addrange(preg, 'a', 'z');
810 reg_addrange(preg, 'A', 'Z');
811 break;
812 case CC_SPACE:
813 reg_addrange_str(preg, " \t\r\n\f\v");
814 break;
815 case CC_BLANK:
816 reg_addrange_str(preg, " \t");
817 break;
818 case CC_UPPER:
819 reg_addrange(preg, 'A', 'Z');
820 break;
821 case CC_LOWER:
822 reg_addrange(preg, 'a', 'z');
823 break;
824 case CC_XDIGIT:
825 reg_addrange(preg, 'a', 'f');
826 reg_addrange(preg, 'A', 'F');
827 /* Fall through */
828 case CC_DIGIT:
829 reg_addrange(preg, '0', '9');
830 break;
831 case CC_CNTRL:
832 reg_addrange(preg, 0, 31);
833 reg_addrange(preg, 127, 127);
834 break;
835 case CC_PRINT:
836 reg_addrange(preg, ' ', '~');
837 break;
838 case CC_GRAPH:
839 reg_addrange(preg, '!', '~');
840 break;
841 case CC_PUNCT:
842 reg_addrange(preg, '!', '/');
843 reg_addrange(preg, ':', '@');
844 reg_addrange(preg, '[', '`');
845 reg_addrange(preg, '{', '~');
846 break;
848 continue;
851 /* Not a range, so just add the char */
852 reg_addrange(preg, start, start);
854 regc(preg, '\0');
856 if (*pattern) {
857 pattern++;
859 preg->regparse = pattern;
861 *flagp |= HASWIDTH|SIMPLE;
863 break;
864 case '(':
865 ret = reg(preg, 1, &flags);
866 if (ret == 0)
867 return 0;
868 *flagp |= flags&(HASWIDTH|SPSTART);
869 break;
870 case '\0':
871 case '|':
872 case ')':
873 preg->err = REG_ERR_INTERNAL;
874 return 0; /* Supposed to be caught earlier. */
875 case '?':
876 case '+':
877 case '*':
878 case '{':
879 preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
880 return 0;
881 case '\\':
882 ch = *preg->regparse++;
883 switch (ch) {
884 case '\0':
885 preg->err = REG_ERR_INVALID_ESCAPE;
886 return 0;
887 case 'A':
888 ret = regnode(preg, BOLX);
889 break;
890 case 'Z':
891 ret = regnode(preg, EOLX);
892 break;
893 case '<':
894 case 'm':
895 ret = regnode(preg, WORDA);
896 break;
897 case '>':
898 case 'M':
899 ret = regnode(preg, WORDZ);
900 break;
901 case 'd':
902 case 'D':
903 ret = regnode(preg, ch == 'd' ? ANYOF : ANYBUT);
904 reg_addrange(preg, '0', '9');
905 regc(preg, '\0');
906 *flagp |= HASWIDTH|SIMPLE;
907 break;
908 case 'w':
909 case 'W':
910 ret = regnode(preg, ch == 'w' ? ANYOF : ANYBUT);
911 if ((preg->cflags & REG_ICASE) == 0) {
912 reg_addrange(preg, 'a', 'z');
914 reg_addrange(preg, 'A', 'Z');
915 reg_addrange(preg, '0', '9');
916 reg_addrange(preg, '_', '_');
917 regc(preg, '\0');
918 *flagp |= HASWIDTH|SIMPLE;
919 break;
920 case 's':
921 case 'S':
922 ret = regnode(preg, ch == 's' ? ANYOF : ANYBUT);
923 reg_addrange_str(preg," \t\r\n\f\v");
924 regc(preg, '\0');
925 *flagp |= HASWIDTH|SIMPLE;
926 break;
927 /* FIXME: Someday handle \1, \2, ... */
928 default:
929 /* Handle general quoted chars in exact-match routine */
930 /* Back up to include the backslash */
931 preg->regparse--;
932 goto de_fault;
934 break;
935 de_fault:
936 default: {
938 * Encode a string of characters to be matched exactly.
940 int added = 0;
942 /* Back up to pick up the first char of interest */
943 preg->regparse -= n;
945 ret = regnode(preg, EXACTLY);
947 /* Note that a META operator such as ? or * consumes the
948 * preceding char.
949 * Thus we must be careful to look ahead by 2 and add the
950 * last char as it's own EXACTLY if necessary
953 /* Until end of string or a META char is reached */
954 while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
955 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
956 if (ch == '\\' && preg->regparse[n]) {
957 /* Non-trailing backslash.
958 * Is this a special escape, or a regular escape?
960 if (strchr("<>mMwWdDsSAZ", preg->regparse[n])) {
961 /* A special escape. All done with EXACTLY */
962 break;
964 /* Decode it. Note that we add the length for the escape
965 * sequence to the length for the backlash so we can skip
966 * the entire sequence, or not as required.
968 n += reg_decode_escape(preg->regparse + n, &ch);
969 if (ch == 0) {
970 preg->err = REG_ERR_NULL_CHAR;
971 return 0;
975 /* Now we have one char 'ch' of length 'n'.
976 * Check to see if the following char is a MULT
979 if (ISMULT(preg->regparse[n])) {
980 /* Yes. But do we already have some EXACTLY chars? */
981 if (added) {
982 /* Yes, so return what we have and pick up the current char next time around */
983 break;
985 /* No, so add this single char and finish */
986 regc(preg, ch);
987 added++;
988 preg->regparse += n;
989 break;
992 /* No, so just add this char normally */
993 regc(preg, ch);
994 added++;
995 preg->regparse += n;
997 regc(preg, '\0');
999 *flagp |= HASWIDTH;
1000 if (added == 1)
1001 *flagp |= SIMPLE;
1002 break;
1004 break;
1007 return(ret);
1010 static void reg_grow(regex_t *preg, int n)
1012 if (preg->p + n >= preg->proglen) {
1013 preg->proglen = (preg->p + n) * 2;
1014 preg->program = realloc(preg->program, preg->proglen * sizeof(int));
1019 - regnode - emit a node
1021 /* Location. */
1022 static int regnode(regex_t *preg, int op)
1024 reg_grow(preg, 2);
1026 /* The OP followed by a next pointer */
1027 preg->program[preg->p++] = op;
1028 preg->program[preg->p++] = 0;
1030 /* Return the start of the node */
1031 return preg->p - 2;
1035 - regc - emit (if appropriate) a byte of code
1037 static void regc(regex_t *preg, int b )
1039 reg_grow(preg, 1);
1040 preg->program[preg->p++] = b;
1044 - reginsert - insert an operator in front of already-emitted operand
1046 * Means relocating the operand.
1047 * Returns the new location of the original operand.
1049 static int reginsert(regex_t *preg, int op, int size, int opnd )
1051 reg_grow(preg, size);
1053 /* Move everything from opnd up */
1054 memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd));
1055 /* Zero out the new space */
1056 memset(preg->program + opnd, 0, sizeof(int) * size);
1058 preg->program[opnd] = op;
1060 preg->p += size;
1062 return opnd + size;
1066 - regtail - set the next-pointer at the end of a node chain
1068 static void regtail(regex_t *preg, int p, int val)
1070 int scan;
1071 int temp;
1072 int offset;
1074 /* Find last node. */
1075 scan = p;
1076 for (;;) {
1077 temp = regnext(preg, scan);
1078 if (temp == 0)
1079 break;
1080 scan = temp;
1083 if (OP(preg, scan) == BACK)
1084 offset = scan - val;
1085 else
1086 offset = val - scan;
1088 preg->program[scan + 1] = offset;
1092 - regoptail - regtail on operand of first argument; nop if operandless
1095 static void regoptail(regex_t *preg, int p, int val )
1097 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1098 if (p != 0 && OP(preg, p) == BRANCH) {
1099 regtail(preg, OPERAND(p), val);
1104 * regexec and friends
1108 * Forwards.
1110 static int regtry(regex_t *preg, const char *string );
1111 static int regmatch(regex_t *preg, int prog);
1112 static int regrepeat(regex_t *preg, int p, int max);
1115 - regexec - match a regexp against a string
1117 int regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1119 const char *s;
1120 int scan;
1122 /* Be paranoid... */
1123 if (preg == NULL || preg->program == NULL || string == NULL) {
1124 return REG_ERR_NULL_ARGUMENT;
1127 /* Check validity of program. */
1128 if (*preg->program != REG_MAGIC) {
1129 return REG_ERR_CORRUPTED;
1132 #ifdef DEBUG
1133 fprintf(stderr, "regexec: %s\n", string);
1134 regdump(preg);
1135 #endif
1137 preg->eflags = eflags;
1138 preg->pmatch = pmatch;
1139 preg->nmatch = nmatch;
1140 preg->start = string; /* All offsets are computed from here */
1142 /* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */
1143 for (scan = OPERAND(1); scan != 0; scan += regopsize(preg, scan)) {
1144 int op = OP(preg, scan);
1145 if (op == END)
1146 break;
1147 if (op == REPX || op == REPXMIN)
1148 preg->program[scan + 4] = 0;
1151 /* If there is a "must appear" string, look for it. */
1152 if (preg->regmust != 0) {
1153 s = string;
1154 while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
1155 if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1156 break;
1158 s++;
1160 if (s == NULL) /* Not present. */
1161 return REG_NOMATCH;
1164 /* Mark beginning of line for ^ . */
1165 preg->regbol = string;
1167 /* Simplest case: anchored match need be tried only once (maybe per line). */
1168 if (preg->reganch) {
1169 if (eflags & REG_NOTBOL) {
1170 /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1171 goto nextline;
1173 while (1) {
1174 if (regtry(preg, string)) {
1175 return REG_NOERROR;
1177 if (*string) {
1178 nextline:
1179 if (preg->cflags & REG_NEWLINE) {
1180 /* Try the next anchor? */
1181 string = strchr(string, '\n');
1182 if (string) {
1183 preg->regbol = ++string;
1184 continue;
1188 return REG_NOMATCH;
1192 /* Messy cases: unanchored match. */
1193 s = string;
1194 if (preg->regstart != '\0') {
1195 /* We know what char it must start with. */
1196 while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1197 if (regtry(preg, s))
1198 return REG_NOERROR;
1199 s++;
1202 else
1203 /* We don't -- general case. */
1204 while (1) {
1205 if (regtry(preg, s))
1206 return REG_NOERROR;
1207 if (*s == '\0') {
1208 break;
1210 else {
1211 int c;
1212 s += utf8_tounicode(s, &c);
1216 /* Failure. */
1217 return REG_NOMATCH;
1221 - regtry - try match at specific point
1223 /* 0 failure, 1 success */
1224 static int regtry( regex_t *preg, const char *string )
1226 int i;
1228 preg->reginput = string;
1230 for (i = 0; i < preg->nmatch; i++) {
1231 preg->pmatch[i].rm_so = -1;
1232 preg->pmatch[i].rm_eo = -1;
1234 if (regmatch(preg, 1)) {
1235 preg->pmatch[0].rm_so = string - preg->start;
1236 preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1237 return(1);
1238 } else
1239 return(0);
1243 * Returns bytes matched if 'pattern' is a prefix of 'string'.
1245 * If 'nocase' is non-zero, does a case-insensitive match.
1247 * Returns -1 on not found.
1249 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1251 const char *s = string;
1252 while (proglen && *s) {
1253 int ch;
1254 int n = reg_utf8_tounicode_case(s, &ch, nocase);
1255 if (ch != *prog) {
1256 return -1;
1258 prog++;
1259 s += n;
1260 proglen--;
1262 if (proglen == 0) {
1263 return s - string;
1265 return -1;
1269 * Searchs for 'c' in the range 'range'.
1271 * Returns 1 if found, or 0 if not.
1273 static int reg_range_find(const int *range, int c)
1275 while (*range) {
1276 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1277 if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1278 return 1;
1280 range += 2;
1282 return 0;
1286 * Search for the character 'c' in the utf-8 string 'string'.
1288 * If 'nocase' is set, the 'string' is assumed to be uppercase
1289 * and 'c' is converted to uppercase before matching.
1291 * Returns the byte position in the string where the 'c' was found, or
1292 * NULL if not found.
1294 static const char *str_find(const char *string, int c, int nocase)
1296 if (nocase) {
1297 /* The "string" should already be converted to uppercase */
1298 c = utf8_upper(c);
1300 while (*string) {
1301 int ch;
1302 int n = reg_utf8_tounicode_case(string, &ch, nocase);
1303 if (c == ch) {
1304 return string;
1306 string += n;
1308 return NULL;
1312 * Returns true if 'ch' is an end-of-line char.
1314 * In REG_NEWLINE mode, \n is considered EOL in
1315 * addition to \0
1317 static int reg_iseol(regex_t *preg, int ch)
1319 if (preg->cflags & REG_NEWLINE) {
1320 return ch == '\0' || ch == '\n';
1322 else {
1323 return ch == '\0';
1327 static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
1329 int nextch = '\0';
1330 const char *save;
1331 int no;
1332 int c;
1334 int max = preg->program[scan + 2];
1335 int min = preg->program[scan + 3];
1336 int next = regnext(preg, scan);
1339 * Lookahead to avoid useless match attempts
1340 * when we know what character comes next.
1342 if (OP(preg, next) == EXACTLY) {
1343 nextch = preg->program[OPERAND(next)];
1345 save = preg->reginput;
1346 no = regrepeat(preg, scan + 5, max);
1347 if (no < min) {
1348 return 0;
1350 if (matchmin) {
1351 /* from min up to no */
1352 max = no;
1353 no = min;
1355 /* else from no down to min */
1356 while (1) {
1357 if (matchmin) {
1358 if (no > max) {
1359 break;
1362 else {
1363 if (no < min) {
1364 break;
1367 preg->reginput = save + utf8_index(save, no);
1368 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1369 /* If it could work, try it. */
1370 if (reg_iseol(preg, nextch) || c == nextch) {
1371 if (regmatch(preg, next)) {
1372 return(1);
1375 if (matchmin) {
1376 /* Couldn't or didn't, add one more */
1377 no++;
1379 else {
1380 /* Couldn't or didn't -- back up. */
1381 no--;
1384 return(0);
1387 static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
1389 int *scanpt = preg->program + scan;
1391 int max = scanpt[2];
1392 int min = scanpt[3];
1394 /* Have we reached min? */
1395 if (scanpt[4] < min) {
1396 /* No, so get another one */
1397 scanpt[4]++;
1398 if (regmatch(preg, scan + 5)) {
1399 return 1;
1401 scanpt[4]--;
1402 return 0;
1404 if (scanpt[4] > max) {
1405 return 0;
1408 if (matchmin) {
1409 /* minimal, so try other branch first */
1410 if (regmatch(preg, regnext(preg, scan))) {
1411 return 1;
1413 /* No, so try one more */
1414 scanpt[4]++;
1415 if (regmatch(preg, scan + 5)) {
1416 return 1;
1418 scanpt[4]--;
1419 return 0;
1421 /* maximal, so try this branch again */
1422 if (scanpt[4] < max) {
1423 scanpt[4]++;
1424 if (regmatch(preg, scan + 5)) {
1425 return 1;
1427 scanpt[4]--;
1429 /* At this point we are at max with no match. Try the other branch */
1430 return regmatch(preg, regnext(preg, scan));
1434 - regmatch - main matching routine
1436 * Conceptually the strategy is simple: check to see whether the current
1437 * node matches, call self recursively to see whether the rest matches,
1438 * and then act accordingly. In practice we make some effort to avoid
1439 * recursion, in particular by going through "ordinary" nodes (that don't
1440 * need to know whether the rest of the match failed) by a loop instead of
1441 * by recursion.
1443 /* 0 failure, 1 success */
1444 static int regmatch(regex_t *preg, int prog)
1446 int scan; /* Current node. */
1447 int next; /* Next node. */
1448 const char *save;
1450 scan = prog;
1452 #ifdef DEBUG
1453 if (scan != 0 && regnarrate)
1454 fprintf(stderr, "%s(\n", regprop(scan));
1455 #endif
1456 while (scan != 0) {
1457 int n;
1458 int c;
1459 #ifdef DEBUG
1460 if (regnarrate) {
1461 fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */
1463 #endif
1464 next = regnext(preg, scan);
1465 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1467 switch (OP(preg, scan)) {
1468 case BOLX:
1469 if ((preg->eflags & REG_NOTBOL)) {
1470 return(0);
1472 /* Fall through */
1473 case BOL:
1474 if (preg->reginput != preg->regbol) {
1475 return(0);
1477 break;
1478 case EOLX:
1479 if (c != 0) {
1480 /* For EOLX, only match real end of line, not newline */
1481 return 0;
1483 break;
1484 case EOL:
1485 if (!reg_iseol(preg, c)) {
1486 return(0);
1488 break;
1489 case WORDA:
1490 /* Must be looking at a letter, digit, or _ */
1491 if ((!isalnum(UCHAR(c))) && c != '_')
1492 return(0);
1493 /* Prev must be BOL or nonword */
1494 if (preg->reginput > preg->regbol &&
1495 (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1496 return(0);
1497 break;
1498 case WORDZ:
1499 /* Can't match at BOL */
1500 if (preg->reginput > preg->regbol) {
1501 /* Current must be EOL or nonword */
1502 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1503 c = preg->reginput[-1];
1504 /* Previous must be word */
1505 if (isalnum(UCHAR(c)) || c == '_') {
1506 break;
1510 /* No */
1511 return(0);
1513 case ANY:
1514 if (reg_iseol(preg, c))
1515 return 0;
1516 preg->reginput += n;
1517 break;
1518 case EXACTLY: {
1519 int opnd;
1520 int len;
1521 int slen;
1523 opnd = OPERAND(scan);
1524 len = str_int_len(preg->program + opnd);
1526 slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1527 if (slen < 0) {
1528 return(0);
1530 preg->reginput += slen;
1532 break;
1533 case ANYOF:
1534 if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
1535 return(0);
1537 preg->reginput += n;
1538 break;
1539 case ANYBUT:
1540 if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
1541 return(0);
1543 preg->reginput += n;
1544 break;
1545 case NOTHING:
1546 break;
1547 case BACK:
1548 break;
1549 case BRANCH:
1550 if (OP(preg, next) != BRANCH) /* No choice. */
1551 next = OPERAND(scan); /* Avoid recursion. */
1552 else {
1553 do {
1554 save = preg->reginput;
1555 if (regmatch(preg, OPERAND(scan))) {
1556 return(1);
1558 preg->reginput = save;
1559 scan = regnext(preg, scan);
1560 } while (scan != 0 && OP(preg, scan) == BRANCH);
1561 return(0);
1562 /* NOTREACHED */
1564 break;
1565 case REP:
1566 case REPMIN:
1567 return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1569 case REPX:
1570 case REPXMIN:
1571 return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1573 case END:
1574 return 1; /* Success! */
1576 case OPENNC:
1577 case CLOSENC:
1578 return regmatch(preg, next);
1580 default:
1581 if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
1582 save = preg->reginput;
1583 if (regmatch(preg, next)) {
1584 if (OP(preg, scan) < CLOSE) {
1585 int no = OP(preg, scan) - OPEN;
1586 if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
1587 preg->pmatch[no].rm_so = save - preg->start;
1590 else {
1591 int no = OP(preg, scan) - CLOSE;
1592 if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
1593 preg->pmatch[no].rm_eo = save - preg->start;
1596 return(1);
1598 /* Restore input position after failure */
1599 preg->reginput = save;
1600 return(0);
1602 return REG_ERR_INTERNAL;
1605 scan = next;
1609 * We get here only if there's trouble -- normally "case END" is
1610 * the terminating point.
1612 return REG_ERR_INTERNAL;
1616 - regrepeat - repeatedly match something simple, report how many
1618 static int regrepeat(regex_t *preg, int p, int max)
1620 int count = 0;
1621 const char *scan;
1622 int opnd;
1623 int ch;
1624 int n;
1626 scan = preg->reginput;
1627 opnd = OPERAND(p);
1628 switch (OP(preg, p)) {
1629 case ANY:
1630 while (!reg_iseol(preg, *scan) && count < max) {
1631 count++;
1632 scan += utf8_charlen(*scan);
1634 break;
1635 case EXACTLY:
1636 while (count < max) {
1637 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1638 if (preg->program[opnd] != ch) {
1639 break;
1641 count++;
1642 scan += n;
1644 break;
1645 case ANYOF:
1646 while (count < max) {
1647 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1648 if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) {
1649 break;
1651 count++;
1652 scan += n;
1654 break;
1655 case ANYBUT:
1656 while (count < max) {
1657 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1658 if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) {
1659 break;
1661 count++;
1662 scan += n;
1664 break;
1665 default: /* Oh dear. Called inappropriately. */
1666 preg->err = REG_ERR_INTERNAL;
1667 count = 0; /* Best compromise. */
1668 break;
1670 preg->reginput = scan;
1672 return(count);
1676 - regnext - dig the "next" pointer out of a node
1678 static int regnext(regex_t *preg, int p )
1680 int offset;
1682 offset = NEXT(preg, p);
1684 if (offset == 0)
1685 return 0;
1687 if (OP(preg, p) == BACK)
1688 return(p-offset);
1689 else
1690 return(p+offset);
1694 - regopsize - returns the size of opcode + operands at 'p' in words
1696 static int regopsize(regex_t *preg, int p )
1698 /* Almost all opcodes are 2 words, but some are more */
1699 switch (OP(preg, p)) {
1700 case REP:
1701 case REPMIN:
1702 case REPX:
1703 case REPXMIN:
1704 return 5;
1706 case ANYOF:
1707 case ANYBUT:
1708 case EXACTLY: {
1709 int s = p + 2;
1710 while (preg->program[s++]) {
1712 return s - p;
1715 return 2;
1718 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1721 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1723 static void regdump(regex_t *preg)
1725 int s;
1726 int op = EXACTLY; /* Arbitrary non-END op. */
1727 int next;
1728 char buf[MAX_UTF8_LEN + 1];
1730 int i;
1731 for (i = 1; i < preg->p; i++) {
1732 printf("%02x ", (unsigned char)preg->program[i]);
1733 if (i % 16 == 0) {
1734 printf("\n");
1737 printf("\n");
1739 s = 1;
1740 while (op != END && s < preg->p) { /* While that wasn't END last time... */
1741 op = OP(preg, s);
1742 printf("%3d: %s", s, regprop(op)); /* Where, what. */
1743 next = regnext(preg, s);
1744 if (next == 0) /* Next ptr. */
1745 printf("(0)");
1746 else
1747 printf("(%d)", next);
1748 s += 2;
1749 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1750 int max = preg->program[s];
1751 int min = preg->program[s + 1];
1752 if (max == 65535) {
1753 printf("{%d,*}", min);
1755 else {
1756 printf("{%d,%d}", min, max);
1758 printf(" %d", preg->program[s + 2]);
1759 s += 3;
1761 else if (op == ANYOF || op == ANYBUT) {
1762 /* set of ranges */
1764 while (preg->program[s]) {
1765 int len = preg->program[s++];
1766 int first = preg->program[s++];
1767 buf[utf8_getchars(buf, first)] = 0;
1768 printf("%s", buf);
1769 if (len > 1) {
1770 buf[utf8_getchars(buf, first + len - 1)] = 0;
1771 printf("-%s", buf);
1774 s++;
1776 else if (op == EXACTLY) {
1777 /* Literal string, where present. */
1779 while (preg->program[s]) {
1780 buf[utf8_getchars(buf, preg->program[s])] = 0;
1781 printf("%s", buf);
1782 s++;
1784 s++;
1786 putchar('\n');
1789 if (op == END) {
1790 /* Header fields of interest. */
1791 if (preg->regstart) {
1792 buf[utf8_getchars(buf, preg->regstart)] = 0;
1793 printf("start '%s' ", buf);
1795 if (preg->reganch)
1796 printf("anchored ");
1797 if (preg->regmust != 0) {
1798 int i;
1799 printf("must have:");
1800 for (i = 0; i < preg->regmlen; i++) {
1801 putchar(preg->program[preg->regmust + i]);
1803 putchar('\n');
1806 printf("\n");
1810 - regprop - printable representation of opcode
1812 static const char *regprop( int op )
1814 static char buf[50];
1816 switch (op) {
1817 case BOL:
1818 return "BOL";
1819 case EOL:
1820 return "EOL";
1821 case BOLX:
1822 return "BOLX";
1823 case EOLX:
1824 return "EOLX";
1825 case ANY:
1826 return "ANY";
1827 case ANYOF:
1828 return "ANYOF";
1829 case ANYBUT:
1830 return "ANYBUT";
1831 case BRANCH:
1832 return "BRANCH";
1833 case EXACTLY:
1834 return "EXACTLY";
1835 case NOTHING:
1836 return "NOTHING";
1837 case BACK:
1838 return "BACK";
1839 case END:
1840 return "END";
1841 case REP:
1842 return "REP";
1843 case REPMIN:
1844 return "REPMIN";
1845 case REPX:
1846 return "REPX";
1847 case REPXMIN:
1848 return "REPXMIN";
1849 case WORDA:
1850 return "WORDA";
1851 case WORDZ:
1852 return "WORDZ";
1853 case OPENNC:
1854 return "OPEN";
1855 case CLOSENC:
1856 return "CLOSE";
1857 default:
1858 if (op >= OPEN && op < CLOSE) {
1859 snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1861 else if (op >= CLOSE && op < CLOSE_END) {
1862 snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1864 else {
1865 snprintf(buf, sizeof(buf), "?%d?\n", op);
1867 return(buf);
1870 #endif /* JIM_BOOTSTRAP */
1872 size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
1874 static const char *error_strings[] = {
1875 "success",
1876 "no match",
1877 "bad pattern",
1878 "null argument",
1879 "unknown error",
1880 "too big",
1881 "out of memory",
1882 "too many ()",
1883 "parentheses () not balanced",
1884 "braces {} not balanced",
1885 "invalid repetition count(s)",
1886 "extra characters",
1887 "*+ of empty atom",
1888 "nested count",
1889 "internal error",
1890 "count follows nothing",
1891 "invalid escape \\ sequence",
1892 "corrupted program",
1893 "contains null char",
1894 "brackets [] not balanced",
1896 const char *err;
1898 if (errcode < 0 || errcode >= REG_ERR_NUM) {
1899 err = "Bad error code";
1901 else {
1902 err = error_strings[errcode];
1905 return snprintf(errbuf, errbuf_size, "%s", err);
1908 void regfree(regex_t *preg)
1910 free(preg->program);
1913 #endif