jim.c: Jim_Length/Jim_String internal checks
[jimtcl.git] / jimregexp.c
blob3771bd72e5a85d073978bbff8071e9b24670183f
1 /*
2 * vi:se ts=8:
4 * regcomp and regexec -- regsub and regerror are elsewhere
6 * Copyright (c) 1986 by University of Toronto.
7 * Written by Henry Spencer. Not derived from licensed software.
9 * Permission is granted to anyone to use this software for any
10 * purpose on any computer system, and to redistribute it freely,
11 * subject to the following restrictions:
13 * 1. The author is not responsible for the consequences of use of
14 * this software, no matter how awful, even if they arise
15 * from defects in it.
17 * 2. The origin of this software must not be misrepresented, either
18 * by explicit claim or by omission.
20 * 3. Altered versions must be plainly marked as such, and must not
21 * be misrepresented as being the original software.
22 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
23 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
24 *** to assist in implementing egrep.
25 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
26 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
27 *** as in BSD grep and ex.
28 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
29 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
30 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
31 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
32 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
33 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
34 *** was moved into regexp.h, and the include of regexp.h now uses "'s
35 *** to avoid conflicting with the system regexp.h. Const, bless its
36 *** soul, was removed so it can compile everywhere. The declaration
37 *** of strchr() was in conflict on AIX, so it was removed (as it is
38 *** happily defined in string.h).
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
41 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
42 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
44 * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
45 * on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
46 * This includes counted repetitions, UTF-8 support, character classes,
47 * shorthand character classes, increased number of parentheses to 100,
48 * backslash escape sequences. It also removes \n as an alternative to |.
50 * Beware that some of this code is subtly aware of the way operator
51 * precedence is structured in regular expressions. Serious changes in
52 * regular-expression syntax might require a total rethink.
55 #include "jimautoconf.h"
57 #if defined(JIM_REGEXP)
58 #include <stdio.h>
59 #include <ctype.h>
60 #include <stdlib.h>
61 #include <string.h>
63 #include "jim.h"
64 #include "jimregexp.h"
65 #include "utf8.h"
67 /* An arbitrary limit, but this seems enough. Must be less than 1000. */
68 #define REG_MAX_PAREN 100
71 * Structure for regexp "program". This is essentially a linear encoding
72 * of a nondeterministic finite-state machine (aka syntax charts or
73 * "railroad normal form" in parsing technology). Each node is an opcode
74 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
75 * all nodes except BRANCH implement concatenation; a "next" pointer with
76 * a BRANCH on both ends of it is connecting two alternatives. (Here we
77 * have one of the subtle syntax dependencies: an individual BRANCH (as
78 * opposed to a collection of them) is never concatenated with anything
79 * because of operator precedence.) The operand of some types of node is
80 * a literal string; for others, it is a node leading into a sub-FSM. In
81 * particular, the operand of a BRANCH node is the first node of the branch.
82 * (NB this is *not* a tree structure: the tail of the branch connects
83 * to the thing following the set of BRANCHes.) The opcodes are:
86 /* definition number opnd? meaning */
87 #define END 0 /* no End of program. */
88 #define BOL 1 /* no Match "" at beginning of line. */
89 #define EOL 2 /* no Match "" at end of line. */
90 #define ANY 3 /* no Match any one character. */
91 #define ANYOF 4 /* str Match any character in this string. */
92 #define ANYBUT 5 /* str Match any character not in this string. */
93 #define BRANCH 6 /* node Match this alternative, or the next... */
94 #define BACK 7 /* no Match "", "next" ptr points backward. */
95 #define EXACTLY 8 /* str Match this string. */
96 #define NOTHING 9 /* no Match empty string. */
97 #define REP 10 /* max,min Match this (simple) thing [min,max] times. */
98 #define REPMIN 11 /* max,min Match this (simple) thing [min,max] times, minimal match. */
99 #define REPX 12 /* max,min Match this (complex) thing [min,max] times. */
100 #define REPXMIN 13 /* max,min Match this (complex) thing [min,max] times, minimal match. */
101 #define BOLX 14 /* no Match "" at beginning of input. */
102 #define EOLX 15 /* no Match "" at end of input. */
103 #define WORDA 16 /* no Match "" at wordchar, where prev is nonword */
104 #define WORDZ 17 /* no Match "" at nonwordchar, where prev is word */
106 #define OPENNC 1000 /* no Non-capturing parentheses - must be OPEN-1 */
107 #define OPEN 1001 /* no Mark this point in input as start of #n. */
108 /* OPEN+1 is number 1, etc. */
110 /* must not be any other opts between OPEN and CLOSE */
112 #define CLOSENC 2000 /* no Non-capturing parentheses - must be CLOSE-1 */
113 #define CLOSE 2001 /* no Analogous to OPEN. */
114 #define CLOSE_END (CLOSE+REG_MAX_PAREN)
117 * The first word of the regexp internal "program" is actually this magic
118 * number; the start node begins in the second word.
120 #define REG_MAGIC 0xFADED00D
123 * Opcode notes:
125 * BRANCH The set of branches constituting a single choice are hooked
126 * together with their "next" pointers, since precedence prevents
127 * anything being concatenated to any individual branch. The
128 * "next" pointer of the last BRANCH in a choice points to the
129 * thing following the whole choice. This is also where the
130 * final "next" pointer of each individual branch points; each
131 * branch starts with the operand node of a BRANCH node.
133 * BACK Normal "next" pointers all implicitly point forward; BACK
134 * exists to make loop structures possible.
136 * REP,REPX Repeated matches ('?', '*', '+' and {min,max}) are implemented
137 * as either simple repeats (REP) or complex repeats (REPX).
138 * These opcodes include a "min" and "max" count after the opcode.
139 * This is followed by a fourth "current count" word that is
140 * only used by REPX, as it implements a recursive match.
141 * REPMIN and REPXMIN are identical except they implement minimal repeats.
143 * OPEN,CLOSE ...are numbered at compile time.
147 * A node is one word of opcode followed by one word of "next" pointer.
148 * The "next" pointer value is a positive offset from the opcode of the node
149 * containing it.
150 * An operand, if any, simply follows the node. (Note that much of the
151 * code generation knows about this implicit relationship.)
153 #define OP(preg, p) (preg->program[p])
154 #define NEXT(preg, p) (preg->program[p + 1])
155 #define OPERAND(p) ((p) + 2)
158 * See regmagic.h for one further detail of program structure.
163 * Utility definitions.
166 #define FAIL(R,M) { (R)->err = (M); return (M); }
167 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
168 #define META "^$.[()|?{+*"
171 * Flags to be passed up and down.
173 #define HASWIDTH 1 /* Known never to match null string. */
174 #define SIMPLE 2 /* Simple enough to be STAR/PLUS operand. */
175 #define SPSTART 4 /* Starts with * or +. */
176 #define WORST 0 /* Worst case. */
178 #define MAX_REP_COUNT 1000000
181 * Forward declarations for regcomp()'s friends.
183 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
184 static int regpiece(regex_t *preg, int *flagp );
185 static int regbranch(regex_t *preg, int *flagp );
186 static int regatom(regex_t *preg, int *flagp );
187 static int regnode(regex_t *preg, int op );
188 static int regnext(regex_t *preg, int p );
189 static void regc(regex_t *preg, int b );
190 static int reginsert(regex_t *preg, int op, int size, int opnd );
191 static void regtail(regex_t *preg, int p, int val);
192 static void regoptail(regex_t *preg, int p, int val );
193 static int regopsize(regex_t *preg, int p );
195 static int reg_range_find(const int *string, int c);
196 static const char *str_find(const char *string, int c, int nocase);
197 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
199 /*#define DEBUG*/
200 #ifdef DEBUG
201 static int regnarrate = 0;
202 static void regdump(regex_t *preg);
203 static const char *regprop( int op );
204 #endif
208 * Returns the length of the null-terminated integer sequence.
210 static int str_int_len(const int *seq)
212 int n = 0;
213 while (*seq++) {
214 n++;
216 return n;
220 - regcomp - compile a regular expression into internal code
222 * We can't allocate space until we know how big the compiled form will be,
223 * but we can't compile it (and thus know how big it is) until we've got a
224 * place to put the code. So we cheat: we compile it twice, once with code
225 * generation turned off and size counting turned on, and once "for real".
226 * This also means that we don't allocate space until we are sure that the
227 * thing really will compile successfully, and we never have to move the
228 * code and thus invalidate pointers into it. (Note that it has to be in
229 * one piece because free() must be able to free it all.)
231 * Beware that the optimization-preparation code in here knows about some
232 * of the structure of the compiled regexp.
234 int regcomp(regex_t *preg, const char *exp, int cflags)
236 int scan;
237 int longest;
238 unsigned len;
239 int flags;
241 #ifdef DEBUG
242 fprintf(stderr, "Compiling: '%s'\n", exp);
243 #endif
244 memset(preg, 0, sizeof(*preg));
246 if (exp == NULL)
247 FAIL(preg, REG_ERR_NULL_ARGUMENT);
249 /* First pass: determine size, legality. */
250 preg->cflags = cflags;
251 preg->regparse = exp;
253 /* Allocate space. */
254 preg->proglen = (strlen(exp) + 1) * 5;
255 preg->program = malloc(preg->proglen * sizeof(int));
256 if (preg->program == NULL)
257 FAIL(preg, REG_ERR_NOMEM);
259 /* Note that since we store a magic value as the first item in the program,
260 * program offsets will never be 0
262 regc(preg, REG_MAGIC);
263 if (reg(preg, 0, &flags) == 0) {
264 return preg->err;
267 /* Small enough for pointer-storage convention? */
268 if (preg->re_nsub >= REG_MAX_PAREN) /* Probably could be 65535L. */
269 FAIL(preg,REG_ERR_TOO_BIG);
271 /* Dig out information for optimizations. */
272 preg->regstart = 0; /* Worst-case defaults. */
273 preg->reganch = 0;
274 preg->regmust = 0;
275 preg->regmlen = 0;
276 scan = 1; /* First BRANCH. */
277 if (OP(preg, regnext(preg, scan)) == END) { /* Only one top-level choice. */
278 scan = OPERAND(scan);
280 /* Starting-point info. */
281 if (OP(preg, scan) == EXACTLY) {
282 preg->regstart = preg->program[OPERAND(scan)];
284 else if (OP(preg, scan) == BOL)
285 preg->reganch++;
288 * If there's something expensive in the r.e., find the
289 * longest literal string that must appear and make it the
290 * regmust. Resolve ties in favor of later strings, since
291 * the regstart check works with the beginning of the r.e.
292 * and avoiding duplication strengthens checking. Not a
293 * strong reason, but sufficient in the absence of others.
295 if (flags&SPSTART) {
296 longest = 0;
297 len = 0;
298 for (; scan != 0; scan = regnext(preg, scan)) {
299 if (OP(preg, scan) == EXACTLY) {
300 int plen = str_int_len(preg->program + OPERAND(scan));
301 if (plen >= len) {
302 longest = OPERAND(scan);
303 len = plen;
307 preg->regmust = longest;
308 preg->regmlen = len;
312 #ifdef DEBUG
313 regdump(preg);
314 #endif
316 return 0;
320 - reg - regular expression, i.e. main body or parenthesized thing
322 * Caller must absorb opening parenthesis.
324 * Combining parenthesis handling with the base level of regular expression
325 * is a trifle forced, but the need to tie the tails of the branches to what
326 * follows makes it hard to avoid.
328 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
330 int ret;
331 int br;
332 int ender;
333 int parno = 0;
334 int flags;
336 *flagp = HASWIDTH; /* Tentatively. */
338 /* Make an OPEN node, if parenthesized. */
339 if (paren) {
340 if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
341 /* non-capturing paren */
342 preg->regparse += 2;
343 parno = -1;
345 else {
346 parno = ++preg->re_nsub;
348 ret = regnode(preg, OPEN+parno);
349 } else
350 ret = 0;
352 /* Pick up the branches, linking them together. */
353 br = regbranch(preg, &flags);
354 if (br == 0)
355 return 0;
356 if (ret != 0)
357 regtail(preg, ret, br); /* OPEN -> first. */
358 else
359 ret = br;
360 if (!(flags&HASWIDTH))
361 *flagp &= ~HASWIDTH;
362 *flagp |= flags&SPSTART;
363 while (*preg->regparse == '|') {
364 preg->regparse++;
365 br = regbranch(preg, &flags);
366 if (br == 0)
367 return 0;
368 regtail(preg, ret, br); /* BRANCH -> BRANCH. */
369 if (!(flags&HASWIDTH))
370 *flagp &= ~HASWIDTH;
371 *flagp |= flags&SPSTART;
374 /* Make a closing node, and hook it on the end. */
375 ender = regnode(preg, (paren) ? CLOSE+parno : END);
376 regtail(preg, ret, ender);
378 /* Hook the tails of the branches to the closing node. */
379 for (br = ret; br != 0; br = regnext(preg, br))
380 regoptail(preg, br, ender);
382 /* Check for proper termination. */
383 if (paren && *preg->regparse++ != ')') {
384 preg->err = REG_ERR_UNMATCHED_PAREN;
385 return 0;
386 } else if (!paren && *preg->regparse != '\0') {
387 if (*preg->regparse == ')') {
388 preg->err = REG_ERR_UNMATCHED_PAREN;
389 return 0;
390 } else {
391 preg->err = REG_ERR_JUNK_ON_END;
392 return 0;
396 return(ret);
400 - regbranch - one alternative of an | operator
402 * Implements the concatenation operator.
404 static int regbranch(regex_t *preg, int *flagp )
406 int ret;
407 int chain;
408 int latest;
409 int flags;
411 *flagp = WORST; /* Tentatively. */
413 ret = regnode(preg, BRANCH);
414 chain = 0;
415 while (*preg->regparse != '\0' && *preg->regparse != ')' &&
416 *preg->regparse != '|') {
417 latest = regpiece(preg, &flags);
418 if (latest == 0)
419 return 0;
420 *flagp |= flags&HASWIDTH;
421 if (chain == 0) {/* First piece. */
422 *flagp |= flags&SPSTART;
424 else {
425 regtail(preg, chain, latest);
427 chain = latest;
429 if (chain == 0) /* Loop ran zero times. */
430 (void) regnode(preg, NOTHING);
432 return(ret);
436 - regpiece - something followed by possible [*+?]
438 * Note that the branching code sequences used for ? and the general cases
439 * of * and + are somewhat optimized: they use the same NOTHING node as
440 * both the endmarker for their branch list and the body of the last branch.
441 * It might seem that this node could be dispensed with entirely, but the
442 * endmarker role is not redundant.
444 static int regpiece(regex_t *preg, int *flagp)
446 int ret;
447 char op;
448 int next;
449 int flags;
450 int min;
451 int max;
453 ret = regatom(preg, &flags);
454 if (ret == 0)
455 return 0;
457 op = *preg->regparse;
458 if (!ISMULT(op)) {
459 *flagp = flags;
460 return(ret);
463 if (!(flags&HASWIDTH) && op != '?') {
464 preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
465 return 0;
468 /* Handle braces (counted repetition) by expansion */
469 if (op == '{') {
470 char *end;
472 min = strtoul(preg->regparse + 1, &end, 10);
473 if (end == preg->regparse + 1) {
474 preg->err = REG_ERR_BAD_COUNT;
475 return 0;
477 if (*end == '}') {
478 max = min;
480 else if (*end == '\0') {
481 preg->err = REG_ERR_UNMATCHED_BRACES;
482 return 0;
484 else {
485 preg->regparse = end;
486 max = strtoul(preg->regparse + 1, &end, 10);
487 if (*end != '}') {
488 preg->err = REG_ERR_UNMATCHED_BRACES;
489 return 0;
492 if (end == preg->regparse + 1) {
493 max = MAX_REP_COUNT;
495 else if (max < min || max >= 100) {
496 preg->err = REG_ERR_BAD_COUNT;
497 return 0;
499 if (min >= 100) {
500 preg->err = REG_ERR_BAD_COUNT;
501 return 0;
504 preg->regparse = strchr(preg->regparse, '}');
506 else {
507 min = (op == '+');
508 max = (op == '?' ? 1 : MAX_REP_COUNT);
511 if (preg->regparse[1] == '?') {
512 preg->regparse++;
513 next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
515 else {
516 next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
518 preg->program[ret + 2] = max;
519 preg->program[ret + 3] = min;
520 preg->program[ret + 4] = 0;
522 *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
524 if (!(flags & SIMPLE)) {
525 int back = regnode(preg, BACK);
526 regtail(preg, back, ret);
527 regtail(preg, next, back);
530 preg->regparse++;
531 if (ISMULT(*preg->regparse)) {
532 preg->err = REG_ERR_NESTED_COUNT;
533 return 0;
536 return ret;
540 * Add all characters in the inclusive range between lower and upper.
542 * Handles a swapped range (upper < lower).
544 static void reg_addrange(regex_t *preg, int lower, int upper)
546 if (lower > upper) {
547 reg_addrange(preg, upper, lower);
549 /* Add a range as length, start */
550 regc(preg, upper - lower + 1);
551 regc(preg, lower);
555 * Add a null-terminated literal string as a set of ranges.
557 static void reg_addrange_str(regex_t *preg, const char *str)
559 while (*str) {
560 reg_addrange(preg, *str, *str);
561 str++;
566 * Extracts the next unicode char from utf8.
568 * If 'upper' is set, converts the char to uppercase.
570 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
572 int l = utf8_tounicode(s, uc);
573 if (upper) {
574 *uc = utf8_upper(*uc);
576 return l;
580 * Converts a hex digit to decimal.
582 * Returns -1 for an invalid hex digit.
584 static int hexdigitval(int c)
586 if (c >= '0' && c <= '9')
587 return c - '0';
588 if (c >= 'a' && c <= 'f')
589 return c - 'a' + 10;
590 if (c >= 'A' && c <= 'F')
591 return c - 'A' + 10;
592 return -1;
596 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
598 * Returns the number of hex digits parsed.
599 * If there are no hex digits, returns 0 and stores nothing.
601 static int parse_hex(const char *s, int n, int *uc)
603 int val = 0;
604 int k;
606 for (k = 0; k < n; k++) {
607 int c = hexdigitval(*s++);
608 if (c == -1) {
609 break;
611 val = (val << 4) | c;
613 if (k) {
614 *uc = val;
616 return k;
620 * Call for chars after a backlash to decode the escape sequence.
622 * Stores the result in *ch.
624 * Returns the number of bytes consumed.
626 static int reg_decode_escape(const char *s, int *ch)
628 int n;
629 const char *s0 = s;
631 *ch = *s++;
633 switch (*ch) {
634 case 'b': *ch = '\b'; break;
635 case 'e': *ch = 27; break;
636 case 'f': *ch = '\f'; break;
637 case 'n': *ch = '\n'; break;
638 case 'r': *ch = '\r'; break;
639 case 't': *ch = '\t'; break;
640 case 'v': *ch = '\v'; break;
641 case 'u':
642 if (*s == '{') {
643 /* Expect \u{NNNN} */
644 n = parse_hex(s + 1, 6, ch);
645 if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
646 s += n + 2;
648 else {
649 /* Invalid, so just treat as an escaped 'u' */
650 *ch = 'u';
653 else if ((n = parse_hex(s, 4, ch)) > 0) {
654 s += n;
656 break;
657 case 'U':
658 if ((n = parse_hex(s, 8, ch)) > 0) {
659 s += n;
661 break;
662 case 'x':
663 if ((n = parse_hex(s, 2, ch)) > 0) {
664 s += n;
666 break;
667 case '\0':
668 s--;
669 *ch = '\\';
670 break;
672 return s - s0;
676 - regatom - the lowest level
678 * Optimization: gobbles an entire sequence of ordinary characters so that
679 * it can turn them into a single node, which is smaller to store and
680 * faster to run. Backslashed characters are exceptions, each becoming a
681 * separate node; the code is simpler that way and it's not worth fixing.
683 static int regatom(regex_t *preg, int *flagp)
685 int ret;
686 int flags;
687 int nocase = (preg->cflags & REG_ICASE);
689 int ch;
690 int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
692 *flagp = WORST; /* Tentatively. */
694 preg->regparse += n;
695 switch (ch) {
696 /* FIXME: these chars only have meaning at beg/end of pat? */
697 case '^':
698 ret = regnode(preg, BOL);
699 break;
700 case '$':
701 ret = regnode(preg, EOL);
702 break;
703 case '.':
704 ret = regnode(preg, ANY);
705 *flagp |= HASWIDTH|SIMPLE;
706 break;
707 case '[': {
708 const char *pattern = preg->regparse;
710 if (*pattern == '^') { /* Complement of range. */
711 ret = regnode(preg, ANYBUT);
712 pattern++;
713 } else
714 ret = regnode(preg, ANYOF);
716 /* Special case. If the first char is ']' or '-', it is part of the set */
717 if (*pattern == ']' || *pattern == '-') {
718 reg_addrange(preg, *pattern, *pattern);
719 pattern++;
722 while (*pattern && *pattern != ']') {
723 /* Is this a range? a-z */
724 int start;
725 int end;
727 enum {
728 CC_ALPHA, CC_ALNUM, CC_SPACE, CC_BLANK, CC_UPPER, CC_LOWER,
729 CC_DIGIT, CC_XDIGIT, CC_CNTRL, CC_GRAPH, CC_PRINT, CC_PUNCT,
730 CC_NUM
732 int cc;
734 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
735 if (start == '\\') {
736 /* First check for class shorthand escapes */
737 switch (*pattern) {
738 case 's':
739 pattern++;
740 cc = CC_SPACE;
741 goto cc_switch;
742 case 'd':
743 pattern++;
744 cc = CC_DIGIT;
745 goto cc_switch;
746 case 'w':
747 pattern++;
748 reg_addrange(preg, '_', '_');
749 cc = CC_ALNUM;
750 goto cc_switch;
752 pattern += reg_decode_escape(pattern, &start);
753 if (start == 0) {
754 preg->err = REG_ERR_NULL_CHAR;
755 return 0;
758 if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
759 /* skip '-' */
760 pattern += utf8_tounicode(pattern, &end);
761 pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
762 if (end == '\\') {
763 pattern += reg_decode_escape(pattern, &end);
764 if (end == 0) {
765 preg->err = REG_ERR_NULL_CHAR;
766 return 0;
770 reg_addrange(preg, start, end);
771 continue;
773 if (start == '[' && pattern[0] == ':') {
774 static const char *character_class[] = {
775 ":alpha:", ":alnum:", ":space:", ":blank:", ":upper:", ":lower:",
776 ":digit:", ":xdigit:", ":cntrl:", ":graph:", ":print:", ":punct:",
779 for (cc = 0; cc < CC_NUM; cc++) {
780 n = strlen(character_class[cc]);
781 if (strncmp(pattern, character_class[cc], n) == 0) {
782 /* Found a character class */
783 pattern += n + 1;
784 break;
787 if (cc != CC_NUM) {
788 cc_switch:
789 switch (cc) {
790 case CC_ALNUM:
791 reg_addrange(preg, '0', '9');
792 /* Fall through */
793 case CC_ALPHA:
794 if ((preg->cflags & REG_ICASE) == 0) {
795 reg_addrange(preg, 'a', 'z');
797 reg_addrange(preg, 'A', 'Z');
798 break;
799 case CC_SPACE:
800 reg_addrange_str(preg, " \t\r\n\f\v");
801 break;
802 case CC_BLANK:
803 reg_addrange_str(preg, " \t");
804 break;
805 case CC_UPPER:
806 reg_addrange(preg, 'A', 'Z');
807 break;
808 case CC_LOWER:
809 reg_addrange(preg, 'a', 'z');
810 break;
811 case CC_XDIGIT:
812 reg_addrange(preg, 'a', 'f');
813 reg_addrange(preg, 'A', 'F');
814 /* Fall through */
815 case CC_DIGIT:
816 reg_addrange(preg, '0', '9');
817 break;
818 case CC_CNTRL:
819 reg_addrange(preg, 0, 31);
820 reg_addrange(preg, 127, 127);
821 break;
822 case CC_PRINT:
823 reg_addrange(preg, ' ', '~');
824 break;
825 case CC_GRAPH:
826 reg_addrange(preg, '!', '~');
827 break;
828 case CC_PUNCT:
829 reg_addrange(preg, '!', '/');
830 reg_addrange(preg, ':', '@');
831 reg_addrange(preg, '[', '`');
832 reg_addrange(preg, '{', '~');
833 break;
835 continue;
838 /* Not a range, so just add the char */
839 reg_addrange(preg, start, start);
841 regc(preg, '\0');
843 if (*pattern) {
844 pattern++;
846 preg->regparse = pattern;
848 *flagp |= HASWIDTH|SIMPLE;
850 break;
851 case '(':
852 ret = reg(preg, 1, &flags);
853 if (ret == 0)
854 return 0;
855 *flagp |= flags&(HASWIDTH|SPSTART);
856 break;
857 case '\0':
858 case '|':
859 case ')':
860 preg->err = REG_ERR_INTERNAL;
861 return 0; /* Supposed to be caught earlier. */
862 case '?':
863 case '+':
864 case '*':
865 case '{':
866 preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
867 return 0;
868 case '\\':
869 ch = *preg->regparse++;
870 switch (ch) {
871 case '\0':
872 preg->err = REG_ERR_TRAILING_BACKSLASH;
873 return 0;
874 case 'A':
875 ret = regnode(preg, BOLX);
876 break;
877 case 'Z':
878 ret = regnode(preg, EOLX);
879 break;
880 case '<':
881 case 'm':
882 ret = regnode(preg, WORDA);
883 break;
884 case '>':
885 case 'M':
886 ret = regnode(preg, WORDZ);
887 break;
888 case 'd':
889 case 'D':
890 ret = regnode(preg, ch == 'd' ? ANYOF : ANYBUT);
891 reg_addrange(preg, '0', '9');
892 regc(preg, '\0');
893 *flagp |= HASWIDTH|SIMPLE;
894 break;
895 case 'w':
896 case 'W':
897 ret = regnode(preg, ch == 'w' ? ANYOF : ANYBUT);
898 if ((preg->cflags & REG_ICASE) == 0) {
899 reg_addrange(preg, 'a', 'z');
901 reg_addrange(preg, 'A', 'Z');
902 reg_addrange(preg, '0', '9');
903 reg_addrange(preg, '_', '_');
904 regc(preg, '\0');
905 *flagp |= HASWIDTH|SIMPLE;
906 break;
907 case 's':
908 case 'S':
909 ret = regnode(preg, ch == 's' ? ANYOF : ANYBUT);
910 reg_addrange_str(preg," \t\r\n\f\v");
911 regc(preg, '\0');
912 *flagp |= HASWIDTH|SIMPLE;
913 break;
914 /* FIXME: Someday handle \1, \2, ... */
915 default:
916 /* Handle general quoted chars in exact-match routine */
917 /* Back up to include the backslash */
918 preg->regparse--;
919 goto de_fault;
921 break;
922 de_fault:
923 default: {
925 * Encode a string of characters to be matched exactly.
927 int added = 0;
929 /* Back up to pick up the first char of interest */
930 preg->regparse -= n;
932 ret = regnode(preg, EXACTLY);
934 /* Note that a META operator such as ? or * consumes the
935 * preceding char.
936 * Thus we must be careful to look ahead by 2 and add the
937 * last char as it's own EXACTLY if necessary
940 /* Until end of string or a META char is reached */
941 while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
942 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
943 if (ch == '\\' && preg->regparse[n]) {
944 /* Non-trailing backslash.
945 * Is this a special escape, or a regular escape?
947 if (strchr("<>mMwWdDsSAZ", preg->regparse[n])) {
948 /* A special escape. All done with EXACTLY */
949 break;
951 /* Decode it. Note that we add the length for the escape
952 * sequence to the length for the backlash so we can skip
953 * the entire sequence, or not as required.
955 n += reg_decode_escape(preg->regparse + n, &ch);
956 if (ch == 0) {
957 preg->err = REG_ERR_NULL_CHAR;
958 return 0;
962 /* Now we have one char 'ch' of length 'n'.
963 * Check to see if the following char is a MULT
966 if (ISMULT(preg->regparse[n])) {
967 /* Yes. But do we already have some EXACTLY chars? */
968 if (added) {
969 /* Yes, so return what we have and pick up the current char next time around */
970 break;
972 /* No, so add this single char and finish */
973 regc(preg, ch);
974 added++;
975 preg->regparse += n;
976 break;
979 /* No, so just add this char normally */
980 regc(preg, ch);
981 added++;
982 preg->regparse += n;
984 regc(preg, '\0');
986 *flagp |= HASWIDTH;
987 if (added == 1)
988 *flagp |= SIMPLE;
989 break;
991 break;
994 return(ret);
997 static void reg_grow(regex_t *preg, int n)
999 if (preg->p + n >= preg->proglen) {
1000 preg->proglen = (preg->p + n) * 2;
1001 preg->program = realloc(preg->program, preg->proglen * sizeof(int));
1006 - regnode - emit a node
1008 /* Location. */
1009 static int regnode(regex_t *preg, int op)
1011 reg_grow(preg, 2);
1013 /* The OP followed by a next pointer */
1014 preg->program[preg->p++] = op;
1015 preg->program[preg->p++] = 0;
1017 /* Return the start of the node */
1018 return preg->p - 2;
1022 - regc - emit (if appropriate) a byte of code
1024 static void regc(regex_t *preg, int b )
1026 reg_grow(preg, 1);
1027 preg->program[preg->p++] = b;
1031 - reginsert - insert an operator in front of already-emitted operand
1033 * Means relocating the operand.
1034 * Returns the new location of the original operand.
1036 static int reginsert(regex_t *preg, int op, int size, int opnd )
1038 reg_grow(preg, size);
1040 /* Move everything from opnd up */
1041 memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd));
1042 /* Zero out the new space */
1043 memset(preg->program + opnd, 0, sizeof(int) * size);
1045 preg->program[opnd] = op;
1047 preg->p += size;
1049 return opnd + size;
1053 - regtail - set the next-pointer at the end of a node chain
1055 static void regtail(regex_t *preg, int p, int val)
1057 int scan;
1058 int temp;
1059 int offset;
1061 /* Find last node. */
1062 scan = p;
1063 for (;;) {
1064 temp = regnext(preg, scan);
1065 if (temp == 0)
1066 break;
1067 scan = temp;
1070 if (OP(preg, scan) == BACK)
1071 offset = scan - val;
1072 else
1073 offset = val - scan;
1075 preg->program[scan + 1] = offset;
1079 - regoptail - regtail on operand of first argument; nop if operandless
1082 static void regoptail(regex_t *preg, int p, int val )
1084 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1085 if (p != 0 && OP(preg, p) == BRANCH) {
1086 regtail(preg, OPERAND(p), val);
1091 * regexec and friends
1095 * Forwards.
1097 static int regtry(regex_t *preg, const char *string );
1098 static int regmatch(regex_t *preg, int prog);
1099 static int regrepeat(regex_t *preg, int p, int max);
1102 - regexec - match a regexp against a string
1104 int regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1106 const char *s;
1107 int scan;
1109 /* Be paranoid... */
1110 if (preg == NULL || preg->program == NULL || string == NULL) {
1111 return REG_ERR_NULL_ARGUMENT;
1114 /* Check validity of program. */
1115 if (*preg->program != REG_MAGIC) {
1116 return REG_ERR_CORRUPTED;
1119 #ifdef DEBUG
1120 fprintf(stderr, "regexec: %s\n", string);
1121 regdump(preg);
1122 #endif
1124 preg->eflags = eflags;
1125 preg->pmatch = pmatch;
1126 preg->nmatch = nmatch;
1127 preg->start = string; /* All offsets are computed from here */
1129 /* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */
1130 for (scan = OPERAND(1); scan != 0; scan += regopsize(preg, scan)) {
1131 int op = OP(preg, scan);
1132 if (op == END)
1133 break;
1134 if (op == REPX || op == REPXMIN)
1135 preg->program[scan + 4] = 0;
1138 /* If there is a "must appear" string, look for it. */
1139 if (preg->regmust != 0) {
1140 s = string;
1141 while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
1142 if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1143 break;
1145 s++;
1147 if (s == NULL) /* Not present. */
1148 return REG_NOMATCH;
1151 /* Mark beginning of line for ^ . */
1152 preg->regbol = string;
1154 /* Simplest case: anchored match need be tried only once (maybe per line). */
1155 if (preg->reganch) {
1156 if (eflags & REG_NOTBOL) {
1157 /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1158 goto nextline;
1160 while (1) {
1161 if (regtry(preg, string)) {
1162 return REG_NOERROR;
1164 if (*string) {
1165 nextline:
1166 if (preg->cflags & REG_NEWLINE) {
1167 /* Try the next anchor? */
1168 string = strchr(string, '\n');
1169 if (string) {
1170 preg->regbol = ++string;
1171 continue;
1175 return REG_NOMATCH;
1179 /* Messy cases: unanchored match. */
1180 s = string;
1181 if (preg->regstart != '\0') {
1182 /* We know what char it must start with. */
1183 while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1184 if (regtry(preg, s))
1185 return REG_NOERROR;
1186 s++;
1189 else
1190 /* We don't -- general case. */
1191 while (1) {
1192 if (regtry(preg, s))
1193 return REG_NOERROR;
1194 if (*s == '\0') {
1195 break;
1197 else {
1198 int c;
1199 s += utf8_tounicode(s, &c);
1203 /* Failure. */
1204 return REG_NOMATCH;
1208 - regtry - try match at specific point
1210 /* 0 failure, 1 success */
1211 static int regtry( regex_t *preg, const char *string )
1213 int i;
1215 preg->reginput = string;
1217 for (i = 0; i < preg->nmatch; i++) {
1218 preg->pmatch[i].rm_so = -1;
1219 preg->pmatch[i].rm_eo = -1;
1221 if (regmatch(preg, 1)) {
1222 preg->pmatch[0].rm_so = string - preg->start;
1223 preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1224 return(1);
1225 } else
1226 return(0);
1230 * Returns bytes matched if 'pattern' is a prefix of 'string'.
1232 * If 'nocase' is non-zero, does a case-insensitive match.
1234 * Returns -1 on not found.
1236 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1238 const char *s = string;
1239 while (proglen && *s) {
1240 int ch;
1241 int n = reg_utf8_tounicode_case(s, &ch, nocase);
1242 if (ch != *prog) {
1243 return -1;
1245 prog++;
1246 s += n;
1247 proglen--;
1249 if (proglen == 0) {
1250 return s - string;
1252 return -1;
1256 * Searchs for 'c' in the range 'range'.
1258 * Returns 1 if found, or 0 if not.
1260 static int reg_range_find(const int *range, int c)
1262 while (*range) {
1263 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1264 if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1265 return 1;
1267 range += 2;
1269 return 0;
1273 * Search for the character 'c' in the utf-8 string 'string'.
1275 * If 'nocase' is set, the 'string' is assumed to be uppercase
1276 * and 'c' is converted to uppercase before matching.
1278 * Returns the byte position in the string where the 'c' was found, or
1279 * NULL if not found.
1281 static const char *str_find(const char *string, int c, int nocase)
1283 if (nocase) {
1284 /* The "string" should already be converted to uppercase */
1285 c = utf8_upper(c);
1287 while (*string) {
1288 int ch;
1289 int n = reg_utf8_tounicode_case(string, &ch, nocase);
1290 if (c == ch) {
1291 return string;
1293 string += n;
1295 return NULL;
1299 * Returns true if 'ch' is an end-of-line char.
1301 * In REG_NEWLINE mode, \n is considered EOL in
1302 * addition to \0
1304 static int reg_iseol(regex_t *preg, int ch)
1306 if (preg->cflags & REG_NEWLINE) {
1307 return ch == '\0' || ch == '\n';
1309 else {
1310 return ch == '\0';
1314 static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
1316 int nextch = '\0';
1317 const char *save;
1318 int no;
1319 int c;
1321 int max = preg->program[scan + 2];
1322 int min = preg->program[scan + 3];
1323 int next = regnext(preg, scan);
1326 * Lookahead to avoid useless match attempts
1327 * when we know what character comes next.
1329 if (OP(preg, next) == EXACTLY) {
1330 nextch = preg->program[OPERAND(next)];
1332 save = preg->reginput;
1333 no = regrepeat(preg, scan + 5, max);
1334 if (no < min) {
1335 return 0;
1337 if (matchmin) {
1338 /* from min up to no */
1339 max = no;
1340 no = min;
1342 /* else from no down to min */
1343 while (1) {
1344 if (matchmin) {
1345 if (no > max) {
1346 break;
1349 else {
1350 if (no < min) {
1351 break;
1354 preg->reginput = save + utf8_index(save, no);
1355 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1356 /* If it could work, try it. */
1357 if (reg_iseol(preg, nextch) || c == nextch) {
1358 if (regmatch(preg, next)) {
1359 return(1);
1362 if (matchmin) {
1363 /* Couldn't or didn't, add one more */
1364 no++;
1366 else {
1367 /* Couldn't or didn't -- back up. */
1368 no--;
1371 return(0);
1374 static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
1376 int *scanpt = preg->program + scan;
1378 int max = scanpt[2];
1379 int min = scanpt[3];
1381 /* Have we reached min? */
1382 if (scanpt[4] < min) {
1383 /* No, so get another one */
1384 scanpt[4]++;
1385 if (regmatch(preg, scan + 5)) {
1386 return 1;
1388 scanpt[4]--;
1389 return 0;
1391 if (scanpt[4] > max) {
1392 return 0;
1395 if (matchmin) {
1396 /* minimal, so try other branch first */
1397 if (regmatch(preg, regnext(preg, scan))) {
1398 return 1;
1400 /* No, so try one more */
1401 scanpt[4]++;
1402 if (regmatch(preg, scan + 5)) {
1403 return 1;
1405 scanpt[4]--;
1406 return 0;
1408 /* maximal, so try this branch again */
1409 if (scanpt[4] < max) {
1410 scanpt[4]++;
1411 if (regmatch(preg, scan + 5)) {
1412 return 1;
1414 scanpt[4]--;
1416 /* At this point we are at max with no match. Try the other branch */
1417 return regmatch(preg, regnext(preg, scan));
1421 - regmatch - main matching routine
1423 * Conceptually the strategy is simple: check to see whether the current
1424 * node matches, call self recursively to see whether the rest matches,
1425 * and then act accordingly. In practice we make some effort to avoid
1426 * recursion, in particular by going through "ordinary" nodes (that don't
1427 * need to know whether the rest of the match failed) by a loop instead of
1428 * by recursion.
1430 /* 0 failure, 1 success */
1431 static int regmatch(regex_t *preg, int prog)
1433 int scan; /* Current node. */
1434 int next; /* Next node. */
1435 const char *save;
1437 scan = prog;
1439 #ifdef DEBUG
1440 if (scan != 0 && regnarrate)
1441 fprintf(stderr, "%s(\n", regprop(scan));
1442 #endif
1443 while (scan != 0) {
1444 int n;
1445 int c;
1446 #ifdef DEBUG
1447 if (regnarrate) {
1448 fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */
1450 #endif
1451 next = regnext(preg, scan);
1452 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1454 switch (OP(preg, scan)) {
1455 case BOLX:
1456 if ((preg->eflags & REG_NOTBOL)) {
1457 return(0);
1459 /* Fall through */
1460 case BOL:
1461 if (preg->reginput != preg->regbol) {
1462 return(0);
1464 break;
1465 case EOLX:
1466 if (c != 0) {
1467 /* For EOLX, only match real end of line, not newline */
1468 return 0;
1470 break;
1471 case EOL:
1472 if (!reg_iseol(preg, c)) {
1473 return(0);
1475 break;
1476 case WORDA:
1477 /* Must be looking at a letter, digit, or _ */
1478 if ((!isalnum(UCHAR(c))) && c != '_')
1479 return(0);
1480 /* Prev must be BOL or nonword */
1481 if (preg->reginput > preg->regbol &&
1482 (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1483 return(0);
1484 break;
1485 case WORDZ:
1486 /* Can't match at BOL */
1487 if (preg->reginput > preg->regbol) {
1488 /* Current must be EOL or nonword */
1489 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1490 c = preg->reginput[-1];
1491 /* Previous must be word */
1492 if (isalnum(UCHAR(c)) || c == '_') {
1493 break;
1497 /* No */
1498 return(0);
1500 case ANY:
1501 if (reg_iseol(preg, c))
1502 return 0;
1503 preg->reginput += n;
1504 break;
1505 case EXACTLY: {
1506 int opnd;
1507 int len;
1508 int slen;
1510 opnd = OPERAND(scan);
1511 len = str_int_len(preg->program + opnd);
1513 slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1514 if (slen < 0) {
1515 return(0);
1517 preg->reginput += slen;
1519 break;
1520 case ANYOF:
1521 if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
1522 return(0);
1524 preg->reginput += n;
1525 break;
1526 case ANYBUT:
1527 if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
1528 return(0);
1530 preg->reginput += n;
1531 break;
1532 case NOTHING:
1533 break;
1534 case BACK:
1535 break;
1536 case BRANCH:
1537 if (OP(preg, next) != BRANCH) /* No choice. */
1538 next = OPERAND(scan); /* Avoid recursion. */
1539 else {
1540 do {
1541 save = preg->reginput;
1542 if (regmatch(preg, OPERAND(scan))) {
1543 return(1);
1545 preg->reginput = save;
1546 scan = regnext(preg, scan);
1547 } while (scan != 0 && OP(preg, scan) == BRANCH);
1548 return(0);
1549 /* NOTREACHED */
1551 break;
1552 case REP:
1553 case REPMIN:
1554 return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1556 case REPX:
1557 case REPXMIN:
1558 return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1560 case END:
1561 return 1; /* Success! */
1563 case OPENNC:
1564 case CLOSENC:
1565 return regmatch(preg, next);
1567 default:
1568 if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
1569 save = preg->reginput;
1570 if (regmatch(preg, next)) {
1571 if (OP(preg, scan) < CLOSE) {
1572 int no = OP(preg, scan) - OPEN;
1573 if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
1574 preg->pmatch[no].rm_so = save - preg->start;
1577 else {
1578 int no = OP(preg, scan) - CLOSE;
1579 if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
1580 preg->pmatch[no].rm_eo = save - preg->start;
1583 return(1);
1585 return(0);
1587 return REG_ERR_INTERNAL;
1590 scan = next;
1594 * We get here only if there's trouble -- normally "case END" is
1595 * the terminating point.
1597 return REG_ERR_INTERNAL;
1601 - regrepeat - repeatedly match something simple, report how many
1603 static int regrepeat(regex_t *preg, int p, int max)
1605 int count = 0;
1606 const char *scan;
1607 int opnd;
1608 int ch;
1609 int n;
1611 scan = preg->reginput;
1612 opnd = OPERAND(p);
1613 switch (OP(preg, p)) {
1614 case ANY:
1615 /* No need to handle utf8 specially here */
1616 while (!reg_iseol(preg, *scan) && count < max) {
1617 count++;
1618 scan++;
1620 break;
1621 case EXACTLY:
1622 while (count < max) {
1623 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1624 if (preg->program[opnd] != ch) {
1625 break;
1627 count++;
1628 scan += n;
1630 break;
1631 case ANYOF:
1632 while (count < max) {
1633 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1634 if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) {
1635 break;
1637 count++;
1638 scan += n;
1640 break;
1641 case ANYBUT:
1642 while (count < max) {
1643 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1644 if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) {
1645 break;
1647 count++;
1648 scan += n;
1650 break;
1651 default: /* Oh dear. Called inappropriately. */
1652 preg->err = REG_ERR_INTERNAL;
1653 count = 0; /* Best compromise. */
1654 break;
1656 preg->reginput = scan;
1658 return(count);
1662 - regnext - dig the "next" pointer out of a node
1664 static int regnext(regex_t *preg, int p )
1666 int offset;
1668 offset = NEXT(preg, p);
1670 if (offset == 0)
1671 return 0;
1673 if (OP(preg, p) == BACK)
1674 return(p-offset);
1675 else
1676 return(p+offset);
1680 - regopsize - returns the size of opcode + operands at 'p' in words
1682 static int regopsize(regex_t *preg, int p )
1684 /* Almost all opcodes are 2 words, but some are more */
1685 switch (OP(preg, p)) {
1686 case REP:
1687 case REPMIN:
1688 case REPX:
1689 case REPXMIN:
1690 return 5;
1692 case ANYOF:
1693 case ANYBUT:
1694 case EXACTLY: {
1695 int s = p + 2;
1696 while (preg->program[s++]) {
1698 return s - p;
1701 return 2;
1704 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1707 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1709 static void regdump(regex_t *preg)
1711 int s;
1712 int op = EXACTLY; /* Arbitrary non-END op. */
1713 int next;
1714 char buf[MAX_UTF8_LEN + 1];
1716 int i;
1717 for (i = 1; i < preg->p; i++) {
1718 printf("%02x ", (unsigned char)preg->program[i]);
1719 if (i % 16 == 0) {
1720 printf("\n");
1723 printf("\n");
1725 s = 1;
1726 while (op != END && s < preg->p) { /* While that wasn't END last time... */
1727 op = OP(preg, s);
1728 printf("%3d: %s", s, regprop(op)); /* Where, what. */
1729 next = regnext(preg, s);
1730 if (next == 0) /* Next ptr. */
1731 printf("(0)");
1732 else
1733 printf("(%d)", next);
1734 s += 2;
1735 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1736 int max = preg->program[s];
1737 int min = preg->program[s + 1];
1738 if (max == 65535) {
1739 printf("{%d,*}", min);
1741 else {
1742 printf("{%d,%d}", min, max);
1744 printf(" %d", preg->program[s + 2]);
1745 s += 3;
1747 else if (op == ANYOF || op == ANYBUT) {
1748 /* set of ranges */
1750 while (preg->program[s]) {
1751 int len = preg->program[s++];
1752 int first = preg->program[s++];
1753 buf[utf8_getchars(buf, first)] = 0;
1754 printf("%s", buf);
1755 if (len > 1) {
1756 buf[utf8_getchars(buf, first + len - 1)] = 0;
1757 printf("-%s", buf);
1760 s++;
1762 else if (op == EXACTLY) {
1763 /* Literal string, where present. */
1765 while (preg->program[s]) {
1766 buf[utf8_getchars(buf, preg->program[s])] = 0;
1767 printf("%s", buf);
1768 s++;
1770 s++;
1772 putchar('\n');
1775 if (op == END) {
1776 /* Header fields of interest. */
1777 if (preg->regstart) {
1778 buf[utf8_getchars(buf, preg->regstart)] = 0;
1779 printf("start '%s' ", buf);
1781 if (preg->reganch)
1782 printf("anchored ");
1783 if (preg->regmust != 0) {
1784 int i;
1785 printf("must have:");
1786 for (i = 0; i < preg->regmlen; i++) {
1787 putchar(preg->program[preg->regmust + i]);
1789 putchar('\n');
1792 printf("\n");
1796 - regprop - printable representation of opcode
1798 static const char *regprop( int op )
1800 static char buf[50];
1802 switch (op) {
1803 case BOL:
1804 return "BOL";
1805 case EOL:
1806 return "EOL";
1807 case BOLX:
1808 return "BOLX";
1809 case EOLX:
1810 return "EOLX";
1811 case ANY:
1812 return "ANY";
1813 case ANYOF:
1814 return "ANYOF";
1815 case ANYBUT:
1816 return "ANYBUT";
1817 case BRANCH:
1818 return "BRANCH";
1819 case EXACTLY:
1820 return "EXACTLY";
1821 case NOTHING:
1822 return "NOTHING";
1823 case BACK:
1824 return "BACK";
1825 case END:
1826 return "END";
1827 case REP:
1828 return "REP";
1829 case REPMIN:
1830 return "REPMIN";
1831 case REPX:
1832 return "REPX";
1833 case REPXMIN:
1834 return "REPXMIN";
1835 case WORDA:
1836 return "WORDA";
1837 case WORDZ:
1838 return "WORDZ";
1839 case OPENNC:
1840 return "OPEN";
1841 case CLOSENC:
1842 return "CLOSE";
1843 default:
1844 if (op >= OPEN && op < CLOSE) {
1845 snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1847 else if (op >= CLOSE && op < CLOSE_END) {
1848 snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1850 else {
1851 snprintf(buf, sizeof(buf), "?%d?\n", op);
1853 return(buf);
1856 #endif /* JIM_BOOTSTRAP */
1858 size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
1860 static const char *error_strings[] = {
1861 "success",
1862 "no match",
1863 "bad pattern",
1864 "null argument",
1865 "unknown error",
1866 "too big",
1867 "out of memory",
1868 "too many ()",
1869 "parentheses () not balanced",
1870 "braces {} not balanced",
1871 "invalid repetition count(s)",
1872 "extra characters",
1873 "*+ of empty atom",
1874 "nested count",
1875 "internal error",
1876 "count follows nothing",
1877 "trailing backslash",
1878 "corrupted program",
1879 "contains null char",
1881 const char *err;
1883 if (errcode < 0 || errcode >= REG_ERR_NUM) {
1884 err = "Bad error code";
1886 else {
1887 err = error_strings[errcode];
1890 return snprintf(errbuf, errbuf_size, "%s", err);
1893 void regfree(regex_t *preg)
1895 free(preg->program);
1898 #endif