Update HTML doc template.
[tmk.git] / jimtcl / jimregexp.c
blobf1b7cd381e346b7551a2ad85d1aa618617ddf8b4
1 /*
2 * vi:se ts=8:
4 * regcomp and regexec -- regsub and regerror are elsewhere
6 * Copyright (c) 1986 by University of Toronto.
7 * Written by Henry Spencer. Not derived from licensed software.
9 * Permission is granted to anyone to use this software for any
10 * purpose on any computer system, and to redistribute it freely,
11 * subject to the following restrictions:
13 * 1. The author is not responsible for the consequences of use of
14 * this software, no matter how awful, even if they arise
15 * from defects in it.
17 * 2. The origin of this software must not be misrepresented, either
18 * by explicit claim or by omission.
20 * 3. Altered versions must be plainly marked as such, and must not
21 * be misrepresented as being the original software.
22 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
23 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
24 *** to assist in implementing egrep.
25 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
26 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
27 *** as in BSD grep and ex.
28 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
29 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
30 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
31 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
32 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
33 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
34 *** was moved into regexp.h, and the include of regexp.h now uses "'s
35 *** to avoid conflicting with the system regexp.h. Const, bless its
36 *** soul, was removed so it can compile everywhere. The declaration
37 *** of strchr() was in conflict on AIX, so it was removed (as it is
38 *** happily defined in string.h).
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
41 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
42 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
44 * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
45 * on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
46 * This includes counted repetitions, UTF-8 support, character classes,
47 * shorthand character classes, increased number of parentheses to 100,
48 * backslash escape sequences. It also removes \n as an alternative to |.
50 * Beware that some of this code is subtly aware of the way operator
51 * precedence is structured in regular expressions. Serious changes in
52 * regular-expression syntax might require a total rethink.
55 #include "jimautoconf.h"
57 #if defined(JIM_REGEXP)
58 #include <stdio.h>
59 #include <ctype.h>
60 #include <stdlib.h>
61 #include <string.h>
63 #include "jim.h"
64 #include "jimregexp.h"
65 #include "utf8.h"
67 /* An arbitrary limit, but this seems enough. Must be less than 1000. */
68 #define REG_MAX_PAREN 100
71 * Structure for regexp "program". This is essentially a linear encoding
72 * of a nondeterministic finite-state machine (aka syntax charts or
73 * "railroad normal form" in parsing technology). Each node is an opcode
74 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
75 * all nodes except BRANCH implement concatenation; a "next" pointer with
76 * a BRANCH on both ends of it is connecting two alternatives. (Here we
77 * have one of the subtle syntax dependencies: an individual BRANCH (as
78 * opposed to a collection of them) is never concatenated with anything
79 * because of operator precedence.) The operand of some types of node is
80 * a literal string; for others, it is a node leading into a sub-FSM. In
81 * particular, the operand of a BRANCH node is the first node of the branch.
82 * (NB this is *not* a tree structure: the tail of the branch connects
83 * to the thing following the set of BRANCHes.) The opcodes are:
86 /* definition number opnd? meaning */
87 #define END 0 /* no End of program. */
88 #define BOL 1 /* no Match "" at beginning of line. */
89 #define EOL 2 /* no Match "" at end of line. */
90 #define ANY 3 /* no Match any one character. */
91 #define ANYOF 4 /* str Match any character in this string. */
92 #define ANYBUT 5 /* str Match any character not in this string. */
93 #define BRANCH 6 /* node Match this alternative, or the next... */
94 #define BACK 7 /* no Match "", "next" ptr points backward. */
95 #define EXACTLY 8 /* str Match this string. */
96 #define NOTHING 9 /* no Match empty string. */
97 #define REP 10 /* max,min Match this (simple) thing [min,max] times. */
98 #define REPMIN 11 /* max,min Match this (simple) thing [min,max] times, minimal match. */
99 #define REPX 12 /* max,min Match this (complex) thing [min,max] times. */
100 #define REPXMIN 13 /* max,min Match this (complex) thing [min,max] times, minimal match. */
102 #define WORDA 15 /* no Match "" at wordchar, where prev is nonword */
103 #define WORDZ 16 /* no Match "" at nonwordchar, where prev is word */
105 #define OPENNC 1000 /* no Non-capturing parentheses - must be OPEN-1 */
106 #define OPEN 1001 /* no Mark this point in input as start of #n. */
107 /* OPEN+1 is number 1, etc. */
109 /* must not be any other opts between OPEN and CLOSE */
111 #define CLOSENC 2000 /* no Non-capturing parentheses - must be CLOSE-1 */
112 #define CLOSE 2001 /* no Analogous to OPEN. */
113 #define CLOSE_END (CLOSE+REG_MAX_PAREN)
116 * The first word of the regexp internal "program" is actually this magic
117 * number; the start node begins in the second word.
119 #define REG_MAGIC 0xFADED00D
122 * Opcode notes:
124 * BRANCH The set of branches constituting a single choice are hooked
125 * together with their "next" pointers, since precedence prevents
126 * anything being concatenated to any individual branch. The
127 * "next" pointer of the last BRANCH in a choice points to the
128 * thing following the whole choice. This is also where the
129 * final "next" pointer of each individual branch points; each
130 * branch starts with the operand node of a BRANCH node.
132 * BACK Normal "next" pointers all implicitly point forward; BACK
133 * exists to make loop structures possible.
135 * REP,REPX Repeated matches ('?', '*', '+' and {min,max}) are implemented
136 * as either simple repeats (REP) or complex repeats (REPX).
137 * These opcodes include a "min" and "max" count after the opcode.
138 * This is followed by a fourth "current count" word that is
139 * only used by REPX, as it implements a recursive match.
140 * REPMIN and REPXMIN are identical except they implement minimal repeats.
142 * OPEN,CLOSE ...are numbered at compile time.
146 * A node is one word of opcode followed by one word of "next" pointer.
147 * The "next" pointer value is a positive offset from the opcode of the node
148 * containing it.
149 * An operand, if any, simply follows the node. (Note that much of the
150 * code generation knows about this implicit relationship.)
152 #define OP(preg, p) (preg->program[p])
153 #define NEXT(preg, p) (preg->program[p + 1])
154 #define OPERAND(p) ((p) + 2)
157 * See regmagic.h for one further detail of program structure.
162 * Utility definitions.
165 #define FAIL(R,M) { (R)->err = (M); return (M); }
166 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
167 #define META "^$.[()|?{+*"
170 * Flags to be passed up and down.
172 #define HASWIDTH 1 /* Known never to match null string. */
173 #define SIMPLE 2 /* Simple enough to be STAR/PLUS operand. */
174 #define SPSTART 4 /* Starts with * or +. */
175 #define WORST 0 /* Worst case. */
177 #define MAX_REP_COUNT 1000000
180 * Forward declarations for regcomp()'s friends.
182 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
183 static int regpiece(regex_t *preg, int *flagp );
184 static int regbranch(regex_t *preg, int *flagp );
185 static int regatom(regex_t *preg, int *flagp );
186 static int regnode(regex_t *preg, int op );
187 static int regnext(regex_t *preg, int p );
188 static void regc(regex_t *preg, int b );
189 static int reginsert(regex_t *preg, int op, int size, int opnd );
190 static void regtail(regex_t *preg, int p, int val);
191 static void regoptail(regex_t *preg, int p, int val );
192 static int regopsize(regex_t *preg, int p );
194 static int reg_range_find(const int *string, int c);
195 static const char *str_find(const char *string, int c, int nocase);
196 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
198 /*#define DEBUG*/
199 #ifdef DEBUG
200 static int regnarrate = 0;
201 static void regdump(regex_t *preg);
202 static const char *regprop( int op );
203 #endif
207 * Returns the length of the null-terminated integer sequence.
209 static int str_int_len(const int *seq)
211 int n = 0;
212 while (*seq++) {
213 n++;
215 return n;
219 - regcomp - compile a regular expression into internal code
221 * We can't allocate space until we know how big the compiled form will be,
222 * but we can't compile it (and thus know how big it is) until we've got a
223 * place to put the code. So we cheat: we compile it twice, once with code
224 * generation turned off and size counting turned on, and once "for real".
225 * This also means that we don't allocate space until we are sure that the
226 * thing really will compile successfully, and we never have to move the
227 * code and thus invalidate pointers into it. (Note that it has to be in
228 * one piece because free() must be able to free it all.)
230 * Beware that the optimization-preparation code in here knows about some
231 * of the structure of the compiled regexp.
233 int regcomp(regex_t *preg, const char *exp, int cflags)
235 int scan;
236 int longest;
237 unsigned len;
238 int flags;
240 #ifdef DEBUG
241 fprintf(stderr, "Compiling: '%s'\n", exp);
242 #endif
243 memset(preg, 0, sizeof(*preg));
245 if (exp == NULL)
246 FAIL(preg, REG_ERR_NULL_ARGUMENT);
248 /* First pass: determine size, legality. */
249 preg->cflags = cflags;
250 preg->regparse = exp;
252 /* Allocate space. */
253 preg->proglen = (strlen(exp) + 1) * 5;
254 preg->program = malloc(preg->proglen * sizeof(int));
255 if (preg->program == NULL)
256 FAIL(preg, REG_ERR_NOMEM);
258 /* Note that since we store a magic value as the first item in the program,
259 * program offsets will never be 0
261 regc(preg, REG_MAGIC);
262 if (reg(preg, 0, &flags) == 0) {
263 return preg->err;
266 /* Small enough for pointer-storage convention? */
267 if (preg->re_nsub >= REG_MAX_PAREN) /* Probably could be 65535L. */
268 FAIL(preg,REG_ERR_TOO_BIG);
270 /* Dig out information for optimizations. */
271 preg->regstart = 0; /* Worst-case defaults. */
272 preg->reganch = 0;
273 preg->regmust = 0;
274 preg->regmlen = 0;
275 scan = 1; /* First BRANCH. */
276 if (OP(preg, regnext(preg, scan)) == END) { /* Only one top-level choice. */
277 scan = OPERAND(scan);
279 /* Starting-point info. */
280 if (OP(preg, scan) == EXACTLY) {
281 preg->regstart = preg->program[OPERAND(scan)];
283 else if (OP(preg, scan) == BOL)
284 preg->reganch++;
287 * If there's something expensive in the r.e., find the
288 * longest literal string that must appear and make it the
289 * regmust. Resolve ties in favor of later strings, since
290 * the regstart check works with the beginning of the r.e.
291 * and avoiding duplication strengthens checking. Not a
292 * strong reason, but sufficient in the absence of others.
294 if (flags&SPSTART) {
295 longest = 0;
296 len = 0;
297 for (; scan != 0; scan = regnext(preg, scan)) {
298 if (OP(preg, scan) == EXACTLY) {
299 int plen = str_int_len(preg->program + OPERAND(scan));
300 if (plen >= len) {
301 longest = OPERAND(scan);
302 len = plen;
306 preg->regmust = longest;
307 preg->regmlen = len;
311 #ifdef DEBUG
312 regdump(preg);
313 #endif
315 return 0;
319 - reg - regular expression, i.e. main body or parenthesized thing
321 * Caller must absorb opening parenthesis.
323 * Combining parenthesis handling with the base level of regular expression
324 * is a trifle forced, but the need to tie the tails of the branches to what
325 * follows makes it hard to avoid.
327 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
329 int ret;
330 int br;
331 int ender;
332 int parno = 0;
333 int flags;
335 *flagp = HASWIDTH; /* Tentatively. */
337 /* Make an OPEN node, if parenthesized. */
338 if (paren) {
339 if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
340 /* non-capturing paren */
341 preg->regparse += 2;
342 parno = -1;
344 else {
345 parno = ++preg->re_nsub;
347 ret = regnode(preg, OPEN+parno);
348 } else
349 ret = 0;
351 /* Pick up the branches, linking them together. */
352 br = regbranch(preg, &flags);
353 if (br == 0)
354 return 0;
355 if (ret != 0)
356 regtail(preg, ret, br); /* OPEN -> first. */
357 else
358 ret = br;
359 if (!(flags&HASWIDTH))
360 *flagp &= ~HASWIDTH;
361 *flagp |= flags&SPSTART;
362 while (*preg->regparse == '|') {
363 preg->regparse++;
364 br = regbranch(preg, &flags);
365 if (br == 0)
366 return 0;
367 regtail(preg, ret, br); /* BRANCH -> BRANCH. */
368 if (!(flags&HASWIDTH))
369 *flagp &= ~HASWIDTH;
370 *flagp |= flags&SPSTART;
373 /* Make a closing node, and hook it on the end. */
374 ender = regnode(preg, (paren) ? CLOSE+parno : END);
375 regtail(preg, ret, ender);
377 /* Hook the tails of the branches to the closing node. */
378 for (br = ret; br != 0; br = regnext(preg, br))
379 regoptail(preg, br, ender);
381 /* Check for proper termination. */
382 if (paren && *preg->regparse++ != ')') {
383 preg->err = REG_ERR_UNMATCHED_PAREN;
384 return 0;
385 } else if (!paren && *preg->regparse != '\0') {
386 if (*preg->regparse == ')') {
387 preg->err = REG_ERR_UNMATCHED_PAREN;
388 return 0;
389 } else {
390 preg->err = REG_ERR_JUNK_ON_END;
391 return 0;
395 return(ret);
399 - regbranch - one alternative of an | operator
401 * Implements the concatenation operator.
403 static int regbranch(regex_t *preg, int *flagp )
405 int ret;
406 int chain;
407 int latest;
408 int flags;
410 *flagp = WORST; /* Tentatively. */
412 ret = regnode(preg, BRANCH);
413 chain = 0;
414 while (*preg->regparse != '\0' && *preg->regparse != ')' &&
415 *preg->regparse != '|') {
416 latest = regpiece(preg, &flags);
417 if (latest == 0)
418 return 0;
419 *flagp |= flags&HASWIDTH;
420 if (chain == 0) {/* First piece. */
421 *flagp |= flags&SPSTART;
423 else {
424 regtail(preg, chain, latest);
426 chain = latest;
428 if (chain == 0) /* Loop ran zero times. */
429 (void) regnode(preg, NOTHING);
431 return(ret);
435 - regpiece - something followed by possible [*+?]
437 * Note that the branching code sequences used for ? and the general cases
438 * of * and + are somewhat optimized: they use the same NOTHING node as
439 * both the endmarker for their branch list and the body of the last branch.
440 * It might seem that this node could be dispensed with entirely, but the
441 * endmarker role is not redundant.
443 static int regpiece(regex_t *preg, int *flagp)
445 int ret;
446 char op;
447 int next;
448 int flags;
449 int min;
450 int max;
452 ret = regatom(preg, &flags);
453 if (ret == 0)
454 return 0;
456 op = *preg->regparse;
457 if (!ISMULT(op)) {
458 *flagp = flags;
459 return(ret);
462 if (!(flags&HASWIDTH) && op != '?') {
463 preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
464 return 0;
467 /* Handle braces (counted repetition) by expansion */
468 if (op == '{') {
469 char *end;
471 min = strtoul(preg->regparse + 1, &end, 10);
472 if (end == preg->regparse + 1) {
473 preg->err = REG_ERR_BAD_COUNT;
474 return 0;
476 if (*end == '}') {
477 max = min;
479 else {
480 preg->regparse = end;
481 max = strtoul(preg->regparse + 1, &end, 10);
482 if (*end != '}') {
483 preg->err = REG_ERR_UNMATCHED_BRACES;
484 return 0;
487 if (end == preg->regparse + 1) {
488 max = MAX_REP_COUNT;
490 else if (max < min || max >= 100) {
491 preg->err = REG_ERR_BAD_COUNT;
492 return 0;
494 if (min >= 100) {
495 preg->err = REG_ERR_BAD_COUNT;
496 return 0;
499 preg->regparse = strchr(preg->regparse, '}');
501 else {
502 min = (op == '+');
503 max = (op == '?' ? 1 : MAX_REP_COUNT);
506 if (preg->regparse[1] == '?') {
507 preg->regparse++;
508 next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
510 else {
511 next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
513 preg->program[ret + 2] = max;
514 preg->program[ret + 3] = min;
515 preg->program[ret + 4] = 0;
517 *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
519 if (!(flags & SIMPLE)) {
520 int back = regnode(preg, BACK);
521 regtail(preg, back, ret);
522 regtail(preg, next, back);
525 preg->regparse++;
526 if (ISMULT(*preg->regparse)) {
527 preg->err = REG_ERR_NESTED_COUNT;
528 return 0;
531 return ret;
535 * Add all characters in the inclusive range between lower and upper.
537 * Handles a swapped range (upper < lower).
539 static void reg_addrange(regex_t *preg, int lower, int upper)
541 if (lower > upper) {
542 reg_addrange(preg, upper, lower);
544 /* Add a range as length, start */
545 regc(preg, upper - lower + 1);
546 regc(preg, lower);
550 * Add a null-terminated literal string as a set of ranges.
552 static void reg_addrange_str(regex_t *preg, const char *str)
554 while (*str) {
555 reg_addrange(preg, *str, *str);
556 str++;
561 * Extracts the next unicode char from utf8.
563 * If 'upper' is set, converts the char to uppercase.
565 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
567 int l = utf8_tounicode(s, uc);
568 if (upper) {
569 *uc = utf8_upper(*uc);
571 return l;
575 * Converts a hex digit to decimal.
577 * Returns -1 for an invalid hex digit.
579 static int hexdigitval(int c)
581 if (c >= '0' && c <= '9')
582 return c - '0';
583 if (c >= 'a' && c <= 'f')
584 return c - 'a' + 10;
585 if (c >= 'A' && c <= 'F')
586 return c - 'A' + 10;
587 return -1;
591 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
593 * Returns the number of hex digits parsed.
594 * If there are no hex digits, returns 0 and stores nothing.
596 static int parse_hex(const char *s, int n, int *uc)
598 int val = 0;
599 int k;
601 for (k = 0; k < n; k++) {
602 int c = hexdigitval(*s++);
603 if (c == -1) {
604 break;
606 val = (val << 4) | c;
608 if (k) {
609 *uc = val;
611 return k;
615 * Call for chars after a backlash to decode the escape sequence.
617 * Stores the result in *ch.
619 * Returns the number of bytes consumed.
621 static int reg_decode_escape(const char *s, int *ch)
623 int n;
624 const char *s0 = s;
626 *ch = *s++;
628 switch (*ch) {
629 case 'b': *ch = '\b'; break;
630 case 'e': *ch = 27; break;
631 case 'f': *ch = '\f'; break;
632 case 'n': *ch = '\n'; break;
633 case 'r': *ch = '\r'; break;
634 case 't': *ch = '\t'; break;
635 case 'v': *ch = '\v'; break;
636 case 'u':
637 if (*s == '{') {
638 /* Expect \u{NNNN} */
639 n = parse_hex(s + 1, 6, ch);
640 if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
641 s += n + 2;
643 else {
644 /* Invalid, so just treat as an escaped 'u' */
645 *ch = 'u';
648 else if ((n = parse_hex(s, 4, ch)) > 0) {
649 s += n;
651 break;
652 case 'U':
653 if ((n = parse_hex(s, 8, ch)) > 0) {
654 s += n;
656 break;
657 case 'x':
658 if ((n = parse_hex(s, 2, ch)) > 0) {
659 s += n;
661 break;
662 case '\0':
663 s--;
664 *ch = '\\';
665 break;
667 return s - s0;
671 - regatom - the lowest level
673 * Optimization: gobbles an entire sequence of ordinary characters so that
674 * it can turn them into a single node, which is smaller to store and
675 * faster to run. Backslashed characters are exceptions, each becoming a
676 * separate node; the code is simpler that way and it's not worth fixing.
678 static int regatom(regex_t *preg, int *flagp)
680 int ret;
681 int flags;
682 int nocase = (preg->cflags & REG_ICASE);
684 int ch;
685 int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
687 *flagp = WORST; /* Tentatively. */
689 preg->regparse += n;
690 switch (ch) {
691 /* FIXME: these chars only have meaning at beg/end of pat? */
692 case '^':
693 ret = regnode(preg, BOL);
694 break;
695 case '$':
696 ret = regnode(preg, EOL);
697 break;
698 case '.':
699 ret = regnode(preg, ANY);
700 *flagp |= HASWIDTH|SIMPLE;
701 break;
702 case '[': {
703 const char *pattern = preg->regparse;
705 if (*pattern == '^') { /* Complement of range. */
706 ret = regnode(preg, ANYBUT);
707 pattern++;
708 } else
709 ret = regnode(preg, ANYOF);
711 /* Special case. If the first char is ']' or '-', it is part of the set */
712 if (*pattern == ']' || *pattern == '-') {
713 reg_addrange(preg, *pattern, *pattern);
714 pattern++;
717 while (*pattern && *pattern != ']') {
718 /* Is this a range? a-z */
719 int start;
720 int end;
722 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
723 if (start == '\\') {
724 pattern += reg_decode_escape(pattern, &start);
725 if (start == 0) {
726 preg->err = REG_ERR_NULL_CHAR;
727 return 0;
730 if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
731 /* skip '-' */
732 pattern += utf8_tounicode(pattern, &end);
733 pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
734 if (end == '\\') {
735 pattern += reg_decode_escape(pattern, &end);
736 if (end == 0) {
737 preg->err = REG_ERR_NULL_CHAR;
738 return 0;
742 reg_addrange(preg, start, end);
743 continue;
745 if (start == '[') {
746 if (strncmp(pattern, ":alpha:]", 8) == 0) {
747 if ((preg->cflags & REG_ICASE) == 0) {
748 reg_addrange(preg, 'a', 'z');
750 reg_addrange(preg, 'A', 'Z');
751 pattern += 8;
752 continue;
754 if (strncmp(pattern, ":alnum:]", 8) == 0) {
755 if ((preg->cflags & REG_ICASE) == 0) {
756 reg_addrange(preg, 'a', 'z');
758 reg_addrange(preg, 'A', 'Z');
759 reg_addrange(preg, '0', '9');
760 pattern += 8;
761 continue;
763 if (strncmp(pattern, ":space:]", 8) == 0) {
764 reg_addrange_str(preg, " \t\r\n\f\v");
765 pattern += 8;
766 continue;
769 /* Not a range, so just add the char */
770 reg_addrange(preg, start, start);
772 regc(preg, '\0');
774 if (*pattern) {
775 pattern++;
777 preg->regparse = pattern;
779 *flagp |= HASWIDTH|SIMPLE;
781 break;
782 case '(':
783 ret = reg(preg, 1, &flags);
784 if (ret == 0)
785 return 0;
786 *flagp |= flags&(HASWIDTH|SPSTART);
787 break;
788 case '\0':
789 case '|':
790 case ')':
791 preg->err = REG_ERR_INTERNAL;
792 return 0; /* Supposed to be caught earlier. */
793 case '?':
794 case '+':
795 case '*':
796 case '{':
797 preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
798 return 0;
799 case '\\':
800 switch (*preg->regparse++) {
801 case '\0':
802 preg->err = REG_ERR_TRAILING_BACKSLASH;
803 return 0;
804 case '<':
805 case 'm':
806 ret = regnode(preg, WORDA);
807 break;
808 case '>':
809 case 'M':
810 ret = regnode(preg, WORDZ);
811 break;
812 case 'd':
813 ret = regnode(preg, ANYOF);
814 reg_addrange(preg, '0', '9');
815 regc(preg, '\0');
816 *flagp |= HASWIDTH|SIMPLE;
817 break;
818 case 'w':
819 ret = regnode(preg, ANYOF);
820 if ((preg->cflags & REG_ICASE) == 0) {
821 reg_addrange(preg, 'a', 'z');
823 reg_addrange(preg, 'A', 'Z');
824 reg_addrange(preg, '0', '9');
825 reg_addrange(preg, '_', '_');
826 regc(preg, '\0');
827 *flagp |= HASWIDTH|SIMPLE;
828 break;
829 case 's':
830 ret = regnode(preg, ANYOF);
831 reg_addrange_str(preg," \t\r\n\f\v");
832 regc(preg, '\0');
833 *flagp |= HASWIDTH|SIMPLE;
834 break;
835 /* FIXME: Someday handle \1, \2, ... */
836 default:
837 /* Handle general quoted chars in exact-match routine */
838 /* Back up to include the backslash */
839 preg->regparse--;
840 goto de_fault;
842 break;
843 de_fault:
844 default: {
846 * Encode a string of characters to be matched exactly.
848 int added = 0;
850 /* Back up to pick up the first char of interest */
851 preg->regparse -= n;
853 ret = regnode(preg, EXACTLY);
855 /* Note that a META operator such as ? or * consumes the
856 * preceding char.
857 * Thus we must be careful to look ahead by 2 and add the
858 * last char as it's own EXACTLY if necessary
861 /* Until end of string or a META char is reached */
862 while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
863 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
864 if (ch == '\\' && preg->regparse[n]) {
865 /* Non-trailing backslash.
866 * Is this a special escape, or a regular escape?
868 if (strchr("<>mMwds", preg->regparse[n])) {
869 /* A special escape. All done with EXACTLY */
870 break;
872 /* Decode it. Note that we add the length for the escape
873 * sequence to the length for the backlash so we can skip
874 * the entire sequence, or not as required.
876 n += reg_decode_escape(preg->regparse + n, &ch);
877 if (ch == 0) {
878 preg->err = REG_ERR_NULL_CHAR;
879 return 0;
883 /* Now we have one char 'ch' of length 'n'.
884 * Check to see if the following char is a MULT
887 if (ISMULT(preg->regparse[n])) {
888 /* Yes. But do we already have some EXACTLY chars? */
889 if (added) {
890 /* Yes, so return what we have and pick up the current char next time around */
891 break;
893 /* No, so add this single char and finish */
894 regc(preg, ch);
895 added++;
896 preg->regparse += n;
897 break;
900 /* No, so just add this char normally */
901 regc(preg, ch);
902 added++;
903 preg->regparse += n;
905 regc(preg, '\0');
907 *flagp |= HASWIDTH;
908 if (added == 1)
909 *flagp |= SIMPLE;
910 break;
912 break;
915 return(ret);
918 static void reg_grow(regex_t *preg, int n)
920 if (preg->p + n >= preg->proglen) {
921 preg->proglen = (preg->p + n) * 2;
922 preg->program = realloc(preg->program, preg->proglen * sizeof(int));
927 - regnode - emit a node
929 /* Location. */
930 static int regnode(regex_t *preg, int op)
932 reg_grow(preg, 2);
934 /* The OP followed by a next pointer */
935 preg->program[preg->p++] = op;
936 preg->program[preg->p++] = 0;
938 /* Return the start of the node */
939 return preg->p - 2;
943 - regc - emit (if appropriate) a byte of code
945 static void regc(regex_t *preg, int b )
947 reg_grow(preg, 1);
948 preg->program[preg->p++] = b;
952 - reginsert - insert an operator in front of already-emitted operand
954 * Means relocating the operand.
955 * Returns the new location of the original operand.
957 static int reginsert(regex_t *preg, int op, int size, int opnd )
959 reg_grow(preg, size);
961 /* Move everything from opnd up */
962 memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd));
963 /* Zero out the new space */
964 memset(preg->program + opnd, 0, sizeof(int) * size);
966 preg->program[opnd] = op;
968 preg->p += size;
970 return opnd + size;
974 - regtail - set the next-pointer at the end of a node chain
976 static void regtail(regex_t *preg, int p, int val)
978 int scan;
979 int temp;
980 int offset;
982 /* Find last node. */
983 scan = p;
984 for (;;) {
985 temp = regnext(preg, scan);
986 if (temp == 0)
987 break;
988 scan = temp;
991 if (OP(preg, scan) == BACK)
992 offset = scan - val;
993 else
994 offset = val - scan;
996 preg->program[scan + 1] = offset;
1000 - regoptail - regtail on operand of first argument; nop if operandless
1003 static void regoptail(regex_t *preg, int p, int val )
1005 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1006 if (p != 0 && OP(preg, p) == BRANCH) {
1007 regtail(preg, OPERAND(p), val);
1012 * regexec and friends
1016 * Forwards.
1018 static int regtry(regex_t *preg, const char *string );
1019 static int regmatch(regex_t *preg, int prog);
1020 static int regrepeat(regex_t *preg, int p, int max);
1023 - regexec - match a regexp against a string
1025 int regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1027 const char *s;
1028 int scan;
1030 /* Be paranoid... */
1031 if (preg == NULL || preg->program == NULL || string == NULL) {
1032 return REG_ERR_NULL_ARGUMENT;
1035 /* Check validity of program. */
1036 if (*preg->program != REG_MAGIC) {
1037 return REG_ERR_CORRUPTED;
1040 #ifdef DEBUG
1041 fprintf(stderr, "regexec: %s\n", string);
1042 regdump(preg);
1043 #endif
1045 preg->eflags = eflags;
1046 preg->pmatch = pmatch;
1047 preg->nmatch = nmatch;
1048 preg->start = string; /* All offsets are computed from here */
1050 /* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */
1051 for (scan = OPERAND(1); scan != 0; scan += regopsize(preg, scan)) {
1052 int op = OP(preg, scan);
1053 if (op == END)
1054 break;
1055 if (op == REPX || op == REPXMIN)
1056 preg->program[scan + 4] = 0;
1059 /* If there is a "must appear" string, look for it. */
1060 if (preg->regmust != 0) {
1061 s = string;
1062 while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
1063 if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1064 break;
1066 s++;
1068 if (s == NULL) /* Not present. */
1069 return REG_NOMATCH;
1072 /* Mark beginning of line for ^ . */
1073 preg->regbol = string;
1075 /* Simplest case: anchored match need be tried only once (maybe per line). */
1076 if (preg->reganch) {
1077 if (eflags & REG_NOTBOL) {
1078 /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1079 goto nextline;
1081 while (1) {
1082 if (regtry(preg, string)) {
1083 return REG_NOERROR;
1085 if (*string) {
1086 nextline:
1087 if (preg->cflags & REG_NEWLINE) {
1088 /* Try the next anchor? */
1089 string = strchr(string, '\n');
1090 if (string) {
1091 preg->regbol = ++string;
1092 continue;
1096 return REG_NOMATCH;
1100 /* Messy cases: unanchored match. */
1101 s = string;
1102 if (preg->regstart != '\0') {
1103 /* We know what char it must start with. */
1104 while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1105 if (regtry(preg, s))
1106 return REG_NOERROR;
1107 s++;
1110 else
1111 /* We don't -- general case. */
1112 while (1) {
1113 if (regtry(preg, s))
1114 return REG_NOERROR;
1115 if (*s == '\0') {
1116 break;
1118 else {
1119 int c;
1120 s += utf8_tounicode(s, &c);
1124 /* Failure. */
1125 return REG_NOMATCH;
1129 - regtry - try match at specific point
1131 /* 0 failure, 1 success */
1132 static int regtry( regex_t *preg, const char *string )
1134 int i;
1136 preg->reginput = string;
1138 for (i = 0; i < preg->nmatch; i++) {
1139 preg->pmatch[i].rm_so = -1;
1140 preg->pmatch[i].rm_eo = -1;
1142 if (regmatch(preg, 1)) {
1143 preg->pmatch[0].rm_so = string - preg->start;
1144 preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1145 return(1);
1146 } else
1147 return(0);
1151 * Returns bytes matched if 'pattern' is a prefix of 'string'.
1153 * If 'nocase' is non-zero, does a case-insensitive match.
1155 * Returns -1 on not found.
1157 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1159 const char *s = string;
1160 while (proglen && *s) {
1161 int ch;
1162 int n = reg_utf8_tounicode_case(s, &ch, nocase);
1163 if (ch != *prog) {
1164 return -1;
1166 prog++;
1167 s += n;
1168 proglen--;
1170 if (proglen == 0) {
1171 return s - string;
1173 return -1;
1177 * Searchs for 'c' in the range 'range'.
1179 * Returns 1 if found, or 0 if not.
1181 static int reg_range_find(const int *range, int c)
1183 while (*range) {
1184 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1185 if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1186 return 1;
1188 range += 2;
1190 return 0;
1194 * Search for the character 'c' in the utf-8 string 'string'.
1196 * If 'nocase' is set, the 'string' is assumed to be uppercase
1197 * and 'c' is converted to uppercase before matching.
1199 * Returns the byte position in the string where the 'c' was found, or
1200 * NULL if not found.
1202 static const char *str_find(const char *string, int c, int nocase)
1204 if (nocase) {
1205 /* The "string" should already be converted to uppercase */
1206 c = utf8_upper(c);
1208 while (*string) {
1209 int ch;
1210 int n = reg_utf8_tounicode_case(string, &ch, nocase);
1211 if (c == ch) {
1212 return string;
1214 string += n;
1216 return NULL;
1220 * Returns true if 'ch' is an end-of-line char.
1222 * In REG_NEWLINE mode, \n is considered EOL in
1223 * addition to \0
1225 static int reg_iseol(regex_t *preg, int ch)
1227 if (preg->cflags & REG_NEWLINE) {
1228 return ch == '\0' || ch == '\n';
1230 else {
1231 return ch == '\0';
1235 static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
1237 int nextch = '\0';
1238 const char *save;
1239 int no;
1240 int c;
1242 int max = preg->program[scan + 2];
1243 int min = preg->program[scan + 3];
1244 int next = regnext(preg, scan);
1247 * Lookahead to avoid useless match attempts
1248 * when we know what character comes next.
1250 if (OP(preg, next) == EXACTLY) {
1251 nextch = preg->program[OPERAND(next)];
1253 save = preg->reginput;
1254 no = regrepeat(preg, scan + 5, max);
1255 if (no < min) {
1256 return 0;
1258 if (matchmin) {
1259 /* from min up to no */
1260 max = no;
1261 no = min;
1263 /* else from no down to min */
1264 while (1) {
1265 if (matchmin) {
1266 if (no > max) {
1267 break;
1270 else {
1271 if (no < min) {
1272 break;
1275 preg->reginput = save + utf8_index(save, no);
1276 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1277 /* If it could work, try it. */
1278 if (reg_iseol(preg, nextch) || c == nextch) {
1279 if (regmatch(preg, next)) {
1280 return(1);
1283 if (matchmin) {
1284 /* Couldn't or didn't, add one more */
1285 no++;
1287 else {
1288 /* Couldn't or didn't -- back up. */
1289 no--;
1292 return(0);
1295 static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
1297 int *scanpt = preg->program + scan;
1299 int max = scanpt[2];
1300 int min = scanpt[3];
1302 /* Have we reached min? */
1303 if (scanpt[4] < min) {
1304 /* No, so get another one */
1305 scanpt[4]++;
1306 if (regmatch(preg, scan + 5)) {
1307 return 1;
1309 scanpt[4]--;
1310 return 0;
1312 if (scanpt[4] > max) {
1313 return 0;
1316 if (matchmin) {
1317 /* minimal, so try other branch first */
1318 if (regmatch(preg, regnext(preg, scan))) {
1319 return 1;
1321 /* No, so try one more */
1322 scanpt[4]++;
1323 if (regmatch(preg, scan + 5)) {
1324 return 1;
1326 scanpt[4]--;
1327 return 0;
1329 /* maximal, so try this branch again */
1330 if (scanpt[4] < max) {
1331 scanpt[4]++;
1332 if (regmatch(preg, scan + 5)) {
1333 return 1;
1335 scanpt[4]--;
1337 /* At this point we are at max with no match. Try the other branch */
1338 return regmatch(preg, regnext(preg, scan));
1342 - regmatch - main matching routine
1344 * Conceptually the strategy is simple: check to see whether the current
1345 * node matches, call self recursively to see whether the rest matches,
1346 * and then act accordingly. In practice we make some effort to avoid
1347 * recursion, in particular by going through "ordinary" nodes (that don't
1348 * need to know whether the rest of the match failed) by a loop instead of
1349 * by recursion.
1351 /* 0 failure, 1 success */
1352 static int regmatch(regex_t *preg, int prog)
1354 int scan; /* Current node. */
1355 int next; /* Next node. */
1356 const char *save;
1358 scan = prog;
1360 #ifdef DEBUG
1361 if (scan != 0 && regnarrate)
1362 fprintf(stderr, "%s(\n", regprop(scan));
1363 #endif
1364 while (scan != 0) {
1365 int n;
1366 int c;
1367 #ifdef DEBUG
1368 if (regnarrate) {
1369 fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */
1371 #endif
1372 next = regnext(preg, scan);
1373 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1375 switch (OP(preg, scan)) {
1376 case BOL:
1377 if (preg->reginput != preg->regbol)
1378 return(0);
1379 break;
1380 case EOL:
1381 if (!reg_iseol(preg, c)) {
1382 return(0);
1384 break;
1385 case WORDA:
1386 /* Must be looking at a letter, digit, or _ */
1387 if ((!isalnum(UCHAR(c))) && c != '_')
1388 return(0);
1389 /* Prev must be BOL or nonword */
1390 if (preg->reginput > preg->regbol &&
1391 (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1392 return(0);
1393 break;
1394 case WORDZ:
1395 /* Can't match at BOL */
1396 if (preg->reginput > preg->regbol) {
1397 /* Current must be EOL or nonword */
1398 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1399 c = preg->reginput[-1];
1400 /* Previous must be word */
1401 if (isalnum(UCHAR(c)) || c == '_') {
1402 break;
1406 /* No */
1407 return(0);
1409 case ANY:
1410 if (reg_iseol(preg, c))
1411 return 0;
1412 preg->reginput += n;
1413 break;
1414 case EXACTLY: {
1415 int opnd;
1416 int len;
1417 int slen;
1419 opnd = OPERAND(scan);
1420 len = str_int_len(preg->program + opnd);
1422 slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1423 if (slen < 0) {
1424 return(0);
1426 preg->reginput += slen;
1428 break;
1429 case ANYOF:
1430 if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
1431 return(0);
1433 preg->reginput += n;
1434 break;
1435 case ANYBUT:
1436 if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
1437 return(0);
1439 preg->reginput += n;
1440 break;
1441 case NOTHING:
1442 break;
1443 case BACK:
1444 break;
1445 case BRANCH:
1446 if (OP(preg, next) != BRANCH) /* No choice. */
1447 next = OPERAND(scan); /* Avoid recursion. */
1448 else {
1449 do {
1450 save = preg->reginput;
1451 if (regmatch(preg, OPERAND(scan))) {
1452 return(1);
1454 preg->reginput = save;
1455 scan = regnext(preg, scan);
1456 } while (scan != 0 && OP(preg, scan) == BRANCH);
1457 return(0);
1458 /* NOTREACHED */
1460 break;
1461 case REP:
1462 case REPMIN:
1463 return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1465 case REPX:
1466 case REPXMIN:
1467 return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1469 case END:
1470 return 1; /* Success! */
1472 case OPENNC:
1473 case CLOSENC:
1474 return regmatch(preg, next);
1476 default:
1477 if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
1478 save = preg->reginput;
1479 if (regmatch(preg, next)) {
1480 if (OP(preg, scan) < CLOSE) {
1481 int no = OP(preg, scan) - OPEN;
1482 if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
1483 preg->pmatch[no].rm_so = save - preg->start;
1486 else {
1487 int no = OP(preg, scan) - CLOSE;
1488 if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
1489 preg->pmatch[no].rm_eo = save - preg->start;
1492 return(1);
1494 return(0);
1496 return REG_ERR_INTERNAL;
1499 scan = next;
1503 * We get here only if there's trouble -- normally "case END" is
1504 * the terminating point.
1506 return REG_ERR_INTERNAL;
1510 - regrepeat - repeatedly match something simple, report how many
1512 static int regrepeat(regex_t *preg, int p, int max)
1514 int count = 0;
1515 const char *scan;
1516 int opnd;
1517 int ch;
1518 int n;
1520 scan = preg->reginput;
1521 opnd = OPERAND(p);
1522 switch (OP(preg, p)) {
1523 case ANY:
1524 /* No need to handle utf8 specially here */
1525 while (!reg_iseol(preg, *scan) && count < max) {
1526 count++;
1527 scan++;
1529 break;
1530 case EXACTLY:
1531 while (count < max) {
1532 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1533 if (preg->program[opnd] != ch) {
1534 break;
1536 count++;
1537 scan += n;
1539 break;
1540 case ANYOF:
1541 while (count < max) {
1542 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1543 if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) {
1544 break;
1546 count++;
1547 scan += n;
1549 break;
1550 case ANYBUT:
1551 while (count < max) {
1552 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1553 if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) {
1554 break;
1556 count++;
1557 scan += n;
1559 break;
1560 default: /* Oh dear. Called inappropriately. */
1561 preg->err = REG_ERR_INTERNAL;
1562 count = 0; /* Best compromise. */
1563 break;
1565 preg->reginput = scan;
1567 return(count);
1571 - regnext - dig the "next" pointer out of a node
1573 static int regnext(regex_t *preg, int p )
1575 int offset;
1577 offset = NEXT(preg, p);
1579 if (offset == 0)
1580 return 0;
1582 if (OP(preg, p) == BACK)
1583 return(p-offset);
1584 else
1585 return(p+offset);
1589 - regopsize - returns the size of opcode + operands at 'p' in words
1591 static int regopsize(regex_t *preg, int p )
1593 /* Almost all opcodes are 2 words, but some are more */
1594 switch (OP(preg, p)) {
1595 case REP:
1596 case REPMIN:
1597 case REPX:
1598 case REPXMIN:
1599 return 5;
1601 case ANYOF:
1602 case ANYBUT:
1603 case EXACTLY: {
1604 int s = p + 2;
1605 while (preg->program[s++]) {
1607 return s - p;
1610 return 2;
1613 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1616 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1618 static void regdump(regex_t *preg)
1620 int s;
1621 int op = EXACTLY; /* Arbitrary non-END op. */
1622 int next;
1623 char buf[MAX_UTF8_LEN + 1];
1625 int i;
1626 for (i = 1; i < preg->p; i++) {
1627 printf("%02x ", (unsigned char)preg->program[i]);
1628 if (i % 16 == 0) {
1629 printf("\n");
1632 printf("\n");
1634 s = 1;
1635 while (op != END && s < preg->p) { /* While that wasn't END last time... */
1636 op = OP(preg, s);
1637 printf("%3d: %s", s, regprop(op)); /* Where, what. */
1638 next = regnext(preg, s);
1639 if (next == 0) /* Next ptr. */
1640 printf("(0)");
1641 else
1642 printf("(%d)", next);
1643 s += 2;
1644 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1645 int max = preg->program[s];
1646 int min = preg->program[s + 1];
1647 if (max == 65535) {
1648 printf("{%d,*}", min);
1650 else {
1651 printf("{%d,%d}", min, max);
1653 printf(" %d", preg->program[s + 2]);
1654 s += 3;
1656 else if (op == ANYOF || op == ANYBUT) {
1657 /* set of ranges */
1659 while (preg->program[s]) {
1660 int len = preg->program[s++];
1661 int first = preg->program[s++];
1662 buf[utf8_getchars(buf, first)] = 0;
1663 printf("%s", buf);
1664 if (len > 1) {
1665 buf[utf8_getchars(buf, first + len - 1)] = 0;
1666 printf("-%s", buf);
1669 s++;
1671 else if (op == EXACTLY) {
1672 /* Literal string, where present. */
1674 while (preg->program[s]) {
1675 buf[utf8_getchars(buf, preg->program[s])] = 0;
1676 printf("%s", buf);
1677 s++;
1679 s++;
1681 putchar('\n');
1684 if (op == END) {
1685 /* Header fields of interest. */
1686 if (preg->regstart) {
1687 buf[utf8_getchars(buf, preg->regstart)] = 0;
1688 printf("start '%s' ", buf);
1690 if (preg->reganch)
1691 printf("anchored ");
1692 if (preg->regmust != 0) {
1693 int i;
1694 printf("must have:");
1695 for (i = 0; i < preg->regmlen; i++) {
1696 putchar(preg->program[preg->regmust + i]);
1698 putchar('\n');
1701 printf("\n");
1705 - regprop - printable representation of opcode
1707 static const char *regprop( int op )
1709 static char buf[50];
1711 switch (op) {
1712 case BOL:
1713 return "BOL";
1714 case EOL:
1715 return "EOL";
1716 case ANY:
1717 return "ANY";
1718 case ANYOF:
1719 return "ANYOF";
1720 case ANYBUT:
1721 return "ANYBUT";
1722 case BRANCH:
1723 return "BRANCH";
1724 case EXACTLY:
1725 return "EXACTLY";
1726 case NOTHING:
1727 return "NOTHING";
1728 case BACK:
1729 return "BACK";
1730 case END:
1731 return "END";
1732 case REP:
1733 return "REP";
1734 case REPMIN:
1735 return "REPMIN";
1736 case REPX:
1737 return "REPX";
1738 case REPXMIN:
1739 return "REPXMIN";
1740 case WORDA:
1741 return "WORDA";
1742 case WORDZ:
1743 return "WORDZ";
1744 case OPENNC:
1745 return "OPEN";
1746 case CLOSENC:
1747 return "CLOSE";
1748 default:
1749 if (op >= OPEN && op < CLOSE) {
1750 snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1752 else if (op >= CLOSE && op < CLOSE_END) {
1753 snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1755 else {
1756 snprintf(buf, sizeof(buf), "?%d?\n", op);
1758 return(buf);
1761 #endif /* JIM_BOOTSTRAP */
1763 size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
1765 static const char *error_strings[] = {
1766 "success",
1767 "no match",
1768 "bad pattern",
1769 "null argument",
1770 "unknown error",
1771 "too big",
1772 "out of memory",
1773 "too many ()",
1774 "parentheses () not balanced",
1775 "braces {} not balanced",
1776 "invalid repetition count(s)",
1777 "extra characters",
1778 "*+ of empty atom",
1779 "nested count",
1780 "internal error",
1781 "count follows nothing",
1782 "trailing backslash",
1783 "corrupted program",
1784 "contains null char",
1786 const char *err;
1788 if (errcode < 0 || errcode >= REG_ERR_NUM) {
1789 err = "Bad error code";
1791 else {
1792 err = error_strings[errcode];
1795 return snprintf(errbuf, errbuf_size, "%s", err);
1798 void regfree(regex_t *preg)
1800 free(preg->program);
1803 #endif