Add a general purpose hashtable pattern matcher
[jimtcl.git] / jimregexp.c
blob0c5a4ddcac774d2e5d50c25cb32aa4a237886078
1 /*
2 * regcomp and regexec -- regsub and regerror are elsewhere
4 * Copyright (c) 1986 by University of Toronto.
5 * Written by Henry Spencer. Not derived from licensed software.
7 * Permission is granted to anyone to use this software for any
8 * purpose on any computer system, and to redistribute it freely,
9 * subject to the following restrictions:
11 * 1. The author is not responsible for the consequences of use of
12 * this software, no matter how awful, even if they arise
13 * from defects in it.
15 * 2. The origin of this software must not be misrepresented, either
16 * by explicit claim or by omission.
18 * 3. Altered versions must be plainly marked as such, and must not
19 * be misrepresented as being the original software.
20 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
21 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22 *** to assist in implementing egrep.
23 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
24 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25 *** as in BSD grep and ex.
26 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
27 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
29 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
30 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
31 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
32 *** was moved into regexp.h, and the include of regexp.h now uses "'s
33 *** to avoid conflicting with the system regexp.h. Const, bless its
34 *** soul, was removed so it can compile everywhere. The declaration
35 *** of strchr() was in conflict on AIX, so it was removed (as it is
36 *** happily defined in string.h).
37 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
38 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
42 * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
43 * on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
44 * This includes counted repetitions, UTF-8 support, character classes,
45 * shorthand character classes, increased number of parentheses to 100,
46 * backslash escape sequences. It also removes \n as an alternative to |.
48 * Beware that some of this code is subtly aware of the way operator
49 * precedence is structured in regular expressions. Serious changes in
50 * regular-expression syntax might require a total rethink.
52 #include <stdio.h>
53 #include <ctype.h>
54 #include <stdlib.h>
55 #include <string.h>
57 #include "jim.h"
58 #include "jimautoconf.h"
59 #include "jimregexp.h"
60 #include "utf8.h"
62 #if !defined(HAVE_REGCOMP) || defined(JIM_REGEXP)
65 * Structure for regexp "program". This is essentially a linear encoding
66 * of a nondeterministic finite-state machine (aka syntax charts or
67 * "railroad normal form" in parsing technology). Each node is an opcode
68 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
69 * all nodes except BRANCH implement concatenation; a "next" pointer with
70 * a BRANCH on both ends of it is connecting two alternatives. (Here we
71 * have one of the subtle syntax dependencies: an individual BRANCH (as
72 * opposed to a collection of them) is never concatenated with anything
73 * because of operator precedence.) The operand of some types of node is
74 * a literal string; for others, it is a node leading into a sub-FSM. In
75 * particular, the operand of a BRANCH node is the first node of the branch.
76 * (NB this is *not* a tree structure: the tail of the branch connects
77 * to the thing following the set of BRANCHes.) The opcodes are:
80 /* This *MUST* be less than (255-20-1)/2=117 */
81 #define REG_MAX_PAREN 100
83 /* definition number opnd? meaning */
84 #define END 0 /* no End of program. */
85 #define BOL 1 /* no Match "" at beginning of line. */
86 #define EOL 2 /* no Match "" at end of line. */
87 #define ANY 3 /* no Match any one character. */
88 #define ANYOF 4 /* str Match any character in this string. */
89 #define ANYBUT 5 /* str Match any character not in this string. */
90 #define BRANCH 6 /* node Match this alternative, or the next... */
91 #define BACK 7 /* no Match "", "next" ptr points backward. */
92 #define EXACTLY 8 /* str Match this string. */
93 #define NOTHING 9 /* no Match empty string. */
94 #define REP 10 /* max,min Match this (simple) thing [min,max] times. */
95 #define REPMIN 11 /* max,min Match this (simple) thing [min,max] times, mininal match. */
96 #define REPX 12 /* max,min Match this (complex) thing [min,max] times. */
97 #define REPXMIN 13 /* max,min Match this (complex) thing [min,max] times, minimal match. */
99 #define WORDA 15 /* no Match "" at wordchar, where prev is nonword */
100 #define WORDZ 16 /* no Match "" at nonwordchar, where prev is word */
101 #define OPENNC 19 /* no Non-capturing parentheses - must be OPEN-1 */
102 #define OPEN 20 /* no Mark this point in input as start of #n. */
103 /* OPEN+1 is number 1, etc. */
104 #define CLOSE (OPEN+REG_MAX_PAREN+1) /* no Analogous to OPEN. */
105 #define CLOSE_END (CLOSE+REG_MAX_PAREN)
106 #define CLOSENC (CLOSE-1) /* no Non-capturing parentheses - must be CLOSE-1 */
109 * The first byte of the regexp internal "program" is actually this magic
110 * number; the start node begins in the second byte.
112 #define REG_MAGIC 0xFADED00D
115 * Opcode notes:
117 * BRANCH The set of branches constituting a single choice are hooked
118 * together with their "next" pointers, since precedence prevents
119 * anything being concatenated to any individual branch. The
120 * "next" pointer of the last BRANCH in a choice points to the
121 * thing following the whole choice. This is also where the
122 * final "next" pointer of each individual branch points; each
123 * branch starts with the operand node of a BRANCH node.
125 * BACK Normal "next" pointers all implicitly point forward; BACK
126 * exists to make loop structures possible.
128 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
129 * BRANCH structures using BACK. Simple cases (one character
130 * per match) are implemented with STAR and PLUS for speed
131 * and to minimize recursive plunges.
133 * OPEN,CLOSE ...are numbered at compile time.
137 * A node is one char of opcode followed by two chars of "next" pointer.
138 * "Next" pointers are stored as two 8-bit pieces, high order first. The
139 * value is a positive offset from the opcode of the node containing it.
140 * An operand, if any, simply follows the node. (Note that much of the
141 * code generation knows about this implicit relationship.)
143 * Using two bytes for the "next" pointer is vast overkill for most things,
144 * but allows patterns to get big without disasters.
146 #define OP(preg, p) (preg->program[p])
147 #define NEXT(preg, p) (preg->program[p + 1])
148 #define OPERAND(p) ((p) + 2)
151 * See regmagic.h for one further detail of program structure.
156 * Utility definitions.
159 #define FAIL(R,M) { (R)->err = (M); return (M); }
160 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
161 #define META "^$.[()|?{+*"
164 * Flags to be passed up and down.
166 #define HASWIDTH 01 /* Known never to match null string. */
167 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART 04 /* Starts with * or +. */
169 #define WORST 0 /* Worst case. */
171 #define MAX_REP_COUNT 1000000
174 * Forward declarations for regcomp()'s friends.
176 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
177 static int regpiece(regex_t *preg, int *flagp );
178 static int regbranch(regex_t *preg, int *flagp );
179 static int regatom(regex_t *preg, int *flagp );
180 static int regnode(regex_t *preg, int op );
181 static int regnext(regex_t *preg, int p );
182 static void regc(regex_t *preg, int b );
183 static int reginsert(regex_t *preg, int op, int size, int opnd );
184 static void regtail_(regex_t *preg, int p, int val, int line );
185 static void regoptail(regex_t *preg, int p, int val );
186 #define regtail(PREG, P, VAL) regtail_(PREG, P, VAL, __LINE__)
188 static int reg_range_find(const int *string, int c);
189 static const char *str_find(const char *string, int c, int nocase);
190 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
192 /*#define DEBUG*/
193 #ifdef DEBUG
194 static int regnarrate = 0;
195 static void regdump(regex_t *preg);
196 static const char *regprop( int op );
197 #endif
201 * Returns the length of the null-terminated integer sequence.
203 static int str_int_len(const int *seq)
205 int n = 0;
206 while (*seq++) {
207 n++;
209 return n;
213 - regcomp - compile a regular expression into internal code
215 * We can't allocate space until we know how big the compiled form will be,
216 * but we can't compile it (and thus know how big it is) until we've got a
217 * place to put the code. So we cheat: we compile it twice, once with code
218 * generation turned off and size counting turned on, and once "for real".
219 * This also means that we don't allocate space until we are sure that the
220 * thing really will compile successfully, and we never have to move the
221 * code and thus invalidate pointers into it. (Note that it has to be in
222 * one piece because free() must be able to free it all.)
224 * Beware that the optimization-preparation code in here knows about some
225 * of the structure of the compiled regexp.
227 int regcomp(regex_t *preg, const char *exp, int cflags)
229 int scan;
230 int longest;
231 unsigned len;
232 int flags;
234 #ifdef DEBUG
235 fprintf(stderr, "Compiling: '%s'\n", exp);
236 #endif
237 memset(preg, 0, sizeof(*preg));
239 if (exp == NULL)
240 FAIL(preg, REG_ERR_NULL_ARGUMENT);
242 /* First pass: determine size, legality. */
243 preg->cflags = cflags;
244 preg->regparse = exp;
245 /* XXX: For now, start unallocated */
246 preg->program = NULL;
247 preg->proglen = 0;
249 /* Allocate space. */
250 preg->proglen = (strlen(exp) + 1) * 5;
251 preg->program = malloc(preg->proglen * sizeof(int));
252 if (preg->program == NULL)
253 FAIL(preg, REG_ERR_NOMEM);
255 /* Note that since we store a magic value as the first item in the program,
256 * program offsets will never be 0
258 regc(preg, REG_MAGIC);
259 if (reg(preg, 0, &flags) == 0) {
260 return preg->err;
263 /* Small enough for pointer-storage convention? */
264 if (preg->re_nsub >= REG_MAX_PAREN) /* Probably could be 65535L. */
265 FAIL(preg,REG_ERR_TOO_BIG);
267 /* Dig out information for optimizations. */
268 preg->regstart = 0; /* Worst-case defaults. */
269 preg->reganch = 0;
270 preg->regmust = 0;
271 preg->regmlen = 0;
272 scan = 1; /* First BRANCH. */
273 if (OP(preg, regnext(preg, scan)) == END) { /* Only one top-level choice. */
274 scan = OPERAND(scan);
276 /* Starting-point info. */
277 if (OP(preg, scan) == EXACTLY) {
278 preg->regstart = preg->program[OPERAND(scan)];
280 else if (OP(preg, scan) == BOL)
281 preg->reganch++;
284 * If there's something expensive in the r.e., find the
285 * longest literal string that must appear and make it the
286 * regmust. Resolve ties in favor of later strings, since
287 * the regstart check works with the beginning of the r.e.
288 * and avoiding duplication strengthens checking. Not a
289 * strong reason, but sufficient in the absence of others.
291 if (flags&SPSTART) {
292 longest = 0;
293 len = 0;
294 for (; scan != 0; scan = regnext(preg, scan)) {
295 if (OP(preg, scan) == EXACTLY) {
296 int plen = str_int_len(preg->program + OPERAND(scan));
297 if (plen >= len) {
298 longest = OPERAND(scan);
299 len = plen;
303 preg->regmust = longest;
304 preg->regmlen = len;
308 #ifdef DEBUG
309 regdump(preg);
310 #endif
312 return 0;
316 - reg - regular expression, i.e. main body or parenthesized thing
318 * Caller must absorb opening parenthesis.
320 * Combining parenthesis handling with the base level of regular expression
321 * is a trifle forced, but the need to tie the tails of the branches to what
322 * follows makes it hard to avoid.
324 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
326 int ret;
327 int br;
328 int ender;
329 int parno = 0;
330 int flags;
332 *flagp = HASWIDTH; /* Tentatively. */
334 /* Make an OPEN node, if parenthesized. */
335 if (paren) {
336 if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
337 /* non-capturing paren */
338 preg->regparse += 2;
339 parno = -1;
341 else {
342 parno = ++preg->re_nsub;
344 ret = regnode(preg, OPEN+parno);
345 } else
346 ret = 0;
348 /* Pick up the branches, linking them together. */
349 br = regbranch(preg, &flags);
350 if (br == 0)
351 return 0;
352 if (ret != 0)
353 regtail(preg, ret, br); /* OPEN -> first. */
354 else
355 ret = br;
356 if (!(flags&HASWIDTH))
357 *flagp &= ~HASWIDTH;
358 *flagp |= flags&SPSTART;
359 while (*preg->regparse == '|') {
360 preg->regparse++;
361 br = regbranch(preg, &flags);
362 if (br == 0)
363 return 0;
364 regtail(preg, ret, br); /* BRANCH -> BRANCH. */
365 if (!(flags&HASWIDTH))
366 *flagp &= ~HASWIDTH;
367 *flagp |= flags&SPSTART;
370 /* Make a closing node, and hook it on the end. */
371 ender = regnode(preg, (paren) ? CLOSE+parno : END);
372 regtail(preg, ret, ender);
374 /* Hook the tails of the branches to the closing node. */
375 for (br = ret; br != 0; br = regnext(preg, br))
376 regoptail(preg, br, ender);
378 /* Check for proper termination. */
379 if (paren && *preg->regparse++ != ')') {
380 preg->err = REG_ERR_UNMATCHED_PAREN;
381 return 0;
382 } else if (!paren && *preg->regparse != '\0') {
383 if (*preg->regparse == ')') {
384 preg->err = REG_ERR_UNMATCHED_PAREN;
385 return 0;
386 } else {
387 preg->err = REG_ERR_JUNK_ON_END;
388 return 0;
392 return(ret);
396 - regbranch - one alternative of an | operator
398 * Implements the concatenation operator.
400 static int regbranch(regex_t *preg, int *flagp )
402 int ret;
403 int chain;
404 int latest;
405 int flags;
407 *flagp = WORST; /* Tentatively. */
409 ret = regnode(preg, BRANCH);
410 chain = 0;
411 while (*preg->regparse != '\0' && *preg->regparse != ')' &&
412 *preg->regparse != '|') {
413 latest = regpiece(preg, &flags);
414 if (latest == 0)
415 return 0;
416 *flagp |= flags&HASWIDTH;
417 if (chain == 0) {/* First piece. */
418 *flagp |= flags&SPSTART;
420 else {
421 regtail(preg, chain, latest);
423 chain = latest;
425 if (chain == 0) /* Loop ran zero times. */
426 (void) regnode(preg, NOTHING);
428 return(ret);
432 - regpiece - something followed by possible [*+?]
434 * Note that the branching code sequences used for ? and the general cases
435 * of * and + are somewhat optimized: they use the same NOTHING node as
436 * both the endmarker for their branch list and the body of the last branch.
437 * It might seem that this node could be dispensed with entirely, but the
438 * endmarker role is not redundant.
440 static int regpiece(regex_t *preg, int *flagp)
442 int ret;
443 char op;
444 int next;
445 int flags;
446 int chain = 0;
447 int min;
448 int max;
450 ret = regatom(preg, &flags);
451 if (ret == 0)
452 return 0;
454 op = *preg->regparse;
455 if (!ISMULT(op)) {
456 *flagp = flags;
457 return(ret);
460 if (!(flags&HASWIDTH) && op != '?') {
461 preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
462 return 0;
465 /* Handle braces (counted repetition) by expansion */
466 if (op == '{') {
467 char *end;
469 min = strtoul(preg->regparse + 1, &end, 10);
470 if (end == preg->regparse + 1) {
471 preg->err = REG_ERR_BAD_COUNT;
472 return 0;
474 if (*end == '}') {
475 max = min;
477 else {
478 preg->regparse = end;
479 max = strtoul(preg->regparse + 1, &end, 10);
480 if (*end != '}') {
481 preg->err = REG_ERR_UNMATCHED_BRACES;
482 return 0;
485 if (end == preg->regparse + 1) {
486 max = MAX_REP_COUNT;
488 else if (max < min || max >= 100) {
489 preg->err = REG_ERR_BAD_COUNT;
490 return 0;
492 if (min >= 100) {
493 preg->err = REG_ERR_BAD_COUNT;
494 return 0;
497 preg->regparse = strchr(preg->regparse, '}');
499 else {
500 min = (op == '+');
501 max = (op == '?' ? 1 : MAX_REP_COUNT);
504 if (preg->regparse[1] == '?') {
505 preg->regparse++;
506 next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
508 else {
509 next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
511 preg->program[ret + 2] = max;
512 preg->program[ret + 3] = min;
513 preg->program[ret + 4] = 0;
515 *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
517 if (!(flags & SIMPLE)) {
518 int back = regnode(preg, BACK);
519 regtail(preg, back, ret);
520 regtail(preg, next, back);
523 preg->regparse++;
524 if (ISMULT(*preg->regparse)) {
525 preg->err = REG_ERR_NESTED_COUNT;
526 return 0;
529 return chain ? chain : ret;
533 * Add all characters in the inclusive range between lower and upper.
535 * Handles a swapped range (upper < lower).
537 static void reg_addrange(regex_t *preg, int lower, int upper)
539 if (lower > upper) {
540 reg_addrange(preg, upper, lower);
542 /* Add a range as length, start */
543 regc(preg, upper - lower + 1);
544 regc(preg, lower);
548 * Add a null-terminated literal string as a set of ranges.
550 static void reg_addrange_str(regex_t *preg, const char *str)
552 while (*str) {
553 reg_addrange(preg, *str, *str);
554 str++;
559 * Extracts the next unicode char from utf8.
561 * If 'upper' is set, converts the char to uppercase.
563 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
565 int l = utf8_tounicode(s, uc);
566 if (upper) {
567 *uc = utf8_upper(*uc);
569 return l;
573 * Converts a hex digit to decimal.
575 * Returns -1 for an invalid hex digit.
577 static int hexdigitval(int c)
579 if (c >= '0' && c <= '9')
580 return c - '0';
581 if (c >= 'a' && c <= 'f')
582 return c - 'a' + 10;
583 if (c >= 'A' && c <= 'F')
584 return c - 'A' + 10;
585 return -1;
589 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
591 * Returns the number of hex digits parsed.
592 * If there are no hex digits, returns 0 and stores nothing.
594 static int parse_hex(const char *s, int n, int *uc)
596 int val = 0;
597 int k;
599 for (k = 0; k < n; k++) {
600 int c = hexdigitval(*s++);
601 if (c == -1) {
602 break;
604 val = (val << 4) | c;
606 if (k) {
607 *uc = val;
609 return k;
613 * Call for chars after a backlash to decode the escape sequence.
615 * Stores the result in *ch.
617 * Returns the number of bytes consumed.
619 static int reg_decode_escape(const char *s, int *ch)
621 int n;
622 const char *s0 = s;
624 *ch = *s++;
626 switch (*ch) {
627 case 'b': *ch = '\b'; break;
628 case 'e': *ch = 27; break;
629 case 'f': *ch = '\f'; break;
630 case 'n': *ch = '\n'; break;
631 case 'r': *ch = '\r'; break;
632 case 't': *ch = '\t'; break;
633 case 'v': *ch = '\v'; break;
634 case 'u':
635 if ((n = parse_hex(s, 4, ch)) > 0) {
636 s += n;
638 break;
639 case 'x':
640 if ((n = parse_hex(s, 2, ch)) > 0) {
641 s += n;
643 break;
644 case '\0':
645 s--;
646 *ch = '\\';
647 break;
649 return s - s0;
653 - regatom - the lowest level
655 * Optimization: gobbles an entire sequence of ordinary characters so that
656 * it can turn them into a single node, which is smaller to store and
657 * faster to run. Backslashed characters are exceptions, each becoming a
658 * separate node; the code is simpler that way and it's not worth fixing.
660 static int regatom(regex_t *preg, int *flagp)
662 int ret;
663 int flags;
664 int nocase = (preg->cflags & REG_ICASE);
666 int ch;
667 int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
669 *flagp = WORST; /* Tentatively. */
671 preg->regparse += n;
672 switch (ch) {
673 /* FIXME: these chars only have meaning at beg/end of pat? */
674 case '^':
675 ret = regnode(preg, BOL);
676 break;
677 case '$':
678 ret = regnode(preg, EOL);
679 break;
680 case '.':
681 ret = regnode(preg, ANY);
682 *flagp |= HASWIDTH|SIMPLE;
683 break;
684 case '[': {
685 const char *pattern = preg->regparse;
687 if (*pattern == '^') { /* Complement of range. */
688 ret = regnode(preg, ANYBUT);
689 pattern++;
690 } else
691 ret = regnode(preg, ANYOF);
693 /* Special case. If the first char is ']' or '-', it is part of the set */
694 if (*pattern == ']' || *pattern == '-') {
695 reg_addrange(preg, *pattern, *pattern);
696 pattern++;
699 while (*pattern && *pattern != ']') {
700 /* Is this a range? a-z */
701 int start;
702 int end;
704 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
705 if (start == '\\') {
706 pattern += reg_decode_escape(pattern, &start);
707 if (start == 0) {
708 preg->err = REG_ERR_NULL_CHAR;
709 return 0;
712 if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
713 /* skip '-' */
714 pattern += utf8_tounicode(pattern, &end);
715 pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
716 if (end == '\\') {
717 pattern += reg_decode_escape(pattern, &end);
718 if (end == 0) {
719 preg->err = REG_ERR_NULL_CHAR;
720 return 0;
724 reg_addrange(preg, start, end);
725 continue;
727 if (start == '[') {
728 if (strncmp(pattern, ":alpha:]", 8) == 0) {
729 if ((preg->cflags & REG_ICASE) == 0) {
730 reg_addrange(preg, 'a', 'z');
732 reg_addrange(preg, 'A', 'Z');
733 pattern += 8;
734 continue;
736 if (strncmp(pattern, ":alnum:]", 8) == 0) {
737 if ((preg->cflags & REG_ICASE) == 0) {
738 reg_addrange(preg, 'a', 'z');
740 reg_addrange(preg, 'A', 'Z');
741 reg_addrange(preg, '0', '9');
742 pattern += 8;
743 continue;
745 if (strncmp(pattern, ":space:]", 8) == 0) {
746 reg_addrange_str(preg, " \t\r\n\f\v");
747 pattern += 8;
748 continue;
751 /* Not a range, so just add the char */
752 reg_addrange(preg, start, start);
754 regc(preg, '\0');
756 if (*pattern) {
757 pattern++;
759 preg->regparse = pattern;
761 *flagp |= HASWIDTH|SIMPLE;
763 break;
764 case '(':
765 ret = reg(preg, 1, &flags);
766 if (ret == 0)
767 return 0;
768 *flagp |= flags&(HASWIDTH|SPSTART);
769 break;
770 case '\0':
771 case '|':
772 case ')':
773 preg->err = REG_ERR_INTERNAL;
774 return 0; /* Supposed to be caught earlier. */
775 case '?':
776 case '+':
777 case '*':
778 case '{':
779 preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
780 return 0;
781 case '\\':
782 switch (*preg->regparse++) {
783 case '\0':
784 preg->err = REG_ERR_TRAILING_BACKSLASH;
785 return 0;
786 case '<':
787 case 'm':
788 ret = regnode(preg, WORDA);
789 break;
790 case '>':
791 case 'M':
792 ret = regnode(preg, WORDZ);
793 break;
794 case 'd':
795 ret = regnode(preg, ANYOF);
796 reg_addrange(preg, '0', '9');
797 regc(preg, '\0');
798 *flagp |= HASWIDTH|SIMPLE;
799 break;
800 case 'w':
801 ret = regnode(preg, ANYOF);
802 if ((preg->cflags & REG_ICASE) == 0) {
803 reg_addrange(preg, 'a', 'z');
805 reg_addrange(preg, 'A', 'Z');
806 reg_addrange(preg, '0', '9');
807 reg_addrange(preg, '_', '_');
808 regc(preg, '\0');
809 *flagp |= HASWIDTH|SIMPLE;
810 break;
811 case 's':
812 ret = regnode(preg, ANYOF);
813 reg_addrange_str(preg," \t\r\n\f\v");
814 regc(preg, '\0');
815 *flagp |= HASWIDTH|SIMPLE;
816 break;
817 /* FIXME: Someday handle \1, \2, ... */
818 default:
819 /* Handle general quoted chars in exact-match routine */
820 /* Back up to include the backslash */
821 preg->regparse--;
822 goto de_fault;
824 break;
825 de_fault:
826 default: {
828 * Encode a string of characters to be matched exactly.
830 int added = 0;
832 /* Back up to pick up the first char of interest */
833 preg->regparse -= n;
835 ret = regnode(preg, EXACTLY);
837 /* Note that a META operator such as ? or * consumes the
838 * preceding char.
839 * Thus we must be careful to look ahead by 2 and add the
840 * last char as it's own EXACTLY if necessary
843 /* Until end of string or a META char is reached */
844 while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
845 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
846 if (ch == '\\' && preg->regparse[n]) {
847 /* Non-trailing backslash.
848 * Is this a special escape, or a regular escape?
850 if (strchr("<>mMwds", preg->regparse[n])) {
851 /* A special escape. All done with EXACTLY */
852 break;
854 /* Decode it. Note that we add the length for the escape
855 * sequence to the length for the backlash so we can skip
856 * the entire sequence, or not as required.
858 n += reg_decode_escape(preg->regparse + n, &ch);
859 if (ch == 0) {
860 preg->err = REG_ERR_NULL_CHAR;
861 return 0;
865 /* Now we have one char 'ch' of length 'n'.
866 * Check to see if the following char is a MULT
869 if (ISMULT(preg->regparse[n])) {
870 /* Yes. But do we already have some EXACTLY chars? */
871 if (added) {
872 /* Yes, so return what we have and pick up the current char next time around */
873 break;
875 /* No, so add this single char and finish */
876 regc(preg, ch);
877 added++;
878 preg->regparse += n;
879 break;
882 /* No, so just add this char normally */
883 regc(preg, ch);
884 added++;
885 preg->regparse += n;
887 regc(preg, '\0');
889 *flagp |= HASWIDTH;
890 if (added == 1)
891 *flagp |= SIMPLE;
892 break;
894 break;
897 return(ret);
900 static void reg_grow(regex_t *preg, int n)
902 if (preg->p + n >= preg->proglen) {
903 preg->proglen = (preg->p + n) * 2;
904 preg->program = realloc(preg->program, preg->proglen * sizeof(int));
909 - regnode - emit a node
911 /* Location. */
912 static int regnode(regex_t *preg, int op)
914 reg_grow(preg, 2);
916 preg->program[preg->p++] = op;
917 preg->program[preg->p++] = 0;
919 /* Return the start of the node */
920 return preg->p - 2;
924 - regc - emit (if appropriate) a byte of code
926 static void regc(regex_t *preg, int b )
928 reg_grow(preg, 1);
929 preg->program[preg->p++] = b;
933 - reginsert - insert an operator in front of already-emitted operand
935 * Means relocating the operand.
936 * Returns the new location of the original operand.
938 static int reginsert(regex_t *preg, int op, int size, int opnd )
940 reg_grow(preg, size);
942 /* Move everything from opnd up */
943 memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd));
944 /* Zero out the new space */
945 memset(preg->program + opnd, 0, sizeof(int) * size);
947 preg->program[opnd] = op;
949 preg->p += size;
951 return opnd + size;
955 - regtail - set the next-pointer at the end of a node chain
957 static void regtail_(regex_t *preg, int p, int val, int line )
959 int scan;
960 int temp;
961 int offset;
963 /* Find last node. */
964 scan = p;
965 for (;;) {
966 temp = regnext(preg, scan);
967 if (temp == 0)
968 break;
969 scan = temp;
972 if (OP(preg, scan) == BACK)
973 offset = scan - val;
974 else
975 offset = val - scan;
977 preg->program[scan + 1] = offset;
981 - regoptail - regtail on operand of first argument; nop if operandless
984 static void regoptail(regex_t *preg, int p, int val )
986 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
987 if (p != 0 && OP(preg, p) == BRANCH) {
988 regtail(preg, OPERAND(p), val);
993 * regexec and friends
997 * Forwards.
999 static int regtry(regex_t *preg, const char *string );
1000 static int regmatch(regex_t *preg, int prog);
1001 static int regrepeat(regex_t *preg, int p, int max);
1004 - regexec - match a regexp against a string
1006 int regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1008 const char *s;
1009 int scan;
1011 /* Be paranoid... */
1012 if (preg == NULL || preg->program == NULL || string == NULL) {
1013 return REG_ERR_NULL_ARGUMENT;
1016 /* Check validity of program. */
1017 if (*preg->program != REG_MAGIC) {
1018 return REG_ERR_CORRUPTED;
1021 #ifdef DEBUG
1022 fprintf(stderr, "regexec: %s\n", string);
1023 regdump(preg);
1024 #endif
1026 preg->eflags = eflags;
1027 preg->pmatch = pmatch;
1028 preg->nmatch = nmatch;
1029 preg->start = string; /* All offsets are computed from here */
1031 /* Must clear out the embedded repeat counts */
1032 for (scan = OPERAND(1); scan != 0; ) {
1033 switch (OP(preg, scan)) {
1034 case REP:
1035 case REPMIN:
1036 case REPX:
1037 case REPXMIN:
1038 preg->program[scan + 4] = 0;
1039 scan += 5;
1040 break;
1042 case ANYOF:
1043 case ANYBUT:
1044 case EXACTLY:
1045 scan += 2;
1046 while (preg->program[scan++]) {
1048 break;
1050 case END:
1051 scan = 0;
1052 break;
1054 default:
1055 scan += 2;
1056 break;
1060 /* If there is a "must appear" string, look for it. */
1061 if (preg->regmust != 0) {
1062 s = string;
1063 while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
1064 if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1065 break;
1067 s++;
1069 if (s == NULL) /* Not present. */
1070 return REG_NOMATCH;
1073 /* Mark beginning of line for ^ . */
1074 preg->regbol = string;
1076 /* Simplest case: anchored match need be tried only once (maybe per line). */
1077 if (preg->reganch) {
1078 if (eflags & REG_NOTBOL) {
1079 /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1080 goto nextline;
1082 while (1) {
1083 int ret = regtry(preg, string);
1084 if (ret) {
1085 return REG_NOERROR;
1087 if (*string) {
1088 nextline:
1089 if (preg->cflags & REG_NEWLINE) {
1090 /* Try the next anchor? */
1091 string = strchr(string, '\n');
1092 if (string) {
1093 preg->regbol = ++string;
1094 continue;
1098 return REG_NOMATCH;
1102 /* Messy cases: unanchored match. */
1103 s = string;
1104 if (preg->regstart != '\0') {
1105 /* We know what char it must start with. */
1106 while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1107 if (regtry(preg, s))
1108 return REG_NOERROR;
1109 s++;
1112 else
1113 /* We don't -- general case. */
1114 while (1) {
1115 if (regtry(preg, s))
1116 return REG_NOERROR;
1117 if (*s == '\0') {
1118 break;
1120 s += utf8_charlen(*s);
1123 /* Failure. */
1124 return REG_NOMATCH;
1128 - regtry - try match at specific point
1130 /* 0 failure, 1 success */
1131 static int regtry( regex_t *preg, const char *string )
1133 int i;
1135 preg->reginput = string;
1137 for (i = 0; i < preg->nmatch; i++) {
1138 preg->pmatch[i].rm_so = -1;
1139 preg->pmatch[i].rm_eo = -1;
1141 if (regmatch(preg, 1)) {
1142 preg->pmatch[0].rm_so = string - preg->start;
1143 preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1144 return(1);
1145 } else
1146 return(0);
1150 * Returns bytes matched if 'pattern' is a prefix of 'string'.
1152 * If 'nocase' is non-zero, does a case-insensitive match.
1154 * Returns -1 on not found.
1156 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1158 const char *s = string;
1159 while (proglen && *s) {
1160 int ch;
1161 int n = reg_utf8_tounicode_case(s, &ch, nocase);
1162 if (ch != *prog) {
1163 return -1;
1165 prog++;
1166 s += n;
1167 proglen--;
1169 if (proglen == 0) {
1170 return s - string;
1172 return -1;
1176 * Searchs for 'c' in the range 'range'.
1178 * Returns 1 if found, or 0 if not.
1180 static int reg_range_find(const int *range, int c)
1182 while (*range) {
1183 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1184 if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1185 return 1;
1187 range += 2;
1189 return 0;
1193 * Search for the character 'c' in the utf-8 string 'string'.
1195 * If 'nocase' is set, the 'string' is assumed to be uppercase
1196 * and 'c' is converted to uppercase before matching.
1198 * Returns the byte position in the string where the 'c' was found, or
1199 * NULL if not found.
1201 static const char *str_find(const char *string, int c, int nocase)
1203 if (nocase) {
1204 /* The "string" should already be converted to uppercase */
1205 c = utf8_upper(c);
1207 while (*string) {
1208 int ch;
1209 int n = reg_utf8_tounicode_case(string, &ch, nocase);
1210 if (c == ch) {
1211 return string;
1213 string += n;
1215 return NULL;
1219 * Returns true if 'ch' is an end-of-line char.
1221 * In REG_NEWLINE mode, \n is considered EOL in
1222 * addition to \0
1224 static int reg_iseol(regex_t *preg, int ch)
1226 if (preg->cflags & REG_NEWLINE) {
1227 return ch == '\0' || ch == '\n';
1229 else {
1230 return ch == '\0';
1234 static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
1236 int nextch = '\0';
1237 const char *save;
1238 int no;
1239 int c;
1241 int max = preg->program[scan + 2];
1242 int min = preg->program[scan + 3];
1243 int next = regnext(preg, scan);
1246 * Lookahead to avoid useless match attempts
1247 * when we know what character comes next.
1249 if (OP(preg, next) == EXACTLY) {
1250 nextch = preg->program[OPERAND(next)];
1252 save = preg->reginput;
1253 no = regrepeat(preg, scan + 5, max);
1254 if (no < min) {
1255 return 0;
1257 if (matchmin) {
1258 /* from min up to no */
1259 max = no;
1260 no = min;
1262 /* else from no down to min */
1263 while (1) {
1264 if (matchmin) {
1265 if (no > max) {
1266 break;
1269 else {
1270 if (no < min) {
1271 break;
1274 preg->reginput = save + utf8_index(save, no);
1275 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1276 /* If it could work, try it. */
1277 if (reg_iseol(preg, nextch) || c == nextch) {
1278 if (regmatch(preg, next)) {
1279 return(1);
1282 if (matchmin) {
1283 /* Couldn't or didn't, add one more */
1284 no++;
1286 else {
1287 /* Couldn't or didn't -- back up. */
1288 no--;
1291 return(0);
1294 static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
1296 int *scanpt = preg->program + scan;
1298 int max = scanpt[2];
1299 int min = scanpt[3];
1301 /* Have we reached min? */
1302 if (scanpt[4] < min) {
1303 /* No, so get another one */
1304 scanpt[4]++;
1305 if (regmatch(preg, scan + 5)) {
1306 return 1;
1308 scanpt[4]--;
1309 return 0;
1311 if (scanpt[4] > max) {
1312 return 0;
1315 if (matchmin) {
1316 /* minimal, so try other branch first */
1317 if (regmatch(preg, regnext(preg, scan))) {
1318 return 1;
1320 /* No, so try one more */
1321 scanpt[4]++;
1322 if (regmatch(preg, scan + 5)) {
1323 return 1;
1325 scanpt[4]--;
1326 return 0;
1328 /* maximal, so try this branch again */
1329 if (scanpt[4] < max) {
1330 scanpt[4]++;
1331 if (regmatch(preg, scan + 5)) {
1332 return 1;
1334 scanpt[4]--;
1336 /* At this point we are at max with no match. Try the other branch */
1337 return regmatch(preg, regnext(preg, scan));
1341 - regmatch - main matching routine
1343 * Conceptually the strategy is simple: check to see whether the current
1344 * node matches, call self recursively to see whether the rest matches,
1345 * and then act accordingly. In practice we make some effort to avoid
1346 * recursion, in particular by going through "ordinary" nodes (that don't
1347 * need to know whether the rest of the match failed) by a loop instead of
1348 * by recursion.
1350 /* 0 failure, 1 success */
1351 static int regmatch(regex_t *preg, int prog)
1353 int scan; /* Current node. */
1354 int next; /* Next node. */
1356 scan = prog;
1358 #ifdef DEBUG
1359 if (scan != 0 && regnarrate)
1360 fprintf(stderr, "%s(\n", regprop(scan));
1361 #endif
1362 while (scan != 0) {
1363 int n;
1364 int c;
1365 #ifdef DEBUG
1366 if (regnarrate) {
1367 fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */
1369 #endif
1370 next = regnext(preg, scan);
1371 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1373 switch (OP(preg, scan)) {
1374 case BOL:
1375 if (preg->reginput != preg->regbol)
1376 return(0);
1377 break;
1378 case EOL:
1379 if (!reg_iseol(preg, c)) {
1380 return(0);
1382 break;
1383 case WORDA:
1384 /* Must be looking at a letter, digit, or _ */
1385 if ((!isalnum(UCHAR(c))) && c != '_')
1386 return(0);
1387 /* Prev must be BOL or nonword */
1388 if (preg->reginput > preg->regbol &&
1389 (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1390 return(0);
1391 break;
1392 case WORDZ:
1393 /* Can't match at BOL */
1394 if (preg->reginput > preg->regbol) {
1395 /* Current must be EOL or nonword */
1396 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1397 c = preg->reginput[-1];
1398 /* Previous must be word */
1399 if (isalnum(UCHAR(c)) || c == '_') {
1400 break;
1404 /* No */
1405 return(0);
1407 case ANY:
1408 if (reg_iseol(preg, c))
1409 return 0;
1410 preg->reginput += n;
1411 break;
1412 case EXACTLY: {
1413 int opnd;
1414 int len;
1415 int slen;
1417 opnd = OPERAND(scan);
1418 len = str_int_len(preg->program + opnd);
1420 slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1421 if (slen < 0) {
1422 return(0);
1424 preg->reginput += slen;
1426 break;
1427 case ANYOF:
1428 if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
1429 return(0);
1431 preg->reginput += n;
1432 break;
1433 case ANYBUT:
1434 if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
1435 return(0);
1437 preg->reginput += n;
1438 break;
1439 case NOTHING:
1440 break;
1441 case BACK:
1442 break;
1443 case BRANCH: {
1444 const char *save;
1446 if (OP(preg, next) != BRANCH) /* No choice. */
1447 next = OPERAND(scan); /* Avoid recursion. */
1448 else {
1449 do {
1450 save = preg->reginput;
1451 if (regmatch(preg, OPERAND(scan))) {
1452 return(1);
1454 preg->reginput = save;
1455 scan = regnext(preg, scan);
1456 } while (scan != 0 && OP(preg, scan) == BRANCH);
1457 return(0);
1458 /* NOTREACHED */
1461 break;
1462 case REP:
1463 case REPMIN:
1464 return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1466 case REPX:
1467 case REPXMIN:
1468 return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1470 case END:
1471 return(1); /* Success! */
1472 break;
1474 case OPENNC:
1475 case CLOSENC:
1476 if (regmatch(preg, next)) {
1477 return 1;
1479 return 0;
1481 default:
1482 if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
1483 const char *save;
1485 save = preg->reginput;
1487 if (regmatch(preg, next)) {
1488 int no;
1490 * Don't set startp if some later
1491 * invocation of the same parentheses
1492 * already has.
1494 if (OP(preg, scan) < CLOSE) {
1495 no = OP(preg, scan) - OPEN;
1496 if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
1497 preg->pmatch[no].rm_so = save - preg->start;
1500 else {
1501 no = OP(preg, scan) - CLOSE;
1502 if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
1503 preg->pmatch[no].rm_eo = save - preg->start;
1506 return(1);
1507 } else
1508 return(0);
1510 return REG_ERR_INTERNAL;
1513 scan = next;
1517 * We get here only if there's trouble -- normally "case END" is
1518 * the terminating point.
1520 return REG_ERR_INTERNAL;
1524 - regrepeat - repeatedly match something simple, report how many
1526 static int regrepeat(regex_t *preg, int p, int max)
1528 int count = 0;
1529 const char *scan;
1530 int opnd;
1531 int ch;
1532 int n;
1534 scan = preg->reginput;
1535 opnd = OPERAND(p);
1536 switch (OP(preg, p)) {
1537 case ANY:
1538 /* No need to handle utf8 specially here */
1539 while (!reg_iseol(preg, *scan) && count < max) {
1540 count++;
1541 scan++;
1543 break;
1544 case EXACTLY:
1545 while (count < max) {
1546 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1547 if (preg->program[opnd] != ch) {
1548 break;
1550 count++;
1551 scan += n;
1553 break;
1554 case ANYOF:
1555 while (count < max) {
1556 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1557 if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) {
1558 break;
1560 count++;
1561 scan += n;
1563 break;
1564 case ANYBUT:
1565 while (count < max) {
1566 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1567 if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) {
1568 break;
1570 count++;
1571 scan += n;
1573 break;
1574 default: /* Oh dear. Called inappropriately. */
1575 preg->err = REG_ERR_INTERNAL;
1576 count = 0; /* Best compromise. */
1577 break;
1579 preg->reginput = scan;
1581 return(count);
1585 - regnext - dig the "next" pointer out of a node
1587 static int regnext(regex_t *preg, int p )
1589 int offset;
1591 offset = NEXT(preg, p);
1593 if (offset == 0)
1594 return 0;
1596 if (OP(preg, p) == BACK)
1597 return(p-offset);
1598 else
1599 return(p+offset);
1602 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1605 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1607 static void regdump(regex_t *preg)
1609 int s;
1610 int op = EXACTLY; /* Arbitrary non-END op. */
1611 int next;
1612 char buf[4];
1614 int i;
1615 for (i = 1; i < preg->p; i++) {
1616 printf("%02x ", (unsigned char)preg->program[i]);
1617 if (i % 16 == 0) {
1618 printf("\n");
1621 printf("\n");
1623 s = 1;
1624 while (op != END && s < preg->p) { /* While that wasn't END last time... */
1625 op = OP(preg, s);
1626 printf("%3d: %s", s, regprop(op)); /* Where, what. */
1627 next = regnext(preg, s);
1628 if (next == 0) /* Next ptr. */
1629 printf("(0)");
1630 else
1631 printf("(%d)", next);
1632 s += 2;
1633 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1634 int max = preg->program[s];
1635 int min = preg->program[s + 1];
1636 if (max == 65535) {
1637 printf("{%d,*}", min);
1639 else {
1640 printf("{%d,%d}", min, max);
1642 printf(" %d", preg->program[s + 2]);
1643 s += 3;
1645 else if (op == ANYOF || op == ANYBUT) {
1646 /* set of ranges */
1648 while (preg->program[s]) {
1649 int len = preg->program[s++];
1650 int first = preg->program[s++];
1651 buf[utf8_fromunicode(buf, first)] = 0;
1652 printf("%s", buf);
1653 if (len > 1) {
1654 buf[utf8_fromunicode(buf, first + len - 1)] = 0;
1655 printf("-%s", buf);
1658 s++;
1660 else if (op == EXACTLY) {
1661 /* Literal string, where present. */
1663 while (preg->program[s]) {
1664 buf[utf8_fromunicode(buf, preg->program[s])] = 0;
1665 printf("%s", buf);
1666 s++;
1668 s++;
1670 putchar('\n');
1673 if (op == END) {
1674 /* Header fields of interest. */
1675 if (preg->regstart) {
1676 buf[utf8_fromunicode(buf, preg->regstart)] = 0;
1677 printf("start '%s' ", buf);
1679 if (preg->reganch)
1680 printf("anchored ");
1681 if (preg->regmust != 0) {
1682 int i;
1683 printf("must have:");
1684 for (i = 0; i < preg->regmlen; i++) {
1685 putchar(preg->program[preg->regmust + i]);
1687 putchar('\n');
1690 printf("\n");
1694 - regprop - printable representation of opcode
1696 static const char *regprop( int op )
1698 static char buf[50];
1700 switch (op) {
1701 case BOL:
1702 return "BOL";
1703 case EOL:
1704 return "EOL";
1705 case ANY:
1706 return "ANY";
1707 case ANYOF:
1708 return "ANYOF";
1709 case ANYBUT:
1710 return "ANYBUT";
1711 case BRANCH:
1712 return "BRANCH";
1713 case EXACTLY:
1714 return "EXACTLY";
1715 case NOTHING:
1716 return "NOTHING";
1717 case BACK:
1718 return "BACK";
1719 case END:
1720 return "END";
1721 case REP:
1722 return "REP";
1723 case REPMIN:
1724 return "REPMIN";
1725 case REPX:
1726 return "REPX";
1727 case REPXMIN:
1728 return "REPXMIN";
1729 case WORDA:
1730 return "WORDA";
1731 case WORDZ:
1732 return "WORDZ";
1733 case OPENNC:
1734 return "OPEN";
1735 case CLOSENC:
1736 return "CLOSE";
1737 default:
1738 if (op >= OPEN && op < CLOSE) {
1739 snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1741 else if (op >= CLOSE && op < CLOSE_END) {
1742 snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1744 else {
1745 snprintf(buf, sizeof(buf), "?%d?\n", op);
1747 return(buf);
1750 #endif /* JIM_BOOTSTRAP */
1752 size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
1754 static const char *error_strings[] = {
1755 "success",
1756 "no match",
1757 "bad pattern",
1758 "null argument",
1759 "unknown error",
1760 "too big",
1761 "out of memory",
1762 "too many ()",
1763 "parentheses () not balanced",
1764 "braces {} not balanced",
1765 "invalid repetition count(s)",
1766 "extra characters",
1767 "*+ of empty atom",
1768 "nested count",
1769 "internal error",
1770 "count follows nothing",
1771 "trailing backslash",
1772 "corrupted program",
1773 "contains null char",
1775 const char *err;
1777 if (errcode < 0 || errcode >= REG_ERR_NUM) {
1778 err = "Bad error code";
1780 else {
1781 err = error_strings[errcode];
1784 return snprintf(errbuf, errbuf_size, "%s", err);
1787 void regfree(regex_t *preg)
1789 free(preg->program);
1792 #endif