jim.c: fix some dict/list shimmering issues
[jimtcl.git] / jimregexp.c
blobd32f31630492a46008e10bea0a57a3bc98c49871
1 /*
2 * vi:se ts=8:
4 * regcomp and regexec -- regsub and regerror are elsewhere
6 * Copyright (c) 1986 by University of Toronto.
7 * Written by Henry Spencer. Not derived from licensed software.
9 * Permission is granted to anyone to use this software for any
10 * purpose on any computer system, and to redistribute it freely,
11 * subject to the following restrictions:
13 * 1. The author is not responsible for the consequences of use of
14 * this software, no matter how awful, even if they arise
15 * from defects in it.
17 * 2. The origin of this software must not be misrepresented, either
18 * by explicit claim or by omission.
20 * 3. Altered versions must be plainly marked as such, and must not
21 * be misrepresented as being the original software.
22 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
23 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
24 *** to assist in implementing egrep.
25 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
26 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
27 *** as in BSD grep and ex.
28 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
29 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
30 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
31 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
32 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
33 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
34 *** was moved into regexp.h, and the include of regexp.h now uses "'s
35 *** to avoid conflicting with the system regexp.h. Const, bless its
36 *** soul, was removed so it can compile everywhere. The declaration
37 *** of strchr() was in conflict on AIX, so it was removed (as it is
38 *** happily defined in string.h).
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
41 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
42 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
44 * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
45 * on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
46 * This includes counted repetitions, UTF-8 support, character classes,
47 * shorthand character classes, increased number of parentheses to 100,
48 * backslash escape sequences. It also removes \n as an alternative to |.
50 * Beware that some of this code is subtly aware of the way operator
51 * precedence is structured in regular expressions. Serious changes in
52 * regular-expression syntax might require a total rethink.
54 #include <stdio.h>
55 #include <ctype.h>
56 #include <stdlib.h>
57 #include <string.h>
59 #include "jim.h"
60 #include "jimautoconf.h"
61 #include "jimregexp.h"
62 #include "utf8.h"
64 #if !defined(HAVE_REGCOMP) || defined(JIM_REGEXP)
66 /* An arbitrary limit, but this seems enough. Must be less than 1000. */
67 #define REG_MAX_PAREN 100
70 * Structure for regexp "program". This is essentially a linear encoding
71 * of a nondeterministic finite-state machine (aka syntax charts or
72 * "railroad normal form" in parsing technology). Each node is an opcode
73 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
74 * all nodes except BRANCH implement concatenation; a "next" pointer with
75 * a BRANCH on both ends of it is connecting two alternatives. (Here we
76 * have one of the subtle syntax dependencies: an individual BRANCH (as
77 * opposed to a collection of them) is never concatenated with anything
78 * because of operator precedence.) The operand of some types of node is
79 * a literal string; for others, it is a node leading into a sub-FSM. In
80 * particular, the operand of a BRANCH node is the first node of the branch.
81 * (NB this is *not* a tree structure: the tail of the branch connects
82 * to the thing following the set of BRANCHes.) The opcodes are:
85 /* definition number opnd? meaning */
86 #define END 0 /* no End of program. */
87 #define BOL 1 /* no Match "" at beginning of line. */
88 #define EOL 2 /* no Match "" at end of line. */
89 #define ANY 3 /* no Match any one character. */
90 #define ANYOF 4 /* str Match any character in this string. */
91 #define ANYBUT 5 /* str Match any character not in this string. */
92 #define BRANCH 6 /* node Match this alternative, or the next... */
93 #define BACK 7 /* no Match "", "next" ptr points backward. */
94 #define EXACTLY 8 /* str Match this string. */
95 #define NOTHING 9 /* no Match empty string. */
96 #define REP 10 /* max,min Match this (simple) thing [min,max] times. */
97 #define REPMIN 11 /* max,min Match this (simple) thing [min,max] times, minimal match. */
98 #define REPX 12 /* max,min Match this (complex) thing [min,max] times. */
99 #define REPXMIN 13 /* max,min Match this (complex) thing [min,max] times, minimal match. */
101 #define WORDA 15 /* no Match "" at wordchar, where prev is nonword */
102 #define WORDZ 16 /* no Match "" at nonwordchar, where prev is word */
104 #define OPENNC 1000 /* no Non-capturing parentheses - must be OPEN-1 */
105 #define OPEN 1001 /* no Mark this point in input as start of #n. */
106 /* OPEN+1 is number 1, etc. */
108 /* must not be any other opts between OPEN and CLOSE */
110 #define CLOSENC 2000 /* no Non-capturing parentheses - must be CLOSE-1 */
111 #define CLOSE 2001 /* no Analogous to OPEN. */
112 #define CLOSE_END (CLOSE+REG_MAX_PAREN)
115 * The first word of the regexp internal "program" is actually this magic
116 * number; the start node begins in the second word.
118 #define REG_MAGIC 0xFADED00D
121 * Opcode notes:
123 * BRANCH The set of branches constituting a single choice are hooked
124 * together with their "next" pointers, since precedence prevents
125 * anything being concatenated to any individual branch. The
126 * "next" pointer of the last BRANCH in a choice points to the
127 * thing following the whole choice. This is also where the
128 * final "next" pointer of each individual branch points; each
129 * branch starts with the operand node of a BRANCH node.
131 * BACK Normal "next" pointers all implicitly point forward; BACK
132 * exists to make loop structures possible.
134 * REP,REPX Repeated matches ('?', '*', '+' and {min,max}) are implemented
135 * as either simple repeats (REP) or complex repeats (REPX).
136 * These opcodes include a "min" and "max" count after the opcode.
137 * This is followed by a fourth "current count" word that is
138 * only used by REPX, as it implements a recursive match.
139 * REPMIN and REPXMIN are identical except they implement minimal repeats.
141 * OPEN,CLOSE ...are numbered at compile time.
145 * A node is one word of opcode followed by one word of "next" pointer.
146 * The "next" pointer value is a positive offset from the opcode of the node
147 * containing it.
148 * An operand, if any, simply follows the node. (Note that much of the
149 * code generation knows about this implicit relationship.)
151 #define OP(preg, p) (preg->program[p])
152 #define NEXT(preg, p) (preg->program[p + 1])
153 #define OPERAND(p) ((p) + 2)
156 * See regmagic.h for one further detail of program structure.
161 * Utility definitions.
164 #define FAIL(R,M) { (R)->err = (M); return (M); }
165 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
166 #define META "^$.[()|?{+*"
169 * Flags to be passed up and down.
171 #define HASWIDTH 1 /* Known never to match null string. */
172 #define SIMPLE 2 /* Simple enough to be STAR/PLUS operand. */
173 #define SPSTART 4 /* Starts with * or +. */
174 #define WORST 0 /* Worst case. */
176 #define MAX_REP_COUNT 1000000
179 * Forward declarations for regcomp()'s friends.
181 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
182 static int regpiece(regex_t *preg, int *flagp );
183 static int regbranch(regex_t *preg, int *flagp );
184 static int regatom(regex_t *preg, int *flagp );
185 static int regnode(regex_t *preg, int op );
186 static int regnext(regex_t *preg, int p );
187 static void regc(regex_t *preg, int b );
188 static int reginsert(regex_t *preg, int op, int size, int opnd );
189 static void regtail(regex_t *preg, int p, int val);
190 static void regoptail(regex_t *preg, int p, int val );
191 static int regopsize(regex_t *preg, int p );
193 static int reg_range_find(const int *string, int c);
194 static const char *str_find(const char *string, int c, int nocase);
195 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
197 /*#define DEBUG*/
198 #ifdef DEBUG
199 static int regnarrate = 0;
200 static void regdump(regex_t *preg);
201 static const char *regprop( int op );
202 #endif
206 * Returns the length of the null-terminated integer sequence.
208 static int str_int_len(const int *seq)
210 int n = 0;
211 while (*seq++) {
212 n++;
214 return n;
218 - regcomp - compile a regular expression into internal code
220 * We can't allocate space until we know how big the compiled form will be,
221 * but we can't compile it (and thus know how big it is) until we've got a
222 * place to put the code. So we cheat: we compile it twice, once with code
223 * generation turned off and size counting turned on, and once "for real".
224 * This also means that we don't allocate space until we are sure that the
225 * thing really will compile successfully, and we never have to move the
226 * code and thus invalidate pointers into it. (Note that it has to be in
227 * one piece because free() must be able to free it all.)
229 * Beware that the optimization-preparation code in here knows about some
230 * of the structure of the compiled regexp.
232 int regcomp(regex_t *preg, const char *exp, int cflags)
234 int scan;
235 int longest;
236 unsigned len;
237 int flags;
239 #ifdef DEBUG
240 fprintf(stderr, "Compiling: '%s'\n", exp);
241 #endif
242 memset(preg, 0, sizeof(*preg));
244 if (exp == NULL)
245 FAIL(preg, REG_ERR_NULL_ARGUMENT);
247 /* First pass: determine size, legality. */
248 preg->cflags = cflags;
249 preg->regparse = exp;
251 /* Allocate space. */
252 preg->proglen = (strlen(exp) + 1) * 5;
253 preg->program = malloc(preg->proglen * sizeof(int));
254 if (preg->program == NULL)
255 FAIL(preg, REG_ERR_NOMEM);
257 /* Note that since we store a magic value as the first item in the program,
258 * program offsets will never be 0
260 regc(preg, REG_MAGIC);
261 if (reg(preg, 0, &flags) == 0) {
262 return preg->err;
265 /* Small enough for pointer-storage convention? */
266 if (preg->re_nsub >= REG_MAX_PAREN) /* Probably could be 65535L. */
267 FAIL(preg,REG_ERR_TOO_BIG);
269 /* Dig out information for optimizations. */
270 preg->regstart = 0; /* Worst-case defaults. */
271 preg->reganch = 0;
272 preg->regmust = 0;
273 preg->regmlen = 0;
274 scan = 1; /* First BRANCH. */
275 if (OP(preg, regnext(preg, scan)) == END) { /* Only one top-level choice. */
276 scan = OPERAND(scan);
278 /* Starting-point info. */
279 if (OP(preg, scan) == EXACTLY) {
280 preg->regstart = preg->program[OPERAND(scan)];
282 else if (OP(preg, scan) == BOL)
283 preg->reganch++;
286 * If there's something expensive in the r.e., find the
287 * longest literal string that must appear and make it the
288 * regmust. Resolve ties in favor of later strings, since
289 * the regstart check works with the beginning of the r.e.
290 * and avoiding duplication strengthens checking. Not a
291 * strong reason, but sufficient in the absence of others.
293 if (flags&SPSTART) {
294 longest = 0;
295 len = 0;
296 for (; scan != 0; scan = regnext(preg, scan)) {
297 if (OP(preg, scan) == EXACTLY) {
298 int plen = str_int_len(preg->program + OPERAND(scan));
299 if (plen >= len) {
300 longest = OPERAND(scan);
301 len = plen;
305 preg->regmust = longest;
306 preg->regmlen = len;
310 #ifdef DEBUG
311 regdump(preg);
312 #endif
314 return 0;
318 - reg - regular expression, i.e. main body or parenthesized thing
320 * Caller must absorb opening parenthesis.
322 * Combining parenthesis handling with the base level of regular expression
323 * is a trifle forced, but the need to tie the tails of the branches to what
324 * follows makes it hard to avoid.
326 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
328 int ret;
329 int br;
330 int ender;
331 int parno = 0;
332 int flags;
334 *flagp = HASWIDTH; /* Tentatively. */
336 /* Make an OPEN node, if parenthesized. */
337 if (paren) {
338 if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
339 /* non-capturing paren */
340 preg->regparse += 2;
341 parno = -1;
343 else {
344 parno = ++preg->re_nsub;
346 ret = regnode(preg, OPEN+parno);
347 } else
348 ret = 0;
350 /* Pick up the branches, linking them together. */
351 br = regbranch(preg, &flags);
352 if (br == 0)
353 return 0;
354 if (ret != 0)
355 regtail(preg, ret, br); /* OPEN -> first. */
356 else
357 ret = br;
358 if (!(flags&HASWIDTH))
359 *flagp &= ~HASWIDTH;
360 *flagp |= flags&SPSTART;
361 while (*preg->regparse == '|') {
362 preg->regparse++;
363 br = regbranch(preg, &flags);
364 if (br == 0)
365 return 0;
366 regtail(preg, ret, br); /* BRANCH -> BRANCH. */
367 if (!(flags&HASWIDTH))
368 *flagp &= ~HASWIDTH;
369 *flagp |= flags&SPSTART;
372 /* Make a closing node, and hook it on the end. */
373 ender = regnode(preg, (paren) ? CLOSE+parno : END);
374 regtail(preg, ret, ender);
376 /* Hook the tails of the branches to the closing node. */
377 for (br = ret; br != 0; br = regnext(preg, br))
378 regoptail(preg, br, ender);
380 /* Check for proper termination. */
381 if (paren && *preg->regparse++ != ')') {
382 preg->err = REG_ERR_UNMATCHED_PAREN;
383 return 0;
384 } else if (!paren && *preg->regparse != '\0') {
385 if (*preg->regparse == ')') {
386 preg->err = REG_ERR_UNMATCHED_PAREN;
387 return 0;
388 } else {
389 preg->err = REG_ERR_JUNK_ON_END;
390 return 0;
394 return(ret);
398 - regbranch - one alternative of an | operator
400 * Implements the concatenation operator.
402 static int regbranch(regex_t *preg, int *flagp )
404 int ret;
405 int chain;
406 int latest;
407 int flags;
409 *flagp = WORST; /* Tentatively. */
411 ret = regnode(preg, BRANCH);
412 chain = 0;
413 while (*preg->regparse != '\0' && *preg->regparse != ')' &&
414 *preg->regparse != '|') {
415 latest = regpiece(preg, &flags);
416 if (latest == 0)
417 return 0;
418 *flagp |= flags&HASWIDTH;
419 if (chain == 0) {/* First piece. */
420 *flagp |= flags&SPSTART;
422 else {
423 regtail(preg, chain, latest);
425 chain = latest;
427 if (chain == 0) /* Loop ran zero times. */
428 (void) regnode(preg, NOTHING);
430 return(ret);
434 - regpiece - something followed by possible [*+?]
436 * Note that the branching code sequences used for ? and the general cases
437 * of * and + are somewhat optimized: they use the same NOTHING node as
438 * both the endmarker for their branch list and the body of the last branch.
439 * It might seem that this node could be dispensed with entirely, but the
440 * endmarker role is not redundant.
442 static int regpiece(regex_t *preg, int *flagp)
444 int ret;
445 char op;
446 int next;
447 int flags;
448 int chain = 0;
449 int min;
450 int max;
452 ret = regatom(preg, &flags);
453 if (ret == 0)
454 return 0;
456 op = *preg->regparse;
457 if (!ISMULT(op)) {
458 *flagp = flags;
459 return(ret);
462 if (!(flags&HASWIDTH) && op != '?') {
463 preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
464 return 0;
467 /* Handle braces (counted repetition) by expansion */
468 if (op == '{') {
469 char *end;
471 min = strtoul(preg->regparse + 1, &end, 10);
472 if (end == preg->regparse + 1) {
473 preg->err = REG_ERR_BAD_COUNT;
474 return 0;
476 if (*end == '}') {
477 max = min;
479 else {
480 preg->regparse = end;
481 max = strtoul(preg->regparse + 1, &end, 10);
482 if (*end != '}') {
483 preg->err = REG_ERR_UNMATCHED_BRACES;
484 return 0;
487 if (end == preg->regparse + 1) {
488 max = MAX_REP_COUNT;
490 else if (max < min || max >= 100) {
491 preg->err = REG_ERR_BAD_COUNT;
492 return 0;
494 if (min >= 100) {
495 preg->err = REG_ERR_BAD_COUNT;
496 return 0;
499 preg->regparse = strchr(preg->regparse, '}');
501 else {
502 min = (op == '+');
503 max = (op == '?' ? 1 : MAX_REP_COUNT);
506 if (preg->regparse[1] == '?') {
507 preg->regparse++;
508 next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
510 else {
511 next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
513 preg->program[ret + 2] = max;
514 preg->program[ret + 3] = min;
515 preg->program[ret + 4] = 0;
517 *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
519 if (!(flags & SIMPLE)) {
520 int back = regnode(preg, BACK);
521 regtail(preg, back, ret);
522 regtail(preg, next, back);
525 preg->regparse++;
526 if (ISMULT(*preg->regparse)) {
527 preg->err = REG_ERR_NESTED_COUNT;
528 return 0;
531 return chain ? chain : ret;
535 * Add all characters in the inclusive range between lower and upper.
537 * Handles a swapped range (upper < lower).
539 static void reg_addrange(regex_t *preg, int lower, int upper)
541 if (lower > upper) {
542 reg_addrange(preg, upper, lower);
544 /* Add a range as length, start */
545 regc(preg, upper - lower + 1);
546 regc(preg, lower);
550 * Add a null-terminated literal string as a set of ranges.
552 static void reg_addrange_str(regex_t *preg, const char *str)
554 while (*str) {
555 reg_addrange(preg, *str, *str);
556 str++;
561 * Extracts the next unicode char from utf8.
563 * If 'upper' is set, converts the char to uppercase.
565 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
567 int l = utf8_tounicode(s, uc);
568 if (upper) {
569 *uc = utf8_upper(*uc);
571 return l;
575 * Converts a hex digit to decimal.
577 * Returns -1 for an invalid hex digit.
579 static int hexdigitval(int c)
581 if (c >= '0' && c <= '9')
582 return c - '0';
583 if (c >= 'a' && c <= 'f')
584 return c - 'a' + 10;
585 if (c >= 'A' && c <= 'F')
586 return c - 'A' + 10;
587 return -1;
591 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
593 * Returns the number of hex digits parsed.
594 * If there are no hex digits, returns 0 and stores nothing.
596 static int parse_hex(const char *s, int n, int *uc)
598 int val = 0;
599 int k;
601 for (k = 0; k < n; k++) {
602 int c = hexdigitval(*s++);
603 if (c == -1) {
604 break;
606 val = (val << 4) | c;
608 if (k) {
609 *uc = val;
611 return k;
615 * Call for chars after a backlash to decode the escape sequence.
617 * Stores the result in *ch.
619 * Returns the number of bytes consumed.
621 static int reg_decode_escape(const char *s, int *ch)
623 int n;
624 const char *s0 = s;
626 *ch = *s++;
628 switch (*ch) {
629 case 'b': *ch = '\b'; break;
630 case 'e': *ch = 27; break;
631 case 'f': *ch = '\f'; break;
632 case 'n': *ch = '\n'; break;
633 case 'r': *ch = '\r'; break;
634 case 't': *ch = '\t'; break;
635 case 'v': *ch = '\v'; break;
636 case 'u':
637 if (*s == '{') {
638 /* Expect \u{NNNN} */
639 n = parse_hex(s + 1, 6, ch);
640 if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
641 s += n + 2;
643 else {
644 /* Invalid, so just treat as an escaped 'u' */
645 *ch = 'u';
648 else if ((n = parse_hex(s, 4, ch)) > 0) {
649 s += n;
651 break;
652 case 'U':
653 if ((n = parse_hex(s, 8, ch)) > 0) {
654 s += n;
656 case 'x':
657 if ((n = parse_hex(s, 2, ch)) > 0) {
658 s += n;
660 break;
661 case '\0':
662 s--;
663 *ch = '\\';
664 break;
666 return s - s0;
670 - regatom - the lowest level
672 * Optimization: gobbles an entire sequence of ordinary characters so that
673 * it can turn them into a single node, which is smaller to store and
674 * faster to run. Backslashed characters are exceptions, each becoming a
675 * separate node; the code is simpler that way and it's not worth fixing.
677 static int regatom(regex_t *preg, int *flagp)
679 int ret;
680 int flags;
681 int nocase = (preg->cflags & REG_ICASE);
683 int ch;
684 int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
686 *flagp = WORST; /* Tentatively. */
688 preg->regparse += n;
689 switch (ch) {
690 /* FIXME: these chars only have meaning at beg/end of pat? */
691 case '^':
692 ret = regnode(preg, BOL);
693 break;
694 case '$':
695 ret = regnode(preg, EOL);
696 break;
697 case '.':
698 ret = regnode(preg, ANY);
699 *flagp |= HASWIDTH|SIMPLE;
700 break;
701 case '[': {
702 const char *pattern = preg->regparse;
704 if (*pattern == '^') { /* Complement of range. */
705 ret = regnode(preg, ANYBUT);
706 pattern++;
707 } else
708 ret = regnode(preg, ANYOF);
710 /* Special case. If the first char is ']' or '-', it is part of the set */
711 if (*pattern == ']' || *pattern == '-') {
712 reg_addrange(preg, *pattern, *pattern);
713 pattern++;
716 while (*pattern && *pattern != ']') {
717 /* Is this a range? a-z */
718 int start;
719 int end;
721 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
722 if (start == '\\') {
723 pattern += reg_decode_escape(pattern, &start);
724 if (start == 0) {
725 preg->err = REG_ERR_NULL_CHAR;
726 return 0;
729 if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
730 /* skip '-' */
731 pattern += utf8_tounicode(pattern, &end);
732 pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
733 if (end == '\\') {
734 pattern += reg_decode_escape(pattern, &end);
735 if (end == 0) {
736 preg->err = REG_ERR_NULL_CHAR;
737 return 0;
741 reg_addrange(preg, start, end);
742 continue;
744 if (start == '[') {
745 if (strncmp(pattern, ":alpha:]", 8) == 0) {
746 if ((preg->cflags & REG_ICASE) == 0) {
747 reg_addrange(preg, 'a', 'z');
749 reg_addrange(preg, 'A', 'Z');
750 pattern += 8;
751 continue;
753 if (strncmp(pattern, ":alnum:]", 8) == 0) {
754 if ((preg->cflags & REG_ICASE) == 0) {
755 reg_addrange(preg, 'a', 'z');
757 reg_addrange(preg, 'A', 'Z');
758 reg_addrange(preg, '0', '9');
759 pattern += 8;
760 continue;
762 if (strncmp(pattern, ":space:]", 8) == 0) {
763 reg_addrange_str(preg, " \t\r\n\f\v");
764 pattern += 8;
765 continue;
768 /* Not a range, so just add the char */
769 reg_addrange(preg, start, start);
771 regc(preg, '\0');
773 if (*pattern) {
774 pattern++;
776 preg->regparse = pattern;
778 *flagp |= HASWIDTH|SIMPLE;
780 break;
781 case '(':
782 ret = reg(preg, 1, &flags);
783 if (ret == 0)
784 return 0;
785 *flagp |= flags&(HASWIDTH|SPSTART);
786 break;
787 case '\0':
788 case '|':
789 case ')':
790 preg->err = REG_ERR_INTERNAL;
791 return 0; /* Supposed to be caught earlier. */
792 case '?':
793 case '+':
794 case '*':
795 case '{':
796 preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
797 return 0;
798 case '\\':
799 switch (*preg->regparse++) {
800 case '\0':
801 preg->err = REG_ERR_TRAILING_BACKSLASH;
802 return 0;
803 case '<':
804 case 'm':
805 ret = regnode(preg, WORDA);
806 break;
807 case '>':
808 case 'M':
809 ret = regnode(preg, WORDZ);
810 break;
811 case 'd':
812 ret = regnode(preg, ANYOF);
813 reg_addrange(preg, '0', '9');
814 regc(preg, '\0');
815 *flagp |= HASWIDTH|SIMPLE;
816 break;
817 case 'w':
818 ret = regnode(preg, ANYOF);
819 if ((preg->cflags & REG_ICASE) == 0) {
820 reg_addrange(preg, 'a', 'z');
822 reg_addrange(preg, 'A', 'Z');
823 reg_addrange(preg, '0', '9');
824 reg_addrange(preg, '_', '_');
825 regc(preg, '\0');
826 *flagp |= HASWIDTH|SIMPLE;
827 break;
828 case 's':
829 ret = regnode(preg, ANYOF);
830 reg_addrange_str(preg," \t\r\n\f\v");
831 regc(preg, '\0');
832 *flagp |= HASWIDTH|SIMPLE;
833 break;
834 /* FIXME: Someday handle \1, \2, ... */
835 default:
836 /* Handle general quoted chars in exact-match routine */
837 /* Back up to include the backslash */
838 preg->regparse--;
839 goto de_fault;
841 break;
842 de_fault:
843 default: {
845 * Encode a string of characters to be matched exactly.
847 int added = 0;
849 /* Back up to pick up the first char of interest */
850 preg->regparse -= n;
852 ret = regnode(preg, EXACTLY);
854 /* Note that a META operator such as ? or * consumes the
855 * preceding char.
856 * Thus we must be careful to look ahead by 2 and add the
857 * last char as it's own EXACTLY if necessary
860 /* Until end of string or a META char is reached */
861 while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
862 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
863 if (ch == '\\' && preg->regparse[n]) {
864 /* Non-trailing backslash.
865 * Is this a special escape, or a regular escape?
867 if (strchr("<>mMwds", preg->regparse[n])) {
868 /* A special escape. All done with EXACTLY */
869 break;
871 /* Decode it. Note that we add the length for the escape
872 * sequence to the length for the backlash so we can skip
873 * the entire sequence, or not as required.
875 n += reg_decode_escape(preg->regparse + n, &ch);
876 if (ch == 0) {
877 preg->err = REG_ERR_NULL_CHAR;
878 return 0;
882 /* Now we have one char 'ch' of length 'n'.
883 * Check to see if the following char is a MULT
886 if (ISMULT(preg->regparse[n])) {
887 /* Yes. But do we already have some EXACTLY chars? */
888 if (added) {
889 /* Yes, so return what we have and pick up the current char next time around */
890 break;
892 /* No, so add this single char and finish */
893 regc(preg, ch);
894 added++;
895 preg->regparse += n;
896 break;
899 /* No, so just add this char normally */
900 regc(preg, ch);
901 added++;
902 preg->regparse += n;
904 regc(preg, '\0');
906 *flagp |= HASWIDTH;
907 if (added == 1)
908 *flagp |= SIMPLE;
909 break;
911 break;
914 return(ret);
917 static void reg_grow(regex_t *preg, int n)
919 if (preg->p + n >= preg->proglen) {
920 preg->proglen = (preg->p + n) * 2;
921 preg->program = realloc(preg->program, preg->proglen * sizeof(int));
926 - regnode - emit a node
928 /* Location. */
929 static int regnode(regex_t *preg, int op)
931 reg_grow(preg, 2);
933 /* The OP followed by a next pointer */
934 preg->program[preg->p++] = op;
935 preg->program[preg->p++] = 0;
937 /* Return the start of the node */
938 return preg->p - 2;
942 - regc - emit (if appropriate) a byte of code
944 static void regc(regex_t *preg, int b )
946 reg_grow(preg, 1);
947 preg->program[preg->p++] = b;
951 - reginsert - insert an operator in front of already-emitted operand
953 * Means relocating the operand.
954 * Returns the new location of the original operand.
956 static int reginsert(regex_t *preg, int op, int size, int opnd )
958 reg_grow(preg, size);
960 /* Move everything from opnd up */
961 memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd));
962 /* Zero out the new space */
963 memset(preg->program + opnd, 0, sizeof(int) * size);
965 preg->program[opnd] = op;
967 preg->p += size;
969 return opnd + size;
973 - regtail - set the next-pointer at the end of a node chain
975 static void regtail(regex_t *preg, int p, int val)
977 int scan;
978 int temp;
979 int offset;
981 /* Find last node. */
982 scan = p;
983 for (;;) {
984 temp = regnext(preg, scan);
985 if (temp == 0)
986 break;
987 scan = temp;
990 if (OP(preg, scan) == BACK)
991 offset = scan - val;
992 else
993 offset = val - scan;
995 preg->program[scan + 1] = offset;
999 - regoptail - regtail on operand of first argument; nop if operandless
1002 static void regoptail(regex_t *preg, int p, int val )
1004 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1005 if (p != 0 && OP(preg, p) == BRANCH) {
1006 regtail(preg, OPERAND(p), val);
1011 * regexec and friends
1015 * Forwards.
1017 static int regtry(regex_t *preg, const char *string );
1018 static int regmatch(regex_t *preg, int prog);
1019 static int regrepeat(regex_t *preg, int p, int max);
1022 - regexec - match a regexp against a string
1024 int regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1026 const char *s;
1027 int scan;
1029 /* Be paranoid... */
1030 if (preg == NULL || preg->program == NULL || string == NULL) {
1031 return REG_ERR_NULL_ARGUMENT;
1034 /* Check validity of program. */
1035 if (*preg->program != REG_MAGIC) {
1036 return REG_ERR_CORRUPTED;
1039 #ifdef DEBUG
1040 fprintf(stderr, "regexec: %s\n", string);
1041 regdump(preg);
1042 #endif
1044 preg->eflags = eflags;
1045 preg->pmatch = pmatch;
1046 preg->nmatch = nmatch;
1047 preg->start = string; /* All offsets are computed from here */
1049 /* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */
1050 for (scan = OPERAND(1); scan != 0; scan += regopsize(preg, scan)) {
1051 int op = OP(preg, scan);
1052 if (op == END)
1053 break;
1054 if (op == REPX || op == REPXMIN)
1055 preg->program[scan + 4] = 0;
1058 /* If there is a "must appear" string, look for it. */
1059 if (preg->regmust != 0) {
1060 s = string;
1061 while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
1062 if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1063 break;
1065 s++;
1067 if (s == NULL) /* Not present. */
1068 return REG_NOMATCH;
1071 /* Mark beginning of line for ^ . */
1072 preg->regbol = string;
1074 /* Simplest case: anchored match need be tried only once (maybe per line). */
1075 if (preg->reganch) {
1076 if (eflags & REG_NOTBOL) {
1077 /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1078 goto nextline;
1080 while (1) {
1081 if (regtry(preg, string)) {
1082 return REG_NOERROR;
1084 if (*string) {
1085 nextline:
1086 if (preg->cflags & REG_NEWLINE) {
1087 /* Try the next anchor? */
1088 string = strchr(string, '\n');
1089 if (string) {
1090 preg->regbol = ++string;
1091 continue;
1095 return REG_NOMATCH;
1099 /* Messy cases: unanchored match. */
1100 s = string;
1101 if (preg->regstart != '\0') {
1102 /* We know what char it must start with. */
1103 while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1104 if (regtry(preg, s))
1105 return REG_NOERROR;
1106 s++;
1109 else
1110 /* We don't -- general case. */
1111 while (1) {
1112 if (regtry(preg, s))
1113 return REG_NOERROR;
1114 if (*s == '\0') {
1115 break;
1117 else {
1118 int c;
1119 s += utf8_tounicode(s, &c);
1123 /* Failure. */
1124 return REG_NOMATCH;
1128 - regtry - try match at specific point
1130 /* 0 failure, 1 success */
1131 static int regtry( regex_t *preg, const char *string )
1133 int i;
1135 preg->reginput = string;
1137 for (i = 0; i < preg->nmatch; i++) {
1138 preg->pmatch[i].rm_so = -1;
1139 preg->pmatch[i].rm_eo = -1;
1141 if (regmatch(preg, 1)) {
1142 preg->pmatch[0].rm_so = string - preg->start;
1143 preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1144 return(1);
1145 } else
1146 return(0);
1150 * Returns bytes matched if 'pattern' is a prefix of 'string'.
1152 * If 'nocase' is non-zero, does a case-insensitive match.
1154 * Returns -1 on not found.
1156 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1158 const char *s = string;
1159 while (proglen && *s) {
1160 int ch;
1161 int n = reg_utf8_tounicode_case(s, &ch, nocase);
1162 if (ch != *prog) {
1163 return -1;
1165 prog++;
1166 s += n;
1167 proglen--;
1169 if (proglen == 0) {
1170 return s - string;
1172 return -1;
1176 * Searchs for 'c' in the range 'range'.
1178 * Returns 1 if found, or 0 if not.
1180 static int reg_range_find(const int *range, int c)
1182 while (*range) {
1183 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1184 if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1185 return 1;
1187 range += 2;
1189 return 0;
1193 * Search for the character 'c' in the utf-8 string 'string'.
1195 * If 'nocase' is set, the 'string' is assumed to be uppercase
1196 * and 'c' is converted to uppercase before matching.
1198 * Returns the byte position in the string where the 'c' was found, or
1199 * NULL if not found.
1201 static const char *str_find(const char *string, int c, int nocase)
1203 if (nocase) {
1204 /* The "string" should already be converted to uppercase */
1205 c = utf8_upper(c);
1207 while (*string) {
1208 int ch;
1209 int n = reg_utf8_tounicode_case(string, &ch, nocase);
1210 if (c == ch) {
1211 return string;
1213 string += n;
1215 return NULL;
1219 * Returns true if 'ch' is an end-of-line char.
1221 * In REG_NEWLINE mode, \n is considered EOL in
1222 * addition to \0
1224 static int reg_iseol(regex_t *preg, int ch)
1226 if (preg->cflags & REG_NEWLINE) {
1227 return ch == '\0' || ch == '\n';
1229 else {
1230 return ch == '\0';
1234 static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
1236 int nextch = '\0';
1237 const char *save;
1238 int no;
1239 int c;
1241 int max = preg->program[scan + 2];
1242 int min = preg->program[scan + 3];
1243 int next = regnext(preg, scan);
1246 * Lookahead to avoid useless match attempts
1247 * when we know what character comes next.
1249 if (OP(preg, next) == EXACTLY) {
1250 nextch = preg->program[OPERAND(next)];
1252 save = preg->reginput;
1253 no = regrepeat(preg, scan + 5, max);
1254 if (no < min) {
1255 return 0;
1257 if (matchmin) {
1258 /* from min up to no */
1259 max = no;
1260 no = min;
1262 /* else from no down to min */
1263 while (1) {
1264 if (matchmin) {
1265 if (no > max) {
1266 break;
1269 else {
1270 if (no < min) {
1271 break;
1274 preg->reginput = save + utf8_index(save, no);
1275 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1276 /* If it could work, try it. */
1277 if (reg_iseol(preg, nextch) || c == nextch) {
1278 if (regmatch(preg, next)) {
1279 return(1);
1282 if (matchmin) {
1283 /* Couldn't or didn't, add one more */
1284 no++;
1286 else {
1287 /* Couldn't or didn't -- back up. */
1288 no--;
1291 return(0);
1294 static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
1296 int *scanpt = preg->program + scan;
1298 int max = scanpt[2];
1299 int min = scanpt[3];
1301 /* Have we reached min? */
1302 if (scanpt[4] < min) {
1303 /* No, so get another one */
1304 scanpt[4]++;
1305 if (regmatch(preg, scan + 5)) {
1306 return 1;
1308 scanpt[4]--;
1309 return 0;
1311 if (scanpt[4] > max) {
1312 return 0;
1315 if (matchmin) {
1316 /* minimal, so try other branch first */
1317 if (regmatch(preg, regnext(preg, scan))) {
1318 return 1;
1320 /* No, so try one more */
1321 scanpt[4]++;
1322 if (regmatch(preg, scan + 5)) {
1323 return 1;
1325 scanpt[4]--;
1326 return 0;
1328 /* maximal, so try this branch again */
1329 if (scanpt[4] < max) {
1330 scanpt[4]++;
1331 if (regmatch(preg, scan + 5)) {
1332 return 1;
1334 scanpt[4]--;
1336 /* At this point we are at max with no match. Try the other branch */
1337 return regmatch(preg, regnext(preg, scan));
1341 - regmatch - main matching routine
1343 * Conceptually the strategy is simple: check to see whether the current
1344 * node matches, call self recursively to see whether the rest matches,
1345 * and then act accordingly. In practice we make some effort to avoid
1346 * recursion, in particular by going through "ordinary" nodes (that don't
1347 * need to know whether the rest of the match failed) by a loop instead of
1348 * by recursion.
1350 /* 0 failure, 1 success */
1351 static int regmatch(regex_t *preg, int prog)
1353 int scan; /* Current node. */
1354 int next; /* Next node. */
1355 const char *save;
1357 scan = prog;
1359 #ifdef DEBUG
1360 if (scan != 0 && regnarrate)
1361 fprintf(stderr, "%s(\n", regprop(scan));
1362 #endif
1363 while (scan != 0) {
1364 int n;
1365 int c;
1366 #ifdef DEBUG
1367 if (regnarrate) {
1368 fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */
1370 #endif
1371 next = regnext(preg, scan);
1372 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1374 switch (OP(preg, scan)) {
1375 case BOL:
1376 if (preg->reginput != preg->regbol)
1377 return(0);
1378 break;
1379 case EOL:
1380 if (!reg_iseol(preg, c)) {
1381 return(0);
1383 break;
1384 case WORDA:
1385 /* Must be looking at a letter, digit, or _ */
1386 if ((!isalnum(UCHAR(c))) && c != '_')
1387 return(0);
1388 /* Prev must be BOL or nonword */
1389 if (preg->reginput > preg->regbol &&
1390 (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1391 return(0);
1392 break;
1393 case WORDZ:
1394 /* Can't match at BOL */
1395 if (preg->reginput > preg->regbol) {
1396 /* Current must be EOL or nonword */
1397 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1398 c = preg->reginput[-1];
1399 /* Previous must be word */
1400 if (isalnum(UCHAR(c)) || c == '_') {
1401 break;
1405 /* No */
1406 return(0);
1408 case ANY:
1409 if (reg_iseol(preg, c))
1410 return 0;
1411 preg->reginput += n;
1412 break;
1413 case EXACTLY: {
1414 int opnd;
1415 int len;
1416 int slen;
1418 opnd = OPERAND(scan);
1419 len = str_int_len(preg->program + opnd);
1421 slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1422 if (slen < 0) {
1423 return(0);
1425 preg->reginput += slen;
1427 break;
1428 case ANYOF:
1429 if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
1430 return(0);
1432 preg->reginput += n;
1433 break;
1434 case ANYBUT:
1435 if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
1436 return(0);
1438 preg->reginput += n;
1439 break;
1440 case NOTHING:
1441 break;
1442 case BACK:
1443 break;
1444 case BRANCH:
1445 if (OP(preg, next) != BRANCH) /* No choice. */
1446 next = OPERAND(scan); /* Avoid recursion. */
1447 else {
1448 do {
1449 save = preg->reginput;
1450 if (regmatch(preg, OPERAND(scan))) {
1451 return(1);
1453 preg->reginput = save;
1454 scan = regnext(preg, scan);
1455 } while (scan != 0 && OP(preg, scan) == BRANCH);
1456 return(0);
1457 /* NOTREACHED */
1459 break;
1460 case REP:
1461 case REPMIN:
1462 return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1464 case REPX:
1465 case REPXMIN:
1466 return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1468 case END:
1469 return 1; /* Success! */
1471 case OPENNC:
1472 case CLOSENC:
1473 return regmatch(preg, next);
1475 default:
1476 if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
1477 save = preg->reginput;
1478 if (regmatch(preg, next)) {
1479 if (OP(preg, scan) < CLOSE) {
1480 int no = OP(preg, scan) - OPEN;
1481 if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
1482 preg->pmatch[no].rm_so = save - preg->start;
1485 else {
1486 int no = OP(preg, scan) - CLOSE;
1487 if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
1488 preg->pmatch[no].rm_eo = save - preg->start;
1491 return(1);
1493 return(0);
1495 return REG_ERR_INTERNAL;
1498 scan = next;
1502 * We get here only if there's trouble -- normally "case END" is
1503 * the terminating point.
1505 return REG_ERR_INTERNAL;
1509 - regrepeat - repeatedly match something simple, report how many
1511 static int regrepeat(regex_t *preg, int p, int max)
1513 int count = 0;
1514 const char *scan;
1515 int opnd;
1516 int ch;
1517 int n;
1519 scan = preg->reginput;
1520 opnd = OPERAND(p);
1521 switch (OP(preg, p)) {
1522 case ANY:
1523 /* No need to handle utf8 specially here */
1524 while (!reg_iseol(preg, *scan) && count < max) {
1525 count++;
1526 scan++;
1528 break;
1529 case EXACTLY:
1530 while (count < max) {
1531 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1532 if (preg->program[opnd] != ch) {
1533 break;
1535 count++;
1536 scan += n;
1538 break;
1539 case ANYOF:
1540 while (count < max) {
1541 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1542 if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) {
1543 break;
1545 count++;
1546 scan += n;
1548 break;
1549 case ANYBUT:
1550 while (count < max) {
1551 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1552 if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) {
1553 break;
1555 count++;
1556 scan += n;
1558 break;
1559 default: /* Oh dear. Called inappropriately. */
1560 preg->err = REG_ERR_INTERNAL;
1561 count = 0; /* Best compromise. */
1562 break;
1564 preg->reginput = scan;
1566 return(count);
1570 - regnext - dig the "next" pointer out of a node
1572 static int regnext(regex_t *preg, int p )
1574 int offset;
1576 offset = NEXT(preg, p);
1578 if (offset == 0)
1579 return 0;
1581 if (OP(preg, p) == BACK)
1582 return(p-offset);
1583 else
1584 return(p+offset);
1588 - regopsize - returns the size of opcode + operands at 'p' in words
1590 static int regopsize(regex_t *preg, int p )
1592 /* Almost all opcodes are 2 words, but some are more */
1593 switch (OP(preg, p)) {
1594 case REP:
1595 case REPMIN:
1596 case REPX:
1597 case REPXMIN:
1598 return 5;
1600 case ANYOF:
1601 case ANYBUT:
1602 case EXACTLY: {
1603 int s = p + 2;
1604 while (preg->program[s++]) {
1606 return s - p;
1609 return 2;
1612 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1615 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1617 static void regdump(regex_t *preg)
1619 int s;
1620 int op = EXACTLY; /* Arbitrary non-END op. */
1621 int next;
1622 char buf[MAX_UTF8_LEN + 1];
1624 int i;
1625 for (i = 1; i < preg->p; i++) {
1626 printf("%02x ", (unsigned char)preg->program[i]);
1627 if (i % 16 == 0) {
1628 printf("\n");
1631 printf("\n");
1633 s = 1;
1634 while (op != END && s < preg->p) { /* While that wasn't END last time... */
1635 op = OP(preg, s);
1636 printf("%3d: %s", s, regprop(op)); /* Where, what. */
1637 next = regnext(preg, s);
1638 if (next == 0) /* Next ptr. */
1639 printf("(0)");
1640 else
1641 printf("(%d)", next);
1642 s += 2;
1643 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1644 int max = preg->program[s];
1645 int min = preg->program[s + 1];
1646 if (max == 65535) {
1647 printf("{%d,*}", min);
1649 else {
1650 printf("{%d,%d}", min, max);
1652 printf(" %d", preg->program[s + 2]);
1653 s += 3;
1655 else if (op == ANYOF || op == ANYBUT) {
1656 /* set of ranges */
1658 while (preg->program[s]) {
1659 int len = preg->program[s++];
1660 int first = preg->program[s++];
1661 buf[utf8_getchars(buf, first)] = 0;
1662 printf("%s", buf);
1663 if (len > 1) {
1664 buf[utf8_getchars(buf, first + len - 1)] = 0;
1665 printf("-%s", buf);
1668 s++;
1670 else if (op == EXACTLY) {
1671 /* Literal string, where present. */
1673 while (preg->program[s]) {
1674 buf[utf8_getchars(buf, preg->program[s])] = 0;
1675 printf("%s", buf);
1676 s++;
1678 s++;
1680 putchar('\n');
1683 if (op == END) {
1684 /* Header fields of interest. */
1685 if (preg->regstart) {
1686 buf[utf8_getchars(buf, preg->regstart)] = 0;
1687 printf("start '%s' ", buf);
1689 if (preg->reganch)
1690 printf("anchored ");
1691 if (preg->regmust != 0) {
1692 int i;
1693 printf("must have:");
1694 for (i = 0; i < preg->regmlen; i++) {
1695 putchar(preg->program[preg->regmust + i]);
1697 putchar('\n');
1700 printf("\n");
1704 - regprop - printable representation of opcode
1706 static const char *regprop( int op )
1708 static char buf[50];
1710 switch (op) {
1711 case BOL:
1712 return "BOL";
1713 case EOL:
1714 return "EOL";
1715 case ANY:
1716 return "ANY";
1717 case ANYOF:
1718 return "ANYOF";
1719 case ANYBUT:
1720 return "ANYBUT";
1721 case BRANCH:
1722 return "BRANCH";
1723 case EXACTLY:
1724 return "EXACTLY";
1725 case NOTHING:
1726 return "NOTHING";
1727 case BACK:
1728 return "BACK";
1729 case END:
1730 return "END";
1731 case REP:
1732 return "REP";
1733 case REPMIN:
1734 return "REPMIN";
1735 case REPX:
1736 return "REPX";
1737 case REPXMIN:
1738 return "REPXMIN";
1739 case WORDA:
1740 return "WORDA";
1741 case WORDZ:
1742 return "WORDZ";
1743 case OPENNC:
1744 return "OPEN";
1745 case CLOSENC:
1746 return "CLOSE";
1747 default:
1748 if (op >= OPEN && op < CLOSE) {
1749 snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1751 else if (op >= CLOSE && op < CLOSE_END) {
1752 snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1754 else {
1755 snprintf(buf, sizeof(buf), "?%d?\n", op);
1757 return(buf);
1760 #endif /* JIM_BOOTSTRAP */
1762 size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
1764 static const char *error_strings[] = {
1765 "success",
1766 "no match",
1767 "bad pattern",
1768 "null argument",
1769 "unknown error",
1770 "too big",
1771 "out of memory",
1772 "too many ()",
1773 "parentheses () not balanced",
1774 "braces {} not balanced",
1775 "invalid repetition count(s)",
1776 "extra characters",
1777 "*+ of empty atom",
1778 "nested count",
1779 "internal error",
1780 "count follows nothing",
1781 "trailing backslash",
1782 "corrupted program",
1783 "contains null char",
1785 const char *err;
1787 if (errcode < 0 || errcode >= REG_ERR_NUM) {
1788 err = "Bad error code";
1790 else {
1791 err = error_strings[errcode];
1794 return snprintf(errbuf, errbuf_size, "%s", err);
1797 void regfree(regex_t *preg)
1799 free(preg->program);
1802 #endif