Add support for sub-interpreters
[jimtcl.git] / jimregexp.c
blobdca55a2fb04041fa1965422e4a3b80a0ebba5c92
1 /*
2 * vi:se ts=8:
4 * regcomp and regexec -- regsub and regerror are elsewhere
6 * Copyright (c) 1986 by University of Toronto.
7 * Written by Henry Spencer. Not derived from licensed software.
9 * Permission is granted to anyone to use this software for any
10 * purpose on any computer system, and to redistribute it freely,
11 * subject to the following restrictions:
13 * 1. The author is not responsible for the consequences of use of
14 * this software, no matter how awful, even if they arise
15 * from defects in it.
17 * 2. The origin of this software must not be misrepresented, either
18 * by explicit claim or by omission.
20 * 3. Altered versions must be plainly marked as such, and must not
21 * be misrepresented as being the original software.
22 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
23 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
24 *** to assist in implementing egrep.
25 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
26 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
27 *** as in BSD grep and ex.
28 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
29 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
30 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
31 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
32 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
33 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
34 *** was moved into regexp.h, and the include of regexp.h now uses "'s
35 *** to avoid conflicting with the system regexp.h. Const, bless its
36 *** soul, was removed so it can compile everywhere. The declaration
37 *** of strchr() was in conflict on AIX, so it was removed (as it is
38 *** happily defined in string.h).
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
41 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
42 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
44 * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
45 * on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
46 * This includes counted repetitions, UTF-8 support, character classes,
47 * shorthand character classes, increased number of parentheses to 100,
48 * backslash escape sequences. It also removes \n as an alternative to |.
50 * Beware that some of this code is subtly aware of the way operator
51 * precedence is structured in regular expressions. Serious changes in
52 * regular-expression syntax might require a total rethink.
55 #include "jimautoconf.h"
57 #if defined(JIM_REGEXP)
58 #include <stdio.h>
59 #include <ctype.h>
60 #include <stdlib.h>
61 #include <string.h>
63 #include "jim.h"
64 #include "jimregexp.h"
65 #include "utf8.h"
67 /* An arbitrary limit, but this seems enough. Must be less than 1000. */
68 #define REG_MAX_PAREN 100
71 * Structure for regexp "program". This is essentially a linear encoding
72 * of a nondeterministic finite-state machine (aka syntax charts or
73 * "railroad normal form" in parsing technology). Each node is an opcode
74 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
75 * all nodes except BRANCH implement concatenation; a "next" pointer with
76 * a BRANCH on both ends of it is connecting two alternatives. (Here we
77 * have one of the subtle syntax dependencies: an individual BRANCH (as
78 * opposed to a collection of them) is never concatenated with anything
79 * because of operator precedence.) The operand of some types of node is
80 * a literal string; for others, it is a node leading into a sub-FSM. In
81 * particular, the operand of a BRANCH node is the first node of the branch.
82 * (NB this is *not* a tree structure: the tail of the branch connects
83 * to the thing following the set of BRANCHes.) The opcodes are:
86 /* definition number opnd? meaning */
87 #define END 0 /* no End of program. */
88 #define BOL 1 /* no Match "" at beginning of line. */
89 #define EOL 2 /* no Match "" at end of line. */
90 #define ANY 3 /* no Match any one character. */
91 #define ANYOF 4 /* str Match any character in this string. */
92 #define ANYBUT 5 /* str Match any character not in this string. */
93 #define BRANCH 6 /* node Match this alternative, or the next... */
94 #define BACK 7 /* no Match "", "next" ptr points backward. */
95 #define EXACTLY 8 /* str Match this string. */
96 #define NOTHING 9 /* no Match empty string. */
97 #define REP 10 /* max,min Match this (simple) thing [min,max] times. */
98 #define REPMIN 11 /* max,min Match this (simple) thing [min,max] times, minimal match. */
99 #define REPX 12 /* max,min Match this (complex) thing [min,max] times. */
100 #define REPXMIN 13 /* max,min Match this (complex) thing [min,max] times, minimal match. */
101 #define BOLX 14 /* no Match "" at beginning of input. */
102 #define EOLX 15 /* no Match "" at end of input. */
103 #define WORDA 16 /* no Match "" at wordchar, where prev is nonword */
104 #define WORDZ 17 /* no Match "" at nonwordchar, where prev is word */
106 #define OPENNC 1000 /* no Non-capturing parentheses - must be OPEN-1 */
107 #define OPEN 1001 /* no Mark this point in input as start of #n. */
108 /* OPEN+1 is number 1, etc. */
110 /* must not be any other opts between OPEN and CLOSE */
112 #define CLOSENC 2000 /* no Non-capturing parentheses - must be CLOSE-1 */
113 #define CLOSE 2001 /* no Analogous to OPEN. */
114 #define CLOSE_END (CLOSE+REG_MAX_PAREN)
117 * The first word of the regexp internal "program" is actually this magic
118 * number; the start node begins in the second word.
120 #define REG_MAGIC 0xFADED00D
123 * Opcode notes:
125 * BRANCH The set of branches constituting a single choice are hooked
126 * together with their "next" pointers, since precedence prevents
127 * anything being concatenated to any individual branch. The
128 * "next" pointer of the last BRANCH in a choice points to the
129 * thing following the whole choice. This is also where the
130 * final "next" pointer of each individual branch points; each
131 * branch starts with the operand node of a BRANCH node.
133 * BACK Normal "next" pointers all implicitly point forward; BACK
134 * exists to make loop structures possible.
136 * REP,REPX Repeated matches ('?', '*', '+' and {min,max}) are implemented
137 * as either simple repeats (REP) or complex repeats (REPX).
138 * These opcodes include a "min" and "max" count after the opcode.
139 * This is followed by a fourth "current count" word that is
140 * only used by REPX, as it implements a recursive match.
141 * REPMIN and REPXMIN are identical except they implement minimal repeats.
143 * OPEN,CLOSE ...are numbered at compile time.
147 * A node is one word of opcode followed by one word of "next" pointer.
148 * The "next" pointer value is a positive offset from the opcode of the node
149 * containing it.
150 * An operand, if any, simply follows the node. (Note that much of the
151 * code generation knows about this implicit relationship.)
153 #define OP(preg, p) (preg->program[p])
154 #define NEXT(preg, p) (preg->program[p + 1])
155 #define OPERAND(p) ((p) + 2)
158 * See regmagic.h for one further detail of program structure.
163 * Utility definitions.
166 #define FAIL(R,M) { (R)->err = (M); return (M); }
167 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
168 #define META "^$.[()|?{+*"
171 * Flags to be passed up and down.
173 #define HASWIDTH 1 /* Known never to match null string. */
174 #define SIMPLE 2 /* Simple enough to be STAR/PLUS operand. */
175 #define SPSTART 4 /* Starts with * or +. */
176 #define WORST 0 /* Worst case. */
178 #define MAX_REP_COUNT 1000000
181 * Forward declarations for regcomp()'s friends.
183 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
184 static int regpiece(regex_t *preg, int *flagp );
185 static int regbranch(regex_t *preg, int *flagp );
186 static int regatom(regex_t *preg, int *flagp );
187 static int regnode(regex_t *preg, int op );
188 static int regnext(regex_t *preg, int p );
189 static void regc(regex_t *preg, int b );
190 static int reginsert(regex_t *preg, int op, int size, int opnd );
191 static void regtail(regex_t *preg, int p, int val);
192 static void regoptail(regex_t *preg, int p, int val );
193 static int regopsize(regex_t *preg, int p );
195 static int reg_range_find(const int *string, int c);
196 static const char *str_find(const char *string, int c, int nocase);
197 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
199 /*#define DEBUG*/
200 #ifdef DEBUG
201 static int regnarrate = 0;
202 static void regdump(regex_t *preg);
203 static const char *regprop( int op );
204 #endif
208 * Returns the length of the null-terminated integer sequence.
210 static int str_int_len(const int *seq)
212 int n = 0;
213 while (*seq++) {
214 n++;
216 return n;
220 - regcomp - compile a regular expression into internal code
222 * We can't allocate space until we know how big the compiled form will be,
223 * but we can't compile it (and thus know how big it is) until we've got a
224 * place to put the code. So we cheat: we compile it twice, once with code
225 * generation turned off and size counting turned on, and once "for real".
226 * This also means that we don't allocate space until we are sure that the
227 * thing really will compile successfully, and we never have to move the
228 * code and thus invalidate pointers into it. (Note that it has to be in
229 * one piece because free() must be able to free it all.)
231 * Beware that the optimization-preparation code in here knows about some
232 * of the structure of the compiled regexp.
234 int regcomp(regex_t *preg, const char *exp, int cflags)
236 int scan;
237 int longest;
238 unsigned len;
239 int flags;
241 #ifdef DEBUG
242 fprintf(stderr, "Compiling: '%s'\n", exp);
243 #endif
244 memset(preg, 0, sizeof(*preg));
246 if (exp == NULL)
247 FAIL(preg, REG_ERR_NULL_ARGUMENT);
249 /* First pass: determine size, legality. */
250 preg->cflags = cflags;
251 preg->regparse = exp;
253 /* Allocate space. */
254 preg->proglen = (strlen(exp) + 1) * 5;
255 preg->program = malloc(preg->proglen * sizeof(int));
256 if (preg->program == NULL)
257 FAIL(preg, REG_ERR_NOMEM);
259 /* Note that since we store a magic value as the first item in the program,
260 * program offsets will never be 0
262 regc(preg, REG_MAGIC);
263 if (reg(preg, 0, &flags) == 0) {
264 return preg->err;
267 /* Small enough for pointer-storage convention? */
268 if (preg->re_nsub >= REG_MAX_PAREN) /* Probably could be 65535L. */
269 FAIL(preg,REG_ERR_TOO_BIG);
271 /* Dig out information for optimizations. */
272 preg->regstart = 0; /* Worst-case defaults. */
273 preg->reganch = 0;
274 preg->regmust = 0;
275 preg->regmlen = 0;
276 scan = 1; /* First BRANCH. */
277 if (OP(preg, regnext(preg, scan)) == END) { /* Only one top-level choice. */
278 scan = OPERAND(scan);
280 /* Starting-point info. */
281 if (OP(preg, scan) == EXACTLY) {
282 preg->regstart = preg->program[OPERAND(scan)];
284 else if (OP(preg, scan) == BOL)
285 preg->reganch++;
288 * If there's something expensive in the r.e., find the
289 * longest literal string that must appear and make it the
290 * regmust. Resolve ties in favor of later strings, since
291 * the regstart check works with the beginning of the r.e.
292 * and avoiding duplication strengthens checking. Not a
293 * strong reason, but sufficient in the absence of others.
295 if (flags&SPSTART) {
296 longest = 0;
297 len = 0;
298 for (; scan != 0; scan = regnext(preg, scan)) {
299 if (OP(preg, scan) == EXACTLY) {
300 int plen = str_int_len(preg->program + OPERAND(scan));
301 if (plen >= len) {
302 longest = OPERAND(scan);
303 len = plen;
307 preg->regmust = longest;
308 preg->regmlen = len;
312 #ifdef DEBUG
313 regdump(preg);
314 #endif
316 return 0;
320 - reg - regular expression, i.e. main body or parenthesized thing
322 * Caller must absorb opening parenthesis.
324 * Combining parenthesis handling with the base level of regular expression
325 * is a trifle forced, but the need to tie the tails of the branches to what
326 * follows makes it hard to avoid.
328 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
330 int ret;
331 int br;
332 int ender;
333 int parno = 0;
334 int flags;
336 *flagp = HASWIDTH; /* Tentatively. */
338 /* Make an OPEN node, if parenthesized. */
339 if (paren) {
340 if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
341 /* non-capturing paren */
342 preg->regparse += 2;
343 parno = -1;
345 else {
346 parno = ++preg->re_nsub;
348 ret = regnode(preg, OPEN+parno);
349 } else
350 ret = 0;
352 /* Pick up the branches, linking them together. */
353 br = regbranch(preg, &flags);
354 if (br == 0)
355 return 0;
356 if (ret != 0)
357 regtail(preg, ret, br); /* OPEN -> first. */
358 else
359 ret = br;
360 if (!(flags&HASWIDTH))
361 *flagp &= ~HASWIDTH;
362 *flagp |= flags&SPSTART;
363 while (*preg->regparse == '|') {
364 preg->regparse++;
365 br = regbranch(preg, &flags);
366 if (br == 0)
367 return 0;
368 regtail(preg, ret, br); /* BRANCH -> BRANCH. */
369 if (!(flags&HASWIDTH))
370 *flagp &= ~HASWIDTH;
371 *flagp |= flags&SPSTART;
374 /* Make a closing node, and hook it on the end. */
375 ender = regnode(preg, (paren) ? CLOSE+parno : END);
376 regtail(preg, ret, ender);
378 /* Hook the tails of the branches to the closing node. */
379 for (br = ret; br != 0; br = regnext(preg, br))
380 regoptail(preg, br, ender);
382 /* Check for proper termination. */
383 if (paren && *preg->regparse++ != ')') {
384 preg->err = REG_ERR_UNMATCHED_PAREN;
385 return 0;
386 } else if (!paren && *preg->regparse != '\0') {
387 if (*preg->regparse == ')') {
388 preg->err = REG_ERR_UNMATCHED_PAREN;
389 return 0;
390 } else {
391 preg->err = REG_ERR_JUNK_ON_END;
392 return 0;
396 return(ret);
400 - regbranch - one alternative of an | operator
402 * Implements the concatenation operator.
404 static int regbranch(regex_t *preg, int *flagp )
406 int ret;
407 int chain;
408 int latest;
409 int flags;
411 *flagp = WORST; /* Tentatively. */
413 ret = regnode(preg, BRANCH);
414 chain = 0;
415 while (*preg->regparse != '\0' && *preg->regparse != ')' &&
416 *preg->regparse != '|') {
417 latest = regpiece(preg, &flags);
418 if (latest == 0)
419 return 0;
420 *flagp |= flags&HASWIDTH;
421 if (chain == 0) {/* First piece. */
422 *flagp |= flags&SPSTART;
424 else {
425 regtail(preg, chain, latest);
427 chain = latest;
429 if (chain == 0) /* Loop ran zero times. */
430 (void) regnode(preg, NOTHING);
432 return(ret);
436 - regpiece - something followed by possible [*+?]
438 * Note that the branching code sequences used for ? and the general cases
439 * of * and + are somewhat optimized: they use the same NOTHING node as
440 * both the endmarker for their branch list and the body of the last branch.
441 * It might seem that this node could be dispensed with entirely, but the
442 * endmarker role is not redundant.
444 static int regpiece(regex_t *preg, int *flagp)
446 int ret;
447 char op;
448 int next;
449 int flags;
450 int min;
451 int max;
453 ret = regatom(preg, &flags);
454 if (ret == 0)
455 return 0;
457 op = *preg->regparse;
458 if (!ISMULT(op)) {
459 *flagp = flags;
460 return(ret);
463 if (!(flags&HASWIDTH) && op != '?') {
464 preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
465 return 0;
468 /* Handle braces (counted repetition) by expansion */
469 if (op == '{') {
470 char *end;
472 min = strtoul(preg->regparse + 1, &end, 10);
473 if (end == preg->regparse + 1) {
474 preg->err = REG_ERR_BAD_COUNT;
475 return 0;
477 if (*end == '}') {
478 max = min;
480 else {
481 preg->regparse = end;
482 max = strtoul(preg->regparse + 1, &end, 10);
483 if (*end != '}') {
484 preg->err = REG_ERR_UNMATCHED_BRACES;
485 return 0;
488 if (end == preg->regparse + 1) {
489 max = MAX_REP_COUNT;
491 else if (max < min || max >= 100) {
492 preg->err = REG_ERR_BAD_COUNT;
493 return 0;
495 if (min >= 100) {
496 preg->err = REG_ERR_BAD_COUNT;
497 return 0;
500 preg->regparse = strchr(preg->regparse, '}');
502 else {
503 min = (op == '+');
504 max = (op == '?' ? 1 : MAX_REP_COUNT);
507 if (preg->regparse[1] == '?') {
508 preg->regparse++;
509 next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
511 else {
512 next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
514 preg->program[ret + 2] = max;
515 preg->program[ret + 3] = min;
516 preg->program[ret + 4] = 0;
518 *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
520 if (!(flags & SIMPLE)) {
521 int back = regnode(preg, BACK);
522 regtail(preg, back, ret);
523 regtail(preg, next, back);
526 preg->regparse++;
527 if (ISMULT(*preg->regparse)) {
528 preg->err = REG_ERR_NESTED_COUNT;
529 return 0;
532 return ret;
536 * Add all characters in the inclusive range between lower and upper.
538 * Handles a swapped range (upper < lower).
540 static void reg_addrange(regex_t *preg, int lower, int upper)
542 if (lower > upper) {
543 reg_addrange(preg, upper, lower);
545 /* Add a range as length, start */
546 regc(preg, upper - lower + 1);
547 regc(preg, lower);
551 * Add a null-terminated literal string as a set of ranges.
553 static void reg_addrange_str(regex_t *preg, const char *str)
555 while (*str) {
556 reg_addrange(preg, *str, *str);
557 str++;
562 * Extracts the next unicode char from utf8.
564 * If 'upper' is set, converts the char to uppercase.
566 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
568 int l = utf8_tounicode(s, uc);
569 if (upper) {
570 *uc = utf8_upper(*uc);
572 return l;
576 * Converts a hex digit to decimal.
578 * Returns -1 for an invalid hex digit.
580 static int hexdigitval(int c)
582 if (c >= '0' && c <= '9')
583 return c - '0';
584 if (c >= 'a' && c <= 'f')
585 return c - 'a' + 10;
586 if (c >= 'A' && c <= 'F')
587 return c - 'A' + 10;
588 return -1;
592 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
594 * Returns the number of hex digits parsed.
595 * If there are no hex digits, returns 0 and stores nothing.
597 static int parse_hex(const char *s, int n, int *uc)
599 int val = 0;
600 int k;
602 for (k = 0; k < n; k++) {
603 int c = hexdigitval(*s++);
604 if (c == -1) {
605 break;
607 val = (val << 4) | c;
609 if (k) {
610 *uc = val;
612 return k;
616 * Call for chars after a backlash to decode the escape sequence.
618 * Stores the result in *ch.
620 * Returns the number of bytes consumed.
622 static int reg_decode_escape(const char *s, int *ch)
624 int n;
625 const char *s0 = s;
627 *ch = *s++;
629 switch (*ch) {
630 case 'b': *ch = '\b'; break;
631 case 'e': *ch = 27; break;
632 case 'f': *ch = '\f'; break;
633 case 'n': *ch = '\n'; break;
634 case 'r': *ch = '\r'; break;
635 case 't': *ch = '\t'; break;
636 case 'v': *ch = '\v'; break;
637 case 'u':
638 if (*s == '{') {
639 /* Expect \u{NNNN} */
640 n = parse_hex(s + 1, 6, ch);
641 if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
642 s += n + 2;
644 else {
645 /* Invalid, so just treat as an escaped 'u' */
646 *ch = 'u';
649 else if ((n = parse_hex(s, 4, ch)) > 0) {
650 s += n;
652 break;
653 case 'U':
654 if ((n = parse_hex(s, 8, ch)) > 0) {
655 s += n;
657 break;
658 case 'x':
659 if ((n = parse_hex(s, 2, ch)) > 0) {
660 s += n;
662 break;
663 case '\0':
664 s--;
665 *ch = '\\';
666 break;
668 return s - s0;
672 - regatom - the lowest level
674 * Optimization: gobbles an entire sequence of ordinary characters so that
675 * it can turn them into a single node, which is smaller to store and
676 * faster to run. Backslashed characters are exceptions, each becoming a
677 * separate node; the code is simpler that way and it's not worth fixing.
679 static int regatom(regex_t *preg, int *flagp)
681 int ret;
682 int flags;
683 int nocase = (preg->cflags & REG_ICASE);
685 int ch;
686 int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
688 *flagp = WORST; /* Tentatively. */
690 preg->regparse += n;
691 switch (ch) {
692 /* FIXME: these chars only have meaning at beg/end of pat? */
693 case '^':
694 ret = regnode(preg, BOL);
695 break;
696 case '$':
697 ret = regnode(preg, EOL);
698 break;
699 case '.':
700 ret = regnode(preg, ANY);
701 *flagp |= HASWIDTH|SIMPLE;
702 break;
703 case '[': {
704 const char *pattern = preg->regparse;
706 if (*pattern == '^') { /* Complement of range. */
707 ret = regnode(preg, ANYBUT);
708 pattern++;
709 } else
710 ret = regnode(preg, ANYOF);
712 /* Special case. If the first char is ']' or '-', it is part of the set */
713 if (*pattern == ']' || *pattern == '-') {
714 reg_addrange(preg, *pattern, *pattern);
715 pattern++;
718 while (*pattern && *pattern != ']') {
719 /* Is this a range? a-z */
720 int start;
721 int end;
723 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
724 if (start == '\\') {
725 pattern += reg_decode_escape(pattern, &start);
726 if (start == 0) {
727 preg->err = REG_ERR_NULL_CHAR;
728 return 0;
731 if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
732 /* skip '-' */
733 pattern += utf8_tounicode(pattern, &end);
734 pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
735 if (end == '\\') {
736 pattern += reg_decode_escape(pattern, &end);
737 if (end == 0) {
738 preg->err = REG_ERR_NULL_CHAR;
739 return 0;
743 reg_addrange(preg, start, end);
744 continue;
746 if (start == '[' && pattern[0] == ':') {
747 static const char *character_class[] = {
748 ":alpha:", ":alnum:", ":space:", ":blank:", ":upper:", ":lower:",
749 ":digit:", ":xdigit:", ":cntrl:", ":graph:", ":print:", ":punct:",
751 enum {
752 CC_ALPHA, CC_ALNUM, CC_SPACE, CC_BLANK, CC_UPPER, CC_LOWER,
753 CC_DIGIT, CC_XDIGIT, CC_CNTRL, CC_GRAPH, CC_PRINT, CC_PUNCT,
754 CC_NUM
756 int i;
758 for (i = 0; i < CC_NUM; i++) {
759 int n = strlen(character_class[i]);
760 if (strncmp(pattern, character_class[i], n) == 0) {
761 /* Found a character class */
762 pattern += n + 1;
763 break;
766 if (i != CC_NUM) {
767 switch (i) {
768 case CC_ALNUM:
769 reg_addrange(preg, '0', '9');
770 /* Fall through */
771 case CC_ALPHA:
772 if ((preg->cflags & REG_ICASE) == 0) {
773 reg_addrange(preg, 'a', 'z');
775 reg_addrange(preg, 'A', 'Z');
776 break;
777 case CC_SPACE:
778 reg_addrange_str(preg, " \t\r\n\f\v");
779 break;
780 case CC_BLANK:
781 reg_addrange_str(preg, " \t");
782 break;
783 case CC_UPPER:
784 reg_addrange(preg, 'A', 'Z');
785 break;
786 case CC_LOWER:
787 reg_addrange(preg, 'a', 'z');
788 break;
789 case CC_XDIGIT:
790 reg_addrange(preg, 'a', 'f');
791 reg_addrange(preg, 'A', 'F');
792 /* Fall through */
793 case CC_DIGIT:
794 reg_addrange(preg, '0', '9');
795 break;
796 case CC_CNTRL:
797 reg_addrange(preg, 0, 31);
798 reg_addrange(preg, 127, 127);
799 break;
800 case CC_PRINT:
801 reg_addrange(preg, ' ', '~');
802 break;
803 case CC_GRAPH:
804 reg_addrange(preg, '!', '~');
805 break;
806 case CC_PUNCT:
807 reg_addrange(preg, '!', '/');
808 reg_addrange(preg, ':', '@');
809 reg_addrange(preg, '[', '`');
810 reg_addrange(preg, '{', '~');
811 break;
813 continue;
816 /* Not a range, so just add the char */
817 reg_addrange(preg, start, start);
819 regc(preg, '\0');
821 if (*pattern) {
822 pattern++;
824 preg->regparse = pattern;
826 *flagp |= HASWIDTH|SIMPLE;
828 break;
829 case '(':
830 ret = reg(preg, 1, &flags);
831 if (ret == 0)
832 return 0;
833 *flagp |= flags&(HASWIDTH|SPSTART);
834 break;
835 case '\0':
836 case '|':
837 case ')':
838 preg->err = REG_ERR_INTERNAL;
839 return 0; /* Supposed to be caught earlier. */
840 case '?':
841 case '+':
842 case '*':
843 case '{':
844 preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
845 return 0;
846 case '\\':
847 ch = *preg->regparse++;
848 switch (ch) {
849 case '\0':
850 preg->err = REG_ERR_TRAILING_BACKSLASH;
851 return 0;
852 case 'A':
853 ret = regnode(preg, BOLX);
854 break;
855 case 'Z':
856 ret = regnode(preg, EOLX);
857 break;
858 case '<':
859 case 'm':
860 ret = regnode(preg, WORDA);
861 break;
862 case '>':
863 case 'M':
864 ret = regnode(preg, WORDZ);
865 break;
866 case 'd':
867 case 'D':
868 ret = regnode(preg, ch == 'd' ? ANYOF : ANYBUT);
869 reg_addrange(preg, '0', '9');
870 regc(preg, '\0');
871 *flagp |= HASWIDTH|SIMPLE;
872 break;
873 case 'w':
874 case 'W':
875 ret = regnode(preg, ch == 'w' ? ANYOF : ANYBUT);
876 if ((preg->cflags & REG_ICASE) == 0) {
877 reg_addrange(preg, 'a', 'z');
879 reg_addrange(preg, 'A', 'Z');
880 reg_addrange(preg, '0', '9');
881 reg_addrange(preg, '_', '_');
882 regc(preg, '\0');
883 *flagp |= HASWIDTH|SIMPLE;
884 break;
885 case 's':
886 case 'S':
887 ret = regnode(preg, ch == 's' ? ANYOF : ANYBUT);
888 reg_addrange_str(preg," \t\r\n\f\v");
889 regc(preg, '\0');
890 *flagp |= HASWIDTH|SIMPLE;
891 break;
892 /* FIXME: Someday handle \1, \2, ... */
893 default:
894 /* Handle general quoted chars in exact-match routine */
895 /* Back up to include the backslash */
896 preg->regparse--;
897 goto de_fault;
899 break;
900 de_fault:
901 default: {
903 * Encode a string of characters to be matched exactly.
905 int added = 0;
907 /* Back up to pick up the first char of interest */
908 preg->regparse -= n;
910 ret = regnode(preg, EXACTLY);
912 /* Note that a META operator such as ? or * consumes the
913 * preceding char.
914 * Thus we must be careful to look ahead by 2 and add the
915 * last char as it's own EXACTLY if necessary
918 /* Until end of string or a META char is reached */
919 while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
920 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
921 if (ch == '\\' && preg->regparse[n]) {
922 /* Non-trailing backslash.
923 * Is this a special escape, or a regular escape?
925 if (strchr("<>mMwWdDsSAZ", preg->regparse[n])) {
926 /* A special escape. All done with EXACTLY */
927 break;
929 /* Decode it. Note that we add the length for the escape
930 * sequence to the length for the backlash so we can skip
931 * the entire sequence, or not as required.
933 n += reg_decode_escape(preg->regparse + n, &ch);
934 if (ch == 0) {
935 preg->err = REG_ERR_NULL_CHAR;
936 return 0;
940 /* Now we have one char 'ch' of length 'n'.
941 * Check to see if the following char is a MULT
944 if (ISMULT(preg->regparse[n])) {
945 /* Yes. But do we already have some EXACTLY chars? */
946 if (added) {
947 /* Yes, so return what we have and pick up the current char next time around */
948 break;
950 /* No, so add this single char and finish */
951 regc(preg, ch);
952 added++;
953 preg->regparse += n;
954 break;
957 /* No, so just add this char normally */
958 regc(preg, ch);
959 added++;
960 preg->regparse += n;
962 regc(preg, '\0');
964 *flagp |= HASWIDTH;
965 if (added == 1)
966 *flagp |= SIMPLE;
967 break;
969 break;
972 return(ret);
975 static void reg_grow(regex_t *preg, int n)
977 if (preg->p + n >= preg->proglen) {
978 preg->proglen = (preg->p + n) * 2;
979 preg->program = realloc(preg->program, preg->proglen * sizeof(int));
984 - regnode - emit a node
986 /* Location. */
987 static int regnode(regex_t *preg, int op)
989 reg_grow(preg, 2);
991 /* The OP followed by a next pointer */
992 preg->program[preg->p++] = op;
993 preg->program[preg->p++] = 0;
995 /* Return the start of the node */
996 return preg->p - 2;
1000 - regc - emit (if appropriate) a byte of code
1002 static void regc(regex_t *preg, int b )
1004 reg_grow(preg, 1);
1005 preg->program[preg->p++] = b;
1009 - reginsert - insert an operator in front of already-emitted operand
1011 * Means relocating the operand.
1012 * Returns the new location of the original operand.
1014 static int reginsert(regex_t *preg, int op, int size, int opnd )
1016 reg_grow(preg, size);
1018 /* Move everything from opnd up */
1019 memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd));
1020 /* Zero out the new space */
1021 memset(preg->program + opnd, 0, sizeof(int) * size);
1023 preg->program[opnd] = op;
1025 preg->p += size;
1027 return opnd + size;
1031 - regtail - set the next-pointer at the end of a node chain
1033 static void regtail(regex_t *preg, int p, int val)
1035 int scan;
1036 int temp;
1037 int offset;
1039 /* Find last node. */
1040 scan = p;
1041 for (;;) {
1042 temp = regnext(preg, scan);
1043 if (temp == 0)
1044 break;
1045 scan = temp;
1048 if (OP(preg, scan) == BACK)
1049 offset = scan - val;
1050 else
1051 offset = val - scan;
1053 preg->program[scan + 1] = offset;
1057 - regoptail - regtail on operand of first argument; nop if operandless
1060 static void regoptail(regex_t *preg, int p, int val )
1062 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1063 if (p != 0 && OP(preg, p) == BRANCH) {
1064 regtail(preg, OPERAND(p), val);
1069 * regexec and friends
1073 * Forwards.
1075 static int regtry(regex_t *preg, const char *string );
1076 static int regmatch(regex_t *preg, int prog);
1077 static int regrepeat(regex_t *preg, int p, int max);
1080 - regexec - match a regexp against a string
1082 int regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1084 const char *s;
1085 int scan;
1087 /* Be paranoid... */
1088 if (preg == NULL || preg->program == NULL || string == NULL) {
1089 return REG_ERR_NULL_ARGUMENT;
1092 /* Check validity of program. */
1093 if (*preg->program != REG_MAGIC) {
1094 return REG_ERR_CORRUPTED;
1097 #ifdef DEBUG
1098 fprintf(stderr, "regexec: %s\n", string);
1099 regdump(preg);
1100 #endif
1102 preg->eflags = eflags;
1103 preg->pmatch = pmatch;
1104 preg->nmatch = nmatch;
1105 preg->start = string; /* All offsets are computed from here */
1107 /* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */
1108 for (scan = OPERAND(1); scan != 0; scan += regopsize(preg, scan)) {
1109 int op = OP(preg, scan);
1110 if (op == END)
1111 break;
1112 if (op == REPX || op == REPXMIN)
1113 preg->program[scan + 4] = 0;
1116 /* If there is a "must appear" string, look for it. */
1117 if (preg->regmust != 0) {
1118 s = string;
1119 while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
1120 if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1121 break;
1123 s++;
1125 if (s == NULL) /* Not present. */
1126 return REG_NOMATCH;
1129 /* Mark beginning of line for ^ . */
1130 preg->regbol = string;
1132 /* Simplest case: anchored match need be tried only once (maybe per line). */
1133 if (preg->reganch) {
1134 if (eflags & REG_NOTBOL) {
1135 /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1136 goto nextline;
1138 while (1) {
1139 if (regtry(preg, string)) {
1140 return REG_NOERROR;
1142 if (*string) {
1143 nextline:
1144 if (preg->cflags & REG_NEWLINE) {
1145 /* Try the next anchor? */
1146 string = strchr(string, '\n');
1147 if (string) {
1148 preg->regbol = ++string;
1149 continue;
1153 return REG_NOMATCH;
1157 /* Messy cases: unanchored match. */
1158 s = string;
1159 if (preg->regstart != '\0') {
1160 /* We know what char it must start with. */
1161 while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1162 if (regtry(preg, s))
1163 return REG_NOERROR;
1164 s++;
1167 else
1168 /* We don't -- general case. */
1169 while (1) {
1170 if (regtry(preg, s))
1171 return REG_NOERROR;
1172 if (*s == '\0') {
1173 break;
1175 else {
1176 int c;
1177 s += utf8_tounicode(s, &c);
1181 /* Failure. */
1182 return REG_NOMATCH;
1186 - regtry - try match at specific point
1188 /* 0 failure, 1 success */
1189 static int regtry( regex_t *preg, const char *string )
1191 int i;
1193 preg->reginput = string;
1195 for (i = 0; i < preg->nmatch; i++) {
1196 preg->pmatch[i].rm_so = -1;
1197 preg->pmatch[i].rm_eo = -1;
1199 if (regmatch(preg, 1)) {
1200 preg->pmatch[0].rm_so = string - preg->start;
1201 preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1202 return(1);
1203 } else
1204 return(0);
1208 * Returns bytes matched if 'pattern' is a prefix of 'string'.
1210 * If 'nocase' is non-zero, does a case-insensitive match.
1212 * Returns -1 on not found.
1214 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1216 const char *s = string;
1217 while (proglen && *s) {
1218 int ch;
1219 int n = reg_utf8_tounicode_case(s, &ch, nocase);
1220 if (ch != *prog) {
1221 return -1;
1223 prog++;
1224 s += n;
1225 proglen--;
1227 if (proglen == 0) {
1228 return s - string;
1230 return -1;
1234 * Searchs for 'c' in the range 'range'.
1236 * Returns 1 if found, or 0 if not.
1238 static int reg_range_find(const int *range, int c)
1240 while (*range) {
1241 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1242 if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1243 return 1;
1245 range += 2;
1247 return 0;
1251 * Search for the character 'c' in the utf-8 string 'string'.
1253 * If 'nocase' is set, the 'string' is assumed to be uppercase
1254 * and 'c' is converted to uppercase before matching.
1256 * Returns the byte position in the string where the 'c' was found, or
1257 * NULL if not found.
1259 static const char *str_find(const char *string, int c, int nocase)
1261 if (nocase) {
1262 /* The "string" should already be converted to uppercase */
1263 c = utf8_upper(c);
1265 while (*string) {
1266 int ch;
1267 int n = reg_utf8_tounicode_case(string, &ch, nocase);
1268 if (c == ch) {
1269 return string;
1271 string += n;
1273 return NULL;
1277 * Returns true if 'ch' is an end-of-line char.
1279 * In REG_NEWLINE mode, \n is considered EOL in
1280 * addition to \0
1282 static int reg_iseol(regex_t *preg, int ch)
1284 if (preg->cflags & REG_NEWLINE) {
1285 return ch == '\0' || ch == '\n';
1287 else {
1288 return ch == '\0';
1292 static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
1294 int nextch = '\0';
1295 const char *save;
1296 int no;
1297 int c;
1299 int max = preg->program[scan + 2];
1300 int min = preg->program[scan + 3];
1301 int next = regnext(preg, scan);
1304 * Lookahead to avoid useless match attempts
1305 * when we know what character comes next.
1307 if (OP(preg, next) == EXACTLY) {
1308 nextch = preg->program[OPERAND(next)];
1310 save = preg->reginput;
1311 no = regrepeat(preg, scan + 5, max);
1312 if (no < min) {
1313 return 0;
1315 if (matchmin) {
1316 /* from min up to no */
1317 max = no;
1318 no = min;
1320 /* else from no down to min */
1321 while (1) {
1322 if (matchmin) {
1323 if (no > max) {
1324 break;
1327 else {
1328 if (no < min) {
1329 break;
1332 preg->reginput = save + utf8_index(save, no);
1333 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1334 /* If it could work, try it. */
1335 if (reg_iseol(preg, nextch) || c == nextch) {
1336 if (regmatch(preg, next)) {
1337 return(1);
1340 if (matchmin) {
1341 /* Couldn't or didn't, add one more */
1342 no++;
1344 else {
1345 /* Couldn't or didn't -- back up. */
1346 no--;
1349 return(0);
1352 static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
1354 int *scanpt = preg->program + scan;
1356 int max = scanpt[2];
1357 int min = scanpt[3];
1359 /* Have we reached min? */
1360 if (scanpt[4] < min) {
1361 /* No, so get another one */
1362 scanpt[4]++;
1363 if (regmatch(preg, scan + 5)) {
1364 return 1;
1366 scanpt[4]--;
1367 return 0;
1369 if (scanpt[4] > max) {
1370 return 0;
1373 if (matchmin) {
1374 /* minimal, so try other branch first */
1375 if (regmatch(preg, regnext(preg, scan))) {
1376 return 1;
1378 /* No, so try one more */
1379 scanpt[4]++;
1380 if (regmatch(preg, scan + 5)) {
1381 return 1;
1383 scanpt[4]--;
1384 return 0;
1386 /* maximal, so try this branch again */
1387 if (scanpt[4] < max) {
1388 scanpt[4]++;
1389 if (regmatch(preg, scan + 5)) {
1390 return 1;
1392 scanpt[4]--;
1394 /* At this point we are at max with no match. Try the other branch */
1395 return regmatch(preg, regnext(preg, scan));
1399 - regmatch - main matching routine
1401 * Conceptually the strategy is simple: check to see whether the current
1402 * node matches, call self recursively to see whether the rest matches,
1403 * and then act accordingly. In practice we make some effort to avoid
1404 * recursion, in particular by going through "ordinary" nodes (that don't
1405 * need to know whether the rest of the match failed) by a loop instead of
1406 * by recursion.
1408 /* 0 failure, 1 success */
1409 static int regmatch(regex_t *preg, int prog)
1411 int scan; /* Current node. */
1412 int next; /* Next node. */
1413 const char *save;
1415 scan = prog;
1417 #ifdef DEBUG
1418 if (scan != 0 && regnarrate)
1419 fprintf(stderr, "%s(\n", regprop(scan));
1420 #endif
1421 while (scan != 0) {
1422 int n;
1423 int c;
1424 #ifdef DEBUG
1425 if (regnarrate) {
1426 fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */
1428 #endif
1429 next = regnext(preg, scan);
1430 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1432 switch (OP(preg, scan)) {
1433 case BOLX:
1434 if ((preg->eflags & REG_NOTBOL)) {
1435 return(0);
1437 /* Fall through */
1438 case BOL:
1439 if (preg->reginput != preg->regbol) {
1440 return(0);
1442 break;
1443 case EOLX:
1444 if (c != 0) {
1445 /* For EOLX, only match real end of line, not newline */
1446 return 0;
1448 break;
1449 case EOL:
1450 if (!reg_iseol(preg, c)) {
1451 return(0);
1453 break;
1454 case WORDA:
1455 /* Must be looking at a letter, digit, or _ */
1456 if ((!isalnum(UCHAR(c))) && c != '_')
1457 return(0);
1458 /* Prev must be BOL or nonword */
1459 if (preg->reginput > preg->regbol &&
1460 (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1461 return(0);
1462 break;
1463 case WORDZ:
1464 /* Can't match at BOL */
1465 if (preg->reginput > preg->regbol) {
1466 /* Current must be EOL or nonword */
1467 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1468 c = preg->reginput[-1];
1469 /* Previous must be word */
1470 if (isalnum(UCHAR(c)) || c == '_') {
1471 break;
1475 /* No */
1476 return(0);
1478 case ANY:
1479 if (reg_iseol(preg, c))
1480 return 0;
1481 preg->reginput += n;
1482 break;
1483 case EXACTLY: {
1484 int opnd;
1485 int len;
1486 int slen;
1488 opnd = OPERAND(scan);
1489 len = str_int_len(preg->program + opnd);
1491 slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1492 if (slen < 0) {
1493 return(0);
1495 preg->reginput += slen;
1497 break;
1498 case ANYOF:
1499 if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
1500 return(0);
1502 preg->reginput += n;
1503 break;
1504 case ANYBUT:
1505 if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
1506 return(0);
1508 preg->reginput += n;
1509 break;
1510 case NOTHING:
1511 break;
1512 case BACK:
1513 break;
1514 case BRANCH:
1515 if (OP(preg, next) != BRANCH) /* No choice. */
1516 next = OPERAND(scan); /* Avoid recursion. */
1517 else {
1518 do {
1519 save = preg->reginput;
1520 if (regmatch(preg, OPERAND(scan))) {
1521 return(1);
1523 preg->reginput = save;
1524 scan = regnext(preg, scan);
1525 } while (scan != 0 && OP(preg, scan) == BRANCH);
1526 return(0);
1527 /* NOTREACHED */
1529 break;
1530 case REP:
1531 case REPMIN:
1532 return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1534 case REPX:
1535 case REPXMIN:
1536 return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1538 case END:
1539 return 1; /* Success! */
1541 case OPENNC:
1542 case CLOSENC:
1543 return regmatch(preg, next);
1545 default:
1546 if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
1547 save = preg->reginput;
1548 if (regmatch(preg, next)) {
1549 if (OP(preg, scan) < CLOSE) {
1550 int no = OP(preg, scan) - OPEN;
1551 if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
1552 preg->pmatch[no].rm_so = save - preg->start;
1555 else {
1556 int no = OP(preg, scan) - CLOSE;
1557 if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
1558 preg->pmatch[no].rm_eo = save - preg->start;
1561 return(1);
1563 return(0);
1565 return REG_ERR_INTERNAL;
1568 scan = next;
1572 * We get here only if there's trouble -- normally "case END" is
1573 * the terminating point.
1575 return REG_ERR_INTERNAL;
1579 - regrepeat - repeatedly match something simple, report how many
1581 static int regrepeat(regex_t *preg, int p, int max)
1583 int count = 0;
1584 const char *scan;
1585 int opnd;
1586 int ch;
1587 int n;
1589 scan = preg->reginput;
1590 opnd = OPERAND(p);
1591 switch (OP(preg, p)) {
1592 case ANY:
1593 /* No need to handle utf8 specially here */
1594 while (!reg_iseol(preg, *scan) && count < max) {
1595 count++;
1596 scan++;
1598 break;
1599 case EXACTLY:
1600 while (count < max) {
1601 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1602 if (preg->program[opnd] != ch) {
1603 break;
1605 count++;
1606 scan += n;
1608 break;
1609 case ANYOF:
1610 while (count < max) {
1611 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1612 if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) {
1613 break;
1615 count++;
1616 scan += n;
1618 break;
1619 case ANYBUT:
1620 while (count < max) {
1621 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1622 if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) {
1623 break;
1625 count++;
1626 scan += n;
1628 break;
1629 default: /* Oh dear. Called inappropriately. */
1630 preg->err = REG_ERR_INTERNAL;
1631 count = 0; /* Best compromise. */
1632 break;
1634 preg->reginput = scan;
1636 return(count);
1640 - regnext - dig the "next" pointer out of a node
1642 static int regnext(regex_t *preg, int p )
1644 int offset;
1646 offset = NEXT(preg, p);
1648 if (offset == 0)
1649 return 0;
1651 if (OP(preg, p) == BACK)
1652 return(p-offset);
1653 else
1654 return(p+offset);
1658 - regopsize - returns the size of opcode + operands at 'p' in words
1660 static int regopsize(regex_t *preg, int p )
1662 /* Almost all opcodes are 2 words, but some are more */
1663 switch (OP(preg, p)) {
1664 case REP:
1665 case REPMIN:
1666 case REPX:
1667 case REPXMIN:
1668 return 5;
1670 case ANYOF:
1671 case ANYBUT:
1672 case EXACTLY: {
1673 int s = p + 2;
1674 while (preg->program[s++]) {
1676 return s - p;
1679 return 2;
1682 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1685 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1687 static void regdump(regex_t *preg)
1689 int s;
1690 int op = EXACTLY; /* Arbitrary non-END op. */
1691 int next;
1692 char buf[MAX_UTF8_LEN + 1];
1694 int i;
1695 for (i = 1; i < preg->p; i++) {
1696 printf("%02x ", (unsigned char)preg->program[i]);
1697 if (i % 16 == 0) {
1698 printf("\n");
1701 printf("\n");
1703 s = 1;
1704 while (op != END && s < preg->p) { /* While that wasn't END last time... */
1705 op = OP(preg, s);
1706 printf("%3d: %s", s, regprop(op)); /* Where, what. */
1707 next = regnext(preg, s);
1708 if (next == 0) /* Next ptr. */
1709 printf("(0)");
1710 else
1711 printf("(%d)", next);
1712 s += 2;
1713 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1714 int max = preg->program[s];
1715 int min = preg->program[s + 1];
1716 if (max == 65535) {
1717 printf("{%d,*}", min);
1719 else {
1720 printf("{%d,%d}", min, max);
1722 printf(" %d", preg->program[s + 2]);
1723 s += 3;
1725 else if (op == ANYOF || op == ANYBUT) {
1726 /* set of ranges */
1728 while (preg->program[s]) {
1729 int len = preg->program[s++];
1730 int first = preg->program[s++];
1731 buf[utf8_getchars(buf, first)] = 0;
1732 printf("%s", buf);
1733 if (len > 1) {
1734 buf[utf8_getchars(buf, first + len - 1)] = 0;
1735 printf("-%s", buf);
1738 s++;
1740 else if (op == EXACTLY) {
1741 /* Literal string, where present. */
1743 while (preg->program[s]) {
1744 buf[utf8_getchars(buf, preg->program[s])] = 0;
1745 printf("%s", buf);
1746 s++;
1748 s++;
1750 putchar('\n');
1753 if (op == END) {
1754 /* Header fields of interest. */
1755 if (preg->regstart) {
1756 buf[utf8_getchars(buf, preg->regstart)] = 0;
1757 printf("start '%s' ", buf);
1759 if (preg->reganch)
1760 printf("anchored ");
1761 if (preg->regmust != 0) {
1762 int i;
1763 printf("must have:");
1764 for (i = 0; i < preg->regmlen; i++) {
1765 putchar(preg->program[preg->regmust + i]);
1767 putchar('\n');
1770 printf("\n");
1774 - regprop - printable representation of opcode
1776 static const char *regprop( int op )
1778 static char buf[50];
1780 switch (op) {
1781 case BOL:
1782 return "BOL";
1783 case EOL:
1784 return "EOL";
1785 case BOLX:
1786 return "BOLX";
1787 case EOLX:
1788 return "EOLX";
1789 case ANY:
1790 return "ANY";
1791 case ANYOF:
1792 return "ANYOF";
1793 case ANYBUT:
1794 return "ANYBUT";
1795 case BRANCH:
1796 return "BRANCH";
1797 case EXACTLY:
1798 return "EXACTLY";
1799 case NOTHING:
1800 return "NOTHING";
1801 case BACK:
1802 return "BACK";
1803 case END:
1804 return "END";
1805 case REP:
1806 return "REP";
1807 case REPMIN:
1808 return "REPMIN";
1809 case REPX:
1810 return "REPX";
1811 case REPXMIN:
1812 return "REPXMIN";
1813 case WORDA:
1814 return "WORDA";
1815 case WORDZ:
1816 return "WORDZ";
1817 case OPENNC:
1818 return "OPEN";
1819 case CLOSENC:
1820 return "CLOSE";
1821 default:
1822 if (op >= OPEN && op < CLOSE) {
1823 snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1825 else if (op >= CLOSE && op < CLOSE_END) {
1826 snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1828 else {
1829 snprintf(buf, sizeof(buf), "?%d?\n", op);
1831 return(buf);
1834 #endif /* JIM_BOOTSTRAP */
1836 size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
1838 static const char *error_strings[] = {
1839 "success",
1840 "no match",
1841 "bad pattern",
1842 "null argument",
1843 "unknown error",
1844 "too big",
1845 "out of memory",
1846 "too many ()",
1847 "parentheses () not balanced",
1848 "braces {} not balanced",
1849 "invalid repetition count(s)",
1850 "extra characters",
1851 "*+ of empty atom",
1852 "nested count",
1853 "internal error",
1854 "count follows nothing",
1855 "trailing backslash",
1856 "corrupted program",
1857 "contains null char",
1859 const char *err;
1861 if (errcode < 0 || errcode >= REG_ERR_NUM) {
1862 err = "Bad error code";
1864 else {
1865 err = error_strings[errcode];
1868 return snprintf(errbuf, errbuf_size, "%s", err);
1871 void regfree(regex_t *preg)
1873 free(preg->program);
1876 #endif