regexp: add partial support for \A \Z matching
[jimtcl.git] / jimregexp.c
blob20a0e838d594bacd6ae28e3007df6b6d05ddf957
1 /*
2 * vi:se ts=8:
4 * regcomp and regexec -- regsub and regerror are elsewhere
6 * Copyright (c) 1986 by University of Toronto.
7 * Written by Henry Spencer. Not derived from licensed software.
9 * Permission is granted to anyone to use this software for any
10 * purpose on any computer system, and to redistribute it freely,
11 * subject to the following restrictions:
13 * 1. The author is not responsible for the consequences of use of
14 * this software, no matter how awful, even if they arise
15 * from defects in it.
17 * 2. The origin of this software must not be misrepresented, either
18 * by explicit claim or by omission.
20 * 3. Altered versions must be plainly marked as such, and must not
21 * be misrepresented as being the original software.
22 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
23 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
24 *** to assist in implementing egrep.
25 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
26 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
27 *** as in BSD grep and ex.
28 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
29 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
30 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
31 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
32 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
33 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
34 *** was moved into regexp.h, and the include of regexp.h now uses "'s
35 *** to avoid conflicting with the system regexp.h. Const, bless its
36 *** soul, was removed so it can compile everywhere. The declaration
37 *** of strchr() was in conflict on AIX, so it was removed (as it is
38 *** happily defined in string.h).
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
41 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
42 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
44 * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
45 * on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
46 * This includes counted repetitions, UTF-8 support, character classes,
47 * shorthand character classes, increased number of parentheses to 100,
48 * backslash escape sequences. It also removes \n as an alternative to |.
50 * Beware that some of this code is subtly aware of the way operator
51 * precedence is structured in regular expressions. Serious changes in
52 * regular-expression syntax might require a total rethink.
55 #include "jimautoconf.h"
57 #if defined(JIM_REGEXP)
58 #include <stdio.h>
59 #include <ctype.h>
60 #include <stdlib.h>
61 #include <string.h>
63 #include "jim.h"
64 #include "jimregexp.h"
65 #include "utf8.h"
67 /* An arbitrary limit, but this seems enough. Must be less than 1000. */
68 #define REG_MAX_PAREN 100
71 * Structure for regexp "program". This is essentially a linear encoding
72 * of a nondeterministic finite-state machine (aka syntax charts or
73 * "railroad normal form" in parsing technology). Each node is an opcode
74 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
75 * all nodes except BRANCH implement concatenation; a "next" pointer with
76 * a BRANCH on both ends of it is connecting two alternatives. (Here we
77 * have one of the subtle syntax dependencies: an individual BRANCH (as
78 * opposed to a collection of them) is never concatenated with anything
79 * because of operator precedence.) The operand of some types of node is
80 * a literal string; for others, it is a node leading into a sub-FSM. In
81 * particular, the operand of a BRANCH node is the first node of the branch.
82 * (NB this is *not* a tree structure: the tail of the branch connects
83 * to the thing following the set of BRANCHes.) The opcodes are:
86 /* definition number opnd? meaning */
87 #define END 0 /* no End of program. */
88 #define BOL 1 /* no Match "" at beginning of line. */
89 #define EOL 2 /* no Match "" at end of line. */
90 #define ANY 3 /* no Match any one character. */
91 #define ANYOF 4 /* str Match any character in this string. */
92 #define ANYBUT 5 /* str Match any character not in this string. */
93 #define BRANCH 6 /* node Match this alternative, or the next... */
94 #define BACK 7 /* no Match "", "next" ptr points backward. */
95 #define EXACTLY 8 /* str Match this string. */
96 #define NOTHING 9 /* no Match empty string. */
97 #define REP 10 /* max,min Match this (simple) thing [min,max] times. */
98 #define REPMIN 11 /* max,min Match this (simple) thing [min,max] times, minimal match. */
99 #define REPX 12 /* max,min Match this (complex) thing [min,max] times. */
100 #define REPXMIN 13 /* max,min Match this (complex) thing [min,max] times, minimal match. */
101 #define BOLX 14 /* no Match "" at beginning of input. */
102 #define EOLX 15 /* no Match "" at end of input. */
103 #define WORDA 16 /* no Match "" at wordchar, where prev is nonword */
104 #define WORDZ 17 /* no Match "" at nonwordchar, where prev is word */
106 #define OPENNC 1000 /* no Non-capturing parentheses - must be OPEN-1 */
107 #define OPEN 1001 /* no Mark this point in input as start of #n. */
108 /* OPEN+1 is number 1, etc. */
110 /* must not be any other opts between OPEN and CLOSE */
112 #define CLOSENC 2000 /* no Non-capturing parentheses - must be CLOSE-1 */
113 #define CLOSE 2001 /* no Analogous to OPEN. */
114 #define CLOSE_END (CLOSE+REG_MAX_PAREN)
117 * The first word of the regexp internal "program" is actually this magic
118 * number; the start node begins in the second word.
120 #define REG_MAGIC 0xFADED00D
123 * Opcode notes:
125 * BRANCH The set of branches constituting a single choice are hooked
126 * together with their "next" pointers, since precedence prevents
127 * anything being concatenated to any individual branch. The
128 * "next" pointer of the last BRANCH in a choice points to the
129 * thing following the whole choice. This is also where the
130 * final "next" pointer of each individual branch points; each
131 * branch starts with the operand node of a BRANCH node.
133 * BACK Normal "next" pointers all implicitly point forward; BACK
134 * exists to make loop structures possible.
136 * REP,REPX Repeated matches ('?', '*', '+' and {min,max}) are implemented
137 * as either simple repeats (REP) or complex repeats (REPX).
138 * These opcodes include a "min" and "max" count after the opcode.
139 * This is followed by a fourth "current count" word that is
140 * only used by REPX, as it implements a recursive match.
141 * REPMIN and REPXMIN are identical except they implement minimal repeats.
143 * OPEN,CLOSE ...are numbered at compile time.
147 * A node is one word of opcode followed by one word of "next" pointer.
148 * The "next" pointer value is a positive offset from the opcode of the node
149 * containing it.
150 * An operand, if any, simply follows the node. (Note that much of the
151 * code generation knows about this implicit relationship.)
153 #define OP(preg, p) (preg->program[p])
154 #define NEXT(preg, p) (preg->program[p + 1])
155 #define OPERAND(p) ((p) + 2)
158 * See regmagic.h for one further detail of program structure.
163 * Utility definitions.
166 #define FAIL(R,M) { (R)->err = (M); return (M); }
167 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
168 #define META "^$.[()|?{+*"
171 * Flags to be passed up and down.
173 #define HASWIDTH 1 /* Known never to match null string. */
174 #define SIMPLE 2 /* Simple enough to be STAR/PLUS operand. */
175 #define SPSTART 4 /* Starts with * or +. */
176 #define WORST 0 /* Worst case. */
178 #define MAX_REP_COUNT 1000000
181 * Forward declarations for regcomp()'s friends.
183 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
184 static int regpiece(regex_t *preg, int *flagp );
185 static int regbranch(regex_t *preg, int *flagp );
186 static int regatom(regex_t *preg, int *flagp );
187 static int regnode(regex_t *preg, int op );
188 static int regnext(regex_t *preg, int p );
189 static void regc(regex_t *preg, int b );
190 static int reginsert(regex_t *preg, int op, int size, int opnd );
191 static void regtail(regex_t *preg, int p, int val);
192 static void regoptail(regex_t *preg, int p, int val );
193 static int regopsize(regex_t *preg, int p );
195 static int reg_range_find(const int *string, int c);
196 static const char *str_find(const char *string, int c, int nocase);
197 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
199 /*#define DEBUG*/
200 #ifdef DEBUG
201 static int regnarrate = 0;
202 static void regdump(regex_t *preg);
203 static const char *regprop( int op );
204 #endif
208 * Returns the length of the null-terminated integer sequence.
210 static int str_int_len(const int *seq)
212 int n = 0;
213 while (*seq++) {
214 n++;
216 return n;
220 - regcomp - compile a regular expression into internal code
222 * We can't allocate space until we know how big the compiled form will be,
223 * but we can't compile it (and thus know how big it is) until we've got a
224 * place to put the code. So we cheat: we compile it twice, once with code
225 * generation turned off and size counting turned on, and once "for real".
226 * This also means that we don't allocate space until we are sure that the
227 * thing really will compile successfully, and we never have to move the
228 * code and thus invalidate pointers into it. (Note that it has to be in
229 * one piece because free() must be able to free it all.)
231 * Beware that the optimization-preparation code in here knows about some
232 * of the structure of the compiled regexp.
234 int regcomp(regex_t *preg, const char *exp, int cflags)
236 int scan;
237 int longest;
238 unsigned len;
239 int flags;
241 #ifdef DEBUG
242 fprintf(stderr, "Compiling: '%s'\n", exp);
243 #endif
244 memset(preg, 0, sizeof(*preg));
246 if (exp == NULL)
247 FAIL(preg, REG_ERR_NULL_ARGUMENT);
249 /* First pass: determine size, legality. */
250 preg->cflags = cflags;
251 preg->regparse = exp;
253 /* Allocate space. */
254 preg->proglen = (strlen(exp) + 1) * 5;
255 preg->program = malloc(preg->proglen * sizeof(int));
256 if (preg->program == NULL)
257 FAIL(preg, REG_ERR_NOMEM);
259 /* Note that since we store a magic value as the first item in the program,
260 * program offsets will never be 0
262 regc(preg, REG_MAGIC);
263 if (reg(preg, 0, &flags) == 0) {
264 return preg->err;
267 /* Small enough for pointer-storage convention? */
268 if (preg->re_nsub >= REG_MAX_PAREN) /* Probably could be 65535L. */
269 FAIL(preg,REG_ERR_TOO_BIG);
271 /* Dig out information for optimizations. */
272 preg->regstart = 0; /* Worst-case defaults. */
273 preg->reganch = 0;
274 preg->regmust = 0;
275 preg->regmlen = 0;
276 scan = 1; /* First BRANCH. */
277 if (OP(preg, regnext(preg, scan)) == END) { /* Only one top-level choice. */
278 scan = OPERAND(scan);
280 /* Starting-point info. */
281 if (OP(preg, scan) == EXACTLY) {
282 preg->regstart = preg->program[OPERAND(scan)];
284 else if (OP(preg, scan) == BOL)
285 preg->reganch++;
288 * If there's something expensive in the r.e., find the
289 * longest literal string that must appear and make it the
290 * regmust. Resolve ties in favor of later strings, since
291 * the regstart check works with the beginning of the r.e.
292 * and avoiding duplication strengthens checking. Not a
293 * strong reason, but sufficient in the absence of others.
295 if (flags&SPSTART) {
296 longest = 0;
297 len = 0;
298 for (; scan != 0; scan = regnext(preg, scan)) {
299 if (OP(preg, scan) == EXACTLY) {
300 int plen = str_int_len(preg->program + OPERAND(scan));
301 if (plen >= len) {
302 longest = OPERAND(scan);
303 len = plen;
307 preg->regmust = longest;
308 preg->regmlen = len;
312 #ifdef DEBUG
313 regdump(preg);
314 #endif
316 return 0;
320 - reg - regular expression, i.e. main body or parenthesized thing
322 * Caller must absorb opening parenthesis.
324 * Combining parenthesis handling with the base level of regular expression
325 * is a trifle forced, but the need to tie the tails of the branches to what
326 * follows makes it hard to avoid.
328 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
330 int ret;
331 int br;
332 int ender;
333 int parno = 0;
334 int flags;
336 *flagp = HASWIDTH; /* Tentatively. */
338 /* Make an OPEN node, if parenthesized. */
339 if (paren) {
340 if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
341 /* non-capturing paren */
342 preg->regparse += 2;
343 parno = -1;
345 else {
346 parno = ++preg->re_nsub;
348 ret = regnode(preg, OPEN+parno);
349 } else
350 ret = 0;
352 /* Pick up the branches, linking them together. */
353 br = regbranch(preg, &flags);
354 if (br == 0)
355 return 0;
356 if (ret != 0)
357 regtail(preg, ret, br); /* OPEN -> first. */
358 else
359 ret = br;
360 if (!(flags&HASWIDTH))
361 *flagp &= ~HASWIDTH;
362 *flagp |= flags&SPSTART;
363 while (*preg->regparse == '|') {
364 preg->regparse++;
365 br = regbranch(preg, &flags);
366 if (br == 0)
367 return 0;
368 regtail(preg, ret, br); /* BRANCH -> BRANCH. */
369 if (!(flags&HASWIDTH))
370 *flagp &= ~HASWIDTH;
371 *flagp |= flags&SPSTART;
374 /* Make a closing node, and hook it on the end. */
375 ender = regnode(preg, (paren) ? CLOSE+parno : END);
376 regtail(preg, ret, ender);
378 /* Hook the tails of the branches to the closing node. */
379 for (br = ret; br != 0; br = regnext(preg, br))
380 regoptail(preg, br, ender);
382 /* Check for proper termination. */
383 if (paren && *preg->regparse++ != ')') {
384 preg->err = REG_ERR_UNMATCHED_PAREN;
385 return 0;
386 } else if (!paren && *preg->regparse != '\0') {
387 if (*preg->regparse == ')') {
388 preg->err = REG_ERR_UNMATCHED_PAREN;
389 return 0;
390 } else {
391 preg->err = REG_ERR_JUNK_ON_END;
392 return 0;
396 return(ret);
400 - regbranch - one alternative of an | operator
402 * Implements the concatenation operator.
404 static int regbranch(regex_t *preg, int *flagp )
406 int ret;
407 int chain;
408 int latest;
409 int flags;
411 *flagp = WORST; /* Tentatively. */
413 ret = regnode(preg, BRANCH);
414 chain = 0;
415 while (*preg->regparse != '\0' && *preg->regparse != ')' &&
416 *preg->regparse != '|') {
417 latest = regpiece(preg, &flags);
418 if (latest == 0)
419 return 0;
420 *flagp |= flags&HASWIDTH;
421 if (chain == 0) {/* First piece. */
422 *flagp |= flags&SPSTART;
424 else {
425 regtail(preg, chain, latest);
427 chain = latest;
429 if (chain == 0) /* Loop ran zero times. */
430 (void) regnode(preg, NOTHING);
432 return(ret);
436 - regpiece - something followed by possible [*+?]
438 * Note that the branching code sequences used for ? and the general cases
439 * of * and + are somewhat optimized: they use the same NOTHING node as
440 * both the endmarker for their branch list and the body of the last branch.
441 * It might seem that this node could be dispensed with entirely, but the
442 * endmarker role is not redundant.
444 static int regpiece(regex_t *preg, int *flagp)
446 int ret;
447 char op;
448 int next;
449 int flags;
450 int min;
451 int max;
453 ret = regatom(preg, &flags);
454 if (ret == 0)
455 return 0;
457 op = *preg->regparse;
458 if (!ISMULT(op)) {
459 *flagp = flags;
460 return(ret);
463 if (!(flags&HASWIDTH) && op != '?') {
464 preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
465 return 0;
468 /* Handle braces (counted repetition) by expansion */
469 if (op == '{') {
470 char *end;
472 min = strtoul(preg->regparse + 1, &end, 10);
473 if (end == preg->regparse + 1) {
474 preg->err = REG_ERR_BAD_COUNT;
475 return 0;
477 if (*end == '}') {
478 max = min;
480 else {
481 preg->regparse = end;
482 max = strtoul(preg->regparse + 1, &end, 10);
483 if (*end != '}') {
484 preg->err = REG_ERR_UNMATCHED_BRACES;
485 return 0;
488 if (end == preg->regparse + 1) {
489 max = MAX_REP_COUNT;
491 else if (max < min || max >= 100) {
492 preg->err = REG_ERR_BAD_COUNT;
493 return 0;
495 if (min >= 100) {
496 preg->err = REG_ERR_BAD_COUNT;
497 return 0;
500 preg->regparse = strchr(preg->regparse, '}');
502 else {
503 min = (op == '+');
504 max = (op == '?' ? 1 : MAX_REP_COUNT);
507 if (preg->regparse[1] == '?') {
508 preg->regparse++;
509 next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
511 else {
512 next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
514 preg->program[ret + 2] = max;
515 preg->program[ret + 3] = min;
516 preg->program[ret + 4] = 0;
518 *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
520 if (!(flags & SIMPLE)) {
521 int back = regnode(preg, BACK);
522 regtail(preg, back, ret);
523 regtail(preg, next, back);
526 preg->regparse++;
527 if (ISMULT(*preg->regparse)) {
528 preg->err = REG_ERR_NESTED_COUNT;
529 return 0;
532 return ret;
536 * Add all characters in the inclusive range between lower and upper.
538 * Handles a swapped range (upper < lower).
540 static void reg_addrange(regex_t *preg, int lower, int upper)
542 if (lower > upper) {
543 reg_addrange(preg, upper, lower);
545 /* Add a range as length, start */
546 regc(preg, upper - lower + 1);
547 regc(preg, lower);
551 * Add a null-terminated literal string as a set of ranges.
553 static void reg_addrange_str(regex_t *preg, const char *str)
555 while (*str) {
556 reg_addrange(preg, *str, *str);
557 str++;
562 * Extracts the next unicode char from utf8.
564 * If 'upper' is set, converts the char to uppercase.
566 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
568 int l = utf8_tounicode(s, uc);
569 if (upper) {
570 *uc = utf8_upper(*uc);
572 return l;
576 * Converts a hex digit to decimal.
578 * Returns -1 for an invalid hex digit.
580 static int hexdigitval(int c)
582 if (c >= '0' && c <= '9')
583 return c - '0';
584 if (c >= 'a' && c <= 'f')
585 return c - 'a' + 10;
586 if (c >= 'A' && c <= 'F')
587 return c - 'A' + 10;
588 return -1;
592 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
594 * Returns the number of hex digits parsed.
595 * If there are no hex digits, returns 0 and stores nothing.
597 static int parse_hex(const char *s, int n, int *uc)
599 int val = 0;
600 int k;
602 for (k = 0; k < n; k++) {
603 int c = hexdigitval(*s++);
604 if (c == -1) {
605 break;
607 val = (val << 4) | c;
609 if (k) {
610 *uc = val;
612 return k;
616 * Call for chars after a backlash to decode the escape sequence.
618 * Stores the result in *ch.
620 * Returns the number of bytes consumed.
622 static int reg_decode_escape(const char *s, int *ch)
624 int n;
625 const char *s0 = s;
627 *ch = *s++;
629 switch (*ch) {
630 case 'b': *ch = '\b'; break;
631 case 'e': *ch = 27; break;
632 case 'f': *ch = '\f'; break;
633 case 'n': *ch = '\n'; break;
634 case 'r': *ch = '\r'; break;
635 case 't': *ch = '\t'; break;
636 case 'v': *ch = '\v'; break;
637 case 'u':
638 if (*s == '{') {
639 /* Expect \u{NNNN} */
640 n = parse_hex(s + 1, 6, ch);
641 if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
642 s += n + 2;
644 else {
645 /* Invalid, so just treat as an escaped 'u' */
646 *ch = 'u';
649 else if ((n = parse_hex(s, 4, ch)) > 0) {
650 s += n;
652 break;
653 case 'U':
654 if ((n = parse_hex(s, 8, ch)) > 0) {
655 s += n;
657 break;
658 case 'x':
659 if ((n = parse_hex(s, 2, ch)) > 0) {
660 s += n;
662 break;
663 case '\0':
664 s--;
665 *ch = '\\';
666 break;
668 return s - s0;
672 - regatom - the lowest level
674 * Optimization: gobbles an entire sequence of ordinary characters so that
675 * it can turn them into a single node, which is smaller to store and
676 * faster to run. Backslashed characters are exceptions, each becoming a
677 * separate node; the code is simpler that way and it's not worth fixing.
679 static int regatom(regex_t *preg, int *flagp)
681 int ret;
682 int flags;
683 int nocase = (preg->cflags & REG_ICASE);
685 int ch;
686 int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
688 *flagp = WORST; /* Tentatively. */
690 preg->regparse += n;
691 switch (ch) {
692 /* FIXME: these chars only have meaning at beg/end of pat? */
693 case '^':
694 ret = regnode(preg, BOL);
695 break;
696 case '$':
697 ret = regnode(preg, EOL);
698 break;
699 case '.':
700 ret = regnode(preg, ANY);
701 *flagp |= HASWIDTH|SIMPLE;
702 break;
703 case '[': {
704 const char *pattern = preg->regparse;
706 if (*pattern == '^') { /* Complement of range. */
707 ret = regnode(preg, ANYBUT);
708 pattern++;
709 } else
710 ret = regnode(preg, ANYOF);
712 /* Special case. If the first char is ']' or '-', it is part of the set */
713 if (*pattern == ']' || *pattern == '-') {
714 reg_addrange(preg, *pattern, *pattern);
715 pattern++;
718 while (*pattern && *pattern != ']') {
719 /* Is this a range? a-z */
720 int start;
721 int end;
723 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
724 if (start == '\\') {
725 pattern += reg_decode_escape(pattern, &start);
726 if (start == 0) {
727 preg->err = REG_ERR_NULL_CHAR;
728 return 0;
731 if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
732 /* skip '-' */
733 pattern += utf8_tounicode(pattern, &end);
734 pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
735 if (end == '\\') {
736 pattern += reg_decode_escape(pattern, &end);
737 if (end == 0) {
738 preg->err = REG_ERR_NULL_CHAR;
739 return 0;
743 reg_addrange(preg, start, end);
744 continue;
746 if (start == '[') {
747 if (strncmp(pattern, ":alpha:]", 8) == 0) {
748 if ((preg->cflags & REG_ICASE) == 0) {
749 reg_addrange(preg, 'a', 'z');
751 reg_addrange(preg, 'A', 'Z');
752 pattern += 8;
753 continue;
755 if (strncmp(pattern, ":alnum:]", 8) == 0) {
756 if ((preg->cflags & REG_ICASE) == 0) {
757 reg_addrange(preg, 'a', 'z');
759 reg_addrange(preg, 'A', 'Z');
760 reg_addrange(preg, '0', '9');
761 pattern += 8;
762 continue;
764 if (strncmp(pattern, ":space:]", 8) == 0) {
765 reg_addrange_str(preg, " \t\r\n\f\v");
766 pattern += 8;
767 continue;
770 /* Not a range, so just add the char */
771 reg_addrange(preg, start, start);
773 regc(preg, '\0');
775 if (*pattern) {
776 pattern++;
778 preg->regparse = pattern;
780 *flagp |= HASWIDTH|SIMPLE;
782 break;
783 case '(':
784 ret = reg(preg, 1, &flags);
785 if (ret == 0)
786 return 0;
787 *flagp |= flags&(HASWIDTH|SPSTART);
788 break;
789 case '\0':
790 case '|':
791 case ')':
792 preg->err = REG_ERR_INTERNAL;
793 return 0; /* Supposed to be caught earlier. */
794 case '?':
795 case '+':
796 case '*':
797 case '{':
798 preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
799 return 0;
800 case '\\':
801 ch = *preg->regparse++;
802 switch (ch) {
803 case '\0':
804 preg->err = REG_ERR_TRAILING_BACKSLASH;
805 return 0;
806 case 'A':
807 ret = regnode(preg, BOLX);
808 break;
809 case 'Z':
810 ret = regnode(preg, EOLX);
811 break;
812 case '<':
813 case 'm':
814 ret = regnode(preg, WORDA);
815 break;
816 case '>':
817 case 'M':
818 ret = regnode(preg, WORDZ);
819 break;
820 case 'd':
821 case 'D':
822 ret = regnode(preg, ch == 'd' ? ANYOF : ANYBUT);
823 reg_addrange(preg, '0', '9');
824 regc(preg, '\0');
825 *flagp |= HASWIDTH|SIMPLE;
826 break;
827 case 'w':
828 case 'W':
829 ret = regnode(preg, ch == 'w' ? ANYOF : ANYBUT);
830 if ((preg->cflags & REG_ICASE) == 0) {
831 reg_addrange(preg, 'a', 'z');
833 reg_addrange(preg, 'A', 'Z');
834 reg_addrange(preg, '0', '9');
835 reg_addrange(preg, '_', '_');
836 regc(preg, '\0');
837 *flagp |= HASWIDTH|SIMPLE;
838 break;
839 case 's':
840 case 'S':
841 ret = regnode(preg, ch == 's' ? ANYOF : ANYBUT);
842 reg_addrange_str(preg," \t\r\n\f\v");
843 regc(preg, '\0');
844 *flagp |= HASWIDTH|SIMPLE;
845 break;
846 /* FIXME: Someday handle \1, \2, ... */
847 default:
848 /* Handle general quoted chars in exact-match routine */
849 /* Back up to include the backslash */
850 preg->regparse--;
851 goto de_fault;
853 break;
854 de_fault:
855 default: {
857 * Encode a string of characters to be matched exactly.
859 int added = 0;
861 /* Back up to pick up the first char of interest */
862 preg->regparse -= n;
864 ret = regnode(preg, EXACTLY);
866 /* Note that a META operator such as ? or * consumes the
867 * preceding char.
868 * Thus we must be careful to look ahead by 2 and add the
869 * last char as it's own EXACTLY if necessary
872 /* Until end of string or a META char is reached */
873 while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
874 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
875 if (ch == '\\' && preg->regparse[n]) {
876 /* Non-trailing backslash.
877 * Is this a special escape, or a regular escape?
879 if (strchr("<>mMwWdDsSAZ", preg->regparse[n])) {
880 /* A special escape. All done with EXACTLY */
881 break;
883 /* Decode it. Note that we add the length for the escape
884 * sequence to the length for the backlash so we can skip
885 * the entire sequence, or not as required.
887 n += reg_decode_escape(preg->regparse + n, &ch);
888 if (ch == 0) {
889 preg->err = REG_ERR_NULL_CHAR;
890 return 0;
894 /* Now we have one char 'ch' of length 'n'.
895 * Check to see if the following char is a MULT
898 if (ISMULT(preg->regparse[n])) {
899 /* Yes. But do we already have some EXACTLY chars? */
900 if (added) {
901 /* Yes, so return what we have and pick up the current char next time around */
902 break;
904 /* No, so add this single char and finish */
905 regc(preg, ch);
906 added++;
907 preg->regparse += n;
908 break;
911 /* No, so just add this char normally */
912 regc(preg, ch);
913 added++;
914 preg->regparse += n;
916 regc(preg, '\0');
918 *flagp |= HASWIDTH;
919 if (added == 1)
920 *flagp |= SIMPLE;
921 break;
923 break;
926 return(ret);
929 static void reg_grow(regex_t *preg, int n)
931 if (preg->p + n >= preg->proglen) {
932 preg->proglen = (preg->p + n) * 2;
933 preg->program = realloc(preg->program, preg->proglen * sizeof(int));
938 - regnode - emit a node
940 /* Location. */
941 static int regnode(regex_t *preg, int op)
943 reg_grow(preg, 2);
945 /* The OP followed by a next pointer */
946 preg->program[preg->p++] = op;
947 preg->program[preg->p++] = 0;
949 /* Return the start of the node */
950 return preg->p - 2;
954 - regc - emit (if appropriate) a byte of code
956 static void regc(regex_t *preg, int b )
958 reg_grow(preg, 1);
959 preg->program[preg->p++] = b;
963 - reginsert - insert an operator in front of already-emitted operand
965 * Means relocating the operand.
966 * Returns the new location of the original operand.
968 static int reginsert(regex_t *preg, int op, int size, int opnd )
970 reg_grow(preg, size);
972 /* Move everything from opnd up */
973 memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd));
974 /* Zero out the new space */
975 memset(preg->program + opnd, 0, sizeof(int) * size);
977 preg->program[opnd] = op;
979 preg->p += size;
981 return opnd + size;
985 - regtail - set the next-pointer at the end of a node chain
987 static void regtail(regex_t *preg, int p, int val)
989 int scan;
990 int temp;
991 int offset;
993 /* Find last node. */
994 scan = p;
995 for (;;) {
996 temp = regnext(preg, scan);
997 if (temp == 0)
998 break;
999 scan = temp;
1002 if (OP(preg, scan) == BACK)
1003 offset = scan - val;
1004 else
1005 offset = val - scan;
1007 preg->program[scan + 1] = offset;
1011 - regoptail - regtail on operand of first argument; nop if operandless
1014 static void regoptail(regex_t *preg, int p, int val )
1016 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1017 if (p != 0 && OP(preg, p) == BRANCH) {
1018 regtail(preg, OPERAND(p), val);
1023 * regexec and friends
1027 * Forwards.
1029 static int regtry(regex_t *preg, const char *string );
1030 static int regmatch(regex_t *preg, int prog);
1031 static int regrepeat(regex_t *preg, int p, int max);
1034 - regexec - match a regexp against a string
1036 int regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1038 const char *s;
1039 int scan;
1041 /* Be paranoid... */
1042 if (preg == NULL || preg->program == NULL || string == NULL) {
1043 return REG_ERR_NULL_ARGUMENT;
1046 /* Check validity of program. */
1047 if (*preg->program != REG_MAGIC) {
1048 return REG_ERR_CORRUPTED;
1051 #ifdef DEBUG
1052 fprintf(stderr, "regexec: %s\n", string);
1053 regdump(preg);
1054 #endif
1056 preg->eflags = eflags;
1057 preg->pmatch = pmatch;
1058 preg->nmatch = nmatch;
1059 preg->start = string; /* All offsets are computed from here */
1061 /* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */
1062 for (scan = OPERAND(1); scan != 0; scan += regopsize(preg, scan)) {
1063 int op = OP(preg, scan);
1064 if (op == END)
1065 break;
1066 if (op == REPX || op == REPXMIN)
1067 preg->program[scan + 4] = 0;
1070 /* If there is a "must appear" string, look for it. */
1071 if (preg->regmust != 0) {
1072 s = string;
1073 while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
1074 if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1075 break;
1077 s++;
1079 if (s == NULL) /* Not present. */
1080 return REG_NOMATCH;
1083 /* Mark beginning of line for ^ . */
1084 preg->regbol = string;
1086 /* Simplest case: anchored match need be tried only once (maybe per line). */
1087 if (preg->reganch) {
1088 if (eflags & REG_NOTBOL) {
1089 /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1090 goto nextline;
1092 while (1) {
1093 if (regtry(preg, string)) {
1094 return REG_NOERROR;
1096 if (*string) {
1097 nextline:
1098 if (preg->cflags & REG_NEWLINE) {
1099 /* Try the next anchor? */
1100 string = strchr(string, '\n');
1101 if (string) {
1102 preg->regbol = ++string;
1103 continue;
1107 return REG_NOMATCH;
1111 /* Messy cases: unanchored match. */
1112 s = string;
1113 if (preg->regstart != '\0') {
1114 /* We know what char it must start with. */
1115 while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1116 if (regtry(preg, s))
1117 return REG_NOERROR;
1118 s++;
1121 else
1122 /* We don't -- general case. */
1123 while (1) {
1124 if (regtry(preg, s))
1125 return REG_NOERROR;
1126 if (*s == '\0') {
1127 break;
1129 else {
1130 int c;
1131 s += utf8_tounicode(s, &c);
1135 /* Failure. */
1136 return REG_NOMATCH;
1140 - regtry - try match at specific point
1142 /* 0 failure, 1 success */
1143 static int regtry( regex_t *preg, const char *string )
1145 int i;
1147 preg->reginput = string;
1149 for (i = 0; i < preg->nmatch; i++) {
1150 preg->pmatch[i].rm_so = -1;
1151 preg->pmatch[i].rm_eo = -1;
1153 if (regmatch(preg, 1)) {
1154 preg->pmatch[0].rm_so = string - preg->start;
1155 preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1156 return(1);
1157 } else
1158 return(0);
1162 * Returns bytes matched if 'pattern' is a prefix of 'string'.
1164 * If 'nocase' is non-zero, does a case-insensitive match.
1166 * Returns -1 on not found.
1168 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1170 const char *s = string;
1171 while (proglen && *s) {
1172 int ch;
1173 int n = reg_utf8_tounicode_case(s, &ch, nocase);
1174 if (ch != *prog) {
1175 return -1;
1177 prog++;
1178 s += n;
1179 proglen--;
1181 if (proglen == 0) {
1182 return s - string;
1184 return -1;
1188 * Searchs for 'c' in the range 'range'.
1190 * Returns 1 if found, or 0 if not.
1192 static int reg_range_find(const int *range, int c)
1194 while (*range) {
1195 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1196 if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1197 return 1;
1199 range += 2;
1201 return 0;
1205 * Search for the character 'c' in the utf-8 string 'string'.
1207 * If 'nocase' is set, the 'string' is assumed to be uppercase
1208 * and 'c' is converted to uppercase before matching.
1210 * Returns the byte position in the string where the 'c' was found, or
1211 * NULL if not found.
1213 static const char *str_find(const char *string, int c, int nocase)
1215 if (nocase) {
1216 /* The "string" should already be converted to uppercase */
1217 c = utf8_upper(c);
1219 while (*string) {
1220 int ch;
1221 int n = reg_utf8_tounicode_case(string, &ch, nocase);
1222 if (c == ch) {
1223 return string;
1225 string += n;
1227 return NULL;
1231 * Returns true if 'ch' is an end-of-line char.
1233 * In REG_NEWLINE mode, \n is considered EOL in
1234 * addition to \0
1236 static int reg_iseol(regex_t *preg, int ch)
1238 if (preg->cflags & REG_NEWLINE) {
1239 return ch == '\0' || ch == '\n';
1241 else {
1242 return ch == '\0';
1246 static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
1248 int nextch = '\0';
1249 const char *save;
1250 int no;
1251 int c;
1253 int max = preg->program[scan + 2];
1254 int min = preg->program[scan + 3];
1255 int next = regnext(preg, scan);
1258 * Lookahead to avoid useless match attempts
1259 * when we know what character comes next.
1261 if (OP(preg, next) == EXACTLY) {
1262 nextch = preg->program[OPERAND(next)];
1264 save = preg->reginput;
1265 no = regrepeat(preg, scan + 5, max);
1266 if (no < min) {
1267 return 0;
1269 if (matchmin) {
1270 /* from min up to no */
1271 max = no;
1272 no = min;
1274 /* else from no down to min */
1275 while (1) {
1276 if (matchmin) {
1277 if (no > max) {
1278 break;
1281 else {
1282 if (no < min) {
1283 break;
1286 preg->reginput = save + utf8_index(save, no);
1287 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1288 /* If it could work, try it. */
1289 if (reg_iseol(preg, nextch) || c == nextch) {
1290 if (regmatch(preg, next)) {
1291 return(1);
1294 if (matchmin) {
1295 /* Couldn't or didn't, add one more */
1296 no++;
1298 else {
1299 /* Couldn't or didn't -- back up. */
1300 no--;
1303 return(0);
1306 static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
1308 int *scanpt = preg->program + scan;
1310 int max = scanpt[2];
1311 int min = scanpt[3];
1313 /* Have we reached min? */
1314 if (scanpt[4] < min) {
1315 /* No, so get another one */
1316 scanpt[4]++;
1317 if (regmatch(preg, scan + 5)) {
1318 return 1;
1320 scanpt[4]--;
1321 return 0;
1323 if (scanpt[4] > max) {
1324 return 0;
1327 if (matchmin) {
1328 /* minimal, so try other branch first */
1329 if (regmatch(preg, regnext(preg, scan))) {
1330 return 1;
1332 /* No, so try one more */
1333 scanpt[4]++;
1334 if (regmatch(preg, scan + 5)) {
1335 return 1;
1337 scanpt[4]--;
1338 return 0;
1340 /* maximal, so try this branch again */
1341 if (scanpt[4] < max) {
1342 scanpt[4]++;
1343 if (regmatch(preg, scan + 5)) {
1344 return 1;
1346 scanpt[4]--;
1348 /* At this point we are at max with no match. Try the other branch */
1349 return regmatch(preg, regnext(preg, scan));
1353 - regmatch - main matching routine
1355 * Conceptually the strategy is simple: check to see whether the current
1356 * node matches, call self recursively to see whether the rest matches,
1357 * and then act accordingly. In practice we make some effort to avoid
1358 * recursion, in particular by going through "ordinary" nodes (that don't
1359 * need to know whether the rest of the match failed) by a loop instead of
1360 * by recursion.
1362 /* 0 failure, 1 success */
1363 static int regmatch(regex_t *preg, int prog)
1365 int scan; /* Current node. */
1366 int next; /* Next node. */
1367 const char *save;
1369 scan = prog;
1371 #ifdef DEBUG
1372 if (scan != 0 && regnarrate)
1373 fprintf(stderr, "%s(\n", regprop(scan));
1374 #endif
1375 while (scan != 0) {
1376 int n;
1377 int c;
1378 #ifdef DEBUG
1379 if (regnarrate) {
1380 fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */
1382 #endif
1383 next = regnext(preg, scan);
1384 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1386 switch (OP(preg, scan)) {
1387 case BOLX:
1388 if ((preg->eflags & REG_NOTBOL)) {
1389 return(0);
1391 /* Fall through */
1392 case BOL:
1393 if (preg->reginput != preg->regbol) {
1394 return(0);
1396 break;
1397 case EOLX:
1398 if (c != 0) {
1399 /* For EOLX, only match real end of line, not newline */
1400 return 0;
1402 break;
1403 case EOL:
1404 if (!reg_iseol(preg, c)) {
1405 return(0);
1407 break;
1408 case WORDA:
1409 /* Must be looking at a letter, digit, or _ */
1410 if ((!isalnum(UCHAR(c))) && c != '_')
1411 return(0);
1412 /* Prev must be BOL or nonword */
1413 if (preg->reginput > preg->regbol &&
1414 (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1415 return(0);
1416 break;
1417 case WORDZ:
1418 /* Can't match at BOL */
1419 if (preg->reginput > preg->regbol) {
1420 /* Current must be EOL or nonword */
1421 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1422 c = preg->reginput[-1];
1423 /* Previous must be word */
1424 if (isalnum(UCHAR(c)) || c == '_') {
1425 break;
1429 /* No */
1430 return(0);
1432 case ANY:
1433 if (reg_iseol(preg, c))
1434 return 0;
1435 preg->reginput += n;
1436 break;
1437 case EXACTLY: {
1438 int opnd;
1439 int len;
1440 int slen;
1442 opnd = OPERAND(scan);
1443 len = str_int_len(preg->program + opnd);
1445 slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1446 if (slen < 0) {
1447 return(0);
1449 preg->reginput += slen;
1451 break;
1452 case ANYOF:
1453 if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
1454 return(0);
1456 preg->reginput += n;
1457 break;
1458 case ANYBUT:
1459 if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
1460 return(0);
1462 preg->reginput += n;
1463 break;
1464 case NOTHING:
1465 break;
1466 case BACK:
1467 break;
1468 case BRANCH:
1469 if (OP(preg, next) != BRANCH) /* No choice. */
1470 next = OPERAND(scan); /* Avoid recursion. */
1471 else {
1472 do {
1473 save = preg->reginput;
1474 if (regmatch(preg, OPERAND(scan))) {
1475 return(1);
1477 preg->reginput = save;
1478 scan = regnext(preg, scan);
1479 } while (scan != 0 && OP(preg, scan) == BRANCH);
1480 return(0);
1481 /* NOTREACHED */
1483 break;
1484 case REP:
1485 case REPMIN:
1486 return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1488 case REPX:
1489 case REPXMIN:
1490 return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1492 case END:
1493 return 1; /* Success! */
1495 case OPENNC:
1496 case CLOSENC:
1497 return regmatch(preg, next);
1499 default:
1500 if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
1501 save = preg->reginput;
1502 if (regmatch(preg, next)) {
1503 if (OP(preg, scan) < CLOSE) {
1504 int no = OP(preg, scan) - OPEN;
1505 if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
1506 preg->pmatch[no].rm_so = save - preg->start;
1509 else {
1510 int no = OP(preg, scan) - CLOSE;
1511 if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
1512 preg->pmatch[no].rm_eo = save - preg->start;
1515 return(1);
1517 return(0);
1519 return REG_ERR_INTERNAL;
1522 scan = next;
1526 * We get here only if there's trouble -- normally "case END" is
1527 * the terminating point.
1529 return REG_ERR_INTERNAL;
1533 - regrepeat - repeatedly match something simple, report how many
1535 static int regrepeat(regex_t *preg, int p, int max)
1537 int count = 0;
1538 const char *scan;
1539 int opnd;
1540 int ch;
1541 int n;
1543 scan = preg->reginput;
1544 opnd = OPERAND(p);
1545 switch (OP(preg, p)) {
1546 case ANY:
1547 /* No need to handle utf8 specially here */
1548 while (!reg_iseol(preg, *scan) && count < max) {
1549 count++;
1550 scan++;
1552 break;
1553 case EXACTLY:
1554 while (count < max) {
1555 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1556 if (preg->program[opnd] != ch) {
1557 break;
1559 count++;
1560 scan += n;
1562 break;
1563 case ANYOF:
1564 while (count < max) {
1565 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1566 if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) {
1567 break;
1569 count++;
1570 scan += n;
1572 break;
1573 case ANYBUT:
1574 while (count < max) {
1575 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1576 if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) {
1577 break;
1579 count++;
1580 scan += n;
1582 break;
1583 default: /* Oh dear. Called inappropriately. */
1584 preg->err = REG_ERR_INTERNAL;
1585 count = 0; /* Best compromise. */
1586 break;
1588 preg->reginput = scan;
1590 return(count);
1594 - regnext - dig the "next" pointer out of a node
1596 static int regnext(regex_t *preg, int p )
1598 int offset;
1600 offset = NEXT(preg, p);
1602 if (offset == 0)
1603 return 0;
1605 if (OP(preg, p) == BACK)
1606 return(p-offset);
1607 else
1608 return(p+offset);
1612 - regopsize - returns the size of opcode + operands at 'p' in words
1614 static int regopsize(regex_t *preg, int p )
1616 /* Almost all opcodes are 2 words, but some are more */
1617 switch (OP(preg, p)) {
1618 case REP:
1619 case REPMIN:
1620 case REPX:
1621 case REPXMIN:
1622 return 5;
1624 case ANYOF:
1625 case ANYBUT:
1626 case EXACTLY: {
1627 int s = p + 2;
1628 while (preg->program[s++]) {
1630 return s - p;
1633 return 2;
1636 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1639 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1641 static void regdump(regex_t *preg)
1643 int s;
1644 int op = EXACTLY; /* Arbitrary non-END op. */
1645 int next;
1646 char buf[MAX_UTF8_LEN + 1];
1648 int i;
1649 for (i = 1; i < preg->p; i++) {
1650 printf("%02x ", (unsigned char)preg->program[i]);
1651 if (i % 16 == 0) {
1652 printf("\n");
1655 printf("\n");
1657 s = 1;
1658 while (op != END && s < preg->p) { /* While that wasn't END last time... */
1659 op = OP(preg, s);
1660 printf("%3d: %s", s, regprop(op)); /* Where, what. */
1661 next = regnext(preg, s);
1662 if (next == 0) /* Next ptr. */
1663 printf("(0)");
1664 else
1665 printf("(%d)", next);
1666 s += 2;
1667 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1668 int max = preg->program[s];
1669 int min = preg->program[s + 1];
1670 if (max == 65535) {
1671 printf("{%d,*}", min);
1673 else {
1674 printf("{%d,%d}", min, max);
1676 printf(" %d", preg->program[s + 2]);
1677 s += 3;
1679 else if (op == ANYOF || op == ANYBUT) {
1680 /* set of ranges */
1682 while (preg->program[s]) {
1683 int len = preg->program[s++];
1684 int first = preg->program[s++];
1685 buf[utf8_getchars(buf, first)] = 0;
1686 printf("%s", buf);
1687 if (len > 1) {
1688 buf[utf8_getchars(buf, first + len - 1)] = 0;
1689 printf("-%s", buf);
1692 s++;
1694 else if (op == EXACTLY) {
1695 /* Literal string, where present. */
1697 while (preg->program[s]) {
1698 buf[utf8_getchars(buf, preg->program[s])] = 0;
1699 printf("%s", buf);
1700 s++;
1702 s++;
1704 putchar('\n');
1707 if (op == END) {
1708 /* Header fields of interest. */
1709 if (preg->regstart) {
1710 buf[utf8_getchars(buf, preg->regstart)] = 0;
1711 printf("start '%s' ", buf);
1713 if (preg->reganch)
1714 printf("anchored ");
1715 if (preg->regmust != 0) {
1716 int i;
1717 printf("must have:");
1718 for (i = 0; i < preg->regmlen; i++) {
1719 putchar(preg->program[preg->regmust + i]);
1721 putchar('\n');
1724 printf("\n");
1728 - regprop - printable representation of opcode
1730 static const char *regprop( int op )
1732 static char buf[50];
1734 switch (op) {
1735 case BOL:
1736 return "BOL";
1737 case EOL:
1738 return "EOL";
1739 case BOLX:
1740 return "BOLX";
1741 case EOLX:
1742 return "EOLX";
1743 case ANY:
1744 return "ANY";
1745 case ANYOF:
1746 return "ANYOF";
1747 case ANYBUT:
1748 return "ANYBUT";
1749 case BRANCH:
1750 return "BRANCH";
1751 case EXACTLY:
1752 return "EXACTLY";
1753 case NOTHING:
1754 return "NOTHING";
1755 case BACK:
1756 return "BACK";
1757 case END:
1758 return "END";
1759 case REP:
1760 return "REP";
1761 case REPMIN:
1762 return "REPMIN";
1763 case REPX:
1764 return "REPX";
1765 case REPXMIN:
1766 return "REPXMIN";
1767 case WORDA:
1768 return "WORDA";
1769 case WORDZ:
1770 return "WORDZ";
1771 case OPENNC:
1772 return "OPEN";
1773 case CLOSENC:
1774 return "CLOSE";
1775 default:
1776 if (op >= OPEN && op < CLOSE) {
1777 snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1779 else if (op >= CLOSE && op < CLOSE_END) {
1780 snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1782 else {
1783 snprintf(buf, sizeof(buf), "?%d?\n", op);
1785 return(buf);
1788 #endif /* JIM_BOOTSTRAP */
1790 size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
1792 static const char *error_strings[] = {
1793 "success",
1794 "no match",
1795 "bad pattern",
1796 "null argument",
1797 "unknown error",
1798 "too big",
1799 "out of memory",
1800 "too many ()",
1801 "parentheses () not balanced",
1802 "braces {} not balanced",
1803 "invalid repetition count(s)",
1804 "extra characters",
1805 "*+ of empty atom",
1806 "nested count",
1807 "internal error",
1808 "count follows nothing",
1809 "trailing backslash",
1810 "corrupted program",
1811 "contains null char",
1813 const char *err;
1815 if (errcode < 0 || errcode >= REG_ERR_NUM) {
1816 err = "Bad error code";
1818 else {
1819 err = error_strings[errcode];
1822 return snprintf(errbuf, errbuf_size, "%s", err);
1825 void regfree(regex_t *preg)
1827 free(preg->program);
1830 #endif