package: simplification/code cleanup
[jimtcl.git] / jimregexp.c
bloba8f7c1d28f700e9ecf46b7a29f1b0a055b9cb4f6
1 /*
2 * regcomp and regexec -- regsub and regerror are elsewhere
4 * Copyright (c) 1986 by University of Toronto.
5 * Written by Henry Spencer. Not derived from licensed software.
7 * Permission is granted to anyone to use this software for any
8 * purpose on any computer system, and to redistribute it freely,
9 * subject to the following restrictions:
11 * 1. The author is not responsible for the consequences of use of
12 * this software, no matter how awful, even if they arise
13 * from defects in it.
15 * 2. The origin of this software must not be misrepresented, either
16 * by explicit claim or by omission.
18 * 3. Altered versions must be plainly marked as such, and must not
19 * be misrepresented as being the original software.
20 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
21 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22 *** to assist in implementing egrep.
23 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
24 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25 *** as in BSD grep and ex.
26 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
27 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
29 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
30 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
31 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
32 *** was moved into regexp.h, and the include of regexp.h now uses "'s
33 *** to avoid conflicting with the system regexp.h. Const, bless its
34 *** soul, was removed so it can compile everywhere. The declaration
35 *** of strchr() was in conflict on AIX, so it was removed (as it is
36 *** happily defined in string.h).
37 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
38 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
42 * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
43 * on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
44 * This includes counted repetitions, UTF-8 support, character classes,
45 * shorthand character classes, increased number of parentheses to 100,
46 * backslash escape sequences. It also removes \n as an alternative to |.
48 * Beware that some of this code is subtly aware of the way operator
49 * precedence is structured in regular expressions. Serious changes in
50 * regular-expression syntax might require a total rethink.
52 #include <stdio.h>
53 #include <ctype.h>
54 #include <stdlib.h>
55 #include <string.h>
57 #include "jim.h"
58 #include "jimautoconf.h"
59 #include "jimregexp.h"
60 #include "utf8.h"
62 #if !defined(HAVE_REGCOMP) || defined(JIM_REGEXP)
65 * Structure for regexp "program". This is essentially a linear encoding
66 * of a nondeterministic finite-state machine (aka syntax charts or
67 * "railroad normal form" in parsing technology). Each node is an opcode
68 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
69 * all nodes except BRANCH implement concatenation; a "next" pointer with
70 * a BRANCH on both ends of it is connecting two alternatives. (Here we
71 * have one of the subtle syntax dependencies: an individual BRANCH (as
72 * opposed to a collection of them) is never concatenated with anything
73 * because of operator precedence.) The operand of some types of node is
74 * a literal string; for others, it is a node leading into a sub-FSM. In
75 * particular, the operand of a BRANCH node is the first node of the branch.
76 * (NB this is *not* a tree structure: the tail of the branch connects
77 * to the thing following the set of BRANCHes.) The opcodes are:
80 /* This *MUST* be less than (255-20-1)/2=117 */
81 #define REG_MAX_PAREN 100
83 /* definition number opnd? meaning */
84 #define END 0 /* no End of program. */
85 #define BOL 1 /* no Match "" at beginning of line. */
86 #define EOL 2 /* no Match "" at end of line. */
87 #define ANY 3 /* no Match any one character. */
88 #define ANYOF 4 /* str Match any character in this string. */
89 #define ANYBUT 5 /* str Match any character not in this string. */
90 #define BRANCH 6 /* node Match this alternative, or the next... */
91 #define BACK 7 /* no Match "", "next" ptr points backward. */
92 #define EXACTLY 8 /* str Match this string. */
93 #define NOTHING 9 /* no Match empty string. */
94 #define REP 10 /* max,min Match this (simple) thing [min,max] times. */
95 #define REPMIN 11 /* max,min Match this (simple) thing [min,max] times, mininal match. */
96 #define REPX 12 /* max,min Match this (complex) thing [min,max] times. */
97 #define REPXMIN 13 /* max,min Match this (complex) thing [min,max] times, minimal match. */
99 #define WORDA 15 /* no Match "" at wordchar, where prev is nonword */
100 #define WORDZ 16 /* no Match "" at nonwordchar, where prev is word */
101 #define OPENNC 19 /* no Non-capturing parentheses - must be OPEN-1 */
102 #define OPEN 20 /* no Mark this point in input as start of #n. */
103 /* OPEN+1 is number 1, etc. */
104 #define CLOSE (OPEN+REG_MAX_PAREN+1) /* no Analogous to OPEN. */
105 #define CLOSE_END (CLOSE+REG_MAX_PAREN)
106 #define CLOSENC (CLOSE-1) /* no Non-capturing parentheses - must be CLOSE-1 */
109 * The first byte of the regexp internal "program" is actually this magic
110 * number; the start node begins in the second byte.
112 #define REG_MAGIC 0xFADED00D
115 * Opcode notes:
117 * BRANCH The set of branches constituting a single choice are hooked
118 * together with their "next" pointers, since precedence prevents
119 * anything being concatenated to any individual branch. The
120 * "next" pointer of the last BRANCH in a choice points to the
121 * thing following the whole choice. This is also where the
122 * final "next" pointer of each individual branch points; each
123 * branch starts with the operand node of a BRANCH node.
125 * BACK Normal "next" pointers all implicitly point forward; BACK
126 * exists to make loop structures possible.
128 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
129 * BRANCH structures using BACK. Simple cases (one character
130 * per match) are implemented with STAR and PLUS for speed
131 * and to minimize recursive plunges.
133 * OPEN,CLOSE ...are numbered at compile time.
137 * A node is one char of opcode followed by two chars of "next" pointer.
138 * "Next" pointers are stored as two 8-bit pieces, high order first. The
139 * value is a positive offset from the opcode of the node containing it.
140 * An operand, if any, simply follows the node. (Note that much of the
141 * code generation knows about this implicit relationship.)
143 * Using two bytes for the "next" pointer is vast overkill for most things,
144 * but allows patterns to get big without disasters.
146 #define OP(preg, p) (preg->program[p])
147 #define NEXT(preg, p) (preg->program[p + 1])
148 #define OPERAND(p) ((p) + 2)
151 * See regmagic.h for one further detail of program structure.
156 * Utility definitions.
159 #define FAIL(R,M) { (R)->err = (M); return (M); }
160 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
161 #define META "^$.[()|?{+*"
164 * Flags to be passed up and down.
166 #define HASWIDTH 01 /* Known never to match null string. */
167 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART 04 /* Starts with * or +. */
169 #define WORST 0 /* Worst case. */
171 #define MAX_REP_COUNT 1000000
174 * Forward declarations for regcomp()'s friends.
176 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
177 static int regpiece(regex_t *preg, int *flagp );
178 static int regbranch(regex_t *preg, int *flagp );
179 static int regatom(regex_t *preg, int *flagp );
180 static int regnode(regex_t *preg, int op );
181 static int regnext(regex_t *preg, int p );
182 static void regc(regex_t *preg, int b );
183 static int reginsert(regex_t *preg, int op, int size, int opnd );
184 static void regtail_(regex_t *preg, int p, int val, int line );
185 static void regoptail(regex_t *preg, int p, int val );
186 #define regtail(PREG, P, VAL) regtail_(PREG, P, VAL, __LINE__)
188 static int reg_range_find(const int *string, int c);
189 static const char *str_find(const char *string, int c, int nocase);
190 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
192 /*#define DEBUG*/
193 #ifdef DEBUG
194 static int regnarrate = 0;
195 static void regdump(regex_t *preg);
196 static const char *regprop( int op );
197 #endif
201 * Returns the length of the null-terminated integer sequence.
203 static int str_int_len(const int *seq)
205 int n = 0;
206 while (*seq++) {
207 n++;
209 return n;
213 - regcomp - compile a regular expression into internal code
215 * We can't allocate space until we know how big the compiled form will be,
216 * but we can't compile it (and thus know how big it is) until we've got a
217 * place to put the code. So we cheat: we compile it twice, once with code
218 * generation turned off and size counting turned on, and once "for real".
219 * This also means that we don't allocate space until we are sure that the
220 * thing really will compile successfully, and we never have to move the
221 * code and thus invalidate pointers into it. (Note that it has to be in
222 * one piece because free() must be able to free it all.)
224 * Beware that the optimization-preparation code in here knows about some
225 * of the structure of the compiled regexp.
227 int regcomp(regex_t *preg, const char *exp, int cflags)
229 int scan;
230 int longest;
231 unsigned len;
232 int flags;
234 #ifdef DEBUG
235 fprintf(stderr, "Compiling: '%s'\n", exp);
236 #endif
237 memset(preg, 0, sizeof(*preg));
239 if (exp == NULL)
240 FAIL(preg, REG_ERR_NULL_ARGUMENT);
242 /* First pass: determine size, legality. */
243 preg->cflags = cflags;
244 preg->regparse = exp;
245 /* XXX: For now, start unallocated */
246 preg->program = NULL;
247 preg->proglen = 0;
249 /* Allocate space. */
250 preg->proglen = (strlen(exp) + 1) * 5;
251 preg->program = malloc(preg->proglen * sizeof(int));
252 if (preg->program == NULL)
253 FAIL(preg, REG_ERR_NOMEM);
255 /* Note that since we store a magic value as the first item in the program,
256 * program offsets will never be 0
258 regc(preg, REG_MAGIC);
259 if (reg(preg, 0, &flags) == 0) {
260 return preg->err;
263 /* Small enough for pointer-storage convention? */
264 if (preg->re_nsub >= REG_MAX_PAREN) /* Probably could be 65535L. */
265 FAIL(preg,REG_ERR_TOO_BIG);
267 /* Dig out information for optimizations. */
268 preg->regstart = 0; /* Worst-case defaults. */
269 preg->reganch = 0;
270 preg->regmust = 0;
271 preg->regmlen = 0;
272 scan = 1; /* First BRANCH. */
273 if (OP(preg, regnext(preg, scan)) == END) { /* Only one top-level choice. */
274 scan = OPERAND(scan);
276 /* Starting-point info. */
277 if (OP(preg, scan) == EXACTLY) {
278 preg->regstart = preg->program[OPERAND(scan)];
280 else if (OP(preg, scan) == BOL)
281 preg->reganch++;
284 * If there's something expensive in the r.e., find the
285 * longest literal string that must appear and make it the
286 * regmust. Resolve ties in favor of later strings, since
287 * the regstart check works with the beginning of the r.e.
288 * and avoiding duplication strengthens checking. Not a
289 * strong reason, but sufficient in the absence of others.
291 if (flags&SPSTART) {
292 longest = 0;
293 len = 0;
294 for (; scan != 0; scan = regnext(preg, scan)) {
295 if (OP(preg, scan) == EXACTLY) {
296 int plen = str_int_len(preg->program + OPERAND(scan));
297 if (plen >= len) {
298 longest = OPERAND(scan);
299 len = plen;
303 preg->regmust = longest;
304 preg->regmlen = len;
308 #ifdef DEBUG
309 regdump(preg);
310 #endif
312 return 0;
316 - reg - regular expression, i.e. main body or parenthesized thing
318 * Caller must absorb opening parenthesis.
320 * Combining parenthesis handling with the base level of regular expression
321 * is a trifle forced, but the need to tie the tails of the branches to what
322 * follows makes it hard to avoid.
324 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
326 int ret;
327 int br;
328 int ender;
329 int parno = 0;
330 int flags;
332 *flagp = HASWIDTH; /* Tentatively. */
334 /* Make an OPEN node, if parenthesized. */
335 if (paren) {
336 if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
337 /* non-capturing paren */
338 preg->regparse += 2;
339 parno = -1;
341 else {
342 parno = ++preg->re_nsub;
344 ret = regnode(preg, OPEN+parno);
345 } else
346 ret = 0;
348 /* Pick up the branches, linking them together. */
349 br = regbranch(preg, &flags);
350 if (br == 0)
351 return 0;
352 if (ret != 0)
353 regtail(preg, ret, br); /* OPEN -> first. */
354 else
355 ret = br;
356 if (!(flags&HASWIDTH))
357 *flagp &= ~HASWIDTH;
358 *flagp |= flags&SPSTART;
359 while (*preg->regparse == '|') {
360 preg->regparse++;
361 br = regbranch(preg, &flags);
362 if (br == 0)
363 return 0;
364 regtail(preg, ret, br); /* BRANCH -> BRANCH. */
365 if (!(flags&HASWIDTH))
366 *flagp &= ~HASWIDTH;
367 *flagp |= flags&SPSTART;
370 /* Make a closing node, and hook it on the end. */
371 ender = regnode(preg, (paren) ? CLOSE+parno : END);
372 regtail(preg, ret, ender);
374 /* Hook the tails of the branches to the closing node. */
375 for (br = ret; br != 0; br = regnext(preg, br))
376 regoptail(preg, br, ender);
378 /* Check for proper termination. */
379 if (paren && *preg->regparse++ != ')') {
380 preg->err = REG_ERR_UNMATCHED_PAREN;
381 return 0;
382 } else if (!paren && *preg->regparse != '\0') {
383 if (*preg->regparse == ')') {
384 preg->err = REG_ERR_UNMATCHED_PAREN;
385 return 0;
386 } else {
387 preg->err = REG_ERR_JUNK_ON_END;
388 return 0;
392 return(ret);
396 - regbranch - one alternative of an | operator
398 * Implements the concatenation operator.
400 static int regbranch(regex_t *preg, int *flagp )
402 int ret;
403 int chain;
404 int latest;
405 int flags;
407 *flagp = WORST; /* Tentatively. */
409 ret = regnode(preg, BRANCH);
410 chain = 0;
411 while (*preg->regparse != '\0' && *preg->regparse != ')' &&
412 *preg->regparse != '|') {
413 latest = regpiece(preg, &flags);
414 if (latest == 0)
415 return 0;
416 *flagp |= flags&HASWIDTH;
417 if (chain == 0) {/* First piece. */
418 *flagp |= flags&SPSTART;
420 else {
421 regtail(preg, chain, latest);
423 chain = latest;
425 if (chain == 0) /* Loop ran zero times. */
426 (void) regnode(preg, NOTHING);
428 return(ret);
432 - regpiece - something followed by possible [*+?]
434 * Note that the branching code sequences used for ? and the general cases
435 * of * and + are somewhat optimized: they use the same NOTHING node as
436 * both the endmarker for their branch list and the body of the last branch.
437 * It might seem that this node could be dispensed with entirely, but the
438 * endmarker role is not redundant.
440 static int regpiece(regex_t *preg, int *flagp)
442 int ret;
443 char op;
444 int next;
445 int flags;
446 int chain = 0;
447 int min;
448 int max;
450 ret = regatom(preg, &flags);
451 if (ret == 0)
452 return 0;
454 op = *preg->regparse;
455 if (!ISMULT(op)) {
456 *flagp = flags;
457 return(ret);
460 if (!(flags&HASWIDTH) && op != '?') {
461 preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
462 return 0;
465 /* Handle braces (counted repetition) by expansion */
466 if (op == '{') {
467 char *end;
469 min = strtoul(preg->regparse + 1, &end, 10);
470 if (end == preg->regparse + 1) {
471 preg->err = REG_ERR_BAD_COUNT;
472 return 0;
474 if (*end == '}') {
475 max = min;
477 else {
478 preg->regparse = end;
479 max = strtoul(preg->regparse + 1, &end, 10);
480 if (*end != '}') {
481 preg->err = REG_ERR_UNMATCHED_BRACES;
482 return 0;
485 if (end == preg->regparse + 1) {
486 max = MAX_REP_COUNT;
488 else if (max < min || max >= 100) {
489 preg->err = REG_ERR_BAD_COUNT;
490 return 0;
492 if (min >= 100) {
493 preg->err = REG_ERR_BAD_COUNT;
494 return 0;
497 preg->regparse = strchr(preg->regparse, '}');
499 else {
500 min = (op == '+');
501 max = (op == '?' ? 1 : MAX_REP_COUNT);
504 if (preg->regparse[1] == '?') {
505 preg->regparse++;
506 next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
508 else {
509 next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
511 preg->program[ret + 2] = max;
512 preg->program[ret + 3] = min;
513 preg->program[ret + 4] = 0;
515 *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
517 if (!(flags & SIMPLE)) {
518 int back = regnode(preg, BACK);
519 regtail(preg, back, ret);
520 regtail(preg, next, back);
523 preg->regparse++;
524 if (ISMULT(*preg->regparse)) {
525 preg->err = REG_ERR_NESTED_COUNT;
526 return 0;
529 return chain ? chain : ret;
533 * Add all characters in the inclusive range between lower and upper.
535 * Handles a swapped range (upper < lower).
537 static void reg_addrange(regex_t *preg, int lower, int upper)
539 if (lower > upper) {
540 reg_addrange(preg, upper, lower);
542 /* Add a range as length, start */
543 regc(preg, upper - lower + 1);
544 regc(preg, lower);
548 * Add a null-terminated literal string as a set of ranges.
550 static void reg_addrange_str(regex_t *preg, const char *str)
552 while (*str) {
553 reg_addrange(preg, *str, *str);
554 str++;
559 * Extracts the next unicode char from utf8.
561 * If 'upper' is set, converts the char to uppercase.
563 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
565 int l = utf8_tounicode(s, uc);
566 if (upper) {
567 *uc = utf8_upper(*uc);
569 return l;
573 * Converts a hex digit to decimal.
575 * Returns -1 for an invalid hex digit.
577 static int hexdigitval(int c)
579 if (c >= '0' && c <= '9')
580 return c - '0';
581 if (c >= 'a' && c <= 'f')
582 return c - 'a' + 10;
583 if (c >= 'A' && c <= 'F')
584 return c - 'A' + 10;
585 return -1;
589 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
591 * Returns the number of hex digits parsed.
592 * If there are no hex digits, returns 0 and stores nothing.
594 static int parse_hex(const char *s, int n, int *uc)
596 int val = 0;
597 int k;
599 for (k = 0; k < n; k++) {
600 int c = hexdigitval(*s++);
601 if (c == -1) {
602 break;
604 val = (val << 4) | c;
606 if (k) {
607 *uc = val;
609 return k;
613 * Call for chars after a backlash to decode the escape sequence.
615 * Stores the result in *ch.
617 * Returns the number of bytes consumed.
619 static int reg_decode_escape(const char *s, int *ch)
621 int n;
622 const char *s0 = s;
624 *ch = *s++;
626 switch (*ch) {
627 case 'b': *ch = '\b'; break;
628 case 'e': *ch = 27; break;
629 case 'f': *ch = '\f'; break;
630 case 'n': *ch = '\n'; break;
631 case 'r': *ch = '\r'; break;
632 case 't': *ch = '\t'; break;
633 case 'v': *ch = '\v'; break;
634 case 'u':
635 if (*s == '{') {
636 /* Expect \u{NNNN} */
637 n = parse_hex(s + 1, 6, ch);
638 if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
639 s += n + 2;
641 else {
642 /* Invalid, so just treat as an escaped 'u' */
643 *ch = 'u';
646 else if ((n = parse_hex(s, 4, ch)) > 0) {
647 s += n;
649 break;
650 case 'U':
651 if ((n = parse_hex(s, 8, ch)) > 0) {
652 s += n;
654 case 'x':
655 if ((n = parse_hex(s, 2, ch)) > 0) {
656 s += n;
658 break;
659 case '\0':
660 s--;
661 *ch = '\\';
662 break;
664 return s - s0;
668 - regatom - the lowest level
670 * Optimization: gobbles an entire sequence of ordinary characters so that
671 * it can turn them into a single node, which is smaller to store and
672 * faster to run. Backslashed characters are exceptions, each becoming a
673 * separate node; the code is simpler that way and it's not worth fixing.
675 static int regatom(regex_t *preg, int *flagp)
677 int ret;
678 int flags;
679 int nocase = (preg->cflags & REG_ICASE);
681 int ch;
682 int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
684 *flagp = WORST; /* Tentatively. */
686 preg->regparse += n;
687 switch (ch) {
688 /* FIXME: these chars only have meaning at beg/end of pat? */
689 case '^':
690 ret = regnode(preg, BOL);
691 break;
692 case '$':
693 ret = regnode(preg, EOL);
694 break;
695 case '.':
696 ret = regnode(preg, ANY);
697 *flagp |= HASWIDTH|SIMPLE;
698 break;
699 case '[': {
700 const char *pattern = preg->regparse;
702 if (*pattern == '^') { /* Complement of range. */
703 ret = regnode(preg, ANYBUT);
704 pattern++;
705 } else
706 ret = regnode(preg, ANYOF);
708 /* Special case. If the first char is ']' or '-', it is part of the set */
709 if (*pattern == ']' || *pattern == '-') {
710 reg_addrange(preg, *pattern, *pattern);
711 pattern++;
714 while (*pattern && *pattern != ']') {
715 /* Is this a range? a-z */
716 int start;
717 int end;
719 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
720 if (start == '\\') {
721 pattern += reg_decode_escape(pattern, &start);
722 if (start == 0) {
723 preg->err = REG_ERR_NULL_CHAR;
724 return 0;
727 if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
728 /* skip '-' */
729 pattern += utf8_tounicode(pattern, &end);
730 pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
731 if (end == '\\') {
732 pattern += reg_decode_escape(pattern, &end);
733 if (end == 0) {
734 preg->err = REG_ERR_NULL_CHAR;
735 return 0;
739 reg_addrange(preg, start, end);
740 continue;
742 if (start == '[') {
743 if (strncmp(pattern, ":alpha:]", 8) == 0) {
744 if ((preg->cflags & REG_ICASE) == 0) {
745 reg_addrange(preg, 'a', 'z');
747 reg_addrange(preg, 'A', 'Z');
748 pattern += 8;
749 continue;
751 if (strncmp(pattern, ":alnum:]", 8) == 0) {
752 if ((preg->cflags & REG_ICASE) == 0) {
753 reg_addrange(preg, 'a', 'z');
755 reg_addrange(preg, 'A', 'Z');
756 reg_addrange(preg, '0', '9');
757 pattern += 8;
758 continue;
760 if (strncmp(pattern, ":space:]", 8) == 0) {
761 reg_addrange_str(preg, " \t\r\n\f\v");
762 pattern += 8;
763 continue;
766 /* Not a range, so just add the char */
767 reg_addrange(preg, start, start);
769 regc(preg, '\0');
771 if (*pattern) {
772 pattern++;
774 preg->regparse = pattern;
776 *flagp |= HASWIDTH|SIMPLE;
778 break;
779 case '(':
780 ret = reg(preg, 1, &flags);
781 if (ret == 0)
782 return 0;
783 *flagp |= flags&(HASWIDTH|SPSTART);
784 break;
785 case '\0':
786 case '|':
787 case ')':
788 preg->err = REG_ERR_INTERNAL;
789 return 0; /* Supposed to be caught earlier. */
790 case '?':
791 case '+':
792 case '*':
793 case '{':
794 preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
795 return 0;
796 case '\\':
797 switch (*preg->regparse++) {
798 case '\0':
799 preg->err = REG_ERR_TRAILING_BACKSLASH;
800 return 0;
801 case '<':
802 case 'm':
803 ret = regnode(preg, WORDA);
804 break;
805 case '>':
806 case 'M':
807 ret = regnode(preg, WORDZ);
808 break;
809 case 'd':
810 ret = regnode(preg, ANYOF);
811 reg_addrange(preg, '0', '9');
812 regc(preg, '\0');
813 *flagp |= HASWIDTH|SIMPLE;
814 break;
815 case 'w':
816 ret = regnode(preg, ANYOF);
817 if ((preg->cflags & REG_ICASE) == 0) {
818 reg_addrange(preg, 'a', 'z');
820 reg_addrange(preg, 'A', 'Z');
821 reg_addrange(preg, '0', '9');
822 reg_addrange(preg, '_', '_');
823 regc(preg, '\0');
824 *flagp |= HASWIDTH|SIMPLE;
825 break;
826 case 's':
827 ret = regnode(preg, ANYOF);
828 reg_addrange_str(preg," \t\r\n\f\v");
829 regc(preg, '\0');
830 *flagp |= HASWIDTH|SIMPLE;
831 break;
832 /* FIXME: Someday handle \1, \2, ... */
833 default:
834 /* Handle general quoted chars in exact-match routine */
835 /* Back up to include the backslash */
836 preg->regparse--;
837 goto de_fault;
839 break;
840 de_fault:
841 default: {
843 * Encode a string of characters to be matched exactly.
845 int added = 0;
847 /* Back up to pick up the first char of interest */
848 preg->regparse -= n;
850 ret = regnode(preg, EXACTLY);
852 /* Note that a META operator such as ? or * consumes the
853 * preceding char.
854 * Thus we must be careful to look ahead by 2 and add the
855 * last char as it's own EXACTLY if necessary
858 /* Until end of string or a META char is reached */
859 while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
860 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
861 if (ch == '\\' && preg->regparse[n]) {
862 /* Non-trailing backslash.
863 * Is this a special escape, or a regular escape?
865 if (strchr("<>mMwds", preg->regparse[n])) {
866 /* A special escape. All done with EXACTLY */
867 break;
869 /* Decode it. Note that we add the length for the escape
870 * sequence to the length for the backlash so we can skip
871 * the entire sequence, or not as required.
873 n += reg_decode_escape(preg->regparse + n, &ch);
874 if (ch == 0) {
875 preg->err = REG_ERR_NULL_CHAR;
876 return 0;
880 /* Now we have one char 'ch' of length 'n'.
881 * Check to see if the following char is a MULT
884 if (ISMULT(preg->regparse[n])) {
885 /* Yes. But do we already have some EXACTLY chars? */
886 if (added) {
887 /* Yes, so return what we have and pick up the current char next time around */
888 break;
890 /* No, so add this single char and finish */
891 regc(preg, ch);
892 added++;
893 preg->regparse += n;
894 break;
897 /* No, so just add this char normally */
898 regc(preg, ch);
899 added++;
900 preg->regparse += n;
902 regc(preg, '\0');
904 *flagp |= HASWIDTH;
905 if (added == 1)
906 *flagp |= SIMPLE;
907 break;
909 break;
912 return(ret);
915 static void reg_grow(regex_t *preg, int n)
917 if (preg->p + n >= preg->proglen) {
918 preg->proglen = (preg->p + n) * 2;
919 preg->program = realloc(preg->program, preg->proglen * sizeof(int));
924 - regnode - emit a node
926 /* Location. */
927 static int regnode(regex_t *preg, int op)
929 reg_grow(preg, 2);
931 preg->program[preg->p++] = op;
932 preg->program[preg->p++] = 0;
934 /* Return the start of the node */
935 return preg->p - 2;
939 - regc - emit (if appropriate) a byte of code
941 static void regc(regex_t *preg, int b )
943 reg_grow(preg, 1);
944 preg->program[preg->p++] = b;
948 - reginsert - insert an operator in front of already-emitted operand
950 * Means relocating the operand.
951 * Returns the new location of the original operand.
953 static int reginsert(regex_t *preg, int op, int size, int opnd )
955 reg_grow(preg, size);
957 /* Move everything from opnd up */
958 memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd));
959 /* Zero out the new space */
960 memset(preg->program + opnd, 0, sizeof(int) * size);
962 preg->program[opnd] = op;
964 preg->p += size;
966 return opnd + size;
970 - regtail - set the next-pointer at the end of a node chain
972 static void regtail_(regex_t *preg, int p, int val, int line )
974 int scan;
975 int temp;
976 int offset;
978 /* Find last node. */
979 scan = p;
980 for (;;) {
981 temp = regnext(preg, scan);
982 if (temp == 0)
983 break;
984 scan = temp;
987 if (OP(preg, scan) == BACK)
988 offset = scan - val;
989 else
990 offset = val - scan;
992 preg->program[scan + 1] = offset;
996 - regoptail - regtail on operand of first argument; nop if operandless
999 static void regoptail(regex_t *preg, int p, int val )
1001 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1002 if (p != 0 && OP(preg, p) == BRANCH) {
1003 regtail(preg, OPERAND(p), val);
1008 * regexec and friends
1012 * Forwards.
1014 static int regtry(regex_t *preg, const char *string );
1015 static int regmatch(regex_t *preg, int prog);
1016 static int regrepeat(regex_t *preg, int p, int max);
1019 - regexec - match a regexp against a string
1021 int regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1023 const char *s;
1024 int scan;
1026 /* Be paranoid... */
1027 if (preg == NULL || preg->program == NULL || string == NULL) {
1028 return REG_ERR_NULL_ARGUMENT;
1031 /* Check validity of program. */
1032 if (*preg->program != REG_MAGIC) {
1033 return REG_ERR_CORRUPTED;
1036 #ifdef DEBUG
1037 fprintf(stderr, "regexec: %s\n", string);
1038 regdump(preg);
1039 #endif
1041 preg->eflags = eflags;
1042 preg->pmatch = pmatch;
1043 preg->nmatch = nmatch;
1044 preg->start = string; /* All offsets are computed from here */
1046 /* Must clear out the embedded repeat counts */
1047 for (scan = OPERAND(1); scan != 0; ) {
1048 switch (OP(preg, scan)) {
1049 case REP:
1050 case REPMIN:
1051 case REPX:
1052 case REPXMIN:
1053 preg->program[scan + 4] = 0;
1054 scan += 5;
1055 break;
1057 case ANYOF:
1058 case ANYBUT:
1059 case EXACTLY:
1060 scan += 2;
1061 while (preg->program[scan++]) {
1063 break;
1065 case END:
1066 scan = 0;
1067 break;
1069 default:
1070 scan += 2;
1071 break;
1075 /* If there is a "must appear" string, look for it. */
1076 if (preg->regmust != 0) {
1077 s = string;
1078 while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
1079 if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1080 break;
1082 s++;
1084 if (s == NULL) /* Not present. */
1085 return REG_NOMATCH;
1088 /* Mark beginning of line for ^ . */
1089 preg->regbol = string;
1091 /* Simplest case: anchored match need be tried only once (maybe per line). */
1092 if (preg->reganch) {
1093 if (eflags & REG_NOTBOL) {
1094 /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1095 goto nextline;
1097 while (1) {
1098 if (regtry(preg, string)) {
1099 return REG_NOERROR;
1101 if (*string) {
1102 nextline:
1103 if (preg->cflags & REG_NEWLINE) {
1104 /* Try the next anchor? */
1105 string = strchr(string, '\n');
1106 if (string) {
1107 preg->regbol = ++string;
1108 continue;
1112 return REG_NOMATCH;
1116 /* Messy cases: unanchored match. */
1117 s = string;
1118 if (preg->regstart != '\0') {
1119 /* We know what char it must start with. */
1120 while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1121 if (regtry(preg, s))
1122 return REG_NOERROR;
1123 s++;
1126 else
1127 /* We don't -- general case. */
1128 while (1) {
1129 if (regtry(preg, s))
1130 return REG_NOERROR;
1131 if (*s == '\0') {
1132 break;
1134 else {
1135 int c;
1136 s += utf8_tounicode(s, &c);
1140 /* Failure. */
1141 return REG_NOMATCH;
1145 - regtry - try match at specific point
1147 /* 0 failure, 1 success */
1148 static int regtry( regex_t *preg, const char *string )
1150 int i;
1152 preg->reginput = string;
1154 for (i = 0; i < preg->nmatch; i++) {
1155 preg->pmatch[i].rm_so = -1;
1156 preg->pmatch[i].rm_eo = -1;
1158 if (regmatch(preg, 1)) {
1159 preg->pmatch[0].rm_so = string - preg->start;
1160 preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1161 return(1);
1162 } else
1163 return(0);
1167 * Returns bytes matched if 'pattern' is a prefix of 'string'.
1169 * If 'nocase' is non-zero, does a case-insensitive match.
1171 * Returns -1 on not found.
1173 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1175 const char *s = string;
1176 while (proglen && *s) {
1177 int ch;
1178 int n = reg_utf8_tounicode_case(s, &ch, nocase);
1179 if (ch != *prog) {
1180 return -1;
1182 prog++;
1183 s += n;
1184 proglen--;
1186 if (proglen == 0) {
1187 return s - string;
1189 return -1;
1193 * Searchs for 'c' in the range 'range'.
1195 * Returns 1 if found, or 0 if not.
1197 static int reg_range_find(const int *range, int c)
1199 while (*range) {
1200 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1201 if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1202 return 1;
1204 range += 2;
1206 return 0;
1210 * Search for the character 'c' in the utf-8 string 'string'.
1212 * If 'nocase' is set, the 'string' is assumed to be uppercase
1213 * and 'c' is converted to uppercase before matching.
1215 * Returns the byte position in the string where the 'c' was found, or
1216 * NULL if not found.
1218 static const char *str_find(const char *string, int c, int nocase)
1220 if (nocase) {
1221 /* The "string" should already be converted to uppercase */
1222 c = utf8_upper(c);
1224 while (*string) {
1225 int ch;
1226 int n = reg_utf8_tounicode_case(string, &ch, nocase);
1227 if (c == ch) {
1228 return string;
1230 string += n;
1232 return NULL;
1236 * Returns true if 'ch' is an end-of-line char.
1238 * In REG_NEWLINE mode, \n is considered EOL in
1239 * addition to \0
1241 static int reg_iseol(regex_t *preg, int ch)
1243 if (preg->cflags & REG_NEWLINE) {
1244 return ch == '\0' || ch == '\n';
1246 else {
1247 return ch == '\0';
1251 static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
1253 int nextch = '\0';
1254 const char *save;
1255 int no;
1256 int c;
1258 int max = preg->program[scan + 2];
1259 int min = preg->program[scan + 3];
1260 int next = regnext(preg, scan);
1263 * Lookahead to avoid useless match attempts
1264 * when we know what character comes next.
1266 if (OP(preg, next) == EXACTLY) {
1267 nextch = preg->program[OPERAND(next)];
1269 save = preg->reginput;
1270 no = regrepeat(preg, scan + 5, max);
1271 if (no < min) {
1272 return 0;
1274 if (matchmin) {
1275 /* from min up to no */
1276 max = no;
1277 no = min;
1279 /* else from no down to min */
1280 while (1) {
1281 if (matchmin) {
1282 if (no > max) {
1283 break;
1286 else {
1287 if (no < min) {
1288 break;
1291 preg->reginput = save + utf8_index(save, no);
1292 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1293 /* If it could work, try it. */
1294 if (reg_iseol(preg, nextch) || c == nextch) {
1295 if (regmatch(preg, next)) {
1296 return(1);
1299 if (matchmin) {
1300 /* Couldn't or didn't, add one more */
1301 no++;
1303 else {
1304 /* Couldn't or didn't -- back up. */
1305 no--;
1308 return(0);
1311 static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
1313 int *scanpt = preg->program + scan;
1315 int max = scanpt[2];
1316 int min = scanpt[3];
1318 /* Have we reached min? */
1319 if (scanpt[4] < min) {
1320 /* No, so get another one */
1321 scanpt[4]++;
1322 if (regmatch(preg, scan + 5)) {
1323 return 1;
1325 scanpt[4]--;
1326 return 0;
1328 if (scanpt[4] > max) {
1329 return 0;
1332 if (matchmin) {
1333 /* minimal, so try other branch first */
1334 if (regmatch(preg, regnext(preg, scan))) {
1335 return 1;
1337 /* No, so try one more */
1338 scanpt[4]++;
1339 if (regmatch(preg, scan + 5)) {
1340 return 1;
1342 scanpt[4]--;
1343 return 0;
1345 /* maximal, so try this branch again */
1346 if (scanpt[4] < max) {
1347 scanpt[4]++;
1348 if (regmatch(preg, scan + 5)) {
1349 return 1;
1351 scanpt[4]--;
1353 /* At this point we are at max with no match. Try the other branch */
1354 return regmatch(preg, regnext(preg, scan));
1358 - regmatch - main matching routine
1360 * Conceptually the strategy is simple: check to see whether the current
1361 * node matches, call self recursively to see whether the rest matches,
1362 * and then act accordingly. In practice we make some effort to avoid
1363 * recursion, in particular by going through "ordinary" nodes (that don't
1364 * need to know whether the rest of the match failed) by a loop instead of
1365 * by recursion.
1367 /* 0 failure, 1 success */
1368 static int regmatch(regex_t *preg, int prog)
1370 int scan; /* Current node. */
1371 int next; /* Next node. */
1373 scan = prog;
1375 #ifdef DEBUG
1376 if (scan != 0 && regnarrate)
1377 fprintf(stderr, "%s(\n", regprop(scan));
1378 #endif
1379 while (scan != 0) {
1380 int n;
1381 int c;
1382 #ifdef DEBUG
1383 if (regnarrate) {
1384 fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */
1386 #endif
1387 next = regnext(preg, scan);
1388 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1390 switch (OP(preg, scan)) {
1391 case BOL:
1392 if (preg->reginput != preg->regbol)
1393 return(0);
1394 break;
1395 case EOL:
1396 if (!reg_iseol(preg, c)) {
1397 return(0);
1399 break;
1400 case WORDA:
1401 /* Must be looking at a letter, digit, or _ */
1402 if ((!isalnum(UCHAR(c))) && c != '_')
1403 return(0);
1404 /* Prev must be BOL or nonword */
1405 if (preg->reginput > preg->regbol &&
1406 (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1407 return(0);
1408 break;
1409 case WORDZ:
1410 /* Can't match at BOL */
1411 if (preg->reginput > preg->regbol) {
1412 /* Current must be EOL or nonword */
1413 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1414 c = preg->reginput[-1];
1415 /* Previous must be word */
1416 if (isalnum(UCHAR(c)) || c == '_') {
1417 break;
1421 /* No */
1422 return(0);
1424 case ANY:
1425 if (reg_iseol(preg, c))
1426 return 0;
1427 preg->reginput += n;
1428 break;
1429 case EXACTLY: {
1430 int opnd;
1431 int len;
1432 int slen;
1434 opnd = OPERAND(scan);
1435 len = str_int_len(preg->program + opnd);
1437 slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1438 if (slen < 0) {
1439 return(0);
1441 preg->reginput += slen;
1443 break;
1444 case ANYOF:
1445 if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
1446 return(0);
1448 preg->reginput += n;
1449 break;
1450 case ANYBUT:
1451 if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
1452 return(0);
1454 preg->reginput += n;
1455 break;
1456 case NOTHING:
1457 break;
1458 case BACK:
1459 break;
1460 case BRANCH: {
1461 const char *save;
1463 if (OP(preg, next) != BRANCH) /* No choice. */
1464 next = OPERAND(scan); /* Avoid recursion. */
1465 else {
1466 do {
1467 save = preg->reginput;
1468 if (regmatch(preg, OPERAND(scan))) {
1469 return(1);
1471 preg->reginput = save;
1472 scan = regnext(preg, scan);
1473 } while (scan != 0 && OP(preg, scan) == BRANCH);
1474 return(0);
1475 /* NOTREACHED */
1478 break;
1479 case REP:
1480 case REPMIN:
1481 return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1483 case REPX:
1484 case REPXMIN:
1485 return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1487 case END:
1488 return(1); /* Success! */
1489 break;
1491 case OPENNC:
1492 case CLOSENC:
1493 if (regmatch(preg, next)) {
1494 return 1;
1496 return 0;
1498 default:
1499 if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
1500 const char *save;
1502 save = preg->reginput;
1504 if (regmatch(preg, next)) {
1505 int no;
1507 * Don't set startp if some later
1508 * invocation of the same parentheses
1509 * already has.
1511 if (OP(preg, scan) < CLOSE) {
1512 no = OP(preg, scan) - OPEN;
1513 if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
1514 preg->pmatch[no].rm_so = save - preg->start;
1517 else {
1518 no = OP(preg, scan) - CLOSE;
1519 if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
1520 preg->pmatch[no].rm_eo = save - preg->start;
1523 return(1);
1524 } else
1525 return(0);
1527 return REG_ERR_INTERNAL;
1530 scan = next;
1534 * We get here only if there's trouble -- normally "case END" is
1535 * the terminating point.
1537 return REG_ERR_INTERNAL;
1541 - regrepeat - repeatedly match something simple, report how many
1543 static int regrepeat(regex_t *preg, int p, int max)
1545 int count = 0;
1546 const char *scan;
1547 int opnd;
1548 int ch;
1549 int n;
1551 scan = preg->reginput;
1552 opnd = OPERAND(p);
1553 switch (OP(preg, p)) {
1554 case ANY:
1555 /* No need to handle utf8 specially here */
1556 while (!reg_iseol(preg, *scan) && count < max) {
1557 count++;
1558 scan++;
1560 break;
1561 case EXACTLY:
1562 while (count < max) {
1563 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1564 if (preg->program[opnd] != ch) {
1565 break;
1567 count++;
1568 scan += n;
1570 break;
1571 case ANYOF:
1572 while (count < max) {
1573 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1574 if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) {
1575 break;
1577 count++;
1578 scan += n;
1580 break;
1581 case ANYBUT:
1582 while (count < max) {
1583 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1584 if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) {
1585 break;
1587 count++;
1588 scan += n;
1590 break;
1591 default: /* Oh dear. Called inappropriately. */
1592 preg->err = REG_ERR_INTERNAL;
1593 count = 0; /* Best compromise. */
1594 break;
1596 preg->reginput = scan;
1598 return(count);
1602 - regnext - dig the "next" pointer out of a node
1604 static int regnext(regex_t *preg, int p )
1606 int offset;
1608 offset = NEXT(preg, p);
1610 if (offset == 0)
1611 return 0;
1613 if (OP(preg, p) == BACK)
1614 return(p-offset);
1615 else
1616 return(p+offset);
1619 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1622 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1624 static void regdump(regex_t *preg)
1626 int s;
1627 int op = EXACTLY; /* Arbitrary non-END op. */
1628 int next;
1629 char buf[MAX_UTF8_LEN + 1];
1631 int i;
1632 for (i = 1; i < preg->p; i++) {
1633 printf("%02x ", (unsigned char)preg->program[i]);
1634 if (i % 16 == 0) {
1635 printf("\n");
1638 printf("\n");
1640 s = 1;
1641 while (op != END && s < preg->p) { /* While that wasn't END last time... */
1642 op = OP(preg, s);
1643 printf("%3d: %s", s, regprop(op)); /* Where, what. */
1644 next = regnext(preg, s);
1645 if (next == 0) /* Next ptr. */
1646 printf("(0)");
1647 else
1648 printf("(%d)", next);
1649 s += 2;
1650 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1651 int max = preg->program[s];
1652 int min = preg->program[s + 1];
1653 if (max == 65535) {
1654 printf("{%d,*}", min);
1656 else {
1657 printf("{%d,%d}", min, max);
1659 printf(" %d", preg->program[s + 2]);
1660 s += 3;
1662 else if (op == ANYOF || op == ANYBUT) {
1663 /* set of ranges */
1665 while (preg->program[s]) {
1666 int len = preg->program[s++];
1667 int first = preg->program[s++];
1668 buf[utf8_getchars(buf, first)] = 0;
1669 printf("%s", buf);
1670 if (len > 1) {
1671 buf[utf8_getchars(buf, first + len - 1)] = 0;
1672 printf("-%s", buf);
1675 s++;
1677 else if (op == EXACTLY) {
1678 /* Literal string, where present. */
1680 while (preg->program[s]) {
1681 buf[utf8_getchars(buf, preg->program[s])] = 0;
1682 printf("%s", buf);
1683 s++;
1685 s++;
1687 putchar('\n');
1690 if (op == END) {
1691 /* Header fields of interest. */
1692 if (preg->regstart) {
1693 buf[utf8_getchars(buf, preg->regstart)] = 0;
1694 printf("start '%s' ", buf);
1696 if (preg->reganch)
1697 printf("anchored ");
1698 if (preg->regmust != 0) {
1699 int i;
1700 printf("must have:");
1701 for (i = 0; i < preg->regmlen; i++) {
1702 putchar(preg->program[preg->regmust + i]);
1704 putchar('\n');
1707 printf("\n");
1711 - regprop - printable representation of opcode
1713 static const char *regprop( int op )
1715 static char buf[50];
1717 switch (op) {
1718 case BOL:
1719 return "BOL";
1720 case EOL:
1721 return "EOL";
1722 case ANY:
1723 return "ANY";
1724 case ANYOF:
1725 return "ANYOF";
1726 case ANYBUT:
1727 return "ANYBUT";
1728 case BRANCH:
1729 return "BRANCH";
1730 case EXACTLY:
1731 return "EXACTLY";
1732 case NOTHING:
1733 return "NOTHING";
1734 case BACK:
1735 return "BACK";
1736 case END:
1737 return "END";
1738 case REP:
1739 return "REP";
1740 case REPMIN:
1741 return "REPMIN";
1742 case REPX:
1743 return "REPX";
1744 case REPXMIN:
1745 return "REPXMIN";
1746 case WORDA:
1747 return "WORDA";
1748 case WORDZ:
1749 return "WORDZ";
1750 case OPENNC:
1751 return "OPEN";
1752 case CLOSENC:
1753 return "CLOSE";
1754 default:
1755 if (op >= OPEN && op < CLOSE) {
1756 snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1758 else if (op >= CLOSE && op < CLOSE_END) {
1759 snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1761 else {
1762 snprintf(buf, sizeof(buf), "?%d?\n", op);
1764 return(buf);
1767 #endif /* JIM_BOOTSTRAP */
1769 size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
1771 static const char *error_strings[] = {
1772 "success",
1773 "no match",
1774 "bad pattern",
1775 "null argument",
1776 "unknown error",
1777 "too big",
1778 "out of memory",
1779 "too many ()",
1780 "parentheses () not balanced",
1781 "braces {} not balanced",
1782 "invalid repetition count(s)",
1783 "extra characters",
1784 "*+ of empty atom",
1785 "nested count",
1786 "internal error",
1787 "count follows nothing",
1788 "trailing backslash",
1789 "corrupted program",
1790 "contains null char",
1792 const char *err;
1794 if (errcode < 0 || errcode >= REG_ERR_NUM) {
1795 err = "Bad error code";
1797 else {
1798 err = error_strings[errcode];
1801 return snprintf(errbuf, errbuf_size, "%s", err);
1804 void regfree(regex_t *preg)
1806 free(preg->program);
1809 #endif