auto.def: tclprefix should not be enabled by default
[jimtcl.git] / jimregexp.c
blobcf315589b2fb3189e9809a5f3f3039c483d5c0da
1 /*
2 * vi:se ts=8:
4 * regcomp and regexec -- regsub and regerror are elsewhere
6 * Copyright (c) 1986 by University of Toronto.
7 * Written by Henry Spencer. Not derived from licensed software.
9 * Permission is granted to anyone to use this software for any
10 * purpose on any computer system, and to redistribute it freely,
11 * subject to the following restrictions:
13 * 1. The author is not responsible for the consequences of use of
14 * this software, no matter how awful, even if they arise
15 * from defects in it.
17 * 2. The origin of this software must not be misrepresented, either
18 * by explicit claim or by omission.
20 * 3. Altered versions must be plainly marked as such, and must not
21 * be misrepresented as being the original software.
22 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
23 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
24 *** to assist in implementing egrep.
25 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
26 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
27 *** as in BSD grep and ex.
28 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
29 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
30 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
31 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
32 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
33 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
34 *** was moved into regexp.h, and the include of regexp.h now uses "'s
35 *** to avoid conflicting with the system regexp.h. Const, bless its
36 *** soul, was removed so it can compile everywhere. The declaration
37 *** of strchr() was in conflict on AIX, so it was removed (as it is
38 *** happily defined in string.h).
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
41 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
42 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
44 * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
45 * on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
46 * This includes counted repetitions, UTF-8 support, character classes,
47 * shorthand character classes, increased number of parentheses to 100,
48 * backslash escape sequences. It also removes \n as an alternative to |.
50 * Beware that some of this code is subtly aware of the way operator
51 * precedence is structured in regular expressions. Serious changes in
52 * regular-expression syntax might require a total rethink.
55 #include "jimautoconf.h"
57 #if defined(JIM_REGEXP)
58 #include <stdio.h>
59 #include <ctype.h>
60 #include <stdlib.h>
61 #include <string.h>
63 #include "jim.h"
64 #include "jimregexp.h"
65 #include "utf8.h"
67 /* An arbitrary limit, but this seems enough. Must be less than 1000. */
68 #define REG_MAX_PAREN 100
71 * Structure for regexp "program". This is essentially a linear encoding
72 * of a nondeterministic finite-state machine (aka syntax charts or
73 * "railroad normal form" in parsing technology). Each node is an opcode
74 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
75 * all nodes except BRANCH implement concatenation; a "next" pointer with
76 * a BRANCH on both ends of it is connecting two alternatives. (Here we
77 * have one of the subtle syntax dependencies: an individual BRANCH (as
78 * opposed to a collection of them) is never concatenated with anything
79 * because of operator precedence.) The operand of some types of node is
80 * a literal string; for others, it is a node leading into a sub-FSM. In
81 * particular, the operand of a BRANCH node is the first node of the branch.
82 * (NB this is *not* a tree structure: the tail of the branch connects
83 * to the thing following the set of BRANCHes.) The opcodes are:
86 /* definition number opnd? meaning */
87 #define END 0 /* no End of program. */
88 #define BOL 1 /* no Match "" at beginning of line. */
89 #define EOL 2 /* no Match "" at end of line. */
90 #define ANY 3 /* no Match any one character. */
91 #define ANYOF 4 /* str Match any character in this string. */
92 #define ANYBUT 5 /* str Match any character not in this string. */
93 #define BRANCH 6 /* node Match this alternative, or the next... */
94 #define BACK 7 /* no Match "", "next" ptr points backward. */
95 #define EXACTLY 8 /* str Match this string. */
96 #define NOTHING 9 /* no Match empty string. */
97 #define REP 10 /* max,min Match this (simple) thing [min,max] times. */
98 #define REPMIN 11 /* max,min Match this (simple) thing [min,max] times, minimal match. */
99 #define REPX 12 /* max,min Match this (complex) thing [min,max] times. */
100 #define REPXMIN 13 /* max,min Match this (complex) thing [min,max] times, minimal match. */
101 #define BOLX 14 /* no Match "" at beginning of input. */
102 #define EOLX 15 /* no Match "" at end of input. */
103 #define WORDA 16 /* no Match "" at wordchar, where prev is nonword */
104 #define WORDZ 17 /* no Match "" at nonwordchar, where prev is word */
106 #define OPENNC 1000 /* no Non-capturing parentheses - must be OPEN-1 */
107 #define OPEN 1001 /* no Mark this point in input as start of #n. */
108 /* OPEN+1 is number 1, etc. */
110 /* must not be any other opts between OPEN and CLOSE */
112 #define CLOSENC 2000 /* no Non-capturing parentheses - must be CLOSE-1 */
113 #define CLOSE 2001 /* no Analogous to OPEN. */
114 #define CLOSE_END (CLOSE+REG_MAX_PAREN)
117 * The first word of the regexp internal "program" is actually this magic
118 * number; the start node begins in the second word.
120 #define REG_MAGIC 0xFADED00D
123 * Opcode notes:
125 * BRANCH The set of branches constituting a single choice are hooked
126 * together with their "next" pointers, since precedence prevents
127 * anything being concatenated to any individual branch. The
128 * "next" pointer of the last BRANCH in a choice points to the
129 * thing following the whole choice. This is also where the
130 * final "next" pointer of each individual branch points; each
131 * branch starts with the operand node of a BRANCH node.
133 * BACK Normal "next" pointers all implicitly point forward; BACK
134 * exists to make loop structures possible.
136 * REP,REPX Repeated matches ('?', '*', '+' and {min,max}) are implemented
137 * as either simple repeats (REP) or complex repeats (REPX).
138 * These opcodes include a "min" and "max" count after the opcode.
139 * This is followed by a fourth "current count" word that is
140 * only used by REPX, as it implements a recursive match.
141 * REPMIN and REPXMIN are identical except they implement minimal repeats.
143 * OPEN,CLOSE ...are numbered at compile time.
147 * A node is one word of opcode followed by one word of "next" pointer.
148 * The "next" pointer value is a positive offset from the opcode of the node
149 * containing it.
150 * An operand, if any, simply follows the node. (Note that much of the
151 * code generation knows about this implicit relationship.)
153 #define OP(preg, p) (preg->program[p])
154 #define NEXT(preg, p) (preg->program[p + 1])
155 #define OPERAND(p) ((p) + 2)
158 * See regmagic.h for one further detail of program structure.
163 * Utility definitions.
166 #define FAIL(R,M) { (R)->err = (M); return (M); }
167 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
168 #define META "^$.[()|?{+*"
171 * Flags to be passed up and down.
173 #define HASWIDTH 1 /* Known never to match null string. */
174 #define SIMPLE 2 /* Simple enough to be STAR/PLUS operand. */
175 #define SPSTART 4 /* Starts with * or +. */
176 #define WORST 0 /* Worst case. */
178 #define MAX_REP_COUNT 1000000
181 * Forward declarations for regcomp()'s friends.
183 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
184 static int regpiece(regex_t *preg, int *flagp );
185 static int regbranch(regex_t *preg, int *flagp );
186 static int regatom(regex_t *preg, int *flagp );
187 static int regnode(regex_t *preg, int op );
188 static int regnext(regex_t *preg, int p );
189 static void regc(regex_t *preg, int b );
190 static int reginsert(regex_t *preg, int op, int size, int opnd );
191 static void regtail(regex_t *preg, int p, int val);
192 static void regoptail(regex_t *preg, int p, int val );
193 static int regopsize(regex_t *preg, int p );
195 static int reg_range_find(const int *string, int c);
196 static const char *str_find(const char *string, int c, int nocase);
197 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
199 /*#define DEBUG*/
200 #ifdef DEBUG
201 static int regnarrate = 0;
202 static void regdump(regex_t *preg);
203 static const char *regprop( int op );
204 #endif
208 * Returns the length of the null-terminated integer sequence.
210 static int str_int_len(const int *seq)
212 int n = 0;
213 while (*seq++) {
214 n++;
216 return n;
220 - regcomp - compile a regular expression into internal code
222 * We can't allocate space until we know how big the compiled form will be,
223 * but we can't compile it (and thus know how big it is) until we've got a
224 * place to put the code. So we cheat: we compile it twice, once with code
225 * generation turned off and size counting turned on, and once "for real".
226 * This also means that we don't allocate space until we are sure that the
227 * thing really will compile successfully, and we never have to move the
228 * code and thus invalidate pointers into it. (Note that it has to be in
229 * one piece because free() must be able to free it all.)
231 * Beware that the optimization-preparation code in here knows about some
232 * of the structure of the compiled regexp.
234 int regcomp(regex_t *preg, const char *exp, int cflags)
236 int scan;
237 int longest;
238 unsigned len;
239 int flags;
241 #ifdef DEBUG
242 fprintf(stderr, "Compiling: '%s'\n", exp);
243 #endif
244 memset(preg, 0, sizeof(*preg));
246 if (exp == NULL)
247 FAIL(preg, REG_ERR_NULL_ARGUMENT);
249 /* First pass: determine size, legality. */
250 preg->cflags = cflags;
251 preg->regparse = exp;
253 /* Allocate space. */
254 preg->proglen = (strlen(exp) + 1) * 5;
255 preg->program = malloc(preg->proglen * sizeof(int));
256 if (preg->program == NULL)
257 FAIL(preg, REG_ERR_NOMEM);
259 /* Note that since we store a magic value as the first item in the program,
260 * program offsets will never be 0
262 regc(preg, REG_MAGIC);
263 if (reg(preg, 0, &flags) == 0) {
264 return preg->err;
267 /* Small enough for pointer-storage convention? */
268 if (preg->re_nsub >= REG_MAX_PAREN) /* Probably could be 65535L. */
269 FAIL(preg,REG_ERR_TOO_BIG);
271 /* Dig out information for optimizations. */
272 preg->regstart = 0; /* Worst-case defaults. */
273 preg->reganch = 0;
274 preg->regmust = 0;
275 preg->regmlen = 0;
276 scan = 1; /* First BRANCH. */
277 if (OP(preg, regnext(preg, scan)) == END) { /* Only one top-level choice. */
278 scan = OPERAND(scan);
280 /* Starting-point info. */
281 if (OP(preg, scan) == EXACTLY) {
282 preg->regstart = preg->program[OPERAND(scan)];
284 else if (OP(preg, scan) == BOL)
285 preg->reganch++;
288 * If there's something expensive in the r.e., find the
289 * longest literal string that must appear and make it the
290 * regmust. Resolve ties in favor of later strings, since
291 * the regstart check works with the beginning of the r.e.
292 * and avoiding duplication strengthens checking. Not a
293 * strong reason, but sufficient in the absence of others.
295 if (flags&SPSTART) {
296 longest = 0;
297 len = 0;
298 for (; scan != 0; scan = regnext(preg, scan)) {
299 if (OP(preg, scan) == EXACTLY) {
300 int plen = str_int_len(preg->program + OPERAND(scan));
301 if (plen >= len) {
302 longest = OPERAND(scan);
303 len = plen;
307 preg->regmust = longest;
308 preg->regmlen = len;
312 #ifdef DEBUG
313 regdump(preg);
314 #endif
316 return 0;
320 - reg - regular expression, i.e. main body or parenthesized thing
322 * Caller must absorb opening parenthesis.
324 * Combining parenthesis handling with the base level of regular expression
325 * is a trifle forced, but the need to tie the tails of the branches to what
326 * follows makes it hard to avoid.
328 static int reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
330 int ret;
331 int br;
332 int ender;
333 int parno = 0;
334 int flags;
336 *flagp = HASWIDTH; /* Tentatively. */
338 /* Make an OPEN node, if parenthesized. */
339 if (paren) {
340 if (preg->regparse[0] == '?' && preg->regparse[1] == ':') {
341 /* non-capturing paren */
342 preg->regparse += 2;
343 parno = -1;
345 else {
346 parno = ++preg->re_nsub;
348 ret = regnode(preg, OPEN+parno);
349 } else
350 ret = 0;
352 /* Pick up the branches, linking them together. */
353 br = regbranch(preg, &flags);
354 if (br == 0)
355 return 0;
356 if (ret != 0)
357 regtail(preg, ret, br); /* OPEN -> first. */
358 else
359 ret = br;
360 if (!(flags&HASWIDTH))
361 *flagp &= ~HASWIDTH;
362 *flagp |= flags&SPSTART;
363 while (*preg->regparse == '|') {
364 preg->regparse++;
365 br = regbranch(preg, &flags);
366 if (br == 0)
367 return 0;
368 regtail(preg, ret, br); /* BRANCH -> BRANCH. */
369 if (!(flags&HASWIDTH))
370 *flagp &= ~HASWIDTH;
371 *flagp |= flags&SPSTART;
374 /* Make a closing node, and hook it on the end. */
375 ender = regnode(preg, (paren) ? CLOSE+parno : END);
376 regtail(preg, ret, ender);
378 /* Hook the tails of the branches to the closing node. */
379 for (br = ret; br != 0; br = regnext(preg, br))
380 regoptail(preg, br, ender);
382 /* Check for proper termination. */
383 if (paren && *preg->regparse++ != ')') {
384 preg->err = REG_ERR_UNMATCHED_PAREN;
385 return 0;
386 } else if (!paren && *preg->regparse != '\0') {
387 if (*preg->regparse == ')') {
388 preg->err = REG_ERR_UNMATCHED_PAREN;
389 return 0;
390 } else {
391 preg->err = REG_ERR_JUNK_ON_END;
392 return 0;
396 return(ret);
400 - regbranch - one alternative of an | operator
402 * Implements the concatenation operator.
404 static int regbranch(regex_t *preg, int *flagp )
406 int ret;
407 int chain;
408 int latest;
409 int flags;
411 *flagp = WORST; /* Tentatively. */
413 ret = regnode(preg, BRANCH);
414 chain = 0;
415 while (*preg->regparse != '\0' && *preg->regparse != ')' &&
416 *preg->regparse != '|') {
417 latest = regpiece(preg, &flags);
418 if (latest == 0)
419 return 0;
420 *flagp |= flags&HASWIDTH;
421 if (chain == 0) {/* First piece. */
422 *flagp |= flags&SPSTART;
424 else {
425 regtail(preg, chain, latest);
427 chain = latest;
429 if (chain == 0) /* Loop ran zero times. */
430 (void) regnode(preg, NOTHING);
432 return(ret);
436 - regpiece - something followed by possible [*+?]
438 * Note that the branching code sequences used for ? and the general cases
439 * of * and + are somewhat optimized: they use the same NOTHING node as
440 * both the endmarker for their branch list and the body of the last branch.
441 * It might seem that this node could be dispensed with entirely, but the
442 * endmarker role is not redundant.
444 static int regpiece(regex_t *preg, int *flagp)
446 int ret;
447 char op;
448 int next;
449 int flags;
450 int min;
451 int max;
453 ret = regatom(preg, &flags);
454 if (ret == 0)
455 return 0;
457 op = *preg->regparse;
458 if (!ISMULT(op)) {
459 *flagp = flags;
460 return(ret);
463 if (!(flags&HASWIDTH) && op != '?') {
464 preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
465 return 0;
468 /* Handle braces (counted repetition) by expansion */
469 if (op == '{') {
470 char *end;
472 min = strtoul(preg->regparse + 1, &end, 10);
473 if (end == preg->regparse + 1) {
474 preg->err = REG_ERR_BAD_COUNT;
475 return 0;
477 if (*end == '}') {
478 max = min;
480 else if (*end == '\0') {
481 preg->err = REG_ERR_UNMATCHED_BRACES;
482 return 0;
484 else {
485 preg->regparse = end;
486 max = strtoul(preg->regparse + 1, &end, 10);
487 if (*end != '}') {
488 preg->err = REG_ERR_UNMATCHED_BRACES;
489 return 0;
492 if (end == preg->regparse + 1) {
493 max = MAX_REP_COUNT;
495 else if (max < min || max >= 100) {
496 preg->err = REG_ERR_BAD_COUNT;
497 return 0;
499 if (min >= 100) {
500 preg->err = REG_ERR_BAD_COUNT;
501 return 0;
504 preg->regparse = strchr(preg->regparse, '}');
506 else {
507 min = (op == '+');
508 max = (op == '?' ? 1 : MAX_REP_COUNT);
511 if (preg->regparse[1] == '?') {
512 preg->regparse++;
513 next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
515 else {
516 next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
518 preg->program[ret + 2] = max;
519 preg->program[ret + 3] = min;
520 preg->program[ret + 4] = 0;
522 *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
524 if (!(flags & SIMPLE)) {
525 int back = regnode(preg, BACK);
526 regtail(preg, back, ret);
527 regtail(preg, next, back);
530 preg->regparse++;
531 if (ISMULT(*preg->regparse)) {
532 preg->err = REG_ERR_NESTED_COUNT;
533 return 0;
536 return ret;
540 * Add all characters in the inclusive range between lower and upper.
542 * Handles a swapped range (upper < lower).
544 static void reg_addrange(regex_t *preg, int lower, int upper)
546 if (lower > upper) {
547 reg_addrange(preg, upper, lower);
549 /* Add a range as length, start */
550 regc(preg, upper - lower + 1);
551 regc(preg, lower);
555 * Add a null-terminated literal string as a set of ranges.
557 static void reg_addrange_str(regex_t *preg, const char *str)
559 while (*str) {
560 reg_addrange(preg, *str, *str);
561 str++;
566 * Extracts the next unicode char from utf8.
568 * If 'upper' is set, converts the char to uppercase.
570 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
572 int l = utf8_tounicode(s, uc);
573 if (upper) {
574 *uc = utf8_upper(*uc);
576 return l;
580 * Converts a hex digit to decimal.
582 * Returns -1 for an invalid hex digit.
584 static int hexdigitval(int c)
586 if (c >= '0' && c <= '9')
587 return c - '0';
588 if (c >= 'a' && c <= 'f')
589 return c - 'a' + 10;
590 if (c >= 'A' && c <= 'F')
591 return c - 'A' + 10;
592 return -1;
596 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
598 * Returns the number of hex digits parsed.
599 * If there are no hex digits, returns 0 and stores nothing.
601 static int parse_hex(const char *s, int n, int *uc)
603 int val = 0;
604 int k;
606 for (k = 0; k < n; k++) {
607 int c = hexdigitval(*s++);
608 if (c == -1) {
609 break;
611 val = (val << 4) | c;
613 if (k) {
614 *uc = val;
616 return k;
620 * Call for chars after a backlash to decode the escape sequence.
622 * Stores the result in *ch.
624 * Returns the number of bytes consumed.
626 static int reg_decode_escape(const char *s, int *ch)
628 int n;
629 const char *s0 = s;
631 *ch = *s++;
633 switch (*ch) {
634 case 'b': *ch = '\b'; break;
635 case 'e': *ch = 27; break;
636 case 'f': *ch = '\f'; break;
637 case 'n': *ch = '\n'; break;
638 case 'r': *ch = '\r'; break;
639 case 't': *ch = '\t'; break;
640 case 'v': *ch = '\v'; break;
641 case 'u':
642 if (*s == '{') {
643 /* Expect \u{NNNN} */
644 n = parse_hex(s + 1, 6, ch);
645 if (n > 0 && s[n + 1] == '}' && *ch >= 0 && *ch <= 0x1fffff) {
646 s += n + 2;
648 else {
649 /* Invalid, so just treat as an escaped 'u' */
650 *ch = 'u';
653 else if ((n = parse_hex(s, 4, ch)) > 0) {
654 s += n;
656 break;
657 case 'U':
658 if ((n = parse_hex(s, 8, ch)) > 0) {
659 s += n;
661 break;
662 case 'x':
663 if ((n = parse_hex(s, 2, ch)) > 0) {
664 s += n;
666 break;
667 case '\0':
668 s--;
669 *ch = '\\';
670 break;
672 return s - s0;
676 - regatom - the lowest level
678 * Optimization: gobbles an entire sequence of ordinary characters so that
679 * it can turn them into a single node, which is smaller to store and
680 * faster to run. Backslashed characters are exceptions, each becoming a
681 * separate node; the code is simpler that way and it's not worth fixing.
683 static int regatom(regex_t *preg, int *flagp)
685 int ret;
686 int flags;
687 int nocase = (preg->cflags & REG_ICASE);
689 int ch;
690 int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
692 *flagp = WORST; /* Tentatively. */
694 preg->regparse += n;
695 switch (ch) {
696 /* FIXME: these chars only have meaning at beg/end of pat? */
697 case '^':
698 ret = regnode(preg, BOL);
699 break;
700 case '$':
701 ret = regnode(preg, EOL);
702 break;
703 case '.':
704 ret = regnode(preg, ANY);
705 *flagp |= HASWIDTH|SIMPLE;
706 break;
707 case '[': {
708 const char *pattern = preg->regparse;
710 if (*pattern == '^') { /* Complement of range. */
711 ret = regnode(preg, ANYBUT);
712 pattern++;
713 } else
714 ret = regnode(preg, ANYOF);
716 /* Special case. If the first char is ']' or '-', it is part of the set */
717 if (*pattern == ']' || *pattern == '-') {
718 reg_addrange(preg, *pattern, *pattern);
719 pattern++;
722 while (*pattern && *pattern != ']') {
723 /* Is this a range? a-z */
724 int start;
725 int end;
727 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
728 if (start == '\\') {
729 pattern += reg_decode_escape(pattern, &start);
730 if (start == 0) {
731 preg->err = REG_ERR_NULL_CHAR;
732 return 0;
735 if (pattern[0] == '-' && pattern[1] && pattern[1] != ']') {
736 /* skip '-' */
737 pattern += utf8_tounicode(pattern, &end);
738 pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
739 if (end == '\\') {
740 pattern += reg_decode_escape(pattern, &end);
741 if (end == 0) {
742 preg->err = REG_ERR_NULL_CHAR;
743 return 0;
747 reg_addrange(preg, start, end);
748 continue;
750 if (start == '[' && pattern[0] == ':') {
751 static const char *character_class[] = {
752 ":alpha:", ":alnum:", ":space:", ":blank:", ":upper:", ":lower:",
753 ":digit:", ":xdigit:", ":cntrl:", ":graph:", ":print:", ":punct:",
755 enum {
756 CC_ALPHA, CC_ALNUM, CC_SPACE, CC_BLANK, CC_UPPER, CC_LOWER,
757 CC_DIGIT, CC_XDIGIT, CC_CNTRL, CC_GRAPH, CC_PRINT, CC_PUNCT,
758 CC_NUM
760 int i;
762 for (i = 0; i < CC_NUM; i++) {
763 n = strlen(character_class[i]);
764 if (strncmp(pattern, character_class[i], n) == 0) {
765 /* Found a character class */
766 pattern += n + 1;
767 break;
770 if (i != CC_NUM) {
771 switch (i) {
772 case CC_ALNUM:
773 reg_addrange(preg, '0', '9');
774 /* Fall through */
775 case CC_ALPHA:
776 if ((preg->cflags & REG_ICASE) == 0) {
777 reg_addrange(preg, 'a', 'z');
779 reg_addrange(preg, 'A', 'Z');
780 break;
781 case CC_SPACE:
782 reg_addrange_str(preg, " \t\r\n\f\v");
783 break;
784 case CC_BLANK:
785 reg_addrange_str(preg, " \t");
786 break;
787 case CC_UPPER:
788 reg_addrange(preg, 'A', 'Z');
789 break;
790 case CC_LOWER:
791 reg_addrange(preg, 'a', 'z');
792 break;
793 case CC_XDIGIT:
794 reg_addrange(preg, 'a', 'f');
795 reg_addrange(preg, 'A', 'F');
796 /* Fall through */
797 case CC_DIGIT:
798 reg_addrange(preg, '0', '9');
799 break;
800 case CC_CNTRL:
801 reg_addrange(preg, 0, 31);
802 reg_addrange(preg, 127, 127);
803 break;
804 case CC_PRINT:
805 reg_addrange(preg, ' ', '~');
806 break;
807 case CC_GRAPH:
808 reg_addrange(preg, '!', '~');
809 break;
810 case CC_PUNCT:
811 reg_addrange(preg, '!', '/');
812 reg_addrange(preg, ':', '@');
813 reg_addrange(preg, '[', '`');
814 reg_addrange(preg, '{', '~');
815 break;
817 continue;
820 /* Not a range, so just add the char */
821 reg_addrange(preg, start, start);
823 regc(preg, '\0');
825 if (*pattern) {
826 pattern++;
828 preg->regparse = pattern;
830 *flagp |= HASWIDTH|SIMPLE;
832 break;
833 case '(':
834 ret = reg(preg, 1, &flags);
835 if (ret == 0)
836 return 0;
837 *flagp |= flags&(HASWIDTH|SPSTART);
838 break;
839 case '\0':
840 case '|':
841 case ')':
842 preg->err = REG_ERR_INTERNAL;
843 return 0; /* Supposed to be caught earlier. */
844 case '?':
845 case '+':
846 case '*':
847 case '{':
848 preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
849 return 0;
850 case '\\':
851 ch = *preg->regparse++;
852 switch (ch) {
853 case '\0':
854 preg->err = REG_ERR_TRAILING_BACKSLASH;
855 return 0;
856 case 'A':
857 ret = regnode(preg, BOLX);
858 break;
859 case 'Z':
860 ret = regnode(preg, EOLX);
861 break;
862 case '<':
863 case 'm':
864 ret = regnode(preg, WORDA);
865 break;
866 case '>':
867 case 'M':
868 ret = regnode(preg, WORDZ);
869 break;
870 case 'd':
871 case 'D':
872 ret = regnode(preg, ch == 'd' ? ANYOF : ANYBUT);
873 reg_addrange(preg, '0', '9');
874 regc(preg, '\0');
875 *flagp |= HASWIDTH|SIMPLE;
876 break;
877 case 'w':
878 case 'W':
879 ret = regnode(preg, ch == 'w' ? ANYOF : ANYBUT);
880 if ((preg->cflags & REG_ICASE) == 0) {
881 reg_addrange(preg, 'a', 'z');
883 reg_addrange(preg, 'A', 'Z');
884 reg_addrange(preg, '0', '9');
885 reg_addrange(preg, '_', '_');
886 regc(preg, '\0');
887 *flagp |= HASWIDTH|SIMPLE;
888 break;
889 case 's':
890 case 'S':
891 ret = regnode(preg, ch == 's' ? ANYOF : ANYBUT);
892 reg_addrange_str(preg," \t\r\n\f\v");
893 regc(preg, '\0');
894 *flagp |= HASWIDTH|SIMPLE;
895 break;
896 /* FIXME: Someday handle \1, \2, ... */
897 default:
898 /* Handle general quoted chars in exact-match routine */
899 /* Back up to include the backslash */
900 preg->regparse--;
901 goto de_fault;
903 break;
904 de_fault:
905 default: {
907 * Encode a string of characters to be matched exactly.
909 int added = 0;
911 /* Back up to pick up the first char of interest */
912 preg->regparse -= n;
914 ret = regnode(preg, EXACTLY);
916 /* Note that a META operator such as ? or * consumes the
917 * preceding char.
918 * Thus we must be careful to look ahead by 2 and add the
919 * last char as it's own EXACTLY if necessary
922 /* Until end of string or a META char is reached */
923 while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
924 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
925 if (ch == '\\' && preg->regparse[n]) {
926 /* Non-trailing backslash.
927 * Is this a special escape, or a regular escape?
929 if (strchr("<>mMwWdDsSAZ", preg->regparse[n])) {
930 /* A special escape. All done with EXACTLY */
931 break;
933 /* Decode it. Note that we add the length for the escape
934 * sequence to the length for the backlash so we can skip
935 * the entire sequence, or not as required.
937 n += reg_decode_escape(preg->regparse + n, &ch);
938 if (ch == 0) {
939 preg->err = REG_ERR_NULL_CHAR;
940 return 0;
944 /* Now we have one char 'ch' of length 'n'.
945 * Check to see if the following char is a MULT
948 if (ISMULT(preg->regparse[n])) {
949 /* Yes. But do we already have some EXACTLY chars? */
950 if (added) {
951 /* Yes, so return what we have and pick up the current char next time around */
952 break;
954 /* No, so add this single char and finish */
955 regc(preg, ch);
956 added++;
957 preg->regparse += n;
958 break;
961 /* No, so just add this char normally */
962 regc(preg, ch);
963 added++;
964 preg->regparse += n;
966 regc(preg, '\0');
968 *flagp |= HASWIDTH;
969 if (added == 1)
970 *flagp |= SIMPLE;
971 break;
973 break;
976 return(ret);
979 static void reg_grow(regex_t *preg, int n)
981 if (preg->p + n >= preg->proglen) {
982 preg->proglen = (preg->p + n) * 2;
983 preg->program = realloc(preg->program, preg->proglen * sizeof(int));
988 - regnode - emit a node
990 /* Location. */
991 static int regnode(regex_t *preg, int op)
993 reg_grow(preg, 2);
995 /* The OP followed by a next pointer */
996 preg->program[preg->p++] = op;
997 preg->program[preg->p++] = 0;
999 /* Return the start of the node */
1000 return preg->p - 2;
1004 - regc - emit (if appropriate) a byte of code
1006 static void regc(regex_t *preg, int b )
1008 reg_grow(preg, 1);
1009 preg->program[preg->p++] = b;
1013 - reginsert - insert an operator in front of already-emitted operand
1015 * Means relocating the operand.
1016 * Returns the new location of the original operand.
1018 static int reginsert(regex_t *preg, int op, int size, int opnd )
1020 reg_grow(preg, size);
1022 /* Move everything from opnd up */
1023 memmove(preg->program + opnd + size, preg->program + opnd, sizeof(int) * (preg->p - opnd));
1024 /* Zero out the new space */
1025 memset(preg->program + opnd, 0, sizeof(int) * size);
1027 preg->program[opnd] = op;
1029 preg->p += size;
1031 return opnd + size;
1035 - regtail - set the next-pointer at the end of a node chain
1037 static void regtail(regex_t *preg, int p, int val)
1039 int scan;
1040 int temp;
1041 int offset;
1043 /* Find last node. */
1044 scan = p;
1045 for (;;) {
1046 temp = regnext(preg, scan);
1047 if (temp == 0)
1048 break;
1049 scan = temp;
1052 if (OP(preg, scan) == BACK)
1053 offset = scan - val;
1054 else
1055 offset = val - scan;
1057 preg->program[scan + 1] = offset;
1061 - regoptail - regtail on operand of first argument; nop if operandless
1064 static void regoptail(regex_t *preg, int p, int val )
1066 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1067 if (p != 0 && OP(preg, p) == BRANCH) {
1068 regtail(preg, OPERAND(p), val);
1073 * regexec and friends
1077 * Forwards.
1079 static int regtry(regex_t *preg, const char *string );
1080 static int regmatch(regex_t *preg, int prog);
1081 static int regrepeat(regex_t *preg, int p, int max);
1084 - regexec - match a regexp against a string
1086 int regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1088 const char *s;
1089 int scan;
1091 /* Be paranoid... */
1092 if (preg == NULL || preg->program == NULL || string == NULL) {
1093 return REG_ERR_NULL_ARGUMENT;
1096 /* Check validity of program. */
1097 if (*preg->program != REG_MAGIC) {
1098 return REG_ERR_CORRUPTED;
1101 #ifdef DEBUG
1102 fprintf(stderr, "regexec: %s\n", string);
1103 regdump(preg);
1104 #endif
1106 preg->eflags = eflags;
1107 preg->pmatch = pmatch;
1108 preg->nmatch = nmatch;
1109 preg->start = string; /* All offsets are computed from here */
1111 /* Must clear out the embedded repeat counts of REPX and REPXMIN opcodes */
1112 for (scan = OPERAND(1); scan != 0; scan += regopsize(preg, scan)) {
1113 int op = OP(preg, scan);
1114 if (op == END)
1115 break;
1116 if (op == REPX || op == REPXMIN)
1117 preg->program[scan + 4] = 0;
1120 /* If there is a "must appear" string, look for it. */
1121 if (preg->regmust != 0) {
1122 s = string;
1123 while ((s = str_find(s, preg->program[preg->regmust], preg->cflags & REG_ICASE)) != NULL) {
1124 if (prefix_cmp(preg->program + preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1125 break;
1127 s++;
1129 if (s == NULL) /* Not present. */
1130 return REG_NOMATCH;
1133 /* Mark beginning of line for ^ . */
1134 preg->regbol = string;
1136 /* Simplest case: anchored match need be tried only once (maybe per line). */
1137 if (preg->reganch) {
1138 if (eflags & REG_NOTBOL) {
1139 /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1140 goto nextline;
1142 while (1) {
1143 if (regtry(preg, string)) {
1144 return REG_NOERROR;
1146 if (*string) {
1147 nextline:
1148 if (preg->cflags & REG_NEWLINE) {
1149 /* Try the next anchor? */
1150 string = strchr(string, '\n');
1151 if (string) {
1152 preg->regbol = ++string;
1153 continue;
1157 return REG_NOMATCH;
1161 /* Messy cases: unanchored match. */
1162 s = string;
1163 if (preg->regstart != '\0') {
1164 /* We know what char it must start with. */
1165 while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1166 if (regtry(preg, s))
1167 return REG_NOERROR;
1168 s++;
1171 else
1172 /* We don't -- general case. */
1173 while (1) {
1174 if (regtry(preg, s))
1175 return REG_NOERROR;
1176 if (*s == '\0') {
1177 break;
1179 else {
1180 int c;
1181 s += utf8_tounicode(s, &c);
1185 /* Failure. */
1186 return REG_NOMATCH;
1190 - regtry - try match at specific point
1192 /* 0 failure, 1 success */
1193 static int regtry( regex_t *preg, const char *string )
1195 int i;
1197 preg->reginput = string;
1199 for (i = 0; i < preg->nmatch; i++) {
1200 preg->pmatch[i].rm_so = -1;
1201 preg->pmatch[i].rm_eo = -1;
1203 if (regmatch(preg, 1)) {
1204 preg->pmatch[0].rm_so = string - preg->start;
1205 preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1206 return(1);
1207 } else
1208 return(0);
1212 * Returns bytes matched if 'pattern' is a prefix of 'string'.
1214 * If 'nocase' is non-zero, does a case-insensitive match.
1216 * Returns -1 on not found.
1218 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1220 const char *s = string;
1221 while (proglen && *s) {
1222 int ch;
1223 int n = reg_utf8_tounicode_case(s, &ch, nocase);
1224 if (ch != *prog) {
1225 return -1;
1227 prog++;
1228 s += n;
1229 proglen--;
1231 if (proglen == 0) {
1232 return s - string;
1234 return -1;
1238 * Searchs for 'c' in the range 'range'.
1240 * Returns 1 if found, or 0 if not.
1242 static int reg_range_find(const int *range, int c)
1244 while (*range) {
1245 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1246 if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1247 return 1;
1249 range += 2;
1251 return 0;
1255 * Search for the character 'c' in the utf-8 string 'string'.
1257 * If 'nocase' is set, the 'string' is assumed to be uppercase
1258 * and 'c' is converted to uppercase before matching.
1260 * Returns the byte position in the string where the 'c' was found, or
1261 * NULL if not found.
1263 static const char *str_find(const char *string, int c, int nocase)
1265 if (nocase) {
1266 /* The "string" should already be converted to uppercase */
1267 c = utf8_upper(c);
1269 while (*string) {
1270 int ch;
1271 int n = reg_utf8_tounicode_case(string, &ch, nocase);
1272 if (c == ch) {
1273 return string;
1275 string += n;
1277 return NULL;
1281 * Returns true if 'ch' is an end-of-line char.
1283 * In REG_NEWLINE mode, \n is considered EOL in
1284 * addition to \0
1286 static int reg_iseol(regex_t *preg, int ch)
1288 if (preg->cflags & REG_NEWLINE) {
1289 return ch == '\0' || ch == '\n';
1291 else {
1292 return ch == '\0';
1296 static int regmatchsimplerepeat(regex_t *preg, int scan, int matchmin)
1298 int nextch = '\0';
1299 const char *save;
1300 int no;
1301 int c;
1303 int max = preg->program[scan + 2];
1304 int min = preg->program[scan + 3];
1305 int next = regnext(preg, scan);
1308 * Lookahead to avoid useless match attempts
1309 * when we know what character comes next.
1311 if (OP(preg, next) == EXACTLY) {
1312 nextch = preg->program[OPERAND(next)];
1314 save = preg->reginput;
1315 no = regrepeat(preg, scan + 5, max);
1316 if (no < min) {
1317 return 0;
1319 if (matchmin) {
1320 /* from min up to no */
1321 max = no;
1322 no = min;
1324 /* else from no down to min */
1325 while (1) {
1326 if (matchmin) {
1327 if (no > max) {
1328 break;
1331 else {
1332 if (no < min) {
1333 break;
1336 preg->reginput = save + utf8_index(save, no);
1337 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1338 /* If it could work, try it. */
1339 if (reg_iseol(preg, nextch) || c == nextch) {
1340 if (regmatch(preg, next)) {
1341 return(1);
1344 if (matchmin) {
1345 /* Couldn't or didn't, add one more */
1346 no++;
1348 else {
1349 /* Couldn't or didn't -- back up. */
1350 no--;
1353 return(0);
1356 static int regmatchrepeat(regex_t *preg, int scan, int matchmin)
1358 int *scanpt = preg->program + scan;
1360 int max = scanpt[2];
1361 int min = scanpt[3];
1363 /* Have we reached min? */
1364 if (scanpt[4] < min) {
1365 /* No, so get another one */
1366 scanpt[4]++;
1367 if (regmatch(preg, scan + 5)) {
1368 return 1;
1370 scanpt[4]--;
1371 return 0;
1373 if (scanpt[4] > max) {
1374 return 0;
1377 if (matchmin) {
1378 /* minimal, so try other branch first */
1379 if (regmatch(preg, regnext(preg, scan))) {
1380 return 1;
1382 /* No, so try one more */
1383 scanpt[4]++;
1384 if (regmatch(preg, scan + 5)) {
1385 return 1;
1387 scanpt[4]--;
1388 return 0;
1390 /* maximal, so try this branch again */
1391 if (scanpt[4] < max) {
1392 scanpt[4]++;
1393 if (regmatch(preg, scan + 5)) {
1394 return 1;
1396 scanpt[4]--;
1398 /* At this point we are at max with no match. Try the other branch */
1399 return regmatch(preg, regnext(preg, scan));
1403 - regmatch - main matching routine
1405 * Conceptually the strategy is simple: check to see whether the current
1406 * node matches, call self recursively to see whether the rest matches,
1407 * and then act accordingly. In practice we make some effort to avoid
1408 * recursion, in particular by going through "ordinary" nodes (that don't
1409 * need to know whether the rest of the match failed) by a loop instead of
1410 * by recursion.
1412 /* 0 failure, 1 success */
1413 static int regmatch(regex_t *preg, int prog)
1415 int scan; /* Current node. */
1416 int next; /* Next node. */
1417 const char *save;
1419 scan = prog;
1421 #ifdef DEBUG
1422 if (scan != 0 && regnarrate)
1423 fprintf(stderr, "%s(\n", regprop(scan));
1424 #endif
1425 while (scan != 0) {
1426 int n;
1427 int c;
1428 #ifdef DEBUG
1429 if (regnarrate) {
1430 fprintf(stderr, "%3d: %s...\n", scan, regprop(OP(preg, scan))); /* Where, what. */
1432 #endif
1433 next = regnext(preg, scan);
1434 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1436 switch (OP(preg, scan)) {
1437 case BOLX:
1438 if ((preg->eflags & REG_NOTBOL)) {
1439 return(0);
1441 /* Fall through */
1442 case BOL:
1443 if (preg->reginput != preg->regbol) {
1444 return(0);
1446 break;
1447 case EOLX:
1448 if (c != 0) {
1449 /* For EOLX, only match real end of line, not newline */
1450 return 0;
1452 break;
1453 case EOL:
1454 if (!reg_iseol(preg, c)) {
1455 return(0);
1457 break;
1458 case WORDA:
1459 /* Must be looking at a letter, digit, or _ */
1460 if ((!isalnum(UCHAR(c))) && c != '_')
1461 return(0);
1462 /* Prev must be BOL or nonword */
1463 if (preg->reginput > preg->regbol &&
1464 (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1465 return(0);
1466 break;
1467 case WORDZ:
1468 /* Can't match at BOL */
1469 if (preg->reginput > preg->regbol) {
1470 /* Current must be EOL or nonword */
1471 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1472 c = preg->reginput[-1];
1473 /* Previous must be word */
1474 if (isalnum(UCHAR(c)) || c == '_') {
1475 break;
1479 /* No */
1480 return(0);
1482 case ANY:
1483 if (reg_iseol(preg, c))
1484 return 0;
1485 preg->reginput += n;
1486 break;
1487 case EXACTLY: {
1488 int opnd;
1489 int len;
1490 int slen;
1492 opnd = OPERAND(scan);
1493 len = str_int_len(preg->program + opnd);
1495 slen = prefix_cmp(preg->program + opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1496 if (slen < 0) {
1497 return(0);
1499 preg->reginput += slen;
1501 break;
1502 case ANYOF:
1503 if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) == 0) {
1504 return(0);
1506 preg->reginput += n;
1507 break;
1508 case ANYBUT:
1509 if (reg_iseol(preg, c) || reg_range_find(preg->program + OPERAND(scan), c) != 0) {
1510 return(0);
1512 preg->reginput += n;
1513 break;
1514 case NOTHING:
1515 break;
1516 case BACK:
1517 break;
1518 case BRANCH:
1519 if (OP(preg, next) != BRANCH) /* No choice. */
1520 next = OPERAND(scan); /* Avoid recursion. */
1521 else {
1522 do {
1523 save = preg->reginput;
1524 if (regmatch(preg, OPERAND(scan))) {
1525 return(1);
1527 preg->reginput = save;
1528 scan = regnext(preg, scan);
1529 } while (scan != 0 && OP(preg, scan) == BRANCH);
1530 return(0);
1531 /* NOTREACHED */
1533 break;
1534 case REP:
1535 case REPMIN:
1536 return regmatchsimplerepeat(preg, scan, OP(preg, scan) == REPMIN);
1538 case REPX:
1539 case REPXMIN:
1540 return regmatchrepeat(preg, scan, OP(preg, scan) == REPXMIN);
1542 case END:
1543 return 1; /* Success! */
1545 case OPENNC:
1546 case CLOSENC:
1547 return regmatch(preg, next);
1549 default:
1550 if (OP(preg, scan) >= OPEN+1 && OP(preg, scan) < CLOSE_END) {
1551 save = preg->reginput;
1552 if (regmatch(preg, next)) {
1553 if (OP(preg, scan) < CLOSE) {
1554 int no = OP(preg, scan) - OPEN;
1555 if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
1556 preg->pmatch[no].rm_so = save - preg->start;
1559 else {
1560 int no = OP(preg, scan) - CLOSE;
1561 if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
1562 preg->pmatch[no].rm_eo = save - preg->start;
1565 return(1);
1567 return(0);
1569 return REG_ERR_INTERNAL;
1572 scan = next;
1576 * We get here only if there's trouble -- normally "case END" is
1577 * the terminating point.
1579 return REG_ERR_INTERNAL;
1583 - regrepeat - repeatedly match something simple, report how many
1585 static int regrepeat(regex_t *preg, int p, int max)
1587 int count = 0;
1588 const char *scan;
1589 int opnd;
1590 int ch;
1591 int n;
1593 scan = preg->reginput;
1594 opnd = OPERAND(p);
1595 switch (OP(preg, p)) {
1596 case ANY:
1597 /* No need to handle utf8 specially here */
1598 while (!reg_iseol(preg, *scan) && count < max) {
1599 count++;
1600 scan++;
1602 break;
1603 case EXACTLY:
1604 while (count < max) {
1605 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1606 if (preg->program[opnd] != ch) {
1607 break;
1609 count++;
1610 scan += n;
1612 break;
1613 case ANYOF:
1614 while (count < max) {
1615 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1616 if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) == 0) {
1617 break;
1619 count++;
1620 scan += n;
1622 break;
1623 case ANYBUT:
1624 while (count < max) {
1625 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1626 if (reg_iseol(preg, ch) || reg_range_find(preg->program + opnd, ch) != 0) {
1627 break;
1629 count++;
1630 scan += n;
1632 break;
1633 default: /* Oh dear. Called inappropriately. */
1634 preg->err = REG_ERR_INTERNAL;
1635 count = 0; /* Best compromise. */
1636 break;
1638 preg->reginput = scan;
1640 return(count);
1644 - regnext - dig the "next" pointer out of a node
1646 static int regnext(regex_t *preg, int p )
1648 int offset;
1650 offset = NEXT(preg, p);
1652 if (offset == 0)
1653 return 0;
1655 if (OP(preg, p) == BACK)
1656 return(p-offset);
1657 else
1658 return(p+offset);
1662 - regopsize - returns the size of opcode + operands at 'p' in words
1664 static int regopsize(regex_t *preg, int p )
1666 /* Almost all opcodes are 2 words, but some are more */
1667 switch (OP(preg, p)) {
1668 case REP:
1669 case REPMIN:
1670 case REPX:
1671 case REPXMIN:
1672 return 5;
1674 case ANYOF:
1675 case ANYBUT:
1676 case EXACTLY: {
1677 int s = p + 2;
1678 while (preg->program[s++]) {
1680 return s - p;
1683 return 2;
1686 #if defined(DEBUG) && !defined(JIM_BOOTSTRAP)
1689 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1691 static void regdump(regex_t *preg)
1693 int s;
1694 int op = EXACTLY; /* Arbitrary non-END op. */
1695 int next;
1696 char buf[MAX_UTF8_LEN + 1];
1698 int i;
1699 for (i = 1; i < preg->p; i++) {
1700 printf("%02x ", (unsigned char)preg->program[i]);
1701 if (i % 16 == 0) {
1702 printf("\n");
1705 printf("\n");
1707 s = 1;
1708 while (op != END && s < preg->p) { /* While that wasn't END last time... */
1709 op = OP(preg, s);
1710 printf("%3d: %s", s, regprop(op)); /* Where, what. */
1711 next = regnext(preg, s);
1712 if (next == 0) /* Next ptr. */
1713 printf("(0)");
1714 else
1715 printf("(%d)", next);
1716 s += 2;
1717 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1718 int max = preg->program[s];
1719 int min = preg->program[s + 1];
1720 if (max == 65535) {
1721 printf("{%d,*}", min);
1723 else {
1724 printf("{%d,%d}", min, max);
1726 printf(" %d", preg->program[s + 2]);
1727 s += 3;
1729 else if (op == ANYOF || op == ANYBUT) {
1730 /* set of ranges */
1732 while (preg->program[s]) {
1733 int len = preg->program[s++];
1734 int first = preg->program[s++];
1735 buf[utf8_getchars(buf, first)] = 0;
1736 printf("%s", buf);
1737 if (len > 1) {
1738 buf[utf8_getchars(buf, first + len - 1)] = 0;
1739 printf("-%s", buf);
1742 s++;
1744 else if (op == EXACTLY) {
1745 /* Literal string, where present. */
1747 while (preg->program[s]) {
1748 buf[utf8_getchars(buf, preg->program[s])] = 0;
1749 printf("%s", buf);
1750 s++;
1752 s++;
1754 putchar('\n');
1757 if (op == END) {
1758 /* Header fields of interest. */
1759 if (preg->regstart) {
1760 buf[utf8_getchars(buf, preg->regstart)] = 0;
1761 printf("start '%s' ", buf);
1763 if (preg->reganch)
1764 printf("anchored ");
1765 if (preg->regmust != 0) {
1766 int i;
1767 printf("must have:");
1768 for (i = 0; i < preg->regmlen; i++) {
1769 putchar(preg->program[preg->regmust + i]);
1771 putchar('\n');
1774 printf("\n");
1778 - regprop - printable representation of opcode
1780 static const char *regprop( int op )
1782 static char buf[50];
1784 switch (op) {
1785 case BOL:
1786 return "BOL";
1787 case EOL:
1788 return "EOL";
1789 case BOLX:
1790 return "BOLX";
1791 case EOLX:
1792 return "EOLX";
1793 case ANY:
1794 return "ANY";
1795 case ANYOF:
1796 return "ANYOF";
1797 case ANYBUT:
1798 return "ANYBUT";
1799 case BRANCH:
1800 return "BRANCH";
1801 case EXACTLY:
1802 return "EXACTLY";
1803 case NOTHING:
1804 return "NOTHING";
1805 case BACK:
1806 return "BACK";
1807 case END:
1808 return "END";
1809 case REP:
1810 return "REP";
1811 case REPMIN:
1812 return "REPMIN";
1813 case REPX:
1814 return "REPX";
1815 case REPXMIN:
1816 return "REPXMIN";
1817 case WORDA:
1818 return "WORDA";
1819 case WORDZ:
1820 return "WORDZ";
1821 case OPENNC:
1822 return "OPEN";
1823 case CLOSENC:
1824 return "CLOSE";
1825 default:
1826 if (op >= OPEN && op < CLOSE) {
1827 snprintf(buf, sizeof(buf), "OPEN%d", op-OPEN);
1829 else if (op >= CLOSE && op < CLOSE_END) {
1830 snprintf(buf, sizeof(buf), "CLOSE%d", op-CLOSE);
1832 else {
1833 snprintf(buf, sizeof(buf), "?%d?\n", op);
1835 return(buf);
1838 #endif /* JIM_BOOTSTRAP */
1840 size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
1842 static const char *error_strings[] = {
1843 "success",
1844 "no match",
1845 "bad pattern",
1846 "null argument",
1847 "unknown error",
1848 "too big",
1849 "out of memory",
1850 "too many ()",
1851 "parentheses () not balanced",
1852 "braces {} not balanced",
1853 "invalid repetition count(s)",
1854 "extra characters",
1855 "*+ of empty atom",
1856 "nested count",
1857 "internal error",
1858 "count follows nothing",
1859 "trailing backslash",
1860 "corrupted program",
1861 "contains null char",
1863 const char *err;
1865 if (errcode < 0 || errcode >= REG_ERR_NUM) {
1866 err = "Bad error code";
1868 else {
1869 err = error_strings[errcode];
1872 return snprintf(errbuf, errbuf_size, "%s", err);
1875 void regfree(regex_t *preg)
1877 free(preg->program);
1880 #endif