Add an uninstall target
[jimtcl.git] / jimregexp.c
blob8da93476fd854d7d2a82badc013ff452c92d8d1d
1 /*
2 * regcomp and regexec -- regsub and regerror are elsewhere
4 * Copyright (c) 1986 by University of Toronto.
5 * Written by Henry Spencer. Not derived from licensed software.
7 * Permission is granted to anyone to use this software for any
8 * purpose on any computer system, and to redistribute it freely,
9 * subject to the following restrictions:
11 * 1. The author is not responsible for the consequences of use of
12 * this software, no matter how awful, even if they arise
13 * from defects in it.
15 * 2. The origin of this software must not be misrepresented, either
16 * by explicit claim or by omission.
18 * 3. Altered versions must be plainly marked as such, and must not
19 * be misrepresented as being the original software.
20 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
21 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22 *** to assist in implementing egrep.
23 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
24 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25 *** as in BSD grep and ex.
26 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
27 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
29 *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
30 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
31 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
32 *** was moved into regexp.h, and the include of regexp.h now uses "'s
33 *** to avoid conflicting with the system regexp.h. Const, bless its
34 *** soul, was removed so it can compile everywhere. The declaration
35 *** of strchr() was in conflict on AIX, so it was removed (as it is
36 *** happily defined in string.h).
37 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
38 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
42 * THIS IS AN ALTERED VERSION. It was altered by Steve Bennett <steveb@workware.net.au>
43 * on 16 October 2010, to remove static state and add better Tcl ARE compatibility.
44 * This includes counted repetitions, UTF-8 support, character classes,
45 * shorthand character classes, increased number of parentheses to 100,
46 * backslash escape sequences. It also removes \n as an alternative to |.
48 * Beware that some of this code is subtly aware of the way operator
49 * precedence is structured in regular expressions. Serious changes in
50 * regular-expression syntax might require a total rethink.
52 #include <stdio.h>
53 #include <ctype.h>
54 #include <stdlib.h>
55 #include <string.h>
57 #include "jim.h"
58 #include "jimautoconf.h"
59 #include "jimregexp.h"
60 #include "utf8.h"
62 #if !defined(HAVE_REGCOMP) || defined(JIM_REGEXP)
65 * Structure for regexp "program". This is essentially a linear encoding
66 * of a nondeterministic finite-state machine (aka syntax charts or
67 * "railroad normal form" in parsing technology). Each node is an opcode
68 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
69 * all nodes except BRANCH implement concatenation; a "next" pointer with
70 * a BRANCH on both ends of it is connecting two alternatives. (Here we
71 * have one of the subtle syntax dependencies: an individual BRANCH (as
72 * opposed to a collection of them) is never concatenated with anything
73 * because of operator precedence.) The operand of some types of node is
74 * a literal string; for others, it is a node leading into a sub-FSM. In
75 * particular, the operand of a BRANCH node is the first node of the branch.
76 * (NB this is *not* a tree structure: the tail of the branch connects
77 * to the thing following the set of BRANCHes.) The opcodes are:
80 /* This *MUST* be less than (255-20)/2=117 */
81 #define REG_MAX_PAREN 100
83 /* definition number opnd? meaning */
84 #define END 0 /* no End of program. */
85 #define BOL 1 /* no Match "" at beginning of line. */
86 #define EOL 2 /* no Match "" at end of line. */
87 #define ANY 3 /* no Match any one character. */
88 #define ANYOF 4 /* str Match any character in this string. */
89 #define ANYBUT 5 /* str Match any character not in this string. */
90 #define BRANCH 6 /* node Match this alternative, or the next... */
91 #define BACK 7 /* no Match "", "next" ptr points backward. */
92 #define EXACTLY 8 /* str Match this string. */
93 #define NOTHING 9 /* no Match empty string. */
94 #define REP 10 /* max,min Match this (simple) thing [min,max] times. */
95 #define REPMIN 11 /* max,min Match this (simple) thing [min,max] times, mininal match. */
96 #define REPX 12 /* max,min Match this (complex) thing [min,max] times. */
97 #define REPXMIN 13 /* max,min Match this (complex) thing [min,max] times, minimal match. */
99 #define WORDA 15 /* no Match "" at wordchar, where prev is nonword */
100 #define WORDZ 16 /* no Match "" at nonwordchar, where prev is word */
101 #define OPEN 20 /* no Mark this point in input as start of #n. */
102 /* OPEN+1 is number 1, etc. */
103 #define CLOSE (OPEN+REG_MAX_PAREN) /* no Analogous to OPEN. */
104 #define CLOSE_END (CLOSE+REG_MAX_PAREN)
107 * The first byte of the regexp internal "program" is actually this magic
108 * number; the start node begins in the second byte.
110 #define REG_MAGIC 0xFADED00D
113 * Opcode notes:
115 * BRANCH The set of branches constituting a single choice are hooked
116 * together with their "next" pointers, since precedence prevents
117 * anything being concatenated to any individual branch. The
118 * "next" pointer of the last BRANCH in a choice points to the
119 * thing following the whole choice. This is also where the
120 * final "next" pointer of each individual branch points; each
121 * branch starts with the operand node of a BRANCH node.
123 * BACK Normal "next" pointers all implicitly point forward; BACK
124 * exists to make loop structures possible.
126 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
127 * BRANCH structures using BACK. Simple cases (one character
128 * per match) are implemented with STAR and PLUS for speed
129 * and to minimize recursive plunges.
131 * OPEN,CLOSE ...are numbered at compile time.
135 * A node is one char of opcode followed by two chars of "next" pointer.
136 * "Next" pointers are stored as two 8-bit pieces, high order first. The
137 * value is a positive offset from the opcode of the node containing it.
138 * An operand, if any, simply follows the node. (Note that much of the
139 * code generation knows about this implicit relationship.)
141 * Using two bytes for the "next" pointer is vast overkill for most things,
142 * but allows patterns to get big without disasters.
144 #define OP(p) ((p)[0])
145 #define NEXT(p) ((p)[1])
146 #define OPERAND(p) ((p) + 2)
149 * See regmagic.h for one further detail of program structure.
154 * Utility definitions.
157 #define FAIL(R,M) { (R)->err = (M); return (M); }
158 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?' || (c) == '{')
159 #define META "^$.[()|?{+*"
162 * Flags to be passed up and down.
164 #define HASWIDTH 01 /* Known never to match null string. */
165 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
166 #define SPSTART 04 /* Starts with * or +. */
167 #define WORST 0 /* Worst case. */
169 #define MAX_REP_COUNT 1000000
172 * Forward declarations for regcomp()'s friends.
174 static int *reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp );
175 static int *regpiece(regex_t *preg, int *flagp );
176 static int *regbranch(regex_t *preg, int *flagp );
177 static int *regatom(regex_t *preg, int *flagp );
178 static int *regnode(regex_t *preg, int op );
179 static const int *regnext(regex_t *preg, const int *p );
180 static void regc(regex_t *preg, int b );
181 static int *reginsert(regex_t *preg, int op, int size, int *opnd );
182 static void regtail(regex_t *preg, int *p, const int *val );
183 static void regoptail(regex_t *preg, int *p, const int *val );
185 static int reg_range_find(const int *string, int c);
186 static const char *str_find(const char *string, int c, int nocase);
187 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase);
189 /*#define DEBUG*/
190 #ifdef DEBUG
191 int regnarrate = 0;
192 static void regdump(regex_t *preg);
193 static const char *regprop( const int *op );
194 #endif
197 static int regdummy;
200 * Returns the length of the null-terminated integer sequence.
202 static int str_int_len(const int *seq)
204 int n = 0;
205 while (*seq++) {
206 n++;
208 return n;
212 - regcomp - compile a regular expression into internal code
214 * We can't allocate space until we know how big the compiled form will be,
215 * but we can't compile it (and thus know how big it is) until we've got a
216 * place to put the code. So we cheat: we compile it twice, once with code
217 * generation turned off and size counting turned on, and once "for real".
218 * This also means that we don't allocate space until we are sure that the
219 * thing really will compile successfully, and we never have to move the
220 * code and thus invalidate pointers into it. (Note that it has to be in
221 * one piece because free() must be able to free it all.)
223 * Beware that the optimization-preparation code in here knows about some
224 * of the structure of the compiled regexp.
226 int regcomp(regex_t *preg, const char *exp, int cflags)
228 const int *scan;
229 const int *longest;
230 unsigned len;
231 int flags;
233 #ifdef DEBUG
234 fprintf(stderr, "Compiling: '%s'\n", exp);
235 #endif
236 memset(preg, 0, sizeof(*preg));
238 if (exp == NULL)
239 FAIL(preg, REG_ERR_NULL_ARGUMENT);
241 /* First pass: determine size, legality. */
242 preg->cflags = cflags;
243 preg->regparse = exp;
244 preg->re_nsub = 0;
245 preg->regsize = 0L;
246 preg->regcode = &regdummy;
247 regc(preg, REG_MAGIC);
248 if (reg(preg, 0, &flags) == NULL)
249 return preg->err;
251 /* Small enough for pointer-storage convention? */
252 if (preg->regsize >= 32767L || preg->re_nsub >= REG_MAX_PAREN) /* Probably could be 65535L. */
253 FAIL(preg,REG_ERR_TOO_BIG);
255 /* Allocate space. */
256 preg->program = malloc(preg->regsize * sizeof(*preg->program));
257 if (preg->program == NULL)
258 FAIL(preg, REG_ERR_NOMEM);
260 /* Second pass: emit code. */
261 preg->regparse = exp;
262 preg->re_nsub = 0;
263 preg->regsize = 0L;
264 preg->regcode = preg->program;
265 regc(preg, REG_MAGIC);
266 if (reg(preg, 0, &flags) == NULL)
267 return preg->err;
269 /* Dig out information for optimizations. */
270 preg->regstart = 0; /* Worst-case defaults. */
271 preg->reganch = 0;
272 preg->regmust = NULL;
273 preg->regmlen = 0;
274 scan = preg->program+1; /* First BRANCH. */
275 if (OP(regnext(preg, scan)) == END) { /* Only one top-level choice. */
276 scan = OPERAND(scan);
278 /* Starting-point info. */
279 if (OP(scan) == EXACTLY) {
280 preg->regstart = *OPERAND(scan);
282 else if (OP(scan) == BOL)
283 preg->reganch++;
286 * If there's something expensive in the r.e., find the
287 * longest literal string that must appear and make it the
288 * regmust. Resolve ties in favor of later strings, since
289 * the regstart check works with the beginning of the r.e.
290 * and avoiding duplication strengthens checking. Not a
291 * strong reason, but sufficient in the absence of others.
293 if (flags&SPSTART) {
294 longest = NULL;
295 len = 0;
296 for (; scan != NULL; scan = regnext(preg, scan)) {
297 if (OP(scan) == EXACTLY) {
298 int plen = str_int_len(OPERAND(scan));
299 if (plen >= len) {
300 longest = OPERAND(scan);
301 len = plen;
305 preg->regmust = longest;
306 preg->regmlen = len;
310 #ifdef DEBUG
311 regdump(preg);
312 #endif
314 return 0;
318 - reg - regular expression, i.e. main body or parenthesized thing
320 * Caller must absorb opening parenthesis.
322 * Combining parenthesis handling with the base level of regular expression
323 * is a trifle forced, but the need to tie the tails of the branches to what
324 * follows makes it hard to avoid.
326 static int *reg(regex_t *preg, int paren /* Parenthesized? */, int *flagp )
328 int *ret;
329 int *br;
330 const int *ender;
331 int parno = 0;
332 int flags;
334 *flagp = HASWIDTH; /* Tentatively. */
336 /* Make an OPEN node, if parenthesized. */
337 if (paren) {
338 parno = ++preg->re_nsub;
339 ret = regnode(preg, OPEN+parno);
340 } else
341 ret = NULL;
343 /* Pick up the branches, linking them together. */
344 br = regbranch(preg, &flags);
345 if (br == NULL)
346 return(NULL);
347 if (ret != NULL)
348 regtail(preg, ret, br); /* OPEN -> first. */
349 else
350 ret = br;
351 if (!(flags&HASWIDTH))
352 *flagp &= ~HASWIDTH;
353 *flagp |= flags&SPSTART;
354 while (*preg->regparse == '|') {
355 preg->regparse++;
356 br = regbranch(preg, &flags);
357 if (br == NULL)
358 return(NULL);
359 regtail(preg, ret, br); /* BRANCH -> BRANCH. */
360 if (!(flags&HASWIDTH))
361 *flagp &= ~HASWIDTH;
362 *flagp |= flags&SPSTART;
365 /* Make a closing node, and hook it on the end. */
366 ender = regnode(preg, (paren) ? CLOSE+parno : END);
367 regtail(preg, ret, ender);
369 /* Hook the tails of the branches to the closing node. */
370 for (br = ret; br != NULL; br = (int *)regnext(preg, br))
371 regoptail(preg, br, ender);
373 /* Check for proper termination. */
374 if (paren && *preg->regparse++ != ')') {
375 preg->err = REG_ERR_UNMATCHED_PAREN;
376 return NULL;
377 } else if (!paren && *preg->regparse != '\0') {
378 if (*preg->regparse == ')') {
379 preg->err = REG_ERR_UNMATCHED_PAREN;
380 return NULL;
381 } else {
382 preg->err = REG_ERR_JUNK_ON_END;
383 return NULL;
387 return(ret);
391 - regbranch - one alternative of an | operator
393 * Implements the concatenation operator.
395 static int *regbranch(regex_t *preg, int *flagp )
397 int *ret;
398 int *chain;
399 int *latest;
400 int flags;
402 *flagp = WORST; /* Tentatively. */
404 ret = regnode(preg, BRANCH);
405 chain = NULL;
406 while (*preg->regparse != '\0' && *preg->regparse != ')' &&
407 *preg->regparse != '|') {
408 latest = regpiece(preg, &flags);
409 if (latest == NULL)
410 return(NULL);
411 *flagp |= flags&HASWIDTH;
412 if (chain == NULL) {/* First piece. */
413 *flagp |= flags&SPSTART;
415 else {
416 regtail(preg, chain, latest);
418 chain = latest;
420 if (chain == NULL) /* Loop ran zero times. */
421 (void) regnode(preg, NOTHING);
423 return(ret);
427 - regpiece - something followed by possible [*+?]
429 * Note that the branching code sequences used for ? and the general cases
430 * of * and + are somewhat optimized: they use the same NOTHING node as
431 * both the endmarker for their branch list and the body of the last branch.
432 * It might seem that this node could be dispensed with entirely, but the
433 * endmarker role is not redundant.
435 static int *regpiece(regex_t *preg, int *flagp)
437 int *ret;
438 char op;
439 int *next;
440 int flags;
441 int size = preg->regsize;
442 int *chain = NULL;
443 int min;
444 int max;
446 ret = regatom(preg, &flags);
447 if (ret == NULL)
448 return(NULL);
450 size = preg->regsize - size;
452 op = *preg->regparse;
453 if (!ISMULT(op)) {
454 *flagp = flags;
455 return(ret);
458 if (!(flags&HASWIDTH) && op != '?') {
459 preg->err = REG_ERR_OPERAND_COULD_BE_EMPTY;
460 return NULL;
463 /* Handle braces (counted repetition) by expansion */
464 if (op == '{') {
465 char *end;
467 min = strtoul(preg->regparse + 1, &end, 10);
468 if (end == preg->regparse + 1) {
469 preg->err = REG_ERR_BAD_COUNT;
470 return NULL;
472 if (*end == '}') {
473 max = min;
475 else {
476 preg->regparse = end;
477 max = strtoul(preg->regparse + 1, &end, 10);
478 if (*end != '}') {
479 preg->err = REG_ERR_UNMATCHED_BRACES;
480 return NULL;
483 if (end == preg->regparse + 1) {
484 max = MAX_REP_COUNT;
486 else if (max < min || max >= 100) {
487 preg->err = REG_ERR_BAD_COUNT;
488 return NULL;
490 if (min >= 100) {
491 preg->err = REG_ERR_BAD_COUNT;
492 return NULL;
495 preg->regparse = strchr(preg->regparse, '}');
497 else {
498 min = (op == '+');
499 max = (op == '?' ? 1 : MAX_REP_COUNT);
502 if (preg->regparse[1] == '?') {
503 preg->regparse++;
504 next = reginsert(preg, flags & SIMPLE ? REPMIN : REPXMIN, 5, ret);
506 else {
507 next = reginsert(preg, flags & SIMPLE ? REP: REPX, 5, ret);
509 ret[2] = max;
510 ret[3] = min;
511 ret[4] = 0;
513 *flagp = (min) ? (WORST|HASWIDTH) : (WORST|SPSTART);
515 if (!(flags & SIMPLE)) {
516 int *back = regnode(preg, BACK);
517 regtail(preg, back, ret);
518 regtail(preg, next, back);
521 preg->regparse++;
522 if (ISMULT(*preg->regparse)) {
523 preg->err = REG_ERR_NESTED_COUNT;
524 return NULL;
527 return chain ? chain : ret;
531 * Add all characters in the inclusive range between lower and upper.
533 * Handles a swapped range (upper < lower).
535 static void reg_addrange(regex_t *preg, int lower, int upper)
537 if (lower > upper) {
538 reg_addrange(preg, upper, lower);
540 /* Add a range as length, start */
541 regc(preg, upper - lower + 1);
542 regc(preg, lower);
546 * Add a null-terminated literal string as a set of ranges.
548 static void reg_addrange_str(regex_t *preg, const char *str)
550 while (*str) {
551 reg_addrange(preg, *str, *str);
552 str++;
557 * Extracts the next unicode char from utf8.
559 * If 'upper' is set, converts the char to uppercase.
561 static int reg_utf8_tounicode_case(const char *s, int *uc, int upper)
563 int l = utf8_tounicode(s, uc);
564 if (upper) {
565 *uc = utf8_upper(*uc);
567 return l;
571 * Converts a hex digit to decimal.
573 * Returns -1 for an invalid hex digit.
575 static int hexdigitval(int c)
577 if (c >= '0' && c <= '9')
578 return c - '0';
579 if (c >= 'a' && c <= 'f')
580 return c - 'a' + 10;
581 if (c >= 'A' && c <= 'F')
582 return c - 'A' + 10;
583 return -1;
587 * Parses up to 'n' hex digits at 's' and stores the result in *uc.
589 * Returns the number of hex digits parsed.
590 * If there are no hex digits, returns 0 and stores nothing.
592 static int parse_hex(const char *s, int n, int *uc)
594 int val = 0;
595 int k;
597 for (k = 0; k < n; k++) {
598 int c = hexdigitval(*s++);
599 if (c == -1) {
600 break;
602 val = (val << 4) | c;
604 if (k) {
605 *uc = val;
607 return k;
611 * Call for chars after a backlash to decode the escape sequence.
613 * Stores the result in *ch.
615 * Returns the number of bytes consumed.
617 static int reg_decode_escape(const char *s, int *ch)
619 int n;
620 const char *s0 = s;
622 *ch = *s++;
624 switch (*ch) {
625 case 'b': *ch = '\b'; break;
626 case 'e': *ch = 27; break;
627 case 'f': *ch = '\f'; break;
628 case 'n': *ch = '\n'; break;
629 case 'r': *ch = '\r'; break;
630 case 't': *ch = '\t'; break;
631 case 'v': *ch = '\v'; break;
632 case 'u':
633 if ((n = parse_hex(s, 4, ch)) > 0) {
634 s += n;
636 break;
637 case 'x':
638 if ((n = parse_hex(s, 2, ch)) > 0) {
639 s += n;
641 break;
642 case '\0':
643 s--;
644 *ch = '\\';
645 break;
647 return s - s0;
651 - regatom - the lowest level
653 * Optimization: gobbles an entire sequence of ordinary characters so that
654 * it can turn them into a single node, which is smaller to store and
655 * faster to run. Backslashed characters are exceptions, each becoming a
656 * separate node; the code is simpler that way and it's not worth fixing.
658 static int *regatom(regex_t *preg, int *flagp)
660 int *ret;
661 int flags;
662 int nocase = (preg->cflags & REG_ICASE);
664 int ch;
665 int n = reg_utf8_tounicode_case(preg->regparse, &ch, nocase);
667 *flagp = WORST; /* Tentatively. */
669 preg->regparse += n;
670 switch (ch) {
671 /* FIXME: these chars only have meaning at beg/end of pat? */
672 case '^':
673 ret = regnode(preg, BOL);
674 break;
675 case '$':
676 ret = regnode(preg, EOL);
677 break;
678 case '.':
679 ret = regnode(preg, ANY);
680 *flagp |= HASWIDTH|SIMPLE;
681 break;
682 case '[': {
683 const char *pattern = preg->regparse;
685 if (*pattern == '^') { /* Complement of range. */
686 ret = regnode(preg, ANYBUT);
687 pattern++;
688 } else
689 ret = regnode(preg, ANYOF);
691 /* Special case. If the first char is ']' or '-', it is part of the set */
692 if (*pattern == ']' || *pattern == '-') {
693 reg_addrange(preg, *pattern, *pattern);
694 pattern++;
697 while (*pattern && *pattern != ']') {
698 /* Is this a range? a-z */
699 int start;
700 int end;
702 pattern += reg_utf8_tounicode_case(pattern, &start, nocase);
703 if (start == '\\') {
704 pattern += reg_decode_escape(pattern, &start);
705 if (start == 0) {
706 preg->err = REG_ERR_NULL_CHAR;
707 return NULL;
710 if (pattern[0] == '-' && pattern[1]) {
711 /* skip '-' */
712 pattern += utf8_tounicode(pattern, &end);
713 pattern += reg_utf8_tounicode_case(pattern, &end, nocase);
714 if (end == '\\') {
715 pattern += reg_decode_escape(pattern, &end);
716 if (end == 0) {
717 preg->err = REG_ERR_NULL_CHAR;
718 return NULL;
722 reg_addrange(preg, start, end);
723 continue;
725 if (start == '[') {
726 if (strncmp(pattern, ":alpha:]", 8) == 0) {
727 if ((preg->cflags & REG_ICASE) == 0) {
728 reg_addrange(preg, 'a', 'z');
730 reg_addrange(preg, 'A', 'Z');
731 pattern += 8;
732 continue;
734 if (strncmp(pattern, ":alnum:]", 8) == 0) {
735 if ((preg->cflags & REG_ICASE) == 0) {
736 reg_addrange(preg, 'a', 'z');
738 reg_addrange(preg, 'A', 'Z');
739 reg_addrange(preg, '0', '9');
740 pattern += 8;
741 continue;
743 if (strncmp(pattern, ":space:]", 8) == 0) {
744 reg_addrange_str(preg, " \t\r\n\f\v");
745 pattern += 8;
746 continue;
749 /* Not a range, so just add the char */
750 reg_addrange(preg, start, start);
752 regc(preg, '\0');
754 if (*pattern) {
755 pattern++;
757 preg->regparse = pattern;
759 *flagp |= HASWIDTH|SIMPLE;
761 break;
762 case '(':
763 ret = reg(preg, 1, &flags);
764 if (ret == NULL)
765 return(NULL);
766 *flagp |= flags&(HASWIDTH|SPSTART);
767 break;
768 case '\0':
769 case '|':
770 case ')':
771 preg->err = REG_ERR_INTERNAL;
772 return NULL; /* Supposed to be caught earlier. */
773 case '?':
774 case '+':
775 case '*':
776 case '{':
777 preg->err = REG_ERR_COUNT_FOLLOWS_NOTHING;
778 return NULL;
779 case '\\':
780 switch (*preg->regparse++) {
781 case '\0':
782 preg->err = REG_ERR_TRAILING_BACKSLASH;
783 return NULL;
784 break;
785 case '<':
786 case 'm':
787 ret = regnode(preg, WORDA);
788 break;
789 case '>':
790 case 'M':
791 ret = regnode(preg, WORDZ);
792 break;
793 case 'd':
794 ret = regnode(preg, ANYOF);
795 reg_addrange(preg, '0', '9');
796 regc(preg, '\0');
797 *flagp |= HASWIDTH|SIMPLE;
798 break;
799 case 'w':
800 ret = regnode(preg, ANYOF);
801 if ((preg->cflags & REG_ICASE) == 0) {
802 reg_addrange(preg, 'a', 'z');
804 reg_addrange(preg, 'A', 'Z');
805 reg_addrange(preg, '0', '9');
806 reg_addrange(preg, '_', '_');
807 regc(preg, '\0');
808 *flagp |= HASWIDTH|SIMPLE;
809 break;
810 case 's':
811 ret = regnode(preg, ANYOF);
812 reg_addrange_str(preg," \t\r\n\f\v");
813 regc(preg, '\0');
814 *flagp |= HASWIDTH|SIMPLE;
815 break;
816 /* FIXME: Someday handle \1, \2, ... */
817 default:
818 /* Handle general quoted chars in exact-match routine */
819 /* Back up to include the backslash */
820 preg->regparse--;
821 goto de_fault;
823 break;
824 de_fault:
825 default: {
827 * Encode a string of characters to be matched exactly.
829 int added = 0;
831 /* Back up to pick up the first char of interest */
832 preg->regparse -= n;
834 ret = regnode(preg, EXACTLY);
836 /* Note that a META operator such as ? or * consumes the
837 * preceding char.
838 * Thus we must be careful to look ahead by 2 and add the
839 * last char as it's own EXACTLY if necessary
842 /* Until end of string or a META char is reached */
843 while (*preg->regparse && strchr(META, *preg->regparse) == NULL) {
844 n = reg_utf8_tounicode_case(preg->regparse, &ch, (preg->cflags & REG_ICASE));
845 if (ch == '\\' && preg->regparse[n]) {
846 /* Non-trailing backslash.
847 * Is this a special escape, or a regular escape?
849 if (strchr("<>mMwds", preg->regparse[n])) {
850 /* A special escape. All done with EXACTLY */
851 break;
853 /* Decode it. Note that we add the length for the escape
854 * sequence to the length for the backlash so we can skip
855 * the entire sequence, or not as required.
857 n += reg_decode_escape(preg->regparse + n, &ch);
858 if (ch == 0) {
859 preg->err = REG_ERR_NULL_CHAR;
860 return NULL;
864 /* Now we have one char 'ch' of length 'n'.
865 * Check to see if the following char is a MULT
868 if (ISMULT(preg->regparse[n])) {
869 /* Yes. But do we already have some EXACTLY chars? */
870 if (added) {
871 /* Yes, so return what we have and pick up the current char next time around */
872 break;
874 /* No, so add this single char and finish */
875 regc(preg, ch);
876 added++;
877 preg->regparse += n;
878 break;
881 /* No, so just add this char normally */
882 regc(preg, ch);
883 added++;
884 preg->regparse += n;
886 regc(preg, '\0');
888 *flagp |= HASWIDTH;
889 if (added == 1)
890 *flagp |= SIMPLE;
891 break;
893 break;
896 return(ret);
900 - regnode - emit a node
902 /* Location. */
903 static int *regnode(regex_t *preg, int op)
905 int *ret;
906 int *ptr;
908 preg->regsize += 2;
909 ret = preg->regcode;
910 if (ret == &regdummy) {
911 return(ret);
914 ptr = ret;
915 *ptr++ = op;
916 *ptr++ = 0; /* Null "next" pointer. */
917 preg->regcode = ptr;
919 return(ret);
923 - regc - emit (if appropriate) a byte of code
925 static void regc(regex_t *preg, int b )
927 preg->regsize++;
928 if (preg->regcode != &regdummy)
929 *preg->regcode++ = b;
933 - reginsert - insert an operator in front of already-emitted operand
935 * Means relocating the operand.
936 * Returns the new location of the original operand.
938 static int *reginsert(regex_t *preg, int op, int size, int *opnd )
940 int *src;
941 int *dst;
942 int *place;
944 preg->regsize += size;
946 if (preg->regcode == &regdummy) {
947 return opnd;
950 src = preg->regcode;
951 preg->regcode += size;
952 dst = preg->regcode;
953 while (src > opnd)
954 *--dst = *--src;
956 place = opnd; /* Op node, where operand used to be. */
957 *place++ = op;
958 while (--size) {
959 *place++ = 0;
962 return place;
966 - regtail - set the next-pointer at the end of a node chain
968 static void regtail(regex_t *preg, int *p, const int *val )
970 int *scan;
971 int *temp;
972 int offset;
974 if (p == &regdummy)
975 return;
977 /* Find last node. */
978 scan = p;
979 for (;;) {
980 temp = (int *)regnext(preg, scan);
981 if (temp == NULL)
982 break;
983 scan = temp;
986 if (OP(scan) == BACK)
987 offset = scan - val;
988 else
989 offset = val - scan;
991 scan[1] = offset;
995 - regoptail - regtail on operand of first argument; nop if operandless
998 static void regoptail(regex_t *preg, int *p, const int *val )
1000 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
1001 if (p == NULL || p == &regdummy || OP(p) != BRANCH)
1002 return;
1003 regtail(preg, OPERAND(p), val);
1007 * regexec and friends
1011 * Forwards.
1013 static int regtry(regex_t *preg, const char *string );
1014 static int regmatch(regex_t *preg, const int *prog);
1015 static int regrepeat(regex_t *preg, const int *p, int max);
1018 - regexec - match a regexp against a string
1020 int regexec(regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags)
1022 const char *s;
1023 const int *scan;
1025 /* Be paranoid... */
1026 if (preg == NULL || preg->program == NULL || string == NULL) {
1027 return REG_ERR_NULL_ARGUMENT;
1030 /* Check validity of program. */
1031 if (*preg->program != REG_MAGIC) {
1032 return REG_ERR_CORRUPTED;
1035 #ifdef DEBUG
1036 fprintf(stderr, "regexec: %s\n", string);
1037 regdump(preg);
1038 #endif
1040 preg->eflags = eflags;
1041 preg->pmatch = pmatch;
1042 preg->nmatch = nmatch;
1043 preg->start = string; /* All offsets are computed from here */
1045 /* Must clear out the embedded repeat counts */
1046 for (scan = OPERAND(preg->program + 1); scan != NULL; scan = regnext(preg, scan)) {
1047 switch (OP(scan)) {
1048 case REP:
1049 case REPMIN:
1050 case REPX:
1051 case REPXMIN:
1052 *(int *)(scan + 4) = 0;
1053 break;
1057 /* If there is a "must appear" string, look for it. */
1058 if (preg->regmust != NULL) {
1059 s = string;
1060 while ((s = str_find(s, preg->regmust[0], preg->cflags & REG_ICASE)) != NULL) {
1061 if (prefix_cmp(preg->regmust, preg->regmlen, s, preg->cflags & REG_ICASE) >= 0) {
1062 break;
1064 s++;
1066 if (s == NULL) /* Not present. */
1067 return REG_NOMATCH;
1070 /* Mark beginning of line for ^ . */
1071 preg->regbol = string;
1073 /* Simplest case: anchored match need be tried only once (maybe per line). */
1074 if (preg->reganch) {
1075 if (eflags & REG_NOTBOL) {
1076 /* This is an anchored search, but not an BOL, so possibly skip to the next line */
1077 goto nextline;
1079 while (1) {
1080 int ret = regtry(preg, string);
1081 if (ret) {
1082 return REG_NOERROR;
1084 if (*string) {
1085 nextline:
1086 if (preg->cflags & REG_NEWLINE) {
1087 /* Try the next anchor? */
1088 string = strchr(string, '\n');
1089 if (string) {
1090 preg->regbol = ++string;
1091 continue;
1095 return REG_NOMATCH;
1099 /* Messy cases: unanchored match. */
1100 s = string;
1101 if (preg->regstart != '\0') {
1102 /* We know what char it must start with. */
1103 while ((s = str_find(s, preg->regstart, preg->cflags & REG_ICASE)) != NULL) {
1104 if (regtry(preg, s))
1105 return REG_NOERROR;
1106 s++;
1109 else
1110 /* We don't -- general case. */
1111 while (1) {
1112 if (regtry(preg, s))
1113 return REG_NOERROR;
1114 if (*s == '\0') {
1115 break;
1117 s += utf8_charlen(*s);
1120 /* Failure. */
1121 return REG_NOMATCH;
1125 - regtry - try match at specific point
1127 /* 0 failure, 1 success */
1128 static int regtry( regex_t *preg, const char *string )
1130 int i;
1132 preg->reginput = string;
1134 for (i = 0; i < preg->nmatch; i++) {
1135 preg->pmatch[i].rm_so = -1;
1136 preg->pmatch[i].rm_eo = -1;
1138 if (regmatch(preg, preg->program + 1)) {
1139 preg->pmatch[0].rm_so = string - preg->start;
1140 preg->pmatch[0].rm_eo = preg->reginput - preg->start;
1141 return(1);
1142 } else
1143 return(0);
1147 * Returns bytes matched if 'pattern' is a prefix of 'string'.
1149 * If 'nocase' is non-zero, does a case-insensitive match.
1151 * Returns -1 on not found.
1153 static int prefix_cmp(const int *prog, int proglen, const char *string, int nocase)
1155 const char *s = string;
1156 while (proglen && *s) {
1157 int ch;
1158 int n = reg_utf8_tounicode_case(s, &ch, nocase);
1159 if (ch != *prog) {
1160 return -1;
1162 prog++;
1163 s += n;
1164 proglen--;
1166 if (proglen == 0) {
1167 return s - string;
1169 return -1;
1173 * Searchs for 'c' in the range 'range'.
1175 * Returns 1 if found, or 0 if not.
1177 static int reg_range_find(const int *range, int c)
1179 while (*range) {
1180 /*printf("Checking %d in range [%d,%d]\n", c, range[1], (range[0] + range[1] - 1));*/
1181 if (c >= range[1] && c <= (range[0] + range[1] - 1)) {
1182 return 1;
1184 range += 2;
1186 return 0;
1190 * Search for the character 'c' in the utf-8 string 'string'.
1192 * If 'nocase' is set, the 'string' is assumed to be uppercase
1193 * and 'c' is converted to uppercase before matching.
1195 * Returns the byte position in the string where the 'c' was found, or
1196 * NULL if not found.
1198 static const char *str_find(const char *string, int c, int nocase)
1200 if (nocase) {
1201 /* The "string" should already be converted to uppercase */
1202 c = utf8_upper(c);
1204 while (*string) {
1205 int ch;
1206 int n = reg_utf8_tounicode_case(string, &ch, nocase);
1207 if (c == ch) {
1208 return string;
1210 string += n;
1212 return NULL;
1216 * Returns true if 'ch' is an end-of-line char.
1218 * In REG_NEWLINE mode, \n is considered EOL in
1219 * addition to \0
1221 static int reg_iseol(regex_t *preg, int ch)
1223 if (preg->cflags & REG_NEWLINE) {
1224 return ch == '\0' || ch == '\n';
1226 else {
1227 return ch == '\0';
1231 static int regmatchsimplerepeat(regex_t *preg, const int *scan, int matchmin)
1233 int nextch = '\0';
1234 const char *save;
1235 int no;
1236 int c;
1238 int max = scan[2];
1239 int min = scan[3];
1240 const int *next = regnext(preg, scan);
1243 * Lookahead to avoid useless match attempts
1244 * when we know what character comes next.
1246 if (OP(next) == EXACTLY) {
1247 nextch = *OPERAND(next);
1249 save = preg->reginput;
1250 no = regrepeat(preg, scan + 5, max);
1251 if (no < min) {
1252 return 0;
1254 if (matchmin) {
1255 /* from min up to no */
1256 max = no;
1257 no = min;
1259 /* else from no down to min */
1260 while (1) {
1261 if (matchmin) {
1262 if (no > max) {
1263 break;
1266 else {
1267 if (no < min) {
1268 break;
1271 preg->reginput = save + utf8_index(save, no);
1272 reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1273 /* If it could work, try it. */
1274 if (reg_iseol(preg, nextch) || c == nextch) {
1275 if (regmatch(preg, next)) {
1276 return(1);
1279 if (matchmin) {
1280 /* Couldn't or didn't, add one more */
1281 no++;
1283 else {
1284 /* Couldn't or didn't -- back up. */
1285 no--;
1288 return(0);
1291 static int regmatchrepeat(regex_t *preg, int *scan, int matchmin)
1293 const char *save;
1295 int max = scan[2];
1296 int min = scan[3];
1298 save = preg->reginput;
1300 /* Have we reached min? */
1301 if (scan[4] < min) {
1302 /* No, so get another one */
1303 scan[4]++;
1304 if (regmatch(preg, scan + 5)) {
1305 return 1;
1307 scan[4]--;
1308 return 0;
1310 if (scan[4] > max) {
1311 return 0;
1314 if (matchmin) {
1315 /* minimal, so try other branch first */
1316 if (regmatch(preg, regnext(preg, scan))) {
1317 return 1;
1319 /* No, so try one more */
1320 scan[4]++;
1321 if (regmatch(preg, scan + 5)) {
1322 return 1;
1324 scan[4]--;
1325 return 0;
1327 /* maximal, so try this branch again */
1328 save = preg->reginput;
1329 if (scan[4] < max) {
1330 scan[4]++;
1331 if (regmatch(preg, scan + 5)) {
1332 return 1;
1334 scan[4]--;
1336 /* At this point we are at max with no match. Back up by one and try the other branch */
1337 preg->reginput = save;
1338 int ret = regmatch(preg, regnext(preg, scan));
1339 return ret;
1343 - regmatch - main matching routine
1345 * Conceptually the strategy is simple: check to see whether the current
1346 * node matches, call self recursively to see whether the rest matches,
1347 * and then act accordingly. In practice we make some effort to avoid
1348 * recursion, in particular by going through "ordinary" nodes (that don't
1349 * need to know whether the rest of the match failed) by a loop instead of
1350 * by recursion.
1352 /* 0 failure, 1 success */
1353 static int regmatch(regex_t *preg, const int *prog)
1355 const int *scan; /* Current node. */
1356 const int *next; /* Next node. */
1358 scan = prog;
1360 #ifdef DEBUG
1361 if (scan != NULL && regnarrate)
1362 fprintf(stderr, "%s(\n", regprop(scan));
1363 #endif
1364 while (scan != NULL) {
1365 int n;
1366 int c;
1367 #ifdef DEBUG
1368 if (regnarrate) {
1369 //fprintf(stderr, "%s...\n", regprop(scan));
1370 int op = OP(scan);
1371 fprintf(stderr, "%2d{%02x}%s...\n", (int)(scan-preg->program), op, regprop(scan)); /* Where, what. */
1373 #endif
1374 next = regnext(preg, scan);
1375 n = reg_utf8_tounicode_case(preg->reginput, &c, (preg->cflags & REG_ICASE));
1377 switch (OP(scan)) {
1378 case BOL:
1379 if (preg->reginput != preg->regbol)
1380 return(0);
1381 break;
1382 case EOL:
1383 if (!reg_iseol(preg, c)) {
1384 return(0);
1386 break;
1387 case WORDA:
1388 /* Must be looking at a letter, digit, or _ */
1389 if ((!isalnum(UCHAR(c))) && c != '_')
1390 return(0);
1391 /* Prev must be BOL or nonword */
1392 if (preg->reginput > preg->regbol &&
1393 (isalnum(UCHAR(preg->reginput[-1])) || preg->reginput[-1] == '_'))
1394 return(0);
1395 break;
1396 case WORDZ:
1397 /* Can't match at BOL */
1398 if (preg->reginput > preg->regbol) {
1399 /* Current must be EOL or nonword */
1400 if (reg_iseol(preg, c) || !isalnum(UCHAR(c)) || c != '_') {
1401 c = preg->reginput[-1];
1402 /* Previous must be word */
1403 if (isalnum(UCHAR(c)) || c == '_') {
1404 break;
1408 /* No */
1409 return(0);
1411 case ANY:
1412 if (reg_iseol(preg, c))
1413 return 0;
1414 preg->reginput += n;
1415 break;
1416 case EXACTLY: {
1417 const int *opnd;
1418 int len;
1419 int slen;
1421 opnd = OPERAND(scan);
1422 len = str_int_len(opnd);
1424 slen = prefix_cmp(opnd, len, preg->reginput, preg->cflags & REG_ICASE);
1425 if (slen < 0) {
1426 return(0);
1428 preg->reginput += slen;
1430 break;
1431 case ANYOF:
1432 if (reg_iseol(preg, c) || reg_range_find(OPERAND(scan), c) == 0) {
1433 return(0);
1435 preg->reginput += n;
1436 break;
1437 case ANYBUT:
1438 if (reg_iseol(preg, c) || reg_range_find(OPERAND(scan), c) != 0) {
1439 return(0);
1441 preg->reginput += n;
1442 break;
1443 case NOTHING:
1444 break;
1445 case BACK:
1446 break;
1447 case BRANCH: {
1448 const char *save;
1450 if (OP(next) != BRANCH) /* No choice. */
1451 next = OPERAND(scan); /* Avoid recursion. */
1452 else {
1453 do {
1454 save = preg->reginput;
1455 if (regmatch(preg, OPERAND(scan))) {
1456 return(1);
1458 preg->reginput = save;
1459 scan = regnext(preg, scan);
1460 } while (scan != NULL && OP(scan) == BRANCH);
1461 return(0);
1462 /* NOTREACHED */
1465 break;
1466 case REP:
1467 case REPMIN:
1468 return regmatchsimplerepeat(preg, scan, OP(scan) == REPMIN);
1470 case REPX:
1471 case REPXMIN:
1472 return regmatchrepeat(preg, (int *)scan, OP(scan) == REPXMIN);
1474 case END:
1475 return(1); /* Success! */
1476 break;
1477 default:
1478 if (OP(scan) >= OPEN+1 && OP(scan) < CLOSE_END) {
1479 const char *save;
1481 save = preg->reginput;
1483 if (regmatch(preg, next)) {
1484 int no;
1486 * Don't set startp if some later
1487 * invocation of the same parentheses
1488 * already has.
1490 if (OP(scan) < CLOSE) {
1491 no = OP(scan) - OPEN;
1492 if (no < preg->nmatch && preg->pmatch[no].rm_so == -1) {
1493 preg->pmatch[no].rm_so = save - preg->start;
1496 else {
1497 no = OP(scan) - CLOSE;
1498 if (no < preg->nmatch && preg->pmatch[no].rm_eo == -1) {
1499 preg->pmatch[no].rm_eo = save - preg->start;
1502 return(1);
1503 } else
1504 return(0);
1506 return REG_ERR_INTERNAL;
1509 scan = next;
1513 * We get here only if there's trouble -- normally "case END" is
1514 * the terminating point.
1516 return REG_ERR_INTERNAL;
1520 - regrepeat - repeatedly match something simple, report how many
1522 static int regrepeat(regex_t *preg, const int *p, int max)
1524 int count = 0;
1525 const char *scan;
1526 const int *opnd;
1527 int ch;
1528 int n;
1530 scan = preg->reginput;
1531 opnd = OPERAND(p);
1532 switch (OP(p)) {
1533 case ANY:
1534 /* No need to handle utf8 specially here */
1535 while (!reg_iseol(preg, *scan) && count < max) {
1536 count++;
1537 scan++;
1539 break;
1540 case EXACTLY:
1541 while (count < max) {
1542 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1543 if (*opnd != ch) {
1544 break;
1546 count++;
1547 scan += n;
1549 break;
1550 case ANYOF:
1551 while (count < max) {
1552 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1553 if (reg_iseol(preg, ch) || reg_range_find(opnd, ch) == 0) {
1554 break;
1556 count++;
1557 scan += n;
1559 break;
1560 case ANYBUT:
1561 while (count < max) {
1562 n = reg_utf8_tounicode_case(scan, &ch, preg->cflags & REG_ICASE);
1563 if (reg_iseol(preg, ch) || reg_range_find(opnd, ch) != 0) {
1564 break;
1566 count++;
1567 scan += n;
1569 break;
1570 default: /* Oh dear. Called inappropriately. */
1571 preg->err = REG_ERR_INTERNAL;
1572 count = 0; /* Best compromise. */
1573 break;
1575 preg->reginput = scan;
1577 return(count);
1581 - regnext - dig the "next" pointer out of a node
1583 static const int *regnext(regex_t *preg, const int *p )
1585 int offset;
1587 if (p == &regdummy)
1588 return(NULL);
1590 offset = NEXT(p);
1592 if (offset == 0)
1593 return(NULL);
1595 if (OP(p) == BACK)
1596 return(p-offset);
1597 else
1598 return(p+offset);
1601 #ifdef DEBUG
1604 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1606 static void regdump(regex_t *preg)
1608 const int *s;
1609 char op = EXACTLY; /* Arbitrary non-END op. */
1610 const int *next;
1611 char buf[4];
1613 if (preg->regcode == &regdummy)
1614 return;
1616 s = preg->program + 1;
1617 while (op != END && s < preg->regcode) { /* While that wasn't END last time... */
1618 op = OP(s);
1619 printf("%2d{%02x}%s", (int)(s-preg->program), op, regprop(s)); /* Where, what. */
1620 next = regnext(preg, s);
1621 if (next == NULL) /* Next ptr. */
1622 printf("(0)");
1623 else
1624 printf("(%d)", (int)((s-preg->program)+(next-s)));
1625 s += 2;
1626 if (op == REP || op == REPMIN || op == REPX || op == REPXMIN) {
1627 int max = s[0];
1628 int min = s[1];
1629 if (max == 65535) {
1630 printf("{%d,*}", min);
1632 else {
1633 printf("{%d,%d}", min, max);
1635 printf(" %d", s[2]);
1636 s += 3;
1638 else if (op == ANYOF || op == ANYBUT) {
1639 /* set of ranges */
1641 while (*s) {
1642 int len = *s++;
1643 int first = *s++;
1644 buf[utf8_fromunicode(buf, first)] = 0;
1645 printf("%s", buf);
1646 if (len > 1) {
1647 buf[utf8_fromunicode(buf, first + len - 1)] = 0;
1648 printf("-%s", buf);
1651 s++;
1653 else if (op == EXACTLY) {
1654 /* Literal string, where present. */
1656 while (*s) {
1657 buf[utf8_fromunicode(buf, *s)] = 0;
1658 printf("%s", buf);
1659 s++;
1661 s++;
1663 putchar('\n');
1666 if (op == END) {
1667 /* Header fields of interest. */
1668 if (preg->regstart) {
1669 buf[utf8_fromunicode(buf, preg->regstart)] = 0;
1670 printf("start '%s' ", buf);
1672 if (preg->reganch)
1673 printf("anchored ");
1674 if (preg->regmust != NULL) {
1675 int i;
1676 printf("must have:");
1677 for (i = 0; i < preg->regmlen; i++) {
1678 putchar(preg->regmust[i]);
1680 putchar('\n');
1683 printf("\n");
1687 - regprop - printable representation of opcode
1689 static const char *regprop( const int *op )
1691 char *p;
1692 static char buf[50];
1694 (void) strcpy(buf, ":");
1696 switch (OP(op)) {
1697 case BOL:
1698 p = "BOL";
1699 break;
1700 case EOL:
1701 p = "EOL";
1702 break;
1703 case ANY:
1704 p = "ANY";
1705 break;
1706 case ANYOF:
1707 p = "ANYOF";
1708 break;
1709 case ANYBUT:
1710 p = "ANYBUT";
1711 break;
1712 case BRANCH:
1713 p = "BRANCH";
1714 break;
1715 case EXACTLY:
1716 p = "EXACTLY";
1717 break;
1718 case NOTHING:
1719 p = "NOTHING";
1720 break;
1721 case BACK:
1722 p = "BACK";
1723 break;
1724 case END:
1725 p = "END";
1726 break;
1727 case REP:
1728 p = "REP";
1729 break;
1730 case REPMIN:
1731 p = "REPMIN";
1732 break;
1733 case REPX:
1734 p = "REPX";
1735 break;
1736 case REPXMIN:
1737 p = "REPXMIN";
1738 break;
1739 case WORDA:
1740 p = "WORDA";
1741 break;
1742 case WORDZ:
1743 p = "WORDZ";
1744 break;
1745 default:
1746 if (OP(op) >= OPEN && OP(op) < CLOSE) {
1747 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1749 else if (OP(op) >= CLOSE && OP(op) < CLOSE_END) {
1750 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1752 else {
1753 abort();
1755 p = NULL;
1756 break;
1758 if (p != NULL)
1759 (void) strcat(buf, p);
1760 return(buf);
1762 #endif
1764 size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
1766 static const char *error_strings[] = {
1767 "success",
1768 "no match",
1769 "bad pattern",
1770 "null argument",
1771 "unknown error",
1772 "too big",
1773 "out of memory",
1774 "too many ()",
1775 "parentheses () not balanced",
1776 "braces {} not balanced",
1777 "invalid repetition count(s)",
1778 "extra characters",
1779 "*+ of empty atom",
1780 "nested count",
1781 "internal error",
1782 "count follows nothing",
1783 "trailing backslash",
1784 "corrupted program",
1785 "contains null char",
1787 const char *err;
1789 if (errcode < 0 || errcode >= REG_ERR_NUM) {
1790 err = "Bad error code";
1792 else {
1793 err = error_strings[errcode];
1796 return snprintf(errbuf, errbuf_size, "%s", err);
1799 void regfree(regex_t *preg)
1801 free(preg->program);
1804 #endif