cosmetix
[k8jam.git] / src / hsregexp.c
blobba89027c92bcd7ea5042090a7fe4ef4e09ae1abe
1 /*
2 * hsRxCompile and hsRxExec -- regsub and hsRxError are elsewhere
4 * Copyright (c) 1986 by University of Toronto.
5 * Written by Henry Spencer. Not derived from licensed software.
7 * Permission is granted to anyone to use this software for any
8 * purpose on any computer system, and to redistribute it freely,
9 * subject to the following restrictions:
11 * 1. The author is not responsible for the consequences of use of
12 * this software, no matter how awful, even if they arise
13 * from defects in it.
15 * 2. The origin of this software must not be misrepresented, either
16 * by explicit claim or by omission.
18 * 3. Altered versions must be plainly marked as such, and must not
19 * be misrepresented as being the original software.
20 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
21 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22 *** to assist in implementing egrep.
23 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
24 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25 *** as in BSD grep and ex.
26 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
27 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
29 *** ames!jaw, on 19 June 1987, to quash a hsRxCompile() redundancy.
30 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
31 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
32 *** was moved into hsregexp.h, and the include of hsregexp.h now uses "'s
33 *** to avoid conflicting with the system hsregexp.h. Const, bless its
34 *** soul, was removed so it can compile everywhere. The declaration
35 *** of strchr() was in conflict on AIX, so it was removed (as it is
36 *** happily defined in string.h).
37 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
38 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
42 * Beware that some of this code is subtly aware of the way operator
43 * precedence is structured in regular expressions. Serious changes in
44 * regular-expression syntax might require a total rethink.
46 #include "hsregexp.h"
48 #include <ctype.h>
49 #include <stdio.h>
50 #include <stdlib.h>
51 #include <string.h>
54 * The "internal use only" fields in hsregexp.h are present to pass info from
55 * compile to execute that permits the execute phase to run lots faster on
56 * simple cases. They are:
58 * regstart char that must begin a match; '\0' if none obvious
59 * reganch is the match anchored (at beginning-of-line only)?
60 * regmust string (pointer into program) that match must include, or NULL
61 * regmlen length of regmust string
63 * Regstart and reganch permit very fast decisions on suitable starting points
64 * for a match, cutting down the work a lot. Regmust permits fast rejection
65 * of lines that cannot possibly match. The regmust tests are costly enough
66 * that hsRxCompile() supplies a regmust only if the r.e. contains something
67 * potentially expensive (at present, the only such thing detected is * or +
68 * at the start of the r.e., which can involve a lot of backup). Regmlen is
69 * supplied because the test in hsRxExec() needs it and hsRxCompile() is computing
70 * it anyway.
74 * Structure for regexp "program". This is essentially a linear encoding
75 * of a nondeterministic finite-state machine (aka syntax charts or
76 * "railroad normal form" in parsing technology). Each node is an opcode
77 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
78 * all nodes except BRANCH implement concatenation; a "next" pointer with
79 * a BRANCH on both ends of it is connecting two alternatives. (Here we
80 * have one of the subtle syntax dependencies: an individual BRANCH (as
81 * opposed to a collection of them) is never concatenated with anything
82 * because of operator precedence.) The operand of some types of node is
83 * a literal string; for others, it is a node leading into a sub-FSM. In
84 * particular, the operand of a BRANCH node is the first node of the branch.
85 * (NB this is *not* a tree structure: the tail of the branch connects
86 * to the thing following the set of BRANCHes.) The opcodes are:
89 /* definition opnd? meaning */
90 enum {
91 END, /* no End of program */
92 BOL, /* no Match "" at beginning of line */
93 EOL, /* no Match "" at end of line */
94 ANY, /* no Match any one character */
95 ANYOF, /* str Match any character in this string */
96 ANYBUT, /* str Match any character not in this string */
97 BRANCH, /* node Match this alternative, or the next */
98 BACK, /* no Match "", "next" ptr points backward */
99 EXACTLY, /* str Match this string */
100 NOTHING, /* no Match empty string */
101 STAR, /* node Match this (simple) thing 0 or more times */
102 PLUS, /* node Match this (simple) thing 1 or more times */
103 WORDA, /* no Match "" at wordchar, where prev is nonword */
104 WORDZ, /* no Match "" at nonwordchar, where prev is word */
106 OPEN = 40, /* no Mark this point in input as start of #n */
107 /* OPEN+1 is number 1, etc. */
108 CLOSE = 60 /* no Analogous to OPEN */
113 * Opcode notes:
115 * BRANCH
116 * The set of branches constituting a single choice are hooked
117 * together with their "next" pointers, since precedence prevents
118 * anything being concatenated to any individual branch.
119 * The "next" pointer of the last BRANCH in a choice points to the
120 * thing following the whole choice. This is also where the
121 * final "next" pointer of each individual branch points; each
122 * branch starts with the operand node of a BRANCH node.
124 * BACK
125 * Normal "next" pointers all implicitly point forward; BACK
126 * exists to make loop structures possible.
128 * STAR, PLUS
129 * '?' and complex '*' and '+', are implemented as circular
130 * BRANCH structures using BACK. Simple cases (one character
131 * per match) are implemented with STAR and PLUS for speed
132 * and to minimize recursive plunges.
134 * OPEN,CLOSE ...are numbered at compile time.
138 * A node is one char of opcode followed by two chars of "next" pointer.
139 * "Next" pointers are stored as two 8-bit pieces, high order first.
140 * The value is a positive offset from the opcode of the node containing it.
141 * An operand, if any, simply follows the node. (Note that much of the
142 * code generation knows about this implicit relationship.)
144 * Using two bytes for the "next" pointer is vast overkill for most things,
145 * but allows patterns to get big without disasters.
147 #define OP(p) (*(p))
148 #define NEXT(p) (((*((p)+1)&0377)<<8)+(*((p)+2)&0377))
149 #define OPERAND(p) ((p)+3)
153 * utility definitions
155 #ifndef CHARBITS
156 # define UCHARAT(p) ((int)*(unsigned char *)(p))
157 #else
158 # define UCHARAT(p) ((int)*(p)&CHARBITS)
159 #endif
161 #define FAIL(m) { hsRxError(m); return(NULL); }
162 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
166 * flags to be passed up and down
168 #define WORST 0 /* worst case */
169 #define HASWIDTH 1 /* known never to match null string */
170 #define SIMPLE 2 /* simple enough to be STAR/PLUS operand */
171 #define SPSTART 4 /* starts with * or + */
175 * The first byte of the regexp internal "program" is actually this magic
176 * number; the start node begins in the second byte.
178 #define MAGIC (0234)
182 * global work variables for hsRxCompile()
184 static char *regparse; /* input-scan pointer */
185 static int regnpar; /* () count */
186 static char regdummy;
187 static char *regcode; /* code-emit pointer; &regdummy = don't */
188 static long regsize; /* code size */
191 * forward declarations for hsRxCompile()'s friends
193 static char *reg (int paren, int *flagp);
194 static char *regbranch (int *flagp);
195 static char *regpiece (int *flagp);
196 static char *regatom (int *flagp);
197 static char *regnode (int op);
198 static char *regnext (char *p);
199 static void regc (int b);
200 static void reginsert (char op, char *opnd);
201 static void regtail (char *p, char *val);
202 static void regoptail (char *p, char *val);
203 #ifdef STRCSPN
204 static int strcspn (const char *s1, const char *s2);
205 #endif
209 - hsRxCompile - compile a regular expression into internal code
211 * We can't allocate space until we know how big the compiled form will be,
212 * but we can't compile it (and thus know how big it is) until we've got a
213 * place to put the code. So we cheat: we compile it twice, once with code
214 * generation turned off and size counting turned on, and once "for real".
215 * This also means that we don't allocate space until we are sure that the
216 * thing really will compile successfully, and we never have to move the
217 * code and thus invalidate pointers into it. (Note that it has to be in
218 * one piece because free() must be able to free it all.)
220 * Beware that the optimization-preparation code in here knows about some
221 * of the structure of the compiled regexp.
223 HSRegExp *hsRxCompile (const char *exp) {
224 HSRegExp *r;
225 char *scan;
226 char *longest;
227 unsigned len;
228 int flags;
230 if (exp == NULL) FAIL("NULL argument");
231 /* first pass: determine size, legality */
232 #ifdef notdef
233 if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */
234 #endif
235 regparse = (char *)exp;
236 regnpar = 1;
237 regsize = 0L;
238 regcode = &regdummy;
239 regc(MAGIC);
240 if (reg(0, &flags) == NULL) return NULL;
241 /* small enough for pointer-storage convention? */
242 if (regsize >= 32767L) FAIL("regexp too big"); /* probably could be 65535L */
243 /* allocate space. */
244 r = (HSRegExp *)malloc(sizeof(HSRegExp)+(unsigned)regsize);
245 if (r == NULL) FAIL("out of space");
246 /* second pass: emit code */
247 r->parcnt = regnpar;
248 regparse = (char *)exp;
249 regnpar = 1;
250 regcode = r->program;
251 regc(MAGIC);
252 if (reg(0, &flags) == NULL) return NULL;
253 /* dig out information for optimizations */
254 r->regstart = '\0'; /* worst-case defaults */
255 r->reganch = 0;
256 r->regmust = NULL;
257 r->regmlen = 0;
258 scan = r->program+1; /* first BRANCH */
259 if (OP(regnext(scan)) == END) {
260 /* only one top-level choice */
261 scan = OPERAND(scan);
262 /* starting-point info */
263 if (OP(scan) == EXACTLY) r->regstart = *OPERAND(scan);
264 else if (OP(scan) == BOL) ++(r->reganch);
265 /* if there's something expensive in the r.e., find the longest
266 * literal string that must appear and make it the regmust */
267 /* resolve ties in favor of later strings, since the regstart
268 * check works with the beginning of the r.e. and avoiding
269 * duplication strengthens checking */
270 /* not a strong reason, but sufficient in the absence of others */
271 if (flags&SPSTART) {
272 longest = NULL;
273 len = 0;
274 for (; scan != NULL; scan = regnext(scan)) {
275 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
276 longest = OPERAND(scan);
277 len = strlen(OPERAND(scan));
280 r->regmust = longest;
281 r->regmlen = len;
284 return r;
289 - reg - regular expression, i.e. main body or parenthesized thing
291 * Caller must absorb opening parenthesis.
293 * Combining parenthesis handling with the base level of regular expression
294 * is a trifle forced, but the need to tie the tails of the branches to what
295 * follows makes it hard to avoid.
297 static char *reg (int paren/*parenthesized?*/, int *flagp) {
298 char *ret;
299 char *br;
300 char *ender;
301 int parno = 0; /*k8*/
302 int flags;
304 *flagp = HASWIDTH; /* tentatively */
305 /* make an OPEN node, if parenthesized */
306 if (paren) {
307 if (regnpar >= NSUBEXP) FAIL("too many ()");
308 parno = regnpar;
309 ++regnpar;
310 ret = regnode(OPEN+parno);
311 } else {
312 ret = NULL;
314 /* pick up the branches, linking them together */
315 br = regbranch(&flags);
316 if (br == NULL) return NULL;
317 if (ret != NULL) regtail(ret, br); /* OPEN -> first */ else ret = br;
318 if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH;
319 *flagp |= flags&SPSTART;
320 while (*regparse == '|' || *regparse == '\n') {
321 ++regparse;
322 br = regbranch(&flags);
323 if (br == NULL) return NULL;
324 regtail(ret, br); /* BRANCH -> BRANCH */
325 if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH;
326 *flagp |= flags&SPSTART;
328 /* make a closing node, and hook it on the end */
329 ender = regnode(paren?CLOSE+parno:END);
330 regtail(ret, ender);
331 /* hook the tails of the branches to the closing node */
332 for (br = ret; br != NULL; br = regnext(br)) regoptail(br, ender);
333 /* check for proper termination */
334 if (paren && *regparse++ != ')') {
335 FAIL("unmatched ()");
336 } else if (!paren && *regparse != '\0') {
337 if (*regparse == ')') {
338 FAIL("unmatched ()");
339 } else {
340 FAIL("junk on end"); /* "Can't happen". */
342 /* NOTREACHED */
344 return ret;
349 - regbranch - one alternative of an | operator
351 * Implements the concatenation operator.
353 static char *regbranch (int *flagp) {
354 char *ret;
355 char *chain;
356 char *latest;
357 int flags;
359 *flagp = WORST; /* tentatively */
360 ret = regnode(BRANCH);
361 chain = NULL;
362 while (*regparse != '\0' && *regparse != ')' && *regparse != '\n' && *regparse != '|') {
363 latest = regpiece(&flags);
364 if (latest == NULL) return NULL;
365 *flagp |= flags&HASWIDTH;
366 if (chain == NULL) *flagp |= flags&SPSTART; /* first piece */ else regtail(chain, latest);
367 chain = latest;
369 if (chain == NULL) regnode(NOTHING); /* loop ran zero times */
370 return ret;
375 - regpiece - something followed by possible [*+?]
377 * Note that the branching code sequences used for ? and the general cases
378 * of * and + are somewhat optimized: they use the same NOTHING node as
379 * both the endmarker for their branch list and the body of the last branch.
380 * It might seem that this node could be dispensed with entirely, but the
381 * endmarker role is not redundant.
383 static char *regpiece (int *flagp) {
384 char *ret;
385 char op;
386 char *next;
387 int flags;
389 ret = regatom(&flags);
390 if (ret == NULL) return NULL;
391 op = *regparse;
392 if (!ISMULT(op)) { *flagp = flags; return ret; }
393 if (!(flags&HASWIDTH) && op != '?') FAIL("*+ operand could be empty");
394 *flagp = (op != '+')?(WORST|SPSTART):(WORST|HASWIDTH);
395 if (op == '*' && (flags&SIMPLE)) {
396 reginsert(STAR, ret);
397 } else if (op == '*') {
398 /* emit x* as (x&|), where & means "self" */
399 reginsert(BRANCH, ret); /* either x */
400 regoptail(ret, regnode(BACK)); /* and loop */
401 regoptail(ret, ret); /* back */
402 regtail(ret, regnode(BRANCH)); /* or */
403 regtail(ret, regnode(NOTHING)); /* null */
404 } else if (op == '+' && (flags&SIMPLE)) {
405 reginsert(PLUS, ret);
406 } else if (op == '+') {
407 /* emit x+ as x(&|), where & means "self" */
408 next = regnode(BRANCH); /* either */
409 regtail(ret, next);
410 regtail(regnode(BACK), ret); /* loop back */
411 regtail(next, regnode(BRANCH)); /* or */
412 regtail(ret, regnode(NOTHING)); /* null */
413 } else if (op == '?') {
414 /* emit x? as (x|) */
415 reginsert(BRANCH, ret); /* either x */
416 regtail(ret, regnode(BRANCH)); /* or */
417 next = regnode(NOTHING); /* null */
418 regtail(ret, next);
419 regoptail(ret, next);
421 ++regparse;
422 if (ISMULT(*regparse)) FAIL("nested *?+");
423 return(ret);
428 - regatom - the lowest level
430 * Optimization: gobbles an entire sequence of ordinary characters so that
431 * it can turn them into a single node, which is smaller to store and
432 * faster to run. Backslashed characters are exceptions, each becoming a
433 * separate node; the code is simpler that way and it's not worth fixing.
435 static char *regatom (int *flagp) {
436 char *ret;
437 int flags;
439 *flagp = WORST; /* tentatively */
440 switch (*regparse++) {
441 /* FIXME: these chars only have meaning at beg/end of pat? */
442 case '^':
443 ret = regnode(BOL);
444 break;
445 case '$':
446 ret = regnode(EOL);
447 break;
449 case '.':
450 ret = regnode(ANY);
451 *flagp |= HASWIDTH|SIMPLE;
452 break;
453 case '[': {
454 int classr, classend;
456 if (*regparse == '^') {
457 /* complement of range */
458 ret = regnode(ANYBUT);
459 ++regparse;
460 } else {
461 ret = regnode(ANYOF);
463 if (*regparse == ']' || *regparse == '-') regc(*regparse++);
464 while (*regparse != '\0' && *regparse != ']') {
465 if (*regparse == '-') {
466 ++regparse;
467 if (*regparse == ']' || *regparse == '\0') {
468 regc('-');
469 } else {
470 classr = UCHARAT(regparse-2)+1;
471 classend = UCHARAT(regparse);
472 if (classr > classend+1) FAIL("invalid [] range");
473 for (; classr <= classend; ++classr) regc(classr);
474 ++regparse;
476 } else {
477 regc(*regparse++);
480 regc('\0');
481 if (*regparse != ']') FAIL("unmatched []");
482 ++regparse;
483 *flagp |= HASWIDTH|SIMPLE;
484 } break;
485 case '(':
486 ret = reg(1, &flags);
487 if (ret == NULL) return NULL;
488 *flagp |= flags&(HASWIDTH|SPSTART);
489 break;
490 case '\0':
491 case '|':
492 case '\n':
493 case ')':
494 FAIL("internal urp"); /* supposed to be caught earlier */
495 break;
496 case '?':
497 case '+':
498 case '*':
499 FAIL("?+* follows nothing");
500 break;
501 case '\\':
502 switch (*regparse++) {
503 case '\0': FAIL("trailing \\"); break;
504 case '<': ret = regnode(WORDA); break;
505 case '>': ret = regnode(WORDZ); break;
506 /* FIXME: Someday handle \1, \2, ... */
507 default: goto de_fault; /* Handle general quoted chars in exact-match routine */
509 break;
510 de_fault:
511 default:
513 * Encode a string of characters to be matched exactly.
515 * This is a bit tricky due to quoted chars and due to
516 * '*', '+', and '?' taking the SINGLE char previous
517 * as their operand.
519 * On entry, the char at regparse[-1] is going to go
520 * into the string, no matter what it is. (It could be
521 * following a \ if we are entered from the '\' case.)
523 * Basic idea is to pick up a good char in ch and
524 * examine the next char. If it's *+? then we twiddle.
525 * If it's \ then we frozzle. If it's other magic char
526 * we push ch and terminate the string. If none of the
527 * above, we push ch on the string and go around again.
529 * regprev is used to remember where "the current char"
530 * starts in the string, if due to a *+? we need to back
531 * up and put the current char in a separate, 1-char, string.
532 * When regprev is NULL, ch is the only char in the
533 * string; this is used in *+? handling, and in setting
534 * flags |= SIMPLE at the end.
537 char *regprev;
538 char ch;
540 --regparse; /* look at cur char */
541 ret = regnode(EXACTLY);
542 for (regprev = 0; ;) {
543 ch = *regparse++; /* get current char */
544 /* look at next one */
545 switch (*regparse) {
546 case '.': case '[': case '(':
547 case ')': case '|': case '\n':
548 case '$': case '^':
549 case '\0':
550 /* FIXME, $ and ^ should not always be magic */
551 magic: regc(ch); /* dump cur char */
552 goto done; /* and we are done */
553 case '?': case '+': case '*':
554 if (!regprev) goto magic; /* if just ch in str, use it */
555 /* end mult-char string one early */
556 regparse = regprev; /* back up parse */
557 goto done;
558 case '\\':
559 regc(ch); /* cur char OK */
560 /* look after \ */
561 switch (regparse[1]) {
562 case '\0':
563 case '<':
564 case '>':
565 /* FIXME: someday handle \1, \2, ... */
566 goto done; /* not quoted */
567 default:
568 /* backup point is \, scan point is after it */
569 regprev = regparse;
570 ++regparse;
571 continue; /* NOT break; */
573 default:
574 regc(ch); /* add cur to string */
575 break;
577 regprev = regparse; /* set backup point */
579 done: regc('\0');
580 *flagp |= HASWIDTH;
581 if (!regprev) *flagp |= SIMPLE; /* one char? */
583 break;
585 return ret;
590 - regnode - emit a node
592 /* return location */
593 static char *regnode (int op) {
594 char *ret;
595 char *ptr;
597 ret = regcode;
598 if (ret == &regdummy) { regsize += 3; return ret; }
599 ptr = ret;
600 *ptr++ = op;
601 *ptr++ = '\0'; /* null "next" pointer */
602 *ptr++ = '\0';
603 regcode = ptr;
604 return ret;
609 - regc - emit (if appropriate) a byte of code
611 static void regc (int b) {
612 if (regcode != &regdummy) *regcode++ = b; else ++regsize;
617 - reginsert - insert an operator in front of already-emitted operand
619 * Means relocating the operand.
621 static void reginsert (char op, char *opnd) {
622 char *src;
623 char *dst;
624 char *place;
626 if (regcode == &regdummy) { regsize += 3; return; }
627 src = regcode;
628 regcode += 3;
629 dst = regcode;
630 while (src > opnd) *--dst = *--src;
631 place = opnd; /* op node, where operand used to be */
632 *place++ = op;
633 *place++ = '\0';
634 *place++ = '\0';
639 - regtail - set the next-pointer at the end of a node chain
641 static void regtail (char *p, char *val) {
642 char *scan;
643 char *temp;
644 int offset;
646 if (p == &regdummy) return;
647 /* find last node */
648 scan = p;
649 for (;;) {
650 temp = regnext(scan);
651 if (temp == NULL) break;
652 scan = temp;
654 offset = OP(scan)==BACK?scan-val:val-scan;
655 *(scan+1) = (offset>>8)&0377;
656 *(scan+2) = offset&0377;
661 - regoptail - regtail on operand of first argument; nop if operandless
663 static void regoptail (char *p, char *val) {
664 /* "operandless" and "op != BRANCH" are synonymous in practice */
665 if (p == NULL || p == &regdummy || OP(p) != BRANCH) return;
666 regtail(OPERAND(p), val);
671 * hsRxExec and friends
675 * global work variables for hsRxExec()
677 static const char *reginput; /* string-input pointer */
678 static char *regbol; /* beginning of input, for ^ check */
679 static const char **regstartp; /* pointer to startp array */
680 static const char **regendp; /* ditto for endp */
682 /* forwards */
683 static int regtry (HSRegExp *prog, const char *string);
684 static int regmatch (char *prog);
685 static int regrepeat (char *p);
687 #ifdef RXDEBUG
688 int hsRxNarrate = 0;
689 static const char *regprop (const char *op);
690 #endif
693 int hsRxFree (HSRegExp *re) {
694 if (re != NULL) free(re);
695 return 0;
700 - hsRxExec - match a regexp against a string
702 int hsRxExec (HSRegExp *prog, const char *string) {
703 char *s;
705 /* be paranoid... */
706 if (prog == NULL || string == NULL) { hsRxError("NULL parameter"); return 0; }
707 /* check validity of program */
708 if (UCHARAT(prog->program) != MAGIC) { hsRxError("corrupted program"); return 0; }
709 /* if there is a "must appear" string, look for it */
710 if (prog->regmust != NULL) {
711 s = (char *)string;
712 while ((s = strchr(s, prog->regmust[0])) != NULL) {
713 if (strncmp(s, prog->regmust, prog->regmlen) == 0) break; /* found it */
714 ++s;
716 if (s == NULL) return 0; /* not present */
718 /* mark beginning of line for ^ */
719 regbol = (char *)string;
720 /* simplest case: anchored match need be tried only once */
721 if (prog->reganch) return regtry(prog, string);
722 /* messy cases: unanchored match */
723 s = (char *)string;
724 if (prog->regstart != '\0') {
725 /* we know what char it must start with */
726 while ((s = strchr(s, prog->regstart)) != NULL) {
727 if (regtry(prog, s)) return 1;
728 ++s;
730 } else {
731 /* we don't -- general case */
732 do {
733 if (regtry(prog, s)) return 1;
734 } while (*s++ != '\0');
736 /* failure */
737 return 0;
742 - regtry - try match at specific point
744 /* 0 failure, 1 success */
745 static int regtry (HSRegExp *prog, const char *string) {
746 int i;
747 const char **sp, **ep;
749 reginput = string;
750 regstartp = prog->startp;
751 regendp = prog->endp;
752 sp = prog->startp;
753 ep = prog->endp;
754 for (i = NSUBEXP; i > 0; --i) { *sp++ = NULL; *ep++ = NULL; }
755 if (regmatch(prog->program+1)) {
756 prog->startp[0] = string;
757 prog->endp[0] = reginput;
758 return 1;
760 return 0;
765 - regmatch - main matching routine
767 * Conceptually the strategy is simple: check to see whether the current
768 * node matches, call self recursively to see whether the rest matches,
769 * and then act accordingly. In practice we make some effort to avoid
770 * recursion, in particular by going through "ordinary" nodes (that don't
771 * need to know whether the rest of the match failed) by a loop instead of
772 * by recursion.
774 /* 0 failure, 1 success */
775 static int regmatch (char *prog) {
776 const char *save;
777 char *scan; /* current node */
778 char *next; /* next node */
779 int no;
781 scan = prog;
782 #ifdef RXDEBUG
783 if (scan != NULL && hsRxNarrate) fprintf(stderr, "%s(\n", regprop(scan));
784 #endif
785 while (scan != NULL) {
786 #ifdef RXDEBUG
787 if (hsRxNarrate) fprintf(stderr, "%s...\n", regprop(scan));
788 #endif
789 next = regnext(scan);
790 if (OP(scan) >= OPEN && OP(scan) < OPEN+NSUBEXP) {
791 no = OP(scan)-OPEN;
792 save = reginput;
793 if (regmatch(next)) {
794 /* don't set startp if some later invocation of the same parentheses already has */
795 if (regstartp[no] == NULL || regstartp[no] > save) regstartp[no] = save;
796 return 1;
798 return 0;
800 if (OP(scan) >= CLOSE && OP(scan) < CLOSE+NSUBEXP) {
801 no = OP(scan)-CLOSE;
802 save = reginput;
803 if (regmatch(next)) {
804 /* don't set endp if some later invocation of the same parentheses already has */
805 if (regendp[no] == NULL || regendp[no] < save) regendp[no] = save;
806 return 1;
808 return 0;
810 switch (OP(scan)) {
811 case BOL: if (reginput != regbol) return 0; break;
812 case EOL: if (*reginput != '\0') return 0; break;
813 case WORDA:
814 /* must be looking at a letter, digit, or _ */
815 if ((!isalnum(*reginput)) && *reginput != '_') return 0;
816 /* prev must be BOL or nonword */
817 if (reginput > regbol && (isalnum(reginput[-1]) || reginput[-1] == '_')) return 0;
818 break;
819 case WORDZ:
820 /* must be looking at non letter, digit, or _ */
821 if (isalnum(*reginput) || *reginput == '_') return 0;
822 /* we don't care what the previous char was */
823 break;
824 case ANY:
825 if (*reginput == '\0') return 0;
826 ++reginput;
827 break;
828 case EXACTLY: {
829 int len;
830 char *opnd;
832 opnd = OPERAND(scan);
833 /* inline the first character, for speed */
834 if (*opnd != *reginput) return 0;
835 len = strlen(opnd);
836 if (len > 1 && strncmp(opnd, reginput, len) != 0) return 0;
837 reginput += len;
838 } break;
839 case ANYOF:
840 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) return 0;
841 ++reginput;
842 break;
843 case ANYBUT:
844 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) return 0;
845 ++reginput;
846 break;
847 case NOTHING: break;
848 case BACK: break;
849 case BRANCH: {
851 if (OP(next) != BRANCH) {
852 /* no choice */
853 /* avoid recursion, i.e. move PC to the next VM instruction */
854 next = OPERAND(scan);
855 } else {
856 do {
857 save = reginput;
858 if (regmatch(OPERAND(scan))) return 1; /* execute from next instruction */
859 reginput = save;
860 /* perform jump, if failed */
861 scan = regnext(scan);
862 } while (scan != NULL && OP(scan) == BRANCH);
863 return 0;
864 /* NOTREACHED */
866 } break;
867 case STAR:
868 case PLUS: {
869 char nextch;
870 int min;
872 /* lookahead to avoid useless match attempts when we know what character comes next */
873 nextch = '\0';
874 if (OP(next) == EXACTLY) nextch = *OPERAND(next);
875 min = (OP(scan) == STAR)?0:1;
876 save = reginput;
877 no = regrepeat(OPERAND(scan));
878 while (no >= min) {
879 /* if it could work, try it */
880 if (nextch == '\0' || *reginput == nextch) {
881 if (regmatch(next)) return 1;
883 /* couldn't or didn't -- back up */
884 --no;
885 reginput = save+no;
887 return 0;
888 } break;
889 case END: return 1; /* success! */
890 default: hsRxError("memory corruption"); return 0;
891 } /* switch */
892 scan = next;
893 } /* while */
894 /* we get here only if there's trouble -- normally "case END" is the terminating point */
895 hsRxError("corrupted pointers");
896 return 0;
901 - regrepeat - repeatedly match something simple, report how many
903 static int regrepeat (char *p) {
904 int count = 0;
905 const char *scan;
906 char *opnd;
908 scan = reginput;
909 opnd = OPERAND(p);
910 switch (OP(p)) {
911 case ANY:
912 count = strlen(scan);
913 scan += count;
914 break;
915 case EXACTLY:
916 while (*opnd == *scan) { ++count; ++scan; }
917 break;
918 case ANYOF:
919 while (*scan != '\0' && strchr(opnd, *scan) != NULL) { ++count; ++scan; }
920 break;
921 case ANYBUT:
922 while (*scan != '\0' && strchr(opnd, *scan) == NULL) { ++count; ++scan; }
923 break;
924 default: /* oh dear! called inappropriately */
925 hsRxError("internal foulup");
926 count = 0; /* best compromise */
927 break;
929 reginput = scan;
930 return count;
935 - regnext - dig the "next" pointer out of a node
937 static char *regnext (char *p) {
938 int offset;
940 if (p == &regdummy) return NULL;
941 offset = NEXT(p);
942 if (offset == 0) return NULL;
943 if (OP(p) == BACK) return p-offset;
944 return p+offset;
948 #ifdef RXDEBUG
950 - hsRxDump - dump a regexp onto stdout in vaguely comprehensible form
952 void hsRxDump (HSRegExp *r) {
953 char *s, *next;
954 char op = EXACTLY; /* arbitrary non-END op */
956 s = r->program+1;
957 /* while that wasn't END last time... */
958 while (op != END) {
959 op = OP(s);
960 printf("%5d: %s", s-r->program, regprop(s)); /* where, what */
961 next = regnext(s);
962 if (next == NULL) printf("%3d", 0); /* next ptr */
963 else printf("%3d", (s-r->program)+(next-s));
964 s += 3;
965 if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
966 /* literal string, where present */
967 putchar(' ');
968 putchar('"');
969 while (*s != '\0') {
970 char ch = *s++;
971 if ((unsigned char)ch < ' ' || ch == 127 || ch == '"') {
972 switch (ch) {
973 case '"': printf("\\\""); break;
974 case '\t': printf("\\t"); break;
975 case '\n': printf("\\n"); break;
976 case '\r': printf("\\r"); break;
977 default: printf("\\x%02x", (unsigned char)ch); break;
979 } else {
980 putchar(ch);
983 putchar('"');
984 ++s;
986 putchar('\n');
988 /* header fields of interest */
989 if (r->regstart != '\0') printf("start `%c' ", r->regstart);
990 if (r->reganch) printf("anchored ");
991 if (r->regmust != NULL) printf("must have \"%s\"", r->regmust);
992 printf("\n");
997 - regprop - printable representation of opcode
999 static const char *regprop (const char *op) {
1000 char *p = NULL;
1001 static char buf[50];
1003 buf[0] = '\0';
1004 if (OP(op) >= OPEN && OP(op) < OPEN+NSUBEXP) {
1005 sprintf(buf+strlen(buf), "OPEN%02d", OP(op)-OPEN);
1006 } else if (OP(op) >= CLOSE && OP(op) < CLOSE+NSUBEXP) {
1007 sprintf(buf+strlen(buf), "CLOSE%02d", OP(op)-OPEN);
1008 } else {
1009 switch (OP(op)) {
1010 case BOL: p = "BOL"; break;
1011 case EOL: p = "EOL"; break;
1012 case ANY: p = "ANY"; break;
1013 case ANYOF: p = "ANYOF"; break;
1014 case ANYBUT: p = "ANYBUT"; break;
1015 case BRANCH: p = "BRANCH"; break;
1016 case EXACTLY: p = "EXACTLY"; break;
1017 case NOTHING: p = "NOTHING"; break;
1018 case BACK: p = "BACK"; break;
1019 case END: p = "END"; break;
1020 case STAR: p = "STAR"; break;
1021 case PLUS: p = "PLUS"; break;
1022 case WORDA: p = "WORDA"; break;
1023 case WORDZ: p = "WORDZ"; break;
1024 default: hsRxError("corrupted opcode"); break;
1027 if (p != NULL) strcat(buf, p);
1028 while (strlen(buf) < 10) strcat(buf, " ");
1029 return buf;
1031 #endif
1035 * The following is provided for those people who do not have strcspn() in
1036 * their C libraries. They should get off their butts and do something
1037 * about it; at least one public-domain implementation of those (highly
1038 * useful) string routines has been published on Usenet.
1040 #ifdef STRCSPN
1042 * strcspn - find length of initial segment of s1 consisting entirely
1043 * of characters not from s2
1046 static int strcspn (const char *s1, const char *s2) {
1047 const char *scan1, *scan2;
1048 int count;
1050 count = 0;
1051 for (scan1 = s1; *scan1 != '\0'; ++scan1) {
1052 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1053 if (*scan1 == *scan2++) return count ;
1054 ++count;
1056 return count;
1058 #endif
1061 void hsRxError (const char *s) {
1062 printf("re error %s\n", s);