2 * hsRxCompile and hsRxExec -- regsub and hsRxError are elsewhere
4 * Copyright (c) 1986 by University of Toronto.
5 * Written by Henry Spencer. Not derived from licensed software.
7 * Permission is granted to anyone to use this software for any
8 * purpose on any computer system, and to redistribute it freely,
9 * subject to the following restrictions:
11 * 1. The author is not responsible for the consequences of use of
12 * this software, no matter how awful, even if they arise
15 * 2. The origin of this software must not be misrepresented, either
16 * by explicit claim or by omission.
18 * 3. Altered versions must be plainly marked as such, and must not
19 * be misrepresented as being the original software.
20 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
21 *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22 *** to assist in implementing egrep.
23 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
24 *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25 *** as in BSD grep and ex.
26 *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore,
27 *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28 *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods,
29 *** ames!jaw, on 19 June 1987, to quash a hsRxCompile() redundancy.
30 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
31 *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h
32 *** was moved into hsregexp.h, and the include of hsregexp.h now uses "'s
33 *** to avoid conflicting with the system hsregexp.h. Const, bless its
34 *** soul, was removed so it can compile everywhere. The declaration
35 *** of strchr() was in conflict on AIX, so it was removed (as it is
36 *** happily defined in string.h).
37 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
38 *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
39 *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald
40 *** seiwald@perforce.com, on 05 November 2002, to const string literals.
42 * Beware that some of this code is subtly aware of the way operator
43 * precedence is structured in regular expressions. Serious changes in
44 * regular-expression syntax might require a total rethink.
54 * The "internal use only" fields in hsregexp.h are present to pass info from
55 * compile to execute that permits the execute phase to run lots faster on
56 * simple cases. They are:
58 * regstart char that must begin a match; '\0' if none obvious
59 * reganch is the match anchored (at beginning-of-line only)?
60 * regmust string (pointer into program) that match must include, or NULL
61 * regmlen length of regmust string
63 * Regstart and reganch permit very fast decisions on suitable starting points
64 * for a match, cutting down the work a lot. Regmust permits fast rejection
65 * of lines that cannot possibly match. The regmust tests are costly enough
66 * that hsRxCompile() supplies a regmust only if the r.e. contains something
67 * potentially expensive (at present, the only such thing detected is * or +
68 * at the start of the r.e., which can involve a lot of backup). Regmlen is
69 * supplied because the test in hsRxExec() needs it and hsRxCompile() is computing
74 * Structure for regexp "program". This is essentially a linear encoding
75 * of a nondeterministic finite-state machine (aka syntax charts or
76 * "railroad normal form" in parsing technology). Each node is an opcode
77 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
78 * all nodes except BRANCH implement concatenation; a "next" pointer with
79 * a BRANCH on both ends of it is connecting two alternatives. (Here we
80 * have one of the subtle syntax dependencies: an individual BRANCH (as
81 * opposed to a collection of them) is never concatenated with anything
82 * because of operator precedence.) The operand of some types of node is
83 * a literal string; for others, it is a node leading into a sub-FSM. In
84 * particular, the operand of a BRANCH node is the first node of the branch.
85 * (NB this is *not* a tree structure: the tail of the branch connects
86 * to the thing following the set of BRANCHes.) The opcodes are:
89 /* definition opnd? meaning */
91 END
, /* no End of program */
92 BOL
, /* no Match "" at beginning of line */
93 EOL
, /* no Match "" at end of line */
94 ANY
, /* no Match any one character */
95 ANYOF
, /* str Match any character in this string */
96 ANYBUT
, /* str Match any character not in this string */
97 BRANCH
, /* node Match this alternative, or the next */
98 BACK
, /* no Match "", "next" ptr points backward */
99 EXACTLY
, /* str Match this string */
100 NOTHING
, /* no Match empty string */
101 STAR
, /* node Match this (simple) thing 0 or more times */
102 PLUS
, /* node Match this (simple) thing 1 or more times */
103 WORDA
, /* no Match "" at wordchar, where prev is nonword */
104 WORDZ
, /* no Match "" at nonwordchar, where prev is word */
106 OPEN
= 40, /* no Mark this point in input as start of #n */
107 /* OPEN+1 is number 1, etc. */
108 CLOSE
= 60 /* no Analogous to OPEN */
116 * The set of branches constituting a single choice are hooked
117 * together with their "next" pointers, since precedence prevents
118 * anything being concatenated to any individual branch.
119 * The "next" pointer of the last BRANCH in a choice points to the
120 * thing following the whole choice. This is also where the
121 * final "next" pointer of each individual branch points; each
122 * branch starts with the operand node of a BRANCH node.
125 * Normal "next" pointers all implicitly point forward; BACK
126 * exists to make loop structures possible.
129 * '?' and complex '*' and '+', are implemented as circular
130 * BRANCH structures using BACK. Simple cases (one character
131 * per match) are implemented with STAR and PLUS for speed
132 * and to minimize recursive plunges.
134 * OPEN,CLOSE ...are numbered at compile time.
138 * A node is one char of opcode followed by two chars of "next" pointer.
139 * "Next" pointers are stored as two 8-bit pieces, high order first.
140 * The value is a positive offset from the opcode of the node containing it.
141 * An operand, if any, simply follows the node. (Note that much of the
142 * code generation knows about this implicit relationship.)
144 * Using two bytes for the "next" pointer is vast overkill for most things,
145 * but allows patterns to get big without disasters.
148 #define NEXT(p) (((*((p)+1)&0377)<<8)+(*((p)+2)&0377))
149 #define OPERAND(p) ((p)+3)
153 * utility definitions
156 # define UCHARAT(p) ((int)*(unsigned char *)(p))
158 # define UCHARAT(p) ((int)*(p)&CHARBITS)
161 #define FAIL(m) { hsRxError(m); return(NULL); }
162 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
166 * flags to be passed up and down
168 #define WORST 0 /* worst case */
169 #define HASWIDTH 1 /* known never to match null string */
170 #define SIMPLE 2 /* simple enough to be STAR/PLUS operand */
171 #define SPSTART 4 /* starts with * or + */
175 * The first byte of the regexp internal "program" is actually this magic
176 * number; the start node begins in the second byte.
182 * global work variables for hsRxCompile()
184 static char *regparse
; /* input-scan pointer */
185 static int regnpar
; /* () count */
186 static char regdummy
;
187 static char *regcode
; /* code-emit pointer; ®dummy = don't */
188 static long regsize
; /* code size */
191 * forward declarations for hsRxCompile()'s friends
193 static char *reg (int paren
, int *flagp
);
194 static char *regbranch (int *flagp
);
195 static char *regpiece (int *flagp
);
196 static char *regatom (int *flagp
);
197 static char *regnode (int op
);
198 static char *regnext (char *p
);
199 static void regc (int b
);
200 static void reginsert (char op
, char *opnd
);
201 static void regtail (char *p
, char *val
);
202 static void regoptail (char *p
, char *val
);
204 static int strcspn (const char *s1
, const char *s2
);
209 - hsRxCompile - compile a regular expression into internal code
211 * We can't allocate space until we know how big the compiled form will be,
212 * but we can't compile it (and thus know how big it is) until we've got a
213 * place to put the code. So we cheat: we compile it twice, once with code
214 * generation turned off and size counting turned on, and once "for real".
215 * This also means that we don't allocate space until we are sure that the
216 * thing really will compile successfully, and we never have to move the
217 * code and thus invalidate pointers into it. (Note that it has to be in
218 * one piece because free() must be able to free it all.)
220 * Beware that the optimization-preparation code in here knows about some
221 * of the structure of the compiled regexp.
223 HSRegExp
*hsRxCompile (const char *exp
) {
230 if (exp
== NULL
) FAIL("NULL argument");
231 /* first pass: determine size, legality */
233 if (exp
[0] == '.' && exp
[1] == '*') exp
+= 2; /* aid grep */
235 regparse
= (char *)exp
;
240 if (reg(0, &flags
) == NULL
) return NULL
;
241 /* small enough for pointer-storage convention? */
242 if (regsize
>= 32767L) FAIL("regexp too big"); /* probably could be 65535L */
243 /* allocate space. */
244 r
= (HSRegExp
*)malloc(sizeof(HSRegExp
)+(unsigned)regsize
);
245 if (r
== NULL
) FAIL("out of space");
246 /* second pass: emit code */
248 regparse
= (char *)exp
;
250 regcode
= r
->program
;
252 if (reg(0, &flags
) == NULL
) return NULL
;
253 /* dig out information for optimizations */
254 r
->regstart
= '\0'; /* worst-case defaults */
258 scan
= r
->program
+1; /* first BRANCH */
259 if (OP(regnext(scan
)) == END
) {
260 /* only one top-level choice */
261 scan
= OPERAND(scan
);
262 /* starting-point info */
263 if (OP(scan
) == EXACTLY
) r
->regstart
= *OPERAND(scan
);
264 else if (OP(scan
) == BOL
) ++(r
->reganch
);
265 /* if there's something expensive in the r.e., find the longest
266 * literal string that must appear and make it the regmust */
267 /* resolve ties in favor of later strings, since the regstart
268 * check works with the beginning of the r.e. and avoiding
269 * duplication strengthens checking */
270 /* not a strong reason, but sufficient in the absence of others */
274 for (; scan
!= NULL
; scan
= regnext(scan
)) {
275 if (OP(scan
) == EXACTLY
&& strlen(OPERAND(scan
)) >= len
) {
276 longest
= OPERAND(scan
);
277 len
= strlen(OPERAND(scan
));
280 r
->regmust
= longest
;
289 - reg - regular expression, i.e. main body or parenthesized thing
291 * Caller must absorb opening parenthesis.
293 * Combining parenthesis handling with the base level of regular expression
294 * is a trifle forced, but the need to tie the tails of the branches to what
295 * follows makes it hard to avoid.
297 static char *reg (int paren
/*parenthesized?*/, int *flagp
) {
301 int parno
= 0; /*k8*/
304 *flagp
= HASWIDTH
; /* tentatively */
305 /* make an OPEN node, if parenthesized */
307 if (regnpar
>= NSUBEXP
) FAIL("too many ()");
310 ret
= regnode(OPEN
+parno
);
314 /* pick up the branches, linking them together */
315 br
= regbranch(&flags
);
316 if (br
== NULL
) return NULL
;
317 if (ret
!= NULL
) regtail(ret
, br
); /* OPEN -> first */ else ret
= br
;
318 if (!(flags
&HASWIDTH
)) *flagp
&= ~HASWIDTH
;
319 *flagp
|= flags
&SPSTART
;
320 while (*regparse
== '|' || *regparse
== '\n') {
322 br
= regbranch(&flags
);
323 if (br
== NULL
) return NULL
;
324 regtail(ret
, br
); /* BRANCH -> BRANCH */
325 if (!(flags
&HASWIDTH
)) *flagp
&= ~HASWIDTH
;
326 *flagp
|= flags
&SPSTART
;
328 /* make a closing node, and hook it on the end */
329 ender
= regnode(paren
?CLOSE
+parno
:END
);
331 /* hook the tails of the branches to the closing node */
332 for (br
= ret
; br
!= NULL
; br
= regnext(br
)) regoptail(br
, ender
);
333 /* check for proper termination */
334 if (paren
&& *regparse
++ != ')') {
335 FAIL("unmatched ()");
336 } else if (!paren
&& *regparse
!= '\0') {
337 if (*regparse
== ')') {
338 FAIL("unmatched ()");
340 FAIL("junk on end"); /* "Can't happen". */
349 - regbranch - one alternative of an | operator
351 * Implements the concatenation operator.
353 static char *regbranch (int *flagp
) {
359 *flagp
= WORST
; /* tentatively */
360 ret
= regnode(BRANCH
);
362 while (*regparse
!= '\0' && *regparse
!= ')' && *regparse
!= '\n' && *regparse
!= '|') {
363 latest
= regpiece(&flags
);
364 if (latest
== NULL
) return NULL
;
365 *flagp
|= flags
&HASWIDTH
;
366 if (chain
== NULL
) *flagp
|= flags
&SPSTART
; /* first piece */ else regtail(chain
, latest
);
369 if (chain
== NULL
) regnode(NOTHING
); /* loop ran zero times */
375 - regpiece - something followed by possible [*+?]
377 * Note that the branching code sequences used for ? and the general cases
378 * of * and + are somewhat optimized: they use the same NOTHING node as
379 * both the endmarker for their branch list and the body of the last branch.
380 * It might seem that this node could be dispensed with entirely, but the
381 * endmarker role is not redundant.
383 static char *regpiece (int *flagp
) {
389 ret
= regatom(&flags
);
390 if (ret
== NULL
) return NULL
;
392 if (!ISMULT(op
)) { *flagp
= flags
; return ret
; }
393 if (!(flags
&HASWIDTH
) && op
!= '?') FAIL("*+ operand could be empty");
394 *flagp
= (op
!= '+')?(WORST
|SPSTART
):(WORST
|HASWIDTH
);
395 if (op
== '*' && (flags
&SIMPLE
)) {
396 reginsert(STAR
, ret
);
397 } else if (op
== '*') {
398 /* emit x* as (x&|), where & means "self" */
399 reginsert(BRANCH
, ret
); /* either x */
400 regoptail(ret
, regnode(BACK
)); /* and loop */
401 regoptail(ret
, ret
); /* back */
402 regtail(ret
, regnode(BRANCH
)); /* or */
403 regtail(ret
, regnode(NOTHING
)); /* null */
404 } else if (op
== '+' && (flags
&SIMPLE
)) {
405 reginsert(PLUS
, ret
);
406 } else if (op
== '+') {
407 /* emit x+ as x(&|), where & means "self" */
408 next
= regnode(BRANCH
); /* either */
410 regtail(regnode(BACK
), ret
); /* loop back */
411 regtail(next
, regnode(BRANCH
)); /* or */
412 regtail(ret
, regnode(NOTHING
)); /* null */
413 } else if (op
== '?') {
414 /* emit x? as (x|) */
415 reginsert(BRANCH
, ret
); /* either x */
416 regtail(ret
, regnode(BRANCH
)); /* or */
417 next
= regnode(NOTHING
); /* null */
419 regoptail(ret
, next
);
422 if (ISMULT(*regparse
)) FAIL("nested *?+");
428 - regatom - the lowest level
430 * Optimization: gobbles an entire sequence of ordinary characters so that
431 * it can turn them into a single node, which is smaller to store and
432 * faster to run. Backslashed characters are exceptions, each becoming a
433 * separate node; the code is simpler that way and it's not worth fixing.
435 static char *regatom (int *flagp
) {
439 *flagp
= WORST
; /* tentatively */
440 switch (*regparse
++) {
441 /* FIXME: these chars only have meaning at beg/end of pat? */
451 *flagp
|= HASWIDTH
|SIMPLE
;
454 int classr
, classend
;
456 if (*regparse
== '^') {
457 /* complement of range */
458 ret
= regnode(ANYBUT
);
461 ret
= regnode(ANYOF
);
463 if (*regparse
== ']' || *regparse
== '-') regc(*regparse
++);
464 while (*regparse
!= '\0' && *regparse
!= ']') {
465 if (*regparse
== '-') {
467 if (*regparse
== ']' || *regparse
== '\0') {
470 classr
= UCHARAT(regparse
-2)+1;
471 classend
= UCHARAT(regparse
);
472 if (classr
> classend
+1) FAIL("invalid [] range");
473 for (; classr
<= classend
; ++classr
) regc(classr
);
481 if (*regparse
!= ']') FAIL("unmatched []");
483 *flagp
|= HASWIDTH
|SIMPLE
;
486 ret
= reg(1, &flags
);
487 if (ret
== NULL
) return NULL
;
488 *flagp
|= flags
&(HASWIDTH
|SPSTART
);
494 FAIL("internal urp"); /* supposed to be caught earlier */
499 FAIL("?+* follows nothing");
502 switch (*regparse
++) {
503 case '\0': FAIL("trailing \\"); break;
504 case '<': ret
= regnode(WORDA
); break;
505 case '>': ret
= regnode(WORDZ
); break;
506 /* FIXME: Someday handle \1, \2, ... */
507 default: goto de_fault
; /* Handle general quoted chars in exact-match routine */
513 * Encode a string of characters to be matched exactly.
515 * This is a bit tricky due to quoted chars and due to
516 * '*', '+', and '?' taking the SINGLE char previous
519 * On entry, the char at regparse[-1] is going to go
520 * into the string, no matter what it is. (It could be
521 * following a \ if we are entered from the '\' case.)
523 * Basic idea is to pick up a good char in ch and
524 * examine the next char. If it's *+? then we twiddle.
525 * If it's \ then we frozzle. If it's other magic char
526 * we push ch and terminate the string. If none of the
527 * above, we push ch on the string and go around again.
529 * regprev is used to remember where "the current char"
530 * starts in the string, if due to a *+? we need to back
531 * up and put the current char in a separate, 1-char, string.
532 * When regprev is NULL, ch is the only char in the
533 * string; this is used in *+? handling, and in setting
534 * flags |= SIMPLE at the end.
540 --regparse
; /* look at cur char */
541 ret
= regnode(EXACTLY
);
542 for (regprev
= 0; ;) {
543 ch
= *regparse
++; /* get current char */
544 /* look at next one */
546 case '.': case '[': case '(':
547 case ')': case '|': case '\n':
550 /* FIXME, $ and ^ should not always be magic */
551 magic
: regc(ch
); /* dump cur char */
552 goto done
; /* and we are done */
553 case '?': case '+': case '*':
554 if (!regprev
) goto magic
; /* if just ch in str, use it */
555 /* end mult-char string one early */
556 regparse
= regprev
; /* back up parse */
559 regc(ch
); /* cur char OK */
561 switch (regparse
[1]) {
565 /* FIXME: someday handle \1, \2, ... */
566 goto done
; /* not quoted */
568 /* backup point is \, scan point is after it */
571 continue; /* NOT break; */
574 regc(ch
); /* add cur to string */
577 regprev
= regparse
; /* set backup point */
581 if (!regprev
) *flagp
|= SIMPLE
; /* one char? */
590 - regnode - emit a node
592 /* return location */
593 static char *regnode (int op
) {
598 if (ret
== ®dummy
) { regsize
+= 3; return ret
; }
601 *ptr
++ = '\0'; /* null "next" pointer */
609 - regc - emit (if appropriate) a byte of code
611 static void regc (int b
) {
612 if (regcode
!= ®dummy
) *regcode
++ = b
; else ++regsize
;
617 - reginsert - insert an operator in front of already-emitted operand
619 * Means relocating the operand.
621 static void reginsert (char op
, char *opnd
) {
626 if (regcode
== ®dummy
) { regsize
+= 3; return; }
630 while (src
> opnd
) *--dst
= *--src
;
631 place
= opnd
; /* op node, where operand used to be */
639 - regtail - set the next-pointer at the end of a node chain
641 static void regtail (char *p
, char *val
) {
646 if (p
== ®dummy
) return;
650 temp
= regnext(scan
);
651 if (temp
== NULL
) break;
654 offset
= OP(scan
)==BACK
?scan
-val
:val
-scan
;
655 *(scan
+1) = (offset
>>8)&0377;
656 *(scan
+2) = offset
&0377;
661 - regoptail - regtail on operand of first argument; nop if operandless
663 static void regoptail (char *p
, char *val
) {
664 /* "operandless" and "op != BRANCH" are synonymous in practice */
665 if (p
== NULL
|| p
== ®dummy
|| OP(p
) != BRANCH
) return;
666 regtail(OPERAND(p
), val
);
671 * hsRxExec and friends
675 * global work variables for hsRxExec()
677 static const char *reginput
; /* string-input pointer */
678 static char *regbol
; /* beginning of input, for ^ check */
679 static const char **regstartp
; /* pointer to startp array */
680 static const char **regendp
; /* ditto for endp */
683 static int regtry (HSRegExp
*prog
, const char *string
);
684 static int regmatch (char *prog
);
685 static int regrepeat (char *p
);
689 static const char *regprop (const char *op
);
693 int hsRxFree (HSRegExp
*re
) {
694 if (re
!= NULL
) free(re
);
700 - hsRxExec - match a regexp against a string
702 int hsRxExec (HSRegExp
*prog
, const char *string
) {
706 if (prog
== NULL
|| string
== NULL
) { hsRxError("NULL parameter"); return 0; }
707 /* check validity of program */
708 if (UCHARAT(prog
->program
) != MAGIC
) { hsRxError("corrupted program"); return 0; }
709 /* if there is a "must appear" string, look for it */
710 if (prog
->regmust
!= NULL
) {
712 while ((s
= strchr(s
, prog
->regmust
[0])) != NULL
) {
713 if (strncmp(s
, prog
->regmust
, prog
->regmlen
) == 0) break; /* found it */
716 if (s
== NULL
) return 0; /* not present */
718 /* mark beginning of line for ^ */
719 regbol
= (char *)string
;
720 /* simplest case: anchored match need be tried only once */
721 if (prog
->reganch
) return regtry(prog
, string
);
722 /* messy cases: unanchored match */
724 if (prog
->regstart
!= '\0') {
725 /* we know what char it must start with */
726 while ((s
= strchr(s
, prog
->regstart
)) != NULL
) {
727 if (regtry(prog
, s
)) return 1;
731 /* we don't -- general case */
733 if (regtry(prog
, s
)) return 1;
734 } while (*s
++ != '\0');
742 - regtry - try match at specific point
744 /* 0 failure, 1 success */
745 static int regtry (HSRegExp
*prog
, const char *string
) {
747 const char **sp
, **ep
;
750 regstartp
= prog
->startp
;
751 regendp
= prog
->endp
;
754 for (i
= NSUBEXP
; i
> 0; --i
) { *sp
++ = NULL
; *ep
++ = NULL
; }
755 if (regmatch(prog
->program
+1)) {
756 prog
->startp
[0] = string
;
757 prog
->endp
[0] = reginput
;
765 - regmatch - main matching routine
767 * Conceptually the strategy is simple: check to see whether the current
768 * node matches, call self recursively to see whether the rest matches,
769 * and then act accordingly. In practice we make some effort to avoid
770 * recursion, in particular by going through "ordinary" nodes (that don't
771 * need to know whether the rest of the match failed) by a loop instead of
774 /* 0 failure, 1 success */
775 static int regmatch (char *prog
) {
777 char *scan
; /* current node */
778 char *next
; /* next node */
783 if (scan
!= NULL
&& hsRxNarrate
) fprintf(stderr
, "%s(\n", regprop(scan
));
785 while (scan
!= NULL
) {
787 if (hsRxNarrate
) fprintf(stderr
, "%s...\n", regprop(scan
));
789 next
= regnext(scan
);
790 if (OP(scan
) >= OPEN
&& OP(scan
) < OPEN
+NSUBEXP
) {
793 if (regmatch(next
)) {
794 /* don't set startp if some later invocation of the same parentheses already has */
795 if (regstartp
[no
] == NULL
|| regstartp
[no
] > save
) regstartp
[no
] = save
;
800 if (OP(scan
) >= CLOSE
&& OP(scan
) < CLOSE
+NSUBEXP
) {
803 if (regmatch(next
)) {
804 /* don't set endp if some later invocation of the same parentheses already has */
805 if (regendp
[no
] == NULL
|| regendp
[no
] < save
) regendp
[no
] = save
;
811 case BOL
: if (reginput
!= regbol
) return 0; break;
812 case EOL
: if (*reginput
!= '\0') return 0; break;
814 /* must be looking at a letter, digit, or _ */
815 if ((!isalnum(*reginput
)) && *reginput
!= '_') return 0;
816 /* prev must be BOL or nonword */
817 if (reginput
> regbol
&& (isalnum(reginput
[-1]) || reginput
[-1] == '_')) return 0;
820 /* must be looking at non letter, digit, or _ */
821 if (isalnum(*reginput
) || *reginput
== '_') return 0;
822 /* we don't care what the previous char was */
825 if (*reginput
== '\0') return 0;
832 opnd
= OPERAND(scan
);
833 /* inline the first character, for speed */
834 if (*opnd
!= *reginput
) return 0;
836 if (len
> 1 && strncmp(opnd
, reginput
, len
) != 0) return 0;
840 if (*reginput
== '\0' || strchr(OPERAND(scan
), *reginput
) == NULL
) return 0;
844 if (*reginput
== '\0' || strchr(OPERAND(scan
), *reginput
) != NULL
) return 0;
851 if (OP(next
) != BRANCH
) {
853 /* avoid recursion, i.e. move PC to the next VM instruction */
854 next
= OPERAND(scan
);
858 if (regmatch(OPERAND(scan
))) return 1; /* execute from next instruction */
860 /* perform jump, if failed */
861 scan
= regnext(scan
);
862 } while (scan
!= NULL
&& OP(scan
) == BRANCH
);
872 /* lookahead to avoid useless match attempts when we know what character comes next */
874 if (OP(next
) == EXACTLY
) nextch
= *OPERAND(next
);
875 min
= (OP(scan
) == STAR
)?0:1;
877 no
= regrepeat(OPERAND(scan
));
879 /* if it could work, try it */
880 if (nextch
== '\0' || *reginput
== nextch
) {
881 if (regmatch(next
)) return 1;
883 /* couldn't or didn't -- back up */
889 case END
: return 1; /* success! */
890 default: hsRxError("memory corruption"); return 0;
894 /* we get here only if there's trouble -- normally "case END" is the terminating point */
895 hsRxError("corrupted pointers");
901 - regrepeat - repeatedly match something simple, report how many
903 static int regrepeat (char *p
) {
912 count
= strlen(scan
);
916 while (*opnd
== *scan
) { ++count
; ++scan
; }
919 while (*scan
!= '\0' && strchr(opnd
, *scan
) != NULL
) { ++count
; ++scan
; }
922 while (*scan
!= '\0' && strchr(opnd
, *scan
) == NULL
) { ++count
; ++scan
; }
924 default: /* oh dear! called inappropriately */
925 hsRxError("internal foulup");
926 count
= 0; /* best compromise */
935 - regnext - dig the "next" pointer out of a node
937 static char *regnext (char *p
) {
940 if (p
== ®dummy
) return NULL
;
942 if (offset
== 0) return NULL
;
943 if (OP(p
) == BACK
) return p
-offset
;
950 - hsRxDump - dump a regexp onto stdout in vaguely comprehensible form
952 void hsRxDump (HSRegExp
*r
) {
954 char op
= EXACTLY
; /* arbitrary non-END op */
957 /* while that wasn't END last time... */
960 printf("%5d: %s", s
-r
->program
, regprop(s
)); /* where, what */
962 if (next
== NULL
) printf("%3d", 0); /* next ptr */
963 else printf("%3d", (s
-r
->program
)+(next
-s
));
965 if (op
== ANYOF
|| op
== ANYBUT
|| op
== EXACTLY
) {
966 /* literal string, where present */
971 if ((unsigned char)ch
< ' ' || ch
== 127 || ch
== '"') {
973 case '"': printf("\\\""); break;
974 case '\t': printf("\\t"); break;
975 case '\n': printf("\\n"); break;
976 case '\r': printf("\\r"); break;
977 default: printf("\\x%02x", (unsigned char)ch
); break;
988 /* header fields of interest */
989 if (r
->regstart
!= '\0') printf("start `%c' ", r
->regstart
);
990 if (r
->reganch
) printf("anchored ");
991 if (r
->regmust
!= NULL
) printf("must have \"%s\"", r
->regmust
);
997 - regprop - printable representation of opcode
999 static const char *regprop (const char *op
) {
1001 static char buf
[50];
1004 if (OP(op
) >= OPEN
&& OP(op
) < OPEN
+NSUBEXP
) {
1005 sprintf(buf
+strlen(buf
), "OPEN%02d", OP(op
)-OPEN
);
1006 } else if (OP(op
) >= CLOSE
&& OP(op
) < CLOSE
+NSUBEXP
) {
1007 sprintf(buf
+strlen(buf
), "CLOSE%02d", OP(op
)-OPEN
);
1010 case BOL
: p
= "BOL"; break;
1011 case EOL
: p
= "EOL"; break;
1012 case ANY
: p
= "ANY"; break;
1013 case ANYOF
: p
= "ANYOF"; break;
1014 case ANYBUT
: p
= "ANYBUT"; break;
1015 case BRANCH
: p
= "BRANCH"; break;
1016 case EXACTLY
: p
= "EXACTLY"; break;
1017 case NOTHING
: p
= "NOTHING"; break;
1018 case BACK
: p
= "BACK"; break;
1019 case END
: p
= "END"; break;
1020 case STAR
: p
= "STAR"; break;
1021 case PLUS
: p
= "PLUS"; break;
1022 case WORDA
: p
= "WORDA"; break;
1023 case WORDZ
: p
= "WORDZ"; break;
1024 default: hsRxError("corrupted opcode"); break;
1027 if (p
!= NULL
) strcat(buf
, p
);
1028 while (strlen(buf
) < 10) strcat(buf
, " ");
1035 * The following is provided for those people who do not have strcspn() in
1036 * their C libraries. They should get off their butts and do something
1037 * about it; at least one public-domain implementation of those (highly
1038 * useful) string routines has been published on Usenet.
1042 * strcspn - find length of initial segment of s1 consisting entirely
1043 * of characters not from s2
1046 static int strcspn (const char *s1
, const char *s2
) {
1047 const char *scan1
, *scan2
;
1051 for (scan1
= s1
; *scan1
!= '\0'; ++scan1
) {
1052 for (scan2
= s2
; *scan2
!= '\0';) /* ++ moved down. */
1053 if (*scan1
== *scan2
++) return count
;
1061 void hsRxError (const char *s
) {
1062 printf("re error %s\n", s
);