* added compilers lcc and bcc (linux86)
[mascara-docs.git] / compilers / linux86-0.16.17 / libc / regexp / regexp.c
blob860b485887b395bf14f09fef972b6637cc1bc8ba
1 /*
2 * regcomp and regexec -- regsub and regerror are elsewhere
4 * Copyright (c) 1986 by University of Toronto.
5 * Written by Henry Spencer. Not derived from licensed software.
7 * Permission is granted to anyone to use this software for any
8 * purpose on any computer system, and to redistribute it freely,
9 * subject to the following restrictions:
11 * 1. The author is not responsible for the consequences of use of
12 * this software, no matter how awful, even if they arise
13 * from defects in it.
15 * 2. The origin of this software must not be misrepresented, either
16 * by explicit claim or by omission.
18 * 3. Altered versions must be plainly marked as such, and must not
19 * be misrepresented as being the original software.
21 * Beware that some of this code is subtly aware of the way operator
22 * precedence is structured in regular expressions. Serious changes in
23 * regular-expression syntax might require a total rethink.
25 #include <stdio.h>
26 #include <regexp.h>
27 #include <malloc.h>
28 #include <string.h>
29 #include "regmagic.h"
32 * The "internal use only" fields in regexp.h are present to pass info from
33 * compile to execute that permits the execute phase to run lots faster on
34 * simple cases. They are:
36 * regstart char that must begin a match; '\0' if none obvious
37 * reganch is the match anchored (at beginning-of-line only)?
38 * regmust string (pointer into program) that match must include, or NULL
39 * regmlen length of regmust string
41 * Regstart and reganch permit very fast decisions on suitable starting points
42 * for a match, cutting down the work a lot. Regmust permits fast rejection
43 * of lines that cannot possibly match. The regmust tests are costly enough
44 * that regcomp() supplies a regmust only if the r.e. contains something
45 * potentially expensive (at present, the only such thing detected is * or +
46 * at the start of the r.e., which can involve a lot of backup). Regmlen is
47 * supplied because the test in regexec() needs it and regcomp() is computing
48 * it anyway.
52 * Structure for regexp "program". This is essentially a linear encoding
53 * of a nondeterministic finite-state machine (aka syntax charts or
54 * "railroad normal form" in parsing technology). Each node is an opcode
55 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
56 * all nodes except BRANCH implement concatenation; a "next" pointer with
57 * a BRANCH on both ends of it is connecting two alternatives. (Here we
58 * have one of the subtle syntax dependencies: an individual BRANCH (as
59 * opposed to a collection of them) is never concatenated with anything
60 * because of operator precedence.) The operand of some types of node is
61 * a literal string; for others, it is a node leading into a sub-FSM. In
62 * particular, the operand of a BRANCH node is the first node of the branch.
63 * (NB this is *not* a tree structure: the tail of the branch connects
64 * to the thing following the set of BRANCHes.) The opcodes are:
67 /* definition number opnd? meaning */
68 #define END 0 /* no End of program. */
69 #define BOL 1 /* no Match "" at beginning of line. */
70 #define EOL 2 /* no Match "" at end of line. */
71 #define ANY 3 /* no Match any one character. */
72 #define ANYOF 4 /* str Match any character in this string. */
73 #define ANYBUT 5 /* str Match any character not in this string. */
74 #define BRANCH 6 /* node Match this alternative, or the next... */
75 #define BACK 7 /* no Match "", "next" ptr points backward. */
76 #define EXACTLY 8 /* str Match this string. */
77 #define NOTHING 9 /* no Match empty string. */
78 #define STAR 10 /* node Match this (simple) thing 0 or more times. */
79 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
80 #define OPEN 20 /* no Mark this point in input as start of #n. */
81 /* OPEN+1 is number 1, etc. */
82 #define CLOSE 30 /* no Analogous to OPEN. */
85 * Opcode notes:
87 * BRANCH The set of branches constituting a single choice are hooked
88 * together with their "next" pointers, since precedence prevents
89 * anything being concatenated to any individual branch. The
90 * "next" pointer of the last BRANCH in a choice points to the
91 * thing following the whole choice. This is also where the
92 * final "next" pointer of each individual branch points; each
93 * branch starts with the operand node of a BRANCH node.
95 * BACK Normal "next" pointers all implicitly point forward; BACK
96 * exists to make loop structures possible.
98 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
99 * BRANCH structures using BACK. Simple cases (one character
100 * per match) are implemented with STAR and PLUS for speed
101 * and to minimize recursive plunges.
103 * OPEN,CLOSE ...are numbered at compile time.
107 * A node is one char of opcode followed by two chars of "next" pointer.
108 * "Next" pointers are stored as two 8-bit pieces, high order first. The
109 * value is a positive offset from the opcode of the node containing it.
110 * An operand, if any, simply follows the node. (Note that much of the
111 * code generation knows about this implicit relationship.)
113 * Using two bytes for the "next" pointer is vast overkill for most things,
114 * but allows patterns to get big without disasters.
116 #define OP(p) (*(p))
117 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
118 #define OPERAND(p) ((p) + 3)
121 * See regmagic.h for one further detail of program structure.
126 * Utility definitions.
128 #ifndef CHARBITS
129 #define UCHARAT(p) ((int)*(unsigned char *)(p))
130 #else
131 #define UCHARAT(p) ((int)*(p)&CHARBITS)
132 #endif
134 #define FAIL(m) { regerror(m); return(NULL); }
135 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
136 #define META "^$.[()|?+*\\"
139 * Flags to be passed up and down.
141 #define HASWIDTH 01 /* Known never to match null string. */
142 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
143 #define SPSTART 04 /* Starts with * or +. */
144 #define WORST 0 /* Worst case. */
147 * Global work variables for regcomp().
149 static char *regparse; /* Input-scan pointer. */
150 static int regnpar; /* () count. */
151 static char regdummy;
152 static char *regcode; /* Code-emit pointer; &regdummy = don't. */
153 static long regsize; /* Code size. */
156 * Forward declarations for regcomp()'s friends.
158 #ifndef STATIC
159 #define STATIC static
160 #endif
161 STATIC char *reg();
162 STATIC char *regbranch();
163 STATIC char *regpiece();
164 STATIC char *regatom();
165 STATIC char *regnode();
166 STATIC char *regnext();
167 STATIC void regc();
168 STATIC void reginsert();
169 STATIC void regtail();
170 STATIC void regoptail();
171 #ifdef STRCSPN
172 STATIC int strcspn();
173 #endif
176 - regcomp - compile a regular expression into internal code
178 * We can't allocate space until we know how big the compiled form will be,
179 * but we can't compile it (and thus know how big it is) until we've got a
180 * place to put the code. So we cheat: we compile it twice, once with code
181 * generation turned off and size counting turned on, and once "for real".
182 * This also means that we don't allocate space until we are sure that the
183 * thing really will compile successfully, and we never have to move the
184 * code and thus invalidate pointers into it. (Note that it has to be in
185 * one piece because free() must be able to free it all.)
187 * Beware that the optimization-preparation code in here knows about some
188 * of the structure of the compiled regexp.
190 regexp *
191 regcomp(exp)
192 char *exp;
194 register regexp *r;
195 register char *scan;
196 register char *longest;
197 register int len;
198 int flags;
200 if (exp == NULL)
201 FAIL("NULL argument");
203 /* First pass: determine size, legality. */
204 regparse = exp;
205 regnpar = 1;
206 regsize = 0L;
207 regcode = &regdummy;
208 regc(MAGIC);
209 if (reg(0, &flags) == NULL)
210 return(NULL);
212 /* Small enough for pointer-storage convention? */
213 if (regsize >= 32767L) /* Probably could be 65535L. */
214 FAIL("regexp too big");
216 /* Allocate space. */
217 r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
218 if (r == NULL)
219 FAIL("out of space");
221 /* Second pass: emit code. */
222 regparse = exp;
223 regnpar = 1;
224 regcode = r->program;
225 regc(MAGIC);
226 if (reg(0, &flags) == NULL)
227 return(NULL);
229 /* Dig out information for optimizations. */
230 r->regstart = '\0'; /* Worst-case defaults. */
231 r->reganch = 0;
232 r->regmust = NULL;
233 r->regmlen = 0;
234 scan = r->program+1; /* First BRANCH. */
235 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
236 scan = OPERAND(scan);
238 /* Starting-point info. */
239 if (OP(scan) == EXACTLY)
240 r->regstart = *OPERAND(scan);
241 else if (OP(scan) == BOL)
242 r->reganch++;
245 * If there's something expensive in the r.e., find the
246 * longest literal string that must appear and make it the
247 * regmust. Resolve ties in favor of later strings, since
248 * the regstart check works with the beginning of the r.e.
249 * and avoiding duplication strengthens checking. Not a
250 * strong reason, but sufficient in the absence of others.
252 if (flags&SPSTART) {
253 longest = NULL;
254 len = 0;
255 for (; scan != NULL; scan = regnext(scan))
256 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
257 longest = OPERAND(scan);
258 len = strlen(OPERAND(scan));
260 r->regmust = longest;
261 r->regmlen = len;
265 return(r);
269 - reg - regular expression, i.e. main body or parenthesized thing
271 * Caller must absorb opening parenthesis.
273 * Combining parenthesis handling with the base level of regular expression
274 * is a trifle forced, but the need to tie the tails of the branches to what
275 * follows makes it hard to avoid.
277 static char *
278 reg(paren, flagp)
279 int paren; /* Parenthesized? */
280 int *flagp;
282 register char *ret;
283 register char *br;
284 register char *ender;
285 register int parno;
286 int flags;
288 *flagp = HASWIDTH; /* Tentatively. */
290 /* Make an OPEN node, if parenthesized. */
291 if (paren) {
292 if (regnpar >= NSUBEXP)
293 FAIL("too many ()");
294 parno = regnpar;
295 regnpar++;
296 ret = regnode(OPEN+parno);
297 } else
298 ret = NULL;
300 /* Pick up the branches, linking them together. */
301 br = regbranch(&flags);
302 if (br == NULL)
303 return(NULL);
304 if (ret != NULL)
305 regtail(ret, br); /* OPEN -> first. */
306 else
307 ret = br;
308 if (!(flags&HASWIDTH))
309 *flagp &= ~HASWIDTH;
310 *flagp |= flags&SPSTART;
311 while (*regparse == '|') {
312 regparse++;
313 br = regbranch(&flags);
314 if (br == NULL)
315 return(NULL);
316 regtail(ret, br); /* BRANCH -> BRANCH. */
317 if (!(flags&HASWIDTH))
318 *flagp &= ~HASWIDTH;
319 *flagp |= flags&SPSTART;
322 /* Make a closing node, and hook it on the end. */
323 ender = regnode((paren) ? CLOSE+parno : END);
324 regtail(ret, ender);
326 /* Hook the tails of the branches to the closing node. */
327 for (br = ret; br != NULL; br = regnext(br))
328 regoptail(br, ender);
330 /* Check for proper termination. */
331 if (paren && *regparse++ != ')') {
332 FAIL("unmatched ()");
333 } else if (!paren && *regparse != '\0') {
334 if (*regparse == ')') {
335 FAIL("unmatched ()");
336 } else
337 FAIL("junk on end"); /* "Can't happen". */
338 /* NOTREACHED */
341 return(ret);
345 - regbranch - one alternative of an | operator
347 * Implements the concatenation operator.
349 static char *
350 regbranch(flagp)
351 int *flagp;
353 register char *ret;
354 register char *chain;
355 register char *latest;
356 int flags;
358 *flagp = WORST; /* Tentatively. */
360 ret = regnode(BRANCH);
361 chain = NULL;
362 while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
363 latest = regpiece(&flags);
364 if (latest == NULL)
365 return(NULL);
366 *flagp |= flags&HASWIDTH;
367 if (chain == NULL) /* First piece. */
368 *flagp |= flags&SPSTART;
369 else
370 regtail(chain, latest);
371 chain = latest;
373 if (chain == NULL) /* Loop ran zero times. */
374 (void) regnode(NOTHING);
376 return(ret);
380 - regpiece - something followed by possible [*+?]
382 * Note that the branching code sequences used for ? and the general cases
383 * of * and + are somewhat optimized: they use the same NOTHING node as
384 * both the endmarker for their branch list and the body of the last branch.
385 * It might seem that this node could be dispensed with entirely, but the
386 * endmarker role is not redundant.
388 static char *
389 regpiece(flagp)
390 int *flagp;
392 register char *ret;
393 register char op;
394 register char *next;
395 int flags;
397 ret = regatom(&flags);
398 if (ret == NULL)
399 return(NULL);
401 op = *regparse;
402 if (!ISMULT(op)) {
403 *flagp = flags;
404 return(ret);
407 if (!(flags&HASWIDTH) && op != '?')
408 FAIL("*+ operand could be empty");
409 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
411 if (op == '*' && (flags&SIMPLE))
412 reginsert(STAR, ret);
413 else if (op == '*') {
414 /* Emit x* as (x&|), where & means "self". */
415 reginsert(BRANCH, ret); /* Either x */
416 regoptail(ret, regnode(BACK)); /* and loop */
417 regoptail(ret, ret); /* back */
418 regtail(ret, regnode(BRANCH)); /* or */
419 regtail(ret, regnode(NOTHING)); /* null. */
420 } else if (op == '+' && (flags&SIMPLE))
421 reginsert(PLUS, ret);
422 else if (op == '+') {
423 /* Emit x+ as x(&|), where & means "self". */
424 next = regnode(BRANCH); /* Either */
425 regtail(ret, next);
426 regtail(regnode(BACK), ret); /* loop back */
427 regtail(next, regnode(BRANCH)); /* or */
428 regtail(ret, regnode(NOTHING)); /* null. */
429 } else if (op == '?') {
430 /* Emit x? as (x|) */
431 reginsert(BRANCH, ret); /* Either x */
432 regtail(ret, regnode(BRANCH)); /* or */
433 next = regnode(NOTHING); /* null. */
434 regtail(ret, next);
435 regoptail(ret, next);
437 regparse++;
438 if (ISMULT(*regparse))
439 FAIL("nested *?+");
441 return(ret);
445 - regatom - the lowest level
447 * Optimization: gobbles an entire sequence of ordinary characters so that
448 * it can turn them into a single node, which is smaller to store and
449 * faster to run. Backslashed characters are exceptions, each becoming a
450 * separate node; the code is simpler that way and it's not worth fixing.
452 static char *
453 regatom(flagp)
454 int *flagp;
456 register char *ret;
457 int flags;
459 *flagp = WORST; /* Tentatively. */
461 switch (*regparse++) {
462 case '^':
463 ret = regnode(BOL);
464 break;
465 case '$':
466 ret = regnode(EOL);
467 break;
468 case '.':
469 ret = regnode(ANY);
470 *flagp |= HASWIDTH|SIMPLE;
471 break;
472 case '[': {
473 register int class;
474 register int classend;
476 if (*regparse == '^') { /* Complement of range. */
477 ret = regnode(ANYBUT);
478 regparse++;
479 } else
480 ret = regnode(ANYOF);
481 if (*regparse == ']' || *regparse == '-')
482 regc(*regparse++);
483 while (*regparse != '\0' && *regparse != ']') {
484 if (*regparse == '-') {
485 regparse++;
486 if (*regparse == ']' || *regparse == '\0')
487 regc('-');
488 else {
489 class = UCHARAT(regparse-2)+1;
490 classend = UCHARAT(regparse);
491 if (class > classend+1)
492 FAIL("invalid [] range");
493 for (; class <= classend; class++)
494 regc(class);
495 regparse++;
497 } else
498 regc(*regparse++);
500 regc('\0');
501 if (*regparse != ']')
502 FAIL("unmatched []");
503 regparse++;
504 *flagp |= HASWIDTH|SIMPLE;
506 break;
507 case '(':
508 ret = reg(1, &flags);
509 if (ret == NULL)
510 return(NULL);
511 *flagp |= flags&(HASWIDTH|SPSTART);
512 break;
513 case '\0':
514 case '|':
515 case ')':
516 FAIL("internal urp"); /* Supposed to be caught earlier. */
517 break;
518 case '?':
519 case '+':
520 case '*':
521 FAIL("?+* follows nothing");
522 break;
523 case '\\':
524 if (*regparse == '\0')
525 FAIL("trailing \\");
526 ret = regnode(EXACTLY);
527 regc(*regparse++);
528 regc('\0');
529 *flagp |= HASWIDTH|SIMPLE;
530 break;
531 default: {
532 register int len;
533 register char ender;
535 regparse--;
536 len = strcspn(regparse, META);
537 if (len <= 0)
538 FAIL("internal disaster");
539 ender = *(regparse+len);
540 if (len > 1 && ISMULT(ender))
541 len--; /* Back off clear of ?+* operand. */
542 *flagp |= HASWIDTH;
543 if (len == 1)
544 *flagp |= SIMPLE;
545 ret = regnode(EXACTLY);
546 while (len > 0) {
547 regc(*regparse++);
548 len--;
550 regc('\0');
552 break;
555 return(ret);
559 - regnode - emit a node
561 static char * /* Location. */
562 regnode(op)
563 char op;
565 register char *ret;
566 register char *ptr;
568 ret = regcode;
569 if (ret == &regdummy) {
570 regsize += 3;
571 return(ret);
574 ptr = ret;
575 *ptr++ = op;
576 *ptr++ = '\0'; /* Null "next" pointer. */
577 *ptr++ = '\0';
578 regcode = ptr;
580 return(ret);
584 - regc - emit (if appropriate) a byte of code
586 static void
587 regc(b)
588 char b;
590 if (regcode != &regdummy)
591 *regcode++ = b;
592 else
593 regsize++;
597 - reginsert - insert an operator in front of already-emitted operand
599 * Means relocating the operand.
601 static void
602 reginsert(op, opnd)
603 char op;
604 char *opnd;
606 register char *src;
607 register char *dst;
608 register char *place;
610 if (regcode == &regdummy) {
611 regsize += 3;
612 return;
615 src = regcode;
616 regcode += 3;
617 dst = regcode;
618 while (src > opnd)
619 *--dst = *--src;
621 place = opnd; /* Op node, where operand used to be. */
622 *place++ = op;
623 *place++ = '\0';
624 *place++ = '\0';
628 - regtail - set the next-pointer at the end of a node chain
630 static void
631 regtail(p, val)
632 char *p;
633 char *val;
635 register char *scan;
636 register char *temp;
637 register int offset;
639 if (p == &regdummy)
640 return;
642 /* Find last node. */
643 scan = p;
644 for (;;) {
645 temp = regnext(scan);
646 if (temp == NULL)
647 break;
648 scan = temp;
651 if (OP(scan) == BACK)
652 offset = scan - val;
653 else
654 offset = val - scan;
655 *(scan+1) = (offset>>8)&0377;
656 *(scan+2) = offset&0377;
660 - regoptail - regtail on operand of first argument; nop if operandless
662 static void
663 regoptail(p, val)
664 char *p;
665 char *val;
667 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
668 if (p == NULL || p == &regdummy || OP(p) != BRANCH)
669 return;
670 regtail(OPERAND(p), val);
674 * regexec and friends
678 * Global work variables for regexec().
680 static char *reginput; /* String-input pointer. */
681 static char *regbol; /* Beginning of input, for ^ check. */
682 static char **regstartp; /* Pointer to startp array. */
683 static char **regendp; /* Ditto for endp. */
686 * Forwards.
688 STATIC int regtry();
689 STATIC int regmatch();
690 STATIC int regrepeat();
692 #ifdef DEBUG
693 int regnarrate = 0;
694 void regdump();
695 STATIC char *regprop();
696 #endif
699 - regexec - match a regexp against a string
702 regexec(prog, string)
703 register regexp *prog;
704 register char *string;
706 register char *s;
708 /* Be paranoid... */
709 if (prog == NULL || string == NULL) {
710 regerror("NULL parameter");
711 return(0);
714 /* Check validity of program. */
715 if (UCHARAT(prog->program) != MAGIC) {
716 regerror("corrupted program");
717 return(0);
720 /* If there is a "must appear" string, look for it. */
721 if (prog->regmust != NULL) {
722 s = string;
723 while ((s = strchr(s, prog->regmust[0])) != NULL) {
724 if (strncmp(s, prog->regmust, prog->regmlen) == 0)
725 break; /* Found it. */
726 s++;
728 if (s == NULL) /* Not present. */
729 return(0);
732 /* Mark beginning of line for ^ . */
733 regbol = string;
735 /* Simplest case: anchored match need be tried only once. */
736 if (prog->reganch)
737 return(regtry(prog, string));
739 /* Messy cases: unanchored match. */
740 s = string;
741 if (prog->regstart != '\0')
742 /* We know what char it must start with. */
743 while ((s = strchr(s, prog->regstart)) != NULL) {
744 if (regtry(prog, s))
745 return(1);
746 s++;
748 else
749 /* We don't -- general case. */
750 do {
751 if (regtry(prog, s))
752 return(1);
753 } while (*s++ != '\0');
755 /* Failure. */
756 return(0);
760 - regtry - try match at specific point
762 static int /* 0 failure, 1 success */
763 regtry(prog, string)
764 regexp *prog;
765 char *string;
767 register int i;
768 register char **sp;
769 register char **ep;
771 reginput = string;
772 regstartp = prog->startp;
773 regendp = prog->endp;
775 sp = prog->startp;
776 ep = prog->endp;
777 for (i = NSUBEXP; i > 0; i--) {
778 *sp++ = NULL;
779 *ep++ = NULL;
781 if (regmatch(prog->program + 1)) {
782 prog->startp[0] = string;
783 prog->endp[0] = reginput;
784 return(1);
785 } else
786 return(0);
790 - regmatch - main matching routine
792 * Conceptually the strategy is simple: check to see whether the current
793 * node matches, call self recursively to see whether the rest matches,
794 * and then act accordingly. In practice we make some effort to avoid
795 * recursion, in particular by going through "ordinary" nodes (that don't
796 * need to know whether the rest of the match failed) by a loop instead of
797 * by recursion.
799 static int /* 0 failure, 1 success */
800 regmatch(prog)
801 char *prog;
803 register char *scan; /* Current node. */
804 char *next; /* Next node. */
805 extern char *strchr();
807 scan = prog;
808 #ifdef DEBUG
809 if (scan != NULL && regnarrate)
810 fprintf(stderr, "%s(\n", regprop(scan));
811 #endif
812 while (scan != NULL) {
813 #ifdef DEBUG
814 if (regnarrate)
815 fprintf(stderr, "%s...\n", regprop(scan));
816 #endif
817 next = regnext(scan);
819 switch (OP(scan)) {
820 case BOL:
821 if (reginput != regbol)
822 return(0);
823 break;
824 case EOL:
825 if (*reginput != '\0')
826 return(0);
827 break;
828 case ANY:
829 if (*reginput == '\0')
830 return(0);
831 reginput++;
832 break;
833 case EXACTLY: {
834 register int len;
835 register char *opnd;
837 opnd = OPERAND(scan);
838 /* Inline the first character, for speed. */
839 if (*opnd != *reginput)
840 return(0);
841 len = strlen(opnd);
842 if (len > 1 && strncmp(opnd, reginput, len) != 0)
843 return(0);
844 reginput += len;
846 break;
847 case ANYOF:
848 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
849 return(0);
850 reginput++;
851 break;
852 case ANYBUT:
853 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
854 return(0);
855 reginput++;
856 break;
857 case NOTHING:
858 break;
859 case BACK:
860 break;
861 case OPEN+1:
862 case OPEN+2:
863 case OPEN+3:
864 case OPEN+4:
865 case OPEN+5:
866 case OPEN+6:
867 case OPEN+7:
868 case OPEN+8:
869 case OPEN+9: {
870 register int no;
871 register char *save;
873 no = OP(scan) - OPEN;
874 save = reginput;
876 if (regmatch(next)) {
878 * Don't set startp if some later
879 * invocation of the same parentheses
880 * already has.
882 if (regstartp[no] == NULL)
883 regstartp[no] = save;
884 return(1);
885 } else
886 return(0);
888 break;
889 case CLOSE+1:
890 case CLOSE+2:
891 case CLOSE+3:
892 case CLOSE+4:
893 case CLOSE+5:
894 case CLOSE+6:
895 case CLOSE+7:
896 case CLOSE+8:
897 case CLOSE+9: {
898 register int no;
899 register char *save;
901 no = OP(scan) - CLOSE;
902 save = reginput;
904 if (regmatch(next)) {
906 * Don't set endp if some later
907 * invocation of the same parentheses
908 * already has.
910 if (regendp[no] == NULL)
911 regendp[no] = save;
912 return(1);
913 } else
914 return(0);
916 break;
917 case BRANCH: {
918 register char *save;
920 if (OP(next) != BRANCH) /* No choice. */
921 next = OPERAND(scan); /* Avoid recursion. */
922 else {
923 do {
924 save = reginput;
925 if (regmatch(OPERAND(scan)))
926 return(1);
927 reginput = save;
928 scan = regnext(scan);
929 } while (scan != NULL && OP(scan) == BRANCH);
930 return(0);
931 /* NOTREACHED */
934 break;
935 case STAR:
936 case PLUS: {
937 register char nextch;
938 register int no;
939 register char *save;
940 register int min;
943 * Lookahead to avoid useless match attempts
944 * when we know what character comes next.
946 nextch = '\0';
947 if (OP(next) == EXACTLY)
948 nextch = *OPERAND(next);
949 min = (OP(scan) == STAR) ? 0 : 1;
950 save = reginput;
951 no = regrepeat(OPERAND(scan));
952 while (no >= min) {
953 /* If it could work, try it. */
954 if (nextch == '\0' || *reginput == nextch)
955 if (regmatch(next))
956 return(1);
957 /* Couldn't or didn't -- back up. */
958 no--;
959 reginput = save + no;
961 return(0);
963 break;
964 case END:
965 return(1); /* Success! */
966 break;
967 default:
968 regerror("memory corruption");
969 return(0);
970 break;
973 scan = next;
977 * We get here only if there's trouble -- normally "case END" is
978 * the terminating point.
980 regerror("corrupted pointers");
981 return(0);
985 - regrepeat - repeatedly match something simple, report how many
987 static int
988 regrepeat(p)
989 char *p;
991 register int count = 0;
992 register char *scan;
993 register char *opnd;
995 scan = reginput;
996 opnd = OPERAND(p);
997 switch (OP(p)) {
998 case ANY:
999 count = strlen(scan);
1000 scan += count;
1001 break;
1002 case EXACTLY:
1003 while (*opnd == *scan) {
1004 count++;
1005 scan++;
1007 break;
1008 case ANYOF:
1009 while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1010 count++;
1011 scan++;
1013 break;
1014 case ANYBUT:
1015 while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1016 count++;
1017 scan++;
1019 break;
1020 default: /* Oh dear. Called inappropriately. */
1021 regerror("internal foulup");
1022 count = 0; /* Best compromise. */
1023 break;
1025 reginput = scan;
1027 return(count);
1031 - regnext - dig the "next" pointer out of a node
1033 static char *
1034 regnext(p)
1035 register char *p;
1037 register int offset;
1039 if (p == &regdummy)
1040 return(NULL);
1042 offset = NEXT(p);
1043 if (offset == 0)
1044 return(NULL);
1046 if (OP(p) == BACK)
1047 return(p-offset);
1048 else
1049 return(p+offset);
1052 #ifdef DEBUG
1054 STATIC char *regprop();
1057 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1059 void
1060 regdump(r)
1061 regexp *r;
1063 register char *s;
1064 register char op = EXACTLY; /* Arbitrary non-END op. */
1065 register char *next;
1066 extern char *strchr();
1069 s = r->program + 1;
1070 while (op != END) { /* While that wasn't END last time... */
1071 op = OP(s);
1072 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1073 next = regnext(s);
1074 if (next == NULL) /* Next ptr. */
1075 printf("(0)");
1076 else
1077 printf("(%d)", (s-r->program)+(next-s));
1078 s += 3;
1079 if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1080 /* Literal string, where present. */
1081 while (*s != '\0') {
1082 putchar(*s);
1083 s++;
1085 s++;
1087 putchar('\n');
1090 /* Header fields of interest. */
1091 if (r->regstart != '\0')
1092 printf("start `%c' ", r->regstart);
1093 if (r->reganch)
1094 printf("anchored ");
1095 if (r->regmust != NULL)
1096 printf("must have \"%s\"", r->regmust);
1097 printf("\n");
1101 - regprop - printable representation of opcode
1103 static char *
1104 regprop(op)
1105 char *op;
1107 register char *p;
1108 static char buf[50];
1110 (void) strcpy(buf, ":");
1112 switch (OP(op)) {
1113 case BOL:
1114 p = "BOL";
1115 break;
1116 case EOL:
1117 p = "EOL";
1118 break;
1119 case ANY:
1120 p = "ANY";
1121 break;
1122 case ANYOF:
1123 p = "ANYOF";
1124 break;
1125 case ANYBUT:
1126 p = "ANYBUT";
1127 break;
1128 case BRANCH:
1129 p = "BRANCH";
1130 break;
1131 case EXACTLY:
1132 p = "EXACTLY";
1133 break;
1134 case NOTHING:
1135 p = "NOTHING";
1136 break;
1137 case BACK:
1138 p = "BACK";
1139 break;
1140 case END:
1141 p = "END";
1142 break;
1143 case OPEN+1:
1144 case OPEN+2:
1145 case OPEN+3:
1146 case OPEN+4:
1147 case OPEN+5:
1148 case OPEN+6:
1149 case OPEN+7:
1150 case OPEN+8:
1151 case OPEN+9:
1152 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1153 p = NULL;
1154 break;
1155 case CLOSE+1:
1156 case CLOSE+2:
1157 case CLOSE+3:
1158 case CLOSE+4:
1159 case CLOSE+5:
1160 case CLOSE+6:
1161 case CLOSE+7:
1162 case CLOSE+8:
1163 case CLOSE+9:
1164 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1165 p = NULL;
1166 break;
1167 case STAR:
1168 p = "STAR";
1169 break;
1170 case PLUS:
1171 p = "PLUS";
1172 break;
1173 default:
1174 regerror("corrupted opcode");
1175 break;
1177 if (p != NULL)
1178 (void) strcat(buf, p);
1179 return(buf);
1181 #endif
1184 * The following is provided for those people who do not have strcspn() in
1185 * their C libraries. They should get off their butts and do something
1186 * about it; at least one public-domain implementation of those (highly
1187 * useful) string routines has been published on Usenet.
1189 #ifdef STRCSPN
1191 * strcspn - find length of initial segment of s1 consisting entirely
1192 * of characters not from s2
1195 static int
1196 strcspn(s1, s2)
1197 char *s1;
1198 char *s2;
1200 register char *scan1;
1201 register char *scan2;
1202 register int count;
1204 count = 0;
1205 for (scan1 = s1; *scan1 != '\0'; scan1++) {
1206 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1207 if (*scan1 == *scan2++)
1208 return(count);
1209 count++;
1211 return(count);
1213 #endif