MOXA linux-2.6.x / linux-2.6.19-uc1 from UC-7110-LX-BOOTLOADER-1.9_VERSION-4.2.tgz
[linux-2.6.19-moxart.git] / net / ipv4 / netfilter / regexp / regexp.c
blob900698886531051a217e17754a7ac0c397a5cc1e
1 /*
2 * regcomp and regexec -- regsub and regerror are elsewhere
3 * @(#)regexp.c 1.3 of 18 April 87
5 * Copyright (c) 1986 by University of Toronto.
6 * Written by Henry Spencer. Not derived from licensed software.
8 * Permission is granted to anyone to use this software for any
9 * purpose on any computer system, and to redistribute it freely,
10 * subject to the following restrictions:
12 * 1. The author is not responsible for the consequences of use of
13 * this software, no matter how awful, even if they arise
14 * from defects in it.
16 * 2. The origin of this software must not be misrepresented, either
17 * by explicit claim or by omission.
19 * 3. Altered versions must be plainly marked as such, and must not
20 * be misrepresented as being the original software.
22 * Beware that some of this code is subtly aware of the way operator
23 * precedence is structured in regular expressions. Serious changes in
24 * regular-expression syntax might require a total rethink.
26 * This code was modified by Ethan Sommer to work within the kernel
27 * (it now uses kmalloc etc..)
29 * Modified slightly by Matthew Strait to use more modern C.
32 #include "regexp.h"
33 #include "regmagic.h"
35 /* added by ethan and matt. Lets it work in both kernel and user space.
36 (So iptables can use it, for instance.) Yea, it goes both ways... */
37 #if __KERNEL__
38 #define malloc(foo) kmalloc(foo,GFP_ATOMIC)
39 #else
40 #define printk(format,args...) printf(format,##args)
41 #endif
43 void regerror(char * s)
45 printk("<3>Regexp: %s\n", s);
46 /* NOTREACHED */
50 * The "internal use only" fields in regexp.h are present to pass info from
51 * compile to execute that permits the execute phase to run lots faster on
52 * simple cases. They are:
54 * regstart char that must begin a match; '\0' if none obvious
55 * reganch is the match anchored (at beginning-of-line only)?
56 * regmust string (pointer into program) that match must include, or NULL
57 * regmlen length of regmust string
59 * Regstart and reganch permit very fast decisions on suitable starting points
60 * for a match, cutting down the work a lot. Regmust permits fast rejection
61 * of lines that cannot possibly match. The regmust tests are costly enough
62 * that regcomp() supplies a regmust only if the r.e. contains something
63 * potentially expensive (at present, the only such thing detected is * or +
64 * at the start of the r.e., which can involve a lot of backup). Regmlen is
65 * supplied because the test in regexec() needs it and regcomp() is computing
66 * it anyway.
70 * Structure for regexp "program". This is essentially a linear encoding
71 * of a nondeterministic finite-state machine (aka syntax charts or
72 * "railroad normal form" in parsing technology). Each node is an opcode
73 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
74 * all nodes except BRANCH implement concatenation; a "next" pointer with
75 * a BRANCH on both ends of it is connecting two alternatives. (Here we
76 * have one of the subtle syntax dependencies: an individual BRANCH (as
77 * opposed to a collection of them) is never concatenated with anything
78 * because of operator precedence.) The operand of some types of node is
79 * a literal string; for others, it is a node leading into a sub-FSM. In
80 * particular, the operand of a BRANCH node is the first node of the branch.
81 * (NB this is *not* a tree structure: the tail of the branch connects
82 * to the thing following the set of BRANCHes.) The opcodes are:
85 /* definition number opnd? meaning */
86 #define END 0 /* no End of program. */
87 #define BOL 1 /* no Match "" at beginning of line. */
88 #define EOL 2 /* no Match "" at end of line. */
89 #define ANY 3 /* no Match any one character. */
90 #define ANYOF 4 /* str Match any character in this string. */
91 #define ANYBUT 5 /* str Match any character not in this string. */
92 #define BRANCH 6 /* node Match this alternative, or the next... */
93 #define BACK 7 /* no Match "", "next" ptr points backward. */
94 #define EXACTLY 8 /* str Match this string. */
95 #define NOTHING 9 /* no Match empty string. */
96 #define STAR 10 /* node Match this (simple) thing 0 or more times. */
97 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
98 #define OPEN 20 /* no Mark this point in input as start of #n. */
99 /* OPEN+1 is number 1, etc. */
100 #define CLOSE 30 /* no Analogous to OPEN. */
103 * Opcode notes:
105 * BRANCH The set of branches constituting a single choice are hooked
106 * together with their "next" pointers, since precedence prevents
107 * anything being concatenated to any individual branch. The
108 * "next" pointer of the last BRANCH in a choice points to the
109 * thing following the whole choice. This is also where the
110 * final "next" pointer of each individual branch points; each
111 * branch starts with the operand node of a BRANCH node.
113 * BACK Normal "next" pointers all implicitly point forward; BACK
114 * exists to make loop structures possible.
116 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
117 * BRANCH structures using BACK. Simple cases (one character
118 * per match) are implemented with STAR and PLUS for speed
119 * and to minimize recursive plunges.
121 * OPEN,CLOSE ...are numbered at compile time.
125 * A node is one char of opcode followed by two chars of "next" pointer.
126 * "Next" pointers are stored as two 8-bit pieces, high order first. The
127 * value is a positive offset from the opcode of the node containing it.
128 * An operand, if any, simply follows the node. (Note that much of the
129 * code generation knows about this implicit relationship.)
131 * Using two bytes for the "next" pointer is vast overkill for most things,
132 * but allows patterns to get big without disasters.
134 #define OP(p) (*(p))
135 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
136 #define OPERAND(p) ((p) + 3)
139 * See regmagic.h for one further detail of program structure.
144 * Utility definitions.
146 #ifndef CHARBITS
147 #define UCHARAT(p) ((int)*(unsigned char *)(p))
148 #else
149 #define UCHARAT(p) ((int)*(p)&CHARBITS)
150 #endif
152 #define FAIL(m) { regerror(m); return(NULL); }
153 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
154 #define META "^$.[()|?+*\\"
157 * Flags to be passed up and down.
159 #define HASWIDTH 01 /* Known never to match null string. */
160 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
161 #define SPSTART 04 /* Starts with * or +. */
162 #define WORST 0 /* Worst case. */
165 * Global work variables for regcomp().
167 struct match_globals {
168 char *reginput; /* String-input pointer. */
169 char *regbol; /* Beginning of input, for ^ check. */
170 char **regstartp; /* Pointer to startp array. */
171 char **regendp; /* Ditto for endp. */
172 char *regparse; /* Input-scan pointer. */
173 int regnpar; /* () count. */
174 char regdummy;
175 char *regcode; /* Code-emit pointer; &regdummy = don't. */
176 long regsize; /* Code size. */
180 * Forward declarations for regcomp()'s friends.
182 #ifndef STATIC
183 #define STATIC static
184 #endif
185 STATIC char *reg(struct match_globals *g, int paren,int *flagp);
186 STATIC char *regbranch(struct match_globals *g, int *flagp);
187 STATIC char *regpiece(struct match_globals *g, int *flagp);
188 STATIC char *regatom(struct match_globals *g, int *flagp);
189 STATIC char *regnode(struct match_globals *g, char op);
190 STATIC char *regnext(struct match_globals *g, char *p);
191 STATIC void regc(struct match_globals *g, char b);
192 STATIC void reginsert(struct match_globals *g, char op, char *opnd);
193 STATIC void regtail(struct match_globals *g, char *p, char *val);
194 STATIC void regoptail(struct match_globals *g, char *p, char *val);
197 __kernel_size_t my_strcspn(const char *s1,const char *s2)
199 char *scan1;
200 char *scan2;
201 int count;
203 count = 0;
204 for (scan1 = (char *)s1; *scan1 != '\0'; scan1++) {
205 for (scan2 = (char *)s2; *scan2 != '\0';) /* ++ moved down. */
206 if (*scan1 == *scan2++)
207 return(count);
208 count++;
210 return(count);
214 - regcomp - compile a regular expression into internal code
216 * We can't allocate space until we know how big the compiled form will be,
217 * but we can't compile it (and thus know how big it is) until we've got a
218 * place to put the code. So we cheat: we compile it twice, once with code
219 * generation turned off and size counting turned on, and once "for real".
220 * This also means that we don't allocate space until we are sure that the
221 * thing really will compile successfully, and we never have to move the
222 * code and thus invalidate pointers into it. (Note that it has to be in
223 * one piece because free() must be able to free it all.)
225 * Beware that the optimization-preparation code in here knows about some
226 * of the structure of the compiled regexp.
228 regexp *
229 regcomp(char *exp,int *patternsize)
231 register regexp *r;
232 register char *scan;
233 register char *longest;
234 register int len;
235 int flags;
236 struct match_globals g;
238 /* commented out by ethan
239 extern char *malloc();
242 if (exp == NULL)
243 FAIL("NULL argument");
245 /* First pass: determine size, legality. */
246 g.regparse = exp;
247 g.regnpar = 1;
248 g.regsize = 0L;
249 g.regcode = &g.regdummy;
250 regc(&g, MAGIC);
251 if (reg(&g, 0, &flags) == NULL)
252 return(NULL);
254 /* Small enough for pointer-storage convention? */
255 if (g.regsize >= 32767L) /* Probably could be 65535L. */
256 FAIL("regexp too big");
258 /* Allocate space. */
259 *patternsize=sizeof(regexp) + (unsigned)g.regsize;
260 r = (regexp *)malloc(sizeof(regexp) + (unsigned)g.regsize);
261 if (r == NULL)
262 FAIL("out of space");
264 /* Second pass: emit code. */
265 g.regparse = exp;
266 g.regnpar = 1;
267 g.regcode = r->program;
268 regc(&g, MAGIC);
269 if (reg(&g, 0, &flags) == NULL)
270 return(NULL);
272 /* Dig out information for optimizations. */
273 r->regstart = '\0'; /* Worst-case defaults. */
274 r->reganch = 0;
275 r->regmust = NULL;
276 r->regmlen = 0;
277 scan = r->program+1; /* First BRANCH. */
278 if (OP(regnext(&g, scan)) == END) { /* Only one top-level choice. */
279 scan = OPERAND(scan);
281 /* Starting-point info. */
282 if (OP(scan) == EXACTLY)
283 r->regstart = *OPERAND(scan);
284 else if (OP(scan) == BOL)
285 r->reganch++;
288 * If there's something expensive in the r.e., find the
289 * longest literal string that must appear and make it the
290 * regmust. Resolve ties in favor of later strings, since
291 * the regstart check works with the beginning of the r.e.
292 * and avoiding duplication strengthens checking. Not a
293 * strong reason, but sufficient in the absence of others.
295 if (flags&SPSTART) {
296 longest = NULL;
297 len = 0;
298 for (; scan != NULL; scan = regnext(&g, scan))
299 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
300 longest = OPERAND(scan);
301 len = strlen(OPERAND(scan));
303 r->regmust = longest;
304 r->regmlen = len;
308 return(r);
312 - reg - regular expression, i.e. main body or parenthesized thing
314 * Caller must absorb opening parenthesis.
316 * Combining parenthesis handling with the base level of regular expression
317 * is a trifle forced, but the need to tie the tails of the branches to what
318 * follows makes it hard to avoid.
320 static char *
321 reg(struct match_globals *g, int paren, int *flagp /* Parenthesized? */ )
323 register char *ret;
324 register char *br;
325 register char *ender;
326 register int parno = 0; /* 0 makes gcc happy */
327 int flags;
329 *flagp = HASWIDTH; /* Tentatively. */
331 /* Make an OPEN node, if parenthesized. */
332 if (paren) {
333 if (g->regnpar >= NSUBEXP)
334 FAIL("too many ()");
335 parno = g->regnpar;
336 g->regnpar++;
337 ret = regnode(g, OPEN+parno);
338 } else
339 ret = NULL;
341 /* Pick up the branches, linking them together. */
342 br = regbranch(g, &flags);
343 if (br == NULL)
344 return(NULL);
345 if (ret != NULL)
346 regtail(g, ret, br); /* OPEN -> first. */
347 else
348 ret = br;
349 if (!(flags&HASWIDTH))
350 *flagp &= ~HASWIDTH;
351 *flagp |= flags&SPSTART;
352 while (*g->regparse == '|') {
353 g->regparse++;
354 br = regbranch(g, &flags);
355 if (br == NULL)
356 return(NULL);
357 regtail(g, ret, br); /* BRANCH -> BRANCH. */
358 if (!(flags&HASWIDTH))
359 *flagp &= ~HASWIDTH;
360 *flagp |= flags&SPSTART;
363 /* Make a closing node, and hook it on the end. */
364 ender = regnode(g, (paren) ? CLOSE+parno : END);
365 regtail(g, ret, ender);
367 /* Hook the tails of the branches to the closing node. */
368 for (br = ret; br != NULL; br = regnext(g, br))
369 regoptail(g, br, ender);
371 /* Check for proper termination. */
372 if (paren && *g->regparse++ != ')') {
373 FAIL("unmatched ()");
374 } else if (!paren && *g->regparse != '\0') {
375 if (*g->regparse == ')') {
376 FAIL("unmatched ()");
377 } else
378 FAIL("junk on end"); /* "Can't happen". */
379 /* NOTREACHED */
382 return(ret);
386 - regbranch - one alternative of an | operator
388 * Implements the concatenation operator.
390 static char *
391 regbranch(struct match_globals *g, int *flagp)
393 register char *ret;
394 register char *chain;
395 register char *latest;
396 int flags;
398 *flagp = WORST; /* Tentatively. */
400 ret = regnode(g, BRANCH);
401 chain = NULL;
402 while (*g->regparse != '\0' && *g->regparse != '|' && *g->regparse != ')') {
403 latest = regpiece(g, &flags);
404 if (latest == NULL)
405 return(NULL);
406 *flagp |= flags&HASWIDTH;
407 if (chain == NULL) /* First piece. */
408 *flagp |= flags&SPSTART;
409 else
410 regtail(g, chain, latest);
411 chain = latest;
413 if (chain == NULL) /* Loop ran zero times. */
414 (void) regnode(g, NOTHING);
416 return(ret);
420 - regpiece - something followed by possible [*+?]
422 * Note that the branching code sequences used for ? and the general cases
423 * of * and + are somewhat optimized: they use the same NOTHING node as
424 * both the endmarker for their branch list and the body of the last branch.
425 * It might seem that this node could be dispensed with entirely, but the
426 * endmarker role is not redundant.
428 static char *
429 regpiece(struct match_globals *g, int *flagp)
431 register char *ret;
432 register char op;
433 register char *next;
434 int flags;
436 ret = regatom(g, &flags);
437 if (ret == NULL)
438 return(NULL);
440 op = *g->regparse;
441 if (!ISMULT(op)) {
442 *flagp = flags;
443 return(ret);
446 if (!(flags&HASWIDTH) && op != '?')
447 FAIL("*+ operand could be empty");
448 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
450 if (op == '*' && (flags&SIMPLE))
451 reginsert(g, STAR, ret);
452 else if (op == '*') {
453 /* Emit x* as (x&|), where & means "self". */
454 reginsert(g, BRANCH, ret); /* Either x */
455 regoptail(g, ret, regnode(g, BACK)); /* and loop */
456 regoptail(g, ret, ret); /* back */
457 regtail(g, ret, regnode(g, BRANCH)); /* or */
458 regtail(g, ret, regnode(g, NOTHING)); /* null. */
459 } else if (op == '+' && (flags&SIMPLE))
460 reginsert(g, PLUS, ret);
461 else if (op == '+') {
462 /* Emit x+ as x(&|), where & means "self". */
463 next = regnode(g, BRANCH); /* Either */
464 regtail(g, ret, next);
465 regtail(g, regnode(g, BACK), ret); /* loop back */
466 regtail(g, next, regnode(g, BRANCH)); /* or */
467 regtail(g, ret, regnode(g, NOTHING)); /* null. */
468 } else if (op == '?') {
469 /* Emit x? as (x|) */
470 reginsert(g, BRANCH, ret); /* Either x */
471 regtail(g, ret, regnode(g, BRANCH)); /* or */
472 next = regnode(g, NOTHING); /* null. */
473 regtail(g, ret, next);
474 regoptail(g, ret, next);
476 g->regparse++;
477 if (ISMULT(*g->regparse))
478 FAIL("nested *?+");
480 return(ret);
484 - regatom - the lowest level
486 * Optimization: gobbles an entire sequence of ordinary characters so that
487 * it can turn them into a single node, which is smaller to store and
488 * faster to run. Backslashed characters are exceptions, each becoming a
489 * separate node; the code is simpler that way and it's not worth fixing.
491 static char *
492 regatom(struct match_globals *g, int *flagp)
494 register char *ret;
495 int flags;
497 *flagp = WORST; /* Tentatively. */
499 switch (*g->regparse++) {
500 case '^':
501 ret = regnode(g, BOL);
502 break;
503 case '$':
504 ret = regnode(g, EOL);
505 break;
506 case '.':
507 ret = regnode(g, ANY);
508 *flagp |= HASWIDTH|SIMPLE;
509 break;
510 case '[': {
511 register int class;
512 register int classend;
514 if (*g->regparse == '^') { /* Complement of range. */
515 ret = regnode(g, ANYBUT);
516 g->regparse++;
517 } else
518 ret = regnode(g, ANYOF);
519 if (*g->regparse == ']' || *g->regparse == '-')
520 regc(g, *g->regparse++);
521 while (*g->regparse != '\0' && *g->regparse != ']') {
522 if (*g->regparse == '-') {
523 g->regparse++;
524 if (*g->regparse == ']' || *g->regparse == '\0')
525 regc(g, '-');
526 else {
527 class = UCHARAT(g->regparse-2)+1;
528 classend = UCHARAT(g->regparse);
529 if (class > classend+1)
530 FAIL("invalid [] range");
531 for (; class <= classend; class++)
532 regc(g, class);
533 g->regparse++;
535 } else
536 regc(g, *g->regparse++);
538 regc(g, '\0');
539 if (*g->regparse != ']')
540 FAIL("unmatched []");
541 g->regparse++;
542 *flagp |= HASWIDTH|SIMPLE;
544 break;
545 case '(':
546 ret = reg(g, 1, &flags);
547 if (ret == NULL)
548 return(NULL);
549 *flagp |= flags&(HASWIDTH|SPSTART);
550 break;
551 case '\0':
552 case '|':
553 case ')':
554 FAIL("internal urp"); /* Supposed to be caught earlier. */
555 break;
556 case '?':
557 case '+':
558 case '*':
559 FAIL("?+* follows nothing");
560 break;
561 case '\\':
562 if (*g->regparse == '\0')
563 FAIL("trailing \\");
564 ret = regnode(g, EXACTLY);
565 regc(g, *g->regparse++);
566 regc(g, '\0');
567 *flagp |= HASWIDTH|SIMPLE;
568 break;
569 default: {
570 register int len;
571 register char ender;
573 g->regparse--;
574 len = my_strcspn((const char *)g->regparse, (const char *)META);
575 if (len <= 0)
576 FAIL("internal disaster");
577 ender = *(g->regparse+len);
578 if (len > 1 && ISMULT(ender))
579 len--; /* Back off clear of ?+* operand. */
580 *flagp |= HASWIDTH;
581 if (len == 1)
582 *flagp |= SIMPLE;
583 ret = regnode(g, EXACTLY);
584 while (len > 0) {
585 regc(g, *g->regparse++);
586 len--;
588 regc(g, '\0');
590 break;
593 return(ret);
597 - regnode - emit a node
599 static char * /* Location. */
600 regnode(struct match_globals *g, char op)
602 register char *ret;
603 register char *ptr;
605 ret = g->regcode;
606 if (ret == &g->regdummy) {
607 g->regsize += 3;
608 return(ret);
611 ptr = ret;
612 *ptr++ = op;
613 *ptr++ = '\0'; /* Null "next" pointer. */
614 *ptr++ = '\0';
615 g->regcode = ptr;
617 return(ret);
621 - regc - emit (if appropriate) a byte of code
623 static void
624 regc(struct match_globals *g, char b)
626 if (g->regcode != &g->regdummy)
627 *g->regcode++ = b;
628 else
629 g->regsize++;
633 - reginsert - insert an operator in front of already-emitted operand
635 * Means relocating the operand.
637 static void
638 reginsert(struct match_globals *g, char op, char* opnd)
640 register char *src;
641 register char *dst;
642 register char *place;
644 if (g->regcode == &g->regdummy) {
645 g->regsize += 3;
646 return;
649 src = g->regcode;
650 g->regcode += 3;
651 dst = g->regcode;
652 while (src > opnd)
653 *--dst = *--src;
655 place = opnd; /* Op node, where operand used to be. */
656 *place++ = op;
657 *place++ = '\0';
658 *place++ = '\0';
662 - regtail - set the next-pointer at the end of a node chain
664 static void
665 regtail(struct match_globals *g, char *p, char *val)
667 register char *scan;
668 register char *temp;
669 register int offset;
671 if (p == &g->regdummy)
672 return;
674 /* Find last node. */
675 scan = p;
676 for (;;) {
677 temp = regnext(g, scan);
678 if (temp == NULL)
679 break;
680 scan = temp;
683 if (OP(scan) == BACK)
684 offset = scan - val;
685 else
686 offset = val - scan;
687 *(scan+1) = (offset>>8)&0377;
688 *(scan+2) = offset&0377;
692 - regoptail - regtail on operand of first argument; nop if operandless
694 static void
695 regoptail(struct match_globals *g, char *p, char *val)
697 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
698 if (p == NULL || p == &g->regdummy || OP(p) != BRANCH)
699 return;
700 regtail(g, OPERAND(p), val);
704 * regexec and friends
709 * Forwards.
711 STATIC int regtry(struct match_globals *g, regexp *prog, char *string);
712 STATIC int regmatch(struct match_globals *g, char *prog);
713 STATIC int regrepeat(struct match_globals *g, char *p);
715 #ifdef DEBUG
716 int regnarrate = 0;
717 void regdump();
718 STATIC char *regprop(char *op);
719 #endif
722 - regexec - match a regexp against a string
725 regexec(regexp *prog, char *string)
727 register char *s;
728 struct match_globals g;
730 /* Be paranoid... */
731 if (prog == NULL || string == NULL) {
732 printk("<3>Regexp: NULL parameter\n");
733 return(0);
736 /* Check validity of program. */
737 if (UCHARAT(prog->program) != MAGIC) {
738 printk("<3>Regexp: corrupted program\n");
739 return(0);
742 /* If there is a "must appear" string, look for it. */
743 if (prog->regmust != NULL) {
744 s = string;
745 while ((s = strchr(s, prog->regmust[0])) != NULL) {
746 if (strncmp(s, prog->regmust, prog->regmlen) == 0)
747 break; /* Found it. */
748 s++;
750 if (s == NULL) /* Not present. */
751 return(0);
754 /* Mark beginning of line for ^ . */
755 g.regbol = string;
757 /* Simplest case: anchored match need be tried only once. */
758 if (prog->reganch)
759 return(regtry(&g, prog, string));
761 /* Messy cases: unanchored match. */
762 s = string;
763 if (prog->regstart != '\0')
764 /* We know what char it must start with. */
765 while ((s = strchr(s, prog->regstart)) != NULL) {
766 if (regtry(&g, prog, s))
767 return(1);
768 s++;
770 else
771 /* We don't -- general case. */
772 do {
773 if (regtry(&g, prog, s))
774 return(1);
775 } while (*s++ != '\0');
777 /* Failure. */
778 return(0);
782 - regtry - try match at specific point
784 static int /* 0 failure, 1 success */
785 regtry(struct match_globals *g, regexp *prog, char *string)
787 register int i;
788 register char **sp;
789 register char **ep;
791 g->reginput = string;
792 g->regstartp = prog->startp;
793 g->regendp = prog->endp;
795 sp = prog->startp;
796 ep = prog->endp;
797 for (i = NSUBEXP; i > 0; i--) {
798 *sp++ = NULL;
799 *ep++ = NULL;
801 if (regmatch(g, prog->program + 1)) {
802 prog->startp[0] = string;
803 prog->endp[0] = g->reginput;
804 return(1);
805 } else
806 return(0);
810 - regmatch - main matching routine
812 * Conceptually the strategy is simple: check to see whether the current
813 * node matches, call self recursively to see whether the rest matches,
814 * and then act accordingly. In practice we make some effort to avoid
815 * recursion, in particular by going through "ordinary" nodes (that don't
816 * need to know whether the rest of the match failed) by a loop instead of
817 * by recursion.
819 static int /* 0 failure, 1 success */
820 regmatch(struct match_globals *g, char *prog)
822 register char *scan = prog; /* Current node. */
823 char *next; /* Next node. */
825 #ifdef DEBUG
826 if (scan != NULL && regnarrate)
827 fprintf(stderr, "%s(\n", regprop(scan));
828 #endif
829 while (scan != NULL) {
830 #ifdef DEBUG
831 if (regnarrate)
832 fprintf(stderr, "%s...\n", regprop(scan));
833 #endif
834 next = regnext(g, scan);
836 switch (OP(scan)) {
837 case BOL:
838 if (g->reginput != g->regbol)
839 return(0);
840 break;
841 case EOL:
842 if (*g->reginput != '\0')
843 return(0);
844 break;
845 case ANY:
846 if (*g->reginput == '\0')
847 return(0);
848 g->reginput++;
849 break;
850 case EXACTLY: {
851 register int len;
852 register char *opnd;
854 opnd = OPERAND(scan);
855 /* Inline the first character, for speed. */
856 if (*opnd != *g->reginput)
857 return(0);
858 len = strlen(opnd);
859 if (len > 1 && strncmp(opnd, g->reginput, len) != 0)
860 return(0);
861 g->reginput += len;
863 break;
864 case ANYOF:
865 if (*g->reginput == '\0' || strchr(OPERAND(scan), *g->reginput) == NULL)
866 return(0);
867 g->reginput++;
868 break;
869 case ANYBUT:
870 if (*g->reginput == '\0' || strchr(OPERAND(scan), *g->reginput) != NULL)
871 return(0);
872 g->reginput++;
873 break;
874 case NOTHING:
875 case BACK:
876 break;
877 case OPEN+1:
878 case OPEN+2:
879 case OPEN+3:
880 case OPEN+4:
881 case OPEN+5:
882 case OPEN+6:
883 case OPEN+7:
884 case OPEN+8:
885 case OPEN+9: {
886 register int no;
887 register char *save;
889 no = OP(scan) - OPEN;
890 save = g->reginput;
892 if (regmatch(g, next)) {
894 * Don't set startp if some later
895 * invocation of the same parentheses
896 * already has.
898 if (g->regstartp[no] == NULL)
899 g->regstartp[no] = save;
900 return(1);
901 } else
902 return(0);
904 break;
905 case CLOSE+1:
906 case CLOSE+2:
907 case CLOSE+3:
908 case CLOSE+4:
909 case CLOSE+5:
910 case CLOSE+6:
911 case CLOSE+7:
912 case CLOSE+8:
913 case CLOSE+9:
915 register int no;
916 register char *save;
918 no = OP(scan) - CLOSE;
919 save = g->reginput;
921 if (regmatch(g, next)) {
923 * Don't set endp if some later
924 * invocation of the same parentheses
925 * already has.
927 if (g->regendp[no] == NULL)
928 g->regendp[no] = save;
929 return(1);
930 } else
931 return(0);
933 break;
934 case BRANCH: {
935 register char *save;
937 if (OP(next) != BRANCH) /* No choice. */
938 next = OPERAND(scan); /* Avoid recursion. */
939 else {
940 do {
941 save = g->reginput;
942 if (regmatch(g, OPERAND(scan)))
943 return(1);
944 g->reginput = save;
945 scan = regnext(g, scan);
946 } while (scan != NULL && OP(scan) == BRANCH);
947 return(0);
948 /* NOTREACHED */
951 break;
952 case STAR:
953 case PLUS: {
954 register char nextch;
955 register int no;
956 register char *save;
957 register int min;
960 * Lookahead to avoid useless match attempts
961 * when we know what character comes next.
963 nextch = '\0';
964 if (OP(next) == EXACTLY)
965 nextch = *OPERAND(next);
966 min = (OP(scan) == STAR) ? 0 : 1;
967 save = g->reginput;
968 no = regrepeat(g, OPERAND(scan));
969 while (no >= min) {
970 /* If it could work, try it. */
971 if (nextch == '\0' || *g->reginput == nextch)
972 if (regmatch(g, next))
973 return(1);
974 /* Couldn't or didn't -- back up. */
975 no--;
976 g->reginput = save + no;
978 return(0);
980 break;
981 case END:
982 return(1); /* Success! */
983 break;
984 default:
985 printk("<3>Regexp: memory corruption\n");
986 return(0);
987 break;
990 scan = next;
994 * We get here only if there's trouble -- normally "case END" is
995 * the terminating point.
997 printk("<3>Regexp: corrupted pointers\n");
998 return(0);
1002 - regrepeat - repeatedly match something simple, report how many
1004 static int
1005 regrepeat(struct match_globals *g, char *p)
1007 register int count = 0;
1008 register char *scan;
1009 register char *opnd;
1011 scan = g->reginput;
1012 opnd = OPERAND(p);
1013 switch (OP(p)) {
1014 case ANY:
1015 count = strlen(scan);
1016 scan += count;
1017 break;
1018 case EXACTLY:
1019 while (*opnd == *scan) {
1020 count++;
1021 scan++;
1023 break;
1024 case ANYOF:
1025 while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1026 count++;
1027 scan++;
1029 break;
1030 case ANYBUT:
1031 while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1032 count++;
1033 scan++;
1035 break;
1036 default: /* Oh dear. Called inappropriately. */
1037 printk("<3>Regexp: internal foulup\n");
1038 count = 0; /* Best compromise. */
1039 break;
1041 g->reginput = scan;
1043 return(count);
1047 - regnext - dig the "next" pointer out of a node
1049 static char*
1050 regnext(struct match_globals *g, char *p)
1052 register int offset;
1054 if (p == &g->regdummy)
1055 return(NULL);
1057 offset = NEXT(p);
1058 if (offset == 0)
1059 return(NULL);
1061 if (OP(p) == BACK)
1062 return(p-offset);
1063 else
1064 return(p+offset);
1067 #ifdef DEBUG
1069 STATIC char *regprop();
1072 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1074 void
1075 regdump(regexp *r)
1077 register char *s;
1078 register char op = EXACTLY; /* Arbitrary non-END op. */
1079 register char *next;
1080 /* extern char *strchr(); */
1083 s = r->program + 1;
1084 while (op != END) { /* While that wasn't END last time... */
1085 op = OP(s);
1086 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1087 next = regnext(s);
1088 if (next == NULL) /* Next ptr. */
1089 printf("(0)");
1090 else
1091 printf("(%d)", (s-r->program)+(next-s));
1092 s += 3;
1093 if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1094 /* Literal string, where present. */
1095 while (*s != '\0') {
1096 putchar(*s);
1097 s++;
1099 s++;
1101 putchar('\n');
1104 /* Header fields of interest. */
1105 if (r->regstart != '\0')
1106 printf("start `%c' ", r->regstart);
1107 if (r->reganch)
1108 printf("anchored ");
1109 if (r->regmust != NULL)
1110 printf("must have \"%s\"", r->regmust);
1111 printf("\n");
1115 - regprop - printable representation of opcode
1117 static char *
1118 regprop(char *op)
1120 #define BUFLEN 50
1121 register char *p;
1122 static char buf[BUFLEN];
1124 strcpy(buf, ":");
1126 switch (OP(op)) {
1127 case BOL:
1128 p = "BOL";
1129 break;
1130 case EOL:
1131 p = "EOL";
1132 break;
1133 case ANY:
1134 p = "ANY";
1135 break;
1136 case ANYOF:
1137 p = "ANYOF";
1138 break;
1139 case ANYBUT:
1140 p = "ANYBUT";
1141 break;
1142 case BRANCH:
1143 p = "BRANCH";
1144 break;
1145 case EXACTLY:
1146 p = "EXACTLY";
1147 break;
1148 case NOTHING:
1149 p = "NOTHING";
1150 break;
1151 case BACK:
1152 p = "BACK";
1153 break;
1154 case END:
1155 p = "END";
1156 break;
1157 case OPEN+1:
1158 case OPEN+2:
1159 case OPEN+3:
1160 case OPEN+4:
1161 case OPEN+5:
1162 case OPEN+6:
1163 case OPEN+7:
1164 case OPEN+8:
1165 case OPEN+9:
1166 snprintf(buf+strlen(buf),BUFLEN-strlen(buf), "OPEN%d", OP(op)-OPEN);
1167 p = NULL;
1168 break;
1169 case CLOSE+1:
1170 case CLOSE+2:
1171 case CLOSE+3:
1172 case CLOSE+4:
1173 case CLOSE+5:
1174 case CLOSE+6:
1175 case CLOSE+7:
1176 case CLOSE+8:
1177 case CLOSE+9:
1178 snprintf(buf+strlen(buf),BUFLEN-strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1179 p = NULL;
1180 break;
1181 case STAR:
1182 p = "STAR";
1183 break;
1184 case PLUS:
1185 p = "PLUS";
1186 break;
1187 default:
1188 printk("<3>Regexp: corrupted opcode\n");
1189 break;
1191 if (p != NULL)
1192 strncat(buf, p, BUFLEN-strlen(buf));
1193 return(buf);
1195 #endif