1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 ****************************************************************/
25 /* lasciate ogne speranza, voi ch'entrate. */
36 #define HAT (NCHARS+2) /* matches ^ in regular expr */
40 #define type(v) (v)->nobj /* badly overloaded here */
41 #define info(v) (v)->ntype /* badly overloaded here */
42 #define left(v) (v)->narg[0]
43 #define right(v) (v)->narg[1]
44 #define parent(v) (v)->nnext
46 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
47 #define UNARY case STAR: case PLUS: case QUEST:
49 /* encoding in tree Nodes:
50 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL):
51 left is index, right contains value or pointer to value
52 unary (STAR, PLUS, QUEST): left is child, right is null
53 binary (CAT, OR): left and right are children
54 parent contains pointer to parent
62 int rtok
; /* next token in current re */
64 static uschar
*rlxstr
;
65 static uschar
*prestr
; /* current position in current re */
66 static uschar
*lastre
; /* origin of last re */
74 #define NFA 20 /* cache this many dynamic fa's */
76 int nfatab
= 0; /* entries in fatab */
78 fa
*makedfa(const char *s
, int anchor
) /* returns dfa for reg expr s */
84 if (setvec
== 0) { /* first time through any RE */
86 setvec
= (int *) malloc(maxsetvec
* sizeof(int));
87 tmpset
= (int *) malloc(maxsetvec
* sizeof(int));
88 if (setvec
== 0 || tmpset
== 0)
89 overflo("out of space initializing makedfa");
92 if (compile_time
) /* a constant for sure */
93 return mkdfa(s
, anchor
);
94 for (i
= 0; i
< nfatab
; i
++) /* is it there already? */
95 if (fatab
[i
]->anchor
== anchor
96 && strcmp((const char *) fatab
[i
]->restr
, s
) == 0) {
97 fatab
[i
]->use
= now
++;
100 pfa
= mkdfa(s
, anchor
);
101 if (nfatab
< NFA
) { /* room for another */
103 fatab
[nfatab
]->use
= now
++;
107 use
= fatab
[0]->use
; /* replace least-recently used */
109 for (i
= 1; i
< nfatab
; i
++)
110 if (fatab
[i
]->use
< use
) {
120 fa
*mkdfa(const char *s
, int anchor
) /* does the real work of making a dfa */
121 /* anchor = 1 for anchored matches, else 0 */
127 p1
= op2(CAT
, op2(STAR
, op2(ALL
, NIL
, NIL
), NIL
), p
);
128 /* put ALL STAR in front of reg. exp. */
129 p1
= op2(CAT
, p1
, op2(FINAL
, NIL
, NIL
));
130 /* put FINAL after reg. exp. */
133 penter(p1
); /* enter parent pointers and leaf indices */
134 if ((f
= (fa
*) calloc(1, sizeof(fa
) + poscnt
*sizeof(rrow
))) == NULL
)
135 overflo("out of space for fa");
136 f
->accept
= poscnt
-1; /* penter has computed number of positions in re */
137 cfoll(f
, p1
); /* set up follow sets */
139 if ((f
->posns
[0] = (int *) calloc(1, *(f
->re
[0].lfollow
)*sizeof(int))) == NULL
)
140 overflo("out of space in makedfa");
141 if ((f
->posns
[1] = (int *) calloc(1, sizeof(int))) == NULL
)
142 overflo("out of space in makedfa");
144 f
->initstat
= makeinit(f
, anchor
);
146 f
->restr
= (uschar
*) tostring(s
);
150 int makeinit(fa
*f
, int anchor
)
157 k
= *(f
->re
[0].lfollow
);
159 if ((f
->posns
[2] = (int *) calloc(1, (k
+1)*sizeof(int))) == NULL
)
160 overflo("out of space in makeinit");
161 for (i
=0; i
<= k
; i
++) {
162 (f
->posns
[2])[i
] = (f
->re
[0].lfollow
)[i
];
164 if ((f
->posns
[2])[1] == f
->accept
)
166 for (i
=0; i
< NCHARS
; i
++)
167 f
->gototab
[2][i
] = 0;
168 f
->curstat
= cgoto(f
, 2, HAT
);
170 *f
->posns
[2] = k
-1; /* leave out position 0 */
171 for (i
=0; i
< k
; i
++) {
172 (f
->posns
[0])[i
] = (f
->posns
[2])[i
];
175 f
->out
[0] = f
->out
[2];
177 --(*f
->posns
[f
->curstat
]);
182 void penter(Node
*p
) /* set up parent pointers and leaf indices */
198 parent(right(p
)) = p
;
200 default: /* can't happen */
201 FATAL("can't happen: unknown type %d in penter", type(p
));
206 void freetr(Node
*p
) /* free parse tree */
222 default: /* can't happen */
223 FATAL("can't happen: unknown type %d in freetr", type(p
));
228 /* in the parsing of regular expressions, metacharacters like . have */
229 /* to be seen literally; \056 is not a metacharacter. */
231 int hexstr(char **pp
) /* find and eval hex string at pp, return new p */
232 { /* only pick up one 8-bit byte (2 chars) */
237 for (i
= 0, p
= (uschar
*) *pp
; i
< 2 && isxdigit(*p
); i
++, p
++) {
239 n
= 16 * n
+ *p
- '0';
240 else if (*p
>= 'a' && *p
<= 'f')
241 n
= 16 * n
+ *p
- 'a' + 10;
242 else if (*p
>= 'A' && *p
<= 'F')
243 n
= 16 * n
+ *p
- 'A' + 10;
249 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
251 int quoted(char **pp
) /* pick up next thing after a \\ */
252 /* and increment *pp */
257 if ((c
= *p
++) == 't')
269 else if (c
== 'x') { /* hexadecimal goo follows */
270 c
= hexstr(&p
); /* this adds a null if number is invalid */
271 } else if (isoctdigit(c
)) { /* \d \dd \ddd */
273 if (isoctdigit(*p
)) {
274 n
= 8 * n
+ *p
++ - '0';
276 n
= 8 * n
+ *p
++ - '0';
285 char *cclenter(const char *argp
) /* add a character class */
288 uschar
*p
= (uschar
*) argp
;
290 static uschar
*buf
= 0;
291 static int bufsz
= 100;
294 if (buf
== 0 && (buf
= (uschar
*) malloc(bufsz
)) == NULL
)
295 FATAL("out of space for character class [%.10s...] 1", p
);
297 for (i
= 0; (c
= *p
++) != 0; ) {
299 c
= quoted((char **) &p
);
300 } else if (c
== '-' && i
> 0 && bp
[-1] != 0) {
305 c2
= quoted((char **) &p
);
306 if (c
> c2
) { /* empty; ignore */
312 if (!adjbuf((char **) &buf
, &bufsz
, bp
-buf
+2, 100, (char **) &bp
, 0))
313 FATAL("out of space for character class [%.10s...] 2", p
);
320 if (!adjbuf((char **) &buf
, &bufsz
, bp
-buf
+2, 100, (char **) &bp
, 0))
321 FATAL("out of space for character class [%.10s...] 3", p
);
326 dprintf( ("cclenter: in = |%s|, out = |%s|\n", op
, buf
) );
328 return (char *) tostring((char *) buf
);
331 void overflo(const char *s
)
333 FATAL("regular expression too big: %.30s...", s
);
336 void cfoll(fa
*f
, Node
*v
) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
343 f
->re
[info(v
)].ltype
= type(v
);
344 f
->re
[info(v
)].lval
.np
= right(v
);
345 while (f
->accept
>= maxsetvec
) { /* guessing here! */
347 setvec
= (int *) realloc(setvec
, maxsetvec
* sizeof(int));
348 tmpset
= (int *) realloc(tmpset
, maxsetvec
* sizeof(int));
349 if (setvec
== 0 || tmpset
== 0)
350 overflo("out of space in cfoll()");
352 for (i
= 0; i
<= f
->accept
; i
++)
355 follow(v
); /* computes setvec and setcnt */
356 if ((p
= (int *) calloc(1, (setcnt
+1)*sizeof(int))) == NULL
)
357 overflo("out of space building follow set");
358 f
->re
[info(v
)].lfollow
= p
;
360 for (i
= f
->accept
; i
>= 0; i
--)
372 default: /* can't happen */
373 FATAL("can't happen: unknown type %d in cfoll", type(v
));
377 int first(Node
*p
) /* collects initially active leaves of p into setvec */
378 /* returns 1 if p matches empty string */
384 lp
= info(p
); /* look for high-water mark of subscripts */
385 while (setcnt
>= maxsetvec
|| lp
>= maxsetvec
) { /* guessing here! */
387 setvec
= (int *) realloc(setvec
, maxsetvec
* sizeof(int));
388 tmpset
= (int *) realloc(tmpset
, maxsetvec
* sizeof(int));
389 if (setvec
== 0 || tmpset
== 0)
390 overflo("out of space in first()");
392 if (setvec
[lp
] != 1) {
396 if (type(p
) == CCL
&& (*(char *) right(p
)) == '\0')
397 return(0); /* empty CCL */
400 if (first(left(p
)) == 0) return(0);
407 if (first(left(p
)) == 0 && first(right(p
)) == 0) return(0);
411 if (first(left(p
)) == 0 || b
== 0) return(0);
414 FATAL("can't happen: unknown type %d in first", type(p
)); /* can't happen */
418 void follow(Node
*v
) /* collects leaves that can follow v into setvec */
422 if (type(v
) == FINAL
)
438 if (v
== left(p
)) { /* v is left child of p */
439 if (first(right(p
)) == 0) {
443 } else /* v is right child */
449 int member(int c
, const char *sarg
) /* is c in s? */
451 uschar
*s
= (uschar
*) sarg
;
459 int match(fa
*f
, const char *p0
) /* shortest match ? */
462 uschar
*p
= (uschar
*) p0
;
464 s
= f
->reset
? makeinit(f
,0) : f
->initstat
;
469 if ((ns
= f
->gototab
[s
][*p
]) != 0)
479 int pmatch(fa
*f
, const char *p0
) /* longest match, for sub */
482 uschar
*p
= (uschar
*) p0
;
486 /* s = f->reset ? makeinit(f,1) : f->initstat; */
488 f
->initstat
= s
= makeinit(f
,1);
497 if (f
->out
[s
]) /* final state */
500 if ((ns
= f
->gototab
[s
][*q
]) != 0)
504 if (s
== 1) { /* no transition */
510 goto nextin
; /* no match */
514 patlen
= q
-p
-1; /* don't count $ */
522 for (i
= 2; i
<= f
->curstat
; i
++)
525 if ((f
->posns
[2] = (int *) calloc(1, (k
+1)*sizeof(int))) == NULL
)
526 overflo("out of space in pmatch");
527 for (i
= 0; i
<= k
; i
++)
528 (f
->posns
[2])[i
] = (f
->posns
[0])[i
];
529 f
->initstat
= f
->curstat
= 2;
530 f
->out
[2] = f
->out
[0];
531 for (i
= 0; i
< NCHARS
; i
++)
532 f
->gototab
[2][i
] = 0;
538 int nematch(fa
*f
, const char *p0
) /* non-empty match, for sub */
541 uschar
*p
= (uschar
*) p0
;
545 /* s = f->reset ? makeinit(f,1) : f->initstat; */
547 f
->initstat
= s
= makeinit(f
,1);
555 if (f
->out
[s
]) /* final state */
558 if ((ns
= f
->gototab
[s
][*q
]) != 0)
562 if (s
== 1) { /* no transition */
567 goto nnextin
; /* no nonempty match */
571 patlen
= q
-p
-1; /* don't count $ */
579 for (i
= 2; i
<= f
->curstat
; i
++)
582 if ((f
->posns
[2] = (int *) calloc(1, (k
+1)*sizeof(int))) == NULL
)
583 overflo("out of state space");
584 for (i
= 0; i
<= k
; i
++)
585 (f
->posns
[2])[i
] = (f
->posns
[0])[i
];
586 f
->initstat
= f
->curstat
= 2;
587 f
->out
[2] = f
->out
[0];
588 for (i
= 0; i
< NCHARS
; i
++)
589 f
->gototab
[2][i
] = 0;
596 Node
*reparse(const char *p
) /* parses regular expression pointed to by p */
597 { /* uses relex() to scan regular expression */
600 dprintf( ("reparse <%s>\n", p
) );
601 lastre
= prestr
= (uschar
*) p
; /* prestr points to string to be parsed */
603 /* GNU compatibility: an empty regexp matches anything */
605 /* FATAL("empty regular expression"); previous */
606 return(op2(ALL
, NIL
, NIL
));
609 FATAL("syntax error in regular expression %s at %s", lastre
, prestr
);
613 Node
*regexp(void) /* top-level parse of reg expr */
615 return (alt(concat(primary())));
624 np
= op2(CHAR
, NIL
, itonp(rlxval
));
629 return (unary(op2(ALL
, NIL
, NIL
)));
632 return (unary(op2(DOT
, NIL
, NIL
)));
634 np
= op2(CCL
, NIL
, (Node
*) cclenter((char *) rlxstr
));
638 np
= op2(NCCL
, NIL
, (Node
*) cclenter((char *) rlxstr
));
643 return (unary(op2(CHAR
, NIL
, itonp(HAT
))));
646 return (unary(op2(CHAR
, NIL
, NIL
)));
649 if (rtok
== ')') { /* special pleading for () */
651 return unary(op2(CCL
, NIL
, (Node
*) tostring("")));
659 FATAL("syntax error in regular expression %s at %s", lastre
, prestr
);
661 FATAL("illegal primary in regular expression %s at %s", lastre
, prestr
);
663 return 0; /*NOTREACHED*/
666 Node
*concat(Node
*np
)
669 case CHAR
: case DOT
: case ALL
: case CCL
: case NCCL
: case '$': case '(':
670 return (concat(op2(CAT
, np
, primary())));
679 return (alt(op2(OR
, np
, concat(primary()))));
684 Node
*unary(Node
*np
)
689 return (unary(op2(STAR
, np
, NIL
)));
692 return (unary(op2(PLUS
, np
, NIL
)));
695 return (unary(op2(QUEST
, np
, NIL
)));
702 * Character class definitions conformant to the POSIX locale as
703 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
704 * and operating character sets are both ASCII (ISO646) or supersets
707 * Note that to avoid overflowing the temporary buffer used in
708 * relex(), the expanded character class (prior to range expansion)
709 * must be less than twice the size of their full name.
712 /* Because isblank doesn't show up in any of the header files on any
713 * system i use, it's defined here. if some other locale has a richer
714 * definition of "blank", define HAS_ISBLANK and provide your own
716 * the parentheses here are an attempt to find a path through the maze
717 * of macro definition and/or function and/or version provided. thanks
718 * to nelson beebe for the suggestion; let's see if it works everywhere.
725 return c
==' ' || c
=='\t';
735 { "alnum", 5, isalnum
},
736 { "alpha", 5, isalpha
},
737 { "blank", 5, isblank
},
738 { "cntrl", 5, iscntrl
},
739 { "digit", 5, isdigit
},
740 { "graph", 5, isgraph
},
741 { "lower", 5, islower
},
742 { "print", 5, isprint
},
743 { "punct", 5, ispunct
},
744 { "space", 5, isspace
},
745 { "upper", 5, isupper
},
746 { "xdigit", 6, isxdigit
},
751 int relex(void) /* lexical analyzer for reparse */
755 static uschar
*buf
= 0;
756 static int bufsz
= 100;
758 struct charclass
*cc
;
761 switch (c
= *prestr
++) {
763 case '*': return STAR
;
764 case '+': return PLUS
;
765 case '?': return QUEST
;
766 case '.': return DOT
;
767 case '\0': prestr
--; return '\0';
774 rlxval
= quoted((char **) &prestr
);
780 if (buf
== 0 && (buf
= (uschar
*) malloc(bufsz
)) == NULL
)
781 FATAL("out of space in reg expr %.10s..", lastre
);
783 if (*prestr
== '^') {
789 n
= 2 * strlen((const char *) prestr
)+1;
790 if (!adjbuf((char **) &buf
, &bufsz
, n
, n
, (char **) &bp
, 0))
791 FATAL("out of space for reg expr %.10s...", lastre
);
793 if ((c
= *prestr
++) == '\\') {
795 if ((c
= *prestr
++) == '\0')
796 FATAL("nonterminated character class %.20s...", lastre
);
798 /* } else if (c == '\n') { */
799 /* FATAL("newline in character class %.20s...", lastre); */
800 } else if (c
== '[' && *prestr
== ':') {
801 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
802 for (cc
= charclasses
; cc
->cc_name
; cc
++)
803 if (strncmp((const char *) prestr
+ 1, (const char *) cc
->cc_name
, cc
->cc_namelen
) == 0)
805 if (cc
->cc_name
!= NULL
&& prestr
[1 + cc
->cc_namelen
] == ':' &&
806 prestr
[2 + cc
->cc_namelen
] == ']') {
807 prestr
+= cc
->cc_namelen
+ 3;
808 for (i
= 0; i
< NCHARS
; i
++) {
809 if (!adjbuf((char **) &buf
, &bufsz
, bp
-buf
+1, 100, (char **) &bp
, 0))
810 FATAL("out of space for reg expr %.10s...", lastre
);
811 if (cc
->cc_func(i
)) {
818 } else if (c
== '\0') {
819 FATAL("nonterminated character class %.20s", lastre
);
820 } else if (bp
== buf
) { /* 1st char is special */
822 } else if (c
== ']') {
824 rlxstr
= (uschar
*) tostring((char *) buf
);
835 int cgoto(fa
*f
, int s
, int c
)
840 assert(c
== HAT
|| c
< NCHARS
);
841 while (f
->accept
>= maxsetvec
) { /* guessing here! */
843 setvec
= (int *) realloc(setvec
, maxsetvec
* sizeof(int));
844 tmpset
= (int *) realloc(tmpset
, maxsetvec
* sizeof(int));
845 if (setvec
== 0 || tmpset
== 0)
846 overflo("out of space in cgoto()");
848 for (i
= 0; i
<= f
->accept
; i
++)
851 /* compute positions of gototab[s,c] into setvec */
853 for (i
= 1; i
<= *p
; i
++) {
854 if ((k
= f
->re
[p
[i
]].ltype
) != FINAL
) {
855 if ((k
== CHAR
&& c
== ptoi(f
->re
[p
[i
]].lval
.np
))
856 || (k
== DOT
&& c
!= 0 && c
!= HAT
)
857 || (k
== ALL
&& c
!= 0)
858 || (k
== CCL
&& member(c
, (char *) f
->re
[p
[i
]].lval
.up
))
859 || (k
== NCCL
&& !member(c
, (char *) f
->re
[p
[i
]].lval
.up
) && c
!= 0 && c
!= HAT
)) {
860 q
= f
->re
[p
[i
]].lfollow
;
861 for (j
= 1; j
<= *q
; j
++) {
862 if (q
[j
] >= maxsetvec
) {
864 setvec
= (int *) realloc(setvec
, maxsetvec
* sizeof(int));
865 tmpset
= (int *) realloc(setvec
, maxsetvec
* sizeof(int));
866 if (setvec
== 0 || tmpset
== 0)
867 overflo("cgoto overflow");
869 if (setvec
[q
[j
]] == 0) {
877 /* determine if setvec is a previous state */
880 for (i
= f
->accept
; i
>= 0; i
--)
884 /* tmpset == previous state? */
885 for (i
= 1; i
<= f
->curstat
; i
++) {
887 if ((k
= tmpset
[0]) != p
[0])
889 for (j
= 1; j
<= k
; j
++)
890 if (tmpset
[j
] != p
[j
])
892 /* setvec is state i */
893 f
->gototab
[s
][c
] = i
;
898 /* add tmpset to current set of states */
899 if (f
->curstat
>= NSTATES
-1) {
902 for (i
= 2; i
< NSTATES
; i
++)
906 for (i
= 0; i
< NCHARS
; i
++)
907 f
->gototab
[f
->curstat
][i
] = 0;
908 xfree(f
->posns
[f
->curstat
]);
909 if ((p
= (int *) calloc(1, (setcnt
+1)*sizeof(int))) == NULL
)
910 overflo("out of space in cgoto");
912 f
->posns
[f
->curstat
] = p
;
913 f
->gototab
[s
][c
] = f
->curstat
;
914 for (i
= 0; i
<= setcnt
; i
++)
916 if (setvec
[f
->accept
])
917 f
->out
[f
->curstat
] = 1;
919 f
->out
[f
->curstat
] = 0;
924 void freefa(fa
*f
) /* free a finite automaton */
930 for (i
= 0; i
<= f
->curstat
; i
++)
932 for (i
= 0; i
<= f
->accept
; i
++) {
933 xfree(f
->re
[i
].lfollow
);
934 if (f
->re
[i
].ltype
== CCL
|| f
->re
[i
].ltype
== NCCL
)
935 xfree((f
->re
[i
].lval
.np
));