1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 ****************************************************************/
25 /* lasciate ogne speranza, voi ch'intrate. */
36 #define HAT (NCHARS+2) /* matches ^ in regular expr */
40 #define type(v) (v)->nobj /* badly overloaded here */
41 #define info(v) (v)->ntype /* badly overloaded here */
42 #define left(v) (v)->narg[0]
43 #define right(v) (v)->narg[1]
44 #define parent(v) (v)->nnext
46 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
47 #define ELEAF case EMPTYRE: /* empty string in regexp */
48 #define UNARY case STAR: case PLUS: case QUEST:
50 /* encoding in tree Nodes:
51 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
52 left is index, right contains value or pointer to value
53 unary (STAR, PLUS, QUEST): left is child, right is null
54 binary (CAT, OR): left and right are children
55 parent contains pointer to parent
63 int rtok
; /* next token in current re */
65 static uschar
*rlxstr
;
66 static uschar
*prestr
; /* current position in current re */
67 static uschar
*lastre
; /* origin of last re */
75 #define NFA 20 /* cache this many dynamic fa's */
77 int nfatab
= 0; /* entries in fatab */
79 fa
*makedfa(const char *s
, int anchor
) /* returns dfa for reg expr s */
85 if (setvec
== NULL
) { /* first time through any RE */
87 setvec
= (int *) malloc(maxsetvec
* sizeof(int));
88 tmpset
= (int *) malloc(maxsetvec
* sizeof(int));
89 if (setvec
== NULL
|| tmpset
== NULL
)
90 overflo("out of space initializing makedfa");
93 if (compile_time
) /* a constant for sure */
94 return mkdfa(s
, anchor
);
95 for (i
= 0; i
< nfatab
; i
++) /* is it there already? */
96 if (fatab
[i
]->anchor
== anchor
97 && strcmp((const char *) fatab
[i
]->restr
, s
) == 0) {
98 fatab
[i
]->use
= now
++;
101 pfa
= mkdfa(s
, anchor
);
102 if (nfatab
< NFA
) { /* room for another */
104 fatab
[nfatab
]->use
= now
++;
108 use
= fatab
[0]->use
; /* replace least-recently used */
110 for (i
= 1; i
< nfatab
; i
++)
111 if (fatab
[i
]->use
< use
) {
121 fa
*mkdfa(const char *s
, int anchor
) /* does the real work of making a dfa */
122 /* anchor = 1 for anchored matches, else 0 */
128 p1
= op2(CAT
, op2(STAR
, op2(ALL
, NIL
, NIL
), NIL
), p
);
129 /* put ALL STAR in front of reg. exp. */
130 p1
= op2(CAT
, p1
, op2(FINAL
, NIL
, NIL
));
131 /* put FINAL after reg. exp. */
134 penter(p1
); /* enter parent pointers and leaf indices */
135 if ((f
= (fa
*) calloc(1, sizeof(fa
) + poscnt
*sizeof(rrow
))) == NULL
)
136 overflo("out of space for fa");
137 f
->accept
= poscnt
-1; /* penter has computed number of positions in re */
138 cfoll(f
, p1
); /* set up follow sets */
140 if ((f
->posns
[0] = (int *) calloc(*(f
->re
[0].lfollow
), sizeof(int))) == NULL
)
141 overflo("out of space in makedfa");
142 if ((f
->posns
[1] = (int *) calloc(1, sizeof(int))) == NULL
)
143 overflo("out of space in makedfa");
145 f
->initstat
= makeinit(f
, anchor
);
147 f
->restr
= (uschar
*) tostring(s
);
151 int makeinit(fa
*f
, int anchor
)
158 k
= *(f
->re
[0].lfollow
);
160 if ((f
->posns
[2] = (int *) calloc(k
+1, sizeof(int))) == NULL
)
161 overflo("out of space in makeinit");
162 for (i
=0; i
<= k
; i
++) {
163 (f
->posns
[2])[i
] = (f
->re
[0].lfollow
)[i
];
165 if ((f
->posns
[2])[1] == f
->accept
)
167 for (i
=0; i
< NCHARS
; i
++)
168 f
->gototab
[2][i
] = 0;
169 f
->curstat
= cgoto(f
, 2, HAT
);
171 *f
->posns
[2] = k
-1; /* leave out position 0 */
172 for (i
=0; i
< k
; i
++) {
173 (f
->posns
[0])[i
] = (f
->posns
[2])[i
];
176 f
->out
[0] = f
->out
[2];
178 --(*f
->posns
[f
->curstat
]);
183 void penter(Node
*p
) /* set up parent pointers and leaf indices */
200 parent(right(p
)) = p
;
202 default: /* can't happen */
203 FATAL("can't happen: unknown type %d in penter", type(p
));
208 void freetr(Node
*p
) /* free parse tree */
225 default: /* can't happen */
226 FATAL("can't happen: unknown type %d in freetr", type(p
));
231 /* in the parsing of regular expressions, metacharacters like . have */
232 /* to be seen literally; \056 is not a metacharacter. */
234 int hexstr(uschar
**pp
) /* find and eval hex string at pp, return new p */
235 { /* only pick up one 8-bit byte (2 chars) */
240 for (i
= 0, p
= (uschar
*) *pp
; i
< 2 && isxdigit(*p
); i
++, p
++) {
242 n
= 16 * n
+ *p
- '0';
243 else if (*p
>= 'a' && *p
<= 'f')
244 n
= 16 * n
+ *p
- 'a' + 10;
245 else if (*p
>= 'A' && *p
<= 'F')
246 n
= 16 * n
+ *p
- 'A' + 10;
252 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
254 int quoted(uschar
**pp
) /* pick up next thing after a \\ */
255 /* and increment *pp */
260 if ((c
= *p
++) == 't')
272 else if (c
== 'x') { /* hexadecimal goo follows */
273 c
= hexstr(&p
); /* this adds a null if number is invalid */
274 } else if (isoctdigit(c
)) { /* \d \dd \ddd */
276 if (isoctdigit(*p
)) {
277 n
= 8 * n
+ *p
++ - '0';
279 n
= 8 * n
+ *p
++ - '0';
288 static int collate_range_cmp(int a
, int b
)
292 if ((uschar
)a
== (uschar
)b
)
296 return (strcoll(s
[0], s
[1]));
299 char *cclenter(const char *argp
) /* add a character class */
303 uschar
*p
= (uschar
*) argp
;
305 static uschar
*buf
= NULL
;
306 static int bufsz
= 100;
309 if (buf
== NULL
&& (buf
= (uschar
*) malloc(bufsz
)) == NULL
)
310 FATAL("out of space for character class [%.10s...] 1", p
);
312 for (i
= 0; (c
= *p
++) != 0; ) {
315 } else if (c
== '-' && i
> 0 && bp
[-1] != 0) {
321 if (collate_range_cmp(c
, c2
) > 0) {
326 for (j
= 0; j
< NCHARS
; j
++) {
327 if ((collate_range_cmp(c
, j
) > 0) ||
328 collate_range_cmp(j
, c2
) > 0)
330 if (!adjbuf((char **) &buf
, &bufsz
, bp
-buf
+2, 100, (char **) &bp
, "cclenter1"))
331 FATAL("out of space for character class [%.10s...] 2", p
);
338 if (!adjbuf((char **) &buf
, &bufsz
, bp
-buf
+2, 100, (char **) &bp
, "cclenter2"))
339 FATAL("out of space for character class [%.10s...] 3", p
);
344 dprintf( ("cclenter: in = |%s|, out = |%s|\n", op
, buf
) );
346 return (char *) tostring((char *) buf
);
349 void overflo(const char *s
)
351 FATAL("regular expression too big: %.30s...", s
);
354 void cfoll(fa
*f
, Node
*v
) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
362 f
->re
[info(v
)].ltype
= type(v
);
363 f
->re
[info(v
)].lval
.np
= right(v
);
364 while (f
->accept
>= maxsetvec
) { /* guessing here! */
366 setvec
= (int *) realloc(setvec
, maxsetvec
* sizeof(int));
367 tmpset
= (int *) realloc(tmpset
, maxsetvec
* sizeof(int));
368 if (setvec
== NULL
|| tmpset
== NULL
)
369 overflo("out of space in cfoll()");
371 for (i
= 0; i
<= f
->accept
; i
++)
374 follow(v
); /* computes setvec and setcnt */
375 if ((p
= (int *) calloc(setcnt
+1, sizeof(int))) == NULL
)
376 overflo("out of space building follow set");
377 f
->re
[info(v
)].lfollow
= p
;
379 for (i
= f
->accept
; i
>= 0; i
--)
391 default: /* can't happen */
392 FATAL("can't happen: unknown type %d in cfoll", type(v
));
396 int first(Node
*p
) /* collects initially active leaves of p into setvec */
397 /* returns 0 if p matches empty string */
404 lp
= info(p
); /* look for high-water mark of subscripts */
405 while (setcnt
>= maxsetvec
|| lp
>= maxsetvec
) { /* guessing here! */
407 setvec
= (int *) realloc(setvec
, maxsetvec
* sizeof(int));
408 tmpset
= (int *) realloc(tmpset
, maxsetvec
* sizeof(int));
409 if (setvec
== NULL
|| tmpset
== NULL
)
410 overflo("out of space in first()");
412 if (type(p
) == EMPTYRE
) {
416 if (setvec
[lp
] != 1) {
420 if (type(p
) == CCL
&& (*(char *) right(p
)) == '\0')
421 return(0); /* empty CCL */
424 if (first(left(p
)) == 0) return(0);
431 if (first(left(p
)) == 0 && first(right(p
)) == 0) return(0);
435 if (first(left(p
)) == 0 || b
== 0) return(0);
438 FATAL("can't happen: unknown type %d in first", type(p
)); /* can't happen */
442 void follow(Node
*v
) /* collects leaves that can follow v into setvec */
446 if (type(v
) == FINAL
)
462 if (v
== left(p
)) { /* v is left child of p */
463 if (first(right(p
)) == 0) {
467 } else /* v is right child */
473 int member(int c
, const char *sarg
) /* is c in s? */
475 uschar
*s
= (uschar
*) sarg
;
483 int match(fa
*f
, const char *p0
) /* shortest match ? */
486 uschar
*p
= (uschar
*) p0
;
488 s
= f
->reset
? makeinit(f
,0) : f
->initstat
;
492 /* assert(*p < NCHARS); */
493 if ((ns
= f
->gototab
[s
][*p
]) != 0)
503 int pmatch(fa
*f
, const char *p0
) /* longest match, for sub */
506 uschar
*p
= (uschar
*) p0
;
510 /* s = f->reset ? makeinit(f,1) : f->initstat; */
512 f
->initstat
= s
= makeinit(f
,1);
521 if (f
->out
[s
]) /* final state */
523 /* assert(*q < NCHARS); */
524 if ((ns
= f
->gototab
[s
][*q
]) != 0)
528 if (s
== 1) { /* no transition */
534 goto nextin
; /* no match */
538 patlen
= q
-p
-1; /* don't count $ */
546 for (i
= 2; i
<= f
->curstat
; i
++)
549 if ((f
->posns
[2] = (int *) calloc(k
+1, sizeof(int))) == NULL
)
550 overflo("out of space in pmatch");
551 for (i
= 0; i
<= k
; i
++)
552 (f
->posns
[2])[i
] = (f
->posns
[0])[i
];
553 f
->initstat
= f
->curstat
= 2;
554 f
->out
[2] = f
->out
[0];
555 for (i
= 0; i
< NCHARS
; i
++)
556 f
->gototab
[2][i
] = 0;
562 int nematch(fa
*f
, const char *p0
) /* non-empty match, for sub */
565 uschar
*p
= (uschar
*) p0
;
569 /* s = f->reset ? makeinit(f,1) : f->initstat; */
571 f
->initstat
= s
= makeinit(f
,1);
579 if (f
->out
[s
]) /* final state */
581 /* assert(*q < NCHARS); */
582 if ((ns
= f
->gototab
[s
][*q
]) != 0)
586 if (s
== 1) { /* no transition */
591 goto nnextin
; /* no nonempty match */
595 patlen
= q
-p
-1; /* don't count $ */
603 for (i
= 2; i
<= f
->curstat
; i
++)
606 if ((f
->posns
[2] = (int *) calloc(k
+1, sizeof(int))) == NULL
)
607 overflo("out of state space");
608 for (i
= 0; i
<= k
; i
++)
609 (f
->posns
[2])[i
] = (f
->posns
[0])[i
];
610 f
->initstat
= f
->curstat
= 2;
611 f
->out
[2] = f
->out
[0];
612 for (i
= 0; i
< NCHARS
; i
++)
613 f
->gototab
[2][i
] = 0;
620 Node
*reparse(const char *p
) /* parses regular expression pointed to by p */
621 { /* uses relex() to scan regular expression */
624 dprintf( ("reparse <%s>\n", p
) );
625 lastre
= prestr
= (uschar
*) p
; /* prestr points to string to be parsed */
627 /* GNU compatibility: an empty regexp matches anything */
629 /* FATAL("empty regular expression"); previous */
630 return(op2(EMPTYRE
, NIL
, NIL
));
634 FATAL("syntax error in regular expression %s at %s", lastre
, prestr
);
638 Node
*regexp(void) /* top-level parse of reg expr */
640 return (alt(concat(primary())));
649 np
= op2(CHAR
, NIL
, itonp(rlxval
));
654 return (unary(op2(ALL
, NIL
, NIL
)));
657 return (unary(op2(ALL
, NIL
, NIL
)));
660 return (unary(op2(DOT
, NIL
, NIL
)));
662 np
= op2(CCL
, NIL
, (Node
*) cclenter((char *) rlxstr
));
666 np
= op2(NCCL
, NIL
, (Node
*) cclenter((char *) rlxstr
));
671 return (unary(op2(CHAR
, NIL
, itonp(HAT
))));
674 return (unary(op2(CHAR
, NIL
, NIL
)));
677 if (rtok
== ')') { /* special pleading for () */
679 return unary(op2(CCL
, NIL
, (Node
*) tostring("")));
687 FATAL("syntax error in regular expression %s at %s", lastre
, prestr
);
689 FATAL("illegal primary in regular expression %s at %s", lastre
, prestr
);
691 return 0; /*NOTREACHED*/
694 Node
*concat(Node
*np
)
697 case CHAR
: case DOT
: case ALL
: case EMPTYRE
: case CCL
: case NCCL
: case '$': case '(':
698 return (concat(op2(CAT
, np
, primary())));
707 return (alt(op2(OR
, np
, concat(primary()))));
712 Node
*unary(Node
*np
)
717 return (unary(op2(STAR
, np
, NIL
)));
720 return (unary(op2(PLUS
, np
, NIL
)));
723 return (unary(op2(QUEST
, np
, NIL
)));
730 * Character class definitions conformant to the POSIX locale as
731 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
732 * and operating character sets are both ASCII (ISO646) or supersets
735 * Note that to avoid overflowing the temporary buffer used in
736 * relex(), the expanded character class (prior to range expansion)
737 * must be less than twice the size of their full name.
740 /* Because isblank doesn't show up in any of the header files on any
741 * system i use, it's defined here. if some other locale has a richer
742 * definition of "blank", define HAS_ISBLANK and provide your own
744 * the parentheses here are an attempt to find a path through the maze
745 * of macro definition and/or function and/or version provided. thanks
746 * to nelson beebe for the suggestion; let's see if it works everywhere.
749 /* #define HAS_ISBLANK */
752 int (xisblank
)(int c
)
754 return c
==' ' || c
=='\t';
764 { "alnum", 5, isalnum
},
765 { "alpha", 5, isalpha
},
767 { "blank", 5, isspace
}, /* was isblank */
769 { "blank", 5, isblank
},
771 { "cntrl", 5, iscntrl
},
772 { "digit", 5, isdigit
},
773 { "graph", 5, isgraph
},
774 { "lower", 5, islower
},
775 { "print", 5, isprint
},
776 { "punct", 5, ispunct
},
777 { "space", 5, isspace
},
778 { "upper", 5, isupper
},
779 { "xdigit", 6, isxdigit
},
784 int relex(void) /* lexical analyzer for reparse */
788 static uschar
*buf
= NULL
;
789 static int bufsz
= 100;
791 struct charclass
*cc
;
794 switch (c
= *prestr
++) {
796 case '*': return STAR
;
797 case '+': return PLUS
;
798 case '?': return QUEST
;
799 case '.': return DOT
;
800 case '\0': prestr
--; return '\0';
807 rlxval
= quoted(&prestr
);
813 if (buf
== NULL
&& (buf
= (uschar
*) malloc(bufsz
)) == NULL
)
814 FATAL("out of space in reg expr %.10s..", lastre
);
816 if (*prestr
== '^') {
822 n
= 2 * strlen((const char *) prestr
)+1;
823 if (!adjbuf((char **) &buf
, &bufsz
, n
, n
, (char **) &bp
, "relex1"))
824 FATAL("out of space for reg expr %.10s...", lastre
);
826 if ((c
= *prestr
++) == '\\') {
828 if ((c
= *prestr
++) == '\0')
829 FATAL("nonterminated character class %.20s...", lastre
);
831 /* } else if (c == '\n') { */
832 /* FATAL("newline in character class %.20s...", lastre); */
833 } else if (c
== '[' && *prestr
== ':') {
834 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
835 for (cc
= charclasses
; cc
->cc_name
; cc
++)
836 if (strncmp((const char *) prestr
+ 1, (const char *) cc
->cc_name
, cc
->cc_namelen
) == 0)
838 if (cc
->cc_name
!= NULL
&& prestr
[1 + cc
->cc_namelen
] == ':' &&
839 prestr
[2 + cc
->cc_namelen
] == ']') {
840 prestr
+= cc
->cc_namelen
+ 3;
841 for (i
= 1; i
< NCHARS
; i
++) {
842 if (!adjbuf((char **) &buf
, &bufsz
, bp
-buf
+1, 100, (char **) &bp
, "relex2"))
843 FATAL("out of space for reg expr %.10s...", lastre
);
844 if (cc
->cc_func(i
)) {
851 } else if (c
== '\0') {
852 FATAL("nonterminated character class %.20s", lastre
);
853 } else if (bp
== buf
) { /* 1st char is special */
855 } else if (c
== ']') {
857 rlxstr
= (uschar
*) tostring((char *) buf
);
868 int cgoto(fa
*f
, int s
, int c
)
873 assert(c
== HAT
|| c
< NCHARS
);
874 while (f
->accept
>= maxsetvec
) { /* guessing here! */
876 setvec
= (int *) realloc(setvec
, maxsetvec
* sizeof(int));
877 tmpset
= (int *) realloc(tmpset
, maxsetvec
* sizeof(int));
878 if (setvec
== NULL
|| tmpset
== NULL
)
879 overflo("out of space in cgoto()");
881 for (i
= 0; i
<= f
->accept
; i
++)
884 /* compute positions of gototab[s,c] into setvec */
886 for (i
= 1; i
<= *p
; i
++) {
887 if ((k
= f
->re
[p
[i
]].ltype
) != FINAL
) {
888 if ((k
== CHAR
&& c
== ptoi(f
->re
[p
[i
]].lval
.np
))
889 || (k
== DOT
&& c
!= 0 && c
!= HAT
)
890 || (k
== ALL
&& c
!= 0)
891 || (k
== EMPTYRE
&& c
!= 0)
892 || (k
== CCL
&& member(c
, (char *) f
->re
[p
[i
]].lval
.up
))
893 || (k
== NCCL
&& !member(c
, (char *) f
->re
[p
[i
]].lval
.up
) && c
!= 0 && c
!= HAT
)) {
894 q
= f
->re
[p
[i
]].lfollow
;
895 for (j
= 1; j
<= *q
; j
++) {
896 if (q
[j
] >= maxsetvec
) {
898 setvec
= (int *) realloc(setvec
, maxsetvec
* sizeof(int));
899 tmpset
= (int *) realloc(tmpset
, maxsetvec
* sizeof(int));
900 if (setvec
== NULL
|| tmpset
== NULL
)
901 overflo("cgoto overflow");
903 if (setvec
[q
[j
]] == 0) {
911 /* determine if setvec is a previous state */
914 for (i
= f
->accept
; i
>= 0; i
--)
918 /* tmpset == previous state? */
919 for (i
= 1; i
<= f
->curstat
; i
++) {
921 if ((k
= tmpset
[0]) != p
[0])
923 for (j
= 1; j
<= k
; j
++)
924 if (tmpset
[j
] != p
[j
])
926 /* setvec is state i */
927 f
->gototab
[s
][c
] = i
;
932 /* add tmpset to current set of states */
933 if (f
->curstat
>= NSTATES
-1) {
936 for (i
= 2; i
< NSTATES
; i
++)
940 for (i
= 0; i
< NCHARS
; i
++)
941 f
->gototab
[f
->curstat
][i
] = 0;
942 xfree(f
->posns
[f
->curstat
]);
943 if ((p
= (int *) calloc(setcnt
+1, sizeof(int))) == NULL
)
944 overflo("out of space in cgoto");
946 f
->posns
[f
->curstat
] = p
;
947 f
->gototab
[s
][c
] = f
->curstat
;
948 for (i
= 0; i
<= setcnt
; i
++)
950 if (setvec
[f
->accept
])
951 f
->out
[f
->curstat
] = 1;
953 f
->out
[f
->curstat
] = 0;
958 void freefa(fa
*f
) /* free a finite automaton */
964 for (i
= 0; i
<= f
->curstat
; i
++)
966 for (i
= 0; i
<= f
->accept
; i
++) {
967 xfree(f
->re
[i
].lfollow
);
968 if (f
->re
[i
].ltype
== CCL
|| f
->re
[i
].ltype
== NCCL
)
969 xfree((f
->re
[i
].lval
.np
));