1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 ****************************************************************/
25 /* lasciate ogne speranza, voi ch'intrate. */
36 #define HAT (NCHARS+2) /* matches ^ in regular expr */
40 #define type(v) (v)->nobj /* badly overloaded here */
41 #define info(v) (v)->ntype /* badly overloaded here */
42 #define left(v) (v)->narg[0]
43 #define right(v) (v)->narg[1]
44 #define parent(v) (v)->nnext
46 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
47 #define ELEAF case EMPTYRE: /* empty string in regexp */
48 #define UNARY case STAR: case PLUS: case QUEST:
50 /* encoding in tree Nodes:
51 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
52 left is index, right contains value or pointer to value
53 unary (STAR, PLUS, QUEST): left is child, right is null
54 binary (CAT, OR): left and right are children
55 parent contains pointer to parent
63 int rtok
; /* next token in current re */
65 static uschar
*rlxstr
;
66 static uschar
*prestr
; /* current position in current re */
67 static uschar
*lastre
; /* origin of last re */
75 #define NFA 20 /* cache this many dynamic fa's */
77 int nfatab
= 0; /* entries in fatab */
79 fa
*makedfa(const char *s
, int anchor
) /* returns dfa for reg expr s */
85 if (setvec
== 0) { /* first time through any RE */
87 setvec
= (int *) malloc(maxsetvec
* sizeof(int));
88 tmpset
= (int *) malloc(maxsetvec
* sizeof(int));
89 if (setvec
== 0 || tmpset
== 0)
90 overflo("out of space initializing makedfa");
93 if (compile_time
) /* a constant for sure */
94 return mkdfa(s
, anchor
);
95 for (i
= 0; i
< nfatab
; i
++) /* is it there already? */
96 if (fatab
[i
]->anchor
== anchor
97 && strcmp((const char *) fatab
[i
]->restr
, s
) == 0) {
98 fatab
[i
]->use
= now
++;
101 pfa
= mkdfa(s
, anchor
);
102 if (nfatab
< NFA
) { /* room for another */
104 fatab
[nfatab
]->use
= now
++;
108 use
= fatab
[0]->use
; /* replace least-recently used */
110 for (i
= 1; i
< nfatab
; i
++)
111 if (fatab
[i
]->use
< use
) {
121 fa
*mkdfa(const char *s
, int anchor
) /* does the real work of making a dfa */
122 /* anchor = 1 for anchored matches, else 0 */
128 p1
= op2(CAT
, op2(STAR
, op2(ALL
, NIL
, NIL
), NIL
), p
);
129 /* put ALL STAR in front of reg. exp. */
130 p1
= op2(CAT
, p1
, op2(FINAL
, NIL
, NIL
));
131 /* put FINAL after reg. exp. */
134 penter(p1
); /* enter parent pointers and leaf indices */
135 if ((f
= (fa
*) calloc(1, sizeof(fa
) + poscnt
*sizeof(rrow
))) == NULL
)
136 overflo("out of space for fa");
137 f
->accept
= poscnt
-1; /* penter has computed number of positions in re */
138 cfoll(f
, p1
); /* set up follow sets */
140 if ((f
->posns
[0] = (int *) calloc(1, *(f
->re
[0].lfollow
)*sizeof(int))) == NULL
)
141 overflo("out of space in makedfa");
142 if ((f
->posns
[1] = (int *) calloc(1, sizeof(int))) == NULL
)
143 overflo("out of space in makedfa");
145 f
->initstat
= makeinit(f
, anchor
);
147 f
->restr
= (uschar
*) tostring(s
);
151 int makeinit(fa
*f
, int anchor
)
158 k
= *(f
->re
[0].lfollow
);
160 if ((f
->posns
[2] = (int *) calloc(1, (k
+1)*sizeof(int))) == NULL
)
161 overflo("out of space in makeinit");
162 for (i
=0; i
<= k
; i
++) {
163 (f
->posns
[2])[i
] = (f
->re
[0].lfollow
)[i
];
165 if ((f
->posns
[2])[1] == f
->accept
)
167 for (i
=0; i
< NCHARS
; i
++)
168 f
->gototab
[2][i
] = 0;
169 f
->curstat
= cgoto(f
, 2, HAT
);
171 *f
->posns
[2] = k
-1; /* leave out position 0 */
172 for (i
=0; i
< k
; i
++) {
173 (f
->posns
[0])[i
] = (f
->posns
[2])[i
];
176 f
->out
[0] = f
->out
[2];
178 --(*f
->posns
[f
->curstat
]);
183 void penter(Node
*p
) /* set up parent pointers and leaf indices */
200 parent(right(p
)) = p
;
202 default: /* can't happen */
203 FATAL("can't happen: unknown type %d in penter", type(p
));
208 void freetr(Node
*p
) /* free parse tree */
225 default: /* can't happen */
226 FATAL("can't happen: unknown type %d in freetr", type(p
));
231 /* in the parsing of regular expressions, metacharacters like . have */
232 /* to be seen literally; \056 is not a metacharacter. */
234 int hexstr(uschar
**pp
) /* find and eval hex string at pp, return new p */
235 { /* only pick up one 8-bit byte (2 chars) */
240 for (i
= 0, p
= (uschar
*) *pp
; i
< 2 && isxdigit(*p
); i
++, p
++) {
242 n
= 16 * n
+ *p
- '0';
243 else if (*p
>= 'a' && *p
<= 'f')
244 n
= 16 * n
+ *p
- 'a' + 10;
245 else if (*p
>= 'A' && *p
<= 'F')
246 n
= 16 * n
+ *p
- 'A' + 10;
252 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
254 int quoted(uschar
**pp
) /* pick up next thing after a \\ */
255 /* and increment *pp */
260 if ((c
= *p
++) == 't')
272 else if (c
== 'x') { /* hexadecimal goo follows */
273 c
= hexstr(&p
); /* this adds a null if number is invalid */
274 } else if (isoctdigit(c
)) { /* \d \dd \ddd */
276 if (isoctdigit(*p
)) {
277 n
= 8 * n
+ *p
++ - '0';
279 n
= 8 * n
+ *p
++ - '0';
288 char *cclenter(const char *argp
) /* add a character class */
291 uschar
*p
= (uschar
*) argp
;
293 static uschar
*buf
= 0;
294 static int bufsz
= 100;
297 if (buf
== 0 && (buf
= (uschar
*) malloc(bufsz
)) == NULL
)
298 FATAL("out of space for character class [%.10s...] 1", p
);
300 for (i
= 0; (c
= *p
++) != 0; ) {
303 } else if (c
== '-' && i
> 0 && bp
[-1] != 0) {
309 if (c
> c2
) { /* empty; ignore */
315 if (!adjbuf((char **) &buf
, &bufsz
, bp
-buf
+2, 100, (char **) &bp
, "cclenter1"))
316 FATAL("out of space for character class [%.10s...] 2", p
);
323 if (!adjbuf((char **) &buf
, &bufsz
, bp
-buf
+2, 100, (char **) &bp
, "cclenter2"))
324 FATAL("out of space for character class [%.10s...] 3", p
);
329 dprintf( ("cclenter: in = |%s|, out = |%s|\n", op
, buf
) );
331 return (char *) tostring((char *) buf
);
334 void overflo(const char *s
)
336 FATAL("regular expression too big: %.30s...", s
);
339 void cfoll(fa
*f
, Node
*v
) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
347 f
->re
[info(v
)].ltype
= type(v
);
348 f
->re
[info(v
)].lval
.np
= right(v
);
349 while (f
->accept
>= maxsetvec
) { /* guessing here! */
351 setvec
= (int *) realloc(setvec
, maxsetvec
* sizeof(int));
352 tmpset
= (int *) realloc(tmpset
, maxsetvec
* sizeof(int));
353 if (setvec
== 0 || tmpset
== 0)
354 overflo("out of space in cfoll()");
356 for (i
= 0; i
<= f
->accept
; i
++)
359 follow(v
); /* computes setvec and setcnt */
360 if ((p
= (int *) calloc(1, (setcnt
+1)*sizeof(int))) == NULL
)
361 overflo("out of space building follow set");
362 f
->re
[info(v
)].lfollow
= p
;
364 for (i
= f
->accept
; i
>= 0; i
--)
376 default: /* can't happen */
377 FATAL("can't happen: unknown type %d in cfoll", type(v
));
381 int first(Node
*p
) /* collects initially active leaves of p into setvec */
382 /* returns 0 if p matches empty string */
389 lp
= info(p
); /* look for high-water mark of subscripts */
390 while (setcnt
>= maxsetvec
|| lp
>= maxsetvec
) { /* guessing here! */
392 setvec
= (int *) realloc(setvec
, maxsetvec
* sizeof(int));
393 tmpset
= (int *) realloc(tmpset
, maxsetvec
* sizeof(int));
394 if (setvec
== 0 || tmpset
== 0)
395 overflo("out of space in first()");
397 if (type(p
) == EMPTYRE
) {
401 if (setvec
[lp
] != 1) {
405 if (type(p
) == CCL
&& (*(char *) right(p
)) == '\0')
406 return(0); /* empty CCL */
409 if (first(left(p
)) == 0) return(0);
416 if (first(left(p
)) == 0 && first(right(p
)) == 0) return(0);
420 if (first(left(p
)) == 0 || b
== 0) return(0);
423 FATAL("can't happen: unknown type %d in first", type(p
)); /* can't happen */
427 void follow(Node
*v
) /* collects leaves that can follow v into setvec */
431 if (type(v
) == FINAL
)
447 if (v
== left(p
)) { /* v is left child of p */
448 if (first(right(p
)) == 0) {
452 } else /* v is right child */
458 int member(int c
, const char *sarg
) /* is c in s? */
460 uschar
*s
= (uschar
*) sarg
;
468 int match(fa
*f
, const char *p0
) /* shortest match ? */
471 uschar
*p
= (uschar
*) p0
;
473 s
= f
->reset
? makeinit(f
,0) : f
->initstat
;
477 /* assert(*p < NCHARS); */
478 if ((ns
= f
->gototab
[s
][*p
]) != 0)
488 int pmatch(fa
*f
, const char *p0
) /* longest match, for sub */
491 uschar
*p
= (uschar
*) p0
;
495 /* s = f->reset ? makeinit(f,1) : f->initstat; */
497 f
->initstat
= s
= makeinit(f
,1);
506 if (f
->out
[s
]) /* final state */
508 /* assert(*q < NCHARS); */
509 if ((ns
= f
->gototab
[s
][*q
]) != 0)
513 if (s
== 1) { /* no transition */
519 goto nextin
; /* no match */
523 patlen
= q
-p
-1; /* don't count $ */
531 for (i
= 2; i
<= f
->curstat
; i
++)
534 if ((f
->posns
[2] = (int *) calloc(1, (k
+1)*sizeof(int))) == NULL
)
535 overflo("out of space in pmatch");
536 for (i
= 0; i
<= k
; i
++)
537 (f
->posns
[2])[i
] = (f
->posns
[0])[i
];
538 f
->initstat
= f
->curstat
= 2;
539 f
->out
[2] = f
->out
[0];
540 for (i
= 0; i
< NCHARS
; i
++)
541 f
->gototab
[2][i
] = 0;
547 int nematch(fa
*f
, const char *p0
) /* non-empty match, for sub */
550 uschar
*p
= (uschar
*) p0
;
554 /* s = f->reset ? makeinit(f,1) : f->initstat; */
556 f
->initstat
= s
= makeinit(f
,1);
564 if (f
->out
[s
]) /* final state */
566 /* assert(*q < NCHARS); */
567 if ((ns
= f
->gototab
[s
][*q
]) != 0)
571 if (s
== 1) { /* no transition */
576 goto nnextin
; /* no nonempty match */
580 patlen
= q
-p
-1; /* don't count $ */
588 for (i
= 2; i
<= f
->curstat
; i
++)
591 if ((f
->posns
[2] = (int *) calloc(1, (k
+1)*sizeof(int))) == NULL
)
592 overflo("out of state space");
593 for (i
= 0; i
<= k
; i
++)
594 (f
->posns
[2])[i
] = (f
->posns
[0])[i
];
595 f
->initstat
= f
->curstat
= 2;
596 f
->out
[2] = f
->out
[0];
597 for (i
= 0; i
< NCHARS
; i
++)
598 f
->gototab
[2][i
] = 0;
605 Node
*reparse(const char *p
) /* parses regular expression pointed to by p */
606 { /* uses relex() to scan regular expression */
609 dprintf( ("reparse <%s>\n", p
) );
610 lastre
= prestr
= (uschar
*) p
; /* prestr points to string to be parsed */
612 /* GNU compatibility: an empty regexp matches anything */
614 /* FATAL("empty regular expression"); previous */
615 return(op2(EMPTYRE
, NIL
, NIL
));
619 FATAL("syntax error in regular expression %s at %s", lastre
, prestr
);
623 Node
*regexp(void) /* top-level parse of reg expr */
625 return (alt(concat(primary())));
634 np
= op2(CHAR
, NIL
, itonp(rlxval
));
639 return (unary(op2(ALL
, NIL
, NIL
)));
642 return (unary(op2(ALL
, NIL
, NIL
)));
645 return (unary(op2(DOT
, NIL
, NIL
)));
647 np
= op2(CCL
, NIL
, (Node
*) cclenter((char *) rlxstr
));
651 np
= op2(NCCL
, NIL
, (Node
*) cclenter((char *) rlxstr
));
656 return (unary(op2(CHAR
, NIL
, itonp(HAT
))));
659 return (unary(op2(CHAR
, NIL
, NIL
)));
662 if (rtok
== ')') { /* special pleading for () */
664 return unary(op2(CCL
, NIL
, (Node
*) tostring("")));
672 FATAL("syntax error in regular expression %s at %s", lastre
, prestr
);
674 FATAL("illegal primary in regular expression %s at %s", lastre
, prestr
);
676 return 0; /*NOTREACHED*/
679 Node
*concat(Node
*np
)
682 case CHAR
: case DOT
: case ALL
: case EMPTYRE
: case CCL
: case NCCL
: case '$': case '(':
683 return (concat(op2(CAT
, np
, primary())));
692 return (alt(op2(OR
, np
, concat(primary()))));
697 Node
*unary(Node
*np
)
702 return (unary(op2(STAR
, np
, NIL
)));
705 return (unary(op2(PLUS
, np
, NIL
)));
708 return (unary(op2(QUEST
, np
, NIL
)));
715 * Character class definitions conformant to the POSIX locale as
716 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
717 * and operating character sets are both ASCII (ISO646) or supersets
720 * Note that to avoid overflowing the temporary buffer used in
721 * relex(), the expanded character class (prior to range expansion)
722 * must be less than twice the size of their full name.
725 /* Because isblank doesn't show up in any of the header files on any
726 * system i use, it's defined here. if some other locale has a richer
727 * definition of "blank", define HAS_ISBLANK and provide your own
729 * the parentheses here are an attempt to find a path through the maze
730 * of macro definition and/or function and/or version provided. thanks
731 * to nelson beebe for the suggestion; let's see if it works everywhere.
734 /* #define HAS_ISBLANK */
737 int (xisblank
)(int c
)
739 return c
==' ' || c
=='\t';
749 { "alnum", 5, isalnum
},
750 { "alpha", 5, isalpha
},
752 { "blank", 5, isspace
}, /* was isblank */
754 { "blank", 5, isblank
},
756 { "cntrl", 5, iscntrl
},
757 { "digit", 5, isdigit
},
758 { "graph", 5, isgraph
},
759 { "lower", 5, islower
},
760 { "print", 5, isprint
},
761 { "punct", 5, ispunct
},
762 { "space", 5, isspace
},
763 { "upper", 5, isupper
},
764 { "xdigit", 6, isxdigit
},
769 int relex(void) /* lexical analyzer for reparse */
773 static uschar
*buf
= 0;
774 static int bufsz
= 100;
776 struct charclass
*cc
;
779 switch (c
= *prestr
++) {
781 case '*': return STAR
;
782 case '+': return PLUS
;
783 case '?': return QUEST
;
784 case '.': return DOT
;
785 case '\0': prestr
--; return '\0';
792 rlxval
= quoted(&prestr
);
798 if (buf
== 0 && (buf
= (uschar
*) malloc(bufsz
)) == NULL
)
799 FATAL("out of space in reg expr %.10s..", lastre
);
801 if (*prestr
== '^') {
807 n
= 2 * strlen((const char *) prestr
)+1;
808 if (!adjbuf((char **) &buf
, &bufsz
, n
, n
, (char **) &bp
, "relex1"))
809 FATAL("out of space for reg expr %.10s...", lastre
);
811 if ((c
= *prestr
++) == '\\') {
813 if ((c
= *prestr
++) == '\0')
814 FATAL("nonterminated character class %.20s...", lastre
);
816 /* } else if (c == '\n') { */
817 /* FATAL("newline in character class %.20s...", lastre); */
818 } else if (c
== '[' && *prestr
== ':') {
819 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
820 for (cc
= charclasses
; cc
->cc_name
; cc
++)
821 if (strncmp((const char *) prestr
+ 1, (const char *) cc
->cc_name
, cc
->cc_namelen
) == 0)
823 if (cc
->cc_name
!= NULL
&& prestr
[1 + cc
->cc_namelen
] == ':' &&
824 prestr
[2 + cc
->cc_namelen
] == ']') {
825 prestr
+= cc
->cc_namelen
+ 3;
826 for (i
= 0; i
< NCHARS
; i
++) {
827 if (!adjbuf((char **) &buf
, &bufsz
, bp
-buf
+1, 100, (char **) &bp
, "relex2"))
828 FATAL("out of space for reg expr %.10s...", lastre
);
829 if (cc
->cc_func(i
)) {
836 } else if (c
== '\0') {
837 FATAL("nonterminated character class %.20s", lastre
);
838 } else if (bp
== buf
) { /* 1st char is special */
840 } else if (c
== ']') {
842 rlxstr
= (uschar
*) tostring((char *) buf
);
853 int cgoto(fa
*f
, int s
, int c
)
858 assert(c
== HAT
|| c
< NCHARS
);
859 while (f
->accept
>= maxsetvec
) { /* guessing here! */
861 setvec
= (int *) realloc(setvec
, maxsetvec
* sizeof(int));
862 tmpset
= (int *) realloc(tmpset
, maxsetvec
* sizeof(int));
863 if (setvec
== 0 || tmpset
== 0)
864 overflo("out of space in cgoto()");
866 for (i
= 0; i
<= f
->accept
; i
++)
869 /* compute positions of gototab[s,c] into setvec */
871 for (i
= 1; i
<= *p
; i
++) {
872 if ((k
= f
->re
[p
[i
]].ltype
) != FINAL
) {
873 if ((k
== CHAR
&& c
== ptoi(f
->re
[p
[i
]].lval
.np
))
874 || (k
== DOT
&& c
!= 0 && c
!= HAT
)
875 || (k
== ALL
&& c
!= 0)
876 || (k
== EMPTYRE
&& c
!= 0)
877 || (k
== CCL
&& member(c
, (char *) f
->re
[p
[i
]].lval
.up
))
878 || (k
== NCCL
&& !member(c
, (char *) f
->re
[p
[i
]].lval
.up
) && c
!= 0 && c
!= HAT
)) {
879 q
= f
->re
[p
[i
]].lfollow
;
880 for (j
= 1; j
<= *q
; j
++) {
881 if (q
[j
] >= maxsetvec
) {
883 setvec
= (int *) realloc(setvec
, maxsetvec
* sizeof(int));
884 tmpset
= (int *) realloc(tmpset
, maxsetvec
* sizeof(int));
885 if (setvec
== 0 || tmpset
== 0)
886 overflo("cgoto overflow");
888 if (setvec
[q
[j
]] == 0) {
896 /* determine if setvec is a previous state */
899 for (i
= f
->accept
; i
>= 0; i
--)
903 /* tmpset == previous state? */
904 for (i
= 1; i
<= f
->curstat
; i
++) {
906 if ((k
= tmpset
[0]) != p
[0])
908 for (j
= 1; j
<= k
; j
++)
909 if (tmpset
[j
] != p
[j
])
911 /* setvec is state i */
912 f
->gototab
[s
][c
] = i
;
917 /* add tmpset to current set of states */
918 if (f
->curstat
>= NSTATES
-1) {
921 for (i
= 2; i
< NSTATES
; i
++)
925 for (i
= 0; i
< NCHARS
; i
++)
926 f
->gototab
[f
->curstat
][i
] = 0;
927 xfree(f
->posns
[f
->curstat
]);
928 if ((p
= (int *) calloc(1, (setcnt
+1)*sizeof(int))) == NULL
)
929 overflo("out of space in cgoto");
931 f
->posns
[f
->curstat
] = p
;
932 f
->gototab
[s
][c
] = f
->curstat
;
933 for (i
= 0; i
<= setcnt
; i
++)
935 if (setvec
[f
->accept
])
936 f
->out
[f
->curstat
] = 1;
938 f
->out
[f
->curstat
] = 0;
943 void freefa(fa
*f
) /* free a finite automaton */
949 for (i
= 0; i
<= f
->curstat
; i
++)
951 for (i
= 0; i
<= f
->accept
; i
++) {
952 xfree(f
->re
[i
].lfollow
);
953 if (f
->re
[i
].ltype
== CCL
|| f
->re
[i
].ltype
== NCCL
)
954 xfree((f
->re
[i
].lval
.np
));