1 /* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */
18 /* Written June, 1988 by Mike Haertel
19 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
21 /* $FreeBSD: src/gnu/usr.bin/grep/dfa.c,v 1.12 2000/01/31 13:28:08 ru Exp $ */
22 /* $DragonFly: src/gnu/usr.bin/grep/dfa.c,v 1.3 2004/02/03 20:22:57 dillon Exp $ */
32 #include <sys/types.h>
36 extern char *calloc(), *malloc(), *realloc();
40 #if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
48 #ifndef DEBUG /* use the same approach as regex.c */
54 #define isgraph(C) (isprint(C) && !isspace(C))
57 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
58 #define ISALPHA(C) isalpha(C)
59 #define ISUPPER(C) isupper(C)
60 #define ISLOWER(C) islower(C)
61 #define ISDIGIT(C) isdigit(C)
62 #define ISXDIGIT(C) isxdigit(C)
63 #define ISSPACE(C) isspace(C)
64 #define ISPUNCT(C) ispunct(C)
65 #define ISALNUM(C) isalnum(C)
66 #define ISPRINT(C) isprint(C)
67 #define ISGRAPH(C) isgraph(C)
68 #define ISCNTRL(C) iscntrl(C)
70 #define ISALPHA(C) (isascii(C) && isalpha(C))
71 #define ISUPPER(C) (isascii(C) && isupper(C))
72 #define ISLOWER(C) (isascii(C) && islower(C))
73 #define ISDIGIT(C) (isascii(C) && isdigit(C))
74 #define ISXDIGIT(C) (isascii(C) && isxdigit(C))
75 #define ISSPACE(C) (isascii(C) && isspace(C))
76 #define ISPUNCT(C) (isascii(C) && ispunct(C))
77 #define ISALNUM(C) (isascii(C) && isalnum(C))
78 #define ISPRINT(C) (isascii(C) && isprint(C))
79 #define ISGRAPH(C) (isascii(C) && isgraph(C))
80 #define ISCNTRL(C) (isascii(C) && iscntrl(C))
83 /* ISASCIIDIGIT differs from ISDIGIT, as follows:
84 - Its arg may be any int or unsigned int; it need not be an unsigned char.
85 - It's guaranteed to evaluate its argument exactly once.
86 - It's typically faster.
87 Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
88 only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless
89 it's important to use the locale's definition of `digit' even when the
90 host does not conform to Posix. */
91 #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
93 /* If we (don't) have I18N. */
96 # ifdef HAVE_LIBINTL_H
99 # define _(Str) gettext (Str)
102 # define _(Str) (Str)
106 #include <gnuregex.h>
109 /* HPUX, define those as macros in sys/param.h */
117 static void dfamust
PARAMS ((struct dfa
*dfa
));
119 static ptr_t xcalloc
PARAMS ((size_t n
, size_t s
));
120 static ptr_t xmalloc
PARAMS ((size_t n
));
121 static ptr_t xrealloc
PARAMS ((ptr_t p
, size_t n
));
123 static void prtok
PARAMS ((token t
));
125 static int tstbit
PARAMS ((int b
, charclass c
));
126 static void setbit
PARAMS ((int b
, charclass c
));
127 static void clrbit
PARAMS ((int b
, charclass c
));
128 static void copyset
PARAMS ((charclass src
, charclass dst
));
129 static void zeroset
PARAMS ((charclass s
));
130 static void notset
PARAMS ((charclass s
));
131 static int equal
PARAMS ((charclass s1
, charclass s2
));
132 static int charclass_index
PARAMS ((charclass s
));
133 static int looking_at
PARAMS ((const char *s
));
134 static token lex
PARAMS ((void));
135 static void addtok
PARAMS ((token t
));
136 static void atom
PARAMS ((void));
137 static int nsubtoks
PARAMS ((int tindex
));
138 static void copytoks
PARAMS ((int tindex
, int ntokens
));
139 static void closure
PARAMS ((void));
140 static void branch
PARAMS ((void));
141 static void regexp
PARAMS ((int toplevel
));
142 static void copy
PARAMS ((position_set
*src
, position_set
*dst
));
143 static void insert
PARAMS ((position p
, position_set
*s
));
144 static void merge
PARAMS ((position_set
*s1
, position_set
*s2
, position_set
*m
));
145 static void delete PARAMS ((position p
, position_set
*s
));
146 static int state_index
PARAMS ((struct dfa
*d
, position_set
*s
,
147 int newline
, int letter
));
148 static void build_state
PARAMS ((int s
, struct dfa
*d
));
149 static void build_state_zero
PARAMS ((struct dfa
*d
));
150 static char *icatalloc
PARAMS ((char *old
, char *new));
151 static char *icpyalloc
PARAMS ((char *string
));
152 static char *istrstr
PARAMS ((char *lookin
, char *lookfor
));
153 static void ifree
PARAMS ((char *cp
));
154 static void freelist
PARAMS ((char **cpp
));
155 static char **enlist
PARAMS ((char **cpp
, char *new, size_t len
));
156 static char **comsubs
PARAMS ((char *left
, char *right
));
157 static char **addlists
PARAMS ((char **old
, char **new));
158 static char **inboth
PARAMS ((char **left
, char **right
));
161 xcalloc (size_t n
, size_t s
)
163 ptr_t r
= calloc(n
, s
);
166 dfaerror(_("Memory exhausted"));
177 dfaerror(_("Memory exhausted"));
182 xrealloc (ptr_t p
, size_t n
)
184 ptr_t r
= realloc(p
, n
);
188 dfaerror(_("Memory exhausted"));
192 #define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
193 #define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
194 #define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
196 /* Reallocate an array of type t if nalloc is too small for index. */
197 #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
198 if ((index) >= (nalloc)) \
200 while ((index) >= (nalloc)) \
202 REALLOC(p, t, nalloc); \
213 fprintf(stderr
, "END");
214 else if (t
< NOTCHAR
)
215 fprintf(stderr
, "%c", t
);
220 case EMPTY
: s
= "EMPTY"; break;
221 case BACKREF
: s
= "BACKREF"; break;
222 case BEGLINE
: s
= "BEGLINE"; break;
223 case ENDLINE
: s
= "ENDLINE"; break;
224 case BEGWORD
: s
= "BEGWORD"; break;
225 case ENDWORD
: s
= "ENDWORD"; break;
226 case LIMWORD
: s
= "LIMWORD"; break;
227 case NOTLIMWORD
: s
= "NOTLIMWORD"; break;
228 case QMARK
: s
= "QMARK"; break;
229 case STAR
: s
= "STAR"; break;
230 case PLUS
: s
= "PLUS"; break;
231 case CAT
: s
= "CAT"; break;
232 case OR
: s
= "OR"; break;
233 case ORTOP
: s
= "ORTOP"; break;
234 case LPAREN
: s
= "LPAREN"; break;
235 case RPAREN
: s
= "RPAREN"; break;
236 default: s
= "CSET"; break;
238 fprintf(stderr
, "%s", s
);
243 /* Stuff pertaining to charclasses. */
246 tstbit (int b
, charclass c
)
248 return c
[b
/ INTBITS
] & 1 << b
% INTBITS
;
252 setbit (int b
, charclass c
)
254 c
[b
/ INTBITS
] |= 1 << b
% INTBITS
;
258 clrbit (int b
, charclass c
)
260 c
[b
/ INTBITS
] &= ~(1 << b
% INTBITS
);
264 copyset (charclass src
, charclass dst
)
268 for (i
= 0; i
< CHARCLASS_INTS
; ++i
)
273 zeroset (charclass s
)
277 for (i
= 0; i
< CHARCLASS_INTS
; ++i
)
286 for (i
= 0; i
< CHARCLASS_INTS
; ++i
)
291 equal (charclass s1
, charclass s2
)
295 for (i
= 0; i
< CHARCLASS_INTS
; ++i
)
301 /* A pointer to the current dfa is kept here during parsing. */
302 static struct dfa
*dfa
;
304 /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
306 charclass_index (charclass s
)
310 for (i
= 0; i
< dfa
->cindex
; ++i
)
311 if (equal(s
, dfa
->charclasses
[i
]))
313 REALLOC_IF_NECESSARY(dfa
->charclasses
, charclass
, dfa
->calloc
, dfa
->cindex
);
315 copyset(s
, dfa
->charclasses
[i
]);
319 /* Syntax bits controlling the behavior of the lexical analyzer. */
320 static reg_syntax_t syntax_bits
, syntax_bits_set
;
322 /* Flag for case-folding letters into sets. */
323 static int case_fold
;
325 /* End-of-line byte in data. */
326 static unsigned char eolbyte
;
328 /* Entry point to set syntax options. */
330 dfasyntax (reg_syntax_t bits
, int fold
, int eol
)
338 /* Lexical analyzer. All the dross that deals with the obnoxious
339 GNU Regex syntax bits is located here. The poor, suffering
340 reader is referred to the GNU Regex documentation for the
341 meaning of the @#%!@#%^!@ syntax bits. */
343 static char *lexstart
; /* Pointer to beginning of input string. */
344 static char *lexptr
; /* Pointer to next input character. */
345 static int lexleft
; /* Number of characters remaining. */
346 static token lasttok
; /* Previous token returned; initially END. */
347 static int laststart
; /* True if we're separated from beginning or (, |
348 only by zero-width characters. */
349 static int parens
; /* Count of outstanding left parens. */
350 static int minrep
, maxrep
; /* Repeat counts for {m,n}. */
352 /* Note that characters become unsigned here. */
353 #define FETCH(c, eoferr) \
360 return lasttok = END; \
362 (c) = (unsigned char) *lexptr++; \
367 #define FUNC(F, P) static int F(int c) { return P(c); }
369 #define FUNC(F, P) static int F(c) int c; { return P(c); }
372 FUNC(is_alpha
, ISALPHA
)
373 FUNC(is_upper
, ISUPPER
)
374 FUNC(is_lower
, ISLOWER
)
375 FUNC(is_digit
, ISDIGIT
)
376 FUNC(is_xdigit
, ISXDIGIT
)
377 FUNC(is_space
, ISSPACE
)
378 FUNC(is_punct
, ISPUNCT
)
379 FUNC(is_alnum
, ISALNUM
)
380 FUNC(is_print
, ISPRINT
)
381 FUNC(is_graph
, ISGRAPH
)
382 FUNC(is_cntrl
, ISCNTRL
)
387 return (c
== ' ' || c
== '\t');
390 /* The following list maps the names of the Posix named character classes
391 to predicate functions that determine whether a given character is in
392 the class. The leading [ has already been eaten by the lexical analyzer. */
395 int (*pred
) PARAMS ((int));
397 { ":alpha:]", is_alpha
},
398 { ":upper:]", is_upper
},
399 { ":lower:]", is_lower
},
400 { ":digit:]", is_digit
},
401 { ":xdigit:]", is_xdigit
},
402 { ":space:]", is_space
},
403 { ":punct:]", is_punct
},
404 { ":alnum:]", is_alnum
},
405 { ":print:]", is_print
},
406 { ":graph:]", is_graph
},
407 { ":cntrl:]", is_cntrl
},
408 { ":blank:]", is_blank
},
412 /* Return non-zero if C is a `word-constituent' byte; zero otherwise. */
413 #define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
416 looking_at (char const *s
)
423 return strncmp(s
, lexptr
, len
) == 0;
430 int backslash
= 0, invert
;
436 /* Basic plan: We fetch a character. If it's a backslash,
437 we set the backslash flag and go through the loop again.
438 On the plus side, this avoids having a duplicate of the
439 main switch inside the backslash case. On the minus side,
440 it means that just about every case begins with
441 "if (backslash) ...". */
442 for (i
= 0; i
< 2; ++i
)
451 dfaerror(_("Unfinished \\ escape"));
458 if (syntax_bits
& RE_CONTEXT_INDEP_ANCHORS
462 return lasttok
= BEGLINE
;
468 if (syntax_bits
& RE_CONTEXT_INDEP_ANCHORS
470 || (syntax_bits
& RE_NO_BK_PARENS
471 ? lexleft
> 0 && *lexptr
== ')'
472 : lexleft
> 1 && lexptr
[0] == '\\' && lexptr
[1] == ')')
473 || (syntax_bits
& RE_NO_BK_VBAR
474 ? lexleft
> 0 && *lexptr
== '|'
475 : lexleft
> 1 && lexptr
[0] == '\\' && lexptr
[1] == '|')
476 || ((syntax_bits
& RE_NEWLINE_ALT
)
477 && lexleft
> 0 && *lexptr
== '\n'))
478 return lasttok
= ENDLINE
;
490 if (backslash
&& !(syntax_bits
& RE_NO_BK_REFS
))
493 return lasttok
= BACKREF
;
498 if (backslash
&& !(syntax_bits
& RE_NO_GNU_OPS
))
499 return lasttok
= BEGLINE
; /* FIXME: should be beginning of string */
503 if (backslash
&& !(syntax_bits
& RE_NO_GNU_OPS
))
504 return lasttok
= ENDLINE
; /* FIXME: should be end of string */
508 if (backslash
&& !(syntax_bits
& RE_NO_GNU_OPS
))
509 return lasttok
= BEGWORD
;
513 if (backslash
&& !(syntax_bits
& RE_NO_GNU_OPS
))
514 return lasttok
= ENDWORD
;
518 if (backslash
&& !(syntax_bits
& RE_NO_GNU_OPS
))
519 return lasttok
= LIMWORD
;
523 if (backslash
&& !(syntax_bits
& RE_NO_GNU_OPS
))
524 return lasttok
= NOTLIMWORD
;
528 if (syntax_bits
& RE_LIMITED_OPS
)
530 if (backslash
!= ((syntax_bits
& RE_BK_PLUS_QM
) != 0))
532 if (!(syntax_bits
& RE_CONTEXT_INDEP_OPS
) && laststart
)
534 return lasttok
= QMARK
;
539 if (!(syntax_bits
& RE_CONTEXT_INDEP_OPS
) && laststart
)
541 return lasttok
= STAR
;
544 if (syntax_bits
& RE_LIMITED_OPS
)
546 if (backslash
!= ((syntax_bits
& RE_BK_PLUS_QM
) != 0))
548 if (!(syntax_bits
& RE_CONTEXT_INDEP_OPS
) && laststart
)
550 return lasttok
= PLUS
;
553 if (!(syntax_bits
& RE_INTERVALS
))
555 if (backslash
!= ((syntax_bits
& RE_NO_BK_BRACES
) == 0))
557 if (!(syntax_bits
& RE_CONTEXT_INDEP_OPS
) && laststart
)
560 if (syntax_bits
& RE_NO_BK_BRACES
)
562 /* Scan ahead for a valid interval; if it's not valid,
563 treat it as a literal '{'. */
564 int lo
= -1, hi
= -1;
565 char const *p
= lexptr
;
566 char const *lim
= p
+ lexleft
;
567 for (; p
!= lim
&& ISASCIIDIGIT (*p
); p
++)
568 lo
= (lo
< 0 ? 0 : lo
* 10) + *p
- '0';
569 if (p
!= lim
&& *p
== ',')
570 while (++p
!= lim
&& ISASCIIDIGIT (*p
))
571 hi
= (hi
< 0 ? 0 : hi
* 10) + *p
- '0';
574 if (p
== lim
|| *p
!= '}'
575 || lo
< 0 || RE_DUP_MAX
< hi
|| (0 <= hi
&& hi
< lo
))
582 {M,} - minimum count, maximum is infinity
583 {M,N} - M through N */
584 FETCH(c
, _("unfinished repeat count"));
585 if (ISASCIIDIGIT (c
))
590 FETCH(c
, _("unfinished repeat count"));
591 if (! ISASCIIDIGIT (c
))
593 minrep
= 10 * minrep
+ c
- '0';
597 dfaerror(_("malformed repeat count"));
600 FETCH (c
, _("unfinished repeat count"));
601 if (! ISASCIIDIGIT (c
))
608 FETCH (c
, _("unfinished repeat count"));
609 if (! ISASCIIDIGIT (c
))
611 maxrep
= 10 * maxrep
+ c
- '0';
613 if (0 <= maxrep
&& maxrep
< minrep
)
614 dfaerror (_("malformed repeat count"));
619 if (!(syntax_bits
& RE_NO_BK_BRACES
))
622 dfaerror(_("malformed repeat count"));
623 FETCH(c
, _("unfinished repeat count"));
626 dfaerror(_("malformed repeat count"));
628 return lasttok
= REPMN
;
631 if (syntax_bits
& RE_LIMITED_OPS
)
633 if (backslash
!= ((syntax_bits
& RE_NO_BK_VBAR
) == 0))
639 if (syntax_bits
& RE_LIMITED_OPS
641 || !(syntax_bits
& RE_NEWLINE_ALT
))
647 if (backslash
!= ((syntax_bits
& RE_NO_BK_PARENS
) == 0))
651 return lasttok
= LPAREN
;
654 if (backslash
!= ((syntax_bits
& RE_NO_BK_PARENS
) == 0))
656 if (parens
== 0 && syntax_bits
& RE_UNMATCHED_RIGHT_PAREN_ORD
)
660 return lasttok
= RPAREN
;
667 if (!(syntax_bits
& RE_DOT_NEWLINE
))
668 clrbit(eolbyte
, ccl
);
669 if (syntax_bits
& RE_DOT_NOT_NULL
)
672 return lasttok
= CSET
+ charclass_index(ccl
);
676 if (!backslash
|| (syntax_bits
& RE_NO_GNU_OPS
))
679 for (c2
= 0; c2
< NOTCHAR
; ++c2
)
680 if (IS_WORD_CONSTITUENT(c2
))
685 return lasttok
= CSET
+ charclass_index(ccl
);
691 FETCH(c
, _("Unbalanced ["));
694 FETCH(c
, _("Unbalanced ["));
701 /* Nobody ever said this had to be fast. :-)
702 Note that if we're looking at some other [:...:]
703 construct, we just treat it as a bunch of ordinary
704 characters. We can do this because we assume
705 regex has checked for syntax errors before
706 dfa is ever called. */
707 if (c
== '[' && (syntax_bits
& RE_CHAR_CLASSES
))
708 for (c1
= 0; prednames
[c1
].name
; ++c1
)
709 if (looking_at(prednames
[c1
].name
))
711 int (*pred
)() = prednames
[c1
].pred
;
713 && (pred
== is_upper
|| pred
== is_lower
))
716 for (c2
= 0; c2
< NOTCHAR
; ++c2
)
719 lexptr
+= strlen(prednames
[c1
].name
);
720 lexleft
-= strlen(prednames
[c1
].name
);
721 FETCH(c1
, _("Unbalanced ["));
724 if (c
== '\\' && (syntax_bits
& RE_BACKSLASH_ESCAPE_IN_LISTS
))
725 FETCH(c
, _("Unbalanced ["));
726 FETCH(c1
, _("Unbalanced ["));
729 FETCH(c2
, _("Unbalanced ["));
732 /* In the case [x-], the - is an ordinary hyphen,
733 which is left in c1, the lookahead character. */
741 && (syntax_bits
& RE_BACKSLASH_ESCAPE_IN_LISTS
))
742 FETCH(c2
, _("Unbalanced ["));
743 FETCH(c1
, _("Unbalanced ["));
749 lo
[0] = c
; lo
[1] = '\0';
750 hi
[0] = c2
; hi
[1] = '\0';
751 for (c
= 0; c
< NOTCHAR
; c
++)
754 ch
[0] = c
; ch
[1] = '\0';
755 if (strcoll (lo
, ch
) <= 0 && strcoll (ch
, hi
) <= 0)
761 setbit (tolower (c
), ccl
);
762 else if (ISLOWER (c
))
763 setbit (toupper (c
), ccl
);
771 while ((c
= c1
) != ']');
775 if (syntax_bits
& RE_HAT_LISTS_NOT_NEWLINE
)
776 clrbit(eolbyte
, ccl
);
779 return lasttok
= CSET
+ charclass_index(ccl
);
784 if (case_fold
&& ISALPHA(c
))
789 setbit(tolower(c
), ccl
);
791 setbit(toupper(c
), ccl
);
792 return lasttok
= CSET
+ charclass_index(ccl
);
798 /* The above loop should consume at most a backslash
799 and some other character. */
801 return END
; /* keeps pedantic compilers happy. */
804 /* Recursive descent parser for regular expressions. */
806 static token tok
; /* Lookahead token. */
807 static int depth
; /* Current depth of a hypothetical stack
808 holding deferred productions. This is
809 used to determine the depth that will be
810 required of the real stack later on in
813 /* Add the given token to the parse tree, maintaining the depth count and
814 updating the maximum depth if necessary. */
818 REALLOC_IF_NECESSARY(dfa
->tokens
, token
, dfa
->talloc
, dfa
->tindex
);
819 dfa
->tokens
[dfa
->tindex
++] = t
;
840 if (depth
> dfa
->depth
)
844 /* The grammar understood by the parser is as follows.
872 The parser builds a parse tree in postfix form in an array of tokens. */
877 if ((tok
>= 0 && tok
< NOTCHAR
) || tok
>= CSET
|| tok
== BACKREF
878 || tok
== BEGLINE
|| tok
== ENDLINE
|| tok
== BEGWORD
879 || tok
== ENDWORD
|| tok
== LIMWORD
|| tok
== NOTLIMWORD
)
884 else if (tok
== LPAREN
)
889 dfaerror(_("Unbalanced ("));
896 /* Return the number of tokens in the given subexpression. */
898 nsubtoks (int tindex
)
902 switch (dfa
->tokens
[tindex
- 1])
909 return 1 + nsubtoks(tindex
- 1);
913 ntoks1
= nsubtoks(tindex
- 1);
914 return 1 + ntoks1
+ nsubtoks(tindex
- 1 - ntoks1
);
918 /* Copy the given subexpression to the top of the tree. */
920 copytoks (int tindex
, int ntokens
)
924 for (i
= 0; i
< ntokens
; ++i
)
925 addtok(dfa
->tokens
[tindex
+ i
]);
931 int tindex
, ntokens
, i
;
934 while (tok
== QMARK
|| tok
== STAR
|| tok
== PLUS
|| tok
== REPMN
)
937 ntokens
= nsubtoks(dfa
->tindex
);
938 tindex
= dfa
->tindex
- ntokens
;
943 for (i
= 1; i
< minrep
; ++i
)
945 copytoks(tindex
, ntokens
);
948 for (; i
< maxrep
; ++i
)
950 copytoks(tindex
, ntokens
);
967 while (tok
!= RPAREN
&& tok
!= OR
&& tok
>= 0)
975 regexp (int toplevel
)
989 /* Main entry point for the parser. S is a string to be parsed, len is the
990 length of the string, so s can include NUL characters. D is a pointer to
991 the struct dfa to parse into. */
993 dfaparse (char *s
, size_t len
, struct dfa
*d
)
996 lexstart
= lexptr
= s
;
1002 if (! syntax_bits_set
)
1003 dfaerror(_("No syntax specified"));
1011 dfaerror(_("Unbalanced )"));
1013 addtok(END
- d
->nregexps
);
1022 /* Some primitives for operating on sets of positions. */
1024 /* Copy one set to another; the destination must be large enough. */
1026 copy (position_set
*src
, position_set
*dst
)
1030 for (i
= 0; i
< src
->nelem
; ++i
)
1031 dst
->elems
[i
] = src
->elems
[i
];
1032 dst
->nelem
= src
->nelem
;
1035 /* Insert a position in a set. Position sets are maintained in sorted
1036 order according to index. If position already exists in the set with
1037 the same index then their constraints are logically or'd together.
1038 S->elems must point to an array large enough to hold the resulting set. */
1040 insert (position p
, position_set
*s
)
1045 for (i
= 0; i
< s
->nelem
&& p
.index
< s
->elems
[i
].index
; ++i
)
1047 if (i
< s
->nelem
&& p
.index
== s
->elems
[i
].index
)
1048 s
->elems
[i
].constraint
|= p
.constraint
;
1053 while (i
< s
->nelem
)
1062 /* Merge two sets of positions into a third. The result is exactly as if
1063 the positions of both sets were inserted into an initially empty set. */
1065 merge (position_set
*s1
, position_set
*s2
, position_set
*m
)
1070 while (i
< s1
->nelem
&& j
< s2
->nelem
)
1071 if (s1
->elems
[i
].index
> s2
->elems
[j
].index
)
1072 m
->elems
[m
->nelem
++] = s1
->elems
[i
++];
1073 else if (s1
->elems
[i
].index
< s2
->elems
[j
].index
)
1074 m
->elems
[m
->nelem
++] = s2
->elems
[j
++];
1077 m
->elems
[m
->nelem
] = s1
->elems
[i
++];
1078 m
->elems
[m
->nelem
++].constraint
|= s2
->elems
[j
++].constraint
;
1080 while (i
< s1
->nelem
)
1081 m
->elems
[m
->nelem
++] = s1
->elems
[i
++];
1082 while (j
< s2
->nelem
)
1083 m
->elems
[m
->nelem
++] = s2
->elems
[j
++];
1086 /* Delete a position from a set. */
1088 delete (position p
, position_set
*s
)
1092 for (i
= 0; i
< s
->nelem
; ++i
)
1093 if (p
.index
== s
->elems
[i
].index
)
1096 for (--s
->nelem
; i
< s
->nelem
; ++i
)
1097 s
->elems
[i
] = s
->elems
[i
+ 1];
1100 /* Find the index of the state corresponding to the given position set with
1101 the given preceding context, or create a new state if there is no such
1102 state. Newline and letter tell whether we got here on a newline or
1103 letter, respectively. */
1105 state_index (struct dfa
*d
, position_set
*s
, int newline
, int letter
)
1111 newline
= newline
? 1 : 0;
1112 letter
= letter
? 1 : 0;
1114 for (i
= 0; i
< s
->nelem
; ++i
)
1115 hash
^= s
->elems
[i
].index
+ s
->elems
[i
].constraint
;
1117 /* Try to find a state that exactly matches the proposed one. */
1118 for (i
= 0; i
< d
->sindex
; ++i
)
1120 if (hash
!= d
->states
[i
].hash
|| s
->nelem
!= d
->states
[i
].elems
.nelem
1121 || newline
!= d
->states
[i
].newline
|| letter
!= d
->states
[i
].letter
)
1123 for (j
= 0; j
< s
->nelem
; ++j
)
1124 if (s
->elems
[j
].constraint
1125 != d
->states
[i
].elems
.elems
[j
].constraint
1126 || s
->elems
[j
].index
!= d
->states
[i
].elems
.elems
[j
].index
)
1132 /* We'll have to create a new state. */
1133 REALLOC_IF_NECESSARY(d
->states
, dfa_state
, d
->salloc
, d
->sindex
);
1134 d
->states
[i
].hash
= hash
;
1135 MALLOC(d
->states
[i
].elems
.elems
, position
, s
->nelem
);
1136 copy(s
, &d
->states
[i
].elems
);
1137 d
->states
[i
].newline
= newline
;
1138 d
->states
[i
].letter
= letter
;
1139 d
->states
[i
].backref
= 0;
1140 d
->states
[i
].constraint
= 0;
1141 d
->states
[i
].first_end
= 0;
1142 for (j
= 0; j
< s
->nelem
; ++j
)
1143 if (d
->tokens
[s
->elems
[j
].index
] < 0)
1145 constraint
= s
->elems
[j
].constraint
;
1146 if (SUCCEEDS_IN_CONTEXT(constraint
, newline
, 0, letter
, 0)
1147 || SUCCEEDS_IN_CONTEXT(constraint
, newline
, 0, letter
, 1)
1148 || SUCCEEDS_IN_CONTEXT(constraint
, newline
, 1, letter
, 0)
1149 || SUCCEEDS_IN_CONTEXT(constraint
, newline
, 1, letter
, 1))
1150 d
->states
[i
].constraint
|= constraint
;
1151 if (! d
->states
[i
].first_end
)
1152 d
->states
[i
].first_end
= d
->tokens
[s
->elems
[j
].index
];
1154 else if (d
->tokens
[s
->elems
[j
].index
] == BACKREF
)
1156 d
->states
[i
].constraint
= NO_CONSTRAINT
;
1157 d
->states
[i
].backref
= 1;
1165 /* Find the epsilon closure of a set of positions. If any position of the set
1166 contains a symbol that matches the empty string in some context, replace
1167 that position with the elements of its follow labeled with an appropriate
1168 constraint. Repeat exhaustively until no funny positions are left.
1169 S->elems must be large enough to hold the result. */
1171 epsclosure (position_set
*s
, struct dfa
*d
)
1177 MALLOC(visited
, int, d
->tindex
);
1178 for (i
= 0; i
< d
->tindex
; ++i
)
1181 for (i
= 0; i
< s
->nelem
; ++i
)
1182 if (d
->tokens
[s
->elems
[i
].index
] >= NOTCHAR
1183 && d
->tokens
[s
->elems
[i
].index
] != BACKREF
1184 && d
->tokens
[s
->elems
[i
].index
] < CSET
)
1187 p
.constraint
= old
.constraint
;
1188 delete(s
->elems
[i
], s
);
1189 if (visited
[old
.index
])
1194 visited
[old
.index
] = 1;
1195 switch (d
->tokens
[old
.index
])
1198 p
.constraint
&= BEGLINE_CONSTRAINT
;
1201 p
.constraint
&= ENDLINE_CONSTRAINT
;
1204 p
.constraint
&= BEGWORD_CONSTRAINT
;
1207 p
.constraint
&= ENDWORD_CONSTRAINT
;
1210 p
.constraint
&= LIMWORD_CONSTRAINT
;
1213 p
.constraint
&= NOTLIMWORD_CONSTRAINT
;
1218 for (j
= 0; j
< d
->follows
[old
.index
].nelem
; ++j
)
1220 p
.index
= d
->follows
[old
.index
].elems
[j
].index
;
1223 /* Force rescan to start at the beginning. */
1230 /* Perform bottom-up analysis on the parse tree, computing various functions.
1231 Note that at this point, we're pretending constructs like \< are real
1232 characters rather than constraints on what can follow them.
1234 Nullable: A node is nullable if it is at the root of a regexp that can
1235 match the empty string.
1236 * EMPTY leaves are nullable.
1237 * No other leaf is nullable.
1238 * A QMARK or STAR node is nullable.
1239 * A PLUS node is nullable if its argument is nullable.
1240 * A CAT node is nullable if both its arguments are nullable.
1241 * An OR node is nullable if either argument is nullable.
1243 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
1244 that could correspond to the first character of a string matching the
1245 regexp rooted at the given node.
1246 * EMPTY leaves have empty firstpos.
1247 * The firstpos of a nonempty leaf is that leaf itself.
1248 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1250 * The firstpos of a CAT node is the firstpos of the left argument, union
1251 the firstpos of the right if the left argument is nullable.
1252 * The firstpos of an OR node is the union of firstpos of each argument.
1254 Lastpos: The lastpos of a node is the set of positions that could
1255 correspond to the last character of a string matching the regexp at
1257 * EMPTY leaves have empty lastpos.
1258 * The lastpos of a nonempty leaf is that leaf itself.
1259 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1261 * The lastpos of a CAT node is the lastpos of its right argument, union
1262 the lastpos of the left if the right argument is nullable.
1263 * The lastpos of an OR node is the union of the lastpos of each argument.
1265 Follow: The follow of a position is the set of positions that could
1266 correspond to the character following a character matching the node in
1267 a string matching the regexp. At this point we consider special symbols
1268 that match the empty string in some context to be just normal characters.
1269 Later, if we find that a special symbol is in a follow set, we will
1270 replace it with the elements of its follow, labeled with an appropriate
1272 * Every node in the firstpos of the argument of a STAR or PLUS node is in
1273 the follow of every node in the lastpos.
1274 * Every node in the firstpos of the second argument of a CAT node is in
1275 the follow of every node in the lastpos of the first argument.
1277 Because of the postfix representation of the parse tree, the depth-first
1278 analysis is conveniently done by a linear scan with the aid of a stack.
1279 Sets are stored as arrays of the elements, obeying a stack-like allocation
1280 scheme; the number of elements in each set deeper in the stack can be
1281 used to determine the address of a particular set's array. */
1283 dfaanalyze (struct dfa
*d
, int searchflag
)
1285 int *nullable
; /* Nullable stack. */
1286 int *nfirstpos
; /* Element count stack for firstpos sets. */
1287 position
*firstpos
; /* Array where firstpos elements are stored. */
1288 int *nlastpos
; /* Element count stack for lastpos sets. */
1289 position
*lastpos
; /* Array where lastpos elements are stored. */
1290 int *nalloc
; /* Sizes of arrays allocated to follow sets. */
1291 position_set tmp
; /* Temporary set for merging sets. */
1292 position_set merged
; /* Result of merging sets. */
1293 int wants_newline
; /* True if some position wants newline info. */
1295 int *o_nfirst
, *o_nlast
;
1296 position
*o_firstpos
, *o_lastpos
;
1301 fprintf(stderr
, "dfaanalyze:\n");
1302 for (i
= 0; i
< d
->tindex
; ++i
)
1304 fprintf(stderr
, " %d:", i
);
1305 prtok(d
->tokens
[i
]);
1310 d
->searchflag
= searchflag
;
1312 MALLOC(nullable
, int, d
->depth
);
1313 o_nullable
= nullable
;
1314 MALLOC(nfirstpos
, int, d
->depth
);
1315 o_nfirst
= nfirstpos
;
1316 MALLOC(firstpos
, position
, d
->nleaves
);
1317 o_firstpos
= firstpos
, firstpos
+= d
->nleaves
;
1318 MALLOC(nlastpos
, int, d
->depth
);
1320 MALLOC(lastpos
, position
, d
->nleaves
);
1321 o_lastpos
= lastpos
, lastpos
+= d
->nleaves
;
1322 MALLOC(nalloc
, int, d
->tindex
);
1323 for (i
= 0; i
< d
->tindex
; ++i
)
1325 MALLOC(merged
.elems
, position
, d
->nleaves
);
1327 CALLOC(d
->follows
, position_set
, d
->tindex
);
1329 for (i
= 0; i
< d
->tindex
; ++i
)
1331 { /* Nonsyntactic #ifdef goo... */
1333 switch (d
->tokens
[i
])
1336 /* The empty set is nullable. */
1339 /* The firstpos and lastpos of the empty leaf are both empty. */
1340 *nfirstpos
++ = *nlastpos
++ = 0;
1345 /* Every element in the firstpos of the argument is in the follow
1346 of every element in the lastpos. */
1347 tmp
.nelem
= nfirstpos
[-1];
1348 tmp
.elems
= firstpos
;
1350 for (j
= 0; j
< nlastpos
[-1]; ++j
)
1352 merge(&tmp
, &d
->follows
[pos
[j
].index
], &merged
);
1353 REALLOC_IF_NECESSARY(d
->follows
[pos
[j
].index
].elems
, position
,
1354 nalloc
[pos
[j
].index
], merged
.nelem
- 1);
1355 copy(&merged
, &d
->follows
[pos
[j
].index
]);
1359 /* A QMARK or STAR node is automatically nullable. */
1360 if (d
->tokens
[i
] != PLUS
)
1365 /* Every element in the firstpos of the second argument is in the
1366 follow of every element in the lastpos of the first argument. */
1367 tmp
.nelem
= nfirstpos
[-1];
1368 tmp
.elems
= firstpos
;
1369 pos
= lastpos
+ nlastpos
[-1];
1370 for (j
= 0; j
< nlastpos
[-2]; ++j
)
1372 merge(&tmp
, &d
->follows
[pos
[j
].index
], &merged
);
1373 REALLOC_IF_NECESSARY(d
->follows
[pos
[j
].index
].elems
, position
,
1374 nalloc
[pos
[j
].index
], merged
.nelem
- 1);
1375 copy(&merged
, &d
->follows
[pos
[j
].index
]);
1378 /* The firstpos of a CAT node is the firstpos of the first argument,
1379 union that of the second argument if the first is nullable. */
1381 nfirstpos
[-2] += nfirstpos
[-1];
1383 firstpos
+= nfirstpos
[-1];
1386 /* The lastpos of a CAT node is the lastpos of the second argument,
1387 union that of the first argument if the second is nullable. */
1389 nlastpos
[-2] += nlastpos
[-1];
1392 pos
= lastpos
+ nlastpos
[-2];
1393 for (j
= nlastpos
[-1] - 1; j
>= 0; --j
)
1394 pos
[j
] = lastpos
[j
];
1395 lastpos
+= nlastpos
[-2];
1396 nlastpos
[-2] = nlastpos
[-1];
1400 /* A CAT node is nullable if both arguments are nullable. */
1401 nullable
[-2] = nullable
[-1] && nullable
[-2];
1407 /* The firstpos is the union of the firstpos of each argument. */
1408 nfirstpos
[-2] += nfirstpos
[-1];
1411 /* The lastpos is the union of the lastpos of each argument. */
1412 nlastpos
[-2] += nlastpos
[-1];
1415 /* An OR node is nullable if either argument is nullable. */
1416 nullable
[-2] = nullable
[-1] || nullable
[-2];
1421 /* Anything else is a nonempty position. (Note that special
1422 constructs like \< are treated as nonempty strings here;
1423 an "epsilon closure" effectively makes them nullable later.
1424 Backreferences have to get a real position so we can detect
1425 transitions on them later. But they are nullable. */
1426 *nullable
++ = d
->tokens
[i
] == BACKREF
;
1428 /* This position is in its own firstpos and lastpos. */
1429 *nfirstpos
++ = *nlastpos
++ = 1;
1430 --firstpos
, --lastpos
;
1431 firstpos
->index
= lastpos
->index
= i
;
1432 firstpos
->constraint
= lastpos
->constraint
= NO_CONSTRAINT
;
1434 /* Allocate the follow set for this position. */
1436 MALLOC(d
->follows
[i
].elems
, position
, nalloc
[i
]);
1440 /* ... balance the above nonsyntactic #ifdef goo... */
1441 fprintf(stderr
, "node %d:", i
);
1442 prtok(d
->tokens
[i
]);
1444 fprintf(stderr
, nullable
[-1] ? " nullable: yes\n" : " nullable: no\n");
1445 fprintf(stderr
, " firstpos:");
1446 for (j
= nfirstpos
[-1] - 1; j
>= 0; --j
)
1448 fprintf(stderr
, " %d:", firstpos
[j
].index
);
1449 prtok(d
->tokens
[firstpos
[j
].index
]);
1451 fprintf(stderr
, "\n lastpos:");
1452 for (j
= nlastpos
[-1] - 1; j
>= 0; --j
)
1454 fprintf(stderr
, " %d:", lastpos
[j
].index
);
1455 prtok(d
->tokens
[lastpos
[j
].index
]);
1461 /* For each follow set that is the follow set of a real position, replace
1462 it with its epsilon closure. */
1463 for (i
= 0; i
< d
->tindex
; ++i
)
1464 if (d
->tokens
[i
] < NOTCHAR
|| d
->tokens
[i
] == BACKREF
1465 || d
->tokens
[i
] >= CSET
)
1468 fprintf(stderr
, "follows(%d:", i
);
1469 prtok(d
->tokens
[i
]);
1470 fprintf(stderr
, "):");
1471 for (j
= d
->follows
[i
].nelem
- 1; j
>= 0; --j
)
1473 fprintf(stderr
, " %d:", d
->follows
[i
].elems
[j
].index
);
1474 prtok(d
->tokens
[d
->follows
[i
].elems
[j
].index
]);
1478 copy(&d
->follows
[i
], &merged
);
1479 epsclosure(&merged
, d
);
1480 if (d
->follows
[i
].nelem
< merged
.nelem
)
1481 REALLOC(d
->follows
[i
].elems
, position
, merged
.nelem
);
1482 copy(&merged
, &d
->follows
[i
]);
1485 /* Get the epsilon closure of the firstpos of the regexp. The result will
1486 be the set of positions of state 0. */
1488 for (i
= 0; i
< nfirstpos
[-1]; ++i
)
1489 insert(firstpos
[i
], &merged
);
1490 epsclosure(&merged
, d
);
1492 /* Check if any of the positions of state 0 will want newline context. */
1494 for (i
= 0; i
< merged
.nelem
; ++i
)
1495 if (PREV_NEWLINE_DEPENDENT(merged
.elems
[i
].constraint
))
1498 /* Build the initial state. */
1501 MALLOC(d
->states
, dfa_state
, d
->salloc
);
1502 state_index(d
, &merged
, wants_newline
, 0);
1513 /* Find, for each character, the transition out of state s of d, and store
1514 it in the appropriate slot of trans.
1516 We divide the positions of s into groups (positions can appear in more
1517 than one group). Each group is labeled with a set of characters that
1518 every position in the group matches (taking into account, if necessary,
1519 preceding context information of s). For each group, find the union
1520 of the its elements' follows. This set is the set of positions of the
1521 new state. For each character in the group's label, set the transition
1522 on this character to be to a state corresponding to the set's positions,
1523 and its associated backward context information, if necessary.
1525 If we are building a searching matcher, we include the positions of state
1528 The collection of groups is constructed by building an equivalence-class
1529 partition of the positions of s.
1531 For each position, find the set of characters C that it matches. Eliminate
1532 any characters from C that fail on grounds of backward context.
1534 Search through the groups, looking for a group whose label L has nonempty
1535 intersection with C. If L - C is nonempty, create a new group labeled
1536 L - C and having the same positions as the current group, and set L to
1537 the intersection of L and C. Insert the position in this group, set
1538 C = C - L, and resume scanning.
1540 If after comparing with every group there are characters remaining in C,
1541 create a new group labeled with the characters of C and insert this
1542 position in that group. */
1544 dfastate (int s
, struct dfa
*d
, int trans
[])
1546 position_set grps
[NOTCHAR
]; /* As many as will ever be needed. */
1547 charclass labels
[NOTCHAR
]; /* Labels corresponding to the groups. */
1548 int ngrps
= 0; /* Number of groups actually used. */
1549 position pos
; /* Current position being considered. */
1550 charclass matches
; /* Set of matching characters. */
1551 int matchesf
; /* True if matches is nonempty. */
1552 charclass intersect
; /* Intersection with some label set. */
1553 int intersectf
; /* True if intersect is nonempty. */
1554 charclass leftovers
; /* Stuff in the label that didn't match. */
1555 int leftoversf
; /* True if leftovers is nonempty. */
1556 static charclass letters
; /* Set of characters considered letters. */
1557 static charclass newline
; /* Set of characters that aren't newline. */
1558 position_set follows
; /* Union of the follows of some group. */
1559 position_set tmp
; /* Temporary space for merging sets. */
1560 int state
; /* New state. */
1561 int wants_newline
; /* New state wants to know newline context. */
1562 int state_newline
; /* New state on a newline transition. */
1563 int wants_letter
; /* New state wants to know letter context. */
1564 int state_letter
; /* New state on a letter transition. */
1565 static int initialized
; /* Flag for static initialization. */
1568 /* Initialize the set of letters, if necessary. */
1572 for (i
= 0; i
< NOTCHAR
; ++i
)
1573 if (IS_WORD_CONSTITUENT(i
))
1575 setbit(eolbyte
, newline
);
1580 for (i
= 0; i
< d
->states
[s
].elems
.nelem
; ++i
)
1582 pos
= d
->states
[s
].elems
.elems
[i
];
1583 if (d
->tokens
[pos
.index
] >= 0 && d
->tokens
[pos
.index
] < NOTCHAR
)
1584 setbit(d
->tokens
[pos
.index
], matches
);
1585 else if (d
->tokens
[pos
.index
] >= CSET
)
1586 copyset(d
->charclasses
[d
->tokens
[pos
.index
] - CSET
], matches
);
1590 /* Some characters may need to be eliminated from matches because
1591 they fail in the current context. */
1592 if (pos
.constraint
!= 0xFF)
1594 if (! MATCHES_NEWLINE_CONTEXT(pos
.constraint
,
1595 d
->states
[s
].newline
, 1))
1596 clrbit(eolbyte
, matches
);
1597 if (! MATCHES_NEWLINE_CONTEXT(pos
.constraint
,
1598 d
->states
[s
].newline
, 0))
1599 for (j
= 0; j
< CHARCLASS_INTS
; ++j
)
1600 matches
[j
] &= newline
[j
];
1601 if (! MATCHES_LETTER_CONTEXT(pos
.constraint
,
1602 d
->states
[s
].letter
, 1))
1603 for (j
= 0; j
< CHARCLASS_INTS
; ++j
)
1604 matches
[j
] &= ~letters
[j
];
1605 if (! MATCHES_LETTER_CONTEXT(pos
.constraint
,
1606 d
->states
[s
].letter
, 0))
1607 for (j
= 0; j
< CHARCLASS_INTS
; ++j
)
1608 matches
[j
] &= letters
[j
];
1610 /* If there are no characters left, there's no point in going on. */
1611 for (j
= 0; j
< CHARCLASS_INTS
&& !matches
[j
]; ++j
)
1613 if (j
== CHARCLASS_INTS
)
1617 for (j
= 0; j
< ngrps
; ++j
)
1619 /* If matches contains a single character only, and the current
1620 group's label doesn't contain that character, go on to the
1622 if (d
->tokens
[pos
.index
] >= 0 && d
->tokens
[pos
.index
] < NOTCHAR
1623 && !tstbit(d
->tokens
[pos
.index
], labels
[j
]))
1626 /* Check if this group's label has a nonempty intersection with
1629 for (k
= 0; k
< CHARCLASS_INTS
; ++k
)
1630 (intersect
[k
] = matches
[k
] & labels
[j
][k
]) ? (intersectf
= 1) : 0;
1634 /* It does; now find the set differences both ways. */
1635 leftoversf
= matchesf
= 0;
1636 for (k
= 0; k
< CHARCLASS_INTS
; ++k
)
1638 /* Even an optimizing compiler can't know this for sure. */
1639 int match
= matches
[k
], label
= labels
[j
][k
];
1641 (leftovers
[k
] = ~match
& label
) ? (leftoversf
= 1) : 0;
1642 (matches
[k
] = match
& ~label
) ? (matchesf
= 1) : 0;
1645 /* If there were leftovers, create a new group labeled with them. */
1648 copyset(leftovers
, labels
[ngrps
]);
1649 copyset(intersect
, labels
[j
]);
1650 MALLOC(grps
[ngrps
].elems
, position
, d
->nleaves
);
1651 copy(&grps
[j
], &grps
[ngrps
]);
1655 /* Put the position in the current group. Note that there is no
1656 reason to call insert() here. */
1657 grps
[j
].elems
[grps
[j
].nelem
++] = pos
;
1659 /* If every character matching the current position has been
1660 accounted for, we're done. */
1665 /* If we've passed the last group, and there are still characters
1666 unaccounted for, then we'll have to create a new group. */
1669 copyset(matches
, labels
[ngrps
]);
1671 MALLOC(grps
[ngrps
].elems
, position
, d
->nleaves
);
1672 grps
[ngrps
].nelem
= 1;
1673 grps
[ngrps
].elems
[0] = pos
;
1678 MALLOC(follows
.elems
, position
, d
->nleaves
);
1679 MALLOC(tmp
.elems
, position
, d
->nleaves
);
1681 /* If we are a searching matcher, the default transition is to a state
1682 containing the positions of state 0, otherwise the default transition
1683 is to fail miserably. */
1688 for (i
= 0; i
< d
->states
[0].elems
.nelem
; ++i
)
1690 if (PREV_NEWLINE_DEPENDENT(d
->states
[0].elems
.elems
[i
].constraint
))
1692 if (PREV_LETTER_DEPENDENT(d
->states
[0].elems
.elems
[i
].constraint
))
1695 copy(&d
->states
[0].elems
, &follows
);
1696 state
= state_index(d
, &follows
, 0, 0);
1698 state_newline
= state_index(d
, &follows
, 1, 0);
1700 state_newline
= state
;
1702 state_letter
= state_index(d
, &follows
, 0, 1);
1704 state_letter
= state
;
1705 for (i
= 0; i
< NOTCHAR
; ++i
)
1706 trans
[i
] = (IS_WORD_CONSTITUENT(i
)) ? state_letter
: state
;
1707 trans
[eolbyte
] = state_newline
;
1710 for (i
= 0; i
< NOTCHAR
; ++i
)
1713 for (i
= 0; i
< ngrps
; ++i
)
1717 /* Find the union of the follows of the positions of the group.
1718 This is a hideously inefficient loop. Fix it someday. */
1719 for (j
= 0; j
< grps
[i
].nelem
; ++j
)
1720 for (k
= 0; k
< d
->follows
[grps
[i
].elems
[j
].index
].nelem
; ++k
)
1721 insert(d
->follows
[grps
[i
].elems
[j
].index
].elems
[k
], &follows
);
1723 /* If we are building a searching matcher, throw in the positions
1724 of state 0 as well. */
1726 for (j
= 0; j
< d
->states
[0].elems
.nelem
; ++j
)
1727 insert(d
->states
[0].elems
.elems
[j
], &follows
);
1729 /* Find out if the new state will want any context information. */
1731 if (tstbit(eolbyte
, labels
[i
]))
1732 for (j
= 0; j
< follows
.nelem
; ++j
)
1733 if (PREV_NEWLINE_DEPENDENT(follows
.elems
[j
].constraint
))
1737 for (j
= 0; j
< CHARCLASS_INTS
; ++j
)
1738 if (labels
[i
][j
] & letters
[j
])
1740 if (j
< CHARCLASS_INTS
)
1741 for (j
= 0; j
< follows
.nelem
; ++j
)
1742 if (PREV_LETTER_DEPENDENT(follows
.elems
[j
].constraint
))
1745 /* Find the state(s) corresponding to the union of the follows. */
1746 state
= state_index(d
, &follows
, 0, 0);
1748 state_newline
= state_index(d
, &follows
, 1, 0);
1750 state_newline
= state
;
1752 state_letter
= state_index(d
, &follows
, 0, 1);
1754 state_letter
= state
;
1756 /* Set the transitions for each character in the current label. */
1757 for (j
= 0; j
< CHARCLASS_INTS
; ++j
)
1758 for (k
= 0; k
< INTBITS
; ++k
)
1759 if (labels
[i
][j
] & 1 << k
)
1761 int c
= j
* INTBITS
+ k
;
1764 trans
[c
] = state_newline
;
1765 else if (IS_WORD_CONSTITUENT(c
))
1766 trans
[c
] = state_letter
;
1767 else if (c
< NOTCHAR
)
1772 for (i
= 0; i
< ngrps
; ++i
)
1773 free(grps
[i
].elems
);
1774 free(follows
.elems
);
1778 /* Some routines for manipulating a compiled dfa's transition tables.
1779 Each state may or may not have a transition table; if it does, and it
1780 is a non-accepting state, then d->trans[state] points to its table.
1781 If it is an accepting state then d->fails[state] points to its table.
1782 If it has no table at all, then d->trans[state] is NULL.
1783 TODO: Improve this comment, get rid of the unnecessary redundancy. */
1786 build_state (int s
, struct dfa
*d
)
1788 int *trans
; /* The new transition table. */
1791 /* Set an upper limit on the number of transition tables that will ever
1792 exist at once. 1024 is arbitrary. The idea is that the frequently
1793 used transition tables will be quickly rebuilt, whereas the ones that
1794 were only needed once or twice will be cleared away. */
1795 if (d
->trcount
>= 1024)
1797 for (i
= 0; i
< d
->tralloc
; ++i
)
1800 free((ptr_t
) d
->trans
[i
]);
1803 else if (d
->fails
[i
])
1805 free((ptr_t
) d
->fails
[i
]);
1813 /* Set up the success bits for this state. */
1815 if (ACCEPTS_IN_CONTEXT(d
->states
[s
].newline
, 1, d
->states
[s
].letter
, 0,
1818 if (ACCEPTS_IN_CONTEXT(d
->states
[s
].newline
, 0, d
->states
[s
].letter
, 1,
1821 if (ACCEPTS_IN_CONTEXT(d
->states
[s
].newline
, 0, d
->states
[s
].letter
, 0,
1825 MALLOC(trans
, int, NOTCHAR
);
1826 dfastate(s
, d
, trans
);
1828 /* Now go through the new transition table, and make sure that the trans
1829 and fail arrays are allocated large enough to hold a pointer for the
1830 largest state mentioned in the table. */
1831 for (i
= 0; i
< NOTCHAR
; ++i
)
1832 if (trans
[i
] >= d
->tralloc
)
1834 int oldalloc
= d
->tralloc
;
1836 while (trans
[i
] >= d
->tralloc
)
1838 REALLOC(d
->realtrans
, int *, d
->tralloc
+ 1);
1839 d
->trans
= d
->realtrans
+ 1;
1840 REALLOC(d
->fails
, int *, d
->tralloc
);
1841 REALLOC(d
->success
, int, d
->tralloc
);
1842 REALLOC(d
->newlines
, int, d
->tralloc
);
1843 while (oldalloc
< d
->tralloc
)
1845 d
->trans
[oldalloc
] = NULL
;
1846 d
->fails
[oldalloc
++] = NULL
;
1850 /* Keep the newline transition in a special place so we can use it as
1852 d
->newlines
[s
] = trans
[eolbyte
];
1853 trans
[eolbyte
] = -1;
1855 if (ACCEPTING(s
, *d
))
1856 d
->fails
[s
] = trans
;
1858 d
->trans
[s
] = trans
;
1862 build_state_zero (struct dfa
*d
)
1866 CALLOC(d
->realtrans
, int *, d
->tralloc
+ 1);
1867 d
->trans
= d
->realtrans
+ 1;
1868 CALLOC(d
->fails
, int *, d
->tralloc
);
1869 MALLOC(d
->success
, int, d
->tralloc
);
1870 MALLOC(d
->newlines
, int, d
->tralloc
);
1874 /* Search through a buffer looking for a match to the given struct dfa.
1875 Find the first occurrence of a string matching the regexp in the buffer,
1876 and the shortest possible version thereof. Return a pointer to the first
1877 character after the match, or NULL if none is found. Begin points to
1878 the beginning of the buffer, and end points to the first character after
1879 its end. We store a newline in *end to act as a sentinel, so end had
1880 better point somewhere valid. Newline is a flag indicating whether to
1881 allow newlines to be in the matching string. If count is non-
1882 NULL it points to a place we're supposed to increment every time we
1883 see a newline. Finally, if backref is non-NULL it points to a place
1884 where we're supposed to store a 1 if backreferencing happened and the
1885 match needs to be verified by a backtracking matcher. Otherwise
1886 we store a 0 in *backref. */
1888 dfaexec (struct dfa
*d
, char *begin
, char *end
,
1889 int newline
, int *count
, int *backref
)
1891 register int s
, s1
, tmp
; /* Current state. */
1892 register unsigned char *p
; /* Current input character. */
1893 register int **trans
, *t
; /* Copy of d->trans so it can be optimized
1895 register unsigned char eol
= eolbyte
; /* Likewise for eolbyte. */
1896 static int sbit
[NOTCHAR
]; /* Table for anding with d->success. */
1897 static int sbit_init
;
1904 for (i
= 0; i
< NOTCHAR
; ++i
)
1905 sbit
[i
] = (IS_WORD_CONSTITUENT(i
)) ? 2 : 1;
1910 build_state_zero(d
);
1913 p
= (unsigned char *) begin
;
1919 while ((t
= trans
[s
]) != 0) { /* hand-optimized loop */
1921 if ((t
= trans
[s1
]) == 0) {
1922 tmp
= s
; s
= s1
; s1
= tmp
; /* swap */
1928 if (s
>= 0 && p
<= (unsigned char *) end
&& d
->fails
[s
])
1930 if (d
->success
[s
] & sbit
[*p
])
1933 *backref
= (d
->states
[s
].backref
!= 0);
1938 s
= d
->fails
[s
][*p
++];
1942 /* If the previous character was a newline, count it. */
1943 if (count
&& (char *) p
<= end
&& p
[-1] == eol
)
1946 /* Check if we've run off the end of the buffer. */
1947 if ((char *) p
> end
)
1957 if (p
[-1] == eol
&& newline
)
1959 s
= d
->newlines
[s1
];
1967 /* Initialize the components of a dfa that the other routines don't
1968 initialize for themselves. */
1970 dfainit (struct dfa
*d
)
1973 MALLOC(d
->charclasses
, charclass
, d
->calloc
);
1977 MALLOC(d
->tokens
, token
, d
->talloc
);
1978 d
->tindex
= d
->depth
= d
->nleaves
= d
->nregexps
= 0;
1986 /* Parse and analyze a single string of the given length. */
1988 dfacomp (char *s
, size_t len
, struct dfa
*d
, int searchflag
)
1990 if (case_fold
) /* dummy folding in service of dfamust() */
1995 lcopy
= malloc(len
);
1997 dfaerror(_("out of memory"));
1999 /* This is a kludge. */
2001 for (i
= 0; i
< len
; ++i
)
2002 if (ISUPPER ((unsigned char) s
[i
]))
2003 lcopy
[i
] = tolower ((unsigned char) s
[i
]);
2008 dfaparse(lcopy
, len
, d
);
2011 d
->cindex
= d
->tindex
= d
->depth
= d
->nleaves
= d
->nregexps
= 0;
2013 dfaparse(s
, len
, d
);
2014 dfaanalyze(d
, searchflag
);
2019 dfaparse(s
, len
, d
);
2021 dfaanalyze(d
, searchflag
);
2025 /* Free the storage held by the components of a dfa. */
2027 dfafree (struct dfa
*d
)
2030 struct dfamust
*dm
, *ndm
;
2032 free((ptr_t
) d
->charclasses
);
2033 free((ptr_t
) d
->tokens
);
2034 for (i
= 0; i
< d
->sindex
; ++i
)
2035 free((ptr_t
) d
->states
[i
].elems
.elems
);
2036 free((ptr_t
) d
->states
);
2037 for (i
= 0; i
< d
->tindex
; ++i
)
2038 if (d
->follows
[i
].elems
)
2039 free((ptr_t
) d
->follows
[i
].elems
);
2040 free((ptr_t
) d
->follows
);
2041 for (i
= 0; i
< d
->tralloc
; ++i
)
2043 free((ptr_t
) d
->trans
[i
]);
2044 else if (d
->fails
[i
])
2045 free((ptr_t
) d
->fails
[i
]);
2046 if (d
->realtrans
) free((ptr_t
) d
->realtrans
);
2047 if (d
->fails
) free((ptr_t
) d
->fails
);
2048 if (d
->newlines
) free((ptr_t
) d
->newlines
);
2049 if (d
->success
) free((ptr_t
) d
->success
);
2050 for (dm
= d
->musts
; dm
; dm
= ndm
)
2058 /* Having found the postfix representation of the regular expression,
2059 try to find a long sequence of characters that must appear in any line
2061 Finding a "longest" sequence is beyond the scope here;
2062 we take an easy way out and hope for the best.
2063 (Take "(ab|a)b"--please.)
2065 We do a bottom-up calculation of sequences of characters that must appear
2066 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
2068 sequences that must appear at the left of the match ("left")
2069 sequences that must appear at the right of the match ("right")
2070 lists of sequences that must appear somewhere in the match ("in")
2071 sequences that must constitute the match ("is")
2073 When we get to the root of the tree, we use one of the longest of its
2074 calculated "in" sequences as our answer. The sequence we find is returned in
2075 d->must (where "d" is the single argument passed to "dfamust");
2076 the length of the sequence is returned in d->mustn.
2078 The sequences calculated for the various types of node (in pseudo ANSI c)
2079 are shown below. "p" is the operand of unary operators (and the left-hand
2080 operand of binary operators); "q" is the right-hand operand of binary
2083 "ZERO" means "a zero-length sequence" below.
2085 Type left right is in
2086 ---- ---- ----- -- --
2087 char c # c # c # c # c
2089 CSET ZERO ZERO ZERO ZERO
2091 STAR ZERO ZERO ZERO ZERO
2093 QMARK ZERO ZERO ZERO ZERO
2095 PLUS p->left p->right ZERO p->in
2097 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
2098 p->left : q->right : q->is!=ZERO) ? q->in plus
2099 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
2102 OR longest common longest common (do p->is and substrings common to
2103 leading trailing q->is have same p->in and q->in
2104 (sub)sequence (sub)sequence length and
2105 of p->left of p->right content) ?
2106 and q->left and q->right p->is : NULL
2108 If there's anything else we recognize in the tree, all four sequences get set
2109 to zero-length sequences. If there's something we don't recognize in the tree,
2110 we just return a zero-length sequence.
2112 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
2115 And. . .is it here or someplace that we might ponder "optimizations" such as
2116 egrep 'psi|epsilon' -> egrep 'psi'
2117 egrep 'pepsi|epsilon' -> egrep 'epsi'
2118 (Yes, we now find "epsi" as a "string
2119 that must occur", but we might also
2120 simplify the *entire* r.e. being sought)
2121 grep '[c]' -> grep 'c'
2122 grep '(ab|a)b' -> grep 'ab'
2123 grep 'ab*' -> grep 'a'
2124 grep 'a*b' -> grep 'b'
2126 There are several issues:
2128 Is optimization easy (enough)?
2130 Does optimization actually accomplish anything,
2131 or is the automaton you get from "psi|epsilon" (for example)
2132 the same as the one you get from "psi" (for example)?
2134 Are optimizable r.e.'s likely to be used in real-life situations
2135 (something like 'ab*' is probably unlikely; something like is
2136 'psi|epsilon' is likelier)? */
2139 icatalloc (char *old
, char *new)
2142 size_t oldsize
, newsize
;
2144 newsize
= (new == NULL
) ? 0 : strlen(new);
2147 else if (newsize
== 0)
2149 else oldsize
= strlen(old
);
2151 result
= (char *) malloc(newsize
+ 1);
2153 result
= (char *) realloc((void *) old
, oldsize
+ newsize
+ 1);
2154 if (result
!= NULL
&& new != NULL
)
2155 (void) strcpy(result
+ oldsize
, new);
2160 icpyalloc (char *string
)
2162 return icatalloc((char *) NULL
, string
);
2166 istrstr (char *lookin
, char *lookfor
)
2171 len
= strlen(lookfor
);
2172 for (cp
= lookin
; *cp
!= '\0'; ++cp
)
2173 if (strncmp(cp
, lookfor
, len
) == 0)
2186 freelist (char **cpp
)
2192 for (i
= 0; cpp
[i
] != NULL
; ++i
)
2200 enlist (char **cpp
, char *new, size_t len
)
2206 if ((new = icpyalloc(new)) == NULL
)
2212 /* Is there already something in the list that's new (or longer)? */
2213 for (i
= 0; cpp
[i
] != NULL
; ++i
)
2214 if (istrstr(cpp
[i
], new) != NULL
)
2219 /* Eliminate any obsoleted strings. */
2221 while (cpp
[j
] != NULL
)
2222 if (istrstr(new, cpp
[j
]) == NULL
)
2232 /* Add the new string. */
2233 cpp
= (char **) realloc((char *) cpp
, (i
+ 2) * sizeof *cpp
);
2241 /* Given pointers to two strings, return a pointer to an allocated
2242 list of their distinct common substrings. Return NULL if something
2245 comsubs (char *left
, char *right
)
2252 if (left
== NULL
|| right
== NULL
)
2254 cpp
= (char **) malloc(sizeof *cpp
);
2258 for (lcp
= left
; *lcp
!= '\0'; ++lcp
)
2261 rcp
= index(right
, *lcp
);
2264 for (i
= 1; lcp
[i
] != '\0' && lcp
[i
] == rcp
[i
]; ++i
)
2268 rcp
= index(rcp
+ 1, *lcp
);
2272 if ((cpp
= enlist(cpp
, lcp
, len
)) == NULL
)
2279 addlists (char **old
, char **new)
2283 if (old
== NULL
|| new == NULL
)
2285 for (i
= 0; new[i
] != NULL
; ++i
)
2287 old
= enlist(old
, new[i
], strlen(new[i
]));
2294 /* Given two lists of substrings, return a new list giving substrings
2297 inboth (char **left
, char **right
)
2303 if (left
== NULL
|| right
== NULL
)
2305 both
= (char **) malloc(sizeof *both
);
2309 for (lnum
= 0; left
[lnum
] != NULL
; ++lnum
)
2311 for (rnum
= 0; right
[rnum
] != NULL
; ++rnum
)
2313 temp
= comsubs(left
[lnum
], right
[rnum
]);
2319 both
= addlists(both
, temp
);
2338 resetmust (must
*mp
)
2340 mp
->left
[0] = mp
->right
[0] = mp
->is
[0] = '\0';
2345 dfamust (struct dfa
*dfa
)
2356 static char empty_string
[] = "";
2358 result
= empty_string
;
2360 musts
= (must
*) malloc((dfa
->tindex
+ 1) * sizeof *musts
);
2364 for (i
= 0; i
<= dfa
->tindex
; ++i
)
2366 for (i
= 0; i
<= dfa
->tindex
; ++i
)
2368 mp
[i
].in
= (char **) malloc(sizeof *mp
[i
].in
);
2369 mp
[i
].left
= malloc(2);
2370 mp
[i
].right
= malloc(2);
2371 mp
[i
].is
= malloc(2);
2372 if (mp
[i
].in
== NULL
|| mp
[i
].left
== NULL
||
2373 mp
[i
].right
== NULL
|| mp
[i
].is
== NULL
)
2375 mp
[i
].left
[0] = mp
[i
].right
[0] = mp
[i
].is
[0] = '\0';
2379 fprintf(stderr
, "dfamust:\n");
2380 for (i
= 0; i
< dfa
->tindex
; ++i
)
2382 fprintf(stderr
, " %d:", i
);
2383 prtok(dfa
->tokens
[i
]);
2387 for (ri
= 0; ri
< dfa
->tindex
; ++ri
)
2389 switch (t
= dfa
->tokens
[ri
])
2393 goto done
; /* "cannot happen" */
2407 goto done
; /* "cannot happen" */
2414 goto done
; /* "cannot happen" */
2423 /* Guaranteed to be. Unlikely, but. . . */
2424 if (strcmp(lmp
->is
, rmp
->is
) != 0)
2426 /* Left side--easy */
2428 while (lmp
->left
[i
] != '\0' && lmp
->left
[i
] == rmp
->left
[i
])
2430 lmp
->left
[i
] = '\0';
2432 ln
= strlen(lmp
->right
);
2433 rn
= strlen(rmp
->right
);
2437 for (i
= 0; i
< n
; ++i
)
2438 if (lmp
->right
[ln
- i
- 1] != rmp
->right
[rn
- i
- 1])
2440 for (j
= 0; j
< i
; ++j
)
2441 lmp
->right
[j
] = lmp
->right
[(ln
- i
) + j
];
2442 lmp
->right
[j
] = '\0';
2443 new = inboth(lmp
->in
, rmp
->in
);
2447 free((char *) lmp
->in
);
2453 goto done
; /* "cannot happen" */
2458 if (mp
!= &musts
[1])
2459 goto done
; /* "cannot happen" */
2460 for (i
= 0; musts
[0].in
[i
] != NULL
; ++i
)
2461 if (strlen(musts
[0].in
[i
]) > strlen(result
))
2462 result
= musts
[0].in
[i
];
2463 if (strcmp(result
, musts
[0].is
) == 0)
2468 goto done
; /* "cannot happen" */
2475 /* In. Everything in left, plus everything in
2476 right, plus catenation of
2477 left's right and right's left. */
2478 lmp
->in
= addlists(lmp
->in
, rmp
->in
);
2479 if (lmp
->in
== NULL
)
2481 if (lmp
->right
[0] != '\0' &&
2482 rmp
->left
[0] != '\0')
2486 tp
= icpyalloc(lmp
->right
);
2489 tp
= icatalloc(tp
, rmp
->left
);
2492 lmp
->in
= enlist(lmp
->in
, tp
,
2495 if (lmp
->in
== NULL
)
2499 if (lmp
->is
[0] != '\0')
2501 lmp
->left
= icatalloc(lmp
->left
,
2503 if (lmp
->left
== NULL
)
2507 if (rmp
->is
[0] == '\0')
2508 lmp
->right
[0] = '\0';
2509 lmp
->right
= icatalloc(lmp
->right
, rmp
->right
);
2510 if (lmp
->right
== NULL
)
2512 /* Guaranteed to be */
2513 if (lmp
->is
[0] != '\0' && rmp
->is
[0] != '\0')
2515 lmp
->is
= icatalloc(lmp
->is
, rmp
->is
);
2516 if (lmp
->is
== NULL
)
2526 /* "cannot happen" */
2531 /* not on *my* shift */
2541 /* plain character */
2543 mp
->is
[0] = mp
->left
[0] = mp
->right
[0] = t
;
2544 mp
->is
[1] = mp
->left
[1] = mp
->right
[1] = '\0';
2545 mp
->in
= enlist(mp
->in
, mp
->is
, (size_t)1);
2552 fprintf(stderr
, " node: %d:", ri
);
2553 prtok(dfa
->tokens
[ri
]);
2554 fprintf(stderr
, "\n in:");
2555 for (i
= 0; mp
->in
[i
]; ++i
)
2556 fprintf(stderr
, " \"%s\"", mp
->in
[i
]);
2557 fprintf(stderr
, "\n is: \"%s\"\n", mp
->is
);
2558 fprintf(stderr
, " left: \"%s\"\n", mp
->left
);
2559 fprintf(stderr
, " right: \"%s\"\n", mp
->right
);
2566 dm
= (struct dfamust
*) malloc(sizeof (struct dfamust
));
2568 dm
->must
= malloc(strlen(result
) + 1);
2569 strcpy(dm
->must
, result
);
2570 dm
->next
= dfa
->musts
;
2574 for (i
= 0; i
<= dfa
->tindex
; ++i
)
2577 ifree((char *) mp
[i
].in
);