HAMMER 61F2/Many: Fix bug in last commit
[dragonfly.git] / gnu / usr.bin / grep / dfa.c
blobbf2c2009b6f315cab2dc60fe9ef780de3eaa37fc
1 /* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */
18 /* Written June, 1988 by Mike Haertel
19 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
21 /* $FreeBSD: src/gnu/usr.bin/grep/dfa.c,v 1.12 2000/01/31 13:28:08 ru Exp $ */
22 /* $DragonFly: src/gnu/usr.bin/grep/dfa.c,v 1.3 2004/02/03 19:22:57 dillon Exp $ */
24 #ifdef HAVE_CONFIG_H
25 #include <config.h>
26 #endif
28 #include <assert.h>
29 #include <ctype.h>
30 #include <stdio.h>
32 #include <sys/types.h>
33 #ifdef STDC_HEADERS
34 #include <stdlib.h>
35 #else
36 extern char *calloc(), *malloc(), *realloc();
37 extern void free();
38 #endif
40 #if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
41 #include <string.h>
42 #undef index
43 #define index strchr
44 #else
45 #include <strings.h>
46 #endif
48 #ifndef DEBUG /* use the same approach as regex.c */
49 #undef assert
50 #define assert(e)
51 #endif /* DEBUG */
53 #ifndef isgraph
54 #define isgraph(C) (isprint(C) && !isspace(C))
55 #endif
57 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
58 #define ISALPHA(C) isalpha(C)
59 #define ISUPPER(C) isupper(C)
60 #define ISLOWER(C) islower(C)
61 #define ISDIGIT(C) isdigit(C)
62 #define ISXDIGIT(C) isxdigit(C)
63 #define ISSPACE(C) isspace(C)
64 #define ISPUNCT(C) ispunct(C)
65 #define ISALNUM(C) isalnum(C)
66 #define ISPRINT(C) isprint(C)
67 #define ISGRAPH(C) isgraph(C)
68 #define ISCNTRL(C) iscntrl(C)
69 #else
70 #define ISALPHA(C) (isascii(C) && isalpha(C))
71 #define ISUPPER(C) (isascii(C) && isupper(C))
72 #define ISLOWER(C) (isascii(C) && islower(C))
73 #define ISDIGIT(C) (isascii(C) && isdigit(C))
74 #define ISXDIGIT(C) (isascii(C) && isxdigit(C))
75 #define ISSPACE(C) (isascii(C) && isspace(C))
76 #define ISPUNCT(C) (isascii(C) && ispunct(C))
77 #define ISALNUM(C) (isascii(C) && isalnum(C))
78 #define ISPRINT(C) (isascii(C) && isprint(C))
79 #define ISGRAPH(C) (isascii(C) && isgraph(C))
80 #define ISCNTRL(C) (isascii(C) && iscntrl(C))
81 #endif
83 /* ISASCIIDIGIT differs from ISDIGIT, as follows:
84 - Its arg may be any int or unsigned int; it need not be an unsigned char.
85 - It's guaranteed to evaluate its argument exactly once.
86 - It's typically faster.
87 Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
88 only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless
89 it's important to use the locale's definition of `digit' even when the
90 host does not conform to Posix. */
91 #define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
93 /* If we (don't) have I18N. */
94 /* glibc defines _ */
95 #ifndef _
96 # ifdef HAVE_LIBINTL_H
97 # include <libintl.h>
98 # ifndef _
99 # define _(Str) gettext (Str)
100 # endif
101 # else
102 # define _(Str) (Str)
103 # endif
104 #endif
106 #include <gnuregex.h>
107 #include "dfa.h"
109 /* HPUX, define those as macros in sys/param.h */
110 #ifdef setbit
111 # undef setbit
112 #endif
113 #ifdef clrbit
114 # undef clrbit
115 #endif
117 static void dfamust PARAMS ((struct dfa *dfa));
119 static ptr_t xcalloc PARAMS ((size_t n, size_t s));
120 static ptr_t xmalloc PARAMS ((size_t n));
121 static ptr_t xrealloc PARAMS ((ptr_t p, size_t n));
122 #ifdef DEBUG
123 static void prtok PARAMS ((token t));
124 #endif
125 static int tstbit PARAMS ((int b, charclass c));
126 static void setbit PARAMS ((int b, charclass c));
127 static void clrbit PARAMS ((int b, charclass c));
128 static void copyset PARAMS ((charclass src, charclass dst));
129 static void zeroset PARAMS ((charclass s));
130 static void notset PARAMS ((charclass s));
131 static int equal PARAMS ((charclass s1, charclass s2));
132 static int charclass_index PARAMS ((charclass s));
133 static int looking_at PARAMS ((const char *s));
134 static token lex PARAMS ((void));
135 static void addtok PARAMS ((token t));
136 static void atom PARAMS ((void));
137 static int nsubtoks PARAMS ((int tindex));
138 static void copytoks PARAMS ((int tindex, int ntokens));
139 static void closure PARAMS ((void));
140 static void branch PARAMS ((void));
141 static void regexp PARAMS ((int toplevel));
142 static void copy PARAMS ((position_set *src, position_set *dst));
143 static void insert PARAMS ((position p, position_set *s));
144 static void merge PARAMS ((position_set *s1, position_set *s2, position_set *m));
145 static void delete PARAMS ((position p, position_set *s));
146 static int state_index PARAMS ((struct dfa *d, position_set *s,
147 int newline, int letter));
148 static void build_state PARAMS ((int s, struct dfa *d));
149 static void build_state_zero PARAMS ((struct dfa *d));
150 static char *icatalloc PARAMS ((char *old, char *new));
151 static char *icpyalloc PARAMS ((char *string));
152 static char *istrstr PARAMS ((char *lookin, char *lookfor));
153 static void ifree PARAMS ((char *cp));
154 static void freelist PARAMS ((char **cpp));
155 static char **enlist PARAMS ((char **cpp, char *new, size_t len));
156 static char **comsubs PARAMS ((char *left, char *right));
157 static char **addlists PARAMS ((char **old, char **new));
158 static char **inboth PARAMS ((char **left, char **right));
160 static ptr_t
161 xcalloc (size_t n, size_t s)
163 ptr_t r = calloc(n, s);
165 if (!r)
166 dfaerror(_("Memory exhausted"));
167 return r;
170 static ptr_t
171 xmalloc (size_t n)
173 ptr_t r = malloc(n);
175 assert(n != 0);
176 if (!r)
177 dfaerror(_("Memory exhausted"));
178 return r;
181 static ptr_t
182 xrealloc (ptr_t p, size_t n)
184 ptr_t r = realloc(p, n);
186 assert(n != 0);
187 if (!r)
188 dfaerror(_("Memory exhausted"));
189 return r;
192 #define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
193 #define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
194 #define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
196 /* Reallocate an array of type t if nalloc is too small for index. */
197 #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \
198 if ((index) >= (nalloc)) \
200 while ((index) >= (nalloc)) \
201 (nalloc) *= 2; \
202 REALLOC(p, t, nalloc); \
205 #ifdef DEBUG
207 static void
208 prtok (token t)
210 char *s;
212 if (t < 0)
213 fprintf(stderr, "END");
214 else if (t < NOTCHAR)
215 fprintf(stderr, "%c", t);
216 else
218 switch (t)
220 case EMPTY: s = "EMPTY"; break;
221 case BACKREF: s = "BACKREF"; break;
222 case BEGLINE: s = "BEGLINE"; break;
223 case ENDLINE: s = "ENDLINE"; break;
224 case BEGWORD: s = "BEGWORD"; break;
225 case ENDWORD: s = "ENDWORD"; break;
226 case LIMWORD: s = "LIMWORD"; break;
227 case NOTLIMWORD: s = "NOTLIMWORD"; break;
228 case QMARK: s = "QMARK"; break;
229 case STAR: s = "STAR"; break;
230 case PLUS: s = "PLUS"; break;
231 case CAT: s = "CAT"; break;
232 case OR: s = "OR"; break;
233 case ORTOP: s = "ORTOP"; break;
234 case LPAREN: s = "LPAREN"; break;
235 case RPAREN: s = "RPAREN"; break;
236 default: s = "CSET"; break;
238 fprintf(stderr, "%s", s);
241 #endif /* DEBUG */
243 /* Stuff pertaining to charclasses. */
245 static int
246 tstbit (int b, charclass c)
248 return c[b / INTBITS] & 1 << b % INTBITS;
251 static void
252 setbit (int b, charclass c)
254 c[b / INTBITS] |= 1 << b % INTBITS;
257 static void
258 clrbit (int b, charclass c)
260 c[b / INTBITS] &= ~(1 << b % INTBITS);
263 static void
264 copyset (charclass src, charclass dst)
266 int i;
268 for (i = 0; i < CHARCLASS_INTS; ++i)
269 dst[i] = src[i];
272 static void
273 zeroset (charclass s)
275 int i;
277 for (i = 0; i < CHARCLASS_INTS; ++i)
278 s[i] = 0;
281 static void
282 notset (charclass s)
284 int i;
286 for (i = 0; i < CHARCLASS_INTS; ++i)
287 s[i] = ~s[i];
290 static int
291 equal (charclass s1, charclass s2)
293 int i;
295 for (i = 0; i < CHARCLASS_INTS; ++i)
296 if (s1[i] != s2[i])
297 return 0;
298 return 1;
301 /* A pointer to the current dfa is kept here during parsing. */
302 static struct dfa *dfa;
304 /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
305 static int
306 charclass_index (charclass s)
308 int i;
310 for (i = 0; i < dfa->cindex; ++i)
311 if (equal(s, dfa->charclasses[i]))
312 return i;
313 REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
314 ++dfa->cindex;
315 copyset(s, dfa->charclasses[i]);
316 return i;
319 /* Syntax bits controlling the behavior of the lexical analyzer. */
320 static reg_syntax_t syntax_bits, syntax_bits_set;
322 /* Flag for case-folding letters into sets. */
323 static int case_fold;
325 /* End-of-line byte in data. */
326 static unsigned char eolbyte;
328 /* Entry point to set syntax options. */
329 void
330 dfasyntax (reg_syntax_t bits, int fold, int eol)
332 syntax_bits_set = 1;
333 syntax_bits = bits;
334 case_fold = fold;
335 eolbyte = eol;
338 /* Lexical analyzer. All the dross that deals with the obnoxious
339 GNU Regex syntax bits is located here. The poor, suffering
340 reader is referred to the GNU Regex documentation for the
341 meaning of the @#%!@#%^!@ syntax bits. */
343 static char *lexstart; /* Pointer to beginning of input string. */
344 static char *lexptr; /* Pointer to next input character. */
345 static int lexleft; /* Number of characters remaining. */
346 static token lasttok; /* Previous token returned; initially END. */
347 static int laststart; /* True if we're separated from beginning or (, |
348 only by zero-width characters. */
349 static int parens; /* Count of outstanding left parens. */
350 static int minrep, maxrep; /* Repeat counts for {m,n}. */
352 /* Note that characters become unsigned here. */
353 #define FETCH(c, eoferr) \
355 if (! lexleft) \
357 if (eoferr != 0) \
358 dfaerror (eoferr); \
359 else \
360 return lasttok = END; \
362 (c) = (unsigned char) *lexptr++; \
363 --lexleft; \
366 #ifdef __STDC__
367 #define FUNC(F, P) static int F(int c) { return P(c); }
368 #else
369 #define FUNC(F, P) static int F(c) int c; { return P(c); }
370 #endif
372 FUNC(is_alpha, ISALPHA)
373 FUNC(is_upper, ISUPPER)
374 FUNC(is_lower, ISLOWER)
375 FUNC(is_digit, ISDIGIT)
376 FUNC(is_xdigit, ISXDIGIT)
377 FUNC(is_space, ISSPACE)
378 FUNC(is_punct, ISPUNCT)
379 FUNC(is_alnum, ISALNUM)
380 FUNC(is_print, ISPRINT)
381 FUNC(is_graph, ISGRAPH)
382 FUNC(is_cntrl, ISCNTRL)
384 static int
385 is_blank (int c)
387 return (c == ' ' || c == '\t');
390 /* The following list maps the names of the Posix named character classes
391 to predicate functions that determine whether a given character is in
392 the class. The leading [ has already been eaten by the lexical analyzer. */
393 static struct {
394 const char *name;
395 int (*pred) PARAMS ((int));
396 } prednames[] = {
397 { ":alpha:]", is_alpha },
398 { ":upper:]", is_upper },
399 { ":lower:]", is_lower },
400 { ":digit:]", is_digit },
401 { ":xdigit:]", is_xdigit },
402 { ":space:]", is_space },
403 { ":punct:]", is_punct },
404 { ":alnum:]", is_alnum },
405 { ":print:]", is_print },
406 { ":graph:]", is_graph },
407 { ":cntrl:]", is_cntrl },
408 { ":blank:]", is_blank },
409 { 0 }
412 /* Return non-zero if C is a `word-constituent' byte; zero otherwise. */
413 #define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
415 static int
416 looking_at (char const *s)
418 size_t len;
420 len = strlen(s);
421 if (lexleft < len)
422 return 0;
423 return strncmp(s, lexptr, len) == 0;
426 static token
427 lex (void)
429 token c, c1, c2;
430 int backslash = 0, invert;
431 charclass ccl;
432 int i;
433 char lo[2];
434 char hi[2];
436 /* Basic plan: We fetch a character. If it's a backslash,
437 we set the backslash flag and go through the loop again.
438 On the plus side, this avoids having a duplicate of the
439 main switch inside the backslash case. On the minus side,
440 it means that just about every case begins with
441 "if (backslash) ...". */
442 for (i = 0; i < 2; ++i)
444 FETCH(c, 0);
445 switch (c)
447 case '\\':
448 if (backslash)
449 goto normal_char;
450 if (lexleft == 0)
451 dfaerror(_("Unfinished \\ escape"));
452 backslash = 1;
453 break;
455 case '^':
456 if (backslash)
457 goto normal_char;
458 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
459 || lasttok == END
460 || lasttok == LPAREN
461 || lasttok == OR)
462 return lasttok = BEGLINE;
463 goto normal_char;
465 case '$':
466 if (backslash)
467 goto normal_char;
468 if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS
469 || lexleft == 0
470 || (syntax_bits & RE_NO_BK_PARENS
471 ? lexleft > 0 && *lexptr == ')'
472 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')
473 || (syntax_bits & RE_NO_BK_VBAR
474 ? lexleft > 0 && *lexptr == '|'
475 : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')
476 || ((syntax_bits & RE_NEWLINE_ALT)
477 && lexleft > 0 && *lexptr == '\n'))
478 return lasttok = ENDLINE;
479 goto normal_char;
481 case '1':
482 case '2':
483 case '3':
484 case '4':
485 case '5':
486 case '6':
487 case '7':
488 case '8':
489 case '9':
490 if (backslash && !(syntax_bits & RE_NO_BK_REFS))
492 laststart = 0;
493 return lasttok = BACKREF;
495 goto normal_char;
497 case '`':
498 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
499 return lasttok = BEGLINE; /* FIXME: should be beginning of string */
500 goto normal_char;
502 case '\'':
503 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
504 return lasttok = ENDLINE; /* FIXME: should be end of string */
505 goto normal_char;
507 case '<':
508 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
509 return lasttok = BEGWORD;
510 goto normal_char;
512 case '>':
513 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
514 return lasttok = ENDWORD;
515 goto normal_char;
517 case 'b':
518 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
519 return lasttok = LIMWORD;
520 goto normal_char;
522 case 'B':
523 if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
524 return lasttok = NOTLIMWORD;
525 goto normal_char;
527 case '?':
528 if (syntax_bits & RE_LIMITED_OPS)
529 goto normal_char;
530 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
531 goto normal_char;
532 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
533 goto normal_char;
534 return lasttok = QMARK;
536 case '*':
537 if (backslash)
538 goto normal_char;
539 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
540 goto normal_char;
541 return lasttok = STAR;
543 case '+':
544 if (syntax_bits & RE_LIMITED_OPS)
545 goto normal_char;
546 if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))
547 goto normal_char;
548 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
549 goto normal_char;
550 return lasttok = PLUS;
552 case '{':
553 if (!(syntax_bits & RE_INTERVALS))
554 goto normal_char;
555 if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
556 goto normal_char;
557 if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
558 goto normal_char;
560 if (syntax_bits & RE_NO_BK_BRACES)
562 /* Scan ahead for a valid interval; if it's not valid,
563 treat it as a literal '{'. */
564 int lo = -1, hi = -1;
565 char const *p = lexptr;
566 char const *lim = p + lexleft;
567 for (; p != lim && ISASCIIDIGIT (*p); p++)
568 lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
569 if (p != lim && *p == ',')
570 while (++p != lim && ISASCIIDIGIT (*p))
571 hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
572 else
573 hi = lo;
574 if (p == lim || *p != '}'
575 || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
576 goto normal_char;
579 minrep = 0;
580 /* Cases:
581 {M} - exact count
582 {M,} - minimum count, maximum is infinity
583 {M,N} - M through N */
584 FETCH(c, _("unfinished repeat count"));
585 if (ISASCIIDIGIT (c))
587 minrep = c - '0';
588 for (;;)
590 FETCH(c, _("unfinished repeat count"));
591 if (! ISASCIIDIGIT (c))
592 break;
593 minrep = 10 * minrep + c - '0';
596 else
597 dfaerror(_("malformed repeat count"));
598 if (c == ',')
600 FETCH (c, _("unfinished repeat count"));
601 if (! ISASCIIDIGIT (c))
602 maxrep = -1;
603 else
605 maxrep = c - '0';
606 for (;;)
608 FETCH (c, _("unfinished repeat count"));
609 if (! ISASCIIDIGIT (c))
610 break;
611 maxrep = 10 * maxrep + c - '0';
613 if (0 <= maxrep && maxrep < minrep)
614 dfaerror (_("malformed repeat count"));
617 else
618 maxrep = minrep;
619 if (!(syntax_bits & RE_NO_BK_BRACES))
621 if (c != '\\')
622 dfaerror(_("malformed repeat count"));
623 FETCH(c, _("unfinished repeat count"));
625 if (c != '}')
626 dfaerror(_("malformed repeat count"));
627 laststart = 0;
628 return lasttok = REPMN;
630 case '|':
631 if (syntax_bits & RE_LIMITED_OPS)
632 goto normal_char;
633 if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))
634 goto normal_char;
635 laststart = 1;
636 return lasttok = OR;
638 case '\n':
639 if (syntax_bits & RE_LIMITED_OPS
640 || backslash
641 || !(syntax_bits & RE_NEWLINE_ALT))
642 goto normal_char;
643 laststart = 1;
644 return lasttok = OR;
646 case '(':
647 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
648 goto normal_char;
649 ++parens;
650 laststart = 1;
651 return lasttok = LPAREN;
653 case ')':
654 if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))
655 goto normal_char;
656 if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
657 goto normal_char;
658 --parens;
659 laststart = 0;
660 return lasttok = RPAREN;
662 case '.':
663 if (backslash)
664 goto normal_char;
665 zeroset(ccl);
666 notset(ccl);
667 if (!(syntax_bits & RE_DOT_NEWLINE))
668 clrbit(eolbyte, ccl);
669 if (syntax_bits & RE_DOT_NOT_NULL)
670 clrbit('\0', ccl);
671 laststart = 0;
672 return lasttok = CSET + charclass_index(ccl);
674 case 'w':
675 case 'W':
676 if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
677 goto normal_char;
678 zeroset(ccl);
679 for (c2 = 0; c2 < NOTCHAR; ++c2)
680 if (IS_WORD_CONSTITUENT(c2))
681 setbit(c2, ccl);
682 if (c == 'W')
683 notset(ccl);
684 laststart = 0;
685 return lasttok = CSET + charclass_index(ccl);
687 case '[':
688 if (backslash)
689 goto normal_char;
690 zeroset(ccl);
691 FETCH(c, _("Unbalanced ["));
692 if (c == '^')
694 FETCH(c, _("Unbalanced ["));
695 invert = 1;
697 else
698 invert = 0;
701 /* Nobody ever said this had to be fast. :-)
702 Note that if we're looking at some other [:...:]
703 construct, we just treat it as a bunch of ordinary
704 characters. We can do this because we assume
705 regex has checked for syntax errors before
706 dfa is ever called. */
707 if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
708 for (c1 = 0; prednames[c1].name; ++c1)
709 if (looking_at(prednames[c1].name))
711 int (*pred)() = prednames[c1].pred;
712 if (case_fold
713 && (pred == is_upper || pred == is_lower))
714 pred = is_alpha;
716 for (c2 = 0; c2 < NOTCHAR; ++c2)
717 if ((*pred)(c2))
718 setbit(c2, ccl);
719 lexptr += strlen(prednames[c1].name);
720 lexleft -= strlen(prednames[c1].name);
721 FETCH(c1, _("Unbalanced ["));
722 goto skip;
724 if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
725 FETCH(c, _("Unbalanced ["));
726 FETCH(c1, _("Unbalanced ["));
727 if (c1 == '-')
729 FETCH(c2, _("Unbalanced ["));
730 if (c2 == ']')
732 /* In the case [x-], the - is an ordinary hyphen,
733 which is left in c1, the lookahead character. */
734 --lexptr;
735 ++lexleft;
736 c2 = c;
738 else
740 if (c2 == '\\'
741 && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
742 FETCH(c2, _("Unbalanced ["));
743 FETCH(c1, _("Unbalanced ["));
746 else
747 c2 = c;
749 lo[0] = c; lo[1] = '\0';
750 hi[0] = c2; hi[1] = '\0';
751 for (c = 0; c < NOTCHAR; c++)
753 char ch[2];
754 ch[0] = c; ch[1] = '\0';
755 if (strcoll (lo, ch) <= 0 && strcoll (ch, hi) <= 0)
757 setbit (c, ccl);
758 if (case_fold)
760 if (ISUPPER (c))
761 setbit (tolower (c), ccl);
762 else if (ISLOWER (c))
763 setbit (toupper (c), ccl);
768 skip:
771 while ((c = c1) != ']');
772 if (invert)
774 notset(ccl);
775 if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
776 clrbit(eolbyte, ccl);
778 laststart = 0;
779 return lasttok = CSET + charclass_index(ccl);
781 default:
782 normal_char:
783 laststart = 0;
784 if (case_fold && ISALPHA(c))
786 zeroset(ccl);
787 setbit(c, ccl);
788 if (isupper(c))
789 setbit(tolower(c), ccl);
790 else
791 setbit(toupper(c), ccl);
792 return lasttok = CSET + charclass_index(ccl);
794 return c;
798 /* The above loop should consume at most a backslash
799 and some other character. */
800 abort();
801 return END; /* keeps pedantic compilers happy. */
804 /* Recursive descent parser for regular expressions. */
806 static token tok; /* Lookahead token. */
807 static int depth; /* Current depth of a hypothetical stack
808 holding deferred productions. This is
809 used to determine the depth that will be
810 required of the real stack later on in
811 dfaanalyze(). */
813 /* Add the given token to the parse tree, maintaining the depth count and
814 updating the maximum depth if necessary. */
815 static void
816 addtok (token t)
818 REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
819 dfa->tokens[dfa->tindex++] = t;
821 switch (t)
823 case QMARK:
824 case STAR:
825 case PLUS:
826 break;
828 case CAT:
829 case OR:
830 case ORTOP:
831 --depth;
832 break;
834 default:
835 ++dfa->nleaves;
836 case EMPTY:
837 ++depth;
838 break;
840 if (depth > dfa->depth)
841 dfa->depth = depth;
844 /* The grammar understood by the parser is as follows.
846 regexp:
847 regexp OR branch
848 branch
850 branch:
851 branch closure
852 closure
854 closure:
855 closure QMARK
856 closure STAR
857 closure PLUS
858 atom
860 atom:
861 <normal character>
862 CSET
863 BACKREF
864 BEGLINE
865 ENDLINE
866 BEGWORD
867 ENDWORD
868 LIMWORD
869 NOTLIMWORD
870 <empty>
872 The parser builds a parse tree in postfix form in an array of tokens. */
874 static void
875 atom (void)
877 if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
878 || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
879 || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
881 addtok(tok);
882 tok = lex();
884 else if (tok == LPAREN)
886 tok = lex();
887 regexp(0);
888 if (tok != RPAREN)
889 dfaerror(_("Unbalanced ("));
890 tok = lex();
892 else
893 addtok(EMPTY);
896 /* Return the number of tokens in the given subexpression. */
897 static int
898 nsubtoks (int tindex)
900 int ntoks1;
902 switch (dfa->tokens[tindex - 1])
904 default:
905 return 1;
906 case QMARK:
907 case STAR:
908 case PLUS:
909 return 1 + nsubtoks(tindex - 1);
910 case CAT:
911 case OR:
912 case ORTOP:
913 ntoks1 = nsubtoks(tindex - 1);
914 return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
918 /* Copy the given subexpression to the top of the tree. */
919 static void
920 copytoks (int tindex, int ntokens)
922 int i;
924 for (i = 0; i < ntokens; ++i)
925 addtok(dfa->tokens[tindex + i]);
928 static void
929 closure (void)
931 int tindex, ntokens, i;
933 atom();
934 while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)
935 if (tok == REPMN)
937 ntokens = nsubtoks(dfa->tindex);
938 tindex = dfa->tindex - ntokens;
939 if (maxrep < 0)
940 addtok(PLUS);
941 if (minrep == 0)
942 addtok(QMARK);
943 for (i = 1; i < minrep; ++i)
945 copytoks(tindex, ntokens);
946 addtok(CAT);
948 for (; i < maxrep; ++i)
950 copytoks(tindex, ntokens);
951 addtok(QMARK);
952 addtok(CAT);
954 tok = lex();
956 else
958 addtok(tok);
959 tok = lex();
963 static void
964 branch (void)
966 closure();
967 while (tok != RPAREN && tok != OR && tok >= 0)
969 closure();
970 addtok(CAT);
974 static void
975 regexp (int toplevel)
977 branch();
978 while (tok == OR)
980 tok = lex();
981 branch();
982 if (toplevel)
983 addtok(ORTOP);
984 else
985 addtok(OR);
989 /* Main entry point for the parser. S is a string to be parsed, len is the
990 length of the string, so s can include NUL characters. D is a pointer to
991 the struct dfa to parse into. */
992 void
993 dfaparse (char *s, size_t len, struct dfa *d)
995 dfa = d;
996 lexstart = lexptr = s;
997 lexleft = len;
998 lasttok = END;
999 laststart = 1;
1000 parens = 0;
1002 if (! syntax_bits_set)
1003 dfaerror(_("No syntax specified"));
1005 tok = lex();
1006 depth = d->depth;
1008 regexp(1);
1010 if (tok != END)
1011 dfaerror(_("Unbalanced )"));
1013 addtok(END - d->nregexps);
1014 addtok(CAT);
1016 if (d->nregexps)
1017 addtok(ORTOP);
1019 ++d->nregexps;
1022 /* Some primitives for operating on sets of positions. */
1024 /* Copy one set to another; the destination must be large enough. */
1025 static void
1026 copy (position_set *src, position_set *dst)
1028 int i;
1030 for (i = 0; i < src->nelem; ++i)
1031 dst->elems[i] = src->elems[i];
1032 dst->nelem = src->nelem;
1035 /* Insert a position in a set. Position sets are maintained in sorted
1036 order according to index. If position already exists in the set with
1037 the same index then their constraints are logically or'd together.
1038 S->elems must point to an array large enough to hold the resulting set. */
1039 static void
1040 insert (position p, position_set *s)
1042 int i;
1043 position t1, t2;
1045 for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
1046 continue;
1047 if (i < s->nelem && p.index == s->elems[i].index)
1048 s->elems[i].constraint |= p.constraint;
1049 else
1051 t1 = p;
1052 ++s->nelem;
1053 while (i < s->nelem)
1055 t2 = s->elems[i];
1056 s->elems[i++] = t1;
1057 t1 = t2;
1062 /* Merge two sets of positions into a third. The result is exactly as if
1063 the positions of both sets were inserted into an initially empty set. */
1064 static void
1065 merge (position_set *s1, position_set *s2, position_set *m)
1067 int i = 0, j = 0;
1069 m->nelem = 0;
1070 while (i < s1->nelem && j < s2->nelem)
1071 if (s1->elems[i].index > s2->elems[j].index)
1072 m->elems[m->nelem++] = s1->elems[i++];
1073 else if (s1->elems[i].index < s2->elems[j].index)
1074 m->elems[m->nelem++] = s2->elems[j++];
1075 else
1077 m->elems[m->nelem] = s1->elems[i++];
1078 m->elems[m->nelem++].constraint |= s2->elems[j++].constraint;
1080 while (i < s1->nelem)
1081 m->elems[m->nelem++] = s1->elems[i++];
1082 while (j < s2->nelem)
1083 m->elems[m->nelem++] = s2->elems[j++];
1086 /* Delete a position from a set. */
1087 static void
1088 delete (position p, position_set *s)
1090 int i;
1092 for (i = 0; i < s->nelem; ++i)
1093 if (p.index == s->elems[i].index)
1094 break;
1095 if (i < s->nelem)
1096 for (--s->nelem; i < s->nelem; ++i)
1097 s->elems[i] = s->elems[i + 1];
1100 /* Find the index of the state corresponding to the given position set with
1101 the given preceding context, or create a new state if there is no such
1102 state. Newline and letter tell whether we got here on a newline or
1103 letter, respectively. */
1104 static int
1105 state_index (struct dfa *d, position_set *s, int newline, int letter)
1107 int hash = 0;
1108 int constraint;
1109 int i, j;
1111 newline = newline ? 1 : 0;
1112 letter = letter ? 1 : 0;
1114 for (i = 0; i < s->nelem; ++i)
1115 hash ^= s->elems[i].index + s->elems[i].constraint;
1117 /* Try to find a state that exactly matches the proposed one. */
1118 for (i = 0; i < d->sindex; ++i)
1120 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
1121 || newline != d->states[i].newline || letter != d->states[i].letter)
1122 continue;
1123 for (j = 0; j < s->nelem; ++j)
1124 if (s->elems[j].constraint
1125 != d->states[i].elems.elems[j].constraint
1126 || s->elems[j].index != d->states[i].elems.elems[j].index)
1127 break;
1128 if (j == s->nelem)
1129 return i;
1132 /* We'll have to create a new state. */
1133 REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
1134 d->states[i].hash = hash;
1135 MALLOC(d->states[i].elems.elems, position, s->nelem);
1136 copy(s, &d->states[i].elems);
1137 d->states[i].newline = newline;
1138 d->states[i].letter = letter;
1139 d->states[i].backref = 0;
1140 d->states[i].constraint = 0;
1141 d->states[i].first_end = 0;
1142 for (j = 0; j < s->nelem; ++j)
1143 if (d->tokens[s->elems[j].index] < 0)
1145 constraint = s->elems[j].constraint;
1146 if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0)
1147 || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1)
1148 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0)
1149 || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1))
1150 d->states[i].constraint |= constraint;
1151 if (! d->states[i].first_end)
1152 d->states[i].first_end = d->tokens[s->elems[j].index];
1154 else if (d->tokens[s->elems[j].index] == BACKREF)
1156 d->states[i].constraint = NO_CONSTRAINT;
1157 d->states[i].backref = 1;
1160 ++d->sindex;
1162 return i;
1165 /* Find the epsilon closure of a set of positions. If any position of the set
1166 contains a symbol that matches the empty string in some context, replace
1167 that position with the elements of its follow labeled with an appropriate
1168 constraint. Repeat exhaustively until no funny positions are left.
1169 S->elems must be large enough to hold the result. */
1170 static void
1171 epsclosure (position_set *s, struct dfa *d)
1173 int i, j;
1174 int *visited;
1175 position p, old;
1177 MALLOC(visited, int, d->tindex);
1178 for (i = 0; i < d->tindex; ++i)
1179 visited[i] = 0;
1181 for (i = 0; i < s->nelem; ++i)
1182 if (d->tokens[s->elems[i].index] >= NOTCHAR
1183 && d->tokens[s->elems[i].index] != BACKREF
1184 && d->tokens[s->elems[i].index] < CSET)
1186 old = s->elems[i];
1187 p.constraint = old.constraint;
1188 delete(s->elems[i], s);
1189 if (visited[old.index])
1191 --i;
1192 continue;
1194 visited[old.index] = 1;
1195 switch (d->tokens[old.index])
1197 case BEGLINE:
1198 p.constraint &= BEGLINE_CONSTRAINT;
1199 break;
1200 case ENDLINE:
1201 p.constraint &= ENDLINE_CONSTRAINT;
1202 break;
1203 case BEGWORD:
1204 p.constraint &= BEGWORD_CONSTRAINT;
1205 break;
1206 case ENDWORD:
1207 p.constraint &= ENDWORD_CONSTRAINT;
1208 break;
1209 case LIMWORD:
1210 p.constraint &= LIMWORD_CONSTRAINT;
1211 break;
1212 case NOTLIMWORD:
1213 p.constraint &= NOTLIMWORD_CONSTRAINT;
1214 break;
1215 default:
1216 break;
1218 for (j = 0; j < d->follows[old.index].nelem; ++j)
1220 p.index = d->follows[old.index].elems[j].index;
1221 insert(p, s);
1223 /* Force rescan to start at the beginning. */
1224 i = -1;
1227 free(visited);
1230 /* Perform bottom-up analysis on the parse tree, computing various functions.
1231 Note that at this point, we're pretending constructs like \< are real
1232 characters rather than constraints on what can follow them.
1234 Nullable: A node is nullable if it is at the root of a regexp that can
1235 match the empty string.
1236 * EMPTY leaves are nullable.
1237 * No other leaf is nullable.
1238 * A QMARK or STAR node is nullable.
1239 * A PLUS node is nullable if its argument is nullable.
1240 * A CAT node is nullable if both its arguments are nullable.
1241 * An OR node is nullable if either argument is nullable.
1243 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
1244 that could correspond to the first character of a string matching the
1245 regexp rooted at the given node.
1246 * EMPTY leaves have empty firstpos.
1247 * The firstpos of a nonempty leaf is that leaf itself.
1248 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
1249 argument.
1250 * The firstpos of a CAT node is the firstpos of the left argument, union
1251 the firstpos of the right if the left argument is nullable.
1252 * The firstpos of an OR node is the union of firstpos of each argument.
1254 Lastpos: The lastpos of a node is the set of positions that could
1255 correspond to the last character of a string matching the regexp at
1256 the given node.
1257 * EMPTY leaves have empty lastpos.
1258 * The lastpos of a nonempty leaf is that leaf itself.
1259 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
1260 argument.
1261 * The lastpos of a CAT node is the lastpos of its right argument, union
1262 the lastpos of the left if the right argument is nullable.
1263 * The lastpos of an OR node is the union of the lastpos of each argument.
1265 Follow: The follow of a position is the set of positions that could
1266 correspond to the character following a character matching the node in
1267 a string matching the regexp. At this point we consider special symbols
1268 that match the empty string in some context to be just normal characters.
1269 Later, if we find that a special symbol is in a follow set, we will
1270 replace it with the elements of its follow, labeled with an appropriate
1271 constraint.
1272 * Every node in the firstpos of the argument of a STAR or PLUS node is in
1273 the follow of every node in the lastpos.
1274 * Every node in the firstpos of the second argument of a CAT node is in
1275 the follow of every node in the lastpos of the first argument.
1277 Because of the postfix representation of the parse tree, the depth-first
1278 analysis is conveniently done by a linear scan with the aid of a stack.
1279 Sets are stored as arrays of the elements, obeying a stack-like allocation
1280 scheme; the number of elements in each set deeper in the stack can be
1281 used to determine the address of a particular set's array. */
1282 void
1283 dfaanalyze (struct dfa *d, int searchflag)
1285 int *nullable; /* Nullable stack. */
1286 int *nfirstpos; /* Element count stack for firstpos sets. */
1287 position *firstpos; /* Array where firstpos elements are stored. */
1288 int *nlastpos; /* Element count stack for lastpos sets. */
1289 position *lastpos; /* Array where lastpos elements are stored. */
1290 int *nalloc; /* Sizes of arrays allocated to follow sets. */
1291 position_set tmp; /* Temporary set for merging sets. */
1292 position_set merged; /* Result of merging sets. */
1293 int wants_newline; /* True if some position wants newline info. */
1294 int *o_nullable;
1295 int *o_nfirst, *o_nlast;
1296 position *o_firstpos, *o_lastpos;
1297 int i, j;
1298 position *pos;
1300 #ifdef DEBUG
1301 fprintf(stderr, "dfaanalyze:\n");
1302 for (i = 0; i < d->tindex; ++i)
1304 fprintf(stderr, " %d:", i);
1305 prtok(d->tokens[i]);
1307 putc('\n', stderr);
1308 #endif
1310 d->searchflag = searchflag;
1312 MALLOC(nullable, int, d->depth);
1313 o_nullable = nullable;
1314 MALLOC(nfirstpos, int, d->depth);
1315 o_nfirst = nfirstpos;
1316 MALLOC(firstpos, position, d->nleaves);
1317 o_firstpos = firstpos, firstpos += d->nleaves;
1318 MALLOC(nlastpos, int, d->depth);
1319 o_nlast = nlastpos;
1320 MALLOC(lastpos, position, d->nleaves);
1321 o_lastpos = lastpos, lastpos += d->nleaves;
1322 MALLOC(nalloc, int, d->tindex);
1323 for (i = 0; i < d->tindex; ++i)
1324 nalloc[i] = 0;
1325 MALLOC(merged.elems, position, d->nleaves);
1327 CALLOC(d->follows, position_set, d->tindex);
1329 for (i = 0; i < d->tindex; ++i)
1330 #ifdef DEBUG
1331 { /* Nonsyntactic #ifdef goo... */
1332 #endif
1333 switch (d->tokens[i])
1335 case EMPTY:
1336 /* The empty set is nullable. */
1337 *nullable++ = 1;
1339 /* The firstpos and lastpos of the empty leaf are both empty. */
1340 *nfirstpos++ = *nlastpos++ = 0;
1341 break;
1343 case STAR:
1344 case PLUS:
1345 /* Every element in the firstpos of the argument is in the follow
1346 of every element in the lastpos. */
1347 tmp.nelem = nfirstpos[-1];
1348 tmp.elems = firstpos;
1349 pos = lastpos;
1350 for (j = 0; j < nlastpos[-1]; ++j)
1352 merge(&tmp, &d->follows[pos[j].index], &merged);
1353 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1354 nalloc[pos[j].index], merged.nelem - 1);
1355 copy(&merged, &d->follows[pos[j].index]);
1358 case QMARK:
1359 /* A QMARK or STAR node is automatically nullable. */
1360 if (d->tokens[i] != PLUS)
1361 nullable[-1] = 1;
1362 break;
1364 case CAT:
1365 /* Every element in the firstpos of the second argument is in the
1366 follow of every element in the lastpos of the first argument. */
1367 tmp.nelem = nfirstpos[-1];
1368 tmp.elems = firstpos;
1369 pos = lastpos + nlastpos[-1];
1370 for (j = 0; j < nlastpos[-2]; ++j)
1372 merge(&tmp, &d->follows[pos[j].index], &merged);
1373 REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
1374 nalloc[pos[j].index], merged.nelem - 1);
1375 copy(&merged, &d->follows[pos[j].index]);
1378 /* The firstpos of a CAT node is the firstpos of the first argument,
1379 union that of the second argument if the first is nullable. */
1380 if (nullable[-2])
1381 nfirstpos[-2] += nfirstpos[-1];
1382 else
1383 firstpos += nfirstpos[-1];
1384 --nfirstpos;
1386 /* The lastpos of a CAT node is the lastpos of the second argument,
1387 union that of the first argument if the second is nullable. */
1388 if (nullable[-1])
1389 nlastpos[-2] += nlastpos[-1];
1390 else
1392 pos = lastpos + nlastpos[-2];
1393 for (j = nlastpos[-1] - 1; j >= 0; --j)
1394 pos[j] = lastpos[j];
1395 lastpos += nlastpos[-2];
1396 nlastpos[-2] = nlastpos[-1];
1398 --nlastpos;
1400 /* A CAT node is nullable if both arguments are nullable. */
1401 nullable[-2] = nullable[-1] && nullable[-2];
1402 --nullable;
1403 break;
1405 case OR:
1406 case ORTOP:
1407 /* The firstpos is the union of the firstpos of each argument. */
1408 nfirstpos[-2] += nfirstpos[-1];
1409 --nfirstpos;
1411 /* The lastpos is the union of the lastpos of each argument. */
1412 nlastpos[-2] += nlastpos[-1];
1413 --nlastpos;
1415 /* An OR node is nullable if either argument is nullable. */
1416 nullable[-2] = nullable[-1] || nullable[-2];
1417 --nullable;
1418 break;
1420 default:
1421 /* Anything else is a nonempty position. (Note that special
1422 constructs like \< are treated as nonempty strings here;
1423 an "epsilon closure" effectively makes them nullable later.
1424 Backreferences have to get a real position so we can detect
1425 transitions on them later. But they are nullable. */
1426 *nullable++ = d->tokens[i] == BACKREF;
1428 /* This position is in its own firstpos and lastpos. */
1429 *nfirstpos++ = *nlastpos++ = 1;
1430 --firstpos, --lastpos;
1431 firstpos->index = lastpos->index = i;
1432 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
1434 /* Allocate the follow set for this position. */
1435 nalloc[i] = 1;
1436 MALLOC(d->follows[i].elems, position, nalloc[i]);
1437 break;
1439 #ifdef DEBUG
1440 /* ... balance the above nonsyntactic #ifdef goo... */
1441 fprintf(stderr, "node %d:", i);
1442 prtok(d->tokens[i]);
1443 putc('\n', stderr);
1444 fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
1445 fprintf(stderr, " firstpos:");
1446 for (j = nfirstpos[-1] - 1; j >= 0; --j)
1448 fprintf(stderr, " %d:", firstpos[j].index);
1449 prtok(d->tokens[firstpos[j].index]);
1451 fprintf(stderr, "\n lastpos:");
1452 for (j = nlastpos[-1] - 1; j >= 0; --j)
1454 fprintf(stderr, " %d:", lastpos[j].index);
1455 prtok(d->tokens[lastpos[j].index]);
1457 putc('\n', stderr);
1459 #endif
1461 /* For each follow set that is the follow set of a real position, replace
1462 it with its epsilon closure. */
1463 for (i = 0; i < d->tindex; ++i)
1464 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
1465 || d->tokens[i] >= CSET)
1467 #ifdef DEBUG
1468 fprintf(stderr, "follows(%d:", i);
1469 prtok(d->tokens[i]);
1470 fprintf(stderr, "):");
1471 for (j = d->follows[i].nelem - 1; j >= 0; --j)
1473 fprintf(stderr, " %d:", d->follows[i].elems[j].index);
1474 prtok(d->tokens[d->follows[i].elems[j].index]);
1476 putc('\n', stderr);
1477 #endif
1478 copy(&d->follows[i], &merged);
1479 epsclosure(&merged, d);
1480 if (d->follows[i].nelem < merged.nelem)
1481 REALLOC(d->follows[i].elems, position, merged.nelem);
1482 copy(&merged, &d->follows[i]);
1485 /* Get the epsilon closure of the firstpos of the regexp. The result will
1486 be the set of positions of state 0. */
1487 merged.nelem = 0;
1488 for (i = 0; i < nfirstpos[-1]; ++i)
1489 insert(firstpos[i], &merged);
1490 epsclosure(&merged, d);
1492 /* Check if any of the positions of state 0 will want newline context. */
1493 wants_newline = 0;
1494 for (i = 0; i < merged.nelem; ++i)
1495 if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
1496 wants_newline = 1;
1498 /* Build the initial state. */
1499 d->salloc = 1;
1500 d->sindex = 0;
1501 MALLOC(d->states, dfa_state, d->salloc);
1502 state_index(d, &merged, wants_newline, 0);
1504 free(o_nullable);
1505 free(o_nfirst);
1506 free(o_firstpos);
1507 free(o_nlast);
1508 free(o_lastpos);
1509 free(nalloc);
1510 free(merged.elems);
1513 /* Find, for each character, the transition out of state s of d, and store
1514 it in the appropriate slot of trans.
1516 We divide the positions of s into groups (positions can appear in more
1517 than one group). Each group is labeled with a set of characters that
1518 every position in the group matches (taking into account, if necessary,
1519 preceding context information of s). For each group, find the union
1520 of the its elements' follows. This set is the set of positions of the
1521 new state. For each character in the group's label, set the transition
1522 on this character to be to a state corresponding to the set's positions,
1523 and its associated backward context information, if necessary.
1525 If we are building a searching matcher, we include the positions of state
1526 0 in every state.
1528 The collection of groups is constructed by building an equivalence-class
1529 partition of the positions of s.
1531 For each position, find the set of characters C that it matches. Eliminate
1532 any characters from C that fail on grounds of backward context.
1534 Search through the groups, looking for a group whose label L has nonempty
1535 intersection with C. If L - C is nonempty, create a new group labeled
1536 L - C and having the same positions as the current group, and set L to
1537 the intersection of L and C. Insert the position in this group, set
1538 C = C - L, and resume scanning.
1540 If after comparing with every group there are characters remaining in C,
1541 create a new group labeled with the characters of C and insert this
1542 position in that group. */
1543 void
1544 dfastate (int s, struct dfa *d, int trans[])
1546 position_set grps[NOTCHAR]; /* As many as will ever be needed. */
1547 charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */
1548 int ngrps = 0; /* Number of groups actually used. */
1549 position pos; /* Current position being considered. */
1550 charclass matches; /* Set of matching characters. */
1551 int matchesf; /* True if matches is nonempty. */
1552 charclass intersect; /* Intersection with some label set. */
1553 int intersectf; /* True if intersect is nonempty. */
1554 charclass leftovers; /* Stuff in the label that didn't match. */
1555 int leftoversf; /* True if leftovers is nonempty. */
1556 static charclass letters; /* Set of characters considered letters. */
1557 static charclass newline; /* Set of characters that aren't newline. */
1558 position_set follows; /* Union of the follows of some group. */
1559 position_set tmp; /* Temporary space for merging sets. */
1560 int state; /* New state. */
1561 int wants_newline; /* New state wants to know newline context. */
1562 int state_newline; /* New state on a newline transition. */
1563 int wants_letter; /* New state wants to know letter context. */
1564 int state_letter; /* New state on a letter transition. */
1565 static int initialized; /* Flag for static initialization. */
1566 int i, j, k;
1568 /* Initialize the set of letters, if necessary. */
1569 if (! initialized)
1571 initialized = 1;
1572 for (i = 0; i < NOTCHAR; ++i)
1573 if (IS_WORD_CONSTITUENT(i))
1574 setbit(i, letters);
1575 setbit(eolbyte, newline);
1578 zeroset(matches);
1580 for (i = 0; i < d->states[s].elems.nelem; ++i)
1582 pos = d->states[s].elems.elems[i];
1583 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
1584 setbit(d->tokens[pos.index], matches);
1585 else if (d->tokens[pos.index] >= CSET)
1586 copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
1587 else
1588 continue;
1590 /* Some characters may need to be eliminated from matches because
1591 they fail in the current context. */
1592 if (pos.constraint != 0xFF)
1594 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
1595 d->states[s].newline, 1))
1596 clrbit(eolbyte, matches);
1597 if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
1598 d->states[s].newline, 0))
1599 for (j = 0; j < CHARCLASS_INTS; ++j)
1600 matches[j] &= newline[j];
1601 if (! MATCHES_LETTER_CONTEXT(pos.constraint,
1602 d->states[s].letter, 1))
1603 for (j = 0; j < CHARCLASS_INTS; ++j)
1604 matches[j] &= ~letters[j];
1605 if (! MATCHES_LETTER_CONTEXT(pos.constraint,
1606 d->states[s].letter, 0))
1607 for (j = 0; j < CHARCLASS_INTS; ++j)
1608 matches[j] &= letters[j];
1610 /* If there are no characters left, there's no point in going on. */
1611 for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
1612 continue;
1613 if (j == CHARCLASS_INTS)
1614 continue;
1617 for (j = 0; j < ngrps; ++j)
1619 /* If matches contains a single character only, and the current
1620 group's label doesn't contain that character, go on to the
1621 next group. */
1622 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
1623 && !tstbit(d->tokens[pos.index], labels[j]))
1624 continue;
1626 /* Check if this group's label has a nonempty intersection with
1627 matches. */
1628 intersectf = 0;
1629 for (k = 0; k < CHARCLASS_INTS; ++k)
1630 (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
1631 if (! intersectf)
1632 continue;
1634 /* It does; now find the set differences both ways. */
1635 leftoversf = matchesf = 0;
1636 for (k = 0; k < CHARCLASS_INTS; ++k)
1638 /* Even an optimizing compiler can't know this for sure. */
1639 int match = matches[k], label = labels[j][k];
1641 (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
1642 (matches[k] = match & ~label) ? (matchesf = 1) : 0;
1645 /* If there were leftovers, create a new group labeled with them. */
1646 if (leftoversf)
1648 copyset(leftovers, labels[ngrps]);
1649 copyset(intersect, labels[j]);
1650 MALLOC(grps[ngrps].elems, position, d->nleaves);
1651 copy(&grps[j], &grps[ngrps]);
1652 ++ngrps;
1655 /* Put the position in the current group. Note that there is no
1656 reason to call insert() here. */
1657 grps[j].elems[grps[j].nelem++] = pos;
1659 /* If every character matching the current position has been
1660 accounted for, we're done. */
1661 if (! matchesf)
1662 break;
1665 /* If we've passed the last group, and there are still characters
1666 unaccounted for, then we'll have to create a new group. */
1667 if (j == ngrps)
1669 copyset(matches, labels[ngrps]);
1670 zeroset(matches);
1671 MALLOC(grps[ngrps].elems, position, d->nleaves);
1672 grps[ngrps].nelem = 1;
1673 grps[ngrps].elems[0] = pos;
1674 ++ngrps;
1678 MALLOC(follows.elems, position, d->nleaves);
1679 MALLOC(tmp.elems, position, d->nleaves);
1681 /* If we are a searching matcher, the default transition is to a state
1682 containing the positions of state 0, otherwise the default transition
1683 is to fail miserably. */
1684 if (d->searchflag)
1686 wants_newline = 0;
1687 wants_letter = 0;
1688 for (i = 0; i < d->states[0].elems.nelem; ++i)
1690 if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
1691 wants_newline = 1;
1692 if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
1693 wants_letter = 1;
1695 copy(&d->states[0].elems, &follows);
1696 state = state_index(d, &follows, 0, 0);
1697 if (wants_newline)
1698 state_newline = state_index(d, &follows, 1, 0);
1699 else
1700 state_newline = state;
1701 if (wants_letter)
1702 state_letter = state_index(d, &follows, 0, 1);
1703 else
1704 state_letter = state;
1705 for (i = 0; i < NOTCHAR; ++i)
1706 trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
1707 trans[eolbyte] = state_newline;
1709 else
1710 for (i = 0; i < NOTCHAR; ++i)
1711 trans[i] = -1;
1713 for (i = 0; i < ngrps; ++i)
1715 follows.nelem = 0;
1717 /* Find the union of the follows of the positions of the group.
1718 This is a hideously inefficient loop. Fix it someday. */
1719 for (j = 0; j < grps[i].nelem; ++j)
1720 for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
1721 insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
1723 /* If we are building a searching matcher, throw in the positions
1724 of state 0 as well. */
1725 if (d->searchflag)
1726 for (j = 0; j < d->states[0].elems.nelem; ++j)
1727 insert(d->states[0].elems.elems[j], &follows);
1729 /* Find out if the new state will want any context information. */
1730 wants_newline = 0;
1731 if (tstbit(eolbyte, labels[i]))
1732 for (j = 0; j < follows.nelem; ++j)
1733 if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
1734 wants_newline = 1;
1736 wants_letter = 0;
1737 for (j = 0; j < CHARCLASS_INTS; ++j)
1738 if (labels[i][j] & letters[j])
1739 break;
1740 if (j < CHARCLASS_INTS)
1741 for (j = 0; j < follows.nelem; ++j)
1742 if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
1743 wants_letter = 1;
1745 /* Find the state(s) corresponding to the union of the follows. */
1746 state = state_index(d, &follows, 0, 0);
1747 if (wants_newline)
1748 state_newline = state_index(d, &follows, 1, 0);
1749 else
1750 state_newline = state;
1751 if (wants_letter)
1752 state_letter = state_index(d, &follows, 0, 1);
1753 else
1754 state_letter = state;
1756 /* Set the transitions for each character in the current label. */
1757 for (j = 0; j < CHARCLASS_INTS; ++j)
1758 for (k = 0; k < INTBITS; ++k)
1759 if (labels[i][j] & 1 << k)
1761 int c = j * INTBITS + k;
1763 if (c == eolbyte)
1764 trans[c] = state_newline;
1765 else if (IS_WORD_CONSTITUENT(c))
1766 trans[c] = state_letter;
1767 else if (c < NOTCHAR)
1768 trans[c] = state;
1772 for (i = 0; i < ngrps; ++i)
1773 free(grps[i].elems);
1774 free(follows.elems);
1775 free(tmp.elems);
1778 /* Some routines for manipulating a compiled dfa's transition tables.
1779 Each state may or may not have a transition table; if it does, and it
1780 is a non-accepting state, then d->trans[state] points to its table.
1781 If it is an accepting state then d->fails[state] points to its table.
1782 If it has no table at all, then d->trans[state] is NULL.
1783 TODO: Improve this comment, get rid of the unnecessary redundancy. */
1785 static void
1786 build_state (int s, struct dfa *d)
1788 int *trans; /* The new transition table. */
1789 int i;
1791 /* Set an upper limit on the number of transition tables that will ever
1792 exist at once. 1024 is arbitrary. The idea is that the frequently
1793 used transition tables will be quickly rebuilt, whereas the ones that
1794 were only needed once or twice will be cleared away. */
1795 if (d->trcount >= 1024)
1797 for (i = 0; i < d->tralloc; ++i)
1798 if (d->trans[i])
1800 free((ptr_t) d->trans[i]);
1801 d->trans[i] = NULL;
1803 else if (d->fails[i])
1805 free((ptr_t) d->fails[i]);
1806 d->fails[i] = NULL;
1808 d->trcount = 0;
1811 ++d->trcount;
1813 /* Set up the success bits for this state. */
1814 d->success[s] = 0;
1815 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
1816 s, *d))
1817 d->success[s] |= 4;
1818 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
1819 s, *d))
1820 d->success[s] |= 2;
1821 if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
1822 s, *d))
1823 d->success[s] |= 1;
1825 MALLOC(trans, int, NOTCHAR);
1826 dfastate(s, d, trans);
1828 /* Now go through the new transition table, and make sure that the trans
1829 and fail arrays are allocated large enough to hold a pointer for the
1830 largest state mentioned in the table. */
1831 for (i = 0; i < NOTCHAR; ++i)
1832 if (trans[i] >= d->tralloc)
1834 int oldalloc = d->tralloc;
1836 while (trans[i] >= d->tralloc)
1837 d->tralloc *= 2;
1838 REALLOC(d->realtrans, int *, d->tralloc + 1);
1839 d->trans = d->realtrans + 1;
1840 REALLOC(d->fails, int *, d->tralloc);
1841 REALLOC(d->success, int, d->tralloc);
1842 REALLOC(d->newlines, int, d->tralloc);
1843 while (oldalloc < d->tralloc)
1845 d->trans[oldalloc] = NULL;
1846 d->fails[oldalloc++] = NULL;
1850 /* Keep the newline transition in a special place so we can use it as
1851 a sentinel. */
1852 d->newlines[s] = trans[eolbyte];
1853 trans[eolbyte] = -1;
1855 if (ACCEPTING(s, *d))
1856 d->fails[s] = trans;
1857 else
1858 d->trans[s] = trans;
1861 static void
1862 build_state_zero (struct dfa *d)
1864 d->tralloc = 1;
1865 d->trcount = 0;
1866 CALLOC(d->realtrans, int *, d->tralloc + 1);
1867 d->trans = d->realtrans + 1;
1868 CALLOC(d->fails, int *, d->tralloc);
1869 MALLOC(d->success, int, d->tralloc);
1870 MALLOC(d->newlines, int, d->tralloc);
1871 build_state(0, d);
1874 /* Search through a buffer looking for a match to the given struct dfa.
1875 Find the first occurrence of a string matching the regexp in the buffer,
1876 and the shortest possible version thereof. Return a pointer to the first
1877 character after the match, or NULL if none is found. Begin points to
1878 the beginning of the buffer, and end points to the first character after
1879 its end. We store a newline in *end to act as a sentinel, so end had
1880 better point somewhere valid. Newline is a flag indicating whether to
1881 allow newlines to be in the matching string. If count is non-
1882 NULL it points to a place we're supposed to increment every time we
1883 see a newline. Finally, if backref is non-NULL it points to a place
1884 where we're supposed to store a 1 if backreferencing happened and the
1885 match needs to be verified by a backtracking matcher. Otherwise
1886 we store a 0 in *backref. */
1887 char *
1888 dfaexec (struct dfa *d, char *begin, char *end,
1889 int newline, int *count, int *backref)
1891 register int s, s1, tmp; /* Current state. */
1892 register unsigned char *p; /* Current input character. */
1893 register int **trans, *t; /* Copy of d->trans so it can be optimized
1894 into a register. */
1895 register unsigned char eol = eolbyte; /* Likewise for eolbyte. */
1896 static int sbit[NOTCHAR]; /* Table for anding with d->success. */
1897 static int sbit_init;
1899 if (! sbit_init)
1901 int i;
1903 sbit_init = 1;
1904 for (i = 0; i < NOTCHAR; ++i)
1905 sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
1906 sbit[eol] = 4;
1909 if (! d->tralloc)
1910 build_state_zero(d);
1912 s = s1 = 0;
1913 p = (unsigned char *) begin;
1914 trans = d->trans;
1915 *end = eol;
1917 for (;;)
1919 while ((t = trans[s]) != 0) { /* hand-optimized loop */
1920 s1 = t[*p++];
1921 if ((t = trans[s1]) == 0) {
1922 tmp = s ; s = s1 ; s1 = tmp ; /* swap */
1923 break;
1925 s = t[*p++];
1928 if (s >= 0 && p <= (unsigned char *) end && d->fails[s])
1930 if (d->success[s] & sbit[*p])
1932 if (backref)
1933 *backref = (d->states[s].backref != 0);
1934 return (char *) p;
1937 s1 = s;
1938 s = d->fails[s][*p++];
1939 continue;
1942 /* If the previous character was a newline, count it. */
1943 if (count && (char *) p <= end && p[-1] == eol)
1944 ++*count;
1946 /* Check if we've run off the end of the buffer. */
1947 if ((char *) p > end)
1948 return NULL;
1950 if (s >= 0)
1952 build_state(s, d);
1953 trans = d->trans;
1954 continue;
1957 if (p[-1] == eol && newline)
1959 s = d->newlines[s1];
1960 continue;
1963 s = 0;
1967 /* Initialize the components of a dfa that the other routines don't
1968 initialize for themselves. */
1969 void
1970 dfainit (struct dfa *d)
1972 d->calloc = 1;
1973 MALLOC(d->charclasses, charclass, d->calloc);
1974 d->cindex = 0;
1976 d->talloc = 1;
1977 MALLOC(d->tokens, token, d->talloc);
1978 d->tindex = d->depth = d->nleaves = d->nregexps = 0;
1980 d->searchflag = 0;
1981 d->tralloc = 0;
1983 d->musts = 0;
1986 /* Parse and analyze a single string of the given length. */
1987 void
1988 dfacomp (char *s, size_t len, struct dfa *d, int searchflag)
1990 if (case_fold) /* dummy folding in service of dfamust() */
1992 char *lcopy;
1993 int i;
1995 lcopy = malloc(len);
1996 if (!lcopy)
1997 dfaerror(_("out of memory"));
1999 /* This is a kludge. */
2000 case_fold = 0;
2001 for (i = 0; i < len; ++i)
2002 if (ISUPPER ((unsigned char) s[i]))
2003 lcopy[i] = tolower ((unsigned char) s[i]);
2004 else
2005 lcopy[i] = s[i];
2007 dfainit(d);
2008 dfaparse(lcopy, len, d);
2009 free(lcopy);
2010 dfamust(d);
2011 d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
2012 case_fold = 1;
2013 dfaparse(s, len, d);
2014 dfaanalyze(d, searchflag);
2016 else
2018 dfainit(d);
2019 dfaparse(s, len, d);
2020 dfamust(d);
2021 dfaanalyze(d, searchflag);
2025 /* Free the storage held by the components of a dfa. */
2026 void
2027 dfafree (struct dfa *d)
2029 int i;
2030 struct dfamust *dm, *ndm;
2032 free((ptr_t) d->charclasses);
2033 free((ptr_t) d->tokens);
2034 for (i = 0; i < d->sindex; ++i)
2035 free((ptr_t) d->states[i].elems.elems);
2036 free((ptr_t) d->states);
2037 for (i = 0; i < d->tindex; ++i)
2038 if (d->follows[i].elems)
2039 free((ptr_t) d->follows[i].elems);
2040 free((ptr_t) d->follows);
2041 for (i = 0; i < d->tralloc; ++i)
2042 if (d->trans[i])
2043 free((ptr_t) d->trans[i]);
2044 else if (d->fails[i])
2045 free((ptr_t) d->fails[i]);
2046 if (d->realtrans) free((ptr_t) d->realtrans);
2047 if (d->fails) free((ptr_t) d->fails);
2048 if (d->newlines) free((ptr_t) d->newlines);
2049 if (d->success) free((ptr_t) d->success);
2050 for (dm = d->musts; dm; dm = ndm)
2052 ndm = dm->next;
2053 free(dm->must);
2054 free((ptr_t) dm);
2058 /* Having found the postfix representation of the regular expression,
2059 try to find a long sequence of characters that must appear in any line
2060 containing the r.e.
2061 Finding a "longest" sequence is beyond the scope here;
2062 we take an easy way out and hope for the best.
2063 (Take "(ab|a)b"--please.)
2065 We do a bottom-up calculation of sequences of characters that must appear
2066 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
2067 representation:
2068 sequences that must appear at the left of the match ("left")
2069 sequences that must appear at the right of the match ("right")
2070 lists of sequences that must appear somewhere in the match ("in")
2071 sequences that must constitute the match ("is")
2073 When we get to the root of the tree, we use one of the longest of its
2074 calculated "in" sequences as our answer. The sequence we find is returned in
2075 d->must (where "d" is the single argument passed to "dfamust");
2076 the length of the sequence is returned in d->mustn.
2078 The sequences calculated for the various types of node (in pseudo ANSI c)
2079 are shown below. "p" is the operand of unary operators (and the left-hand
2080 operand of binary operators); "q" is the right-hand operand of binary
2081 operators.
2083 "ZERO" means "a zero-length sequence" below.
2085 Type left right is in
2086 ---- ---- ----- -- --
2087 char c # c # c # c # c
2089 CSET ZERO ZERO ZERO ZERO
2091 STAR ZERO ZERO ZERO ZERO
2093 QMARK ZERO ZERO ZERO ZERO
2095 PLUS p->left p->right ZERO p->in
2097 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
2098 p->left : q->right : q->is!=ZERO) ? q->in plus
2099 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
2100 ZERO
2102 OR longest common longest common (do p->is and substrings common to
2103 leading trailing q->is have same p->in and q->in
2104 (sub)sequence (sub)sequence length and
2105 of p->left of p->right content) ?
2106 and q->left and q->right p->is : NULL
2108 If there's anything else we recognize in the tree, all four sequences get set
2109 to zero-length sequences. If there's something we don't recognize in the tree,
2110 we just return a zero-length sequence.
2112 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
2113 'aaa')?
2115 And. . .is it here or someplace that we might ponder "optimizations" such as
2116 egrep 'psi|epsilon' -> egrep 'psi'
2117 egrep 'pepsi|epsilon' -> egrep 'epsi'
2118 (Yes, we now find "epsi" as a "string
2119 that must occur", but we might also
2120 simplify the *entire* r.e. being sought)
2121 grep '[c]' -> grep 'c'
2122 grep '(ab|a)b' -> grep 'ab'
2123 grep 'ab*' -> grep 'a'
2124 grep 'a*b' -> grep 'b'
2126 There are several issues:
2128 Is optimization easy (enough)?
2130 Does optimization actually accomplish anything,
2131 or is the automaton you get from "psi|epsilon" (for example)
2132 the same as the one you get from "psi" (for example)?
2134 Are optimizable r.e.'s likely to be used in real-life situations
2135 (something like 'ab*' is probably unlikely; something like is
2136 'psi|epsilon' is likelier)? */
2138 static char *
2139 icatalloc (char *old, char *new)
2141 char *result;
2142 size_t oldsize, newsize;
2144 newsize = (new == NULL) ? 0 : strlen(new);
2145 if (old == NULL)
2146 oldsize = 0;
2147 else if (newsize == 0)
2148 return old;
2149 else oldsize = strlen(old);
2150 if (old == NULL)
2151 result = (char *) malloc(newsize + 1);
2152 else
2153 result = (char *) realloc((void *) old, oldsize + newsize + 1);
2154 if (result != NULL && new != NULL)
2155 (void) strcpy(result + oldsize, new);
2156 return result;
2159 static char *
2160 icpyalloc (char *string)
2162 return icatalloc((char *) NULL, string);
2165 static char *
2166 istrstr (char *lookin, char *lookfor)
2168 char *cp;
2169 size_t len;
2171 len = strlen(lookfor);
2172 for (cp = lookin; *cp != '\0'; ++cp)
2173 if (strncmp(cp, lookfor, len) == 0)
2174 return cp;
2175 return NULL;
2178 static void
2179 ifree (char *cp)
2181 if (cp != NULL)
2182 free(cp);
2185 static void
2186 freelist (char **cpp)
2188 int i;
2190 if (cpp == NULL)
2191 return;
2192 for (i = 0; cpp[i] != NULL; ++i)
2194 free(cpp[i]);
2195 cpp[i] = NULL;
2199 static char **
2200 enlist (char **cpp, char *new, size_t len)
2202 int i, j;
2204 if (cpp == NULL)
2205 return NULL;
2206 if ((new = icpyalloc(new)) == NULL)
2208 freelist(cpp);
2209 return NULL;
2211 new[len] = '\0';
2212 /* Is there already something in the list that's new (or longer)? */
2213 for (i = 0; cpp[i] != NULL; ++i)
2214 if (istrstr(cpp[i], new) != NULL)
2216 free(new);
2217 return cpp;
2219 /* Eliminate any obsoleted strings. */
2220 j = 0;
2221 while (cpp[j] != NULL)
2222 if (istrstr(new, cpp[j]) == NULL)
2223 ++j;
2224 else
2226 free(cpp[j]);
2227 if (--i == j)
2228 break;
2229 cpp[j] = cpp[i];
2230 cpp[i] = NULL;
2232 /* Add the new string. */
2233 cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp);
2234 if (cpp == NULL)
2235 return NULL;
2236 cpp[i] = new;
2237 cpp[i + 1] = NULL;
2238 return cpp;
2241 /* Given pointers to two strings, return a pointer to an allocated
2242 list of their distinct common substrings. Return NULL if something
2243 seems wild. */
2244 static char **
2245 comsubs (char *left, char *right)
2247 char **cpp;
2248 char *lcp;
2249 char *rcp;
2250 size_t i, len;
2252 if (left == NULL || right == NULL)
2253 return NULL;
2254 cpp = (char **) malloc(sizeof *cpp);
2255 if (cpp == NULL)
2256 return NULL;
2257 cpp[0] = NULL;
2258 for (lcp = left; *lcp != '\0'; ++lcp)
2260 len = 0;
2261 rcp = index(right, *lcp);
2262 while (rcp != NULL)
2264 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
2265 continue;
2266 if (i > len)
2267 len = i;
2268 rcp = index(rcp + 1, *lcp);
2270 if (len == 0)
2271 continue;
2272 if ((cpp = enlist(cpp, lcp, len)) == NULL)
2273 break;
2275 return cpp;
2278 static char **
2279 addlists (char **old, char **new)
2281 int i;
2283 if (old == NULL || new == NULL)
2284 return NULL;
2285 for (i = 0; new[i] != NULL; ++i)
2287 old = enlist(old, new[i], strlen(new[i]));
2288 if (old == NULL)
2289 break;
2291 return old;
2294 /* Given two lists of substrings, return a new list giving substrings
2295 common to both. */
2296 static char **
2297 inboth (char **left, char **right)
2299 char **both;
2300 char **temp;
2301 int lnum, rnum;
2303 if (left == NULL || right == NULL)
2304 return NULL;
2305 both = (char **) malloc(sizeof *both);
2306 if (both == NULL)
2307 return NULL;
2308 both[0] = NULL;
2309 for (lnum = 0; left[lnum] != NULL; ++lnum)
2311 for (rnum = 0; right[rnum] != NULL; ++rnum)
2313 temp = comsubs(left[lnum], right[rnum]);
2314 if (temp == NULL)
2316 freelist(both);
2317 return NULL;
2319 both = addlists(both, temp);
2320 freelist(temp);
2321 free(temp);
2322 if (both == NULL)
2323 return NULL;
2326 return both;
2329 typedef struct
2331 char **in;
2332 char *left;
2333 char *right;
2334 char *is;
2335 } must;
2337 static void
2338 resetmust (must *mp)
2340 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
2341 freelist(mp->in);
2344 static void
2345 dfamust (struct dfa *dfa)
2347 must *musts;
2348 must *mp;
2349 char *result;
2350 int ri;
2351 int i;
2352 int exact;
2353 token t;
2354 static must must0;
2355 struct dfamust *dm;
2356 static char empty_string[] = "";
2358 result = empty_string;
2359 exact = 0;
2360 musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts);
2361 if (musts == NULL)
2362 return;
2363 mp = musts;
2364 for (i = 0; i <= dfa->tindex; ++i)
2365 mp[i] = must0;
2366 for (i = 0; i <= dfa->tindex; ++i)
2368 mp[i].in = (char **) malloc(sizeof *mp[i].in);
2369 mp[i].left = malloc(2);
2370 mp[i].right = malloc(2);
2371 mp[i].is = malloc(2);
2372 if (mp[i].in == NULL || mp[i].left == NULL ||
2373 mp[i].right == NULL || mp[i].is == NULL)
2374 goto done;
2375 mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0';
2376 mp[i].in[0] = NULL;
2378 #ifdef DEBUG
2379 fprintf(stderr, "dfamust:\n");
2380 for (i = 0; i < dfa->tindex; ++i)
2382 fprintf(stderr, " %d:", i);
2383 prtok(dfa->tokens[i]);
2385 putc('\n', stderr);
2386 #endif
2387 for (ri = 0; ri < dfa->tindex; ++ri)
2389 switch (t = dfa->tokens[ri])
2391 case LPAREN:
2392 case RPAREN:
2393 goto done; /* "cannot happen" */
2394 case EMPTY:
2395 case BEGLINE:
2396 case ENDLINE:
2397 case BEGWORD:
2398 case ENDWORD:
2399 case LIMWORD:
2400 case NOTLIMWORD:
2401 case BACKREF:
2402 resetmust(mp);
2403 break;
2404 case STAR:
2405 case QMARK:
2406 if (mp <= musts)
2407 goto done; /* "cannot happen" */
2408 --mp;
2409 resetmust(mp);
2410 break;
2411 case OR:
2412 case ORTOP:
2413 if (mp < &musts[2])
2414 goto done; /* "cannot happen" */
2416 char **new;
2417 must *lmp;
2418 must *rmp;
2419 int j, ln, rn, n;
2421 rmp = --mp;
2422 lmp = --mp;
2423 /* Guaranteed to be. Unlikely, but. . . */
2424 if (strcmp(lmp->is, rmp->is) != 0)
2425 lmp->is[0] = '\0';
2426 /* Left side--easy */
2427 i = 0;
2428 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
2429 ++i;
2430 lmp->left[i] = '\0';
2431 /* Right side */
2432 ln = strlen(lmp->right);
2433 rn = strlen(rmp->right);
2434 n = ln;
2435 if (n > rn)
2436 n = rn;
2437 for (i = 0; i < n; ++i)
2438 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
2439 break;
2440 for (j = 0; j < i; ++j)
2441 lmp->right[j] = lmp->right[(ln - i) + j];
2442 lmp->right[j] = '\0';
2443 new = inboth(lmp->in, rmp->in);
2444 if (new == NULL)
2445 goto done;
2446 freelist(lmp->in);
2447 free((char *) lmp->in);
2448 lmp->in = new;
2450 break;
2451 case PLUS:
2452 if (mp <= musts)
2453 goto done; /* "cannot happen" */
2454 --mp;
2455 mp->is[0] = '\0';
2456 break;
2457 case END:
2458 if (mp != &musts[1])
2459 goto done; /* "cannot happen" */
2460 for (i = 0; musts[0].in[i] != NULL; ++i)
2461 if (strlen(musts[0].in[i]) > strlen(result))
2462 result = musts[0].in[i];
2463 if (strcmp(result, musts[0].is) == 0)
2464 exact = 1;
2465 goto done;
2466 case CAT:
2467 if (mp < &musts[2])
2468 goto done; /* "cannot happen" */
2470 must *lmp;
2471 must *rmp;
2473 rmp = --mp;
2474 lmp = --mp;
2475 /* In. Everything in left, plus everything in
2476 right, plus catenation of
2477 left's right and right's left. */
2478 lmp->in = addlists(lmp->in, rmp->in);
2479 if (lmp->in == NULL)
2480 goto done;
2481 if (lmp->right[0] != '\0' &&
2482 rmp->left[0] != '\0')
2484 char *tp;
2486 tp = icpyalloc(lmp->right);
2487 if (tp == NULL)
2488 goto done;
2489 tp = icatalloc(tp, rmp->left);
2490 if (tp == NULL)
2491 goto done;
2492 lmp->in = enlist(lmp->in, tp,
2493 strlen(tp));
2494 free(tp);
2495 if (lmp->in == NULL)
2496 goto done;
2498 /* Left-hand */
2499 if (lmp->is[0] != '\0')
2501 lmp->left = icatalloc(lmp->left,
2502 rmp->left);
2503 if (lmp->left == NULL)
2504 goto done;
2506 /* Right-hand */
2507 if (rmp->is[0] == '\0')
2508 lmp->right[0] = '\0';
2509 lmp->right = icatalloc(lmp->right, rmp->right);
2510 if (lmp->right == NULL)
2511 goto done;
2512 /* Guaranteed to be */
2513 if (lmp->is[0] != '\0' && rmp->is[0] != '\0')
2515 lmp->is = icatalloc(lmp->is, rmp->is);
2516 if (lmp->is == NULL)
2517 goto done;
2519 else
2520 lmp->is[0] = '\0';
2522 break;
2523 default:
2524 if (t < END)
2526 /* "cannot happen" */
2527 goto done;
2529 else if (t == '\0')
2531 /* not on *my* shift */
2532 goto done;
2534 else if (t >= CSET)
2536 /* easy enough */
2537 resetmust(mp);
2539 else
2541 /* plain character */
2542 resetmust(mp);
2543 mp->is[0] = mp->left[0] = mp->right[0] = t;
2544 mp->is[1] = mp->left[1] = mp->right[1] = '\0';
2545 mp->in = enlist(mp->in, mp->is, (size_t)1);
2546 if (mp->in == NULL)
2547 goto done;
2549 break;
2551 #ifdef DEBUG
2552 fprintf(stderr, " node: %d:", ri);
2553 prtok(dfa->tokens[ri]);
2554 fprintf(stderr, "\n in:");
2555 for (i = 0; mp->in[i]; ++i)
2556 fprintf(stderr, " \"%s\"", mp->in[i]);
2557 fprintf(stderr, "\n is: \"%s\"\n", mp->is);
2558 fprintf(stderr, " left: \"%s\"\n", mp->left);
2559 fprintf(stderr, " right: \"%s\"\n", mp->right);
2560 #endif
2561 ++mp;
2563 done:
2564 if (strlen(result))
2566 dm = (struct dfamust *) malloc(sizeof (struct dfamust));
2567 dm->exact = exact;
2568 dm->must = malloc(strlen(result) + 1);
2569 strcpy(dm->must, result);
2570 dm->next = dfa->musts;
2571 dfa->musts = dm;
2573 mp = musts;
2574 for (i = 0; i <= dfa->tindex; ++i)
2576 freelist(mp[i].in);
2577 ifree((char *) mp[i].in);
2578 ifree(mp[i].left);
2579 ifree(mp[i].right);
2580 ifree(mp[i].is);
2582 free((char *) mp);