*-map tests: Fix compilation error.
[gnulib.git] / lib / dfa.c
blob329a2092a7eeeaeb1ee5379738ec2050f61a862e
1 /* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2019 Free Software
3 Foundation, Inc.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc.,
18 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
20 /* Written June, 1988 by Mike Haertel
21 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
23 #include <config.h>
25 #include "dfa.h"
27 #include <assert.h>
28 #include <ctype.h>
29 #include <stdint.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <limits.h>
33 #include <string.h>
34 #include <locale.h>
36 static bool
37 streq (char const *a, char const *b)
39 return strcmp (a, b) == 0;
42 static bool
43 isasciidigit (char c)
45 return '0' <= c && c <= '9';
48 #include "gettext.h"
49 #define _(str) gettext (str)
51 #include <wchar.h>
53 #include "intprops.h"
54 #include "xalloc.h"
55 #include "localeinfo.h"
57 #ifndef FALLTHROUGH
58 # if __GNUC__ < 7
59 # define FALLTHROUGH ((void) 0)
60 # else
61 # define FALLTHROUGH __attribute__ ((__fallthrough__))
62 # endif
63 #endif
65 #ifndef MIN
66 # define MIN(a,b) ((a) < (b) ? (a) : (b))
67 #endif
69 /* HPUX defines these as macros in sys/param.h. */
70 #ifdef setbit
71 # undef setbit
72 #endif
73 #ifdef clrbit
74 # undef clrbit
75 #endif
77 /* First integer value that is greater than any character code. */
78 enum { NOTCHAR = 1 << CHAR_BIT };
80 /* This represents part of a character class. It must be unsigned and
81 at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
82 typedef unsigned long int charclass_word;
84 /* CHARCLASS_WORD_BITS is the number of bits used in a charclass word.
85 CHARCLASS_PAIR (LO, HI) is part of a charclass initializer, and
86 represents 64 bits' worth of a charclass, where LO and HI are the
87 low and high-order 32 bits of the 64-bit quantity. */
88 #if ULONG_MAX >> 31 >> 31 < 3
89 enum { CHARCLASS_WORD_BITS = 32 };
90 # define CHARCLASS_PAIR(lo, hi) lo, hi
91 #else
92 enum { CHARCLASS_WORD_BITS = 64 };
93 # define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
94 #endif
96 /* An initializer for a charclass whose 32-bit words are A through H. */
97 #define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \
98 {{ \
99 CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \
100 CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h) \
103 /* The maximum useful value of a charclass_word; all used bits are 1. */
104 static charclass_word const CHARCLASS_WORD_MASK
105 = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
107 /* Number of words required to hold a bit for every character. */
108 enum
110 CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
113 /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
114 typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
116 /* Convert a possibly-signed character to an unsigned character. This is
117 a bit safer than casting to unsigned char, since it catches some type
118 errors that the cast doesn't. */
119 static unsigned char
120 to_uchar (char ch)
122 return ch;
125 /* Contexts tell us whether a character is a newline or a word constituent.
126 Word-constituent characters are those that satisfy iswalnum, plus '_'.
127 Each character has a single CTX_* value; bitmasks of CTX_* values denote
128 a particular character class.
130 A state also stores a context value, which is a bitmask of CTX_* values.
131 A state's context represents a set of characters that the state's
132 predecessors must match. For example, a state whose context does not
133 include CTX_LETTER will never have transitions where the previous
134 character is a word constituent. A state whose context is CTX_ANY
135 might have transitions from any character. */
137 enum
139 CTX_NONE = 1,
140 CTX_LETTER = 2,
141 CTX_NEWLINE = 4,
142 CTX_ANY = 7
145 /* Sometimes characters can only be matched depending on the surrounding
146 context. Such context decisions depend on what the previous character
147 was, and the value of the current (lookahead) character. Context
148 dependent constraints are encoded as 9-bit integers. Each bit that
149 is set indicates that the constraint succeeds in the corresponding
150 context.
152 bit 6-8 - valid contexts when next character is CTX_NEWLINE
153 bit 3-5 - valid contexts when next character is CTX_LETTER
154 bit 0-2 - valid contexts when next character is CTX_NONE
156 succeeds_in_context determines whether a given constraint
157 succeeds in a particular context. Prev is a bitmask of possible
158 context values for the previous character, curr is the (single-bit)
159 context value for the lookahead character. */
160 static int
161 newline_constraint (int constraint)
163 return (constraint >> 6) & 7;
165 static int
166 letter_constraint (int constraint)
168 return (constraint >> 3) & 7;
170 static int
171 other_constraint (int constraint)
173 return constraint & 7;
176 static bool
177 succeeds_in_context (int constraint, int prev, int curr)
179 return !! (((curr & CTX_NONE ? other_constraint (constraint) : 0) \
180 | (curr & CTX_LETTER ? letter_constraint (constraint) : 0) \
181 | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
182 & prev);
185 /* The following describe what a constraint depends on. */
186 static bool
187 prev_newline_dependent (int constraint)
189 return ((constraint ^ constraint >> 2) & 0111) != 0;
191 static bool
192 prev_letter_dependent (int constraint)
194 return ((constraint ^ constraint >> 1) & 0111) != 0;
197 /* Tokens that match the empty string subject to some constraint actually
198 work by applying that constraint to determine what may follow them,
199 taking into account what has gone before. The following values are
200 the constraints corresponding to the special tokens previously defined. */
201 enum
203 NO_CONSTRAINT = 0777,
204 BEGLINE_CONSTRAINT = 0444,
205 ENDLINE_CONSTRAINT = 0700,
206 BEGWORD_CONSTRAINT = 0050,
207 ENDWORD_CONSTRAINT = 0202,
208 LIMWORD_CONSTRAINT = 0252,
209 NOTLIMWORD_CONSTRAINT = 0525
212 /* The regexp is parsed into an array of tokens in postfix form. Some tokens
213 are operators and others are terminal symbols. Most (but not all) of these
214 codes are returned by the lexical analyzer. */
216 typedef ptrdiff_t token;
217 static ptrdiff_t const TOKEN_MAX = PTRDIFF_MAX;
219 /* States are indexed by state_num values. These are normally
220 nonnegative but -1 is used as a special value. */
221 typedef ptrdiff_t state_num;
223 /* Predefined token values. */
224 enum
226 END = -1, /* END is a terminal symbol that matches the
227 end of input; any value of END or less in
228 the parse tree is such a symbol. Accepting
229 states of the DFA are those that would have
230 a transition on END. This is -1, not some
231 more-negative value, to tweak the speed of
232 comparisons to END. */
234 /* Ordinary character values are terminal symbols that match themselves. */
236 /* CSET must come last in the following list of special tokens. Otherwise,
237 the list order matters only for performance. Related special tokens
238 should have nearby values so that code like (t == ANYCHAR || t == MBCSET
239 || CSET <= t) can be done with a single machine-level comparison. */
241 EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
242 the empty string. */
244 QMARK, /* QMARK is an operator of one argument that
245 matches zero or one occurrences of its
246 argument. */
248 STAR, /* STAR is an operator of one argument that
249 matches the Kleene closure (zero or more
250 occurrences) of its argument. */
252 PLUS, /* PLUS is an operator of one argument that
253 matches the positive closure (one or more
254 occurrences) of its argument. */
256 REPMN, /* REPMN is a lexical token corresponding
257 to the {m,n} construct. REPMN never
258 appears in the compiled token vector. */
260 CAT, /* CAT is an operator of two arguments that
261 matches the concatenation of its
262 arguments. CAT is never returned by the
263 lexical analyzer. */
265 OR, /* OR is an operator of two arguments that
266 matches either of its arguments. */
268 LPAREN, /* LPAREN never appears in the parse tree,
269 it is only a lexeme. */
271 RPAREN, /* RPAREN never appears in the parse tree. */
273 WCHAR, /* Only returned by lex. wctok contains
274 the wide character representation. */
276 ANYCHAR, /* ANYCHAR is a terminal symbol that matches
277 a valid multibyte (or single byte) character.
278 It is used only if MB_CUR_MAX > 1. */
280 BEG, /* BEG is an initial symbol that matches the
281 beginning of input. */
283 BEGLINE, /* BEGLINE is a terminal symbol that matches
284 the empty string at the beginning of a
285 line. */
287 ENDLINE, /* ENDLINE is a terminal symbol that matches
288 the empty string at the end of a line. */
290 BEGWORD, /* BEGWORD is a terminal symbol that matches
291 the empty string at the beginning of a
292 word. */
294 ENDWORD, /* ENDWORD is a terminal symbol that matches
295 the empty string at the end of a word. */
297 LIMWORD, /* LIMWORD is a terminal symbol that matches
298 the empty string at the beginning or the
299 end of a word. */
301 NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
302 matches the empty string not at
303 the beginning or end of a word. */
305 BACKREF, /* BACKREF is generated by \<digit>
306 or by any other construct that
307 is not completely handled. If the scanner
308 detects a transition on backref, it returns
309 a kind of "semi-success" indicating that
310 the match will have to be verified with
311 a backtracking matcher. */
313 MBCSET, /* MBCSET is similar to CSET, but for
314 multibyte characters. */
316 CSET /* CSET and (and any value greater) is a
317 terminal symbol that matches any of a
318 class of characters. */
322 /* States of the recognizer correspond to sets of positions in the parse
323 tree, together with the constraints under which they may be matched.
324 So a position is encoded as an index into the parse tree together with
325 a constraint. */
326 typedef struct
328 size_t index; /* Index into the parse array. */
329 unsigned int constraint; /* Constraint for matching this position. */
330 } position;
332 /* Sets of positions are stored as arrays. */
333 typedef struct
335 position *elems; /* Elements of this position set. */
336 ptrdiff_t nelem; /* Number of elements in this set. */
337 ptrdiff_t alloc; /* Number of elements allocated in ELEMS. */
338 } position_set;
340 /* A state of the dfa consists of a set of positions, some flags,
341 and the token value of the lowest-numbered position of the state that
342 contains an END token. */
343 typedef struct
345 size_t hash; /* Hash of the positions of this state. */
346 position_set elems; /* Positions this state could match. */
347 unsigned char context; /* Context from previous state. */
348 unsigned short constraint; /* Constraint for this state to accept. */
349 token first_end; /* Token value of the first END in elems. */
350 position_set mbps; /* Positions which can match multibyte
351 characters or the follows, e.g., period.
352 Used only if MB_CUR_MAX > 1. */
353 state_num mb_trindex; /* Index of this state in MB_TRANS, or
354 negative if the state does not have
355 ANYCHAR. */
356 } dfa_state;
358 /* Maximum for any transition table count. This should be at least 3,
359 for the initial state setup. */
360 enum { MAX_TRCOUNT = 1024 };
362 /* A bracket operator.
363 e.g., [a-c], [[:alpha:]], etc. */
364 struct mb_char_classes
366 ptrdiff_t cset;
367 bool invert;
368 wchar_t *chars; /* Normal characters. */
369 ptrdiff_t nchars;
370 ptrdiff_t nchars_alloc;
373 struct regex_syntax
375 /* Syntax bits controlling the behavior of the lexical analyzer. */
376 reg_syntax_t syntax_bits;
377 bool syntax_bits_set;
379 /* Flag for case-folding letters into sets. */
380 bool case_fold;
382 /* True if ^ and $ match only the start and end of data, and do not match
383 end-of-line within data. */
384 bool anchor;
386 /* End-of-line byte in data. */
387 unsigned char eolbyte;
389 /* Cache of char-context values. */
390 char sbit[NOTCHAR];
392 /* If never_trail[B], the byte B cannot be a non-initial byte in a
393 multibyte character. */
394 bool never_trail[NOTCHAR];
396 /* Set of characters considered letters. */
397 charclass letters;
399 /* Set of characters that are newline. */
400 charclass newline;
403 /* Lexical analyzer. All the dross that deals with the obnoxious
404 GNU Regex syntax bits is located here. The poor, suffering
405 reader is referred to the GNU Regex documentation for the
406 meaning of the @#%!@#%^!@ syntax bits. */
407 struct lexer_state
409 char const *ptr; /* Pointer to next input character. */
410 size_t left; /* Number of characters remaining. */
411 token lasttok; /* Previous token returned; initially END. */
412 size_t parens; /* Count of outstanding left parens. */
413 int minrep, maxrep; /* Repeat counts for {m,n}. */
415 /* Wide character representation of the current multibyte character,
416 or WEOF if there was an encoding error. Used only if
417 MB_CUR_MAX > 1. */
418 wint_t wctok;
420 /* Length of the multibyte representation of wctok. */
421 int cur_mb_len;
423 /* The most recently analyzed multibyte bracket expression. */
424 struct mb_char_classes brack;
426 /* We're separated from beginning or (, | only by zero-width characters. */
427 bool laststart;
430 /* Recursive descent parser for regular expressions. */
432 struct parser_state
434 token tok; /* Lookahead token. */
435 size_t depth; /* Current depth of a hypothetical stack
436 holding deferred productions. This is
437 used to determine the depth that will be
438 required of the real stack later on in
439 dfaanalyze. */
442 /* A compiled regular expression. */
443 struct dfa
445 /* Syntax configuration */
446 struct regex_syntax syntax;
448 /* Fields filled by the scanner. */
449 charclass *charclasses; /* Array of character sets for CSET tokens. */
450 ptrdiff_t cindex; /* Index for adding new charclasses. */
451 ptrdiff_t calloc; /* Number of charclasses allocated. */
452 size_t canychar; /* Index of anychar class, or (size_t) -1. */
454 /* Scanner state */
455 struct lexer_state lex;
457 /* Parser state */
458 struct parser_state parse;
460 /* Fields filled by the parser. */
461 token *tokens; /* Postfix parse array. */
462 size_t tindex; /* Index for adding new tokens. */
463 size_t talloc; /* Number of tokens currently allocated. */
464 size_t depth; /* Depth required of an evaluation stack
465 used for depth-first traversal of the
466 parse tree. */
467 size_t nleaves; /* Number of leaves on the parse tree. */
468 size_t nregexps; /* Count of parallel regexps being built
469 with dfaparse. */
470 bool fast; /* The DFA is fast. */
471 token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
472 mbstate_t mbs; /* Multibyte conversion state. */
474 /* The following are valid only if MB_CUR_MAX > 1. */
476 /* The value of multibyte_prop[i] is defined by following rule.
477 if tokens[i] < NOTCHAR
478 bit 0 : tokens[i] is the first byte of a character, including
479 single-byte characters.
480 bit 1 : tokens[i] is the last byte of a character, including
481 single-byte characters.
483 e.g.
484 tokens
485 = 'single_byte_a', 'multi_byte_A', single_byte_b'
486 = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
487 multibyte_prop
488 = 3 , 1 , 0 , 2 , 3
490 char *multibyte_prop;
492 /* Fields filled by the superset. */
493 struct dfa *superset; /* Hint of the dfa. */
495 /* Fields filled by the state builder. */
496 dfa_state *states; /* States of the dfa. */
497 state_num sindex; /* Index for adding new states. */
498 ptrdiff_t salloc; /* Number of states currently allocated. */
500 /* Fields filled by the parse tree->NFA conversion. */
501 position_set *follows; /* Array of follow sets, indexed by position
502 index. The follow of a position is the set
503 of positions containing characters that
504 could conceivably follow a character
505 matching the given position in a string
506 matching the regexp. Allocated to the
507 maximum possible position index. */
508 bool searchflag; /* We are supposed to build a searching
509 as opposed to an exact matcher. A searching
510 matcher finds the first and shortest string
511 matching a regexp anywhere in the buffer,
512 whereas an exact matcher finds the longest
513 string matching, but anchored to the
514 beginning of the buffer. */
516 /* Fields filled by dfaanalyze. */
517 int *constraints; /* Array of union of accepting constraints
518 in the follow of a position. */
519 int *separates; /* Array of contexts on follow of a
520 position. */
522 /* Fields filled by dfaexec. */
523 state_num tralloc; /* Number of transition tables that have
524 slots so far, not counting trans[-1] and
525 trans[-2]. */
526 int trcount; /* Number of transition tables that have
527 been built, other than for initial
528 states. */
529 int min_trcount; /* Number of initial states. Equivalently,
530 the minimum state number for which trcount
531 counts transitions. */
532 state_num **trans; /* Transition tables for states that can
533 never accept. If the transitions for a
534 state have not yet been computed, or the
535 state could possibly accept, its entry in
536 this table is NULL. This points to two
537 past the start of the allocated array,
538 and trans[-1] and trans[-2] are always
539 NULL. */
540 state_num **fails; /* Transition tables after failing to accept
541 on a state that potentially could do so.
542 If trans[i] is non-null, fails[i] must
543 be null. */
544 char *success; /* Table of acceptance conditions used in
545 dfaexec and computed in build_state. */
546 state_num *newlines; /* Transitions on newlines. The entry for a
547 newline in any transition table is always
548 -1 so we can count lines without wasting
549 too many cycles. The transition for a
550 newline is stored separately and handled
551 as a special case. Newline is also used
552 as a sentinel at the end of the buffer. */
553 state_num initstate_notbol; /* Initial state for CTX_LETTER and CTX_NONE
554 context in multibyte locales, in which we
555 do not distinguish between their contexts,
556 as not supported word. */
557 position_set mb_follows; /* Follow set added by ANYCHAR on demand. */
558 state_num **mb_trans; /* Transition tables for states with
559 ANYCHAR. */
560 state_num mb_trcount; /* Number of transition tables for states with
561 ANYCHAR that have actually been built. */
563 /* Information derived from the locale. This is at the end so that
564 a quick memset need not clear it specially. */
566 /* dfaexec implementation. */
567 char *(*dfaexec) (struct dfa *, char const *, char *,
568 bool, size_t *, bool *);
570 /* The locale is simple, like the C locale. These locales can be
571 processed more efficiently, as they are single-byte, their native
572 character set is in collating-sequence order, and they do not
573 have multi-character collating elements. */
574 bool simple_locale;
576 /* Other cached information derived from the locale. */
577 struct localeinfo localeinfo;
580 /* User access to dfa internals. */
582 /* S could possibly be an accepting state of R. */
583 static bool
584 accepting (state_num s, struct dfa const *r)
586 return r->states[s].constraint != 0;
589 /* STATE accepts in the specified context. */
590 static bool
591 accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
593 return succeeds_in_context (dfa->states[state].constraint, prev, curr);
596 static void regexp (struct dfa *dfa);
598 /* Store into *PWC the result of converting the leading bytes of the
599 multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
600 and updating the conversion state in *D. On conversion error,
601 convert just a single byte, to WEOF. Return the number of bytes
602 converted.
604 This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
606 * PWC points to wint_t, not to wchar_t.
607 * The last arg is a dfa *D instead of merely a multibyte conversion
608 state D->mbs.
609 * N must be at least 1.
610 * S[N - 1] must be a sentinel byte.
611 * Shift encodings are not supported.
612 * The return value is always in the range 1..N.
613 * D->mbs is always valid afterwards.
614 * *PWC is always set to something. */
615 static size_t
616 mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
618 unsigned char uc = s[0];
619 wint_t wc = d->localeinfo.sbctowc[uc];
621 if (wc == WEOF)
623 wchar_t wch;
624 size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
625 if (0 < nbytes && nbytes < (size_t) -2)
627 *pwc = wch;
628 return nbytes;
630 memset (&d->mbs, 0, sizeof d->mbs);
633 *pwc = wc;
634 return 1;
637 #ifdef DEBUG
639 static void
640 prtok (token t)
642 if (t <= END)
643 fprintf (stderr, "END");
644 else if (0 <= t && t < NOTCHAR)
646 unsigned int ch = t;
647 fprintf (stderr, "0x%02x", ch);
649 else
651 char const *s;
652 switch (t)
654 case BEG:
655 s = "BEG";
656 break;
657 case EMPTY:
658 s = "EMPTY";
659 break;
660 case BACKREF:
661 s = "BACKREF";
662 break;
663 case BEGLINE:
664 s = "BEGLINE";
665 break;
666 case ENDLINE:
667 s = "ENDLINE";
668 break;
669 case BEGWORD:
670 s = "BEGWORD";
671 break;
672 case ENDWORD:
673 s = "ENDWORD";
674 break;
675 case LIMWORD:
676 s = "LIMWORD";
677 break;
678 case NOTLIMWORD:
679 s = "NOTLIMWORD";
680 break;
681 case QMARK:
682 s = "QMARK";
683 break;
684 case STAR:
685 s = "STAR";
686 break;
687 case PLUS:
688 s = "PLUS";
689 break;
690 case CAT:
691 s = "CAT";
692 break;
693 case OR:
694 s = "OR";
695 break;
696 case LPAREN:
697 s = "LPAREN";
698 break;
699 case RPAREN:
700 s = "RPAREN";
701 break;
702 case ANYCHAR:
703 s = "ANYCHAR";
704 break;
705 case MBCSET:
706 s = "MBCSET";
707 break;
708 default:
709 s = "CSET";
710 break;
712 fprintf (stderr, "%s", s);
715 #endif /* DEBUG */
717 /* Stuff pertaining to charclasses. */
719 static bool
720 tstbit (unsigned int b, charclass const *c)
722 return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
725 static void
726 setbit (unsigned int b, charclass *c)
728 charclass_word one = 1;
729 c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
732 static void
733 clrbit (unsigned int b, charclass *c)
735 charclass_word one = 1;
736 c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
739 static void
740 zeroset (charclass *s)
742 memset (s, 0, sizeof *s);
745 static void
746 fillset (charclass *s)
748 for (int i = 0; i < CHARCLASS_WORDS; i++)
749 s->w[i] = CHARCLASS_WORD_MASK;
752 static void
753 notset (charclass *s)
755 for (int i = 0; i < CHARCLASS_WORDS; ++i)
756 s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
759 static bool
760 equal (charclass const *s1, charclass const *s2)
762 charclass_word w = 0;
763 for (int i = 0; i < CHARCLASS_WORDS; i++)
764 w |= s1->w[i] ^ s2->w[i];
765 return w == 0;
768 static bool
769 emptyset (charclass const *s)
771 charclass_word w = 0;
772 for (int i = 0; i < CHARCLASS_WORDS; i++)
773 w |= s->w[i];
774 return w == 0;
777 /* Grow PA, which points to an array of *NITEMS items, and return the
778 location of the reallocated array, updating *NITEMS to reflect its
779 new size. The new array will contain at least NITEMS_INCR_MIN more
780 items, but will not contain more than NITEMS_MAX items total.
781 ITEM_SIZE is the size of each item, in bytes.
783 ITEM_SIZE and NITEMS_INCR_MIN must be positive. *NITEMS must be
784 nonnegative. If NITEMS_MAX is -1, it is treated as if it were
785 infinity.
787 If PA is null, then allocate a new array instead of reallocating
788 the old one.
790 Thus, to grow an array A without saving its old contents, do
791 { free (A); A = xpalloc (NULL, &AITEMS, ...); }. */
793 static void *
794 xpalloc (void *pa, ptrdiff_t *nitems, ptrdiff_t nitems_incr_min,
795 ptrdiff_t nitems_max, ptrdiff_t item_size)
797 ptrdiff_t n0 = *nitems;
799 /* The approximate size to use for initial small allocation
800 requests. This is the largest "small" request for the GNU C
801 library malloc. */
802 enum { DEFAULT_MXFAST = 64 * sizeof (size_t) / 4 };
804 /* If the array is tiny, grow it to about (but no greater than)
805 DEFAULT_MXFAST bytes. Otherwise, grow it by about 50%.
806 Adjust the growth according to three constraints: NITEMS_INCR_MIN,
807 NITEMS_MAX, and what the C language can represent safely. */
809 ptrdiff_t n, nbytes;
810 if (INT_ADD_WRAPV (n0, n0 >> 1, &n))
811 n = PTRDIFF_MAX;
812 if (0 <= nitems_max && nitems_max < n)
813 n = nitems_max;
815 ptrdiff_t adjusted_nbytes
816 = ((INT_MULTIPLY_WRAPV (n, item_size, &nbytes) || SIZE_MAX < nbytes)
817 ? MIN (PTRDIFF_MAX, SIZE_MAX)
818 : nbytes < DEFAULT_MXFAST ? DEFAULT_MXFAST : 0);
819 if (adjusted_nbytes)
821 n = adjusted_nbytes / item_size;
822 nbytes = adjusted_nbytes - adjusted_nbytes % item_size;
825 if (! pa)
826 *nitems = 0;
827 if (n - n0 < nitems_incr_min
828 && (INT_ADD_WRAPV (n0, nitems_incr_min, &n)
829 || (0 <= nitems_max && nitems_max < n)
830 || INT_MULTIPLY_WRAPV (n, item_size, &nbytes)))
831 xalloc_die ();
832 pa = xrealloc (pa, nbytes);
833 *nitems = n;
834 return pa;
837 /* Ensure that the array addressed by PA holds at least I + 1 items.
838 Either return PA, or reallocate the array and return its new address.
839 Although PA may be null, the returned value is never null.
841 The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
842 is updated on reallocation. If PA is null, *NITEMS must be zero.
843 Do not allocate more than NITEMS_MAX items total; -1 means no limit.
844 ITEM_SIZE is the size of one item; it must be positive.
845 Avoid O(N**2) behavior on arrays growing linearly. */
846 static void *
847 maybe_realloc (void *pa, ptrdiff_t i, ptrdiff_t *nitems,
848 ptrdiff_t nitems_max, ptrdiff_t item_size)
850 if (i < *nitems)
851 return pa;
852 return xpalloc (pa, nitems, 1, nitems_max, item_size);
855 /* In DFA D, find the index of charclass S, or allocate a new one. */
856 static ptrdiff_t
857 charclass_index (struct dfa *d, charclass *s)
859 ptrdiff_t i;
861 for (i = 0; i < d->cindex; ++i)
862 if (equal (s, &d->charclasses[i]))
863 return i;
864 d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
865 TOKEN_MAX - CSET, sizeof *d->charclasses);
866 ++d->cindex;
867 d->charclasses[i] = *s;
868 return i;
871 static bool
872 unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
874 return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
877 static int
878 char_context (struct dfa const *dfa, unsigned char c)
880 if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
881 return CTX_NEWLINE;
882 if (unibyte_word_constituent (dfa, c))
883 return CTX_LETTER;
884 return CTX_NONE;
887 /* Set a bit in the charclass for the given wchar_t. Do nothing if WC
888 is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
889 this may happen when folding case in weird Turkish locales where
890 dotless i/dotted I are not included in the chosen character set.
891 Return whether a bit was set in the charclass. */
892 static bool
893 setbit_wc (wint_t wc, charclass *c)
895 int b = wctob (wc);
896 if (b < 0)
897 return false;
899 setbit (b, c);
900 return true;
903 /* Set a bit for B and its case variants in the charclass C.
904 MB_CUR_MAX must be 1. */
905 static void
906 setbit_case_fold_c (int b, charclass *c)
908 int ub = toupper (b);
909 for (int i = 0; i < NOTCHAR; i++)
910 if (toupper (i) == ub)
911 setbit (i, c);
914 /* Return true if the locale compatible with the C locale. */
916 static bool
917 using_simple_locale (bool multibyte)
919 /* The native character set is known to be compatible with
920 the C locale. The following test isn't perfect, but it's good
921 enough in practice, as only ASCII and EBCDIC are in common use
922 and this test correctly accepts ASCII and rejects EBCDIC. */
923 enum { native_c_charset =
924 ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
925 && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
926 && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
927 && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
928 && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
929 && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
930 && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
931 && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
932 && '}' == 125 && '~' == 126)
935 if (!native_c_charset || multibyte)
936 return false;
937 else
939 /* Treat C and POSIX locales as being compatible. Also, treat
940 errors as compatible, as these are invariably from stubs. */
941 char const *loc = setlocale (LC_ALL, NULL);
942 return !loc || streq (loc, "C") || streq (loc, "POSIX");
946 /* Fetch the next lexical input character from the pattern. There
947 must at least one byte of pattern input. Set DFA->lex.wctok to the
948 value of the character or to WEOF depending on whether the input is
949 a valid multibyte character (possibly of length 1). Then return
950 the next input byte value, except return EOF if the input is a
951 multibyte character of length greater than 1. */
952 static int
953 fetch_wc (struct dfa *dfa)
955 size_t nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
956 dfa);
957 dfa->lex.cur_mb_len = nbytes;
958 int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
959 dfa->lex.ptr += nbytes;
960 dfa->lex.left -= nbytes;
961 return c;
964 /* If there is no more input, report an error about unbalanced brackets.
965 Otherwise, behave as with fetch_wc (DFA). */
966 static int
967 bracket_fetch_wc (struct dfa *dfa)
969 if (! dfa->lex.left)
970 dfaerror (_("unbalanced ["));
971 return fetch_wc (dfa);
974 typedef int predicate (int);
976 /* The following list maps the names of the Posix named character classes
977 to predicate functions that determine whether a given character is in
978 the class. The leading [ has already been eaten by the lexical
979 analyzer. */
980 struct dfa_ctype
982 const char *name;
983 predicate *func;
984 bool single_byte_only;
987 static const struct dfa_ctype prednames[] = {
988 {"alpha", isalpha, false},
989 {"upper", isupper, false},
990 {"lower", islower, false},
991 {"digit", isdigit, true},
992 {"xdigit", isxdigit, false},
993 {"space", isspace, false},
994 {"punct", ispunct, false},
995 {"alnum", isalnum, false},
996 {"print", isprint, false},
997 {"graph", isgraph, false},
998 {"cntrl", iscntrl, false},
999 {"blank", isblank, false},
1000 {NULL, NULL, false}
1003 static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
1004 find_pred (const char *str)
1006 for (unsigned int i = 0; prednames[i].name; ++i)
1007 if (streq (str, prednames[i].name))
1008 return &prednames[i];
1009 return NULL;
1012 /* Parse a bracket expression, which possibly includes multibyte
1013 characters. */
1014 static token
1015 parse_bracket_exp (struct dfa *dfa)
1017 /* This is a bracket expression that dfaexec is known to
1018 process correctly. */
1019 bool known_bracket_exp = true;
1021 /* Used to warn about [:space:].
1022 Bit 0 = first character is a colon.
1023 Bit 1 = last character is a colon.
1024 Bit 2 = includes any other character but a colon.
1025 Bit 3 = includes ranges, char/equiv classes or collation elements. */
1026 int colon_warning_state;
1028 dfa->lex.brack.nchars = 0;
1029 charclass ccl;
1030 zeroset (&ccl);
1031 int c = bracket_fetch_wc (dfa);
1032 bool invert = c == '^';
1033 if (invert)
1035 c = bracket_fetch_wc (dfa);
1036 known_bracket_exp = dfa->simple_locale;
1038 wint_t wc = dfa->lex.wctok;
1039 int c1;
1040 wint_t wc1;
1041 colon_warning_state = (c == ':');
1044 c1 = NOTCHAR; /* Mark c1 as not initialized. */
1045 colon_warning_state &= ~2;
1047 /* Note that if we're looking at some other [:...:] construct,
1048 we just treat it as a bunch of ordinary characters. We can do
1049 this because we assume regex has checked for syntax errors before
1050 dfa is ever called. */
1051 if (c == '[')
1053 c1 = bracket_fetch_wc (dfa);
1054 wc1 = dfa->lex.wctok;
1056 if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
1057 || c1 == '.' || c1 == '=')
1059 enum { MAX_BRACKET_STRING_LEN = 32 };
1060 char str[MAX_BRACKET_STRING_LEN + 1];
1061 size_t len = 0;
1062 for (;;)
1064 c = bracket_fetch_wc (dfa);
1065 if (dfa->lex.left == 0
1066 || (c == c1 && dfa->lex.ptr[0] == ']'))
1067 break;
1068 if (len < MAX_BRACKET_STRING_LEN)
1069 str[len++] = c;
1070 else
1071 /* This is in any case an invalid class name. */
1072 str[0] = '\0';
1074 str[len] = '\0';
1076 /* Fetch bracket. */
1077 c = bracket_fetch_wc (dfa);
1078 wc = dfa->lex.wctok;
1079 if (c1 == ':')
1080 /* Build character class. POSIX allows character
1081 classes to match multicharacter collating elements,
1082 but the regex code does not support that, so do not
1083 worry about that possibility. */
1085 char const *class
1086 = (dfa->syntax.case_fold && (streq (str, "upper")
1087 || streq (str, "lower"))
1088 ? "alpha" : str);
1089 const struct dfa_ctype *pred = find_pred (class);
1090 if (!pred)
1091 dfaerror (_("invalid character class"));
1093 if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1094 known_bracket_exp = false;
1095 else
1096 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1097 if (pred->func (c2))
1098 setbit (c2, &ccl);
1100 else
1101 known_bracket_exp = false;
1103 colon_warning_state |= 8;
1105 /* Fetch new lookahead character. */
1106 c1 = bracket_fetch_wc (dfa);
1107 wc1 = dfa->lex.wctok;
1108 continue;
1111 /* We treat '[' as a normal character here. c/c1/wc/wc1
1112 are already set up. */
1115 if (c == '\\'
1116 && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1118 c = bracket_fetch_wc (dfa);
1119 wc = dfa->lex.wctok;
1122 if (c1 == NOTCHAR)
1124 c1 = bracket_fetch_wc (dfa);
1125 wc1 = dfa->lex.wctok;
1128 if (c1 == '-')
1129 /* build range characters. */
1131 int c2 = bracket_fetch_wc (dfa);
1132 wint_t wc2 = dfa->lex.wctok;
1134 /* A bracket expression like [a-[.aa.]] matches an unknown set.
1135 Treat it like [-a[.aa.]] while parsing it, and
1136 remember that the set is unknown. */
1137 if (c2 == '[' && dfa->lex.ptr[0] == '.')
1139 known_bracket_exp = false;
1140 c2 = ']';
1143 if (c2 == ']')
1145 /* In the case [x-], the - is an ordinary hyphen,
1146 which is left in c1, the lookahead character. */
1147 dfa->lex.ptr -= dfa->lex.cur_mb_len;
1148 dfa->lex.left += dfa->lex.cur_mb_len;
1150 else
1152 if (c2 == '\\' && (dfa->syntax.syntax_bits
1153 & RE_BACKSLASH_ESCAPE_IN_LISTS))
1155 c2 = bracket_fetch_wc (dfa);
1156 wc2 = dfa->lex.wctok;
1159 colon_warning_state |= 8;
1160 c1 = bracket_fetch_wc (dfa);
1161 wc1 = dfa->lex.wctok;
1163 /* Treat [x-y] as a range if x != y. */
1164 if (wc != wc2 || wc == WEOF)
1166 if (dfa->simple_locale
1167 || (isasciidigit (c) & isasciidigit (c2)))
1169 for (int ci = c; ci <= c2; ci++)
1170 if (dfa->syntax.case_fold && isalpha (ci))
1171 setbit_case_fold_c (ci, &ccl);
1172 else
1173 setbit (ci, &ccl);
1175 else
1176 known_bracket_exp = false;
1178 continue;
1183 colon_warning_state |= (c == ':') ? 2 : 4;
1185 if (!dfa->localeinfo.multibyte)
1187 if (dfa->syntax.case_fold && isalpha (c))
1188 setbit_case_fold_c (c, &ccl);
1189 else
1190 setbit (c, &ccl);
1191 continue;
1194 if (wc == WEOF)
1195 known_bracket_exp = false;
1196 else
1198 wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1199 unsigned int n = (dfa->syntax.case_fold
1200 ? case_folded_counterparts (wc, folded + 1) + 1
1201 : 1);
1202 folded[0] = wc;
1203 for (unsigned int i = 0; i < n; i++)
1204 if (!setbit_wc (folded[i], &ccl))
1206 dfa->lex.brack.chars
1207 = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1208 &dfa->lex.brack.nchars_alloc, -1,
1209 sizeof *dfa->lex.brack.chars);
1210 dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1214 while ((wc = wc1, (c = c1) != ']'));
1216 if (colon_warning_state == 7)
1217 dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1219 if (! known_bracket_exp)
1220 return BACKREF;
1222 if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
1224 dfa->lex.brack.invert = invert;
1225 dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1226 return MBCSET;
1229 if (invert)
1231 notset (&ccl);
1232 if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1233 clrbit ('\n', &ccl);
1236 return CSET + charclass_index (dfa, &ccl);
1239 struct lexptr
1241 char const *ptr;
1242 size_t left;
1245 static void
1246 push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
1248 ls->ptr = dfa->lex.ptr;
1249 ls->left = dfa->lex.left;
1250 dfa->lex.ptr = s;
1251 dfa->lex.left = strlen (s);
1254 static void
1255 pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
1257 dfa->lex.ptr = ls->ptr;
1258 dfa->lex.left = ls->left;
1261 static token
1262 lex (struct dfa *dfa)
1264 bool backslash = false;
1266 /* Basic plan: We fetch a character. If it's a backslash,
1267 we set the backslash flag and go through the loop again.
1268 On the plus side, this avoids having a duplicate of the
1269 main switch inside the backslash case. On the minus side,
1270 it means that just about every case begins with
1271 "if (backslash) ...". */
1272 for (int i = 0; i < 2; ++i)
1274 if (! dfa->lex.left)
1275 return dfa->lex.lasttok = END;
1276 int c = fetch_wc (dfa);
1278 switch (c)
1280 case '\\':
1281 if (backslash)
1282 goto normal_char;
1283 if (dfa->lex.left == 0)
1284 dfaerror (_("unfinished \\ escape"));
1285 backslash = true;
1286 break;
1288 case '^':
1289 if (backslash)
1290 goto normal_char;
1291 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1292 || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
1293 || dfa->lex.lasttok == OR)
1294 return dfa->lex.lasttok = BEGLINE;
1295 goto normal_char;
1297 case '$':
1298 if (backslash)
1299 goto normal_char;
1300 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1301 || dfa->lex.left == 0
1302 || ((dfa->lex.left
1303 > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1304 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1305 & (dfa->lex.ptr[0] == '\\')]
1306 == ')'))
1307 || ((dfa->lex.left
1308 > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1309 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1310 & (dfa->lex.ptr[0] == '\\')]
1311 == '|'))
1312 || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1313 && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1314 return dfa->lex.lasttok = ENDLINE;
1315 goto normal_char;
1317 case '1':
1318 case '2':
1319 case '3':
1320 case '4':
1321 case '5':
1322 case '6':
1323 case '7':
1324 case '8':
1325 case '9':
1326 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
1328 dfa->lex.laststart = false;
1329 return dfa->lex.lasttok = BACKREF;
1331 goto normal_char;
1333 case '`':
1334 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1336 /* FIXME: should be beginning of string */
1337 return dfa->lex.lasttok = BEGLINE;
1339 goto normal_char;
1341 case '\'':
1342 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1344 /* FIXME: should be end of string */
1345 return dfa->lex.lasttok = ENDLINE;
1347 goto normal_char;
1349 case '<':
1350 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1351 return dfa->lex.lasttok = BEGWORD;
1352 goto normal_char;
1354 case '>':
1355 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1356 return dfa->lex.lasttok = ENDWORD;
1357 goto normal_char;
1359 case 'b':
1360 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1361 return dfa->lex.lasttok = LIMWORD;
1362 goto normal_char;
1364 case 'B':
1365 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1366 return dfa->lex.lasttok = NOTLIMWORD;
1367 goto normal_char;
1369 case '?':
1370 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1371 goto normal_char;
1372 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1373 goto normal_char;
1374 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1375 && dfa->lex.laststart)
1376 goto normal_char;
1377 return dfa->lex.lasttok = QMARK;
1379 case '*':
1380 if (backslash)
1381 goto normal_char;
1382 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1383 && dfa->lex.laststart)
1384 goto normal_char;
1385 return dfa->lex.lasttok = STAR;
1387 case '+':
1388 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1389 goto normal_char;
1390 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1391 goto normal_char;
1392 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1393 && dfa->lex.laststart)
1394 goto normal_char;
1395 return dfa->lex.lasttok = PLUS;
1397 case '{':
1398 if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1399 goto normal_char;
1400 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1401 goto normal_char;
1402 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1403 && dfa->lex.laststart)
1404 goto normal_char;
1406 /* Cases:
1407 {M} - exact count
1408 {M,} - minimum count, maximum is infinity
1409 {,N} - 0 through N
1410 {,} - 0 to infinity (same as '*')
1411 {M,N} - M through N */
1413 char const *p = dfa->lex.ptr;
1414 char const *lim = p + dfa->lex.left;
1415 dfa->lex.minrep = dfa->lex.maxrep = -1;
1416 for (; p != lim && isasciidigit (*p); p++)
1417 dfa->lex.minrep = (dfa->lex.minrep < 0
1418 ? *p - '0'
1419 : MIN (RE_DUP_MAX + 1,
1420 dfa->lex.minrep * 10 + *p - '0'));
1421 if (p != lim)
1423 if (*p != ',')
1424 dfa->lex.maxrep = dfa->lex.minrep;
1425 else
1427 if (dfa->lex.minrep < 0)
1428 dfa->lex.minrep = 0;
1429 while (++p != lim && isasciidigit (*p))
1430 dfa->lex.maxrep
1431 = (dfa->lex.maxrep < 0
1432 ? *p - '0'
1433 : MIN (RE_DUP_MAX + 1,
1434 dfa->lex.maxrep * 10 + *p - '0'));
1437 if (! ((! backslash || (p != lim && *p++ == '\\'))
1438 && p != lim && *p++ == '}'
1439 && 0 <= dfa->lex.minrep
1440 && (dfa->lex.maxrep < 0
1441 || dfa->lex.minrep <= dfa->lex.maxrep)))
1443 if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
1444 goto normal_char;
1445 dfaerror (_("invalid content of \\{\\}"));
1447 if (RE_DUP_MAX < dfa->lex.maxrep)
1448 dfaerror (_("regular expression too big"));
1449 dfa->lex.ptr = p;
1450 dfa->lex.left = lim - p;
1452 dfa->lex.laststart = false;
1453 return dfa->lex.lasttok = REPMN;
1455 case '|':
1456 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1457 goto normal_char;
1458 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1459 goto normal_char;
1460 dfa->lex.laststart = true;
1461 return dfa->lex.lasttok = OR;
1463 case '\n':
1464 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
1465 || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1466 goto normal_char;
1467 dfa->lex.laststart = true;
1468 return dfa->lex.lasttok = OR;
1470 case '(':
1471 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1472 goto normal_char;
1473 dfa->lex.parens++;
1474 dfa->lex.laststart = true;
1475 return dfa->lex.lasttok = LPAREN;
1477 case ')':
1478 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1479 goto normal_char;
1480 if (dfa->lex.parens == 0
1481 && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1482 goto normal_char;
1483 dfa->lex.parens--;
1484 dfa->lex.laststart = false;
1485 return dfa->lex.lasttok = RPAREN;
1487 case '.':
1488 if (backslash)
1489 goto normal_char;
1490 if (dfa->canychar == (size_t) -1)
1492 charclass ccl;
1493 fillset (&ccl);
1494 if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1495 clrbit ('\n', &ccl);
1496 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1497 clrbit ('\0', &ccl);
1498 if (dfa->localeinfo.multibyte)
1499 for (int c2 = 0; c2 < NOTCHAR; c2++)
1500 if (dfa->localeinfo.sbctowc[c2] == WEOF)
1501 clrbit (c2, &ccl);
1502 dfa->canychar = charclass_index (dfa, &ccl);
1504 dfa->lex.laststart = false;
1505 return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1506 ? ANYCHAR
1507 : CSET + dfa->canychar);
1509 case 's':
1510 case 'S':
1511 if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1512 goto normal_char;
1513 if (!dfa->localeinfo.multibyte)
1515 charclass ccl;
1516 zeroset (&ccl);
1517 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1518 if (isspace (c2))
1519 setbit (c2, &ccl);
1520 if (c == 'S')
1521 notset (&ccl);
1522 dfa->lex.laststart = false;
1523 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1526 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1527 add_utf8_anychar, makes sense. */
1529 /* \s and \S are documented to be equivalent to [[:space:]] and
1530 [^[:space:]] respectively, so tell the lexer to process those
1531 strings, each minus its "already processed" '['. */
1533 struct lexptr ls;
1534 push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1535 dfa->lex.lasttok = parse_bracket_exp (dfa);
1536 pop_lex_state (dfa, &ls);
1539 dfa->lex.laststart = false;
1540 return dfa->lex.lasttok;
1542 case 'w':
1543 case 'W':
1544 if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1545 goto normal_char;
1547 if (!dfa->localeinfo.multibyte)
1549 charclass ccl;
1550 zeroset (&ccl);
1551 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1552 if (dfa->syntax.sbit[c2] == CTX_LETTER)
1553 setbit (c2, &ccl);
1554 if (c == 'W')
1555 notset (&ccl);
1556 dfa->lex.laststart = false;
1557 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1560 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1561 add_utf8_anychar, makes sense. */
1563 /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1564 [^_[:alnum:]] respectively, so tell the lexer to process those
1565 strings, each minus its "already processed" '['. */
1567 struct lexptr ls;
1568 push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1569 dfa->lex.lasttok = parse_bracket_exp (dfa);
1570 pop_lex_state (dfa, &ls);
1573 dfa->lex.laststart = false;
1574 return dfa->lex.lasttok;
1576 case '[':
1577 if (backslash)
1578 goto normal_char;
1579 dfa->lex.laststart = false;
1580 return dfa->lex.lasttok = parse_bracket_exp (dfa);
1582 default:
1583 normal_char:
1584 dfa->lex.laststart = false;
1585 /* For multibyte character sets, folding is done in atom. Always
1586 return WCHAR. */
1587 if (dfa->localeinfo.multibyte)
1588 return dfa->lex.lasttok = WCHAR;
1590 if (dfa->syntax.case_fold && isalpha (c))
1592 charclass ccl;
1593 zeroset (&ccl);
1594 setbit_case_fold_c (c, &ccl);
1595 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1598 return dfa->lex.lasttok = c;
1602 /* The above loop should consume at most a backslash
1603 and some other character. */
1604 abort ();
1605 return END; /* keeps pedantic compilers happy. */
1608 static void
1609 addtok_mb (struct dfa *dfa, token t, char mbprop)
1611 if (dfa->talloc == dfa->tindex)
1613 dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc,
1614 sizeof *dfa->tokens);
1615 if (dfa->localeinfo.multibyte)
1616 dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
1617 sizeof *dfa->multibyte_prop);
1619 if (dfa->localeinfo.multibyte)
1620 dfa->multibyte_prop[dfa->tindex] = mbprop;
1621 dfa->tokens[dfa->tindex++] = t;
1623 switch (t)
1625 case QMARK:
1626 case STAR:
1627 case PLUS:
1628 break;
1630 case CAT:
1631 case OR:
1632 dfa->parse.depth--;
1633 break;
1635 case BACKREF:
1636 dfa->fast = false;
1637 FALLTHROUGH;
1638 default:
1639 dfa->nleaves++;
1640 FALLTHROUGH;
1641 case EMPTY:
1642 dfa->parse.depth++;
1643 break;
1645 if (dfa->parse.depth > dfa->depth)
1646 dfa->depth = dfa->parse.depth;
1649 static void addtok_wc (struct dfa *dfa, wint_t wc);
1651 /* Add the given token to the parse tree, maintaining the depth count and
1652 updating the maximum depth if necessary. */
1653 static void
1654 addtok (struct dfa *dfa, token t)
1656 if (dfa->localeinfo.multibyte && t == MBCSET)
1658 bool need_or = false;
1660 /* Extract wide characters into alternations for better performance.
1661 This does not require UTF-8. */
1662 for (ptrdiff_t i = 0; i < dfa->lex.brack.nchars; i++)
1664 addtok_wc (dfa, dfa->lex.brack.chars[i]);
1665 if (need_or)
1666 addtok (dfa, OR);
1667 need_or = true;
1669 dfa->lex.brack.nchars = 0;
1671 /* Wide characters have been handled above, so it is possible
1672 that the set is empty now. Do nothing in that case. */
1673 if (dfa->lex.brack.cset != -1)
1675 addtok (dfa, CSET + dfa->lex.brack.cset);
1676 if (need_or)
1677 addtok (dfa, OR);
1680 else
1682 addtok_mb (dfa, t, 3);
1686 /* We treat a multibyte character as a single atom, so that DFA
1687 can treat a multibyte character as a single expression.
1689 e.g., we construct the following tree from "<mb1><mb2>".
1690 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1691 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1692 static void
1693 addtok_wc (struct dfa *dfa, wint_t wc)
1695 unsigned char buf[MB_LEN_MAX];
1696 mbstate_t s = { 0 };
1697 size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1699 if (stored_bytes != (size_t) -1)
1700 dfa->lex.cur_mb_len = stored_bytes;
1701 else
1703 /* This is merely stop-gap. buf[0] is undefined, yet skipping
1704 the addtok_mb call altogether can corrupt the heap. */
1705 dfa->lex.cur_mb_len = 1;
1706 buf[0] = 0;
1709 addtok_mb (dfa, buf[0], dfa->lex.cur_mb_len == 1 ? 3 : 1);
1710 for (int i = 1; i < dfa->lex.cur_mb_len; i++)
1712 addtok_mb (dfa, buf[i], i == dfa->lex.cur_mb_len - 1 ? 2 : 0);
1713 addtok (dfa, CAT);
1717 static void
1718 add_utf8_anychar (struct dfa *dfa)
1720 static charclass const utf8_classes[5] = {
1721 /* 80-bf: non-leading bytes. */
1722 CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1724 /* 00-7f: 1-byte sequence. */
1725 CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1727 /* c2-df: 2-byte sequence. */
1728 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1730 /* e0-ef: 3-byte sequence. */
1731 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xffff),
1733 /* f0-f7: 4-byte sequence. */
1734 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xff0000)
1736 const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
1738 /* Define the five character classes that are needed below. */
1739 if (dfa->utf8_anychar_classes[0] == 0)
1740 for (unsigned int i = 0; i < n; i++)
1742 charclass c = utf8_classes[i];
1743 if (i == 1)
1745 if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1746 clrbit ('\n', &c);
1747 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1748 clrbit ('\0', &c);
1750 dfa->utf8_anychar_classes[i] = CSET + charclass_index (dfa, &c);
1753 /* A valid UTF-8 character is
1755 ([0x00-0x7f]
1756 |[0xc2-0xdf][0x80-0xbf]
1757 |[0xe0-0xef[0x80-0xbf][0x80-0xbf]
1758 |[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf])
1760 which I'll write more concisely "B|CA|DAA|EAAA". Factor the [0x00-0x7f]
1761 and you get "B|(C|(D|EA)A)A". And since the token buffer is in reverse
1762 Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR". */
1763 unsigned int i;
1764 for (i = 1; i < n; i++)
1765 addtok (dfa, dfa->utf8_anychar_classes[i]);
1766 while (--i > 1)
1768 addtok (dfa, dfa->utf8_anychar_classes[0]);
1769 addtok (dfa, CAT);
1770 addtok (dfa, OR);
1774 /* The grammar understood by the parser is as follows.
1776 regexp:
1777 regexp OR branch
1778 branch
1780 branch:
1781 branch closure
1782 closure
1784 closure:
1785 closure QMARK
1786 closure STAR
1787 closure PLUS
1788 closure REPMN
1789 atom
1791 atom:
1792 <normal character>
1793 <multibyte character>
1794 ANYCHAR
1795 MBCSET
1796 CSET
1797 BACKREF
1798 BEGLINE
1799 ENDLINE
1800 BEGWORD
1801 ENDWORD
1802 LIMWORD
1803 NOTLIMWORD
1804 LPAREN regexp RPAREN
1805 <empty>
1807 The parser builds a parse tree in postfix form in an array of tokens. */
1809 static void
1810 atom (struct dfa *dfa)
1812 if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1813 || dfa->parse.tok >= CSET
1814 || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
1815 || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1816 || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
1817 || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
1818 || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
1820 if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1822 /* For UTF-8 expand the period to a series of CSETs that define a
1823 valid UTF-8 character. This avoids using the slow multibyte
1824 path. I'm pretty sure it would be both profitable and correct to
1825 do it for any encoding; however, the optimization must be done
1826 manually as it is done above in add_utf8_anychar. So, let's
1827 start with UTF-8: it is the most used, and the structure of the
1828 encoding makes the correctness more obvious. */
1829 add_utf8_anychar (dfa);
1831 else
1832 addtok (dfa, dfa->parse.tok);
1833 dfa->parse.tok = lex (dfa);
1835 else if (dfa->parse.tok == WCHAR)
1837 if (dfa->lex.wctok == WEOF)
1838 addtok (dfa, BACKREF);
1839 else
1841 addtok_wc (dfa, dfa->lex.wctok);
1843 if (dfa->syntax.case_fold)
1845 wchar_t folded[CASE_FOLDED_BUFSIZE];
1846 unsigned int n = case_folded_counterparts (dfa->lex.wctok,
1847 folded);
1848 for (unsigned int i = 0; i < n; i++)
1850 addtok_wc (dfa, folded[i]);
1851 addtok (dfa, OR);
1856 dfa->parse.tok = lex (dfa);
1858 else if (dfa->parse.tok == LPAREN)
1860 dfa->parse.tok = lex (dfa);
1861 regexp (dfa);
1862 if (dfa->parse.tok != RPAREN)
1863 dfaerror (_("unbalanced ("));
1864 dfa->parse.tok = lex (dfa);
1866 else
1867 addtok (dfa, EMPTY);
1870 /* Return the number of tokens in the given subexpression. */
1871 static size_t _GL_ATTRIBUTE_PURE
1872 nsubtoks (struct dfa const *dfa, size_t tindex)
1874 switch (dfa->tokens[tindex - 1])
1876 default:
1877 return 1;
1878 case QMARK:
1879 case STAR:
1880 case PLUS:
1881 return 1 + nsubtoks (dfa, tindex - 1);
1882 case CAT:
1883 case OR:
1885 size_t ntoks1 = nsubtoks (dfa, tindex - 1);
1886 return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1891 /* Copy the given subexpression to the top of the tree. */
1892 static void
1893 copytoks (struct dfa *dfa, size_t tindex, size_t ntokens)
1895 if (dfa->localeinfo.multibyte)
1896 for (size_t i = 0; i < ntokens; ++i)
1897 addtok_mb (dfa, dfa->tokens[tindex + i],
1898 dfa->multibyte_prop[tindex + i]);
1899 else
1900 for (size_t i = 0; i < ntokens; ++i)
1901 addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1904 static void
1905 closure (struct dfa *dfa)
1907 atom (dfa);
1908 while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1909 || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1910 if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
1912 size_t ntokens = nsubtoks (dfa, dfa->tindex);
1913 size_t tindex = dfa->tindex - ntokens;
1914 if (dfa->lex.maxrep < 0)
1915 addtok (dfa, PLUS);
1916 if (dfa->lex.minrep == 0)
1917 addtok (dfa, QMARK);
1918 int i;
1919 for (i = 1; i < dfa->lex.minrep; i++)
1921 copytoks (dfa, tindex, ntokens);
1922 addtok (dfa, CAT);
1924 for (; i < dfa->lex.maxrep; i++)
1926 copytoks (dfa, tindex, ntokens);
1927 addtok (dfa, QMARK);
1928 addtok (dfa, CAT);
1930 dfa->parse.tok = lex (dfa);
1932 else if (dfa->parse.tok == REPMN)
1934 dfa->tindex -= nsubtoks (dfa, dfa->tindex);
1935 dfa->parse.tok = lex (dfa);
1936 closure (dfa);
1938 else
1940 addtok (dfa, dfa->parse.tok);
1941 dfa->parse.tok = lex (dfa);
1945 static void
1946 branch (struct dfa* dfa)
1948 closure (dfa);
1949 while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
1950 && dfa->parse.tok >= 0)
1952 closure (dfa);
1953 addtok (dfa, CAT);
1957 static void
1958 regexp (struct dfa *dfa)
1960 branch (dfa);
1961 while (dfa->parse.tok == OR)
1963 dfa->parse.tok = lex (dfa);
1964 branch (dfa);
1965 addtok (dfa, OR);
1969 /* Main entry point for the parser. S is a string to be parsed, len is the
1970 length of the string, so s can include NUL characters. D is a pointer to
1971 the struct dfa to parse into. */
1972 static void
1973 dfaparse (char const *s, size_t len, struct dfa *d)
1975 d->lex.ptr = s;
1976 d->lex.left = len;
1977 d->lex.lasttok = END;
1978 d->lex.laststart = true;
1980 if (!d->syntax.syntax_bits_set)
1981 dfaerror (_("no syntax specified"));
1983 if (!d->nregexps)
1984 addtok (d, BEG);
1986 d->parse.tok = lex (d);
1987 d->parse.depth = d->depth;
1989 regexp (d);
1991 if (d->parse.tok != END)
1992 dfaerror (_("unbalanced )"));
1994 addtok (d, END - d->nregexps);
1995 addtok (d, CAT);
1997 if (d->nregexps)
1998 addtok (d, OR);
2000 ++d->nregexps;
2003 /* Some primitives for operating on sets of positions. */
2005 /* Copy one set to another. */
2006 static void
2007 copy (position_set const *src, position_set *dst)
2009 if (dst->alloc < src->nelem)
2011 free (dst->elems);
2012 dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2013 sizeof *dst->elems);
2015 dst->nelem = src->nelem;
2016 if (src->nelem != 0)
2017 memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2020 static void
2021 alloc_position_set (position_set *s, size_t size)
2023 s->elems = xnmalloc (size, sizeof *s->elems);
2024 s->alloc = size;
2025 s->nelem = 0;
2028 /* Insert position P in set S. S is maintained in sorted order on
2029 decreasing index. If there is already an entry in S with P.index
2030 then merge (logically-OR) P's constraints into the one in S.
2031 S->elems must point to an array large enough to hold the resulting set. */
2032 static void
2033 insert (position p, position_set *s)
2035 ptrdiff_t count = s->nelem;
2036 ptrdiff_t lo = 0, hi = count;
2037 while (lo < hi)
2039 ptrdiff_t mid = (lo + hi) >> 1;
2040 if (s->elems[mid].index < p.index)
2041 lo = mid + 1;
2042 else if (s->elems[mid].index == p.index)
2044 s->elems[mid].constraint |= p.constraint;
2045 return;
2047 else
2048 hi = mid;
2051 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2052 for (ptrdiff_t i = count; i > lo; i--)
2053 s->elems[i] = s->elems[i - 1];
2054 s->elems[lo] = p;
2055 ++s->nelem;
2058 static void
2059 append (position p, position_set *s)
2061 ptrdiff_t count = s->nelem;
2062 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2063 s->elems[s->nelem++] = p;
2066 /* Merge S1 and S2 (with the additional constraint C2) into M. The
2067 result is as if the positions of S1, and of S2 with the additional
2068 constraint C2, were inserted into an initially empty set. */
2069 static void
2070 merge_constrained (position_set const *s1, position_set const *s2,
2071 unsigned int c2, position_set *m)
2073 ptrdiff_t i = 0, j = 0;
2075 if (m->alloc - s1->nelem < s2->nelem)
2077 free (m->elems);
2078 m->alloc = s1->nelem;
2079 m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2081 m->nelem = 0;
2082 while (i < s1->nelem || j < s2->nelem)
2083 if (! (j < s2->nelem)
2084 || (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2086 unsigned int c = ((i < s1->nelem && j < s2->nelem
2087 && s1->elems[i].index == s2->elems[j].index)
2088 ? s2->elems[j++].constraint & c2
2089 : 0);
2090 m->elems[m->nelem].index = s1->elems[i].index;
2091 m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c;
2093 else
2095 if (s2->elems[j].constraint & c2)
2097 m->elems[m->nelem].index = s2->elems[j].index;
2098 m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2100 j++;
2104 /* Merge two sets of positions into a third. The result is exactly as if
2105 the positions of both sets were inserted into an initially empty set. */
2106 static void
2107 merge (position_set const *s1, position_set const *s2, position_set *m)
2109 merge_constrained (s1, s2, -1, m);
2112 static void
2113 merge2 (position_set *dst, position_set const *src, position_set *m)
2115 if (src->nelem < 4)
2117 for (ptrdiff_t i = 0; i < src->nelem; ++i)
2118 insert (src->elems[i], dst);
2120 else
2122 merge (src, dst, m);
2123 copy (m, dst);
2127 /* Delete a position from a set. Return the nonzero constraint of the
2128 deleted position, or zero if there was no such position. */
2129 static unsigned int
2130 delete (size_t del, position_set *s)
2132 size_t count = s->nelem;
2133 size_t lo = 0, hi = count;
2134 while (lo < hi)
2136 size_t mid = (lo + hi) >> 1;
2137 if (s->elems[mid].index < del)
2138 lo = mid + 1;
2139 else if (s->elems[mid].index == del)
2141 unsigned int c = s->elems[mid].constraint;
2142 size_t i;
2143 for (i = mid; i + 1 < count; i++)
2144 s->elems[i] = s->elems[i + 1];
2145 s->nelem = i;
2146 return c;
2148 else
2149 hi = mid;
2151 return 0;
2154 /* Replace a position with the followed set. */
2155 static void
2156 replace (position_set *dst, size_t del, position_set *add,
2157 unsigned int constraint, position_set *tmp)
2159 unsigned int c = delete (del, dst) & constraint;
2161 if (c)
2163 copy (dst, tmp);
2164 merge_constrained (tmp, add, c, dst);
2168 /* Find the index of the state corresponding to the given position set with
2169 the given preceding context, or create a new state if there is no such
2170 state. Context tells whether we got here on a newline or letter. */
2171 static state_num
2172 state_index (struct dfa *d, position_set const *s, int context)
2174 size_t hash = 0;
2175 int constraint = 0;
2176 state_num i;
2177 token first_end = 0;
2179 for (i = 0; i < s->nelem; ++i)
2180 hash ^= s->elems[i].index + s->elems[i].constraint;
2182 /* Try to find a state that exactly matches the proposed one. */
2183 for (i = 0; i < d->sindex; ++i)
2185 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2186 || context != d->states[i].context)
2187 continue;
2188 state_num j;
2189 for (j = 0; j < s->nelem; ++j)
2190 if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2191 || s->elems[j].index != d->states[i].elems.elems[j].index)
2192 break;
2193 if (j == s->nelem)
2194 return i;
2197 #ifdef DEBUG
2198 fprintf (stderr, "new state %zd\n nextpos:", i);
2199 for (state_num j = 0; j < s->nelem; j++)
2201 fprintf (stderr, " %zu:", s->elems[j].index);
2202 prtok (d->tokens[s->elems[j].index]);
2204 fprintf (stderr, "\n context:");
2205 if (context ^ CTX_ANY)
2207 if (context & CTX_NONE)
2208 fprintf (stderr, " CTX_NONE");
2209 if (context & CTX_LETTER)
2210 fprintf (stderr, " CTX_LETTER");
2211 if (context & CTX_NEWLINE)
2212 fprintf (stderr, " CTX_NEWLINE");
2214 else
2215 fprintf (stderr, " CTX_ANY");
2216 fprintf (stderr, "\n");
2217 #endif
2219 for (state_num j = 0; j < s->nelem; j++)
2221 int c = d->constraints[s->elems[j].index];
2223 if (c != 0)
2225 if (succeeds_in_context (c, context, CTX_ANY))
2226 constraint |= c;
2227 if (!first_end)
2228 first_end = d->tokens[s->elems[j].index];
2230 else if (d->tokens[s->elems[j].index] == BACKREF)
2231 constraint = NO_CONSTRAINT;
2235 /* Create a new state. */
2236 d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2237 sizeof *d->states);
2238 d->states[i].hash = hash;
2239 alloc_position_set (&d->states[i].elems, s->nelem);
2240 copy (s, &d->states[i].elems);
2241 d->states[i].context = context;
2242 d->states[i].constraint = constraint;
2243 d->states[i].first_end = first_end;
2244 d->states[i].mbps.nelem = 0;
2245 d->states[i].mbps.elems = NULL;
2246 d->states[i].mb_trindex = -1;
2248 ++d->sindex;
2250 return i;
2253 /* Find the epsilon closure of a set of positions. If any position of the set
2254 contains a symbol that matches the empty string in some context, replace
2255 that position with the elements of its follow labeled with an appropriate
2256 constraint. Repeat exhaustively until no funny positions are left.
2257 S->elems must be large enough to hold the result. */
2258 static void
2259 epsclosure (struct dfa const *d)
2261 position_set tmp;
2262 alloc_position_set (&tmp, d->nleaves);
2263 for (size_t i = 0; i < d->tindex; ++i)
2264 if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR
2265 && d->tokens[i] != BACKREF && d->tokens[i] != ANYCHAR
2266 && d->tokens[i] != MBCSET && d->tokens[i] < CSET)
2268 unsigned int constraint;
2269 switch (d->tokens[i])
2271 case BEGLINE:
2272 constraint = BEGLINE_CONSTRAINT;
2273 break;
2274 case ENDLINE:
2275 constraint = ENDLINE_CONSTRAINT;
2276 break;
2277 case BEGWORD:
2278 constraint = BEGWORD_CONSTRAINT;
2279 break;
2280 case ENDWORD:
2281 constraint = ENDWORD_CONSTRAINT;
2282 break;
2283 case LIMWORD:
2284 constraint = LIMWORD_CONSTRAINT;
2285 break;
2286 case NOTLIMWORD:
2287 constraint = NOTLIMWORD_CONSTRAINT;
2288 break;
2289 default:
2290 constraint = NO_CONSTRAINT;
2291 break;
2294 delete (i, &d->follows[i]);
2296 for (size_t j = 0; j < d->tindex; j++)
2297 if (i != j && d->follows[j].nelem > 0)
2298 replace (&d->follows[j], i, &d->follows[i], constraint, &tmp);
2300 free (tmp.elems);
2303 /* Returns the set of contexts for which there is at least one
2304 character included in C. */
2306 static int
2307 charclass_context (struct dfa const *dfa, charclass const *c)
2309 int context = 0;
2311 for (unsigned int j = 0; j < CHARCLASS_WORDS; ++j)
2313 if (c->w[j] & dfa->syntax.newline.w[j])
2314 context |= CTX_NEWLINE;
2315 if (c->w[j] & dfa->syntax.letters.w[j])
2316 context |= CTX_LETTER;
2317 if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
2318 context |= CTX_NONE;
2321 return context;
2324 /* Returns the contexts on which the position set S depends. Each context
2325 in the set of returned contexts (let's call it SC) may have a different
2326 follow set than other contexts in SC, and also different from the
2327 follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2328 in the complement set will have the same follow set. */
2330 static int _GL_ATTRIBUTE_PURE
2331 state_separate_contexts (struct dfa *d, position_set const *s)
2333 int separate_contexts = 0;
2335 for (size_t j = 0; j < s->nelem; j++)
2336 separate_contexts |= d->separates[s->elems[j].index];
2338 return separate_contexts;
2341 enum
2343 /* Single token is repeated. It is distinguished from non-repeated. */
2344 OPT_REPEAT = (1 << 0),
2346 /* Multiple tokens are repeated. This flag is on at head of tokens. The
2347 node is not merged. */
2348 OPT_LPAREN = (1 << 1),
2350 /* Multiple branches are joined. The node is not merged. */
2351 OPT_RPAREN = (1 << 2),
2353 /* The node is walked. If the node is found in walking again, OPT_RPAREN
2354 flag is turned on. */
2355 OPT_WALKED = (1 << 3),
2357 /* The node is queued. The node is not queued again. */
2358 OPT_QUEUED = (1 << 4)
2361 static void
2362 merge_nfa_state (struct dfa *d, size_t tindex, char *flags,
2363 position_set *merged)
2365 position_set *follows = d->follows;
2366 ptrdiff_t nelem = 0;
2368 d->constraints[tindex] = 0;
2370 for (ptrdiff_t i = 0; i < follows[tindex].nelem; i++)
2372 size_t sindex = follows[tindex].elems[i].index;
2374 /* Skip the node as pruned in future. */
2375 unsigned int iconstraint = follows[tindex].elems[i].constraint;
2376 if (iconstraint == 0)
2377 continue;
2379 if (d->tokens[follows[tindex].elems[i].index] <= END)
2381 d->constraints[tindex] |= follows[tindex].elems[i].constraint;
2382 continue;
2385 if (!(flags[sindex] & (OPT_LPAREN | OPT_RPAREN)))
2387 ptrdiff_t j;
2389 for (j = 0; j < nelem; j++)
2391 size_t dindex = follows[tindex].elems[j].index;
2393 if (follows[tindex].elems[j].constraint != iconstraint)
2394 continue;
2396 if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
2397 continue;
2399 if (d->tokens[sindex] != d->tokens[dindex])
2400 continue;
2402 if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2403 continue;
2405 if (flags[sindex] & OPT_REPEAT)
2406 delete (sindex, &follows[sindex]);
2408 merge2 (&follows[dindex], &follows[sindex], merged);
2410 break;
2413 if (j < nelem)
2414 continue;
2417 follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2418 flags[sindex] |= OPT_QUEUED;
2421 follows[tindex].nelem = nelem;
2424 static int
2425 compare (const void *a, const void *b)
2427 int aindex;
2428 int bindex;
2430 aindex = (int) ((position *) a)->index;
2431 bindex = (int) ((position *) b)->index;
2433 return aindex - bindex;
2436 static void
2437 reorder_tokens (struct dfa *d)
2439 ptrdiff_t nleaves;
2440 ptrdiff_t *map;
2441 token *tokens;
2442 position_set *follows;
2443 int *constraints;
2444 char *multibyte_prop;
2446 nleaves = 0;
2448 map = xnmalloc (d->tindex, sizeof *map);
2450 map[0] = nleaves++;
2452 for (ptrdiff_t i = 1; i < d->tindex; i++)
2453 map[i] = -1;
2455 tokens = xnmalloc (d->nleaves, sizeof *tokens);
2456 follows = xnmalloc (d->nleaves, sizeof *follows);
2457 constraints = xnmalloc (d->nleaves, sizeof *constraints);
2459 if (d->localeinfo.multibyte)
2460 multibyte_prop = xnmalloc (d->nleaves, sizeof *multibyte_prop);
2461 else
2462 multibyte_prop = NULL;
2464 for (ptrdiff_t i = 0; i < d->tindex; i++)
2466 if (map[i] == -1)
2468 free (d->follows[i].elems);
2469 d->follows[i].elems = NULL;
2470 d->follows[i].nelem = 0;
2471 continue;
2474 tokens[map[i]] = d->tokens[i];
2475 follows[map[i]] = d->follows[i];
2476 constraints[map[i]] = d->constraints[i];
2478 if (multibyte_prop != NULL)
2479 multibyte_prop[map[i]] = d->multibyte_prop[i];
2481 for (ptrdiff_t j = 0; j < d->follows[i].nelem; j++)
2483 if (map[d->follows[i].elems[j].index] == -1)
2484 map[d->follows[i].elems[j].index] = nleaves++;
2486 d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2489 qsort (d->follows[i].elems, d->follows[i].nelem,
2490 sizeof *d->follows[i].elems, compare);
2493 for (ptrdiff_t i = 0; i < nleaves; i++)
2495 d->tokens[i] = tokens[i];
2496 d->follows[i] = follows[i];
2497 d->constraints[i] = constraints[i];
2499 if (multibyte_prop != NULL)
2500 d->multibyte_prop[i] = multibyte_prop[i];
2503 d->tindex = d->nleaves = nleaves;
2505 free (tokens);
2506 free (follows);
2507 free (constraints);
2508 free (multibyte_prop);
2509 free (map);
2512 static void
2513 dfaoptimize (struct dfa *d)
2515 char *flags;
2516 position_set merged0;
2517 position_set *merged;
2519 flags = xmalloc (d->tindex * sizeof *flags);
2520 memset (flags, 0, d->tindex * sizeof *flags);
2522 for (size_t i = 0; i < d->tindex; i++)
2524 for (ptrdiff_t j = 0; j < d->follows[i].nelem; j++)
2526 if (d->follows[i].elems[j].index == i)
2527 flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
2528 else if (d->follows[i].elems[j].index < i)
2529 flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
2530 else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2531 flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
2532 else
2533 flags[d->follows[i].elems[j].index] |= OPT_WALKED;
2537 flags[0] |= OPT_QUEUED;
2539 merged = &merged0;
2540 alloc_position_set (merged, d->nleaves);
2542 d->constraints = xnmalloc (d->tindex, sizeof *d->constraints);
2544 for (ptrdiff_t i = 0; i < d->tindex; i++)
2545 if (flags[i] & OPT_QUEUED)
2546 merge_nfa_state (d, i, flags, merged);
2548 reorder_tokens (d);
2550 free (merged->elems);
2551 free (flags);
2554 /* Perform bottom-up analysis on the parse tree, computing various functions.
2555 Note that at this point, we're pretending constructs like \< are real
2556 characters rather than constraints on what can follow them.
2558 Nullable: A node is nullable if it is at the root of a regexp that can
2559 match the empty string.
2560 * EMPTY leaves are nullable.
2561 * No other leaf is nullable.
2562 * A QMARK or STAR node is nullable.
2563 * A PLUS node is nullable if its argument is nullable.
2564 * A CAT node is nullable if both its arguments are nullable.
2565 * An OR node is nullable if either argument is nullable.
2567 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2568 that could correspond to the first character of a string matching the
2569 regexp rooted at the given node.
2570 * EMPTY leaves have empty firstpos.
2571 * The firstpos of a nonempty leaf is that leaf itself.
2572 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2573 argument.
2574 * The firstpos of a CAT node is the firstpos of the left argument, union
2575 the firstpos of the right if the left argument is nullable.
2576 * The firstpos of an OR node is the union of firstpos of each argument.
2578 Lastpos: The lastpos of a node is the set of positions that could
2579 correspond to the last character of a string matching the regexp at
2580 the given node.
2581 * EMPTY leaves have empty lastpos.
2582 * The lastpos of a nonempty leaf is that leaf itself.
2583 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2584 argument.
2585 * The lastpos of a CAT node is the lastpos of its right argument, union
2586 the lastpos of the left if the right argument is nullable.
2587 * The lastpos of an OR node is the union of the lastpos of each argument.
2589 Follow: The follow of a position is the set of positions that could
2590 correspond to the character following a character matching the node in
2591 a string matching the regexp. At this point we consider special symbols
2592 that match the empty string in some context to be just normal characters.
2593 Later, if we find that a special symbol is in a follow set, we will
2594 replace it with the elements of its follow, labeled with an appropriate
2595 constraint.
2596 * Every node in the firstpos of the argument of a STAR or PLUS node is in
2597 the follow of every node in the lastpos.
2598 * Every node in the firstpos of the second argument of a CAT node is in
2599 the follow of every node in the lastpos of the first argument.
2601 Because of the postfix representation of the parse tree, the depth-first
2602 analysis is conveniently done by a linear scan with the aid of a stack.
2603 Sets are stored as arrays of the elements, obeying a stack-like allocation
2604 scheme; the number of elements in each set deeper in the stack can be
2605 used to determine the address of a particular set's array. */
2606 static void
2607 dfaanalyze (struct dfa *d, bool searchflag)
2609 /* Array allocated to hold position sets. */
2610 position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2611 /* Firstpos and lastpos elements. */
2612 position *firstpos = posalloc;
2613 position *lastpos = firstpos + d->nleaves;
2614 position pos;
2615 position_set tmp;
2617 /* Stack for element counts and nullable flags. */
2618 struct
2620 /* Whether the entry is nullable. */
2621 bool nullable;
2623 /* Counts of firstpos and lastpos sets. */
2624 size_t nfirstpos;
2625 size_t nlastpos;
2626 } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2628 position_set merged; /* Result of merging sets. */
2630 addtok (d, CAT);
2632 #ifdef DEBUG
2633 fprintf (stderr, "dfaanalyze:\n");
2634 for (size_t i = 0; i < d->tindex; ++i)
2636 fprintf (stderr, " %zu:", i);
2637 prtok (d->tokens[i]);
2639 putc ('\n', stderr);
2640 #endif
2642 d->searchflag = searchflag;
2643 alloc_position_set (&merged, d->nleaves);
2644 d->follows = xcalloc (d->tindex, sizeof *d->follows);
2646 for (size_t i = 0; i < d->tindex; ++i)
2648 switch (d->tokens[i])
2650 case EMPTY:
2651 /* The empty set is nullable. */
2652 stk->nullable = true;
2654 /* The firstpos and lastpos of the empty leaf are both empty. */
2655 stk->nfirstpos = stk->nlastpos = 0;
2656 stk++;
2657 break;
2659 case STAR:
2660 case PLUS:
2661 /* Every element in the firstpos of the argument is in the follow
2662 of every element in the lastpos. */
2664 tmp.elems = firstpos - stk[-1].nfirstpos;
2665 tmp.nelem = stk[-1].nfirstpos;
2666 position *p = lastpos - stk[-1].nlastpos;
2667 for (size_t j = 0; j < stk[-1].nlastpos; j++)
2669 merge (&tmp, &d->follows[p[j].index], &merged);
2670 copy (&merged, &d->follows[p[j].index]);
2673 FALLTHROUGH;
2674 case QMARK:
2675 /* A QMARK or STAR node is automatically nullable. */
2676 if (d->tokens[i] != PLUS)
2677 stk[-1].nullable = true;
2678 break;
2680 case CAT:
2681 /* Every element in the firstpos of the second argument is in the
2682 follow of every element in the lastpos of the first argument. */
2684 tmp.nelem = stk[-1].nfirstpos;
2685 tmp.elems = firstpos - stk[-1].nfirstpos;
2686 position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2687 for (size_t j = 0; j < stk[-2].nlastpos; j++)
2689 merge (&tmp, &d->follows[p[j].index], &merged);
2690 copy (&merged, &d->follows[p[j].index]);
2694 /* The firstpos of a CAT node is the firstpos of the first argument,
2695 union that of the second argument if the first is nullable. */
2696 if (stk[-2].nullable)
2697 stk[-2].nfirstpos += stk[-1].nfirstpos;
2698 else
2699 firstpos -= stk[-1].nfirstpos;
2701 /* The lastpos of a CAT node is the lastpos of the second argument,
2702 union that of the first argument if the second is nullable. */
2703 if (stk[-1].nullable)
2704 stk[-2].nlastpos += stk[-1].nlastpos;
2705 else
2707 position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2708 for (size_t j = 0; j < stk[-1].nlastpos; j++)
2709 p[j] = p[j + stk[-2].nlastpos];
2710 lastpos -= stk[-2].nlastpos;
2711 stk[-2].nlastpos = stk[-1].nlastpos;
2714 /* A CAT node is nullable if both arguments are nullable. */
2715 stk[-2].nullable &= stk[-1].nullable;
2716 stk--;
2717 break;
2719 case OR:
2720 /* The firstpos is the union of the firstpos of each argument. */
2721 stk[-2].nfirstpos += stk[-1].nfirstpos;
2723 /* The lastpos is the union of the lastpos of each argument. */
2724 stk[-2].nlastpos += stk[-1].nlastpos;
2726 /* An OR node is nullable if either argument is nullable. */
2727 stk[-2].nullable |= stk[-1].nullable;
2728 stk--;
2729 break;
2731 default:
2732 /* Anything else is a nonempty position. (Note that special
2733 constructs like \< are treated as nonempty strings here;
2734 an "epsilon closure" effectively makes them nullable later.
2735 Backreferences have to get a real position so we can detect
2736 transitions on them later. But they are nullable. */
2737 stk->nullable = d->tokens[i] == BACKREF;
2739 /* This position is in its own firstpos and lastpos. */
2740 stk->nfirstpos = stk->nlastpos = 1;
2741 stk++;
2743 firstpos->index = lastpos->index = i;
2744 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2745 firstpos++, lastpos++;
2747 break;
2749 #ifdef DEBUG
2750 /* ... balance the above nonsyntactic #ifdef goo... */
2751 fprintf (stderr, "node %zu:", i);
2752 prtok (d->tokens[i]);
2753 putc ('\n', stderr);
2754 fprintf (stderr,
2755 stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2756 fprintf (stderr, " firstpos:");
2757 for (size_t j = 0; j < stk[-1].nfirstpos; j++)
2759 fprintf (stderr, " %zu:", firstpos[j - stk[-1].nfirstpos].index);
2760 prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2762 fprintf (stderr, "\n lastpos:");
2763 for (size_t j = 0; j < stk[-1].nlastpos; j++)
2765 fprintf (stderr, " %zu:", lastpos[j - stk[-1].nlastpos].index);
2766 prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2768 putc ('\n', stderr);
2769 #endif
2772 /* For each follow set that is the follow set of a real position, replace
2773 it with its epsilon closure. */
2774 epsclosure (d);
2776 dfaoptimize (d);
2778 #ifdef DEBUG
2779 for (size_t i = 0; i < d->tindex; ++i)
2780 if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR
2781 || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR
2782 || d->tokens[i] == MBCSET || d->tokens[i] >= CSET)
2784 fprintf (stderr, "follows(%zu:", i);
2785 prtok (d->tokens[i]);
2786 fprintf (stderr, "):");
2787 for (size_t j = 0; j < d->follows[i].nelem; j++)
2789 fprintf (stderr, " %zu:", d->follows[i].elems[j].index);
2790 prtok (d->tokens[d->follows[i].elems[j].index]);
2792 putc ('\n', stderr);
2794 #endif
2796 pos.index = 0;
2797 pos.constraint = NO_CONSTRAINT;
2799 alloc_position_set (&tmp, 1);
2801 append (pos, &tmp);
2803 d->separates = xnmalloc (d->tindex, sizeof *d->separates);
2805 for (ptrdiff_t i = 0; i < d->tindex; i++)
2807 d->separates[i] = 0;
2809 if (prev_newline_dependent (d->constraints[i]))
2810 d->separates[i] |= CTX_NEWLINE;
2811 if (prev_letter_dependent (d->constraints[i]))
2812 d->separates[i] |= CTX_LETTER;
2814 for (ptrdiff_t j = 0; j < d->follows[i].nelem; j++)
2816 if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2817 d->separates[i] |= CTX_NEWLINE;
2818 if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2819 d->separates[i] |= CTX_LETTER;
2823 /* Context wanted by some position. */
2824 int separate_contexts = state_separate_contexts (d, &tmp);
2826 /* Build the initial state. */
2827 if (separate_contexts & CTX_NEWLINE)
2828 state_index (d, &tmp, CTX_NEWLINE);
2829 d->initstate_notbol = d->min_trcount
2830 = state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2831 if (separate_contexts & CTX_LETTER)
2832 d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2833 d->min_trcount++;
2834 d->trcount = 0;
2836 free (posalloc);
2837 free (stkalloc);
2838 free (merged.elems);
2839 free (tmp.elems);
2842 /* Make sure D's state arrays are large enough to hold NEW_STATE. */
2843 static void
2844 realloc_trans_if_necessary (struct dfa *d)
2846 state_num oldalloc = d->tralloc;
2847 if (oldalloc < d->sindex)
2849 state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2850 ptrdiff_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2851 realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2852 -1, sizeof *realtrans);
2853 realtrans[0] = realtrans[1] = NULL;
2854 d->trans = realtrans + 2;
2855 ptrdiff_t newalloc = d->tralloc = newalloc1 - 2;
2856 d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
2857 d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
2858 d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
2859 if (d->localeinfo.multibyte)
2861 realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2862 realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
2863 if (oldalloc == 0)
2864 realtrans[0] = realtrans[1] = NULL;
2865 d->mb_trans = realtrans + 2;
2867 for (; oldalloc < newalloc; oldalloc++)
2869 d->trans[oldalloc] = NULL;
2870 d->fails[oldalloc] = NULL;
2871 if (d->localeinfo.multibyte)
2872 d->mb_trans[oldalloc] = NULL;
2878 Calculate the transition table for a new state derived from state s
2879 for a compiled dfa d after input character uc, and return the new
2880 state number.
2882 Do not worry about all possible input characters; calculate just the group
2883 of positions that match uc. Label it with the set of characters that
2884 every position in the group matches (taking into account, if necessary,
2885 preceding context information of s). Then find the union
2886 of these positions' follows, i.e., the set of positions of the
2887 new state. For each character in the group's label, set the transition
2888 on this character to be to a state corresponding to the set's positions,
2889 and its associated backward context information, if necessary.
2891 When building a searching matcher, include the positions of state
2892 0 in every state.
2894 The group is constructed by building an equivalence-class
2895 partition of the positions of s.
2897 For each position, find the set of characters C that it matches. Eliminate
2898 any characters from C that fail on grounds of backward context.
2900 Check whether the group's label L has nonempty
2901 intersection with C. If L - C is nonempty, create a new group labeled
2902 L - C and having the same positions as the current group, and set L to
2903 the intersection of L and C. Insert the position in the group, set
2904 C = C - L, and resume scanning.
2906 If after comparing with every group there are characters remaining in C,
2907 create a new group labeled with the characters of C and insert this
2908 position in that group. */
2910 static state_num
2911 build_state (state_num s, struct dfa *d, unsigned char uc)
2913 position_set follows; /* Union of the follows for each
2914 position of the current state. */
2915 position_set group; /* Positions that match the input char. */
2916 position_set tmp; /* Temporary space for merging sets. */
2917 state_num state; /* New state. */
2918 state_num state_newline; /* New state on a newline transition. */
2919 state_num state_letter; /* New state on a letter transition. */
2921 #ifdef DEBUG
2922 fprintf (stderr, "build state %td\n", s);
2923 #endif
2925 /* A pointer to the new transition table, and the table itself. */
2926 state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
2927 state_num *trans = *ptrans;
2929 if (!trans)
2931 /* MAX_TRCOUNT is an arbitrary upper limit on the number of
2932 transition tables that can exist at once, other than for
2933 initial states. Often-used transition tables are quickly
2934 rebuilt, whereas rarely-used ones are cleared away. */
2935 if (MAX_TRCOUNT <= d->trcount)
2937 for (state_num i = d->min_trcount; i < d->tralloc; i++)
2939 free (d->trans[i]);
2940 free (d->fails[i]);
2941 d->trans[i] = d->fails[i] = NULL;
2943 d->trcount = 0;
2946 d->trcount++;
2947 *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans);
2949 /* Fill transition table with a default value which means that the
2950 transited state has not been calculated yet. */
2951 for (int i = 0; i < NOTCHAR; i++)
2952 trans[i] = -2;
2955 /* Set up the success bits for this state. */
2956 d->success[s] = 0;
2957 if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
2958 d->success[s] |= CTX_NEWLINE;
2959 if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
2960 d->success[s] |= CTX_LETTER;
2961 if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
2962 d->success[s] |= CTX_NONE;
2964 alloc_position_set (&follows, d->nleaves);
2966 /* Find the union of the follows of the positions of the group.
2967 This is a hideously inefficient loop. Fix it someday. */
2968 for (size_t j = 0; j < d->states[s].elems.nelem; ++j)
2969 for (size_t k = 0;
2970 k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
2971 insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
2972 &follows);
2974 /* Positions that match the input char. */
2975 alloc_position_set (&group, d->nleaves);
2977 /* The group's label. */
2978 charclass label;
2979 fillset (&label);
2981 for (size_t i = 0; i < follows.nelem; ++i)
2983 charclass matches; /* Set of matching characters. */
2984 position pos = follows.elems[i];
2985 bool matched = false;
2986 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
2988 zeroset (&matches);
2989 setbit (d->tokens[pos.index], &matches);
2990 if (d->tokens[pos.index] == uc)
2991 matched = true;
2993 else if (d->tokens[pos.index] >= CSET)
2995 matches = d->charclasses[d->tokens[pos.index] - CSET];
2996 if (tstbit (uc, &matches))
2997 matched = true;
2999 else if (d->tokens[pos.index] == ANYCHAR)
3001 matches = d->charclasses[d->canychar];
3002 if (tstbit (uc, &matches))
3003 matched = true;
3005 /* ANYCHAR must match with a single character, so we must put
3006 it to D->states[s].mbps which contains the positions which
3007 can match with a single character not a byte. If all
3008 positions which has ANYCHAR does not depend on context of
3009 next character, we put the follows instead of it to
3010 D->states[s].mbps to optimize. */
3011 if (succeeds_in_context (pos.constraint, d->states[s].context,
3012 CTX_NONE))
3014 if (d->states[s].mbps.nelem == 0)
3015 alloc_position_set (&d->states[s].mbps, 1);
3016 insert (pos, &d->states[s].mbps);
3019 else
3020 continue;
3022 /* Some characters may need to be eliminated from matches because
3023 they fail in the current context. */
3024 if (pos.constraint != NO_CONSTRAINT)
3026 if (!succeeds_in_context (pos.constraint,
3027 d->states[s].context, CTX_NEWLINE))
3028 for (size_t j = 0; j < CHARCLASS_WORDS; ++j)
3029 matches.w[j] &= ~d->syntax.newline.w[j];
3030 if (!succeeds_in_context (pos.constraint,
3031 d->states[s].context, CTX_LETTER))
3032 for (size_t j = 0; j < CHARCLASS_WORDS; ++j)
3033 matches.w[j] &= ~d->syntax.letters.w[j];
3034 if (!succeeds_in_context (pos.constraint,
3035 d->states[s].context, CTX_NONE))
3036 for (size_t j = 0; j < CHARCLASS_WORDS; ++j)
3037 matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
3039 /* If there are no characters left, there's no point in going on. */
3040 if (emptyset (&matches))
3041 continue;
3043 /* If we have reset the bit that made us declare "matched", reset
3044 that indicator, too. This is required to avoid an infinite loop
3045 with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]' */
3046 if (!tstbit (uc, &matches))
3047 matched = false;
3050 #ifdef DEBUG
3051 fprintf (stderr, " nextpos %zu:", pos.index);
3052 prtok (d->tokens[pos.index]);
3053 fprintf (stderr, " of");
3054 for (size_t j = 0; j < NOTCHAR; j++)
3055 if (tstbit (j, &matches))
3056 fprintf (stderr, " 0x%02zx", j);
3057 fprintf (stderr, "\n");
3058 #endif
3060 if (matched)
3062 for (size_t k = 0; k < CHARCLASS_WORDS; ++k)
3063 label.w[k] &= matches.w[k];
3064 append (pos, &group);
3066 else
3068 for (size_t k = 0; k < CHARCLASS_WORDS; ++k)
3069 label.w[k] &= ~matches.w[k];
3073 alloc_position_set (&tmp, d->nleaves);
3075 if (group.nelem > 0)
3077 /* If we are building a searching matcher, throw in the positions
3078 of state 0 as well, if possible. */
3079 if (d->searchflag)
3081 /* If a token in follows.elems is not 1st byte of a multibyte
3082 character, or the states of follows must accept the bytes
3083 which are not 1st byte of the multibyte character.
3084 Then, if a state of follows encounters a byte, it must not be
3085 a 1st byte of a multibyte character nor a single byte character.
3086 In this case, do not add state[0].follows to next state, because
3087 state[0] must accept 1st-byte.
3089 For example, suppose <sb a> is a certain single byte character,
3090 <mb A> is a certain multibyte character, and the codepoint of
3091 <sb a> equals the 2nd byte of the codepoint of <mb A>. When
3092 state[0] accepts <sb a>, state[i] transits to state[i+1] by
3093 accepting the 1st byte of <mb A>, and state[i+1] accepts the
3094 2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3095 <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3096 not add state[0]. */
3098 bool mergeit = !d->localeinfo.multibyte;
3099 if (!mergeit)
3101 mergeit = true;
3102 for (size_t j = 0; mergeit && j < group.nelem; j++)
3103 mergeit &= d->multibyte_prop[group.elems[j].index];
3105 if (mergeit)
3107 merge (&d->states[0].elems, &group, &tmp);
3108 copy (&tmp, &group);
3112 /* Find out if the new state will want any context information,
3113 by calculating possible contexts that the group can match,
3114 and separate contexts that the new state wants to know. */
3115 int possible_contexts = charclass_context (d, &label);
3116 int separate_contexts = state_separate_contexts (d, &group);
3118 /* Find the state(s) corresponding to the union of the follows. */
3119 if (possible_contexts & ~separate_contexts)
3120 state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3121 else
3122 state = -1;
3123 if (separate_contexts & possible_contexts & CTX_NEWLINE)
3124 state_newline = state_index (d, &group, CTX_NEWLINE);
3125 else
3126 state_newline = state;
3127 if (separate_contexts & possible_contexts & CTX_LETTER)
3128 state_letter = state_index (d, &group, CTX_LETTER);
3129 else
3130 state_letter = state;
3132 /* Reallocate now, to reallocate any newline transition properly. */
3133 realloc_trans_if_necessary (d);
3136 /* If we are a searching matcher, the default transition is to a state
3137 containing the positions of state 0, otherwise the default transition
3138 is to fail miserably. */
3139 else if (d->searchflag)
3141 state_newline = 0;
3142 state_letter = d->min_trcount - 1;
3143 state = d->initstate_notbol;
3145 else
3147 state_newline = -1;
3148 state_letter = -1;
3149 state = -1;
3152 /* Set the transitions for each character in the label. */
3153 for (size_t i = 0; i < NOTCHAR; i++)
3154 if (tstbit (i, &label))
3155 switch (d->syntax.sbit[i])
3157 case CTX_NEWLINE:
3158 trans[i] = state_newline;
3159 break;
3160 case CTX_LETTER:
3161 trans[i] = state_letter;
3162 break;
3163 default:
3164 trans[i] = state;
3165 break;
3168 #ifdef DEBUG
3169 fprintf (stderr, "trans table %td", s);
3170 for (size_t i = 0; i < NOTCHAR; ++i)
3172 if (!(i & 0xf))
3173 fprintf (stderr, "\n");
3174 fprintf (stderr, " %2td", trans[i]);
3176 fprintf (stderr, "\n");
3177 #endif
3179 free (group.elems);
3180 free (follows.elems);
3181 free (tmp.elems);
3183 /* Keep the newline transition in a special place so we can use it as
3184 a sentinel. */
3185 if (tstbit (d->syntax.eolbyte, &label))
3187 d->newlines[s] = trans[d->syntax.eolbyte];
3188 trans[d->syntax.eolbyte] = -1;
3191 return trans[uc];
3194 /* Multibyte character handling sub-routines for dfaexec. */
3196 /* Consume a single byte and transit state from 's' to '*next_state'.
3197 This function is almost same as the state transition routin in dfaexec.
3198 But state transition is done just once, otherwise matching succeed or
3199 reach the end of the buffer. */
3200 static state_num
3201 transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
3203 state_num *t;
3205 if (d->trans[s])
3206 t = d->trans[s];
3207 else if (d->fails[s])
3208 t = d->fails[s];
3209 else
3211 build_state (s, d, **pp);
3212 if (d->trans[s])
3213 t = d->trans[s];
3214 else
3216 t = d->fails[s];
3217 assert (t);
3221 if (t[**pp] == -2)
3222 build_state (s, d, **pp);
3224 return t[*(*pp)++];
3227 /* Transit state from s, then return new state and update the pointer of
3228 the buffer. This function is for a period operator which can match a
3229 multi-byte character. */
3230 static state_num
3231 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
3232 unsigned char const *end)
3234 wint_t wc;
3236 int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
3238 /* This state has some operators which can match a multibyte character. */
3239 d->mb_follows.nelem = 0;
3241 /* Calculate the state which can be reached from the state 's' by
3242 consuming 'mbclen' single bytes from the buffer. */
3243 state_num s1 = s;
3244 int mbci;
3245 for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++)
3246 s = transit_state_singlebyte (d, s, pp);
3247 *pp += mbclen - mbci;
3249 if (wc == WEOF)
3251 /* It is an invalid character, so ANYCHAR is not accepted. */
3252 return s;
3255 /* If all positions which have ANYCHAR do not depend on the context
3256 of the next character, calculate the next state with
3257 pre-calculated follows and cache the result. */
3258 if (d->states[s1].mb_trindex < 0)
3260 if (MAX_TRCOUNT <= d->mb_trcount)
3262 state_num s3;
3263 for (s3 = -1; s3 < d->tralloc; s3++)
3265 free (d->mb_trans[s3]);
3266 d->mb_trans[s3] = NULL;
3269 for (state_num i = 0; i < d->sindex; i++)
3270 d->states[i].mb_trindex = -1;
3271 d->mb_trcount = 0;
3273 d->states[s1].mb_trindex = d->mb_trcount++;
3276 if (! d->mb_trans[s])
3278 enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3279 enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3280 d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3281 for (int i = 0; i < MAX_TRCOUNT; i++)
3282 d->mb_trans[s][i] = -1;
3284 else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3285 return d->mb_trans[s][d->states[s1].mb_trindex];
3287 if (s == -1)
3288 copy (&d->states[s1].mbps, &d->mb_follows);
3289 else
3290 merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3292 int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3293 state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3294 realloc_trans_if_necessary (d);
3296 d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3298 return s2;
3301 /* The initial state may encounter a byte which is not a single byte character
3302 nor the first byte of a multibyte character. But it is incorrect for the
3303 initial state to accept such a byte. For example, in Shift JIS the regular
3304 expression "\\" accepts the codepoint 0x5c, but should not accept the second
3305 byte of the codepoint 0x815c. Then the initial state must skip the bytes
3306 that are not a single byte character nor the first byte of a multibyte
3307 character.
3309 Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3310 or exceeds P, and return the advanced MBP. If WCP is non-NULL and
3311 the result is greater than P, set *WCP to the final wide character
3312 processed, or to WEOF if no wide character is processed. Otherwise,
3313 if WCP is non-NULL, *WCP may or may not be updated.
3315 Both P and MBP must be no larger than END. */
3316 static unsigned char const *
3317 skip_remains_mb (struct dfa *d, unsigned char const *p,
3318 unsigned char const *mbp, char const *end)
3320 if (d->syntax.never_trail[*p])
3321 return p;
3322 while (mbp < p)
3324 wint_t wc;
3325 mbp += mbs_to_wchar (&wc, (char const *) mbp,
3326 end - (char const *) mbp, d);
3328 return mbp;
3331 /* Search through a buffer looking for a match to the struct dfa *D.
3332 Find the first occurrence of a string matching the regexp in the
3333 buffer, and the shortest possible version thereof. Return a pointer to
3334 the first character after the match, or NULL if none is found. BEGIN
3335 points to the beginning of the buffer, and END points to the first byte
3336 after its end. Note however that we store a sentinel byte (usually
3337 newline) in *END, so the actual buffer must be one byte longer.
3338 When ALLOW_NL, newlines may appear in the matching string.
3339 If COUNT is non-NULL, increment *COUNT once for each newline processed.
3340 If MULTIBYTE, the input consists of multibyte characters and/or
3341 encoding-error bytes. Otherwise, it consists of single-byte characters.
3342 Here is the list of features that make this DFA matcher punt:
3343 - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3344 - [^...] in non-simple locale
3345 - [[=foo=]] or [[.foo.]]
3346 - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3347 - back-reference: (.)\1
3348 - word-delimiter in multibyte locale: \<, \>, \b, \B
3349 See using_simple_locale for the definition of "simple locale". */
3351 static inline char *
3352 dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
3353 size_t *count, bool multibyte)
3355 if (MAX_TRCOUNT <= d->sindex)
3357 for (state_num s = d->min_trcount; s < d->sindex; s++)
3359 free (d->states[s].elems.elems);
3360 free (d->states[s].mbps.elems);
3362 d->sindex = d->min_trcount;
3364 if (d->trans)
3366 for (state_num s = 0; s < d->tralloc; s++)
3368 free (d->trans[s]);
3369 free (d->fails[s]);
3370 d->trans[s] = d->fails[s] = NULL;
3372 d->trcount = 0;
3375 if (d->localeinfo.multibyte && d->mb_trans)
3377 for (state_num s = -1; s < d->tralloc; s++)
3379 free (d->mb_trans[s]);
3380 d->mb_trans[s] = NULL;
3382 for (state_num s = 0; s < d->min_trcount; s++)
3383 d->states[s].mb_trindex = -1;
3384 d->mb_trcount = 0;
3388 if (!d->tralloc)
3389 realloc_trans_if_necessary (d);
3391 /* Current state. */
3392 state_num s = 0, s1 = 0;
3394 /* Current input character. */
3395 unsigned char const *p = (unsigned char const *) begin;
3396 unsigned char const *mbp = p;
3398 /* Copy of d->trans so it can be optimized into a register. */
3399 state_num **trans = d->trans;
3400 unsigned char eol = d->syntax.eolbyte; /* Likewise for eolbyte. */
3401 unsigned char saved_end = *(unsigned char *) end;
3402 *end = eol;
3404 if (multibyte)
3406 memset (&d->mbs, 0, sizeof d->mbs);
3407 if (d->mb_follows.alloc == 0)
3408 alloc_position_set (&d->mb_follows, d->nleaves);
3411 size_t nlcount = 0;
3412 for (;;)
3414 state_num *t;
3415 while ((t = trans[s]) != NULL)
3417 if (s < d->min_trcount)
3419 if (!multibyte || d->states[s].mbps.nelem == 0)
3421 while (t[*p] == s)
3422 p++;
3424 if (multibyte)
3425 p = mbp = skip_remains_mb (d, p, mbp, end);
3428 if (multibyte)
3430 s1 = s;
3432 if (d->states[s].mbps.nelem == 0
3433 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3435 /* If an input character does not match ANYCHAR, do it
3436 like a single-byte character. */
3437 s = t[*p++];
3439 else
3441 s = transit_state (d, s, &p, (unsigned char *) end);
3442 mbp = p;
3443 trans = d->trans;
3446 else
3448 s1 = t[*p++];
3449 t = trans[s1];
3450 if (! t)
3452 state_num tmp = s;
3453 s = s1;
3454 s1 = tmp; /* swap */
3455 break;
3457 if (s < d->min_trcount)
3459 while (t[*p] == s1)
3460 p++;
3462 s = t[*p++];
3466 if (s < 0)
3468 if (s == -2)
3470 s = build_state (s1, d, p[-1]);
3471 trans = d->trans;
3473 else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3475 /* The previous character was a newline. Count it, and skip
3476 checking of multibyte character boundary until here. */
3477 nlcount++;
3478 mbp = p;
3480 s = (allow_nl ? d->newlines[s1]
3481 : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3482 : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3483 : d->initstate_notbol);
3485 else
3487 p = NULL;
3488 goto done;
3491 else if (d->fails[s])
3493 if ((d->success[s] & d->syntax.sbit[*p])
3494 || ((char *) p == end
3495 && accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3496 d)))
3497 goto done;
3499 if (multibyte && s < d->min_trcount)
3500 p = mbp = skip_remains_mb (d, p, mbp, end);
3502 s1 = s;
3503 if (!multibyte || d->states[s].mbps.nelem == 0
3504 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3506 /* If a input character does not match ANYCHAR, do it
3507 like a single-byte character. */
3508 s = d->fails[s][*p++];
3510 else
3512 s = transit_state (d, s, &p, (unsigned char *) end);
3513 mbp = p;
3514 trans = d->trans;
3517 else
3519 build_state (s, d, p[0]);
3520 trans = d->trans;
3524 done:
3525 if (count)
3526 *count += nlcount;
3527 *end = saved_end;
3528 return (char *) p;
3531 /* Specialized versions of dfaexec for multibyte and single-byte cases.
3532 This is for performance, as dfaexec_main is an inline function. */
3534 static char *
3535 dfaexec_mb (struct dfa *d, char const *begin, char *end,
3536 bool allow_nl, size_t *count, bool *backref)
3538 return dfaexec_main (d, begin, end, allow_nl, count, true);
3541 static char *
3542 dfaexec_sb (struct dfa *d, char const *begin, char *end,
3543 bool allow_nl, size_t *count, bool *backref)
3545 return dfaexec_main (d, begin, end, allow_nl, count, false);
3548 /* Always set *BACKREF and return BEGIN. Use this wrapper for
3549 any regexp that uses a construct not supported by this code. */
3550 static char *
3551 dfaexec_noop (struct dfa *d, char const *begin, char *end,
3552 bool allow_nl, size_t *count, bool *backref)
3554 *backref = true;
3555 return (char *) begin;
3558 /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3559 but faster and set *BACKREF if the DFA code does not support this
3560 regexp usage. */
3562 char *
3563 dfaexec (struct dfa *d, char const *begin, char *end,
3564 bool allow_nl, size_t *count, bool *backref)
3566 return d->dfaexec (d, begin, end, allow_nl, count, backref);
3569 struct dfa *
3570 dfasuperset (struct dfa const *d)
3572 return d->superset;
3575 bool
3576 dfaisfast (struct dfa const *d)
3578 return d->fast;
3581 static void
3582 free_mbdata (struct dfa *d)
3584 free (d->multibyte_prop);
3585 free (d->lex.brack.chars);
3586 free (d->mb_follows.elems);
3588 if (d->mb_trans)
3590 state_num s;
3591 for (s = -1; s < d->tralloc; s++)
3592 free (d->mb_trans[s]);
3593 free (d->mb_trans - 2);
3597 /* Return true if every construct in D is supported by this DFA matcher. */
3598 static bool _GL_ATTRIBUTE_PURE
3599 dfa_supported (struct dfa const *d)
3601 for (size_t i = 0; i < d->tindex; i++)
3603 switch (d->tokens[i])
3605 case BEGWORD:
3606 case ENDWORD:
3607 case LIMWORD:
3608 case NOTLIMWORD:
3609 if (!d->localeinfo.multibyte)
3610 continue;
3611 FALLTHROUGH;
3612 case BACKREF:
3613 case MBCSET:
3614 return false;
3617 return true;
3620 /* Disable use of the superset DFA if it is not likely to help
3621 performance. */
3622 static void
3623 maybe_disable_superset_dfa (struct dfa *d)
3625 if (!d->localeinfo.using_utf8)
3626 return;
3628 bool have_backref = false;
3629 for (size_t i = 0; i < d->tindex; ++i)
3631 switch (d->tokens[i])
3633 case ANYCHAR:
3634 /* Lowered. */
3635 abort ();
3636 case BACKREF:
3637 have_backref = true;
3638 break;
3639 case MBCSET:
3640 /* Requires multi-byte algorithm. */
3641 return;
3642 default:
3643 break;
3647 if (!have_backref && d->superset)
3649 /* The superset DFA is not likely to be much faster, so remove it. */
3650 dfafree (d->superset);
3651 free (d->superset);
3652 d->superset = NULL;
3655 free_mbdata (d);
3656 d->localeinfo.multibyte = false;
3657 d->dfaexec = dfaexec_sb;
3658 d->fast = true;
3661 static void
3662 dfassbuild (struct dfa *d)
3664 struct dfa *sup = dfaalloc ();
3666 *sup = *d;
3667 sup->localeinfo.multibyte = false;
3668 sup->dfaexec = dfaexec_sb;
3669 sup->multibyte_prop = NULL;
3670 sup->superset = NULL;
3671 sup->states = NULL;
3672 sup->sindex = 0;
3673 sup->constraints = NULL;
3674 sup->separates = NULL;
3675 sup->follows = NULL;
3676 sup->tralloc = 0;
3677 sup->trans = NULL;
3678 sup->fails = NULL;
3679 sup->success = NULL;
3680 sup->newlines = NULL;
3682 sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3683 if (d->cindex)
3685 memcpy (sup->charclasses, d->charclasses,
3686 d->cindex * sizeof *sup->charclasses);
3689 sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3690 sup->talloc = d->tindex * 2;
3692 bool have_achar = false;
3693 bool have_nchar = false;
3694 size_t j;
3695 for (size_t i = j = 0; i < d->tindex; i++)
3697 switch (d->tokens[i])
3699 case ANYCHAR:
3700 case MBCSET:
3701 case BACKREF:
3703 charclass ccl;
3704 fillset (&ccl);
3705 sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3706 sup->tokens[j++] = STAR;
3707 if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3708 || d->tokens[i + 1] == PLUS)
3709 i++;
3710 have_achar = true;
3712 break;
3713 case BEGWORD:
3714 case ENDWORD:
3715 case LIMWORD:
3716 case NOTLIMWORD:
3717 if (d->localeinfo.multibyte)
3719 /* These constraints aren't supported in a multibyte locale.
3720 Ignore them in the superset DFA. */
3721 sup->tokens[j++] = EMPTY;
3722 break;
3724 FALLTHROUGH;
3725 default:
3726 sup->tokens[j++] = d->tokens[i];
3727 if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3728 || d->tokens[i] >= CSET)
3729 have_nchar = true;
3730 break;
3733 sup->tindex = j;
3735 if (have_nchar && (have_achar || d->localeinfo.multibyte))
3736 d->superset = sup;
3737 else
3739 dfafree (sup);
3740 free (sup);
3744 /* Parse and analyze a single string of the given length. */
3745 void
3746 dfacomp (char const *s, size_t len, struct dfa *d, bool searchflag)
3748 dfaparse (s, len, d);
3749 dfassbuild (d);
3751 if (dfa_supported (d))
3753 maybe_disable_superset_dfa (d);
3754 dfaanalyze (d, searchflag);
3756 else
3758 d->dfaexec = dfaexec_noop;
3761 if (d->superset)
3763 d->fast = true;
3764 dfaanalyze (d->superset, searchflag);
3768 /* Free the storage held by the components of a dfa. */
3769 void
3770 dfafree (struct dfa *d)
3772 free (d->charclasses);
3773 free (d->tokens);
3775 if (d->localeinfo.multibyte)
3776 free_mbdata (d);
3778 free (d->constraints);
3779 free (d->separates);
3781 for (size_t i = 0; i < d->sindex; ++i)
3783 free (d->states[i].elems.elems);
3784 free (d->states[i].mbps.elems);
3786 free (d->states);
3788 if (d->follows)
3790 for (size_t i = 0; i < d->tindex; ++i)
3791 free (d->follows[i].elems);
3792 free (d->follows);
3795 if (d->trans)
3797 for (size_t i = 0; i < d->tralloc; ++i)
3799 free (d->trans[i]);
3800 free (d->fails[i]);
3803 free (d->trans - 2);
3804 free (d->fails);
3805 free (d->newlines);
3806 free (d->success);
3809 if (d->superset)
3811 dfafree (d->superset);
3812 free (d->superset);
3816 /* Having found the postfix representation of the regular expression,
3817 try to find a long sequence of characters that must appear in any line
3818 containing the r.e.
3819 Finding a "longest" sequence is beyond the scope here;
3820 we take an easy way out and hope for the best.
3821 (Take "(ab|a)b"--please.)
3823 We do a bottom-up calculation of sequences of characters that must appear
3824 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3825 representation:
3826 sequences that must appear at the left of the match ("left")
3827 sequences that must appear at the right of the match ("right")
3828 lists of sequences that must appear somewhere in the match ("in")
3829 sequences that must constitute the match ("is")
3831 When we get to the root of the tree, we use one of the longest of its
3832 calculated "in" sequences as our answer.
3834 The sequences calculated for the various types of node (in pseudo ANSI c)
3835 are shown below. "p" is the operand of unary operators (and the left-hand
3836 operand of binary operators); "q" is the right-hand operand of binary
3837 operators.
3839 "ZERO" means "a zero-length sequence" below.
3841 Type left right is in
3842 ---- ---- ----- -- --
3843 char c # c # c # c # c
3845 ANYCHAR ZERO ZERO ZERO ZERO
3847 MBCSET ZERO ZERO ZERO ZERO
3849 CSET ZERO ZERO ZERO ZERO
3851 STAR ZERO ZERO ZERO ZERO
3853 QMARK ZERO ZERO ZERO ZERO
3855 PLUS p->left p->right ZERO p->in
3857 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3858 p->left : q->right : q->is!=ZERO) ? q->in plus
3859 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3860 ZERO
3862 OR longest common longest common (do p->is and substrings common
3863 leading trailing to q->is have same p->in and
3864 (sub)sequence (sub)sequence q->in length and content) ?
3865 of p->left of p->right
3866 and q->left and q->right p->is : NULL
3868 If there's anything else we recognize in the tree, all four sequences get set
3869 to zero-length sequences. If there's something we don't recognize in the
3870 tree, we just return a zero-length sequence.
3872 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3873 'aaa')?
3875 And ... is it here or someplace that we might ponder "optimizations" such as
3876 egrep 'psi|epsilon' -> egrep 'psi'
3877 egrep 'pepsi|epsilon' -> egrep 'epsi'
3878 (Yes, we now find "epsi" as a "string
3879 that must occur", but we might also
3880 simplify the *entire* r.e. being sought)
3881 grep '[c]' -> grep 'c'
3882 grep '(ab|a)b' -> grep 'ab'
3883 grep 'ab*' -> grep 'a'
3884 grep 'a*b' -> grep 'b'
3886 There are several issues:
3888 Is optimization easy (enough)?
3890 Does optimization actually accomplish anything,
3891 or is the automaton you get from "psi|epsilon" (for example)
3892 the same as the one you get from "psi" (for example)?
3894 Are optimizable r.e.'s likely to be used in real-life situations
3895 (something like 'ab*' is probably unlikely; something like is
3896 'psi|epsilon' is likelier)? */
3898 static char *
3899 icatalloc (char *old, char const *new)
3901 size_t newsize = strlen (new);
3902 if (newsize == 0)
3903 return old;
3904 size_t oldsize = strlen (old);
3905 char *result = xrealloc (old, oldsize + newsize + 1);
3906 memcpy (result + oldsize, new, newsize + 1);
3907 return result;
3910 static void
3911 freelist (char **cpp)
3913 while (*cpp)
3914 free (*cpp++);
3917 static char **
3918 enlist (char **cpp, char *new, size_t len)
3920 new = memcpy (xmalloc (len + 1), new, len);
3921 new[len] = '\0';
3922 /* Is there already something in the list that's new (or longer)? */
3923 size_t i;
3924 for (i = 0; cpp[i] != NULL; ++i)
3925 if (strstr (cpp[i], new) != NULL)
3927 free (new);
3928 return cpp;
3930 /* Eliminate any obsoleted strings. */
3931 for (size_t j = 0; cpp[j] != NULL; )
3932 if (strstr (new, cpp[j]) == NULL)
3933 ++j;
3934 else
3936 free (cpp[j]);
3937 if (--i == j)
3938 break;
3939 cpp[j] = cpp[i];
3940 cpp[i] = NULL;
3942 /* Add the new string. */
3943 cpp = xnrealloc (cpp, i + 2, sizeof *cpp);
3944 cpp[i] = new;
3945 cpp[i + 1] = NULL;
3946 return cpp;
3949 /* Given pointers to two strings, return a pointer to an allocated
3950 list of their distinct common substrings. */
3951 static char **
3952 comsubs (char *left, char const *right)
3954 char **cpp = xzalloc (sizeof *cpp);
3956 for (char *lcp = left; *lcp != '\0'; lcp++)
3958 size_t len = 0;
3959 char *rcp = strchr (right, *lcp);
3960 while (rcp != NULL)
3962 size_t i;
3963 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3964 continue;
3965 if (i > len)
3966 len = i;
3967 rcp = strchr (rcp + 1, *lcp);
3969 if (len != 0)
3970 cpp = enlist (cpp, lcp, len);
3972 return cpp;
3975 static char **
3976 addlists (char **old, char **new)
3978 for (; *new; new++)
3979 old = enlist (old, *new, strlen (*new));
3980 return old;
3983 /* Given two lists of substrings, return a new list giving substrings
3984 common to both. */
3985 static char **
3986 inboth (char **left, char **right)
3988 char **both = xzalloc (sizeof *both);
3990 for (size_t lnum = 0; left[lnum] != NULL; ++lnum)
3992 for (size_t rnum = 0; right[rnum] != NULL; ++rnum)
3994 char **temp = comsubs (left[lnum], right[rnum]);
3995 both = addlists (both, temp);
3996 freelist (temp);
3997 free (temp);
4000 return both;
4003 typedef struct must must;
4005 struct must
4007 char **in;
4008 char *left;
4009 char *right;
4010 char *is;
4011 bool begline;
4012 bool endline;
4013 must *prev;
4016 static must *
4017 allocmust (must *mp, size_t size)
4019 must *new_mp = xmalloc (sizeof *new_mp);
4020 new_mp->in = xzalloc (sizeof *new_mp->in);
4021 new_mp->left = xzalloc (size);
4022 new_mp->right = xzalloc (size);
4023 new_mp->is = xzalloc (size);
4024 new_mp->begline = false;
4025 new_mp->endline = false;
4026 new_mp->prev = mp;
4027 return new_mp;
4030 static void
4031 resetmust (must *mp)
4033 freelist (mp->in);
4034 mp->in[0] = NULL;
4035 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4036 mp->begline = false;
4037 mp->endline = false;
4040 static void
4041 freemust (must *mp)
4043 freelist (mp->in);
4044 free (mp->in);
4045 free (mp->left);
4046 free (mp->right);
4047 free (mp->is);
4048 free (mp);
4051 struct dfamust *
4052 dfamust (struct dfa const *d)
4054 must *mp = NULL;
4055 char const *result = "";
4056 bool exact = false;
4057 bool begline = false;
4058 bool endline = false;
4059 bool need_begline = false;
4060 bool need_endline = false;
4061 bool case_fold_unibyte = d->syntax.case_fold && MB_CUR_MAX == 1;
4063 for (size_t ri = 1; ri + 1 < d->tindex; ri++)
4065 token t = d->tokens[ri];
4066 switch (t)
4068 case BEGLINE:
4069 mp = allocmust (mp, 2);
4070 mp->begline = true;
4071 need_begline = true;
4072 break;
4073 case ENDLINE:
4074 mp = allocmust (mp, 2);
4075 mp->endline = true;
4076 need_endline = true;
4077 break;
4078 case LPAREN:
4079 case RPAREN:
4080 assert (!"neither LPAREN nor RPAREN may appear here");
4082 case EMPTY:
4083 case BEGWORD:
4084 case ENDWORD:
4085 case LIMWORD:
4086 case NOTLIMWORD:
4087 case BACKREF:
4088 case ANYCHAR:
4089 case MBCSET:
4090 mp = allocmust (mp, 2);
4091 break;
4093 case STAR:
4094 case QMARK:
4095 resetmust (mp);
4096 break;
4098 case OR:
4100 char **new;
4101 must *rmp = mp;
4102 must *lmp = mp = mp->prev;
4103 size_t j, ln, rn, n;
4105 /* Guaranteed to be. Unlikely, but ... */
4106 if (streq (lmp->is, rmp->is))
4108 lmp->begline &= rmp->begline;
4109 lmp->endline &= rmp->endline;
4111 else
4113 lmp->is[0] = '\0';
4114 lmp->begline = false;
4115 lmp->endline = false;
4117 /* Left side--easy */
4118 size_t i = 0;
4119 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4120 ++i;
4121 lmp->left[i] = '\0';
4122 /* Right side */
4123 ln = strlen (lmp->right);
4124 rn = strlen (rmp->right);
4125 n = ln;
4126 if (n > rn)
4127 n = rn;
4128 for (i = 0; i < n; ++i)
4129 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4130 break;
4131 for (j = 0; j < i; ++j)
4132 lmp->right[j] = lmp->right[(ln - i) + j];
4133 lmp->right[j] = '\0';
4134 new = inboth (lmp->in, rmp->in);
4135 freelist (lmp->in);
4136 free (lmp->in);
4137 lmp->in = new;
4138 freemust (rmp);
4140 break;
4142 case PLUS:
4143 mp->is[0] = '\0';
4144 break;
4146 case END:
4147 assert (!mp->prev);
4148 for (size_t i = 0; mp->in[i] != NULL; ++i)
4149 if (strlen (mp->in[i]) > strlen (result))
4150 result = mp->in[i];
4151 if (streq (result, mp->is))
4153 if ((!need_begline || mp->begline) && (!need_endline
4154 || mp->endline))
4155 exact = true;
4156 begline = mp->begline;
4157 endline = mp->endline;
4159 goto done;
4161 case CAT:
4163 must *rmp = mp;
4164 must *lmp = mp = mp->prev;
4166 /* In. Everything in left, plus everything in
4167 right, plus concatenation of
4168 left's right and right's left. */
4169 lmp->in = addlists (lmp->in, rmp->in);
4170 if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4172 size_t lrlen = strlen (lmp->right);
4173 size_t rllen = strlen (rmp->left);
4174 char *tp = xmalloc (lrlen + rllen);
4175 memcpy (tp, lmp->right, lrlen);
4176 memcpy (tp + lrlen, rmp->left, rllen);
4177 lmp->in = enlist (lmp->in, tp, lrlen + rllen);
4178 free (tp);
4180 /* Left-hand */
4181 if (lmp->is[0] != '\0')
4182 lmp->left = icatalloc (lmp->left, rmp->left);
4183 /* Right-hand */
4184 if (rmp->is[0] == '\0')
4185 lmp->right[0] = '\0';
4186 lmp->right = icatalloc (lmp->right, rmp->right);
4187 /* Guaranteed to be */
4188 if ((lmp->is[0] != '\0' || lmp->begline)
4189 && (rmp->is[0] != '\0' || rmp->endline))
4191 lmp->is = icatalloc (lmp->is, rmp->is);
4192 lmp->endline = rmp->endline;
4194 else
4196 lmp->is[0] = '\0';
4197 lmp->begline = false;
4198 lmp->endline = false;
4200 freemust (rmp);
4202 break;
4204 case '\0':
4205 /* Not on *my* shift. */
4206 goto done;
4208 default:
4209 if (CSET <= t)
4211 /* If T is a singleton, or if case-folding in a unibyte
4212 locale and T's members all case-fold to the same char,
4213 convert T to one of its members. Otherwise, do
4214 nothing further with T. */
4215 charclass *ccl = &d->charclasses[t - CSET];
4216 int j;
4217 for (j = 0; j < NOTCHAR; j++)
4218 if (tstbit (j, ccl))
4219 break;
4220 if (! (j < NOTCHAR))
4222 mp = allocmust (mp, 2);
4223 break;
4225 t = j;
4226 while (++j < NOTCHAR)
4227 if (tstbit (j, ccl)
4228 && ! (case_fold_unibyte
4229 && toupper (j) == toupper (t)))
4230 break;
4231 if (j < NOTCHAR)
4233 mp = allocmust (mp, 2);
4234 break;
4238 size_t rj = ri + 2;
4239 if (d->tokens[ri + 1] == CAT)
4241 for (; rj < d->tindex - 1; rj += 2)
4243 if ((rj != ri && (d->tokens[rj] <= 0
4244 || NOTCHAR <= d->tokens[rj]))
4245 || d->tokens[rj + 1] != CAT)
4246 break;
4249 mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4250 mp->is[0] = mp->left[0] = mp->right[0]
4251 = case_fold_unibyte ? toupper (t) : t;
4253 size_t i;
4254 for (i = 1; ri + 2 < rj; i++)
4256 ri += 2;
4257 t = d->tokens[ri];
4258 mp->is[i] = mp->left[i] = mp->right[i]
4259 = case_fold_unibyte ? toupper (t) : t;
4261 mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4262 mp->in = enlist (mp->in, mp->is, i);
4263 break;
4266 done:;
4268 struct dfamust *dm = NULL;
4269 if (*result)
4271 dm = xmalloc (sizeof *dm);
4272 dm->exact = exact;
4273 dm->begline = begline;
4274 dm->endline = endline;
4275 dm->must = xstrdup (result);
4278 while (mp)
4280 must *prev = mp->prev;
4281 freemust (mp);
4282 mp = prev;
4285 return dm;
4288 void
4289 dfamustfree (struct dfamust *dm)
4291 free (dm->must);
4292 free (dm);
4295 struct dfa *
4296 dfaalloc (void)
4298 return xmalloc (sizeof (struct dfa));
4301 /* Initialize DFA. */
4302 void
4303 dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
4304 reg_syntax_t bits, int dfaopts)
4306 memset (dfa, 0, offsetof (struct dfa, dfaexec));
4307 dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4308 dfa->simple_locale = using_simple_locale (linfo->multibyte);
4309 dfa->localeinfo = *linfo;
4311 dfa->fast = !dfa->localeinfo.multibyte;
4313 dfa->canychar = -1;
4314 dfa->lex.cur_mb_len = 1;
4315 dfa->syntax.syntax_bits_set = true;
4316 dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4317 dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
4318 dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4319 dfa->syntax.syntax_bits = bits;
4321 for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4323 unsigned char uc = i;
4325 dfa->syntax.sbit[uc] = char_context (dfa, uc);
4326 switch (dfa->syntax.sbit[uc])
4328 case CTX_LETTER:
4329 setbit (uc, &dfa->syntax.letters);
4330 break;
4331 case CTX_NEWLINE:
4332 setbit (uc, &dfa->syntax.newline);
4333 break;
4336 /* POSIX requires that the five bytes in "\n\r./" (including the
4337 terminating NUL) cannot occur inside a multibyte character. */
4338 dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4339 ? (uc & 0xc0) != 0x80
4340 : strchr ("\n\r./", uc) != NULL);
4344 /* vim:set shiftwidth=2: */