exp2l: Work around a NetBSD 10.0/i386 bug.
[gnulib.git] / lib / dfa.c
blobc79afddcd9189b20bb3f209af7cdcd78d3540da0
1 /* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2024 Free Software
3 Foundation, Inc.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3, or (at your option)
8 any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc.,
18 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
20 /* Written June, 1988 by Mike Haertel
21 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
23 #include <config.h>
25 #include "dfa.h"
27 #include "flexmember.h"
28 #include "idx.h"
29 #include "verify.h"
31 #include <assert.h>
32 #include <ctype.h>
33 #include <stdint.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <limits.h>
37 #include <string.h>
38 #include <wchar.h>
40 #include "xalloc.h"
41 #include "localeinfo.h"
43 #include "gettext.h"
44 #define _(str) gettext (str)
46 #if GAWK
47 /* Use ISO C 99 API. */
48 # include <wctype.h>
49 # define char32_t wchar_t
50 # define mbrtoc32 mbrtowc
51 # define c32rtomb wcrtomb
52 # define c32tob wctob
53 # define c32isprint iswprint
54 # define c32isspace iswspace
55 # define mbszero(p) memset ((p), 0, sizeof (mbstate_t))
56 #else
57 /* Use ISO C 11 + gnulib API. */
58 # include <uchar.h>
59 #endif
61 /* Pacify gcc -Wanalyzer-null-dereference in areas where GCC
62 understandably cannot deduce that the input comes from a
63 well-formed regular expression. There's little point to the
64 runtime overhead of 'assert' instead of 'assume_nonnull' when the
65 MMU will check anyway. */
66 #define assume_nonnull(x) assume ((x) != NULL)
68 static bool
69 str_eq (char const *a, char const *b)
71 return strcmp (a, b) == 0;
74 static bool
75 c_isdigit (char c)
77 return '0' <= c && c <= '9';
80 #ifndef FALLTHROUGH
81 # if 201710L < __STDC_VERSION__
82 # define FALLTHROUGH [[__fallthrough__]]
83 # elif ((__GNUC__ >= 7) \
84 || (defined __apple_build_version__ \
85 ? __apple_build_version__ >= 12000000 \
86 : __clang_major__ >= 10))
87 # define FALLTHROUGH __attribute__ ((__fallthrough__))
88 # else
89 # define FALLTHROUGH ((void) 0)
90 # endif
91 #endif
93 #ifndef MIN
94 # define MIN(a,b) ((a) < (b) ? (a) : (b))
95 #endif
97 /* HPUX defines these as macros in sys/param.h. */
98 #ifdef setbit
99 # undef setbit
100 #endif
101 #ifdef clrbit
102 # undef clrbit
103 #endif
105 /* For code that does not use Gnulib’s isblank module. */
106 #if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
107 # define isblank dfa_isblank
108 static int
109 isblank (int c)
111 return c == ' ' || c == '\t';
113 #endif
115 /* First integer value that is greater than any character code. */
116 enum { NOTCHAR = 1 << CHAR_BIT };
118 #ifdef UINT_LEAST64_MAX
120 /* Number of bits used in a charclass word. */
121 enum { CHARCLASS_WORD_BITS = 64 };
123 /* This represents part of a character class. It must be unsigned and
124 at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
125 typedef uint_least64_t charclass_word;
127 /* Part of a charclass initializer that represents 64 bits' worth of a
128 charclass, where LO and HI are the low and high-order 32 bits of
129 the 64-bit quantity. */
130 # define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
132 #else
133 /* Fallbacks for pre-C99 hosts that lack 64-bit integers. */
134 enum { CHARCLASS_WORD_BITS = 32 };
135 typedef unsigned long charclass_word;
136 # define CHARCLASS_PAIR(lo, hi) lo, hi
137 #endif
139 /* An initializer for a charclass whose 32-bit words are A through H. */
140 #define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \
141 {{ \
142 CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \
143 CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h) \
146 /* The maximum useful value of a charclass_word; all used bits are 1. */
147 static charclass_word const CHARCLASS_WORD_MASK
148 = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
150 /* Number of words required to hold a bit for every character. */
151 enum
153 CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
156 /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
157 typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
159 /* Convert a possibly-signed character to an unsigned character. This is
160 a bit safer than casting to unsigned char, since it catches some type
161 errors that the cast doesn't. */
162 static unsigned char
163 to_uchar (char ch)
165 return ch;
168 /* Contexts tell us whether a character is a newline or a word constituent.
169 Word-constituent characters are those that satisfy iswalnum, plus '_'.
170 Each character has a single CTX_* value; bitmasks of CTX_* values denote
171 a particular character class.
173 A state also stores a context value, which is a bitmask of CTX_* values.
174 A state's context represents a set of characters that the state's
175 predecessors must match. For example, a state whose context does not
176 include CTX_LETTER will never have transitions where the previous
177 character is a word constituent. A state whose context is CTX_ANY
178 might have transitions from any character. */
180 enum
182 CTX_NONE = 1,
183 CTX_LETTER = 2,
184 CTX_NEWLINE = 4,
185 CTX_ANY = 7
188 /* Sometimes characters can only be matched depending on the surrounding
189 context. Such context decisions depend on what the previous character
190 was, and the value of the current (lookahead) character. Context
191 dependent constraints are encoded as 9-bit integers. Each bit that
192 is set indicates that the constraint succeeds in the corresponding
193 context.
195 bit 6-8 - valid contexts when next character is CTX_NEWLINE
196 bit 3-5 - valid contexts when next character is CTX_LETTER
197 bit 0-2 - valid contexts when next character is CTX_NONE
199 succeeds_in_context determines whether a given constraint
200 succeeds in a particular context. Prev is a bitmask of possible
201 context values for the previous character, curr is the (single-bit)
202 context value for the lookahead character. */
203 static int
204 newline_constraint (int constraint)
206 return (constraint >> 6) & 7;
208 static int
209 letter_constraint (int constraint)
211 return (constraint >> 3) & 7;
213 static int
214 other_constraint (int constraint)
216 return constraint & 7;
219 static bool
220 succeeds_in_context (int constraint, int prev, int curr)
222 return !! (((curr & CTX_NONE ? other_constraint (constraint) : 0) \
223 | (curr & CTX_LETTER ? letter_constraint (constraint) : 0) \
224 | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
225 & prev);
228 /* The following describe what a constraint depends on. */
229 static bool
230 prev_newline_dependent (int constraint)
232 return ((constraint ^ constraint >> 2) & 0111) != 0;
234 static bool
235 prev_letter_dependent (int constraint)
237 return ((constraint ^ constraint >> 1) & 0111) != 0;
240 /* Tokens that match the empty string subject to some constraint actually
241 work by applying that constraint to determine what may follow them,
242 taking into account what has gone before. The following values are
243 the constraints corresponding to the special tokens previously defined. */
244 enum
246 NO_CONSTRAINT = 0777,
247 BEGLINE_CONSTRAINT = 0444,
248 ENDLINE_CONSTRAINT = 0700,
249 BEGWORD_CONSTRAINT = 0050,
250 ENDWORD_CONSTRAINT = 0202,
251 LIMWORD_CONSTRAINT = 0252,
252 NOTLIMWORD_CONSTRAINT = 0525
255 /* The regexp is parsed into an array of tokens in postfix form. Some tokens
256 are operators and others are terminal symbols. Most (but not all) of these
257 codes are returned by the lexical analyzer. */
259 typedef ptrdiff_t token;
260 static token const TOKEN_MAX = PTRDIFF_MAX;
262 /* States are indexed by state_num values. These are normally
263 nonnegative but -1 is used as a special value. */
264 typedef ptrdiff_t state_num;
266 /* Predefined token values. */
267 enum
269 END = -1, /* END is a terminal symbol that matches the
270 end of input; any value of END or less in
271 the parse tree is such a symbol. Accepting
272 states of the DFA are those that would have
273 a transition on END. This is -1, not some
274 more-negative value, to tweak the speed of
275 comparisons to END. */
277 /* Ordinary character values are terminal symbols that match themselves. */
279 /* CSET must come last in the following list of special tokens. Otherwise,
280 the list order matters only for performance. Related special tokens
281 should have nearby values so that code like (t == ANYCHAR || t == MBCSET
282 || CSET <= t) can be done with a single machine-level comparison. */
284 EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
285 the empty string. */
287 QMARK, /* QMARK is an operator of one argument that
288 matches zero or one occurrences of its
289 argument. */
291 STAR, /* STAR is an operator of one argument that
292 matches the Kleene closure (zero or more
293 occurrences) of its argument. */
295 PLUS, /* PLUS is an operator of one argument that
296 matches the positive closure (one or more
297 occurrences) of its argument. */
299 REPMN, /* REPMN is a lexical token corresponding
300 to the {m,n} construct. REPMN never
301 appears in the compiled token vector. */
303 CAT, /* CAT is an operator of two arguments that
304 matches the concatenation of its
305 arguments. CAT is never returned by the
306 lexical analyzer. */
308 OR, /* OR is an operator of two arguments that
309 matches either of its arguments. */
311 LPAREN, /* LPAREN never appears in the parse tree,
312 it is only a lexeme. */
314 RPAREN, /* RPAREN never appears in the parse tree. */
316 WCHAR, /* Only returned by lex. wctok contains the
317 32-bit wide character representation. */
319 ANYCHAR, /* ANYCHAR is a terminal symbol that matches
320 a valid multibyte (or single byte) character.
321 It is used only if MB_CUR_MAX > 1. */
323 BEG, /* BEG is an initial symbol that matches the
324 beginning of input. */
326 BEGLINE, /* BEGLINE is a terminal symbol that matches
327 the empty string at the beginning of a
328 line. */
330 ENDLINE, /* ENDLINE is a terminal symbol that matches
331 the empty string at the end of a line. */
333 BEGWORD, /* BEGWORD is a terminal symbol that matches
334 the empty string at the beginning of a
335 word. */
337 ENDWORD, /* ENDWORD is a terminal symbol that matches
338 the empty string at the end of a word. */
340 LIMWORD, /* LIMWORD is a terminal symbol that matches
341 the empty string at the beginning or the
342 end of a word. */
344 NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
345 matches the empty string not at
346 the beginning or end of a word. */
348 BACKREF, /* BACKREF is generated by \<digit>
349 or by any other construct that
350 is not completely handled. If the scanner
351 detects a transition on backref, it returns
352 a kind of "semi-success" indicating that
353 the match will have to be verified with
354 a backtracking matcher. */
356 MBCSET, /* MBCSET is similar to CSET, but for
357 multibyte characters. */
359 CSET /* CSET and (and any value greater) is a
360 terminal symbol that matches any of a
361 class of characters. */
365 /* States of the recognizer correspond to sets of positions in the parse
366 tree, together with the constraints under which they may be matched.
367 So a position is encoded as an index into the parse tree together with
368 a constraint. */
369 typedef struct
371 idx_t index; /* Index into the parse array. */
372 unsigned int constraint; /* Constraint for matching this position. */
373 } position;
375 /* Sets of positions are stored as arrays. */
376 typedef struct
378 position *elems; /* Elements of this position set. */
379 idx_t nelem; /* Number of elements in this set. */
380 idx_t alloc; /* Number of elements allocated in ELEMS. */
381 } position_set;
383 /* A state of the dfa consists of a set of positions, some flags,
384 and the token value of the lowest-numbered position of the state that
385 contains an END token. */
386 typedef struct
388 size_t hash; /* Hash of the positions of this state. */
389 position_set elems; /* Positions this state could match. */
390 unsigned char context; /* Context from previous state. */
391 unsigned short constraint; /* Constraint for this state to accept. */
392 position_set mbps; /* Positions which can match multibyte
393 characters or the follows, e.g., period.
394 Used only if MB_CUR_MAX > 1. */
395 state_num mb_trindex; /* Index of this state in MB_TRANS, or
396 negative if the state does not have
397 ANYCHAR. */
398 } dfa_state;
400 /* Maximum for any transition table count. This should be at least 3,
401 for the initial state setup. */
402 enum { MAX_TRCOUNT = 1024 };
404 /* A bracket operator.
405 e.g., [a-c], [[:alpha:]], etc. */
406 struct mb_char_classes
408 ptrdiff_t cset;
409 bool invert;
410 char32_t *chars; /* Normal characters. */
411 idx_t nchars;
412 idx_t nchars_alloc;
415 struct regex_syntax
417 /* Syntax bits controlling the behavior of the lexical analyzer. */
418 reg_syntax_t syntax_bits;
419 int dfaopts;
420 bool syntax_bits_set;
422 /* Flag for case-folding letters into sets. */
423 bool case_fold;
425 /* End-of-line byte in data. */
426 unsigned char eolbyte;
428 /* Cache of char-context values. */
429 char sbit[NOTCHAR];
431 /* If never_trail[B], the byte B cannot be a non-initial byte in a
432 multibyte character. */
433 bool never_trail[NOTCHAR];
435 /* Set of characters considered letters. */
436 charclass letters;
438 /* Set of characters that are newline. */
439 charclass newline;
442 /* Lexical analyzer. All the dross that deals with the obnoxious
443 GNU Regex syntax bits is located here. The poor, suffering
444 reader is referred to the GNU Regex documentation for the
445 meaning of the @#%!@#%^!@ syntax bits. */
446 struct lexer_state
448 char const *ptr; /* Pointer to next input character. */
449 idx_t left; /* Number of characters remaining. */
450 token lasttok; /* Previous token returned; initially END. */
451 idx_t parens; /* Count of outstanding left parens. */
452 int minrep, maxrep; /* Repeat counts for {m,n}. */
454 /* 32-bit wide character representation of the current multibyte character,
455 or WEOF if there was an encoding error. Used only if
456 MB_CUR_MAX > 1. */
457 wint_t wctok;
459 /* The most recently analyzed multibyte bracket expression. */
460 struct mb_char_classes brack;
462 /* We're separated from beginning or (, | only by zero-width characters. */
463 bool laststart;
466 /* Recursive descent parser for regular expressions. */
468 struct parser_state
470 token tok; /* Lookahead token. */
471 idx_t depth; /* Current depth of a hypothetical stack
472 holding deferred productions. This is
473 used to determine the depth that will be
474 required of the real stack later on in
475 dfaanalyze. */
478 /* A compiled regular expression. */
479 struct dfa
481 /* Fields filled by the scanner. */
482 charclass *charclasses; /* Array of character sets for CSET tokens. */
483 idx_t cindex; /* Index for adding new charclasses. */
484 idx_t calloc; /* Number of charclasses allocated. */
485 ptrdiff_t canychar; /* Index of anychar class, or -1. */
487 /* Scanner state */
488 struct lexer_state lex;
490 /* Parser state */
491 struct parser_state parse;
493 /* Fields filled by the parser. */
494 token *tokens; /* Postfix parse array. */
495 idx_t tindex; /* Index for adding new tokens. */
496 idx_t talloc; /* Number of tokens currently allocated. */
497 idx_t depth; /* Depth required of an evaluation stack
498 used for depth-first traversal of the
499 parse tree. */
500 idx_t nleaves; /* Number of non-EMPTY leaves
501 in the parse tree. */
502 idx_t nregexps; /* Count of parallel regexps being built
503 with dfaparse. */
504 bool fast; /* The DFA is fast. */
505 bool epsilon; /* Does a token match only the empty string? */
506 token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales. */
507 mbstate_t mbs; /* Multibyte conversion state. */
509 /* The following are valid only if MB_CUR_MAX > 1. */
511 /* The value of multibyte_prop[i] is defined by following rule.
512 if tokens[i] < NOTCHAR
513 bit 0 : tokens[i] is the first byte of a character, including
514 single-byte characters.
515 bit 1 : tokens[i] is the last byte of a character, including
516 single-byte characters.
518 e.g.
519 tokens
520 = 'single_byte_a', 'multi_byte_A', single_byte_b'
521 = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
522 multibyte_prop
523 = 3 , 1 , 0 , 2 , 3
525 char *multibyte_prop;
527 /* Fields filled by the superset. */
528 struct dfa *superset; /* Hint of the dfa. */
530 /* Fields filled by the state builder. */
531 dfa_state *states; /* States of the dfa. */
532 state_num sindex; /* Index for adding new states. */
533 idx_t salloc; /* Number of states currently allocated. */
535 /* Fields filled by the parse tree->NFA conversion. */
536 position_set *follows; /* Array of follow sets, indexed by position
537 index. The follow of a position is the set
538 of positions containing characters that
539 could conceivably follow a character
540 matching the given position in a string
541 matching the regexp. Allocated to the
542 maximum possible position index. */
543 bool searchflag; /* We are supposed to build a searching
544 as opposed to an exact matcher. A searching
545 matcher finds the first and shortest string
546 matching a regexp anywhere in the buffer,
547 whereas an exact matcher finds the longest
548 string matching, but anchored to the
549 beginning of the buffer. */
551 /* Fields filled by dfaanalyze. */
552 int *constraints; /* Array of union of accepting constraints
553 in the follow of a position. */
554 int *separates; /* Array of contexts on follow of a
555 position. */
557 /* Fields filled by dfaexec. */
558 state_num tralloc; /* Number of transition tables that have
559 slots so far, not counting trans[-1] and
560 trans[-2]. */
561 int trcount; /* Number of transition tables that have
562 been built, other than for initial
563 states. */
564 int min_trcount; /* Number of initial states. Equivalently,
565 the minimum state number for which trcount
566 counts transitions. */
567 state_num **trans; /* Transition tables for states that can
568 never accept. If the transitions for a
569 state have not yet been computed, or the
570 state could possibly accept, its entry in
571 this table is NULL. This points to two
572 past the start of the allocated array,
573 and trans[-1] and trans[-2] are always
574 NULL. */
575 state_num **fails; /* Transition tables after failing to accept
576 on a state that potentially could do so.
577 If trans[i] is non-null, fails[i] must
578 be null. */
579 char *success; /* Table of acceptance conditions used in
580 dfaexec and computed in build_state. */
581 state_num *newlines; /* Transitions on newlines. The entry for a
582 newline in any transition table is always
583 -1 so we can count lines without wasting
584 too many cycles. The transition for a
585 newline is stored separately and handled
586 as a special case. Newline is also used
587 as a sentinel at the end of the buffer. */
588 state_num initstate_notbol; /* Initial state for CTX_LETTER and CTX_NONE
589 context in multibyte locales, in which we
590 do not distinguish between their contexts,
591 as not supported word. */
592 position_set mb_follows; /* Follow set added by ANYCHAR on demand. */
593 state_num **mb_trans; /* Transition tables for states with
594 ANYCHAR. */
595 state_num mb_trcount; /* Number of transition tables for states with
596 ANYCHAR that have actually been built. */
598 /* Syntax configuration. This is near the end so that dfacopysyntax
599 can memset up to here. */
600 struct regex_syntax syntax;
602 /* Information derived from the locale. This is at the end so that
603 a quick memset need not clear it specially. */
605 /* dfaexec implementation. */
606 char *(*dfaexec) (struct dfa *, char const *, char *,
607 bool, idx_t *, bool *);
609 /* Other cached information derived from the locale. */
610 struct localeinfo localeinfo;
613 /* User access to dfa internals. */
615 /* S could possibly be an accepting state of R. */
616 static bool
617 accepting (state_num s, struct dfa const *r)
619 return r->states[s].constraint != 0;
622 /* STATE accepts in the specified context. */
623 static bool
624 accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
626 return succeeds_in_context (dfa->states[state].constraint, prev, curr);
629 static void regexp (struct dfa *dfa);
631 /* Store into *PWC the result of converting the leading bytes of the
632 multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
633 and updating the conversion state in *D. On conversion error,
634 convert just a single byte, to WEOF. Return the number of bytes
635 converted.
637 This differs from mbrtoc32 (PWC, S, N, &D->mbs) as follows:
639 * PWC points to wint_t, not to char32_t.
640 * The last arg is a dfa *D instead of merely a multibyte conversion
641 state D->mbs.
642 * N is idx_t not size_t, and must be at least 1.
643 * S[N - 1] must be a sentinel byte.
644 * Shift encodings are not supported.
645 * The return value is always in the range 1..N.
646 * D->mbs is always valid afterwards.
647 * *PWC is always set to something. */
648 static int
649 mbs_to_wchar (wint_t *pwc, char const *s, idx_t n, struct dfa *d)
651 unsigned char uc = s[0];
652 wint_t wc = d->localeinfo.sbctowc[uc];
654 if (wc == WEOF)
656 char32_t wch;
657 size_t nbytes = mbrtoc32 (&wch, s, n, &d->mbs);
658 if (0 < nbytes && nbytes < (size_t) -2)
660 *pwc = wch;
661 /* nbytes cannot be == (size) -3 here, since we rely on the
662 'mbrtoc32-regular' module. */
663 return nbytes;
665 mbszero (&d->mbs);
668 *pwc = wc;
669 return 1;
672 #ifdef DEBUG
674 static void
675 prtok (token t)
677 if (t <= END)
678 fprintf (stderr, "END");
679 else if (0 <= t && t < NOTCHAR)
681 unsigned int ch = t;
682 fprintf (stderr, "0x%02x", ch);
684 else
686 char const *s;
687 switch (t)
689 case BEG:
690 s = "BEG";
691 break;
692 case EMPTY:
693 s = "EMPTY";
694 break;
695 case BACKREF:
696 s = "BACKREF";
697 break;
698 case BEGLINE:
699 s = "BEGLINE";
700 break;
701 case ENDLINE:
702 s = "ENDLINE";
703 break;
704 case BEGWORD:
705 s = "BEGWORD";
706 break;
707 case ENDWORD:
708 s = "ENDWORD";
709 break;
710 case LIMWORD:
711 s = "LIMWORD";
712 break;
713 case NOTLIMWORD:
714 s = "NOTLIMWORD";
715 break;
716 case QMARK:
717 s = "QMARK";
718 break;
719 case STAR:
720 s = "STAR";
721 break;
722 case PLUS:
723 s = "PLUS";
724 break;
725 case CAT:
726 s = "CAT";
727 break;
728 case OR:
729 s = "OR";
730 break;
731 case LPAREN:
732 s = "LPAREN";
733 break;
734 case RPAREN:
735 s = "RPAREN";
736 break;
737 case ANYCHAR:
738 s = "ANYCHAR";
739 break;
740 case MBCSET:
741 s = "MBCSET";
742 break;
743 default:
744 s = "CSET";
745 break;
747 fprintf (stderr, "%s", s);
750 #endif /* DEBUG */
752 /* Stuff pertaining to charclasses. */
754 static bool
755 tstbit (unsigned int b, charclass const *c)
757 return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
760 static void
761 setbit (unsigned int b, charclass *c)
763 charclass_word one = 1;
764 c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
767 static void
768 clrbit (unsigned int b, charclass *c)
770 charclass_word one = 1;
771 c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
774 static void
775 zeroset (charclass *s)
777 memset (s, 0, sizeof *s);
780 static void
781 fillset (charclass *s)
783 for (int i = 0; i < CHARCLASS_WORDS; i++)
784 s->w[i] = CHARCLASS_WORD_MASK;
787 static void
788 notset (charclass *s)
790 for (int i = 0; i < CHARCLASS_WORDS; ++i)
791 s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
794 static bool
795 equal (charclass const *s1, charclass const *s2)
797 charclass_word w = 0;
798 for (int i = 0; i < CHARCLASS_WORDS; i++)
799 w |= s1->w[i] ^ s2->w[i];
800 return w == 0;
803 static bool
804 emptyset (charclass const *s)
806 charclass_word w = 0;
807 for (int i = 0; i < CHARCLASS_WORDS; i++)
808 w |= s->w[i];
809 return w == 0;
812 /* Ensure that the array addressed by PA holds at least I + 1 items.
813 Either return PA, or reallocate the array and return its new address.
814 Although PA may be null, the returned value is never null.
816 The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
817 is updated on reallocation. If PA is null, *NITEMS must be zero.
818 Do not allocate more than NITEMS_MAX items total; -1 means no limit.
819 ITEM_SIZE is the size of one item; it must be positive.
820 Avoid O(N**2) behavior on arrays growing linearly. */
821 static void *
822 maybe_realloc (void *pa, idx_t i, idx_t *nitems,
823 ptrdiff_t nitems_max, idx_t item_size)
825 if (i < *nitems)
826 return pa;
827 return xpalloc (pa, nitems, 1, nitems_max, item_size);
830 /* In DFA D, find the index of charclass S, or allocate a new one. */
831 static idx_t
832 charclass_index (struct dfa *d, charclass const *s)
834 idx_t i;
836 for (i = 0; i < d->cindex; ++i)
837 if (equal (s, &d->charclasses[i]))
838 return i;
839 d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
840 TOKEN_MAX - CSET, sizeof *d->charclasses);
841 ++d->cindex;
842 d->charclasses[i] = *s;
843 return i;
846 static bool
847 unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
849 return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
852 static int
853 char_context (struct dfa const *dfa, unsigned char c)
855 if (c == dfa->syntax.eolbyte && !(dfa->syntax.dfaopts & DFA_ANCHOR))
856 return CTX_NEWLINE;
857 if (unibyte_word_constituent (dfa, c))
858 return CTX_LETTER;
859 return CTX_NONE;
862 /* Set a bit in the charclass for the given char32_t. Do nothing if WC
863 is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
864 this may happen when folding case in weird Turkish locales where
865 dotless i/dotted I are not included in the chosen character set.
866 Return whether a bit was set in the charclass. */
867 static bool
868 setbit_wc (char32_t wc, charclass *c)
870 int b = c32tob (wc);
871 if (b < 0)
872 return false;
874 setbit (b, c);
875 return true;
878 /* Set a bit for B and its case variants in the charclass C.
879 MB_CUR_MAX must be 1. */
880 static void
881 setbit_case_fold_c (int b, charclass *c)
883 int ub = toupper (b);
884 for (int i = 0; i < NOTCHAR; i++)
885 if (toupper (i) == ub)
886 setbit (i, c);
889 /* Fetch the next lexical input character from the pattern. There
890 must at least one byte of pattern input. Set DFA->lex.wctok to the
891 value of the character or to WEOF depending on whether the input is
892 a valid multibyte character (possibly of length 1). Then return
893 the next input byte value, except return EOF if the input is a
894 multibyte character of length greater than 1. */
895 static int
896 fetch_wc (struct dfa *dfa)
898 int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
899 dfa);
900 int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
901 dfa->lex.ptr += nbytes;
902 dfa->lex.left -= nbytes;
903 return c;
906 /* If there is no more input, report an error about unbalanced brackets.
907 Otherwise, behave as with fetch_wc (DFA). */
908 static int
909 bracket_fetch_wc (struct dfa *dfa)
911 if (! dfa->lex.left)
912 dfaerror (_("unbalanced ["));
913 return fetch_wc (dfa);
916 typedef int predicate (int);
918 /* The following list maps the names of the Posix named character classes
919 to predicate functions that determine whether a given character is in
920 the class. The leading [ has already been eaten by the lexical
921 analyzer. */
922 struct dfa_ctype
924 const char *name;
925 predicate *func;
926 bool single_byte_only;
929 static const struct dfa_ctype prednames[] = {
930 {"alpha", isalpha, false},
931 {"upper", isupper, false},
932 {"lower", islower, false},
933 {"digit", isdigit, true},
934 {"xdigit", isxdigit, false},
935 {"space", isspace, false},
936 {"punct", ispunct, false},
937 {"alnum", isalnum, false},
938 {"print", isprint, false},
939 {"graph", isgraph, false},
940 {"cntrl", iscntrl, false},
941 {"blank", isblank, false},
942 {NULL, NULL, false}
945 static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
946 find_pred (const char *str)
948 for (int i = 0; prednames[i].name; i++)
949 if (str_eq (str, prednames[i].name))
950 return &prednames[i];
951 return NULL;
954 /* Parse a bracket expression, which possibly includes multibyte
955 characters. */
956 static token
957 parse_bracket_exp (struct dfa *dfa)
959 /* This is a bracket expression that dfaexec is known to
960 process correctly. */
961 bool known_bracket_exp = true;
963 /* Used to warn about [:space:].
964 Bit 0 = first character is a colon.
965 Bit 1 = last character is a colon.
966 Bit 2 = includes any other character but a colon.
967 Bit 3 = includes ranges, char/equiv classes or collation elements. */
968 int colon_warning_state;
970 dfa->lex.brack.nchars = 0;
971 charclass ccl;
972 zeroset (&ccl);
973 int c = bracket_fetch_wc (dfa);
974 bool invert = c == '^';
975 if (invert)
977 c = bracket_fetch_wc (dfa);
978 known_bracket_exp = dfa->localeinfo.simple;
980 wint_t wc = dfa->lex.wctok;
981 int c1;
982 wint_t wc1;
983 colon_warning_state = (c == ':');
986 c1 = NOTCHAR; /* Mark c1 as not initialized. */
987 colon_warning_state &= ~2;
989 /* Note that if we're looking at some other [:...:] construct,
990 we just treat it as a bunch of ordinary characters. We can do
991 this because we assume regex has checked for syntax errors before
992 dfa is ever called. */
993 if (c == '[')
995 c1 = bracket_fetch_wc (dfa);
996 wc1 = dfa->lex.wctok;
998 if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
999 || c1 == '.' || c1 == '=')
1001 enum { MAX_BRACKET_STRING_LEN = 32 };
1002 char str[MAX_BRACKET_STRING_LEN + 1];
1003 int len = 0;
1004 for (;;)
1006 c = bracket_fetch_wc (dfa);
1007 if (dfa->lex.left == 0
1008 || (c == c1 && dfa->lex.ptr[0] == ']'))
1009 break;
1010 if (len < MAX_BRACKET_STRING_LEN)
1011 str[len++] = c;
1012 else
1013 /* This is in any case an invalid class name. */
1014 str[0] = '\0';
1016 str[len] = '\0';
1018 /* Fetch bracket. */
1019 c = bracket_fetch_wc (dfa);
1020 wc = dfa->lex.wctok;
1021 if (c1 == ':')
1022 /* Build character class. POSIX allows character
1023 classes to match multicharacter collating elements,
1024 but the regex code does not support that, so do not
1025 worry about that possibility. */
1027 char const *class
1028 = (dfa->syntax.case_fold && (str_eq (str, "upper")
1029 || str_eq (str, "lower"))
1030 ? "alpha" : str);
1031 const struct dfa_ctype *pred = find_pred (class);
1032 if (!pred)
1033 dfaerror (_("invalid character class"));
1035 if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1036 known_bracket_exp = false;
1037 else
1038 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1039 if (pred->func (c2))
1040 setbit (c2, &ccl);
1042 else
1043 known_bracket_exp = false;
1045 colon_warning_state |= 8;
1047 /* Fetch new lookahead character. */
1048 c1 = bracket_fetch_wc (dfa);
1049 wc1 = dfa->lex.wctok;
1050 continue;
1053 /* We treat '[' as a normal character here. c/c1/wc/wc1
1054 are already set up. */
1057 if (c == '\\'
1058 && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1060 c = bracket_fetch_wc (dfa);
1061 wc = dfa->lex.wctok;
1064 if (c1 == NOTCHAR)
1066 c1 = bracket_fetch_wc (dfa);
1067 wc1 = dfa->lex.wctok;
1070 if (c1 == '-')
1071 /* build range characters. */
1073 int c2 = bracket_fetch_wc (dfa);
1074 wint_t wc2 = dfa->lex.wctok;
1076 /* A bracket expression like [a-[.aa.]] matches an unknown set.
1077 Treat it like [-a[.aa.]] while parsing it, and
1078 remember that the set is unknown. */
1079 if (c2 == '[' && dfa->lex.ptr[0] == '.')
1081 known_bracket_exp = false;
1082 c2 = ']';
1085 if (c2 == ']')
1087 /* In the case [x-], the - is an ordinary hyphen,
1088 which is left in c1, the lookahead character. */
1089 dfa->lex.ptr--;
1090 dfa->lex.left++;
1092 else
1094 if (c2 == '\\' && (dfa->syntax.syntax_bits
1095 & RE_BACKSLASH_ESCAPE_IN_LISTS))
1097 c2 = bracket_fetch_wc (dfa);
1098 wc2 = dfa->lex.wctok;
1101 colon_warning_state |= 8;
1102 c1 = bracket_fetch_wc (dfa);
1103 wc1 = dfa->lex.wctok;
1105 /* Treat [x-y] as a range if x != y. */
1106 if (wc != wc2 || wc == WEOF)
1108 if (dfa->localeinfo.simple
1109 || (c_isdigit (c) & c_isdigit (c2)))
1111 for (int ci = c; ci <= c2; ci++)
1112 if (dfa->syntax.case_fold && isalpha (ci))
1113 setbit_case_fold_c (ci, &ccl);
1114 else
1115 setbit (ci, &ccl);
1117 else
1118 known_bracket_exp = false;
1120 continue;
1125 colon_warning_state |= (c == ':') ? 2 : 4;
1127 if (!dfa->localeinfo.multibyte)
1129 if (dfa->syntax.case_fold && isalpha (c))
1130 setbit_case_fold_c (c, &ccl);
1131 else
1132 setbit (c, &ccl);
1133 continue;
1136 if (wc == WEOF)
1137 known_bracket_exp = false;
1138 else
1140 char32_t folded[CASE_FOLDED_BUFSIZE + 1];
1141 int n = (dfa->syntax.case_fold
1142 ? case_folded_counterparts (wc, folded + 1) + 1
1143 : 1);
1144 folded[0] = wc;
1145 for (int i = 0; i < n; i++)
1146 if (!setbit_wc (folded[i], &ccl))
1148 dfa->lex.brack.chars
1149 = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1150 &dfa->lex.brack.nchars_alloc, -1,
1151 sizeof *dfa->lex.brack.chars);
1152 dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1156 while ((wc = wc1, (c = c1) != ']'));
1158 if (colon_warning_state == 7)
1160 char const *msg
1161 = _("character class syntax is [[:space:]], not [:space:]");
1162 if (dfa->syntax.dfaopts & DFA_CONFUSING_BRACKETS_ERROR)
1163 dfaerror (msg);
1164 else
1165 dfawarn (msg);
1168 if (! known_bracket_exp)
1169 return BACKREF;
1171 if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
1173 dfa->lex.brack.invert = invert;
1174 dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1175 return MBCSET;
1178 if (invert)
1180 notset (&ccl);
1181 if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1182 clrbit ('\n', &ccl);
1185 return CSET + charclass_index (dfa, &ccl);
1188 struct lexptr
1190 char const *ptr;
1191 idx_t left;
1194 static void
1195 push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
1197 ls->ptr = dfa->lex.ptr;
1198 ls->left = dfa->lex.left;
1199 dfa->lex.ptr = s;
1200 dfa->lex.left = strlen (s);
1203 static void
1204 pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
1206 dfa->lex.ptr = ls->ptr;
1207 dfa->lex.left = ls->left;
1210 static token
1211 lex (struct dfa *dfa)
1213 bool backslash = false;
1215 /* Basic plan: We fetch a character. If it's a backslash,
1216 we set the backslash flag and go through the loop again.
1217 On the plus side, this avoids having a duplicate of the
1218 main switch inside the backslash case. On the minus side,
1219 it means that just about every case tests the backslash flag. */
1220 for (int i = 0; ; i++)
1222 /* This loop should consume at most a backslash and some other
1223 character. */
1224 if (2 <= i)
1225 abort ();
1227 if (! dfa->lex.left)
1228 return dfa->lex.lasttok = END;
1229 int c = fetch_wc (dfa);
1231 switch (c)
1233 case '\\':
1234 if (backslash)
1235 goto normal_char;
1236 if (dfa->lex.left == 0)
1237 dfaerror (_("unfinished \\ escape"));
1238 backslash = true;
1239 break;
1241 case '^':
1242 if (backslash)
1243 goto normal_char;
1244 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1245 || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
1246 || dfa->lex.lasttok == OR)
1247 return dfa->lex.lasttok = BEGLINE;
1248 goto normal_char;
1250 case '$':
1251 if (backslash)
1252 goto normal_char;
1253 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1254 || dfa->lex.left == 0
1255 || ((dfa->lex.left
1256 > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1257 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1258 & (dfa->lex.ptr[0] == '\\')]
1259 == ')'))
1260 || ((dfa->lex.left
1261 > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1262 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1263 & (dfa->lex.ptr[0] == '\\')]
1264 == '|'))
1265 || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1266 && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1267 return dfa->lex.lasttok = ENDLINE;
1268 goto normal_char;
1270 case '1':
1271 case '2':
1272 case '3':
1273 case '4':
1274 case '5':
1275 case '6':
1276 case '7':
1277 case '8':
1278 case '9':
1279 if (!backslash)
1280 goto normal_char;
1281 if (dfa->syntax.syntax_bits & RE_NO_BK_REFS)
1282 goto stray_backslash;
1284 dfa->lex.laststart = false;
1285 return dfa->lex.lasttok = BACKREF;
1287 case '`':
1288 if (!backslash)
1289 goto normal_char;
1290 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1291 goto stray_backslash;
1293 /* FIXME: should be beginning of string */
1294 return dfa->lex.lasttok = BEGLINE;
1296 case '\'':
1297 if (!backslash)
1298 goto normal_char;
1299 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1300 goto stray_backslash;
1302 /* FIXME: should be end of string */
1303 return dfa->lex.lasttok = ENDLINE;
1305 case '<':
1306 if (!backslash)
1307 goto normal_char;
1308 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1309 goto stray_backslash;
1311 return dfa->lex.lasttok = BEGWORD;
1313 case '>':
1314 if (!backslash)
1315 goto normal_char;
1316 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1317 goto stray_backslash;
1319 return dfa->lex.lasttok = ENDWORD;
1321 case 'b':
1322 if (!backslash)
1323 goto normal_char;
1324 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1325 goto stray_backslash;
1327 return dfa->lex.lasttok = LIMWORD;
1329 case 'B':
1330 if (!backslash)
1331 goto normal_char;
1332 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1333 goto stray_backslash;
1335 return dfa->lex.lasttok = NOTLIMWORD;
1337 case '?':
1338 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1339 goto default_case;
1340 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1341 goto normal_char;
1342 if (dfa->lex.laststart)
1344 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1345 goto default_case;
1346 if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1347 dfawarn (_("? at start of expression"));
1349 return dfa->lex.lasttok = QMARK;
1351 case '*':
1352 if (backslash)
1353 goto normal_char;
1354 if (dfa->lex.laststart)
1356 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1357 goto default_case;
1358 if (dfa->syntax.dfaopts & DFA_STAR_WARN)
1359 dfawarn (_("* at start of expression"));
1361 return dfa->lex.lasttok = STAR;
1363 case '+':
1364 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1365 goto default_case;
1366 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1367 goto normal_char;
1368 if (dfa->lex.laststart)
1370 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1371 goto default_case;
1372 if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1373 dfawarn (_("+ at start of expression"));
1375 return dfa->lex.lasttok = PLUS;
1377 case '{':
1378 if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1379 goto default_case;
1380 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1381 goto normal_char;
1383 /* Cases:
1384 {M} - exact count
1385 {M,} - minimum count, maximum is infinity
1386 {,N} - 0 through N
1387 {,} - 0 to infinity (same as '*')
1388 {M,N} - M through N */
1390 char const *p = dfa->lex.ptr;
1391 char const *lim = p + dfa->lex.left;
1392 dfa->lex.minrep = dfa->lex.maxrep = -1;
1393 for (; p != lim && c_isdigit (*p); p++)
1394 dfa->lex.minrep = (dfa->lex.minrep < 0
1395 ? *p - '0'
1396 : MIN (RE_DUP_MAX + 1,
1397 dfa->lex.minrep * 10 + *p - '0'));
1398 if (p != lim)
1400 if (*p != ',')
1401 dfa->lex.maxrep = dfa->lex.minrep;
1402 else
1404 if (dfa->lex.minrep < 0)
1405 dfa->lex.minrep = 0;
1406 while (++p != lim && c_isdigit (*p))
1407 dfa->lex.maxrep
1408 = (dfa->lex.maxrep < 0
1409 ? *p - '0'
1410 : MIN (RE_DUP_MAX + 1,
1411 dfa->lex.maxrep * 10 + *p - '0'));
1414 bool invalid_content
1415 = ! ((! backslash || (p != lim && *p++ == '\\'))
1416 && p != lim && *p++ == '}'
1417 && 0 <= dfa->lex.minrep
1418 && (dfa->lex.maxrep < 0
1419 || dfa->lex.minrep <= dfa->lex.maxrep));
1420 if (invalid_content
1421 && (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD))
1422 goto normal_char;
1423 if (dfa->lex.laststart)
1425 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS))
1426 goto default_case;
1427 if (dfa->syntax.dfaopts & DFA_PLUS_WARN)
1428 dfawarn (_("{...} at start of expression"));
1430 if (invalid_content)
1431 dfaerror (_("invalid content of \\{\\}"));
1432 if (RE_DUP_MAX < dfa->lex.maxrep)
1433 dfaerror (_("regular expression too big"));
1434 dfa->lex.ptr = p;
1435 dfa->lex.left = lim - p;
1437 dfa->lex.laststart = false;
1438 return dfa->lex.lasttok = REPMN;
1440 case '|':
1441 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1442 goto default_case;
1443 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1444 goto normal_char;
1445 dfa->lex.laststart = true;
1446 return dfa->lex.lasttok = OR;
1448 case '\n':
1449 if (!(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1450 goto default_case;
1451 if (backslash)
1452 goto normal_char;
1453 dfa->lex.laststart = true;
1454 return dfa->lex.lasttok = OR;
1456 case '(':
1457 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1458 goto normal_char;
1459 dfa->lex.parens++;
1460 dfa->lex.laststart = true;
1461 return dfa->lex.lasttok = LPAREN;
1463 case ')':
1464 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1465 goto normal_char;
1466 if (dfa->lex.parens == 0
1467 && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1468 goto normal_char;
1469 dfa->lex.parens--;
1470 dfa->lex.laststart = false;
1471 return dfa->lex.lasttok = RPAREN;
1473 case '.':
1474 if (backslash)
1475 goto normal_char;
1476 if (dfa->canychar < 0)
1478 charclass ccl;
1479 fillset (&ccl);
1480 if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1481 clrbit ('\n', &ccl);
1482 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1483 clrbit ('\0', &ccl);
1484 if (dfa->localeinfo.multibyte)
1485 for (int c2 = 0; c2 < NOTCHAR; c2++)
1486 if (dfa->localeinfo.sbctowc[c2] == WEOF)
1487 clrbit (c2, &ccl);
1488 dfa->canychar = charclass_index (dfa, &ccl);
1490 dfa->lex.laststart = false;
1491 return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1492 ? ANYCHAR
1493 : CSET + dfa->canychar);
1495 case 's':
1496 case 'S':
1497 if (!backslash)
1498 goto normal_char;
1499 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1500 goto stray_backslash;
1502 if (!dfa->localeinfo.multibyte)
1504 charclass ccl;
1505 zeroset (&ccl);
1506 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1507 if (isspace (c2))
1508 setbit (c2, &ccl);
1509 if (c == 'S')
1510 notset (&ccl);
1511 dfa->lex.laststart = false;
1512 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1515 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1516 add_utf8_anychar, makes sense. */
1518 /* \s and \S are documented to be equivalent to [[:space:]] and
1519 [^[:space:]] respectively, so tell the lexer to process those
1520 strings, each minus its "already processed" '['. */
1522 struct lexptr ls;
1523 push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1524 dfa->lex.lasttok = parse_bracket_exp (dfa);
1525 pop_lex_state (dfa, &ls);
1528 dfa->lex.laststart = false;
1529 return dfa->lex.lasttok;
1531 case 'w':
1532 case 'W':
1533 if (!backslash)
1534 goto normal_char;
1535 if (dfa->syntax.syntax_bits & RE_NO_GNU_OPS)
1536 goto stray_backslash;
1538 if (!dfa->localeinfo.multibyte)
1540 charclass ccl;
1541 zeroset (&ccl);
1542 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1543 if (dfa->syntax.sbit[c2] == CTX_LETTER)
1544 setbit (c2, &ccl);
1545 if (c == 'W')
1546 notset (&ccl);
1547 dfa->lex.laststart = false;
1548 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1551 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1552 add_utf8_anychar, makes sense. */
1554 /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1555 [^_[:alnum:]] respectively, so tell the lexer to process those
1556 strings, each minus its "already processed" '['. */
1558 struct lexptr ls;
1559 push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1560 dfa->lex.lasttok = parse_bracket_exp (dfa);
1561 pop_lex_state (dfa, &ls);
1564 dfa->lex.laststart = false;
1565 return dfa->lex.lasttok;
1567 case '[':
1568 if (backslash)
1569 goto normal_char;
1570 dfa->lex.laststart = false;
1571 return dfa->lex.lasttok = parse_bracket_exp (dfa);
1573 default:
1574 default_case:
1575 if (!backslash)
1576 goto normal_char;
1577 stray_backslash:
1578 if (dfa->syntax.dfaopts & DFA_STRAY_BACKSLASH_WARN)
1580 char const *msg;
1581 char msgbuf[100];
1582 if (!c32isprint (dfa->lex.wctok))
1583 msg = _("stray \\ before unprintable character");
1584 else if (c32isspace (dfa->lex.wctok))
1585 msg = _("stray \\ before white space");
1586 else
1588 char buf[MB_LEN_MAX + 1];
1589 mbstate_t s;
1590 mbszero (&s);
1591 size_t stored_bytes = c32rtomb (buf, dfa->lex.wctok, &s);
1592 if (stored_bytes < (size_t) -1)
1594 buf[stored_bytes] = '\0';
1595 int n = snprintf (msgbuf, sizeof msgbuf,
1596 _("stray \\ before %s"), buf);
1597 msg = 0 <= n && n < sizeof msgbuf ? msgbuf : _("stray \\");
1599 else
1600 msg = _("stray \\");
1602 dfawarn (msg);
1604 FALLTHROUGH;
1605 case ']': case '}':
1606 normal_char:
1607 dfa->lex.laststart = false;
1608 /* For multibyte character sets, folding is done in atom. Always
1609 return WCHAR. */
1610 if (dfa->localeinfo.multibyte)
1611 return dfa->lex.lasttok = WCHAR;
1613 if (dfa->syntax.case_fold && isalpha (c))
1615 charclass ccl;
1616 zeroset (&ccl);
1617 setbit_case_fold_c (c, &ccl);
1618 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1621 return dfa->lex.lasttok = c;
1626 static void
1627 addtok_mb (struct dfa *dfa, token t, char mbprop)
1629 if (dfa->talloc == dfa->tindex)
1631 dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
1632 sizeof *dfa->tokens);
1633 if (dfa->localeinfo.multibyte)
1634 dfa->multibyte_prop = xreallocarray (dfa->multibyte_prop, dfa->talloc,
1635 sizeof *dfa->multibyte_prop);
1637 if (dfa->localeinfo.multibyte)
1638 dfa->multibyte_prop[dfa->tindex] = mbprop;
1639 dfa->tokens[dfa->tindex++] = t;
1641 switch (t)
1643 case QMARK:
1644 case STAR:
1645 case PLUS:
1646 break;
1648 case CAT:
1649 case OR:
1650 dfa->parse.depth--;
1651 break;
1653 case EMPTY:
1654 dfa->epsilon = true;
1655 goto increment_depth;
1657 case BACKREF:
1658 dfa->fast = false;
1659 goto increment_nleaves;
1661 case BEGLINE:
1662 case ENDLINE:
1663 case BEGWORD:
1664 case ENDWORD:
1665 case LIMWORD:
1666 case NOTLIMWORD:
1667 dfa->epsilon = true;
1668 FALLTHROUGH;
1669 default:
1670 increment_nleaves:
1671 dfa->nleaves++;
1672 increment_depth:
1673 dfa->parse.depth++;
1674 if (dfa->depth < dfa->parse.depth)
1675 dfa->depth = dfa->parse.depth;
1676 break;
1680 static void addtok_wc (struct dfa *dfa, wint_t wc);
1682 /* Add the given token to the parse tree, maintaining the depth count and
1683 updating the maximum depth if necessary. */
1684 static void
1685 addtok (struct dfa *dfa, token t)
1687 if (dfa->localeinfo.multibyte && t == MBCSET)
1689 bool need_or = false;
1691 /* Extract wide characters into alternations for better performance.
1692 This does not require UTF-8. */
1693 for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
1695 addtok_wc (dfa, dfa->lex.brack.chars[i]);
1696 if (need_or)
1697 addtok (dfa, OR);
1698 need_or = true;
1700 dfa->lex.brack.nchars = 0;
1702 /* Wide characters have been handled above, so it is possible
1703 that the set is empty now. Do nothing in that case. */
1704 if (dfa->lex.brack.cset != -1)
1706 addtok (dfa, CSET + dfa->lex.brack.cset);
1707 if (need_or)
1708 addtok (dfa, OR);
1711 else
1713 addtok_mb (dfa, t, 3);
1717 /* We treat a multibyte character as a single atom, so that DFA
1718 can treat a multibyte character as a single expression.
1720 e.g., we construct the following tree from "<mb1><mb2>".
1721 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1722 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1723 static void
1724 addtok_wc (struct dfa *dfa, wint_t wc)
1726 unsigned char buf[MB_LEN_MAX];
1727 mbstate_t s;
1728 mbszero (&s);
1729 size_t stored_bytes = c32rtomb ((char *) buf, wc, &s);
1730 int buflen;
1732 if (stored_bytes != (size_t) -1)
1733 buflen = stored_bytes;
1734 else
1736 /* This is merely stop-gap. buf[0] is undefined, yet skipping
1737 the addtok_mb call altogether can corrupt the heap. */
1738 buflen = 1;
1739 buf[0] = 0;
1742 addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
1743 for (int i = 1; i < buflen; i++)
1745 addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
1746 addtok (dfa, CAT);
1750 static void
1751 add_utf8_anychar (struct dfa *dfa)
1753 /* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
1754 UTF-8 byte sequence has been defined as follows:
1756 ([\x00-\x7f]
1757 |[\xc2-\xdf][\x80-\xbf]
1758 |[\xe0][\xa0-\xbf][\x80-\xbf]
1759 |[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
1760 |[\xed][\x80-\x9f][\x80-\xbf]
1761 |[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
1762 |[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
1763 |[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
1765 which I'll write more concisely "A|BC|DEC|FCC|GHC|IJCC|KCCC|LMCC",
1766 where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
1767 D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
1768 H = [\x80-\x9f], I = [\xf0],
1769 J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
1771 This can be refactored to "A|(B|DE|GH|(F|IJ|LM|KC)C)C". */
1773 /* Mnemonics for classes containing two or more bytes. */
1774 enum { A, B, C, E, F, H, J, K, M };
1776 /* Mnemonics for single-byte tokens. */
1777 enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
1779 static charclass const utf8_classes[] = {
1780 /* A. 00-7f: 1-byte sequence. */
1781 CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1783 /* B. c2-df: 1st byte of a 2-byte sequence. */
1784 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1786 /* C. 80-bf: non-leading bytes. */
1787 CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1789 /* D. e0 (just a token). */
1791 /* E. a0-bf: 2nd byte of a "DEC" sequence. */
1792 CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
1794 /* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence. */
1795 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
1797 /* G. ed (just a token). */
1799 /* H. 80-9f: 2nd byte of a "GHC" sequence. */
1800 CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0, 0, 0),
1802 /* I. f0 (just a token). */
1804 /* J. 90-bf: 2nd byte of an "IJCC" sequence. */
1805 CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
1807 /* K. f1-f3: 1st byte of a "KCCC" sequence. */
1808 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
1810 /* L. f4 (just a token). */
1812 /* M. 80-8f: 2nd byte of a "LMCC" sequence. */
1813 CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
1816 /* Define the character classes that are needed below. */
1817 if (dfa->utf8_anychar_classes[0] == 0)
1819 charclass c = utf8_classes[0];
1820 if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1821 clrbit ('\n', &c);
1822 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1823 clrbit ('\0', &c);
1824 dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
1826 for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
1827 dfa->utf8_anychar_classes[i]
1828 = CSET + charclass_index (dfa, &utf8_classes[i]);
1831 /* Implement the "A|(B|DE|GH|(F|IJ|LM|KC)C)C" pattern mentioned above.
1832 The token buffer is in reverse Polish order, so we get
1833 "A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
1834 C CAT OR C CAT OR C CAT OR". */
1835 addtok (dfa, dfa->utf8_anychar_classes[A]);
1836 addtok (dfa, dfa->utf8_anychar_classes[B]);
1837 addtok (dfa, D_token);
1838 addtok (dfa, dfa->utf8_anychar_classes[E]);
1839 addtok (dfa, CAT);
1840 addtok (dfa, OR);
1841 addtok (dfa, G_token);
1842 addtok (dfa, dfa->utf8_anychar_classes[H]);
1843 addtok (dfa, CAT);
1844 addtok (dfa, OR);
1845 addtok (dfa, dfa->utf8_anychar_classes[F]);
1846 addtok (dfa, I_token);
1847 addtok (dfa, dfa->utf8_anychar_classes[J]);
1848 addtok (dfa, CAT);
1849 addtok (dfa, OR);
1850 addtok (dfa, L_token);
1851 addtok (dfa, dfa->utf8_anychar_classes[M]);
1852 addtok (dfa, CAT);
1853 addtok (dfa, OR);
1854 addtok (dfa, dfa->utf8_anychar_classes[K]);
1855 for (int i = 0; i < 3; i++)
1857 addtok (dfa, dfa->utf8_anychar_classes[C]);
1858 addtok (dfa, CAT);
1859 addtok (dfa, OR);
1863 /* The grammar understood by the parser is as follows.
1865 regexp:
1866 regexp OR branch
1867 branch
1869 branch:
1870 branch closure
1871 closure
1873 closure:
1874 closure QMARK
1875 closure STAR
1876 closure PLUS
1877 closure REPMN
1878 atom
1880 atom:
1881 <normal character>
1882 <multibyte character>
1883 ANYCHAR
1884 MBCSET
1885 CSET
1886 BACKREF
1887 BEGLINE
1888 ENDLINE
1889 BEGWORD
1890 ENDWORD
1891 LIMWORD
1892 NOTLIMWORD
1893 LPAREN regexp RPAREN
1894 <empty>
1896 The parser builds a parse tree in postfix form in an array of tokens. */
1898 static void
1899 atom (struct dfa *dfa)
1901 if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1902 || dfa->parse.tok >= CSET
1903 || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
1904 || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1905 || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
1906 || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
1907 || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
1909 if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1911 /* For UTF-8 expand the period to a series of CSETs that define a
1912 valid UTF-8 character. This avoids using the slow multibyte
1913 path. I'm pretty sure it would be both profitable and correct to
1914 do it for any encoding; however, the optimization must be done
1915 manually as it is done above in add_utf8_anychar. So, let's
1916 start with UTF-8: it is the most used, and the structure of the
1917 encoding makes the correctness more obvious. */
1918 add_utf8_anychar (dfa);
1920 else
1921 addtok (dfa, dfa->parse.tok);
1922 dfa->parse.tok = lex (dfa);
1924 else if (dfa->parse.tok == WCHAR)
1926 if (dfa->lex.wctok == WEOF)
1927 addtok (dfa, BACKREF);
1928 else
1930 addtok_wc (dfa, dfa->lex.wctok);
1932 if (dfa->syntax.case_fold)
1934 char32_t folded[CASE_FOLDED_BUFSIZE];
1935 int n = case_folded_counterparts (dfa->lex.wctok, folded);
1936 for (int i = 0; i < n; i++)
1938 addtok_wc (dfa, folded[i]);
1939 addtok (dfa, OR);
1944 dfa->parse.tok = lex (dfa);
1946 else if (dfa->parse.tok == LPAREN)
1948 dfa->parse.tok = lex (dfa);
1949 regexp (dfa);
1950 if (dfa->parse.tok != RPAREN)
1951 dfaerror (_("unbalanced ("));
1952 dfa->parse.tok = lex (dfa);
1954 else
1955 addtok (dfa, EMPTY);
1958 /* Return the number of tokens in the given subexpression. */
1959 static idx_t _GL_ATTRIBUTE_PURE
1960 nsubtoks (struct dfa const *dfa, idx_t tindex)
1962 switch (dfa->tokens[tindex - 1])
1964 default:
1965 return 1;
1966 case QMARK:
1967 case STAR:
1968 case PLUS:
1969 return 1 + nsubtoks (dfa, tindex - 1);
1970 case CAT:
1971 case OR:
1973 idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
1974 return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1979 /* Copy the given subexpression to the top of the tree. */
1980 static void
1981 copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
1983 if (dfa->localeinfo.multibyte)
1984 for (idx_t i = 0; i < ntokens; i++)
1985 addtok_mb (dfa, dfa->tokens[tindex + i],
1986 dfa->multibyte_prop[tindex + i]);
1987 else
1988 for (idx_t i = 0; i < ntokens; i++)
1989 addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1992 static void
1993 closure (struct dfa *dfa)
1995 atom (dfa);
1996 while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1997 || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1998 if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
2000 idx_t ntokens = nsubtoks (dfa, dfa->tindex);
2001 idx_t tindex = dfa->tindex - ntokens;
2002 if (dfa->lex.maxrep < 0)
2003 addtok (dfa, PLUS);
2004 if (dfa->lex.minrep == 0)
2005 addtok (dfa, QMARK);
2006 int i;
2007 for (i = 1; i < dfa->lex.minrep; i++)
2009 copytoks (dfa, tindex, ntokens);
2010 addtok (dfa, CAT);
2012 for (; i < dfa->lex.maxrep; i++)
2014 copytoks (dfa, tindex, ntokens);
2015 addtok (dfa, QMARK);
2016 addtok (dfa, CAT);
2018 dfa->parse.tok = lex (dfa);
2020 else if (dfa->parse.tok == REPMN)
2022 dfa->tindex -= nsubtoks (dfa, dfa->tindex);
2023 dfa->parse.tok = lex (dfa);
2024 closure (dfa);
2026 else
2028 addtok (dfa, dfa->parse.tok);
2029 dfa->parse.tok = lex (dfa);
2033 static void
2034 branch (struct dfa* dfa)
2036 closure (dfa);
2037 while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
2038 && dfa->parse.tok >= 0)
2040 closure (dfa);
2041 addtok (dfa, CAT);
2045 static void
2046 regexp (struct dfa *dfa)
2048 branch (dfa);
2049 while (dfa->parse.tok == OR)
2051 dfa->parse.tok = lex (dfa);
2052 branch (dfa);
2053 addtok (dfa, OR);
2057 /* Parse a string S of length LEN into D. S can include NUL characters.
2058 This is the main entry point for the parser. */
2059 void
2060 dfaparse (char const *s, idx_t len, struct dfa *d)
2062 d->lex.ptr = s;
2063 d->lex.left = len;
2064 d->lex.lasttok = END;
2065 d->lex.laststart = true;
2067 if (!d->syntax.syntax_bits_set)
2068 dfaerror (_("no syntax specified"));
2070 if (!d->nregexps)
2071 addtok (d, BEG);
2073 d->parse.tok = lex (d);
2074 d->parse.depth = d->depth;
2076 regexp (d);
2078 if (d->parse.tok != END)
2079 dfaerror (_("unbalanced )"));
2081 addtok (d, END - d->nregexps);
2082 addtok (d, CAT);
2084 if (d->nregexps)
2085 addtok (d, OR);
2087 ++d->nregexps;
2090 /* Some primitives for operating on sets of positions. */
2092 /* Copy one set to another. */
2093 static void
2094 copy (position_set const *src, position_set *dst)
2096 if (dst->alloc < src->nelem)
2098 free (dst->elems);
2099 dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2100 sizeof *dst->elems);
2102 dst->nelem = src->nelem;
2103 if (src->nelem != 0)
2104 memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2107 static void
2108 alloc_position_set (position_set *s, idx_t size)
2110 s->elems = xnmalloc (size, sizeof *s->elems);
2111 s->alloc = size;
2112 s->nelem = 0;
2115 /* Insert position P in set S. S is maintained in sorted order on
2116 decreasing index. If there is already an entry in S with P.index
2117 then merge (logically-OR) P's constraints into the one in S.
2118 S->elems must point to an array large enough to hold the resulting set. */
2119 static void
2120 insert (position p, position_set *s)
2122 idx_t count = s->nelem;
2123 idx_t lo = 0, hi = count;
2124 while (lo < hi)
2126 idx_t mid = (lo + hi) >> 1;
2127 if (s->elems[mid].index < p.index)
2128 lo = mid + 1;
2129 else if (s->elems[mid].index == p.index)
2131 s->elems[mid].constraint |= p.constraint;
2132 return;
2134 else
2135 hi = mid;
2138 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2139 for (idx_t i = count; i > lo; i--)
2140 s->elems[i] = s->elems[i - 1];
2141 s->elems[lo] = p;
2142 ++s->nelem;
2145 static void
2146 append (position p, position_set *s)
2148 idx_t count = s->nelem;
2149 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2150 s->elems[s->nelem++] = p;
2153 /* Merge S1 and S2 (with the additional constraint C2) into M. The
2154 result is as if the positions of S1, and of S2 with the additional
2155 constraint C2, were inserted into an initially empty set. */
2156 static void
2157 merge_constrained (position_set const *s1, position_set const *s2,
2158 unsigned int c2, position_set *m)
2160 idx_t i = 0, j = 0;
2162 if (m->alloc - s1->nelem < s2->nelem)
2164 free (m->elems);
2165 m->alloc = s1->nelem;
2166 m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2168 m->nelem = 0;
2169 while (i < s1->nelem || j < s2->nelem)
2170 if (! (j < s2->nelem)
2171 || (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2173 unsigned int c = ((i < s1->nelem && j < s2->nelem
2174 && s1->elems[i].index == s2->elems[j].index)
2175 ? s2->elems[j++].constraint & c2
2176 : 0);
2177 m->elems[m->nelem].index = s1->elems[i].index;
2178 m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c;
2180 else
2182 if (s2->elems[j].constraint & c2)
2184 m->elems[m->nelem].index = s2->elems[j].index;
2185 m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2187 j++;
2191 /* Merge two sets of positions into a third. The result is exactly as if
2192 the positions of both sets were inserted into an initially empty set. */
2193 static void
2194 merge (position_set const *s1, position_set const *s2, position_set *m)
2196 merge_constrained (s1, s2, -1, m);
2199 /* Merge into DST all the elements of SRC, possibly destroying
2200 the contents of the temporary M. */
2201 static void
2202 merge2 (position_set *dst, position_set const *src, position_set *m)
2204 if (src->nelem < 4)
2206 for (idx_t i = 0; i < src->nelem; i++)
2207 insert (src->elems[i], dst);
2209 else
2211 merge (src, dst, m);
2212 copy (m, dst);
2216 /* Delete a position from a set. Return the nonzero constraint of the
2217 deleted position, or zero if there was no such position. */
2218 static unsigned int
2219 delete (idx_t del, position_set *s)
2221 idx_t count = s->nelem;
2222 idx_t lo = 0, hi = count;
2223 while (lo < hi)
2225 idx_t mid = (lo + hi) >> 1;
2226 if (s->elems[mid].index < del)
2227 lo = mid + 1;
2228 else if (s->elems[mid].index == del)
2230 unsigned int c = s->elems[mid].constraint;
2231 idx_t i;
2232 for (i = mid; i + 1 < count; i++)
2233 s->elems[i] = s->elems[i + 1];
2234 s->nelem = i;
2235 return c;
2237 else
2238 hi = mid;
2240 return 0;
2243 /* Replace a position with the followed set. */
2244 static void
2245 replace (position_set *dst, idx_t del, position_set *add,
2246 unsigned int constraint, position_set *tmp)
2248 unsigned int c = delete (del, dst) & constraint;
2250 if (c)
2252 copy (dst, tmp);
2253 merge_constrained (tmp, add, c, dst);
2257 /* Find the index of the state corresponding to the given position set with
2258 the given preceding context, or create a new state if there is no such
2259 state. Context tells whether we got here on a newline or letter. */
2260 static state_num
2261 state_index (struct dfa *d, position_set const *s, int context)
2263 size_t hash = 0;
2264 int constraint = 0;
2265 state_num i;
2267 for (i = 0; i < s->nelem; ++i)
2269 idx_t ind = s->elems[i].index;
2270 hash ^= ind + s->elems[i].constraint;
2273 /* Try to find a state that exactly matches the proposed one. */
2274 for (i = 0; i < d->sindex; ++i)
2276 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2277 || context != d->states[i].context)
2278 continue;
2279 state_num j;
2280 for (j = 0; j < s->nelem; ++j)
2281 if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2282 || s->elems[j].index != d->states[i].elems.elems[j].index)
2283 break;
2284 if (j == s->nelem)
2285 return i;
2288 #ifdef DEBUG
2289 fprintf (stderr, "new state %td\n nextpos:", i);
2290 for (state_num j = 0; j < s->nelem; j++)
2292 fprintf (stderr, " %td:", s->elems[j].index);
2293 prtok (d->tokens[s->elems[j].index]);
2295 fprintf (stderr, "\n context:");
2296 if (context ^ CTX_ANY)
2298 if (context & CTX_NONE)
2299 fprintf (stderr, " CTX_NONE");
2300 if (context & CTX_LETTER)
2301 fprintf (stderr, " CTX_LETTER");
2302 if (context & CTX_NEWLINE)
2303 fprintf (stderr, " CTX_NEWLINE");
2305 else
2306 fprintf (stderr, " CTX_ANY");
2307 fprintf (stderr, "\n");
2308 #endif
2310 for (state_num j = 0; j < s->nelem; j++)
2312 int c = d->constraints[s->elems[j].index];
2314 if (c != 0)
2316 if (succeeds_in_context (c, context, CTX_ANY))
2317 constraint |= c;
2319 else if (d->tokens[s->elems[j].index] == BACKREF)
2320 constraint = NO_CONSTRAINT;
2324 /* Create a new state. */
2325 d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2326 sizeof *d->states);
2327 d->states[i].hash = hash;
2328 alloc_position_set (&d->states[i].elems, s->nelem);
2329 copy (s, &d->states[i].elems);
2330 d->states[i].context = context;
2331 d->states[i].constraint = constraint;
2332 d->states[i].mbps.nelem = 0;
2333 d->states[i].mbps.elems = NULL;
2334 d->states[i].mb_trindex = -1;
2336 ++d->sindex;
2338 return i;
2341 /* Find the epsilon closure of D's set of positions. If any position of the set
2342 contains a symbol that matches the empty string in some context, replace
2343 that position with the elements of its follow labeled with an appropriate
2344 constraint. Repeat exhaustively until no funny positions are left.
2345 S->elems must be large enough to hold the result. BACKWARD is D's
2346 backward set; use and update it too. */
2347 static void
2348 epsclosure (struct dfa const *d, position_set *backward)
2350 position_set tmp;
2351 alloc_position_set (&tmp, d->nleaves);
2352 for (idx_t i = 0; i < d->tindex; i++)
2353 if (0 < d->follows[i].nelem)
2355 unsigned int constraint;
2356 switch (d->tokens[i])
2358 default:
2359 continue;
2361 case BEGLINE:
2362 constraint = BEGLINE_CONSTRAINT;
2363 break;
2364 case ENDLINE:
2365 constraint = ENDLINE_CONSTRAINT;
2366 break;
2367 case BEGWORD:
2368 constraint = BEGWORD_CONSTRAINT;
2369 break;
2370 case ENDWORD:
2371 constraint = ENDWORD_CONSTRAINT;
2372 break;
2373 case LIMWORD:
2374 constraint = LIMWORD_CONSTRAINT;
2375 break;
2376 case NOTLIMWORD:
2377 constraint = NOTLIMWORD_CONSTRAINT;
2378 break;
2379 case EMPTY:
2380 constraint = NO_CONSTRAINT;
2381 break;
2384 delete (i, &d->follows[i]);
2386 for (idx_t j = 0; j < backward[i].nelem; j++)
2387 replace (&d->follows[backward[i].elems[j].index], i, &d->follows[i],
2388 constraint, &tmp);
2389 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2390 replace (&backward[d->follows[i].elems[j].index], i, &backward[i],
2391 NO_CONSTRAINT, &tmp);
2393 free (tmp.elems);
2396 /* Returns the set of contexts for which there is at least one
2397 character included in C. */
2399 static int
2400 charclass_context (struct dfa const *dfa, charclass const *c)
2402 int context = 0;
2404 for (int j = 0; j < CHARCLASS_WORDS; j++)
2406 if (c->w[j] & dfa->syntax.newline.w[j])
2407 context |= CTX_NEWLINE;
2408 if (c->w[j] & dfa->syntax.letters.w[j])
2409 context |= CTX_LETTER;
2410 if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
2411 context |= CTX_NONE;
2414 return context;
2417 /* Returns the contexts on which the position set S depends. Each context
2418 in the set of returned contexts (let's call it SC) may have a different
2419 follow set than other contexts in SC, and also different from the
2420 follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2421 in the complement set will have the same follow set. */
2423 static int _GL_ATTRIBUTE_PURE
2424 state_separate_contexts (struct dfa *d, position_set const *s)
2426 int separate_contexts = 0;
2428 for (idx_t j = 0; j < s->nelem; j++)
2429 separate_contexts |= d->separates[s->elems[j].index];
2431 return separate_contexts;
2434 enum
2436 /* Single token is repeated. It is distinguished from non-repeated. */
2437 OPT_REPEAT = (1 << 0),
2439 /* Multiple tokens are repeated. This flag is on at head of tokens. The
2440 node is not merged. */
2441 OPT_LPAREN = (1 << 1),
2443 /* Multiple branches are joined. The node is not merged. */
2444 OPT_RPAREN = (1 << 2),
2446 /* The node is walked. If the node is found in walking again, OPT_RPAREN
2447 flag is turned on. */
2448 OPT_WALKED = (1 << 3),
2450 /* The node is queued. The node is not queued again. */
2451 OPT_QUEUED = (1 << 4)
2454 static void
2455 merge_nfa_state (struct dfa *d, idx_t tindex, char *flags,
2456 position_set *merged)
2458 position_set *follows = d->follows;
2459 idx_t nelem = 0;
2461 for (idx_t i = 0; i < follows[tindex].nelem; i++)
2463 idx_t sindex = follows[tindex].elems[i].index;
2465 /* Skip the node as pruned in future. */
2466 unsigned int iconstraint = follows[tindex].elems[i].constraint;
2467 if (iconstraint == 0)
2468 continue;
2470 if (d->tokens[follows[tindex].elems[i].index] <= END)
2472 d->constraints[tindex] |= follows[tindex].elems[i].constraint;
2473 continue;
2476 if (sindex != tindex && !(flags[sindex] & (OPT_LPAREN | OPT_RPAREN)))
2478 idx_t j;
2480 for (j = 0; j < nelem; j++)
2482 idx_t dindex = follows[tindex].elems[j].index;
2484 if (dindex == tindex)
2485 continue;
2487 if (follows[tindex].elems[j].constraint != iconstraint)
2488 continue;
2490 if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
2491 continue;
2493 if (d->tokens[sindex] != d->tokens[dindex])
2494 continue;
2496 if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2497 continue;
2499 if (flags[sindex] & OPT_REPEAT)
2500 delete (sindex, &follows[sindex]);
2502 merge2 (&follows[dindex], &follows[sindex], merged);
2504 break;
2507 if (j < nelem)
2508 continue;
2511 follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2512 flags[sindex] |= OPT_QUEUED;
2515 follows[tindex].nelem = nelem;
2518 static int
2519 compare (const void *a, const void *b)
2521 position const *p = a, *q = b;
2522 return (p->index > q->index) - (p->index < q->index);
2525 static void
2526 reorder_tokens (struct dfa *d)
2528 idx_t nleaves = 0;
2529 ptrdiff_t *map = xnmalloc (d->tindex, sizeof *map);
2530 map[0] = nleaves++;
2531 for (idx_t i = 1; i < d->tindex; i++)
2532 map[i] = -1;
2534 token *tokens = xnmalloc (d->nleaves, sizeof *tokens);
2535 position_set *follows = xnmalloc (d->nleaves, sizeof *follows);
2536 int *constraints = xnmalloc (d->nleaves, sizeof *constraints);
2537 char *multibyte_prop = (d->localeinfo.multibyte
2538 ? xnmalloc (d->nleaves, sizeof *multibyte_prop)
2539 : NULL);
2541 for (idx_t i = 0; i < d->tindex; i++)
2543 if (map[i] < 0)
2545 free (d->follows[i].elems);
2546 d->follows[i].elems = NULL;
2547 d->follows[i].nelem = 0;
2548 continue;
2551 tokens[map[i]] = d->tokens[i];
2552 follows[map[i]] = d->follows[i];
2553 constraints[map[i]] = d->constraints[i];
2555 if (multibyte_prop != NULL)
2556 multibyte_prop[map[i]] = d->multibyte_prop[i];
2558 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2560 if (map[d->follows[i].elems[j].index] == -1)
2561 map[d->follows[i].elems[j].index] = nleaves++;
2563 d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2566 qsort (d->follows[i].elems, d->follows[i].nelem,
2567 sizeof *d->follows[i].elems, compare);
2570 for (idx_t i = 0; i < nleaves; i++)
2572 d->tokens[i] = tokens[i];
2573 d->follows[i] = follows[i];
2574 d->constraints[i] = constraints[i];
2576 if (multibyte_prop != NULL)
2577 d->multibyte_prop[i] = multibyte_prop[i];
2580 d->tindex = d->nleaves = nleaves;
2582 free (tokens);
2583 free (follows);
2584 free (constraints);
2585 free (multibyte_prop);
2586 free (map);
2589 static void
2590 dfaoptimize (struct dfa *d)
2592 char *flags = xizalloc (d->tindex);
2594 for (idx_t i = 0; i < d->tindex; i++)
2596 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2598 if (d->follows[i].elems[j].index == i)
2599 flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
2600 else if (d->follows[i].elems[j].index < i)
2601 flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
2602 else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2603 flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
2604 else
2605 flags[d->follows[i].elems[j].index] |= OPT_WALKED;
2609 flags[0] |= OPT_QUEUED;
2611 position_set merged0;
2612 position_set *merged = &merged0;
2613 alloc_position_set (merged, d->nleaves);
2615 d->constraints = xicalloc (d->tindex, sizeof *d->constraints);
2617 for (idx_t i = 0; i < d->tindex; i++)
2618 if (flags[i] & OPT_QUEUED)
2619 merge_nfa_state (d, i, flags, merged);
2621 reorder_tokens (d);
2623 free (merged->elems);
2624 free (flags);
2627 /* Perform bottom-up analysis on the parse tree, computing various functions.
2628 Note that at this point, we're pretending constructs like \< are real
2629 characters rather than constraints on what can follow them.
2631 Nullable: A node is nullable if it is at the root of a regexp that can
2632 match the empty string.
2633 * EMPTY leaves are nullable.
2634 * No other leaf is nullable.
2635 * A QMARK or STAR node is nullable.
2636 * A PLUS node is nullable if its argument is nullable.
2637 * A CAT node is nullable if both its arguments are nullable.
2638 * An OR node is nullable if either argument is nullable.
2640 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2641 that could correspond to the first character of a string matching the
2642 regexp rooted at the given node.
2643 * EMPTY leaves have empty firstpos.
2644 * The firstpos of a nonempty leaf is that leaf itself.
2645 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2646 argument.
2647 * The firstpos of a CAT node is the firstpos of the left argument, union
2648 the firstpos of the right if the left argument is nullable.
2649 * The firstpos of an OR node is the union of firstpos of each argument.
2651 Lastpos: The lastpos of a node is the set of positions that could
2652 correspond to the last character of a string matching the regexp at
2653 the given node.
2654 * EMPTY leaves have empty lastpos.
2655 * The lastpos of a nonempty leaf is that leaf itself.
2656 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2657 argument.
2658 * The lastpos of a CAT node is the lastpos of its right argument, union
2659 the lastpos of the left if the right argument is nullable.
2660 * The lastpos of an OR node is the union of the lastpos of each argument.
2662 Follow: The follow of a position is the set of positions that could
2663 correspond to the character following a character matching the node in
2664 a string matching the regexp. At this point we consider special symbols
2665 that match the empty string in some context to be just normal characters.
2666 Later, if we find that a special symbol is in a follow set, we will
2667 replace it with the elements of its follow, labeled with an appropriate
2668 constraint.
2669 * Every node in the firstpos of the argument of a STAR or PLUS node is in
2670 the follow of every node in the lastpos.
2671 * Every node in the firstpos of the second argument of a CAT node is in
2672 the follow of every node in the lastpos of the first argument.
2674 Because of the postfix representation of the parse tree, the depth-first
2675 analysis is conveniently done by a linear scan with the aid of a stack.
2676 Sets are stored as arrays of the elements, obeying a stack-like allocation
2677 scheme; the number of elements in each set deeper in the stack can be
2678 used to determine the address of a particular set's array. */
2679 static void
2680 dfaanalyze (struct dfa *d, bool searchflag)
2682 /* Array allocated to hold position sets. */
2683 position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2684 /* Firstpos and lastpos elements. */
2685 position *firstpos = posalloc;
2686 position *lastpos = firstpos + d->nleaves;
2687 position pos;
2688 position_set tmp;
2690 /* Stack for element counts and nullable flags. */
2691 struct
2693 /* Whether the entry is nullable. */
2694 bool nullable;
2696 /* Counts of firstpos and lastpos sets. */
2697 idx_t nfirstpos;
2698 idx_t nlastpos;
2699 } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2701 position_set merged; /* Result of merging sets. */
2703 addtok (d, CAT);
2704 idx_t tindex = d->tindex;
2706 #ifdef DEBUG
2707 fprintf (stderr, "dfaanalyze:\n");
2708 for (idx_t i = 0; i < tindex; i++)
2710 fprintf (stderr, " %td:", i);
2711 prtok (d->tokens[i]);
2713 putc ('\n', stderr);
2714 #endif
2716 d->searchflag = searchflag;
2717 alloc_position_set (&merged, d->nleaves);
2718 d->follows = xicalloc (tindex, sizeof *d->follows);
2719 position_set *backward
2720 = d->epsilon ? xicalloc (tindex, sizeof *backward) : NULL;
2722 for (idx_t i = 0; i < tindex; i++)
2724 switch (d->tokens[i])
2726 case EMPTY:
2727 /* The empty set is nullable. */
2728 stk->nullable = true;
2730 /* The firstpos and lastpos of the empty leaf are both empty. */
2731 stk->nfirstpos = stk->nlastpos = 0;
2732 stk++;
2733 break;
2735 case STAR:
2736 case PLUS:
2737 /* Every element in the lastpos of the argument is in the backward
2738 set of every element in the firstpos. */
2739 if (d->epsilon)
2741 tmp.elems = lastpos - stk[-1].nlastpos;
2742 tmp.nelem = stk[-1].nlastpos;
2743 for (position *p = firstpos - stk[-1].nfirstpos;
2744 p < firstpos; p++)
2745 merge2 (&backward[p->index], &tmp, &merged);
2748 /* Every element in the firstpos of the argument is in the follow
2749 of every element in the lastpos. */
2751 tmp.elems = firstpos - stk[-1].nfirstpos;
2752 tmp.nelem = stk[-1].nfirstpos;
2753 for (position *p = lastpos - stk[-1].nlastpos; p < lastpos; p++)
2754 merge2 (&d->follows[p->index], &tmp, &merged);
2756 FALLTHROUGH;
2757 case QMARK:
2758 /* A QMARK or STAR node is automatically nullable. */
2759 if (d->tokens[i] != PLUS)
2760 stk[-1].nullable = true;
2761 break;
2763 case CAT:
2764 /* Every element in the lastpos of the first argument is in
2765 the backward set of every element in the firstpos of the
2766 second argument. */
2767 if (backward)
2769 tmp.nelem = stk[-2].nlastpos;
2770 tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2771 for (position *p = firstpos - stk[-1].nfirstpos;
2772 p < firstpos; p++)
2773 merge2 (&backward[p->index], &tmp, &merged);
2776 /* Every element in the firstpos of the second argument is in the
2777 follow of every element in the lastpos of the first argument. */
2779 tmp.nelem = stk[-1].nfirstpos;
2780 tmp.elems = firstpos - stk[-1].nfirstpos;
2781 for (position *plim = lastpos - stk[-1].nlastpos,
2782 *p = plim - stk[-2].nlastpos;
2783 p < plim; p++)
2784 merge2 (&d->follows[p->index], &tmp, &merged);
2787 /* The firstpos of a CAT node is the firstpos of the first argument,
2788 union that of the second argument if the first is nullable. */
2789 if (stk[-2].nullable)
2790 stk[-2].nfirstpos += stk[-1].nfirstpos;
2791 else
2792 firstpos -= stk[-1].nfirstpos;
2794 /* The lastpos of a CAT node is the lastpos of the second argument,
2795 union that of the first argument if the second is nullable. */
2796 if (stk[-1].nullable)
2797 stk[-2].nlastpos += stk[-1].nlastpos;
2798 else
2800 position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2801 for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2802 p[j] = p[j + stk[-2].nlastpos];
2803 lastpos -= stk[-2].nlastpos;
2804 stk[-2].nlastpos = stk[-1].nlastpos;
2807 /* A CAT node is nullable if both arguments are nullable. */
2808 stk[-2].nullable &= stk[-1].nullable;
2809 stk--;
2810 break;
2812 case OR:
2813 /* The firstpos is the union of the firstpos of each argument. */
2814 stk[-2].nfirstpos += stk[-1].nfirstpos;
2816 /* The lastpos is the union of the lastpos of each argument. */
2817 stk[-2].nlastpos += stk[-1].nlastpos;
2819 /* An OR node is nullable if either argument is nullable. */
2820 stk[-2].nullable |= stk[-1].nullable;
2821 stk--;
2822 break;
2824 default:
2825 /* Anything else is a nonempty position. (Note that special
2826 constructs like \< are treated as nonempty strings here;
2827 an "epsilon closure" effectively makes them nullable later.
2828 Backreferences have to get a real position so we can detect
2829 transitions on them later. But they are nullable. */
2830 stk->nullable = d->tokens[i] == BACKREF;
2832 /* This position is in its own firstpos and lastpos. */
2833 stk->nfirstpos = stk->nlastpos = 1;
2834 stk++;
2836 firstpos->index = lastpos->index = i;
2837 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2838 firstpos++, lastpos++;
2840 break;
2842 #ifdef DEBUG
2843 /* ... balance the above nonsyntactic #ifdef goo... */
2844 fprintf (stderr, "node %td:", i);
2845 prtok (d->tokens[i]);
2846 putc ('\n', stderr);
2847 fprintf (stderr,
2848 stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2849 fprintf (stderr, " firstpos:");
2850 for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
2852 fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
2853 prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2855 fprintf (stderr, "\n lastpos:");
2856 for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2858 fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
2859 prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2861 putc ('\n', stderr);
2862 #endif
2865 if (backward)
2867 /* For each follow set that is the follow set of a real position,
2868 replace it with its epsilon closure. */
2869 epsclosure (d, backward);
2871 for (idx_t i = 0; i < tindex; i++)
2872 free (backward[i].elems);
2873 free (backward);
2876 dfaoptimize (d);
2878 #ifdef DEBUG
2879 for (idx_t i = 0; i < tindex; i++)
2880 if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR
2881 || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR
2882 || d->tokens[i] == MBCSET || d->tokens[i] >= CSET)
2884 fprintf (stderr, "follows(%td:", i);
2885 prtok (d->tokens[i]);
2886 fprintf (stderr, "):");
2887 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2889 fprintf (stderr, " %td:", d->follows[i].elems[j].index);
2890 prtok (d->tokens[d->follows[i].elems[j].index]);
2892 putc ('\n', stderr);
2894 #endif
2896 pos.index = 0;
2897 pos.constraint = NO_CONSTRAINT;
2899 alloc_position_set (&tmp, 1);
2901 append (pos, &tmp);
2903 d->separates = xicalloc (tindex, sizeof *d->separates);
2905 for (idx_t i = 0; i < tindex; i++)
2907 if (prev_newline_dependent (d->constraints[i]))
2908 d->separates[i] |= CTX_NEWLINE;
2909 if (prev_letter_dependent (d->constraints[i]))
2910 d->separates[i] |= CTX_LETTER;
2912 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2914 if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2915 d->separates[i] |= CTX_NEWLINE;
2916 if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2917 d->separates[i] |= CTX_LETTER;
2921 /* Context wanted by some position. */
2922 int separate_contexts = state_separate_contexts (d, &tmp);
2924 /* Build the initial state. */
2925 if (separate_contexts & CTX_NEWLINE)
2926 state_index (d, &tmp, CTX_NEWLINE);
2927 d->initstate_notbol = d->min_trcount
2928 = state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2929 if (separate_contexts & CTX_LETTER)
2930 d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2931 d->min_trcount++;
2932 d->trcount = 0;
2934 free (posalloc);
2935 free (stkalloc);
2936 free (merged.elems);
2937 free (tmp.elems);
2940 /* Make sure D's state arrays are large enough to hold NEW_STATE. */
2941 static void
2942 realloc_trans_if_necessary (struct dfa *d)
2944 state_num oldalloc = d->tralloc;
2945 if (oldalloc < d->sindex)
2947 state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2948 idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2949 realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2950 -1, sizeof *realtrans);
2951 realtrans[0] = realtrans[1] = NULL;
2952 d->trans = realtrans + 2;
2953 idx_t newalloc = d->tralloc = newalloc1 - 2;
2954 d->fails = xreallocarray (d->fails, newalloc, sizeof *d->fails);
2955 d->success = xreallocarray (d->success, newalloc, sizeof *d->success);
2956 d->newlines = xreallocarray (d->newlines, newalloc, sizeof *d->newlines);
2957 if (d->localeinfo.multibyte)
2959 realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2960 realtrans = xreallocarray (realtrans, newalloc1, sizeof *realtrans);
2961 if (oldalloc == 0)
2962 realtrans[0] = realtrans[1] = NULL;
2963 d->mb_trans = realtrans + 2;
2965 for (; oldalloc < newalloc; oldalloc++)
2967 d->trans[oldalloc] = NULL;
2968 d->fails[oldalloc] = NULL;
2969 if (d->localeinfo.multibyte)
2970 d->mb_trans[oldalloc] = NULL;
2976 Calculate the transition table for a new state derived from state s
2977 for a compiled dfa d after input character uc, and return the new
2978 state number.
2980 Do not worry about all possible input characters; calculate just the group
2981 of positions that match uc. Label it with the set of characters that
2982 every position in the group matches (taking into account, if necessary,
2983 preceding context information of s). Then find the union
2984 of these positions' follows, i.e., the set of positions of the
2985 new state. For each character in the group's label, set the transition
2986 on this character to be to a state corresponding to the set's positions,
2987 and its associated backward context information, if necessary.
2989 When building a searching matcher, include the positions of state
2990 0 in every state.
2992 The group is constructed by building an equivalence-class
2993 partition of the positions of s.
2995 For each position, find the set of characters C that it matches. Eliminate
2996 any characters from C that fail on grounds of backward context.
2998 Check whether the group's label L has nonempty
2999 intersection with C. If L - C is nonempty, create a new group labeled
3000 L - C and having the same positions as the current group, and set L to
3001 the intersection of L and C. Insert the position in the group, set
3002 C = C - L, and resume scanning.
3004 If after comparing with every group there are characters remaining in C,
3005 create a new group labeled with the characters of C and insert this
3006 position in that group. */
3008 static state_num
3009 build_state (state_num s, struct dfa *d, unsigned char uc)
3011 position_set follows; /* Union of the follows for each
3012 position of the current state. */
3013 position_set group; /* Positions that match the input char. */
3014 position_set tmp; /* Temporary space for merging sets. */
3015 state_num state; /* New state. */
3016 state_num state_newline; /* New state on a newline transition. */
3017 state_num state_letter; /* New state on a letter transition. */
3019 #ifdef DEBUG
3020 fprintf (stderr, "build state %td\n", s);
3021 #endif
3023 /* A pointer to the new transition table, and the table itself. */
3024 state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
3025 state_num *trans = *ptrans;
3027 if (!trans)
3029 /* MAX_TRCOUNT is an arbitrary upper limit on the number of
3030 transition tables that can exist at once, other than for
3031 initial states. Often-used transition tables are quickly
3032 rebuilt, whereas rarely-used ones are cleared away. */
3033 if (MAX_TRCOUNT <= d->trcount)
3035 for (state_num i = d->min_trcount; i < d->tralloc; i++)
3037 free (d->trans[i]);
3038 free (d->fails[i]);
3039 d->trans[i] = d->fails[i] = NULL;
3041 d->trcount = 0;
3044 d->trcount++;
3045 *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans);
3047 /* Fill transition table with a default value which means that the
3048 transited state has not been calculated yet. */
3049 for (int i = 0; i < NOTCHAR; i++)
3050 trans[i] = -2;
3053 /* Set up the success bits for this state. */
3054 d->success[s] = 0;
3055 if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
3056 d->success[s] |= CTX_NEWLINE;
3057 if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
3058 d->success[s] |= CTX_LETTER;
3059 if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
3060 d->success[s] |= CTX_NONE;
3062 alloc_position_set (&follows, d->nleaves);
3064 /* Find the union of the follows of the positions of the group.
3065 This is a hideously inefficient loop. Fix it someday. */
3066 for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
3067 for (idx_t k = 0;
3068 k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
3069 insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
3070 &follows);
3072 /* Positions that match the input char. */
3073 alloc_position_set (&group, d->nleaves);
3075 /* The group's label. */
3076 charclass label;
3077 fillset (&label);
3079 for (idx_t i = 0; i < follows.nelem; i++)
3081 charclass matches; /* Set of matching characters. */
3082 position pos = follows.elems[i];
3083 bool matched = false;
3084 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
3086 zeroset (&matches);
3087 setbit (d->tokens[pos.index], &matches);
3088 if (d->tokens[pos.index] == uc)
3089 matched = true;
3091 else if (d->tokens[pos.index] >= CSET)
3093 matches = d->charclasses[d->tokens[pos.index] - CSET];
3094 if (tstbit (uc, &matches))
3095 matched = true;
3097 else if (d->tokens[pos.index] == ANYCHAR)
3099 matches = d->charclasses[d->canychar];
3100 if (tstbit (uc, &matches))
3101 matched = true;
3103 /* ANYCHAR must match with a single character, so we must put
3104 it to D->states[s].mbps which contains the positions which
3105 can match with a single character not a byte. If all
3106 positions which has ANYCHAR does not depend on context of
3107 next character, we put the follows instead of it to
3108 D->states[s].mbps to optimize. */
3109 if (succeeds_in_context (pos.constraint, d->states[s].context,
3110 CTX_NONE))
3112 if (d->states[s].mbps.nelem == 0)
3113 alloc_position_set (&d->states[s].mbps, 1);
3114 insert (pos, &d->states[s].mbps);
3117 else
3118 continue;
3120 /* Some characters may need to be eliminated from matches because
3121 they fail in the current context. */
3122 if (pos.constraint != NO_CONSTRAINT)
3124 if (!succeeds_in_context (pos.constraint,
3125 d->states[s].context, CTX_NEWLINE))
3126 for (int j = 0; j < CHARCLASS_WORDS; j++)
3127 matches.w[j] &= ~d->syntax.newline.w[j];
3128 if (!succeeds_in_context (pos.constraint,
3129 d->states[s].context, CTX_LETTER))
3130 for (int j = 0; j < CHARCLASS_WORDS; ++j)
3131 matches.w[j] &= ~d->syntax.letters.w[j];
3132 if (!succeeds_in_context (pos.constraint,
3133 d->states[s].context, CTX_NONE))
3134 for (int j = 0; j < CHARCLASS_WORDS; ++j)
3135 matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
3137 /* If there are no characters left, there's no point in going on. */
3138 if (emptyset (&matches))
3139 continue;
3141 /* If we have reset the bit that made us declare "matched", reset
3142 that indicator, too. This is required to avoid an infinite loop
3143 with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]' */
3144 if (!tstbit (uc, &matches))
3145 matched = false;
3148 #ifdef DEBUG
3149 fprintf (stderr, " nextpos %td:", pos.index);
3150 prtok (d->tokens[pos.index]);
3151 fprintf (stderr, " of");
3152 for (unsigned j = 0; j < NOTCHAR; j++)
3153 if (tstbit (j, &matches))
3154 fprintf (stderr, " 0x%02x", j);
3155 fprintf (stderr, "\n");
3156 #endif
3158 if (matched)
3160 for (int k = 0; k < CHARCLASS_WORDS; ++k)
3161 label.w[k] &= matches.w[k];
3162 append (pos, &group);
3164 else
3166 for (int k = 0; k < CHARCLASS_WORDS; ++k)
3167 label.w[k] &= ~matches.w[k];
3171 alloc_position_set (&tmp, d->nleaves);
3173 if (group.nelem > 0)
3175 /* If we are building a searching matcher, throw in the positions
3176 of state 0 as well, if possible. */
3177 if (d->searchflag)
3179 /* If a token in follows.elems is not 1st byte of a multibyte
3180 character, or the states of follows must accept the bytes
3181 which are not 1st byte of the multibyte character.
3182 Then, if a state of follows encounters a byte, it must not be
3183 a 1st byte of a multibyte character nor a single byte character.
3184 In this case, do not add state[0].follows to next state, because
3185 state[0] must accept 1st-byte.
3187 For example, suppose <sb a> is a certain single byte character,
3188 <mb A> is a certain multibyte character, and the codepoint of
3189 <sb a> equals the 2nd byte of the codepoint of <mb A>. When
3190 state[0] accepts <sb a>, state[i] transits to state[i+1] by
3191 accepting the 1st byte of <mb A>, and state[i+1] accepts the
3192 2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3193 <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3194 not add state[0]. */
3196 bool mergeit = !d->localeinfo.multibyte;
3197 if (!mergeit)
3199 mergeit = true;
3200 for (idx_t j = 0; mergeit && j < group.nelem; j++)
3201 mergeit &= d->multibyte_prop[group.elems[j].index];
3203 if (mergeit)
3204 merge2 (&group, &d->states[0].elems, &tmp);
3207 /* Find out if the new state will want any context information,
3208 by calculating possible contexts that the group can match,
3209 and separate contexts that the new state wants to know. */
3210 int possible_contexts = charclass_context (d, &label);
3211 int separate_contexts = state_separate_contexts (d, &group);
3213 /* Find the state(s) corresponding to the union of the follows. */
3214 if (possible_contexts & ~separate_contexts)
3215 state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3216 else
3217 state = -1;
3218 if (separate_contexts & possible_contexts & CTX_NEWLINE)
3219 state_newline = state_index (d, &group, CTX_NEWLINE);
3220 else
3221 state_newline = state;
3222 if (separate_contexts & possible_contexts & CTX_LETTER)
3223 state_letter = state_index (d, &group, CTX_LETTER);
3224 else
3225 state_letter = state;
3227 /* Reallocate now, to reallocate any newline transition properly. */
3228 realloc_trans_if_necessary (d);
3231 /* If we are a searching matcher, the default transition is to a state
3232 containing the positions of state 0, otherwise the default transition
3233 is to fail miserably. */
3234 else if (d->searchflag)
3236 state_newline = 0;
3237 state_letter = d->min_trcount - 1;
3238 state = d->initstate_notbol;
3240 else
3242 state_newline = -1;
3243 state_letter = -1;
3244 state = -1;
3247 /* Set the transitions for each character in the label. */
3248 for (int i = 0; i < NOTCHAR; i++)
3249 if (tstbit (i, &label))
3250 switch (d->syntax.sbit[i])
3252 case CTX_NEWLINE:
3253 trans[i] = state_newline;
3254 break;
3255 case CTX_LETTER:
3256 trans[i] = state_letter;
3257 break;
3258 default:
3259 trans[i] = state;
3260 break;
3263 #ifdef DEBUG
3264 fprintf (stderr, "trans table %td", s);
3265 for (int i = 0; i < NOTCHAR; ++i)
3267 if (!(i & 0xf))
3268 fprintf (stderr, "\n");
3269 fprintf (stderr, " %2td", trans[i]);
3271 fprintf (stderr, "\n");
3272 #endif
3274 free (group.elems);
3275 free (follows.elems);
3276 free (tmp.elems);
3278 /* Keep the newline transition in a special place so we can use it as
3279 a sentinel. */
3280 if (tstbit (d->syntax.eolbyte, &label))
3282 d->newlines[s] = trans[d->syntax.eolbyte];
3283 trans[d->syntax.eolbyte] = -1;
3286 return trans[uc];
3289 /* Multibyte character handling sub-routines for dfaexec. */
3291 /* Consume a single byte and transit state from 's' to '*next_state'.
3292 This is almost the same as the state transition routine in dfaexec.
3293 But the state transition is done just once; otherwise, matching succeeds or
3294 we reach the end of the buffer. */
3295 static state_num
3296 transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
3298 state_num *t;
3300 if (d->trans[s])
3301 t = d->trans[s];
3302 else if (d->fails[s])
3303 t = d->fails[s];
3304 else
3306 build_state (s, d, **pp);
3307 if (d->trans[s])
3308 t = d->trans[s];
3309 else
3311 t = d->fails[s];
3312 assert (t);
3316 if (t[**pp] == -2)
3317 build_state (s, d, **pp);
3319 return t[*(*pp)++];
3322 /* Transit state from s, then return new state and update the pointer of
3323 the buffer. This function is for a period operator which can match a
3324 multi-byte character. */
3325 static state_num
3326 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
3327 unsigned char const *end)
3329 wint_t wc;
3331 int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
3333 /* This state has some operators which can match a multibyte character. */
3334 d->mb_follows.nelem = 0;
3336 /* Calculate the state which can be reached from the state 's' by
3337 consuming 'mbclen' single bytes from the buffer. */
3338 state_num s1 = s;
3339 int mbci;
3340 for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++)
3341 s = transit_state_singlebyte (d, s, pp);
3342 *pp += mbclen - mbci;
3344 if (wc == WEOF)
3346 /* It is an invalid character, so ANYCHAR is not accepted. */
3347 return s;
3350 /* If all positions which have ANYCHAR do not depend on the context
3351 of the next character, calculate the next state with
3352 pre-calculated follows and cache the result. */
3353 if (d->states[s1].mb_trindex < 0)
3355 if (MAX_TRCOUNT <= d->mb_trcount)
3357 state_num s3;
3358 for (s3 = -1; s3 < d->tralloc; s3++)
3360 free (d->mb_trans[s3]);
3361 d->mb_trans[s3] = NULL;
3364 for (state_num i = 0; i < d->sindex; i++)
3365 d->states[i].mb_trindex = -1;
3366 d->mb_trcount = 0;
3368 d->states[s1].mb_trindex = d->mb_trcount++;
3371 if (! d->mb_trans[s])
3373 enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3374 enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3375 d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3376 for (int i = 0; i < MAX_TRCOUNT; i++)
3377 d->mb_trans[s][i] = -1;
3379 else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3380 return d->mb_trans[s][d->states[s1].mb_trindex];
3382 if (s == -1)
3383 copy (&d->states[s1].mbps, &d->mb_follows);
3384 else
3385 merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3387 int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3388 state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3389 realloc_trans_if_necessary (d);
3391 d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3393 return s2;
3396 /* The initial state may encounter a byte which is not a single byte character
3397 nor the first byte of a multibyte character. But it is incorrect for the
3398 initial state to accept such a byte. For example, in Shift JIS the regular
3399 expression "\\" accepts the codepoint 0x5c, but should not accept the second
3400 byte of the codepoint 0x815c. Then the initial state must skip the bytes
3401 that are not a single byte character nor the first byte of a multibyte
3402 character.
3404 Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3405 or exceeds P, and return the advanced MBP. If WCP is non-NULL and
3406 the result is greater than P, set *WCP to the final wide character
3407 processed, or to WEOF if no wide character is processed. Otherwise,
3408 if WCP is non-NULL, *WCP may or may not be updated.
3410 Both P and MBP must be no larger than END. */
3411 static unsigned char const *
3412 skip_remains_mb (struct dfa *d, unsigned char const *p,
3413 unsigned char const *mbp, char const *end)
3415 if (d->syntax.never_trail[*p])
3416 return p;
3417 while (mbp < p)
3419 wint_t wc;
3420 mbp += mbs_to_wchar (&wc, (char const *) mbp,
3421 end - (char const *) mbp, d);
3423 return mbp;
3426 /* Search through a buffer looking for a match to the struct dfa *D.
3427 Find the first occurrence of a string matching the regexp in the
3428 buffer, and the shortest possible version thereof. Return a pointer to
3429 the first character after the match, or NULL if none is found. BEGIN
3430 points to the beginning of the buffer, and END points to the first byte
3431 after its end. Note however that we store a sentinel byte (usually
3432 newline) in *END, so the actual buffer must be one byte longer.
3433 When ALLOW_NL, newlines may appear in the matching string.
3434 If COUNT is non-NULL, increment *COUNT once for each newline processed.
3435 If MULTIBYTE, the input consists of multibyte characters and/or
3436 encoding-error bytes. Otherwise, it consists of single-byte characters.
3437 Here is the list of features that make this DFA matcher punt:
3438 - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3439 - [^...] in non-simple locale
3440 - [[=foo=]] or [[.foo.]]
3441 - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3442 - back-reference: (.)\1
3443 - word-delimiter in multibyte locale: \<, \>, \b, \B
3444 See struct localeinfo.simple for the definition of "simple locale". */
3446 static inline char *
3447 dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
3448 idx_t *count, bool multibyte)
3450 if (MAX_TRCOUNT <= d->sindex)
3452 for (state_num s = d->min_trcount; s < d->sindex; s++)
3454 free (d->states[s].elems.elems);
3455 free (d->states[s].mbps.elems);
3457 d->sindex = d->min_trcount;
3459 if (d->trans)
3461 for (state_num s = 0; s < d->tralloc; s++)
3463 free (d->trans[s]);
3464 free (d->fails[s]);
3465 d->trans[s] = d->fails[s] = NULL;
3467 d->trcount = 0;
3470 if (d->localeinfo.multibyte && d->mb_trans)
3472 for (state_num s = -1; s < d->tralloc; s++)
3474 free (d->mb_trans[s]);
3475 d->mb_trans[s] = NULL;
3477 for (state_num s = 0; s < d->min_trcount; s++)
3478 d->states[s].mb_trindex = -1;
3479 d->mb_trcount = 0;
3483 if (!d->tralloc)
3484 realloc_trans_if_necessary (d);
3486 /* Current state. */
3487 state_num s = 0, s1 = 0;
3489 /* Current input character. */
3490 unsigned char const *p = (unsigned char const *) begin;
3491 unsigned char const *mbp = p;
3493 /* Copy of d->trans so it can be optimized into a register. */
3494 state_num **trans = d->trans;
3495 unsigned char eol = d->syntax.eolbyte; /* Likewise for eolbyte. */
3496 unsigned char saved_end = *(unsigned char *) end;
3497 *end = eol;
3499 if (multibyte)
3501 mbszero (&d->mbs);
3502 if (d->mb_follows.alloc == 0)
3503 alloc_position_set (&d->mb_follows, d->nleaves);
3506 idx_t nlcount = 0;
3507 for (;;)
3509 state_num *t;
3510 while ((t = trans[s]) != NULL)
3512 if (s < d->min_trcount)
3514 if (!multibyte || d->states[s].mbps.nelem == 0)
3516 while (t[*p] == s)
3517 p++;
3519 if (multibyte)
3520 p = mbp = skip_remains_mb (d, p, mbp, end);
3523 if (multibyte)
3525 s1 = s;
3527 if (d->states[s].mbps.nelem == 0
3528 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3530 /* If an input character does not match ANYCHAR, do it
3531 like a single-byte character. */
3532 s = t[*p++];
3534 else
3536 s = transit_state (d, s, &p, (unsigned char *) end);
3537 mbp = p;
3538 trans = d->trans;
3541 else
3543 s1 = t[*p++];
3544 t = trans[s1];
3545 if (! t)
3547 state_num tmp = s;
3548 s = s1;
3549 s1 = tmp; /* swap */
3550 break;
3552 if (s < d->min_trcount)
3554 while (t[*p] == s1)
3555 p++;
3557 s = t[*p++];
3561 if (s < 0)
3563 if (s == -2)
3565 s = build_state (s1, d, p[-1]);
3566 trans = d->trans;
3568 else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3570 /* The previous character was a newline. Count it, and skip
3571 checking of multibyte character boundary until here. */
3572 nlcount++;
3573 mbp = p;
3575 s = (allow_nl ? d->newlines[s1]
3576 : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3577 : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3578 : d->initstate_notbol);
3580 else
3582 p = NULL;
3583 goto done;
3586 else if (d->fails[s])
3588 if ((d->success[s] & d->syntax.sbit[*p])
3589 || ((char *) p == end
3590 && accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3591 d)))
3592 goto done;
3594 if (multibyte && s < d->min_trcount)
3595 p = mbp = skip_remains_mb (d, p, mbp, end);
3597 s1 = s;
3598 if (!multibyte || d->states[s].mbps.nelem == 0
3599 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3601 /* If a input character does not match ANYCHAR, do it
3602 like a single-byte character. */
3603 s = d->fails[s][*p++];
3605 else
3607 s = transit_state (d, s, &p, (unsigned char *) end);
3608 mbp = p;
3609 trans = d->trans;
3612 else
3614 build_state (s, d, p[0]);
3615 trans = d->trans;
3619 done:
3620 if (count)
3621 *count += nlcount;
3622 *end = saved_end;
3623 return (char *) p;
3626 /* Specialized versions of dfaexec for multibyte and single-byte cases.
3627 This is for performance, as dfaexec_main is an inline function. */
3629 static char *
3630 dfaexec_mb (struct dfa *d, char const *begin, char *end,
3631 bool allow_nl, idx_t *count, bool *backref)
3633 return dfaexec_main (d, begin, end, allow_nl, count, true);
3636 static char *
3637 dfaexec_sb (struct dfa *d, char const *begin, char *end,
3638 bool allow_nl, idx_t *count, bool *backref)
3640 return dfaexec_main (d, begin, end, allow_nl, count, false);
3643 /* Always set *BACKREF and return BEGIN. Use this wrapper for
3644 any regexp that uses a construct not supported by this code. */
3645 static char *
3646 dfaexec_noop (struct dfa *d, char const *begin, char *end,
3647 bool allow_nl, idx_t *count, bool *backref)
3649 *backref = true;
3650 return (char *) begin;
3653 /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3654 but faster and set *BACKREF if the DFA code does not support this
3655 regexp usage. */
3657 char *
3658 dfaexec (struct dfa *d, char const *begin, char *end,
3659 bool allow_nl, idx_t *count, bool *backref)
3661 return d->dfaexec (d, begin, end, allow_nl, count, backref);
3664 struct dfa *
3665 dfasuperset (struct dfa const *d)
3667 return d->superset;
3670 bool
3671 dfaisfast (struct dfa const *d)
3673 return d->fast;
3676 static void
3677 free_mbdata (struct dfa *d)
3679 free (d->multibyte_prop);
3680 free (d->lex.brack.chars);
3681 free (d->mb_follows.elems);
3683 if (d->mb_trans)
3685 state_num s;
3686 for (s = -1; s < d->tralloc; s++)
3687 free (d->mb_trans[s]);
3688 free (d->mb_trans - 2);
3692 /* Return true if every construct in D is supported by this DFA matcher. */
3693 bool
3694 dfasupported (struct dfa const *d)
3696 for (idx_t i = 0; i < d->tindex; i++)
3698 switch (d->tokens[i])
3700 case BEGWORD:
3701 case ENDWORD:
3702 case LIMWORD:
3703 case NOTLIMWORD:
3704 if (!d->localeinfo.multibyte)
3705 continue;
3706 FALLTHROUGH;
3707 case BACKREF:
3708 case MBCSET:
3709 return false;
3712 return true;
3715 /* Disable use of the superset DFA if it is not likely to help
3716 performance. */
3717 static void
3718 maybe_disable_superset_dfa (struct dfa *d)
3720 if (!d->localeinfo.using_utf8)
3721 return;
3723 bool have_backref = false;
3724 for (idx_t i = 0; i < d->tindex; i++)
3726 switch (d->tokens[i])
3728 case ANYCHAR:
3729 /* Lowered. */
3730 abort ();
3731 case BACKREF:
3732 have_backref = true;
3733 break;
3734 case MBCSET:
3735 /* Requires multi-byte algorithm. */
3736 return;
3737 default:
3738 break;
3742 if (!have_backref && d->superset)
3744 /* The superset DFA is not likely to be much faster, so remove it. */
3745 dfafree (d->superset);
3746 free (d->superset);
3747 d->superset = NULL;
3750 free_mbdata (d);
3751 d->localeinfo.multibyte = false;
3752 d->dfaexec = dfaexec_sb;
3753 d->fast = true;
3756 static void
3757 dfassbuild (struct dfa *d)
3759 struct dfa *sup = dfaalloc ();
3761 *sup = *d;
3762 sup->localeinfo.multibyte = false;
3763 sup->dfaexec = dfaexec_sb;
3764 sup->multibyte_prop = NULL;
3765 sup->superset = NULL;
3766 sup->states = NULL;
3767 sup->sindex = 0;
3768 sup->constraints = NULL;
3769 sup->separates = NULL;
3770 sup->follows = NULL;
3771 sup->tralloc = 0;
3772 sup->trans = NULL;
3773 sup->fails = NULL;
3774 sup->success = NULL;
3775 sup->newlines = NULL;
3777 sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3778 if (d->cindex)
3780 memcpy (sup->charclasses, d->charclasses,
3781 d->cindex * sizeof *sup->charclasses);
3784 sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3785 sup->talloc = d->tindex * 2;
3787 bool have_achar = false;
3788 bool have_nchar = false;
3789 idx_t j;
3790 for (idx_t i = j = 0; i < d->tindex; i++)
3792 switch (d->tokens[i])
3794 case ANYCHAR:
3795 case MBCSET:
3796 case BACKREF:
3798 charclass ccl;
3799 fillset (&ccl);
3800 sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3801 sup->tokens[j++] = STAR;
3802 if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3803 || d->tokens[i + 1] == PLUS)
3804 i++;
3805 have_achar = true;
3807 break;
3808 case BEGWORD:
3809 case ENDWORD:
3810 case LIMWORD:
3811 case NOTLIMWORD:
3812 if (d->localeinfo.multibyte)
3814 /* These constraints aren't supported in a multibyte locale.
3815 Ignore them in the superset DFA. */
3816 sup->tokens[j++] = EMPTY;
3817 break;
3819 FALLTHROUGH;
3820 default:
3821 sup->tokens[j++] = d->tokens[i];
3822 if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3823 || d->tokens[i] >= CSET)
3824 have_nchar = true;
3825 break;
3828 sup->tindex = j;
3830 if (have_nchar && (have_achar || d->localeinfo.multibyte))
3831 d->superset = sup;
3832 else
3834 dfafree (sup);
3835 free (sup);
3839 /* Parse a string S of length LEN into D (but skip this step if S is null).
3840 Then analyze D and build a matcher for it.
3841 SEARCHFLAG says whether to build a searching or an exact matcher. */
3842 void
3843 dfacomp (char const *s, idx_t len, struct dfa *d, bool searchflag)
3845 if (s != NULL)
3846 dfaparse (s, len, d);
3848 dfassbuild (d);
3850 if (dfasupported (d))
3852 maybe_disable_superset_dfa (d);
3853 dfaanalyze (d, searchflag);
3855 else
3857 d->dfaexec = dfaexec_noop;
3860 if (d->superset)
3862 d->fast = true;
3863 dfaanalyze (d->superset, searchflag);
3867 /* Free the storage held by the components of a dfa. */
3868 void
3869 dfafree (struct dfa *d)
3871 free (d->charclasses);
3872 free (d->tokens);
3874 if (d->localeinfo.multibyte)
3875 free_mbdata (d);
3877 free (d->constraints);
3878 free (d->separates);
3880 for (idx_t i = 0; i < d->sindex; i++)
3882 free (d->states[i].elems.elems);
3883 free (d->states[i].mbps.elems);
3885 free (d->states);
3887 if (d->follows)
3889 for (idx_t i = 0; i < d->tindex; i++)
3890 free (d->follows[i].elems);
3891 free (d->follows);
3894 if (d->trans)
3896 for (idx_t i = 0; i < d->tralloc; i++)
3898 free (d->trans[i]);
3899 free (d->fails[i]);
3902 free (d->trans - 2);
3903 free (d->fails);
3904 free (d->newlines);
3905 free (d->success);
3908 if (d->superset)
3910 dfafree (d->superset);
3911 free (d->superset);
3915 /* Having found the postfix representation of the regular expression,
3916 try to find a long sequence of characters that must appear in any line
3917 containing the r.e.
3918 Finding a "longest" sequence is beyond the scope here;
3919 we take an easy way out and hope for the best.
3920 (Take "(ab|a)b"--please.)
3922 We do a bottom-up calculation of sequences of characters that must appear
3923 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3924 representation:
3925 sequences that must appear at the left of the match ("left")
3926 sequences that must appear at the right of the match ("right")
3927 lists of sequences that must appear somewhere in the match ("in")
3928 sequences that must constitute the match ("is")
3930 When we get to the root of the tree, we use one of the longest of its
3931 calculated "in" sequences as our answer.
3933 The sequences calculated for the various types of node (in pseudo ANSI c)
3934 are shown below. "p" is the operand of unary operators (and the left-hand
3935 operand of binary operators); "q" is the right-hand operand of binary
3936 operators.
3938 "ZERO" means "a zero-length sequence" below.
3940 Type left right is in
3941 ---- ---- ----- -- --
3942 char c # c # c # c # c
3944 ANYCHAR ZERO ZERO ZERO ZERO
3946 MBCSET ZERO ZERO ZERO ZERO
3948 CSET ZERO ZERO ZERO ZERO
3950 STAR ZERO ZERO ZERO ZERO
3952 QMARK ZERO ZERO ZERO ZERO
3954 PLUS p->left p->right ZERO p->in
3956 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3957 p->left : q->right : q->is!=ZERO) ? q->in plus
3958 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3959 ZERO
3961 OR longest common longest common (do p->is and substrings common
3962 leading trailing to q->is have same p->in and
3963 (sub)sequence (sub)sequence q->in length and content) ?
3964 of p->left of p->right
3965 and q->left and q->right p->is : NULL
3967 If there's anything else we recognize in the tree, all four sequences get set
3968 to zero-length sequences. If there's something we don't recognize in the
3969 tree, we just return a zero-length sequence.
3971 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3972 'aaa')?
3974 And ... is it here or someplace that we might ponder "optimizations" such as
3975 egrep 'psi|epsilon' -> egrep 'psi'
3976 egrep 'pepsi|epsilon' -> egrep 'epsi'
3977 (Yes, we now find "epsi" as a "string
3978 that must occur", but we might also
3979 simplify the *entire* r.e. being sought)
3980 grep '[c]' -> grep 'c'
3981 grep '(ab|a)b' -> grep 'ab'
3982 grep 'ab*' -> grep 'a'
3983 grep 'a*b' -> grep 'b'
3985 There are several issues:
3987 Is optimization easy (enough)?
3989 Does optimization actually accomplish anything,
3990 or is the automaton you get from "psi|epsilon" (for example)
3991 the same as the one you get from "psi" (for example)?
3993 Are optimizable r.e.'s likely to be used in real-life situations
3994 (something like 'ab*' is probably unlikely; something like is
3995 'psi|epsilon' is likelier)? */
3997 static char *
3998 icatalloc (char *old, char const *new)
4000 idx_t newsize = strlen (new);
4001 if (newsize == 0)
4002 return old;
4003 idx_t oldsize = strlen (old);
4004 char *result = xirealloc (old, oldsize + newsize + 1);
4005 memcpy (result + oldsize, new, newsize + 1);
4006 return result;
4009 static void
4010 freelist (char **cpp)
4012 while (*cpp)
4013 free (*cpp++);
4016 static char **
4017 enlistnew (char **cpp, char *new)
4019 /* Is there already something in the list that's new (or longer)? */
4020 idx_t i;
4021 for (i = 0; cpp[i] != NULL; i++)
4022 if (strstr (cpp[i], new) != NULL)
4024 free (new);
4025 return cpp;
4027 /* Eliminate any obsoleted strings. */
4028 for (idx_t j = 0; cpp[j] != NULL; )
4029 if (strstr (new, cpp[j]) == NULL)
4030 ++j;
4031 else
4033 free (cpp[j]);
4034 if (--i == j)
4035 break;
4036 cpp[j] = cpp[i];
4037 cpp[i] = NULL;
4039 /* Add the new string. */
4040 cpp = xreallocarray (cpp, i + 2, sizeof *cpp);
4041 cpp[i] = new;
4042 cpp[i + 1] = NULL;
4043 return cpp;
4046 static char **
4047 enlist (char **cpp, char const *str, idx_t len)
4049 return enlistnew (cpp, ximemdup0 (str, len));
4052 /* Given pointers to two strings, return a pointer to an allocated
4053 list of their distinct common substrings. */
4054 static char **
4055 comsubs (char *left, char const *right)
4057 char **cpp = xzalloc (sizeof *cpp);
4059 for (char *lcp = left; *lcp != '\0'; lcp++)
4061 idx_t len = 0;
4062 char *rcp = strchr (right, *lcp);
4063 while (rcp != NULL)
4065 idx_t i;
4066 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
4067 continue;
4068 if (i > len)
4069 len = i;
4070 rcp = strchr (rcp + 1, *lcp);
4072 if (len != 0)
4073 cpp = enlist (cpp, lcp, len);
4075 return cpp;
4078 static char **
4079 addlists (char **old, char **new)
4081 for (; *new; new++)
4082 old = enlistnew (old, xstrdup (*new));
4083 return old;
4086 /* Given two lists of substrings, return a new list giving substrings
4087 common to both. */
4088 static char **
4089 inboth (char **left, char **right)
4091 char **both = xzalloc (sizeof *both);
4093 for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
4095 for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
4097 char **temp = comsubs (left[lnum], right[rnum]);
4098 both = addlists (both, temp);
4099 freelist (temp);
4100 free (temp);
4103 return both;
4106 typedef struct must must;
4108 struct must
4110 char **in;
4111 char *left;
4112 char *right;
4113 char *is;
4114 bool begline;
4115 bool endline;
4116 must *prev;
4119 static must *
4120 allocmust (must *mp, idx_t size)
4122 must *new_mp = xmalloc (sizeof *new_mp);
4123 new_mp->in = xzalloc (sizeof *new_mp->in);
4124 new_mp->left = xizalloc (size);
4125 new_mp->right = xizalloc (size);
4126 new_mp->is = xizalloc (size);
4127 new_mp->begline = false;
4128 new_mp->endline = false;
4129 new_mp->prev = mp;
4130 return new_mp;
4133 static void
4134 resetmust (must *mp)
4136 freelist (mp->in);
4137 mp->in[0] = NULL;
4138 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4139 mp->begline = false;
4140 mp->endline = false;
4143 static void
4144 freemust (must *mp)
4146 freelist (mp->in);
4147 free (mp->in);
4148 free (mp->left);
4149 free (mp->right);
4150 free (mp->is);
4151 free (mp);
4154 struct dfamust *
4155 dfamust (struct dfa const *d)
4157 must *mp = NULL;
4158 char const *result = "";
4159 bool exact = false;
4160 bool begline = false;
4161 bool endline = false;
4162 bool need_begline = false;
4163 bool need_endline = false;
4164 bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
4166 for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
4168 token t = d->tokens[ri];
4169 switch (t)
4171 case BEGLINE:
4172 mp = allocmust (mp, 2);
4173 mp->begline = true;
4174 need_begline = true;
4175 break;
4176 case ENDLINE:
4177 mp = allocmust (mp, 2);
4178 mp->endline = true;
4179 need_endline = true;
4180 break;
4181 case LPAREN:
4182 case RPAREN:
4183 assert (!"neither LPAREN nor RPAREN may appear here");
4185 case EMPTY:
4186 case BEGWORD:
4187 case ENDWORD:
4188 case LIMWORD:
4189 case NOTLIMWORD:
4190 case BACKREF:
4191 case ANYCHAR:
4192 case MBCSET:
4193 mp = allocmust (mp, 2);
4194 break;
4196 case STAR:
4197 case QMARK:
4198 assume_nonnull (mp);
4199 resetmust (mp);
4200 break;
4202 case OR:
4204 char **new;
4205 must *rmp = mp;
4206 assume_nonnull (rmp);
4207 must *lmp = mp = mp->prev;
4208 assume_nonnull (lmp);
4209 idx_t j, ln, rn, n;
4211 /* Guaranteed to be. Unlikely, but ... */
4212 if (str_eq (lmp->is, rmp->is))
4214 lmp->begline &= rmp->begline;
4215 lmp->endline &= rmp->endline;
4217 else
4219 lmp->is[0] = '\0';
4220 lmp->begline = false;
4221 lmp->endline = false;
4223 /* Left side--easy */
4224 idx_t i = 0;
4225 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4226 ++i;
4227 lmp->left[i] = '\0';
4228 /* Right side */
4229 ln = strlen (lmp->right);
4230 rn = strlen (rmp->right);
4231 n = ln;
4232 if (n > rn)
4233 n = rn;
4234 for (i = 0; i < n; ++i)
4235 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4236 break;
4237 for (j = 0; j < i; ++j)
4238 lmp->right[j] = lmp->right[(ln - i) + j];
4239 lmp->right[j] = '\0';
4240 new = inboth (lmp->in, rmp->in);
4241 freelist (lmp->in);
4242 free (lmp->in);
4243 lmp->in = new;
4244 freemust (rmp);
4246 break;
4248 case PLUS:
4249 assume_nonnull (mp);
4250 mp->is[0] = '\0';
4251 break;
4253 case END:
4254 assume_nonnull (mp);
4255 assert (!mp->prev);
4256 for (idx_t i = 0; mp->in[i] != NULL; i++)
4257 if (strlen (mp->in[i]) > strlen (result))
4258 result = mp->in[i];
4259 if (str_eq (result, mp->is))
4261 if ((!need_begline || mp->begline) && (!need_endline
4262 || mp->endline))
4263 exact = true;
4264 begline = mp->begline;
4265 endline = mp->endline;
4267 goto done;
4269 case CAT:
4271 must *rmp = mp;
4272 assume_nonnull (rmp);
4273 must *lmp = mp = mp->prev;
4274 assume_nonnull (lmp);
4276 /* In. Everything in left, plus everything in
4277 right, plus concatenation of
4278 left's right and right's left. */
4279 lmp->in = addlists (lmp->in, rmp->in);
4280 if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4282 idx_t lrlen = strlen (lmp->right);
4283 idx_t rllen = strlen (rmp->left);
4284 char *tp = ximalloc (lrlen + rllen + 1);
4285 memcpy (tp + lrlen, rmp->left, rllen + 1);
4286 memcpy (tp, lmp->right, lrlen);
4287 lmp->in = enlistnew (lmp->in, tp);
4289 /* Left-hand */
4290 if (lmp->is[0] != '\0')
4291 lmp->left = icatalloc (lmp->left, rmp->left);
4292 /* Right-hand */
4293 if (rmp->is[0] == '\0')
4294 lmp->right[0] = '\0';
4295 lmp->right = icatalloc (lmp->right, rmp->right);
4296 /* Guaranteed to be */
4297 if ((lmp->is[0] != '\0' || lmp->begline)
4298 && (rmp->is[0] != '\0' || rmp->endline))
4300 lmp->is = icatalloc (lmp->is, rmp->is);
4301 lmp->endline = rmp->endline;
4303 else
4305 lmp->is[0] = '\0';
4306 lmp->begline = false;
4307 lmp->endline = false;
4309 freemust (rmp);
4311 break;
4313 case '\0':
4314 /* Not on *my* shift. */
4315 goto done;
4317 default:
4318 if (CSET <= t)
4320 /* If T is a singleton, or if case-folding in a unibyte
4321 locale and T's members all case-fold to the same char,
4322 convert T to one of its members. Otherwise, do
4323 nothing further with T. */
4324 charclass *ccl = &d->charclasses[t - CSET];
4325 int j;
4326 for (j = 0; j < NOTCHAR; j++)
4327 if (tstbit (j, ccl))
4328 break;
4329 if (! (j < NOTCHAR))
4331 mp = allocmust (mp, 2);
4332 break;
4334 t = j;
4335 while (++j < NOTCHAR)
4336 if (tstbit (j, ccl)
4337 && ! (case_fold_unibyte
4338 && toupper (j) == toupper (t)))
4339 break;
4340 if (j < NOTCHAR)
4342 mp = allocmust (mp, 2);
4343 break;
4347 idx_t rj = ri + 2;
4348 if (d->tokens[ri + 1] == CAT)
4350 for (; rj < d->tindex - 1; rj += 2)
4352 if ((rj != ri && (d->tokens[rj] <= 0
4353 || NOTCHAR <= d->tokens[rj]))
4354 || d->tokens[rj + 1] != CAT)
4355 break;
4358 mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4359 mp->is[0] = mp->left[0] = mp->right[0]
4360 = case_fold_unibyte ? toupper (t) : t;
4362 idx_t i;
4363 for (i = 1; ri + 2 < rj; i++)
4365 ri += 2;
4366 t = d->tokens[ri];
4367 mp->is[i] = mp->left[i] = mp->right[i]
4368 = case_fold_unibyte ? toupper (t) : t;
4370 mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4371 mp->in = enlist (mp->in, mp->is, i);
4372 break;
4375 done:;
4377 struct dfamust *dm = NULL;
4378 if (*result)
4380 dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
4381 dm->exact = exact;
4382 dm->begline = begline;
4383 dm->endline = endline;
4384 strcpy (dm->must, result);
4387 while (mp)
4389 must *prev = mp->prev;
4390 freemust (mp);
4391 mp = prev;
4394 return dm;
4397 void
4398 dfamustfree (struct dfamust *dm)
4400 free (dm);
4403 struct dfa *
4404 dfaalloc (void)
4406 return xmalloc (sizeof (struct dfa));
4409 /* Initialize DFA. */
4410 void
4411 dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
4412 reg_syntax_t bits, int dfaopts)
4414 memset (dfa, 0, offsetof (struct dfa, dfaexec));
4415 dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4416 dfa->localeinfo = *linfo;
4418 dfa->fast = !dfa->localeinfo.multibyte;
4420 dfa->canychar = -1;
4421 dfa->syntax.syntax_bits_set = true;
4422 dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4423 dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4424 dfa->syntax.syntax_bits = bits;
4425 dfa->syntax.dfaopts = dfaopts;
4427 for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4429 unsigned char uc = i;
4431 dfa->syntax.sbit[uc] = char_context (dfa, uc);
4432 switch (dfa->syntax.sbit[uc])
4434 case CTX_LETTER:
4435 setbit (uc, &dfa->syntax.letters);
4436 break;
4437 case CTX_NEWLINE:
4438 setbit (uc, &dfa->syntax.newline);
4439 break;
4442 /* POSIX requires that the five bytes in "\n\r./" (including the
4443 terminating NUL) cannot occur inside a multibyte character. */
4444 dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4445 ? (uc & 0xc0) != 0x80
4446 : strchr ("\n\r./", uc) != NULL);
4450 /* Initialize TO by copying FROM's syntax settings. */
4451 void
4452 dfacopysyntax (struct dfa *to, struct dfa const *from)
4454 memset (to, 0, offsetof (struct dfa, syntax));
4455 to->canychar = -1;
4456 to->fast = from->fast;
4457 to->syntax = from->syntax;
4458 to->dfaexec = from->dfaexec;
4459 to->localeinfo = from->localeinfo;
4462 /* vim:set shiftwidth=2: */