fchmod-tests, fchmodat tests, lchmod tests: Add more tests.
[gnulib.git] / lib / dfa.c
blob4929e4c3487a3a3b6e6f56724c7aac70a3081d3f
1 /* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2021 Free Software
3 Foundation, Inc.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc.,
18 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
20 /* Written June, 1988 by Mike Haertel
21 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
23 #include <config.h>
25 #include "dfa.h"
27 #include "flexmember.h"
29 #include <assert.h>
30 #include <ctype.h>
31 #include <stdint.h>
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <limits.h>
35 #include <string.h>
37 /* Another name for ptrdiff_t, for sizes of objects and nonnegative
38 indexes into objects. It is signed to help catch integer overflow.
39 It has its own name because it is for nonnegative values only. */
40 typedef ptrdiff_t idx_t;
41 static idx_t const IDX_MAX = PTRDIFF_MAX;
43 static bool
44 streq (char const *a, char const *b)
46 return strcmp (a, b) == 0;
49 static bool
50 isasciidigit (char c)
52 return '0' <= c && c <= '9';
55 #include "gettext.h"
56 #define _(str) gettext (str)
58 #include <wchar.h>
60 #include "intprops.h"
61 #include "xalloc.h"
62 #include "localeinfo.h"
64 #ifndef FALLTHROUGH
65 # if 201710L < __STDC_VERSION__
66 # define FALLTHROUGH [[__fallthrough__]]
67 # elif (__GNUC__ >= 7) || (__clang_major__ >= 10)
68 # define FALLTHROUGH __attribute__ ((__fallthrough__))
69 # else
70 # define FALLTHROUGH ((void) 0)
71 # endif
72 #endif
74 #ifndef MIN
75 # define MIN(a,b) ((a) < (b) ? (a) : (b))
76 #endif
78 /* HPUX defines these as macros in sys/param.h. */
79 #ifdef setbit
80 # undef setbit
81 #endif
82 #ifdef clrbit
83 # undef clrbit
84 #endif
86 /* For code that does not use Gnulib’s isblank module. */
87 #if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
88 # define isblank dfa_isblank
89 static int
90 isblank (int c)
92 return c == ' ' || c == '\t';
94 #endif
96 /* First integer value that is greater than any character code. */
97 enum { NOTCHAR = 1 << CHAR_BIT };
99 #ifdef UINT_LEAST64_MAX
101 /* Number of bits used in a charclass word. */
102 enum { CHARCLASS_WORD_BITS = 64 };
104 /* This represents part of a character class. It must be unsigned and
105 at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
106 typedef uint_least64_t charclass_word;
108 /* Part of a charclass initializer that represents 64 bits' worth of a
109 charclass, where LO and HI are the low and high-order 32 bits of
110 the 64-bit quantity. */
111 # define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
113 #else
114 /* Fallbacks for pre-C99 hosts that lack 64-bit integers. */
115 enum { CHARCLASS_WORD_BITS = 32 };
116 typedef unsigned long charclass_word;
117 # define CHARCLASS_PAIR(lo, hi) lo, hi
118 #endif
120 /* An initializer for a charclass whose 32-bit words are A through H. */
121 #define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \
122 {{ \
123 CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \
124 CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h) \
127 /* The maximum useful value of a charclass_word; all used bits are 1. */
128 static charclass_word const CHARCLASS_WORD_MASK
129 = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
131 /* Number of words required to hold a bit for every character. */
132 enum
134 CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
137 /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
138 typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
140 /* Convert a possibly-signed character to an unsigned character. This is
141 a bit safer than casting to unsigned char, since it catches some type
142 errors that the cast doesn't. */
143 static unsigned char
144 to_uchar (char ch)
146 return ch;
149 /* Contexts tell us whether a character is a newline or a word constituent.
150 Word-constituent characters are those that satisfy iswalnum, plus '_'.
151 Each character has a single CTX_* value; bitmasks of CTX_* values denote
152 a particular character class.
154 A state also stores a context value, which is a bitmask of CTX_* values.
155 A state's context represents a set of characters that the state's
156 predecessors must match. For example, a state whose context does not
157 include CTX_LETTER will never have transitions where the previous
158 character is a word constituent. A state whose context is CTX_ANY
159 might have transitions from any character. */
161 enum
163 CTX_NONE = 1,
164 CTX_LETTER = 2,
165 CTX_NEWLINE = 4,
166 CTX_ANY = 7
169 /* Sometimes characters can only be matched depending on the surrounding
170 context. Such context decisions depend on what the previous character
171 was, and the value of the current (lookahead) character. Context
172 dependent constraints are encoded as 9-bit integers. Each bit that
173 is set indicates that the constraint succeeds in the corresponding
174 context.
176 bit 6-8 - valid contexts when next character is CTX_NEWLINE
177 bit 3-5 - valid contexts when next character is CTX_LETTER
178 bit 0-2 - valid contexts when next character is CTX_NONE
180 succeeds_in_context determines whether a given constraint
181 succeeds in a particular context. Prev is a bitmask of possible
182 context values for the previous character, curr is the (single-bit)
183 context value for the lookahead character. */
184 static int
185 newline_constraint (int constraint)
187 return (constraint >> 6) & 7;
189 static int
190 letter_constraint (int constraint)
192 return (constraint >> 3) & 7;
194 static int
195 other_constraint (int constraint)
197 return constraint & 7;
200 static bool
201 succeeds_in_context (int constraint, int prev, int curr)
203 return !! (((curr & CTX_NONE ? other_constraint (constraint) : 0) \
204 | (curr & CTX_LETTER ? letter_constraint (constraint) : 0) \
205 | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
206 & prev);
209 /* The following describe what a constraint depends on. */
210 static bool
211 prev_newline_dependent (int constraint)
213 return ((constraint ^ constraint >> 2) & 0111) != 0;
215 static bool
216 prev_letter_dependent (int constraint)
218 return ((constraint ^ constraint >> 1) & 0111) != 0;
221 /* Tokens that match the empty string subject to some constraint actually
222 work by applying that constraint to determine what may follow them,
223 taking into account what has gone before. The following values are
224 the constraints corresponding to the special tokens previously defined. */
225 enum
227 NO_CONSTRAINT = 0777,
228 BEGLINE_CONSTRAINT = 0444,
229 ENDLINE_CONSTRAINT = 0700,
230 BEGWORD_CONSTRAINT = 0050,
231 ENDWORD_CONSTRAINT = 0202,
232 LIMWORD_CONSTRAINT = 0252,
233 NOTLIMWORD_CONSTRAINT = 0525
236 /* The regexp is parsed into an array of tokens in postfix form. Some tokens
237 are operators and others are terminal symbols. Most (but not all) of these
238 codes are returned by the lexical analyzer. */
240 typedef ptrdiff_t token;
241 static token const TOKEN_MAX = PTRDIFF_MAX;
243 /* States are indexed by state_num values. These are normally
244 nonnegative but -1 is used as a special value. */
245 typedef ptrdiff_t state_num;
247 /* Predefined token values. */
248 enum
250 END = -1, /* END is a terminal symbol that matches the
251 end of input; any value of END or less in
252 the parse tree is such a symbol. Accepting
253 states of the DFA are those that would have
254 a transition on END. This is -1, not some
255 more-negative value, to tweak the speed of
256 comparisons to END. */
258 /* Ordinary character values are terminal symbols that match themselves. */
260 /* CSET must come last in the following list of special tokens. Otherwise,
261 the list order matters only for performance. Related special tokens
262 should have nearby values so that code like (t == ANYCHAR || t == MBCSET
263 || CSET <= t) can be done with a single machine-level comparison. */
265 EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
266 the empty string. */
268 QMARK, /* QMARK is an operator of one argument that
269 matches zero or one occurrences of its
270 argument. */
272 STAR, /* STAR is an operator of one argument that
273 matches the Kleene closure (zero or more
274 occurrences) of its argument. */
276 PLUS, /* PLUS is an operator of one argument that
277 matches the positive closure (one or more
278 occurrences) of its argument. */
280 REPMN, /* REPMN is a lexical token corresponding
281 to the {m,n} construct. REPMN never
282 appears in the compiled token vector. */
284 CAT, /* CAT is an operator of two arguments that
285 matches the concatenation of its
286 arguments. CAT is never returned by the
287 lexical analyzer. */
289 OR, /* OR is an operator of two arguments that
290 matches either of its arguments. */
292 LPAREN, /* LPAREN never appears in the parse tree,
293 it is only a lexeme. */
295 RPAREN, /* RPAREN never appears in the parse tree. */
297 WCHAR, /* Only returned by lex. wctok contains
298 the wide character representation. */
300 ANYCHAR, /* ANYCHAR is a terminal symbol that matches
301 a valid multibyte (or single byte) character.
302 It is used only if MB_CUR_MAX > 1. */
304 BEG, /* BEG is an initial symbol that matches the
305 beginning of input. */
307 BEGLINE, /* BEGLINE is a terminal symbol that matches
308 the empty string at the beginning of a
309 line. */
311 ENDLINE, /* ENDLINE is a terminal symbol that matches
312 the empty string at the end of a line. */
314 BEGWORD, /* BEGWORD is a terminal symbol that matches
315 the empty string at the beginning of a
316 word. */
318 ENDWORD, /* ENDWORD is a terminal symbol that matches
319 the empty string at the end of a word. */
321 LIMWORD, /* LIMWORD is a terminal symbol that matches
322 the empty string at the beginning or the
323 end of a word. */
325 NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
326 matches the empty string not at
327 the beginning or end of a word. */
329 BACKREF, /* BACKREF is generated by \<digit>
330 or by any other construct that
331 is not completely handled. If the scanner
332 detects a transition on backref, it returns
333 a kind of "semi-success" indicating that
334 the match will have to be verified with
335 a backtracking matcher. */
337 MBCSET, /* MBCSET is similar to CSET, but for
338 multibyte characters. */
340 CSET /* CSET and (and any value greater) is a
341 terminal symbol that matches any of a
342 class of characters. */
346 /* States of the recognizer correspond to sets of positions in the parse
347 tree, together with the constraints under which they may be matched.
348 So a position is encoded as an index into the parse tree together with
349 a constraint. */
350 typedef struct
352 idx_t index; /* Index into the parse array. */
353 unsigned int constraint; /* Constraint for matching this position. */
354 } position;
356 /* Sets of positions are stored as arrays. */
357 typedef struct
359 position *elems; /* Elements of this position set. */
360 idx_t nelem; /* Number of elements in this set. */
361 idx_t alloc; /* Number of elements allocated in ELEMS. */
362 } position_set;
364 /* A state of the dfa consists of a set of positions, some flags,
365 and the token value of the lowest-numbered position of the state that
366 contains an END token. */
367 typedef struct
369 size_t hash; /* Hash of the positions of this state. */
370 position_set elems; /* Positions this state could match. */
371 unsigned char context; /* Context from previous state. */
372 unsigned short constraint; /* Constraint for this state to accept. */
373 position_set mbps; /* Positions which can match multibyte
374 characters or the follows, e.g., period.
375 Used only if MB_CUR_MAX > 1. */
376 state_num mb_trindex; /* Index of this state in MB_TRANS, or
377 negative if the state does not have
378 ANYCHAR. */
379 } dfa_state;
381 /* Maximum for any transition table count. This should be at least 3,
382 for the initial state setup. */
383 enum { MAX_TRCOUNT = 1024 };
385 /* A bracket operator.
386 e.g., [a-c], [[:alpha:]], etc. */
387 struct mb_char_classes
389 ptrdiff_t cset;
390 bool invert;
391 wchar_t *chars; /* Normal characters. */
392 idx_t nchars;
393 idx_t nchars_alloc;
396 struct regex_syntax
398 /* Syntax bits controlling the behavior of the lexical analyzer. */
399 reg_syntax_t syntax_bits;
400 bool syntax_bits_set;
402 /* Flag for case-folding letters into sets. */
403 bool case_fold;
405 /* True if ^ and $ match only the start and end of data, and do not match
406 end-of-line within data. */
407 bool anchor;
409 /* End-of-line byte in data. */
410 unsigned char eolbyte;
412 /* Cache of char-context values. */
413 char sbit[NOTCHAR];
415 /* If never_trail[B], the byte B cannot be a non-initial byte in a
416 multibyte character. */
417 bool never_trail[NOTCHAR];
419 /* Set of characters considered letters. */
420 charclass letters;
422 /* Set of characters that are newline. */
423 charclass newline;
426 /* Lexical analyzer. All the dross that deals with the obnoxious
427 GNU Regex syntax bits is located here. The poor, suffering
428 reader is referred to the GNU Regex documentation for the
429 meaning of the @#%!@#%^!@ syntax bits. */
430 struct lexer_state
432 char const *ptr; /* Pointer to next input character. */
433 idx_t left; /* Number of characters remaining. */
434 token lasttok; /* Previous token returned; initially END. */
435 idx_t parens; /* Count of outstanding left parens. */
436 int minrep, maxrep; /* Repeat counts for {m,n}. */
438 /* Wide character representation of the current multibyte character,
439 or WEOF if there was an encoding error. Used only if
440 MB_CUR_MAX > 1. */
441 wint_t wctok;
443 /* The most recently analyzed multibyte bracket expression. */
444 struct mb_char_classes brack;
446 /* We're separated from beginning or (, | only by zero-width characters. */
447 bool laststart;
450 /* Recursive descent parser for regular expressions. */
452 struct parser_state
454 token tok; /* Lookahead token. */
455 idx_t depth; /* Current depth of a hypothetical stack
456 holding deferred productions. This is
457 used to determine the depth that will be
458 required of the real stack later on in
459 dfaanalyze. */
462 /* A compiled regular expression. */
463 struct dfa
465 /* Fields filled by the scanner. */
466 charclass *charclasses; /* Array of character sets for CSET tokens. */
467 idx_t cindex; /* Index for adding new charclasses. */
468 idx_t calloc; /* Number of charclasses allocated. */
469 ptrdiff_t canychar; /* Index of anychar class, or -1. */
471 /* Scanner state */
472 struct lexer_state lex;
474 /* Parser state */
475 struct parser_state parse;
477 /* Fields filled by the parser. */
478 token *tokens; /* Postfix parse array. */
479 idx_t tindex; /* Index for adding new tokens. */
480 idx_t talloc; /* Number of tokens currently allocated. */
481 idx_t depth; /* Depth required of an evaluation stack
482 used for depth-first traversal of the
483 parse tree. */
484 idx_t nleaves; /* Number of non-EMPTY leaves
485 in the parse tree. */
486 idx_t nregexps; /* Count of parallel regexps being built
487 with dfaparse. */
488 bool fast; /* The DFA is fast. */
489 bool epsilon; /* Does a token match only the empty string? */
490 token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales. */
491 mbstate_t mbs; /* Multibyte conversion state. */
493 /* The following are valid only if MB_CUR_MAX > 1. */
495 /* The value of multibyte_prop[i] is defined by following rule.
496 if tokens[i] < NOTCHAR
497 bit 0 : tokens[i] is the first byte of a character, including
498 single-byte characters.
499 bit 1 : tokens[i] is the last byte of a character, including
500 single-byte characters.
502 e.g.
503 tokens
504 = 'single_byte_a', 'multi_byte_A', single_byte_b'
505 = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
506 multibyte_prop
507 = 3 , 1 , 0 , 2 , 3
509 char *multibyte_prop;
511 /* Fields filled by the superset. */
512 struct dfa *superset; /* Hint of the dfa. */
514 /* Fields filled by the state builder. */
515 dfa_state *states; /* States of the dfa. */
516 state_num sindex; /* Index for adding new states. */
517 idx_t salloc; /* Number of states currently allocated. */
519 /* Fields filled by the parse tree->NFA conversion. */
520 position_set *follows; /* Array of follow sets, indexed by position
521 index. The follow of a position is the set
522 of positions containing characters that
523 could conceivably follow a character
524 matching the given position in a string
525 matching the regexp. Allocated to the
526 maximum possible position index. */
527 bool searchflag; /* We are supposed to build a searching
528 as opposed to an exact matcher. A searching
529 matcher finds the first and shortest string
530 matching a regexp anywhere in the buffer,
531 whereas an exact matcher finds the longest
532 string matching, but anchored to the
533 beginning of the buffer. */
535 /* Fields filled by dfaanalyze. */
536 int *constraints; /* Array of union of accepting constraints
537 in the follow of a position. */
538 int *separates; /* Array of contexts on follow of a
539 position. */
541 /* Fields filled by dfaexec. */
542 state_num tralloc; /* Number of transition tables that have
543 slots so far, not counting trans[-1] and
544 trans[-2]. */
545 int trcount; /* Number of transition tables that have
546 been built, other than for initial
547 states. */
548 int min_trcount; /* Number of initial states. Equivalently,
549 the minimum state number for which trcount
550 counts transitions. */
551 state_num **trans; /* Transition tables for states that can
552 never accept. If the transitions for a
553 state have not yet been computed, or the
554 state could possibly accept, its entry in
555 this table is NULL. This points to two
556 past the start of the allocated array,
557 and trans[-1] and trans[-2] are always
558 NULL. */
559 state_num **fails; /* Transition tables after failing to accept
560 on a state that potentially could do so.
561 If trans[i] is non-null, fails[i] must
562 be null. */
563 char *success; /* Table of acceptance conditions used in
564 dfaexec and computed in build_state. */
565 state_num *newlines; /* Transitions on newlines. The entry for a
566 newline in any transition table is always
567 -1 so we can count lines without wasting
568 too many cycles. The transition for a
569 newline is stored separately and handled
570 as a special case. Newline is also used
571 as a sentinel at the end of the buffer. */
572 state_num initstate_notbol; /* Initial state for CTX_LETTER and CTX_NONE
573 context in multibyte locales, in which we
574 do not distinguish between their contexts,
575 as not supported word. */
576 position_set mb_follows; /* Follow set added by ANYCHAR on demand. */
577 state_num **mb_trans; /* Transition tables for states with
578 ANYCHAR. */
579 state_num mb_trcount; /* Number of transition tables for states with
580 ANYCHAR that have actually been built. */
582 /* Syntax configuration. This is near the end so that dfacopysyntax
583 can memset up to here. */
584 struct regex_syntax syntax;
586 /* Information derived from the locale. This is at the end so that
587 a quick memset need not clear it specially. */
589 /* dfaexec implementation. */
590 char *(*dfaexec) (struct dfa *, char const *, char *,
591 bool, ptrdiff_t *, bool *);
593 /* Other cached information derived from the locale. */
594 struct localeinfo localeinfo;
597 /* User access to dfa internals. */
599 /* S could possibly be an accepting state of R. */
600 static bool
601 accepting (state_num s, struct dfa const *r)
603 return r->states[s].constraint != 0;
606 /* STATE accepts in the specified context. */
607 static bool
608 accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
610 return succeeds_in_context (dfa->states[state].constraint, prev, curr);
613 static void regexp (struct dfa *dfa);
615 /* Store into *PWC the result of converting the leading bytes of the
616 multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
617 and updating the conversion state in *D. On conversion error,
618 convert just a single byte, to WEOF. Return the number of bytes
619 converted.
621 This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
623 * PWC points to wint_t, not to wchar_t.
624 * The last arg is a dfa *D instead of merely a multibyte conversion
625 state D->mbs.
626 * N must be at least 1.
627 * S[N - 1] must be a sentinel byte.
628 * Shift encodings are not supported.
629 * The return value is always in the range 1..N.
630 * D->mbs is always valid afterwards.
631 * *PWC is always set to something. */
632 static int
633 mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
635 unsigned char uc = s[0];
636 wint_t wc = d->localeinfo.sbctowc[uc];
638 if (wc == WEOF)
640 wchar_t wch;
641 size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
642 if (0 < nbytes && nbytes < (size_t) -2)
644 *pwc = wch;
645 return nbytes;
647 memset (&d->mbs, 0, sizeof d->mbs);
650 *pwc = wc;
651 return 1;
654 #ifdef DEBUG
656 static void
657 prtok (token t)
659 if (t <= END)
660 fprintf (stderr, "END");
661 else if (0 <= t && t < NOTCHAR)
663 unsigned int ch = t;
664 fprintf (stderr, "0x%02x", ch);
666 else
668 char const *s;
669 switch (t)
671 case BEG:
672 s = "BEG";
673 break;
674 case EMPTY:
675 s = "EMPTY";
676 break;
677 case BACKREF:
678 s = "BACKREF";
679 break;
680 case BEGLINE:
681 s = "BEGLINE";
682 break;
683 case ENDLINE:
684 s = "ENDLINE";
685 break;
686 case BEGWORD:
687 s = "BEGWORD";
688 break;
689 case ENDWORD:
690 s = "ENDWORD";
691 break;
692 case LIMWORD:
693 s = "LIMWORD";
694 break;
695 case NOTLIMWORD:
696 s = "NOTLIMWORD";
697 break;
698 case QMARK:
699 s = "QMARK";
700 break;
701 case STAR:
702 s = "STAR";
703 break;
704 case PLUS:
705 s = "PLUS";
706 break;
707 case CAT:
708 s = "CAT";
709 break;
710 case OR:
711 s = "OR";
712 break;
713 case LPAREN:
714 s = "LPAREN";
715 break;
716 case RPAREN:
717 s = "RPAREN";
718 break;
719 case ANYCHAR:
720 s = "ANYCHAR";
721 break;
722 case MBCSET:
723 s = "MBCSET";
724 break;
725 default:
726 s = "CSET";
727 break;
729 fprintf (stderr, "%s", s);
732 #endif /* DEBUG */
734 /* Stuff pertaining to charclasses. */
736 static bool
737 tstbit (unsigned int b, charclass const *c)
739 return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
742 static void
743 setbit (unsigned int b, charclass *c)
745 charclass_word one = 1;
746 c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
749 static void
750 clrbit (unsigned int b, charclass *c)
752 charclass_word one = 1;
753 c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
756 static void
757 zeroset (charclass *s)
759 memset (s, 0, sizeof *s);
762 static void
763 fillset (charclass *s)
765 for (int i = 0; i < CHARCLASS_WORDS; i++)
766 s->w[i] = CHARCLASS_WORD_MASK;
769 static void
770 notset (charclass *s)
772 for (int i = 0; i < CHARCLASS_WORDS; ++i)
773 s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
776 static bool
777 equal (charclass const *s1, charclass const *s2)
779 charclass_word w = 0;
780 for (int i = 0; i < CHARCLASS_WORDS; i++)
781 w |= s1->w[i] ^ s2->w[i];
782 return w == 0;
785 static bool
786 emptyset (charclass const *s)
788 charclass_word w = 0;
789 for (int i = 0; i < CHARCLASS_WORDS; i++)
790 w |= s->w[i];
791 return w == 0;
794 /* Grow PA, which points to an array of *NITEMS items, and return the
795 location of the reallocated array, updating *NITEMS to reflect its
796 new size. The new array will contain at least NITEMS_INCR_MIN more
797 items, but will not contain more than NITEMS_MAX items total.
798 ITEM_SIZE is the size of each item, in bytes.
800 ITEM_SIZE and NITEMS_INCR_MIN must be positive. *NITEMS must be
801 nonnegative. If NITEMS_MAX is -1, it is treated as if it were
802 infinity.
804 If PA is null, then allocate a new array instead of reallocating
805 the old one.
807 Thus, to grow an array A without saving its old contents, do
808 { free (A); A = xpalloc (NULL, &AITEMS, ...); }. */
810 static void *
811 xpalloc (void *pa, idx_t *nitems, idx_t nitems_incr_min,
812 ptrdiff_t nitems_max, idx_t item_size)
814 idx_t n0 = *nitems;
816 /* The approximate size to use for initial small allocation
817 requests. This is the largest "small" request for the GNU C
818 library malloc. */
819 enum { DEFAULT_MXFAST = 64 * sizeof (size_t) / 4 };
821 /* If the array is tiny, grow it to about (but no greater than)
822 DEFAULT_MXFAST bytes. Otherwise, grow it by about 50%.
823 Adjust the growth according to three constraints: NITEMS_INCR_MIN,
824 NITEMS_MAX, and what the C language can represent safely. */
826 idx_t n, nbytes;
827 if (INT_ADD_WRAPV (n0, n0 >> 1, &n))
828 n = IDX_MAX;
829 if (0 <= nitems_max && nitems_max < n)
830 n = nitems_max;
832 idx_t adjusted_nbytes
833 = ((INT_MULTIPLY_WRAPV (n, item_size, &nbytes) || SIZE_MAX < nbytes)
834 ? MIN (IDX_MAX, SIZE_MAX)
835 : nbytes < DEFAULT_MXFAST ? DEFAULT_MXFAST : 0);
836 if (adjusted_nbytes)
838 n = adjusted_nbytes / item_size;
839 nbytes = adjusted_nbytes - adjusted_nbytes % item_size;
842 if (! pa)
843 *nitems = 0;
844 if (n - n0 < nitems_incr_min
845 && (INT_ADD_WRAPV (n0, nitems_incr_min, &n)
846 || (0 <= nitems_max && nitems_max < n)
847 || INT_MULTIPLY_WRAPV (n, item_size, &nbytes)))
848 xalloc_die ();
849 pa = xrealloc (pa, nbytes);
850 *nitems = n;
851 return pa;
854 /* Ensure that the array addressed by PA holds at least I + 1 items.
855 Either return PA, or reallocate the array and return its new address.
856 Although PA may be null, the returned value is never null.
858 The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
859 is updated on reallocation. If PA is null, *NITEMS must be zero.
860 Do not allocate more than NITEMS_MAX items total; -1 means no limit.
861 ITEM_SIZE is the size of one item; it must be positive.
862 Avoid O(N**2) behavior on arrays growing linearly. */
863 static void *
864 maybe_realloc (void *pa, idx_t i, idx_t *nitems,
865 ptrdiff_t nitems_max, idx_t item_size)
867 if (i < *nitems)
868 return pa;
869 return xpalloc (pa, nitems, 1, nitems_max, item_size);
872 /* In DFA D, find the index of charclass S, or allocate a new one. */
873 static idx_t
874 charclass_index (struct dfa *d, charclass const *s)
876 idx_t i;
878 for (i = 0; i < d->cindex; ++i)
879 if (equal (s, &d->charclasses[i]))
880 return i;
881 d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
882 TOKEN_MAX - CSET, sizeof *d->charclasses);
883 ++d->cindex;
884 d->charclasses[i] = *s;
885 return i;
888 static bool
889 unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
891 return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
894 static int
895 char_context (struct dfa const *dfa, unsigned char c)
897 if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
898 return CTX_NEWLINE;
899 if (unibyte_word_constituent (dfa, c))
900 return CTX_LETTER;
901 return CTX_NONE;
904 /* Set a bit in the charclass for the given wchar_t. Do nothing if WC
905 is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
906 this may happen when folding case in weird Turkish locales where
907 dotless i/dotted I are not included in the chosen character set.
908 Return whether a bit was set in the charclass. */
909 static bool
910 setbit_wc (wint_t wc, charclass *c)
912 int b = wctob (wc);
913 if (b < 0)
914 return false;
916 setbit (b, c);
917 return true;
920 /* Set a bit for B and its case variants in the charclass C.
921 MB_CUR_MAX must be 1. */
922 static void
923 setbit_case_fold_c (int b, charclass *c)
925 int ub = toupper (b);
926 for (int i = 0; i < NOTCHAR; i++)
927 if (toupper (i) == ub)
928 setbit (i, c);
931 /* Fetch the next lexical input character from the pattern. There
932 must at least one byte of pattern input. Set DFA->lex.wctok to the
933 value of the character or to WEOF depending on whether the input is
934 a valid multibyte character (possibly of length 1). Then return
935 the next input byte value, except return EOF if the input is a
936 multibyte character of length greater than 1. */
937 static int
938 fetch_wc (struct dfa *dfa)
940 int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
941 dfa);
942 int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
943 dfa->lex.ptr += nbytes;
944 dfa->lex.left -= nbytes;
945 return c;
948 /* If there is no more input, report an error about unbalanced brackets.
949 Otherwise, behave as with fetch_wc (DFA). */
950 static int
951 bracket_fetch_wc (struct dfa *dfa)
953 if (! dfa->lex.left)
954 dfaerror (_("unbalanced ["));
955 return fetch_wc (dfa);
958 typedef int predicate (int);
960 /* The following list maps the names of the Posix named character classes
961 to predicate functions that determine whether a given character is in
962 the class. The leading [ has already been eaten by the lexical
963 analyzer. */
964 struct dfa_ctype
966 const char *name;
967 predicate *func;
968 bool single_byte_only;
971 static const struct dfa_ctype prednames[] = {
972 {"alpha", isalpha, false},
973 {"upper", isupper, false},
974 {"lower", islower, false},
975 {"digit", isdigit, true},
976 {"xdigit", isxdigit, false},
977 {"space", isspace, false},
978 {"punct", ispunct, false},
979 {"alnum", isalnum, false},
980 {"print", isprint, false},
981 {"graph", isgraph, false},
982 {"cntrl", iscntrl, false},
983 {"blank", isblank, false},
984 {NULL, NULL, false}
987 static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
988 find_pred (const char *str)
990 for (int i = 0; prednames[i].name; i++)
991 if (streq (str, prednames[i].name))
992 return &prednames[i];
993 return NULL;
996 /* Parse a bracket expression, which possibly includes multibyte
997 characters. */
998 static token
999 parse_bracket_exp (struct dfa *dfa)
1001 /* This is a bracket expression that dfaexec is known to
1002 process correctly. */
1003 bool known_bracket_exp = true;
1005 /* Used to warn about [:space:].
1006 Bit 0 = first character is a colon.
1007 Bit 1 = last character is a colon.
1008 Bit 2 = includes any other character but a colon.
1009 Bit 3 = includes ranges, char/equiv classes or collation elements. */
1010 int colon_warning_state;
1012 dfa->lex.brack.nchars = 0;
1013 charclass ccl;
1014 zeroset (&ccl);
1015 int c = bracket_fetch_wc (dfa);
1016 bool invert = c == '^';
1017 if (invert)
1019 c = bracket_fetch_wc (dfa);
1020 known_bracket_exp = dfa->localeinfo.simple;
1022 wint_t wc = dfa->lex.wctok;
1023 int c1;
1024 wint_t wc1;
1025 colon_warning_state = (c == ':');
1028 c1 = NOTCHAR; /* Mark c1 as not initialized. */
1029 colon_warning_state &= ~2;
1031 /* Note that if we're looking at some other [:...:] construct,
1032 we just treat it as a bunch of ordinary characters. We can do
1033 this because we assume regex has checked for syntax errors before
1034 dfa is ever called. */
1035 if (c == '[')
1037 c1 = bracket_fetch_wc (dfa);
1038 wc1 = dfa->lex.wctok;
1040 if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
1041 || c1 == '.' || c1 == '=')
1043 enum { MAX_BRACKET_STRING_LEN = 32 };
1044 char str[MAX_BRACKET_STRING_LEN + 1];
1045 int len = 0;
1046 for (;;)
1048 c = bracket_fetch_wc (dfa);
1049 if (dfa->lex.left == 0
1050 || (c == c1 && dfa->lex.ptr[0] == ']'))
1051 break;
1052 if (len < MAX_BRACKET_STRING_LEN)
1053 str[len++] = c;
1054 else
1055 /* This is in any case an invalid class name. */
1056 str[0] = '\0';
1058 str[len] = '\0';
1060 /* Fetch bracket. */
1061 c = bracket_fetch_wc (dfa);
1062 wc = dfa->lex.wctok;
1063 if (c1 == ':')
1064 /* Build character class. POSIX allows character
1065 classes to match multicharacter collating elements,
1066 but the regex code does not support that, so do not
1067 worry about that possibility. */
1069 char const *class
1070 = (dfa->syntax.case_fold && (streq (str, "upper")
1071 || streq (str, "lower"))
1072 ? "alpha" : str);
1073 const struct dfa_ctype *pred = find_pred (class);
1074 if (!pred)
1075 dfaerror (_("invalid character class"));
1077 if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1078 known_bracket_exp = false;
1079 else
1080 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1081 if (pred->func (c2))
1082 setbit (c2, &ccl);
1084 else
1085 known_bracket_exp = false;
1087 colon_warning_state |= 8;
1089 /* Fetch new lookahead character. */
1090 c1 = bracket_fetch_wc (dfa);
1091 wc1 = dfa->lex.wctok;
1092 continue;
1095 /* We treat '[' as a normal character here. c/c1/wc/wc1
1096 are already set up. */
1099 if (c == '\\'
1100 && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1102 c = bracket_fetch_wc (dfa);
1103 wc = dfa->lex.wctok;
1106 if (c1 == NOTCHAR)
1108 c1 = bracket_fetch_wc (dfa);
1109 wc1 = dfa->lex.wctok;
1112 if (c1 == '-')
1113 /* build range characters. */
1115 int c2 = bracket_fetch_wc (dfa);
1116 wint_t wc2 = dfa->lex.wctok;
1118 /* A bracket expression like [a-[.aa.]] matches an unknown set.
1119 Treat it like [-a[.aa.]] while parsing it, and
1120 remember that the set is unknown. */
1121 if (c2 == '[' && dfa->lex.ptr[0] == '.')
1123 known_bracket_exp = false;
1124 c2 = ']';
1127 if (c2 == ']')
1129 /* In the case [x-], the - is an ordinary hyphen,
1130 which is left in c1, the lookahead character. */
1131 dfa->lex.ptr--;
1132 dfa->lex.left++;
1134 else
1136 if (c2 == '\\' && (dfa->syntax.syntax_bits
1137 & RE_BACKSLASH_ESCAPE_IN_LISTS))
1139 c2 = bracket_fetch_wc (dfa);
1140 wc2 = dfa->lex.wctok;
1143 colon_warning_state |= 8;
1144 c1 = bracket_fetch_wc (dfa);
1145 wc1 = dfa->lex.wctok;
1147 /* Treat [x-y] as a range if x != y. */
1148 if (wc != wc2 || wc == WEOF)
1150 if (dfa->localeinfo.simple
1151 || (isasciidigit (c) & isasciidigit (c2)))
1153 for (int ci = c; ci <= c2; ci++)
1154 if (dfa->syntax.case_fold && isalpha (ci))
1155 setbit_case_fold_c (ci, &ccl);
1156 else
1157 setbit (ci, &ccl);
1159 else
1160 known_bracket_exp = false;
1162 continue;
1167 colon_warning_state |= (c == ':') ? 2 : 4;
1169 if (!dfa->localeinfo.multibyte)
1171 if (dfa->syntax.case_fold && isalpha (c))
1172 setbit_case_fold_c (c, &ccl);
1173 else
1174 setbit (c, &ccl);
1175 continue;
1178 if (wc == WEOF)
1179 known_bracket_exp = false;
1180 else
1182 wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1183 int n = (dfa->syntax.case_fold
1184 ? case_folded_counterparts (wc, folded + 1) + 1
1185 : 1);
1186 folded[0] = wc;
1187 for (int i = 0; i < n; i++)
1188 if (!setbit_wc (folded[i], &ccl))
1190 dfa->lex.brack.chars
1191 = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1192 &dfa->lex.brack.nchars_alloc, -1,
1193 sizeof *dfa->lex.brack.chars);
1194 dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1198 while ((wc = wc1, (c = c1) != ']'));
1200 if (colon_warning_state == 7)
1201 dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1203 if (! known_bracket_exp)
1204 return BACKREF;
1206 if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
1208 dfa->lex.brack.invert = invert;
1209 dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1210 return MBCSET;
1213 if (invert)
1215 notset (&ccl);
1216 if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1217 clrbit ('\n', &ccl);
1220 return CSET + charclass_index (dfa, &ccl);
1223 struct lexptr
1225 char const *ptr;
1226 idx_t left;
1229 static void
1230 push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
1232 ls->ptr = dfa->lex.ptr;
1233 ls->left = dfa->lex.left;
1234 dfa->lex.ptr = s;
1235 dfa->lex.left = strlen (s);
1238 static void
1239 pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
1241 dfa->lex.ptr = ls->ptr;
1242 dfa->lex.left = ls->left;
1245 static token
1246 lex (struct dfa *dfa)
1248 bool backslash = false;
1250 /* Basic plan: We fetch a character. If it's a backslash,
1251 we set the backslash flag and go through the loop again.
1252 On the plus side, this avoids having a duplicate of the
1253 main switch inside the backslash case. On the minus side,
1254 it means that just about every case begins with
1255 "if (backslash) ...". */
1256 for (int i = 0; i < 2; ++i)
1258 if (! dfa->lex.left)
1259 return dfa->lex.lasttok = END;
1260 int c = fetch_wc (dfa);
1262 switch (c)
1264 case '\\':
1265 if (backslash)
1266 goto normal_char;
1267 if (dfa->lex.left == 0)
1268 dfaerror (_("unfinished \\ escape"));
1269 backslash = true;
1270 break;
1272 case '^':
1273 if (backslash)
1274 goto normal_char;
1275 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1276 || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
1277 || dfa->lex.lasttok == OR)
1278 return dfa->lex.lasttok = BEGLINE;
1279 goto normal_char;
1281 case '$':
1282 if (backslash)
1283 goto normal_char;
1284 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1285 || dfa->lex.left == 0
1286 || ((dfa->lex.left
1287 > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1288 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1289 & (dfa->lex.ptr[0] == '\\')]
1290 == ')'))
1291 || ((dfa->lex.left
1292 > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1293 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1294 & (dfa->lex.ptr[0] == '\\')]
1295 == '|'))
1296 || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1297 && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1298 return dfa->lex.lasttok = ENDLINE;
1299 goto normal_char;
1301 case '1':
1302 case '2':
1303 case '3':
1304 case '4':
1305 case '5':
1306 case '6':
1307 case '7':
1308 case '8':
1309 case '9':
1310 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
1312 dfa->lex.laststart = false;
1313 return dfa->lex.lasttok = BACKREF;
1315 goto normal_char;
1317 case '`':
1318 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1320 /* FIXME: should be beginning of string */
1321 return dfa->lex.lasttok = BEGLINE;
1323 goto normal_char;
1325 case '\'':
1326 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1328 /* FIXME: should be end of string */
1329 return dfa->lex.lasttok = ENDLINE;
1331 goto normal_char;
1333 case '<':
1334 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1335 return dfa->lex.lasttok = BEGWORD;
1336 goto normal_char;
1338 case '>':
1339 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1340 return dfa->lex.lasttok = ENDWORD;
1341 goto normal_char;
1343 case 'b':
1344 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1345 return dfa->lex.lasttok = LIMWORD;
1346 goto normal_char;
1348 case 'B':
1349 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1350 return dfa->lex.lasttok = NOTLIMWORD;
1351 goto normal_char;
1353 case '?':
1354 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1355 goto normal_char;
1356 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1357 goto normal_char;
1358 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1359 && dfa->lex.laststart)
1360 goto normal_char;
1361 return dfa->lex.lasttok = QMARK;
1363 case '*':
1364 if (backslash)
1365 goto normal_char;
1366 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1367 && dfa->lex.laststart)
1368 goto normal_char;
1369 return dfa->lex.lasttok = STAR;
1371 case '+':
1372 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1373 goto normal_char;
1374 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1375 goto normal_char;
1376 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1377 && dfa->lex.laststart)
1378 goto normal_char;
1379 return dfa->lex.lasttok = PLUS;
1381 case '{':
1382 if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1383 goto normal_char;
1384 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1385 goto normal_char;
1386 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1387 && dfa->lex.laststart)
1388 goto normal_char;
1390 /* Cases:
1391 {M} - exact count
1392 {M,} - minimum count, maximum is infinity
1393 {,N} - 0 through N
1394 {,} - 0 to infinity (same as '*')
1395 {M,N} - M through N */
1397 char const *p = dfa->lex.ptr;
1398 char const *lim = p + dfa->lex.left;
1399 dfa->lex.minrep = dfa->lex.maxrep = -1;
1400 for (; p != lim && isasciidigit (*p); p++)
1401 dfa->lex.minrep = (dfa->lex.minrep < 0
1402 ? *p - '0'
1403 : MIN (RE_DUP_MAX + 1,
1404 dfa->lex.minrep * 10 + *p - '0'));
1405 if (p != lim)
1407 if (*p != ',')
1408 dfa->lex.maxrep = dfa->lex.minrep;
1409 else
1411 if (dfa->lex.minrep < 0)
1412 dfa->lex.minrep = 0;
1413 while (++p != lim && isasciidigit (*p))
1414 dfa->lex.maxrep
1415 = (dfa->lex.maxrep < 0
1416 ? *p - '0'
1417 : MIN (RE_DUP_MAX + 1,
1418 dfa->lex.maxrep * 10 + *p - '0'));
1421 if (! ((! backslash || (p != lim && *p++ == '\\'))
1422 && p != lim && *p++ == '}'
1423 && 0 <= dfa->lex.minrep
1424 && (dfa->lex.maxrep < 0
1425 || dfa->lex.minrep <= dfa->lex.maxrep)))
1427 if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
1428 goto normal_char;
1429 dfaerror (_("invalid content of \\{\\}"));
1431 if (RE_DUP_MAX < dfa->lex.maxrep)
1432 dfaerror (_("regular expression too big"));
1433 dfa->lex.ptr = p;
1434 dfa->lex.left = lim - p;
1436 dfa->lex.laststart = false;
1437 return dfa->lex.lasttok = REPMN;
1439 case '|':
1440 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1441 goto normal_char;
1442 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1443 goto normal_char;
1444 dfa->lex.laststart = true;
1445 return dfa->lex.lasttok = OR;
1447 case '\n':
1448 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
1449 || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1450 goto normal_char;
1451 dfa->lex.laststart = true;
1452 return dfa->lex.lasttok = OR;
1454 case '(':
1455 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1456 goto normal_char;
1457 dfa->lex.parens++;
1458 dfa->lex.laststart = true;
1459 return dfa->lex.lasttok = LPAREN;
1461 case ')':
1462 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1463 goto normal_char;
1464 if (dfa->lex.parens == 0
1465 && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1466 goto normal_char;
1467 dfa->lex.parens--;
1468 dfa->lex.laststart = false;
1469 return dfa->lex.lasttok = RPAREN;
1471 case '.':
1472 if (backslash)
1473 goto normal_char;
1474 if (dfa->canychar < 0)
1476 charclass ccl;
1477 fillset (&ccl);
1478 if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1479 clrbit ('\n', &ccl);
1480 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1481 clrbit ('\0', &ccl);
1482 if (dfa->localeinfo.multibyte)
1483 for (int c2 = 0; c2 < NOTCHAR; c2++)
1484 if (dfa->localeinfo.sbctowc[c2] == WEOF)
1485 clrbit (c2, &ccl);
1486 dfa->canychar = charclass_index (dfa, &ccl);
1488 dfa->lex.laststart = false;
1489 return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1490 ? ANYCHAR
1491 : CSET + dfa->canychar);
1493 case 's':
1494 case 'S':
1495 if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1496 goto normal_char;
1497 if (!dfa->localeinfo.multibyte)
1499 charclass ccl;
1500 zeroset (&ccl);
1501 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1502 if (isspace (c2))
1503 setbit (c2, &ccl);
1504 if (c == 'S')
1505 notset (&ccl);
1506 dfa->lex.laststart = false;
1507 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1510 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1511 add_utf8_anychar, makes sense. */
1513 /* \s and \S are documented to be equivalent to [[:space:]] and
1514 [^[:space:]] respectively, so tell the lexer to process those
1515 strings, each minus its "already processed" '['. */
1517 struct lexptr ls;
1518 push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1519 dfa->lex.lasttok = parse_bracket_exp (dfa);
1520 pop_lex_state (dfa, &ls);
1523 dfa->lex.laststart = false;
1524 return dfa->lex.lasttok;
1526 case 'w':
1527 case 'W':
1528 if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1529 goto normal_char;
1531 if (!dfa->localeinfo.multibyte)
1533 charclass ccl;
1534 zeroset (&ccl);
1535 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1536 if (dfa->syntax.sbit[c2] == CTX_LETTER)
1537 setbit (c2, &ccl);
1538 if (c == 'W')
1539 notset (&ccl);
1540 dfa->lex.laststart = false;
1541 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1544 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1545 add_utf8_anychar, makes sense. */
1547 /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1548 [^_[:alnum:]] respectively, so tell the lexer to process those
1549 strings, each minus its "already processed" '['. */
1551 struct lexptr ls;
1552 push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1553 dfa->lex.lasttok = parse_bracket_exp (dfa);
1554 pop_lex_state (dfa, &ls);
1557 dfa->lex.laststart = false;
1558 return dfa->lex.lasttok;
1560 case '[':
1561 if (backslash)
1562 goto normal_char;
1563 dfa->lex.laststart = false;
1564 return dfa->lex.lasttok = parse_bracket_exp (dfa);
1566 default:
1567 normal_char:
1568 dfa->lex.laststart = false;
1569 /* For multibyte character sets, folding is done in atom. Always
1570 return WCHAR. */
1571 if (dfa->localeinfo.multibyte)
1572 return dfa->lex.lasttok = WCHAR;
1574 if (dfa->syntax.case_fold && isalpha (c))
1576 charclass ccl;
1577 zeroset (&ccl);
1578 setbit_case_fold_c (c, &ccl);
1579 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1582 return dfa->lex.lasttok = c;
1586 /* The above loop should consume at most a backslash
1587 and some other character. */
1588 abort ();
1589 return END; /* keeps pedantic compilers happy. */
1592 static void
1593 addtok_mb (struct dfa *dfa, token t, char mbprop)
1595 if (dfa->talloc == dfa->tindex)
1597 dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
1598 sizeof *dfa->tokens);
1599 if (dfa->localeinfo.multibyte)
1600 dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
1601 sizeof *dfa->multibyte_prop);
1603 if (dfa->localeinfo.multibyte)
1604 dfa->multibyte_prop[dfa->tindex] = mbprop;
1605 dfa->tokens[dfa->tindex++] = t;
1607 switch (t)
1609 case QMARK:
1610 case STAR:
1611 case PLUS:
1612 break;
1614 case CAT:
1615 case OR:
1616 dfa->parse.depth--;
1617 break;
1619 case EMPTY:
1620 dfa->epsilon = true;
1621 goto increment_depth;
1623 case BACKREF:
1624 dfa->fast = false;
1625 goto increment_nleaves;
1627 case BEGLINE:
1628 case ENDLINE:
1629 case BEGWORD:
1630 case ENDWORD:
1631 case LIMWORD:
1632 case NOTLIMWORD:
1633 dfa->epsilon = true;
1634 FALLTHROUGH;
1635 default:
1636 increment_nleaves:
1637 dfa->nleaves++;
1638 increment_depth:
1639 dfa->parse.depth++;
1640 if (dfa->depth < dfa->parse.depth)
1641 dfa->depth = dfa->parse.depth;
1642 break;
1646 static void addtok_wc (struct dfa *dfa, wint_t wc);
1648 /* Add the given token to the parse tree, maintaining the depth count and
1649 updating the maximum depth if necessary. */
1650 static void
1651 addtok (struct dfa *dfa, token t)
1653 if (dfa->localeinfo.multibyte && t == MBCSET)
1655 bool need_or = false;
1657 /* Extract wide characters into alternations for better performance.
1658 This does not require UTF-8. */
1659 for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
1661 addtok_wc (dfa, dfa->lex.brack.chars[i]);
1662 if (need_or)
1663 addtok (dfa, OR);
1664 need_or = true;
1666 dfa->lex.brack.nchars = 0;
1668 /* Wide characters have been handled above, so it is possible
1669 that the set is empty now. Do nothing in that case. */
1670 if (dfa->lex.brack.cset != -1)
1672 addtok (dfa, CSET + dfa->lex.brack.cset);
1673 if (need_or)
1674 addtok (dfa, OR);
1677 else
1679 addtok_mb (dfa, t, 3);
1683 /* We treat a multibyte character as a single atom, so that DFA
1684 can treat a multibyte character as a single expression.
1686 e.g., we construct the following tree from "<mb1><mb2>".
1687 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1688 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1689 static void
1690 addtok_wc (struct dfa *dfa, wint_t wc)
1692 unsigned char buf[MB_LEN_MAX];
1693 mbstate_t s = { 0 };
1694 size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1695 int buflen;
1697 if (stored_bytes != (size_t) -1)
1698 buflen = stored_bytes;
1699 else
1701 /* This is merely stop-gap. buf[0] is undefined, yet skipping
1702 the addtok_mb call altogether can corrupt the heap. */
1703 buflen = 1;
1704 buf[0] = 0;
1707 addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
1708 for (int i = 1; i < buflen; i++)
1710 addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
1711 addtok (dfa, CAT);
1715 static void
1716 add_utf8_anychar (struct dfa *dfa)
1718 /* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
1719 UTF-8 byte sequence has been defined as follows:
1721 ([\x00-\x7f]
1722 |[\xc2-\xdf][\x80-\xbf]
1723 |[\xe0][\xa0-\xbf][\x80-\xbf]
1724 |[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
1725 |[\xed][\x80-\x9f][\x80-\xbf]
1726 |[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
1727 |[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
1728 |[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
1730 which I'll write more concisely "A|BC|DEC|FCC|GHC|IJCC|KCCC|LMCC",
1731 where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
1732 D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
1733 H = [\x80-\x9f], I = [\xf0],
1734 J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
1736 This can be refactored to "A|(B|DE|GH|(F|IJ|LM|KC)C)C". */
1738 /* Mnemonics for classes containing two or more bytes. */
1739 enum { A, B, C, E, F, H, J, K, M };
1741 /* Mnemonics for single-byte tokens. */
1742 enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
1744 static charclass const utf8_classes[] = {
1745 /* A. 00-7f: 1-byte sequence. */
1746 CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1748 /* B. c2-df: 1st byte of a 2-byte sequence. */
1749 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1751 /* C. 80-bf: non-leading bytes. */
1752 CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1754 /* D. e0 (just a token). */
1756 /* E. a0-bf: 2nd byte of a "DEC" sequence. */
1757 CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
1759 /* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence. */
1760 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
1762 /* G. ed (just a token). */
1764 /* H. 80-9f: 2nd byte of a "GHC" sequence. */
1765 CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
1767 /* I. f0 (just a token). */
1769 /* J. 90-bf: 2nd byte of an "IJCC" sequence. */
1770 CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
1772 /* K. f1-f3: 1st byte of a "KCCC" sequence. */
1773 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
1775 /* L. f4 (just a token). */
1777 /* M. 80-8f: 2nd byte of a "LMCC" sequence. */
1778 CHARCLASS_INIT (0, 0, 0, 0, 0xff, 0, 0, 0),
1781 /* Define the character classes that are needed below. */
1782 if (dfa->utf8_anychar_classes[0] == 0)
1784 charclass c = utf8_classes[0];
1785 if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1786 clrbit ('\n', &c);
1787 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1788 clrbit ('\0', &c);
1789 dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
1791 for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
1792 dfa->utf8_anychar_classes[i]
1793 = CSET + charclass_index (dfa, &utf8_classes[i]);
1796 /* Implement the "A|(B|DE|GH|(F|IJ|LM|KC)C)C" pattern mentioned above.
1797 The token buffer is in reverse Polish order, so we get
1798 "A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
1799 C CAT OR C CAT OR C CAT OR". */
1800 addtok (dfa, dfa->utf8_anychar_classes[A]);
1801 addtok (dfa, dfa->utf8_anychar_classes[B]);
1802 addtok (dfa, D_token);
1803 addtok (dfa, dfa->utf8_anychar_classes[E]);
1804 addtok (dfa, CAT);
1805 addtok (dfa, OR);
1806 addtok (dfa, G_token);
1807 addtok (dfa, dfa->utf8_anychar_classes[H]);
1808 addtok (dfa, CAT);
1809 addtok (dfa, OR);
1810 addtok (dfa, dfa->utf8_anychar_classes[F]);
1811 addtok (dfa, I_token);
1812 addtok (dfa, dfa->utf8_anychar_classes[J]);
1813 addtok (dfa, CAT);
1814 addtok (dfa, OR);
1815 addtok (dfa, L_token);
1816 addtok (dfa, dfa->utf8_anychar_classes[M]);
1817 addtok (dfa, CAT);
1818 addtok (dfa, OR);
1819 addtok (dfa, dfa->utf8_anychar_classes[K]);
1820 for (int i = 0; i < 3; i++)
1822 addtok (dfa, dfa->utf8_anychar_classes[C]);
1823 addtok (dfa, CAT);
1824 addtok (dfa, OR);
1828 /* The grammar understood by the parser is as follows.
1830 regexp:
1831 regexp OR branch
1832 branch
1834 branch:
1835 branch closure
1836 closure
1838 closure:
1839 closure QMARK
1840 closure STAR
1841 closure PLUS
1842 closure REPMN
1843 atom
1845 atom:
1846 <normal character>
1847 <multibyte character>
1848 ANYCHAR
1849 MBCSET
1850 CSET
1851 BACKREF
1852 BEGLINE
1853 ENDLINE
1854 BEGWORD
1855 ENDWORD
1856 LIMWORD
1857 NOTLIMWORD
1858 LPAREN regexp RPAREN
1859 <empty>
1861 The parser builds a parse tree in postfix form in an array of tokens. */
1863 static void
1864 atom (struct dfa *dfa)
1866 if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1867 || dfa->parse.tok >= CSET
1868 || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
1869 || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1870 || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
1871 || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
1872 || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
1874 if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1876 /* For UTF-8 expand the period to a series of CSETs that define a
1877 valid UTF-8 character. This avoids using the slow multibyte
1878 path. I'm pretty sure it would be both profitable and correct to
1879 do it for any encoding; however, the optimization must be done
1880 manually as it is done above in add_utf8_anychar. So, let's
1881 start with UTF-8: it is the most used, and the structure of the
1882 encoding makes the correctness more obvious. */
1883 add_utf8_anychar (dfa);
1885 else
1886 addtok (dfa, dfa->parse.tok);
1887 dfa->parse.tok = lex (dfa);
1889 else if (dfa->parse.tok == WCHAR)
1891 if (dfa->lex.wctok == WEOF)
1892 addtok (dfa, BACKREF);
1893 else
1895 addtok_wc (dfa, dfa->lex.wctok);
1897 if (dfa->syntax.case_fold)
1899 wchar_t folded[CASE_FOLDED_BUFSIZE];
1900 int n = case_folded_counterparts (dfa->lex.wctok, folded);
1901 for (int i = 0; i < n; i++)
1903 addtok_wc (dfa, folded[i]);
1904 addtok (dfa, OR);
1909 dfa->parse.tok = lex (dfa);
1911 else if (dfa->parse.tok == LPAREN)
1913 dfa->parse.tok = lex (dfa);
1914 regexp (dfa);
1915 if (dfa->parse.tok != RPAREN)
1916 dfaerror (_("unbalanced ("));
1917 dfa->parse.tok = lex (dfa);
1919 else
1920 addtok (dfa, EMPTY);
1923 /* Return the number of tokens in the given subexpression. */
1924 static idx_t _GL_ATTRIBUTE_PURE
1925 nsubtoks (struct dfa const *dfa, idx_t tindex)
1927 switch (dfa->tokens[tindex - 1])
1929 default:
1930 return 1;
1931 case QMARK:
1932 case STAR:
1933 case PLUS:
1934 return 1 + nsubtoks (dfa, tindex - 1);
1935 case CAT:
1936 case OR:
1938 idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
1939 return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1944 /* Copy the given subexpression to the top of the tree. */
1945 static void
1946 copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
1948 if (dfa->localeinfo.multibyte)
1949 for (idx_t i = 0; i < ntokens; i++)
1950 addtok_mb (dfa, dfa->tokens[tindex + i],
1951 dfa->multibyte_prop[tindex + i]);
1952 else
1953 for (idx_t i = 0; i < ntokens; i++)
1954 addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1957 static void
1958 closure (struct dfa *dfa)
1960 atom (dfa);
1961 while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1962 || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1963 if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
1965 idx_t ntokens = nsubtoks (dfa, dfa->tindex);
1966 idx_t tindex = dfa->tindex - ntokens;
1967 if (dfa->lex.maxrep < 0)
1968 addtok (dfa, PLUS);
1969 if (dfa->lex.minrep == 0)
1970 addtok (dfa, QMARK);
1971 int i;
1972 for (i = 1; i < dfa->lex.minrep; i++)
1974 copytoks (dfa, tindex, ntokens);
1975 addtok (dfa, CAT);
1977 for (; i < dfa->lex.maxrep; i++)
1979 copytoks (dfa, tindex, ntokens);
1980 addtok (dfa, QMARK);
1981 addtok (dfa, CAT);
1983 dfa->parse.tok = lex (dfa);
1985 else if (dfa->parse.tok == REPMN)
1987 dfa->tindex -= nsubtoks (dfa, dfa->tindex);
1988 dfa->parse.tok = lex (dfa);
1989 closure (dfa);
1991 else
1993 addtok (dfa, dfa->parse.tok);
1994 dfa->parse.tok = lex (dfa);
1998 static void
1999 branch (struct dfa* dfa)
2001 closure (dfa);
2002 while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
2003 && dfa->parse.tok >= 0)
2005 closure (dfa);
2006 addtok (dfa, CAT);
2010 static void
2011 regexp (struct dfa *dfa)
2013 branch (dfa);
2014 while (dfa->parse.tok == OR)
2016 dfa->parse.tok = lex (dfa);
2017 branch (dfa);
2018 addtok (dfa, OR);
2022 /* Parse a string S of length LEN into D. S can include NUL characters.
2023 This is the main entry point for the parser. */
2024 void
2025 dfaparse (char const *s, idx_t len, struct dfa *d)
2027 d->lex.ptr = s;
2028 d->lex.left = len;
2029 d->lex.lasttok = END;
2030 d->lex.laststart = true;
2032 if (!d->syntax.syntax_bits_set)
2033 dfaerror (_("no syntax specified"));
2035 if (!d->nregexps)
2036 addtok (d, BEG);
2038 d->parse.tok = lex (d);
2039 d->parse.depth = d->depth;
2041 regexp (d);
2043 if (d->parse.tok != END)
2044 dfaerror (_("unbalanced )"));
2046 addtok (d, END - d->nregexps);
2047 addtok (d, CAT);
2049 if (d->nregexps)
2050 addtok (d, OR);
2052 ++d->nregexps;
2055 /* Some primitives for operating on sets of positions. */
2057 /* Copy one set to another. */
2058 static void
2059 copy (position_set const *src, position_set *dst)
2061 if (dst->alloc < src->nelem)
2063 free (dst->elems);
2064 dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2065 sizeof *dst->elems);
2067 dst->nelem = src->nelem;
2068 if (src->nelem != 0)
2069 memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2072 static void
2073 alloc_position_set (position_set *s, idx_t size)
2075 s->elems = xnmalloc (size, sizeof *s->elems);
2076 s->alloc = size;
2077 s->nelem = 0;
2080 /* Insert position P in set S. S is maintained in sorted order on
2081 decreasing index. If there is already an entry in S with P.index
2082 then merge (logically-OR) P's constraints into the one in S.
2083 S->elems must point to an array large enough to hold the resulting set. */
2084 static void
2085 insert (position p, position_set *s)
2087 idx_t count = s->nelem;
2088 idx_t lo = 0, hi = count;
2089 while (lo < hi)
2091 idx_t mid = (lo + hi) >> 1;
2092 if (s->elems[mid].index < p.index)
2093 lo = mid + 1;
2094 else if (s->elems[mid].index == p.index)
2096 s->elems[mid].constraint |= p.constraint;
2097 return;
2099 else
2100 hi = mid;
2103 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2104 for (idx_t i = count; i > lo; i--)
2105 s->elems[i] = s->elems[i - 1];
2106 s->elems[lo] = p;
2107 ++s->nelem;
2110 static void
2111 append (position p, position_set *s)
2113 idx_t count = s->nelem;
2114 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2115 s->elems[s->nelem++] = p;
2118 /* Merge S1 and S2 (with the additional constraint C2) into M. The
2119 result is as if the positions of S1, and of S2 with the additional
2120 constraint C2, were inserted into an initially empty set. */
2121 static void
2122 merge_constrained (position_set const *s1, position_set const *s2,
2123 unsigned int c2, position_set *m)
2125 idx_t i = 0, j = 0;
2127 if (m->alloc - s1->nelem < s2->nelem)
2129 free (m->elems);
2130 m->alloc = s1->nelem;
2131 m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2133 m->nelem = 0;
2134 while (i < s1->nelem || j < s2->nelem)
2135 if (! (j < s2->nelem)
2136 || (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2138 unsigned int c = ((i < s1->nelem && j < s2->nelem
2139 && s1->elems[i].index == s2->elems[j].index)
2140 ? s2->elems[j++].constraint & c2
2141 : 0);
2142 m->elems[m->nelem].index = s1->elems[i].index;
2143 m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c;
2145 else
2147 if (s2->elems[j].constraint & c2)
2149 m->elems[m->nelem].index = s2->elems[j].index;
2150 m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2152 j++;
2156 /* Merge two sets of positions into a third. The result is exactly as if
2157 the positions of both sets were inserted into an initially empty set. */
2158 static void
2159 merge (position_set const *s1, position_set const *s2, position_set *m)
2161 merge_constrained (s1, s2, -1, m);
2164 /* Merge into DST all the elements of SRC, possibly destroying
2165 the contents of the temporary M. */
2166 static void
2167 merge2 (position_set *dst, position_set const *src, position_set *m)
2169 if (src->nelem < 4)
2171 for (idx_t i = 0; i < src->nelem; i++)
2172 insert (src->elems[i], dst);
2174 else
2176 merge (src, dst, m);
2177 copy (m, dst);
2181 /* Delete a position from a set. Return the nonzero constraint of the
2182 deleted position, or zero if there was no such position. */
2183 static unsigned int
2184 delete (idx_t del, position_set *s)
2186 idx_t count = s->nelem;
2187 idx_t lo = 0, hi = count;
2188 while (lo < hi)
2190 idx_t mid = (lo + hi) >> 1;
2191 if (s->elems[mid].index < del)
2192 lo = mid + 1;
2193 else if (s->elems[mid].index == del)
2195 unsigned int c = s->elems[mid].constraint;
2196 idx_t i;
2197 for (i = mid; i + 1 < count; i++)
2198 s->elems[i] = s->elems[i + 1];
2199 s->nelem = i;
2200 return c;
2202 else
2203 hi = mid;
2205 return 0;
2208 /* Replace a position with the followed set. */
2209 static void
2210 replace (position_set *dst, idx_t del, position_set *add,
2211 unsigned int constraint, position_set *tmp)
2213 unsigned int c = delete (del, dst) & constraint;
2215 if (c)
2217 copy (dst, tmp);
2218 merge_constrained (tmp, add, c, dst);
2222 /* Find the index of the state corresponding to the given position set with
2223 the given preceding context, or create a new state if there is no such
2224 state. Context tells whether we got here on a newline or letter. */
2225 static state_num
2226 state_index (struct dfa *d, position_set const *s, int context)
2228 size_t hash = 0;
2229 int constraint = 0;
2230 state_num i;
2232 for (i = 0; i < s->nelem; ++i)
2234 size_t ind = s->elems[i].index;
2235 hash ^= ind + s->elems[i].constraint;
2238 /* Try to find a state that exactly matches the proposed one. */
2239 for (i = 0; i < d->sindex; ++i)
2241 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2242 || context != d->states[i].context)
2243 continue;
2244 state_num j;
2245 for (j = 0; j < s->nelem; ++j)
2246 if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2247 || s->elems[j].index != d->states[i].elems.elems[j].index)
2248 break;
2249 if (j == s->nelem)
2250 return i;
2253 #ifdef DEBUG
2254 fprintf (stderr, "new state %td\n nextpos:", i);
2255 for (state_num j = 0; j < s->nelem; j++)
2257 fprintf (stderr, " %td:", s->elems[j].index);
2258 prtok (d->tokens[s->elems[j].index]);
2260 fprintf (stderr, "\n context:");
2261 if (context ^ CTX_ANY)
2263 if (context & CTX_NONE)
2264 fprintf (stderr, " CTX_NONE");
2265 if (context & CTX_LETTER)
2266 fprintf (stderr, " CTX_LETTER");
2267 if (context & CTX_NEWLINE)
2268 fprintf (stderr, " CTX_NEWLINE");
2270 else
2271 fprintf (stderr, " CTX_ANY");
2272 fprintf (stderr, "\n");
2273 #endif
2275 for (state_num j = 0; j < s->nelem; j++)
2277 int c = d->constraints[s->elems[j].index];
2279 if (c != 0)
2281 if (succeeds_in_context (c, context, CTX_ANY))
2282 constraint |= c;
2284 else if (d->tokens[s->elems[j].index] == BACKREF)
2285 constraint = NO_CONSTRAINT;
2289 /* Create a new state. */
2290 d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2291 sizeof *d->states);
2292 d->states[i].hash = hash;
2293 alloc_position_set (&d->states[i].elems, s->nelem);
2294 copy (s, &d->states[i].elems);
2295 d->states[i].context = context;
2296 d->states[i].constraint = constraint;
2297 d->states[i].mbps.nelem = 0;
2298 d->states[i].mbps.elems = NULL;
2299 d->states[i].mb_trindex = -1;
2301 ++d->sindex;
2303 return i;
2306 /* Find the epsilon closure of D's set of positions. If any position of the set
2307 contains a symbol that matches the empty string in some context, replace
2308 that position with the elements of its follow labeled with an appropriate
2309 constraint. Repeat exhaustively until no funny positions are left.
2310 S->elems must be large enough to hold the result. BACKWARD is D's
2311 backward set; use and update it too. */
2312 static void
2313 epsclosure (struct dfa const *d, position_set *backward)
2315 position_set tmp;
2316 alloc_position_set (&tmp, d->nleaves);
2317 for (idx_t i = 0; i < d->tindex; i++)
2318 if (0 < d->follows[i].nelem)
2320 unsigned int constraint;
2321 switch (d->tokens[i])
2323 default:
2324 continue;
2326 case BEGLINE:
2327 constraint = BEGLINE_CONSTRAINT;
2328 break;
2329 case ENDLINE:
2330 constraint = ENDLINE_CONSTRAINT;
2331 break;
2332 case BEGWORD:
2333 constraint = BEGWORD_CONSTRAINT;
2334 break;
2335 case ENDWORD:
2336 constraint = ENDWORD_CONSTRAINT;
2337 break;
2338 case LIMWORD:
2339 constraint = LIMWORD_CONSTRAINT;
2340 break;
2341 case NOTLIMWORD:
2342 constraint = NOTLIMWORD_CONSTRAINT;
2343 break;
2344 case EMPTY:
2345 constraint = NO_CONSTRAINT;
2346 break;
2349 delete (i, &d->follows[i]);
2351 for (idx_t j = 0; j < backward[i].nelem; j++)
2352 replace (&d->follows[backward[i].elems[j].index], i, &d->follows[i],
2353 constraint, &tmp);
2354 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2355 replace (&backward[d->follows[i].elems[j].index], i, &backward[i],
2356 NO_CONSTRAINT, &tmp);
2358 free (tmp.elems);
2361 /* Returns the set of contexts for which there is at least one
2362 character included in C. */
2364 static int
2365 charclass_context (struct dfa const *dfa, charclass const *c)
2367 int context = 0;
2369 for (int j = 0; j < CHARCLASS_WORDS; j++)
2371 if (c->w[j] & dfa->syntax.newline.w[j])
2372 context |= CTX_NEWLINE;
2373 if (c->w[j] & dfa->syntax.letters.w[j])
2374 context |= CTX_LETTER;
2375 if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
2376 context |= CTX_NONE;
2379 return context;
2382 /* Returns the contexts on which the position set S depends. Each context
2383 in the set of returned contexts (let's call it SC) may have a different
2384 follow set than other contexts in SC, and also different from the
2385 follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2386 in the complement set will have the same follow set. */
2388 static int _GL_ATTRIBUTE_PURE
2389 state_separate_contexts (struct dfa *d, position_set const *s)
2391 int separate_contexts = 0;
2393 for (idx_t j = 0; j < s->nelem; j++)
2394 separate_contexts |= d->separates[s->elems[j].index];
2396 return separate_contexts;
2399 enum
2401 /* Single token is repeated. It is distinguished from non-repeated. */
2402 OPT_REPEAT = (1 << 0),
2404 /* Multiple tokens are repeated. This flag is on at head of tokens. The
2405 node is not merged. */
2406 OPT_LPAREN = (1 << 1),
2408 /* Multiple branches are joined. The node is not merged. */
2409 OPT_RPAREN = (1 << 2),
2411 /* The node is walked. If the node is found in walking again, OPT_RPAREN
2412 flag is turned on. */
2413 OPT_WALKED = (1 << 3),
2415 /* The node is queued. The node is not queued again. */
2416 OPT_QUEUED = (1 << 4)
2419 static void
2420 merge_nfa_state (struct dfa *d, idx_t tindex, char *flags,
2421 position_set *merged)
2423 position_set *follows = d->follows;
2424 idx_t nelem = 0;
2426 for (idx_t i = 0; i < follows[tindex].nelem; i++)
2428 idx_t sindex = follows[tindex].elems[i].index;
2430 /* Skip the node as pruned in future. */
2431 unsigned int iconstraint = follows[tindex].elems[i].constraint;
2432 if (iconstraint == 0)
2433 continue;
2435 if (d->tokens[follows[tindex].elems[i].index] <= END)
2437 d->constraints[tindex] |= follows[tindex].elems[i].constraint;
2438 continue;
2441 if (sindex != tindex && !(flags[sindex] & (OPT_LPAREN | OPT_RPAREN)))
2443 idx_t j;
2445 for (j = 0; j < nelem; j++)
2447 idx_t dindex = follows[tindex].elems[j].index;
2449 if (dindex == tindex)
2450 continue;
2452 if (follows[tindex].elems[j].constraint != iconstraint)
2453 continue;
2455 if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
2456 continue;
2458 if (d->tokens[sindex] != d->tokens[dindex])
2459 continue;
2461 if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2462 continue;
2464 if (flags[sindex] & OPT_REPEAT)
2465 delete (sindex, &follows[sindex]);
2467 merge2 (&follows[dindex], &follows[sindex], merged);
2469 break;
2472 if (j < nelem)
2473 continue;
2476 follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2477 flags[sindex] |= OPT_QUEUED;
2480 follows[tindex].nelem = nelem;
2483 static int
2484 compare (const void *a, const void *b)
2486 position const *p = a, *q = b;
2487 return (p->index > q->index) - (p->index < q->index);
2490 static void
2491 reorder_tokens (struct dfa *d)
2493 idx_t nleaves = 0;
2494 ptrdiff_t *map = xnmalloc (d->tindex, sizeof *map);
2495 map[0] = nleaves++;
2496 for (idx_t i = 1; i < d->tindex; i++)
2497 map[i] = -1;
2499 token *tokens = xnmalloc (d->nleaves, sizeof *tokens);
2500 position_set *follows = xnmalloc (d->nleaves, sizeof *follows);
2501 int *constraints = xnmalloc (d->nleaves, sizeof *constraints);
2502 char *multibyte_prop = (d->localeinfo.multibyte
2503 ? xnmalloc (d->nleaves, sizeof *multibyte_prop)
2504 : NULL);
2506 for (idx_t i = 0; i < d->tindex; i++)
2508 if (map[i] < 0)
2510 free (d->follows[i].elems);
2511 d->follows[i].elems = NULL;
2512 d->follows[i].nelem = 0;
2513 continue;
2516 tokens[map[i]] = d->tokens[i];
2517 follows[map[i]] = d->follows[i];
2518 constraints[map[i]] = d->constraints[i];
2520 if (multibyte_prop != NULL)
2521 multibyte_prop[map[i]] = d->multibyte_prop[i];
2523 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2525 if (map[d->follows[i].elems[j].index] == -1)
2526 map[d->follows[i].elems[j].index] = nleaves++;
2528 d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2531 qsort (d->follows[i].elems, d->follows[i].nelem,
2532 sizeof *d->follows[i].elems, compare);
2535 for (idx_t i = 0; i < nleaves; i++)
2537 d->tokens[i] = tokens[i];
2538 d->follows[i] = follows[i];
2539 d->constraints[i] = constraints[i];
2541 if (multibyte_prop != NULL)
2542 d->multibyte_prop[i] = multibyte_prop[i];
2545 d->tindex = d->nleaves = nleaves;
2547 free (tokens);
2548 free (follows);
2549 free (constraints);
2550 free (multibyte_prop);
2551 free (map);
2554 static void
2555 dfaoptimize (struct dfa *d)
2557 char *flags = xzalloc (d->tindex);
2559 for (idx_t i = 0; i < d->tindex; i++)
2561 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2563 if (d->follows[i].elems[j].index == i)
2564 flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
2565 else if (d->follows[i].elems[j].index < i)
2566 flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
2567 else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2568 flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
2569 else
2570 flags[d->follows[i].elems[j].index] |= OPT_WALKED;
2574 flags[0] |= OPT_QUEUED;
2576 position_set merged0;
2577 position_set *merged = &merged0;
2578 alloc_position_set (merged, d->nleaves);
2580 d->constraints = xcalloc (d->tindex, sizeof *d->constraints);
2582 for (idx_t i = 0; i < d->tindex; i++)
2583 if (flags[i] & OPT_QUEUED)
2584 merge_nfa_state (d, i, flags, merged);
2586 reorder_tokens (d);
2588 free (merged->elems);
2589 free (flags);
2592 /* Perform bottom-up analysis on the parse tree, computing various functions.
2593 Note that at this point, we're pretending constructs like \< are real
2594 characters rather than constraints on what can follow them.
2596 Nullable: A node is nullable if it is at the root of a regexp that can
2597 match the empty string.
2598 * EMPTY leaves are nullable.
2599 * No other leaf is nullable.
2600 * A QMARK or STAR node is nullable.
2601 * A PLUS node is nullable if its argument is nullable.
2602 * A CAT node is nullable if both its arguments are nullable.
2603 * An OR node is nullable if either argument is nullable.
2605 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2606 that could correspond to the first character of a string matching the
2607 regexp rooted at the given node.
2608 * EMPTY leaves have empty firstpos.
2609 * The firstpos of a nonempty leaf is that leaf itself.
2610 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2611 argument.
2612 * The firstpos of a CAT node is the firstpos of the left argument, union
2613 the firstpos of the right if the left argument is nullable.
2614 * The firstpos of an OR node is the union of firstpos of each argument.
2616 Lastpos: The lastpos of a node is the set of positions that could
2617 correspond to the last character of a string matching the regexp at
2618 the given node.
2619 * EMPTY leaves have empty lastpos.
2620 * The lastpos of a nonempty leaf is that leaf itself.
2621 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2622 argument.
2623 * The lastpos of a CAT node is the lastpos of its right argument, union
2624 the lastpos of the left if the right argument is nullable.
2625 * The lastpos of an OR node is the union of the lastpos of each argument.
2627 Follow: The follow of a position is the set of positions that could
2628 correspond to the character following a character matching the node in
2629 a string matching the regexp. At this point we consider special symbols
2630 that match the empty string in some context to be just normal characters.
2631 Later, if we find that a special symbol is in a follow set, we will
2632 replace it with the elements of its follow, labeled with an appropriate
2633 constraint.
2634 * Every node in the firstpos of the argument of a STAR or PLUS node is in
2635 the follow of every node in the lastpos.
2636 * Every node in the firstpos of the second argument of a CAT node is in
2637 the follow of every node in the lastpos of the first argument.
2639 Because of the postfix representation of the parse tree, the depth-first
2640 analysis is conveniently done by a linear scan with the aid of a stack.
2641 Sets are stored as arrays of the elements, obeying a stack-like allocation
2642 scheme; the number of elements in each set deeper in the stack can be
2643 used to determine the address of a particular set's array. */
2644 static void
2645 dfaanalyze (struct dfa *d, bool searchflag)
2647 /* Array allocated to hold position sets. */
2648 position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2649 /* Firstpos and lastpos elements. */
2650 position *firstpos = posalloc;
2651 position *lastpos = firstpos + d->nleaves;
2652 position pos;
2653 position_set tmp;
2655 /* Stack for element counts and nullable flags. */
2656 struct
2658 /* Whether the entry is nullable. */
2659 bool nullable;
2661 /* Counts of firstpos and lastpos sets. */
2662 idx_t nfirstpos;
2663 idx_t nlastpos;
2664 } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2666 position_set merged; /* Result of merging sets. */
2668 addtok (d, CAT);
2669 idx_t tindex = d->tindex;
2671 #ifdef DEBUG
2672 fprintf (stderr, "dfaanalyze:\n");
2673 for (idx_t i = 0; i < tindex; i++)
2675 fprintf (stderr, " %td:", i);
2676 prtok (d->tokens[i]);
2678 putc ('\n', stderr);
2679 #endif
2681 d->searchflag = searchflag;
2682 alloc_position_set (&merged, d->nleaves);
2683 d->follows = xcalloc (tindex, sizeof *d->follows);
2684 position_set *backward
2685 = d->epsilon ? xcalloc (tindex, sizeof *backward) : NULL;
2687 for (idx_t i = 0; i < tindex; i++)
2689 switch (d->tokens[i])
2691 case EMPTY:
2692 /* The empty set is nullable. */
2693 stk->nullable = true;
2695 /* The firstpos and lastpos of the empty leaf are both empty. */
2696 stk->nfirstpos = stk->nlastpos = 0;
2697 stk++;
2698 break;
2700 case STAR:
2701 case PLUS:
2702 /* Every element in the lastpos of the argument is in the backward
2703 set of every element in the firstpos. */
2704 if (d->epsilon)
2706 tmp.elems = lastpos - stk[-1].nlastpos;
2707 tmp.nelem = stk[-1].nlastpos;
2708 for (position *p = firstpos - stk[-1].nfirstpos;
2709 p < firstpos; p++)
2710 merge2 (&backward[p->index], &tmp, &merged);
2713 /* Every element in the firstpos of the argument is in the follow
2714 of every element in the lastpos. */
2716 tmp.elems = firstpos - stk[-1].nfirstpos;
2717 tmp.nelem = stk[-1].nfirstpos;
2718 for (position *p = lastpos - stk[-1].nlastpos; p < lastpos; p++)
2719 merge2 (&d->follows[p->index], &tmp, &merged);
2721 FALLTHROUGH;
2722 case QMARK:
2723 /* A QMARK or STAR node is automatically nullable. */
2724 if (d->tokens[i] != PLUS)
2725 stk[-1].nullable = true;
2726 break;
2728 case CAT:
2729 /* Every element in the lastpos of the first argument is in
2730 the backward set of every element in the firstpos of the
2731 second argument. */
2732 if (backward)
2734 tmp.nelem = stk[-2].nlastpos;
2735 tmp.elems = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2736 for (position *p = firstpos - stk[-1].nfirstpos;
2737 p < firstpos; p++)
2738 merge2 (&backward[p->index], &tmp, &merged);
2741 /* Every element in the firstpos of the second argument is in the
2742 follow of every element in the lastpos of the first argument. */
2744 tmp.nelem = stk[-1].nfirstpos;
2745 tmp.elems = firstpos - stk[-1].nfirstpos;
2746 for (position *plim = lastpos - stk[-1].nlastpos,
2747 *p = plim - stk[-2].nlastpos;
2748 p < plim; p++)
2749 merge2 (&d->follows[p->index], &tmp, &merged);
2752 /* The firstpos of a CAT node is the firstpos of the first argument,
2753 union that of the second argument if the first is nullable. */
2754 if (stk[-2].nullable)
2755 stk[-2].nfirstpos += stk[-1].nfirstpos;
2756 else
2757 firstpos -= stk[-1].nfirstpos;
2759 /* The lastpos of a CAT node is the lastpos of the second argument,
2760 union that of the first argument if the second is nullable. */
2761 if (stk[-1].nullable)
2762 stk[-2].nlastpos += stk[-1].nlastpos;
2763 else
2765 position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2766 for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2767 p[j] = p[j + stk[-2].nlastpos];
2768 lastpos -= stk[-2].nlastpos;
2769 stk[-2].nlastpos = stk[-1].nlastpos;
2772 /* A CAT node is nullable if both arguments are nullable. */
2773 stk[-2].nullable &= stk[-1].nullable;
2774 stk--;
2775 break;
2777 case OR:
2778 /* The firstpos is the union of the firstpos of each argument. */
2779 stk[-2].nfirstpos += stk[-1].nfirstpos;
2781 /* The lastpos is the union of the lastpos of each argument. */
2782 stk[-2].nlastpos += stk[-1].nlastpos;
2784 /* An OR node is nullable if either argument is nullable. */
2785 stk[-2].nullable |= stk[-1].nullable;
2786 stk--;
2787 break;
2789 default:
2790 /* Anything else is a nonempty position. (Note that special
2791 constructs like \< are treated as nonempty strings here;
2792 an "epsilon closure" effectively makes them nullable later.
2793 Backreferences have to get a real position so we can detect
2794 transitions on them later. But they are nullable. */
2795 stk->nullable = d->tokens[i] == BACKREF;
2797 /* This position is in its own firstpos and lastpos. */
2798 stk->nfirstpos = stk->nlastpos = 1;
2799 stk++;
2801 firstpos->index = lastpos->index = i;
2802 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2803 firstpos++, lastpos++;
2805 break;
2807 #ifdef DEBUG
2808 /* ... balance the above nonsyntactic #ifdef goo... */
2809 fprintf (stderr, "node %td:", i);
2810 prtok (d->tokens[i]);
2811 putc ('\n', stderr);
2812 fprintf (stderr,
2813 stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2814 fprintf (stderr, " firstpos:");
2815 for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
2817 fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
2818 prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2820 fprintf (stderr, "\n lastpos:");
2821 for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2823 fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
2824 prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2826 putc ('\n', stderr);
2827 #endif
2830 if (backward)
2832 /* For each follow set that is the follow set of a real position,
2833 replace it with its epsilon closure. */
2834 epsclosure (d, backward);
2836 for (idx_t i = 0; i < tindex; i++)
2837 free (backward[i].elems);
2838 free (backward);
2841 dfaoptimize (d);
2843 #ifdef DEBUG
2844 for (idx_t i = 0; i < tindex; i++)
2845 if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR
2846 || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR
2847 || d->tokens[i] == MBCSET || d->tokens[i] >= CSET)
2849 fprintf (stderr, "follows(%td:", i);
2850 prtok (d->tokens[i]);
2851 fprintf (stderr, "):");
2852 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2854 fprintf (stderr, " %td:", d->follows[i].elems[j].index);
2855 prtok (d->tokens[d->follows[i].elems[j].index]);
2857 putc ('\n', stderr);
2859 #endif
2861 pos.index = 0;
2862 pos.constraint = NO_CONSTRAINT;
2864 alloc_position_set (&tmp, 1);
2866 append (pos, &tmp);
2868 d->separates = xcalloc (tindex, sizeof *d->separates);
2870 for (idx_t i = 0; i < tindex; i++)
2872 if (prev_newline_dependent (d->constraints[i]))
2873 d->separates[i] |= CTX_NEWLINE;
2874 if (prev_letter_dependent (d->constraints[i]))
2875 d->separates[i] |= CTX_LETTER;
2877 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2879 if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2880 d->separates[i] |= CTX_NEWLINE;
2881 if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2882 d->separates[i] |= CTX_LETTER;
2886 /* Context wanted by some position. */
2887 int separate_contexts = state_separate_contexts (d, &tmp);
2889 /* Build the initial state. */
2890 if (separate_contexts & CTX_NEWLINE)
2891 state_index (d, &tmp, CTX_NEWLINE);
2892 d->initstate_notbol = d->min_trcount
2893 = state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2894 if (separate_contexts & CTX_LETTER)
2895 d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2896 d->min_trcount++;
2897 d->trcount = 0;
2899 free (posalloc);
2900 free (stkalloc);
2901 free (merged.elems);
2902 free (tmp.elems);
2905 /* Make sure D's state arrays are large enough to hold NEW_STATE. */
2906 static void
2907 realloc_trans_if_necessary (struct dfa *d)
2909 state_num oldalloc = d->tralloc;
2910 if (oldalloc < d->sindex)
2912 state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2913 idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2914 realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2915 -1, sizeof *realtrans);
2916 realtrans[0] = realtrans[1] = NULL;
2917 d->trans = realtrans + 2;
2918 idx_t newalloc = d->tralloc = newalloc1 - 2;
2919 d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
2920 d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
2921 d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
2922 if (d->localeinfo.multibyte)
2924 realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2925 realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
2926 if (oldalloc == 0)
2927 realtrans[0] = realtrans[1] = NULL;
2928 d->mb_trans = realtrans + 2;
2930 for (; oldalloc < newalloc; oldalloc++)
2932 d->trans[oldalloc] = NULL;
2933 d->fails[oldalloc] = NULL;
2934 if (d->localeinfo.multibyte)
2935 d->mb_trans[oldalloc] = NULL;
2941 Calculate the transition table for a new state derived from state s
2942 for a compiled dfa d after input character uc, and return the new
2943 state number.
2945 Do not worry about all possible input characters; calculate just the group
2946 of positions that match uc. Label it with the set of characters that
2947 every position in the group matches (taking into account, if necessary,
2948 preceding context information of s). Then find the union
2949 of these positions' follows, i.e., the set of positions of the
2950 new state. For each character in the group's label, set the transition
2951 on this character to be to a state corresponding to the set's positions,
2952 and its associated backward context information, if necessary.
2954 When building a searching matcher, include the positions of state
2955 0 in every state.
2957 The group is constructed by building an equivalence-class
2958 partition of the positions of s.
2960 For each position, find the set of characters C that it matches. Eliminate
2961 any characters from C that fail on grounds of backward context.
2963 Check whether the group's label L has nonempty
2964 intersection with C. If L - C is nonempty, create a new group labeled
2965 L - C and having the same positions as the current group, and set L to
2966 the intersection of L and C. Insert the position in the group, set
2967 C = C - L, and resume scanning.
2969 If after comparing with every group there are characters remaining in C,
2970 create a new group labeled with the characters of C and insert this
2971 position in that group. */
2973 static state_num
2974 build_state (state_num s, struct dfa *d, unsigned char uc)
2976 position_set follows; /* Union of the follows for each
2977 position of the current state. */
2978 position_set group; /* Positions that match the input char. */
2979 position_set tmp; /* Temporary space for merging sets. */
2980 state_num state; /* New state. */
2981 state_num state_newline; /* New state on a newline transition. */
2982 state_num state_letter; /* New state on a letter transition. */
2984 #ifdef DEBUG
2985 fprintf (stderr, "build state %td\n", s);
2986 #endif
2988 /* A pointer to the new transition table, and the table itself. */
2989 state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
2990 state_num *trans = *ptrans;
2992 if (!trans)
2994 /* MAX_TRCOUNT is an arbitrary upper limit on the number of
2995 transition tables that can exist at once, other than for
2996 initial states. Often-used transition tables are quickly
2997 rebuilt, whereas rarely-used ones are cleared away. */
2998 if (MAX_TRCOUNT <= d->trcount)
3000 for (state_num i = d->min_trcount; i < d->tralloc; i++)
3002 free (d->trans[i]);
3003 free (d->fails[i]);
3004 d->trans[i] = d->fails[i] = NULL;
3006 d->trcount = 0;
3009 d->trcount++;
3010 *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans);
3012 /* Fill transition table with a default value which means that the
3013 transited state has not been calculated yet. */
3014 for (int i = 0; i < NOTCHAR; i++)
3015 trans[i] = -2;
3018 /* Set up the success bits for this state. */
3019 d->success[s] = 0;
3020 if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
3021 d->success[s] |= CTX_NEWLINE;
3022 if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
3023 d->success[s] |= CTX_LETTER;
3024 if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
3025 d->success[s] |= CTX_NONE;
3027 alloc_position_set (&follows, d->nleaves);
3029 /* Find the union of the follows of the positions of the group.
3030 This is a hideously inefficient loop. Fix it someday. */
3031 for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
3032 for (idx_t k = 0;
3033 k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
3034 insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
3035 &follows);
3037 /* Positions that match the input char. */
3038 alloc_position_set (&group, d->nleaves);
3040 /* The group's label. */
3041 charclass label;
3042 fillset (&label);
3044 for (idx_t i = 0; i < follows.nelem; i++)
3046 charclass matches; /* Set of matching characters. */
3047 position pos = follows.elems[i];
3048 bool matched = false;
3049 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
3051 zeroset (&matches);
3052 setbit (d->tokens[pos.index], &matches);
3053 if (d->tokens[pos.index] == uc)
3054 matched = true;
3056 else if (d->tokens[pos.index] >= CSET)
3058 matches = d->charclasses[d->tokens[pos.index] - CSET];
3059 if (tstbit (uc, &matches))
3060 matched = true;
3062 else if (d->tokens[pos.index] == ANYCHAR)
3064 matches = d->charclasses[d->canychar];
3065 if (tstbit (uc, &matches))
3066 matched = true;
3068 /* ANYCHAR must match with a single character, so we must put
3069 it to D->states[s].mbps which contains the positions which
3070 can match with a single character not a byte. If all
3071 positions which has ANYCHAR does not depend on context of
3072 next character, we put the follows instead of it to
3073 D->states[s].mbps to optimize. */
3074 if (succeeds_in_context (pos.constraint, d->states[s].context,
3075 CTX_NONE))
3077 if (d->states[s].mbps.nelem == 0)
3078 alloc_position_set (&d->states[s].mbps, 1);
3079 insert (pos, &d->states[s].mbps);
3082 else
3083 continue;
3085 /* Some characters may need to be eliminated from matches because
3086 they fail in the current context. */
3087 if (pos.constraint != NO_CONSTRAINT)
3089 if (!succeeds_in_context (pos.constraint,
3090 d->states[s].context, CTX_NEWLINE))
3091 for (int j = 0; j < CHARCLASS_WORDS; j++)
3092 matches.w[j] &= ~d->syntax.newline.w[j];
3093 if (!succeeds_in_context (pos.constraint,
3094 d->states[s].context, CTX_LETTER))
3095 for (int j = 0; j < CHARCLASS_WORDS; ++j)
3096 matches.w[j] &= ~d->syntax.letters.w[j];
3097 if (!succeeds_in_context (pos.constraint,
3098 d->states[s].context, CTX_NONE))
3099 for (int j = 0; j < CHARCLASS_WORDS; ++j)
3100 matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
3102 /* If there are no characters left, there's no point in going on. */
3103 if (emptyset (&matches))
3104 continue;
3106 /* If we have reset the bit that made us declare "matched", reset
3107 that indicator, too. This is required to avoid an infinite loop
3108 with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]' */
3109 if (!tstbit (uc, &matches))
3110 matched = false;
3113 #ifdef DEBUG
3114 fprintf (stderr, " nextpos %td:", pos.index);
3115 prtok (d->tokens[pos.index]);
3116 fprintf (stderr, " of");
3117 for (unsigned j = 0; j < NOTCHAR; j++)
3118 if (tstbit (j, &matches))
3119 fprintf (stderr, " 0x%02x", j);
3120 fprintf (stderr, "\n");
3121 #endif
3123 if (matched)
3125 for (int k = 0; k < CHARCLASS_WORDS; ++k)
3126 label.w[k] &= matches.w[k];
3127 append (pos, &group);
3129 else
3131 for (int k = 0; k < CHARCLASS_WORDS; ++k)
3132 label.w[k] &= ~matches.w[k];
3136 alloc_position_set (&tmp, d->nleaves);
3138 if (group.nelem > 0)
3140 /* If we are building a searching matcher, throw in the positions
3141 of state 0 as well, if possible. */
3142 if (d->searchflag)
3144 /* If a token in follows.elems is not 1st byte of a multibyte
3145 character, or the states of follows must accept the bytes
3146 which are not 1st byte of the multibyte character.
3147 Then, if a state of follows encounters a byte, it must not be
3148 a 1st byte of a multibyte character nor a single byte character.
3149 In this case, do not add state[0].follows to next state, because
3150 state[0] must accept 1st-byte.
3152 For example, suppose <sb a> is a certain single byte character,
3153 <mb A> is a certain multibyte character, and the codepoint of
3154 <sb a> equals the 2nd byte of the codepoint of <mb A>. When
3155 state[0] accepts <sb a>, state[i] transits to state[i+1] by
3156 accepting the 1st byte of <mb A>, and state[i+1] accepts the
3157 2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3158 <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3159 not add state[0]. */
3161 bool mergeit = !d->localeinfo.multibyte;
3162 if (!mergeit)
3164 mergeit = true;
3165 for (idx_t j = 0; mergeit && j < group.nelem; j++)
3166 mergeit &= d->multibyte_prop[group.elems[j].index];
3168 if (mergeit)
3169 merge2 (&group, &d->states[0].elems, &tmp);
3172 /* Find out if the new state will want any context information,
3173 by calculating possible contexts that the group can match,
3174 and separate contexts that the new state wants to know. */
3175 int possible_contexts = charclass_context (d, &label);
3176 int separate_contexts = state_separate_contexts (d, &group);
3178 /* Find the state(s) corresponding to the union of the follows. */
3179 if (possible_contexts & ~separate_contexts)
3180 state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3181 else
3182 state = -1;
3183 if (separate_contexts & possible_contexts & CTX_NEWLINE)
3184 state_newline = state_index (d, &group, CTX_NEWLINE);
3185 else
3186 state_newline = state;
3187 if (separate_contexts & possible_contexts & CTX_LETTER)
3188 state_letter = state_index (d, &group, CTX_LETTER);
3189 else
3190 state_letter = state;
3192 /* Reallocate now, to reallocate any newline transition properly. */
3193 realloc_trans_if_necessary (d);
3196 /* If we are a searching matcher, the default transition is to a state
3197 containing the positions of state 0, otherwise the default transition
3198 is to fail miserably. */
3199 else if (d->searchflag)
3201 state_newline = 0;
3202 state_letter = d->min_trcount - 1;
3203 state = d->initstate_notbol;
3205 else
3207 state_newline = -1;
3208 state_letter = -1;
3209 state = -1;
3212 /* Set the transitions for each character in the label. */
3213 for (int i = 0; i < NOTCHAR; i++)
3214 if (tstbit (i, &label))
3215 switch (d->syntax.sbit[i])
3217 case CTX_NEWLINE:
3218 trans[i] = state_newline;
3219 break;
3220 case CTX_LETTER:
3221 trans[i] = state_letter;
3222 break;
3223 default:
3224 trans[i] = state;
3225 break;
3228 #ifdef DEBUG
3229 fprintf (stderr, "trans table %td", s);
3230 for (int i = 0; i < NOTCHAR; ++i)
3232 if (!(i & 0xf))
3233 fprintf (stderr, "\n");
3234 fprintf (stderr, " %2td", trans[i]);
3236 fprintf (stderr, "\n");
3237 #endif
3239 free (group.elems);
3240 free (follows.elems);
3241 free (tmp.elems);
3243 /* Keep the newline transition in a special place so we can use it as
3244 a sentinel. */
3245 if (tstbit (d->syntax.eolbyte, &label))
3247 d->newlines[s] = trans[d->syntax.eolbyte];
3248 trans[d->syntax.eolbyte] = -1;
3251 return trans[uc];
3254 /* Multibyte character handling sub-routines for dfaexec. */
3256 /* Consume a single byte and transit state from 's' to '*next_state'.
3257 This function is almost same as the state transition routin in dfaexec.
3258 But state transition is done just once, otherwise matching succeed or
3259 reach the end of the buffer. */
3260 static state_num
3261 transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
3263 state_num *t;
3265 if (d->trans[s])
3266 t = d->trans[s];
3267 else if (d->fails[s])
3268 t = d->fails[s];
3269 else
3271 build_state (s, d, **pp);
3272 if (d->trans[s])
3273 t = d->trans[s];
3274 else
3276 t = d->fails[s];
3277 assert (t);
3281 if (t[**pp] == -2)
3282 build_state (s, d, **pp);
3284 return t[*(*pp)++];
3287 /* Transit state from s, then return new state and update the pointer of
3288 the buffer. This function is for a period operator which can match a
3289 multi-byte character. */
3290 static state_num
3291 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
3292 unsigned char const *end)
3294 wint_t wc;
3296 int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
3298 /* This state has some operators which can match a multibyte character. */
3299 d->mb_follows.nelem = 0;
3301 /* Calculate the state which can be reached from the state 's' by
3302 consuming 'mbclen' single bytes from the buffer. */
3303 state_num s1 = s;
3304 int mbci;
3305 for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++)
3306 s = transit_state_singlebyte (d, s, pp);
3307 *pp += mbclen - mbci;
3309 if (wc == WEOF)
3311 /* It is an invalid character, so ANYCHAR is not accepted. */
3312 return s;
3315 /* If all positions which have ANYCHAR do not depend on the context
3316 of the next character, calculate the next state with
3317 pre-calculated follows and cache the result. */
3318 if (d->states[s1].mb_trindex < 0)
3320 if (MAX_TRCOUNT <= d->mb_trcount)
3322 state_num s3;
3323 for (s3 = -1; s3 < d->tralloc; s3++)
3325 free (d->mb_trans[s3]);
3326 d->mb_trans[s3] = NULL;
3329 for (state_num i = 0; i < d->sindex; i++)
3330 d->states[i].mb_trindex = -1;
3331 d->mb_trcount = 0;
3333 d->states[s1].mb_trindex = d->mb_trcount++;
3336 if (! d->mb_trans[s])
3338 enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3339 enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3340 d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3341 for (int i = 0; i < MAX_TRCOUNT; i++)
3342 d->mb_trans[s][i] = -1;
3344 else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3345 return d->mb_trans[s][d->states[s1].mb_trindex];
3347 if (s == -1)
3348 copy (&d->states[s1].mbps, &d->mb_follows);
3349 else
3350 merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3352 int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3353 state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3354 realloc_trans_if_necessary (d);
3356 d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3358 return s2;
3361 /* The initial state may encounter a byte which is not a single byte character
3362 nor the first byte of a multibyte character. But it is incorrect for the
3363 initial state to accept such a byte. For example, in Shift JIS the regular
3364 expression "\\" accepts the codepoint 0x5c, but should not accept the second
3365 byte of the codepoint 0x815c. Then the initial state must skip the bytes
3366 that are not a single byte character nor the first byte of a multibyte
3367 character.
3369 Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3370 or exceeds P, and return the advanced MBP. If WCP is non-NULL and
3371 the result is greater than P, set *WCP to the final wide character
3372 processed, or to WEOF if no wide character is processed. Otherwise,
3373 if WCP is non-NULL, *WCP may or may not be updated.
3375 Both P and MBP must be no larger than END. */
3376 static unsigned char const *
3377 skip_remains_mb (struct dfa *d, unsigned char const *p,
3378 unsigned char const *mbp, char const *end)
3380 if (d->syntax.never_trail[*p])
3381 return p;
3382 while (mbp < p)
3384 wint_t wc;
3385 mbp += mbs_to_wchar (&wc, (char const *) mbp,
3386 end - (char const *) mbp, d);
3388 return mbp;
3391 /* Search through a buffer looking for a match to the struct dfa *D.
3392 Find the first occurrence of a string matching the regexp in the
3393 buffer, and the shortest possible version thereof. Return a pointer to
3394 the first character after the match, or NULL if none is found. BEGIN
3395 points to the beginning of the buffer, and END points to the first byte
3396 after its end. Note however that we store a sentinel byte (usually
3397 newline) in *END, so the actual buffer must be one byte longer.
3398 When ALLOW_NL, newlines may appear in the matching string.
3399 If COUNT is non-NULL, increment *COUNT once for each newline processed.
3400 If MULTIBYTE, the input consists of multibyte characters and/or
3401 encoding-error bytes. Otherwise, it consists of single-byte characters.
3402 Here is the list of features that make this DFA matcher punt:
3403 - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3404 - [^...] in non-simple locale
3405 - [[=foo=]] or [[.foo.]]
3406 - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3407 - back-reference: (.)\1
3408 - word-delimiter in multibyte locale: \<, \>, \b, \B
3409 See struct localeinfo.simple for the definition of "simple locale". */
3411 static inline char *
3412 dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
3413 ptrdiff_t *count, bool multibyte)
3415 if (MAX_TRCOUNT <= d->sindex)
3417 for (state_num s = d->min_trcount; s < d->sindex; s++)
3419 free (d->states[s].elems.elems);
3420 free (d->states[s].mbps.elems);
3422 d->sindex = d->min_trcount;
3424 if (d->trans)
3426 for (state_num s = 0; s < d->tralloc; s++)
3428 free (d->trans[s]);
3429 free (d->fails[s]);
3430 d->trans[s] = d->fails[s] = NULL;
3432 d->trcount = 0;
3435 if (d->localeinfo.multibyte && d->mb_trans)
3437 for (state_num s = -1; s < d->tralloc; s++)
3439 free (d->mb_trans[s]);
3440 d->mb_trans[s] = NULL;
3442 for (state_num s = 0; s < d->min_trcount; s++)
3443 d->states[s].mb_trindex = -1;
3444 d->mb_trcount = 0;
3448 if (!d->tralloc)
3449 realloc_trans_if_necessary (d);
3451 /* Current state. */
3452 state_num s = 0, s1 = 0;
3454 /* Current input character. */
3455 unsigned char const *p = (unsigned char const *) begin;
3456 unsigned char const *mbp = p;
3458 /* Copy of d->trans so it can be optimized into a register. */
3459 state_num **trans = d->trans;
3460 unsigned char eol = d->syntax.eolbyte; /* Likewise for eolbyte. */
3461 unsigned char saved_end = *(unsigned char *) end;
3462 *end = eol;
3464 if (multibyte)
3466 memset (&d->mbs, 0, sizeof d->mbs);
3467 if (d->mb_follows.alloc == 0)
3468 alloc_position_set (&d->mb_follows, d->nleaves);
3471 idx_t nlcount = 0;
3472 for (;;)
3474 state_num *t;
3475 while ((t = trans[s]) != NULL)
3477 if (s < d->min_trcount)
3479 if (!multibyte || d->states[s].mbps.nelem == 0)
3481 while (t[*p] == s)
3482 p++;
3484 if (multibyte)
3485 p = mbp = skip_remains_mb (d, p, mbp, end);
3488 if (multibyte)
3490 s1 = s;
3492 if (d->states[s].mbps.nelem == 0
3493 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3495 /* If an input character does not match ANYCHAR, do it
3496 like a single-byte character. */
3497 s = t[*p++];
3499 else
3501 s = transit_state (d, s, &p, (unsigned char *) end);
3502 mbp = p;
3503 trans = d->trans;
3506 else
3508 s1 = t[*p++];
3509 t = trans[s1];
3510 if (! t)
3512 state_num tmp = s;
3513 s = s1;
3514 s1 = tmp; /* swap */
3515 break;
3517 if (s < d->min_trcount)
3519 while (t[*p] == s1)
3520 p++;
3522 s = t[*p++];
3526 if (s < 0)
3528 if (s == -2)
3530 s = build_state (s1, d, p[-1]);
3531 trans = d->trans;
3533 else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3535 /* The previous character was a newline. Count it, and skip
3536 checking of multibyte character boundary until here. */
3537 nlcount++;
3538 mbp = p;
3540 s = (allow_nl ? d->newlines[s1]
3541 : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3542 : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3543 : d->initstate_notbol);
3545 else
3547 p = NULL;
3548 goto done;
3551 else if (d->fails[s])
3553 if ((d->success[s] & d->syntax.sbit[*p])
3554 || ((char *) p == end
3555 && accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3556 d)))
3557 goto done;
3559 if (multibyte && s < d->min_trcount)
3560 p = mbp = skip_remains_mb (d, p, mbp, end);
3562 s1 = s;
3563 if (!multibyte || d->states[s].mbps.nelem == 0
3564 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3566 /* If a input character does not match ANYCHAR, do it
3567 like a single-byte character. */
3568 s = d->fails[s][*p++];
3570 else
3572 s = transit_state (d, s, &p, (unsigned char *) end);
3573 mbp = p;
3574 trans = d->trans;
3577 else
3579 build_state (s, d, p[0]);
3580 trans = d->trans;
3584 done:
3585 if (count)
3586 *count += nlcount;
3587 *end = saved_end;
3588 return (char *) p;
3591 /* Specialized versions of dfaexec for multibyte and single-byte cases.
3592 This is for performance, as dfaexec_main is an inline function. */
3594 static char *
3595 dfaexec_mb (struct dfa *d, char const *begin, char *end,
3596 bool allow_nl, ptrdiff_t *count, bool *backref)
3598 return dfaexec_main (d, begin, end, allow_nl, count, true);
3601 static char *
3602 dfaexec_sb (struct dfa *d, char const *begin, char *end,
3603 bool allow_nl, ptrdiff_t *count, bool *backref)
3605 return dfaexec_main (d, begin, end, allow_nl, count, false);
3608 /* Always set *BACKREF and return BEGIN. Use this wrapper for
3609 any regexp that uses a construct not supported by this code. */
3610 static char *
3611 dfaexec_noop (struct dfa *d, char const *begin, char *end,
3612 bool allow_nl, ptrdiff_t *count, bool *backref)
3614 *backref = true;
3615 return (char *) begin;
3618 /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3619 but faster and set *BACKREF if the DFA code does not support this
3620 regexp usage. */
3622 char *
3623 dfaexec (struct dfa *d, char const *begin, char *end,
3624 bool allow_nl, ptrdiff_t *count, bool *backref)
3626 return d->dfaexec (d, begin, end, allow_nl, count, backref);
3629 struct dfa *
3630 dfasuperset (struct dfa const *d)
3632 return d->superset;
3635 bool
3636 dfaisfast (struct dfa const *d)
3638 return d->fast;
3641 static void
3642 free_mbdata (struct dfa *d)
3644 free (d->multibyte_prop);
3645 free (d->lex.brack.chars);
3646 free (d->mb_follows.elems);
3648 if (d->mb_trans)
3650 state_num s;
3651 for (s = -1; s < d->tralloc; s++)
3652 free (d->mb_trans[s]);
3653 free (d->mb_trans - 2);
3657 /* Return true if every construct in D is supported by this DFA matcher. */
3658 bool
3659 dfasupported (struct dfa const *d)
3661 for (idx_t i = 0; i < d->tindex; i++)
3663 switch (d->tokens[i])
3665 case BEGWORD:
3666 case ENDWORD:
3667 case LIMWORD:
3668 case NOTLIMWORD:
3669 if (!d->localeinfo.multibyte)
3670 continue;
3671 FALLTHROUGH;
3672 case BACKREF:
3673 case MBCSET:
3674 return false;
3677 return true;
3680 /* Disable use of the superset DFA if it is not likely to help
3681 performance. */
3682 static void
3683 maybe_disable_superset_dfa (struct dfa *d)
3685 if (!d->localeinfo.using_utf8)
3686 return;
3688 bool have_backref = false;
3689 for (idx_t i = 0; i < d->tindex; i++)
3691 switch (d->tokens[i])
3693 case ANYCHAR:
3694 /* Lowered. */
3695 abort ();
3696 case BACKREF:
3697 have_backref = true;
3698 break;
3699 case MBCSET:
3700 /* Requires multi-byte algorithm. */
3701 return;
3702 default:
3703 break;
3707 if (!have_backref && d->superset)
3709 /* The superset DFA is not likely to be much faster, so remove it. */
3710 dfafree (d->superset);
3711 free (d->superset);
3712 d->superset = NULL;
3715 free_mbdata (d);
3716 d->localeinfo.multibyte = false;
3717 d->dfaexec = dfaexec_sb;
3718 d->fast = true;
3721 static void
3722 dfassbuild (struct dfa *d)
3724 struct dfa *sup = dfaalloc ();
3726 *sup = *d;
3727 sup->localeinfo.multibyte = false;
3728 sup->dfaexec = dfaexec_sb;
3729 sup->multibyte_prop = NULL;
3730 sup->superset = NULL;
3731 sup->states = NULL;
3732 sup->sindex = 0;
3733 sup->constraints = NULL;
3734 sup->separates = NULL;
3735 sup->follows = NULL;
3736 sup->tralloc = 0;
3737 sup->trans = NULL;
3738 sup->fails = NULL;
3739 sup->success = NULL;
3740 sup->newlines = NULL;
3742 sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3743 if (d->cindex)
3745 memcpy (sup->charclasses, d->charclasses,
3746 d->cindex * sizeof *sup->charclasses);
3749 sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3750 sup->talloc = d->tindex * 2;
3752 bool have_achar = false;
3753 bool have_nchar = false;
3754 idx_t j;
3755 for (idx_t i = j = 0; i < d->tindex; i++)
3757 switch (d->tokens[i])
3759 case ANYCHAR:
3760 case MBCSET:
3761 case BACKREF:
3763 charclass ccl;
3764 fillset (&ccl);
3765 sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3766 sup->tokens[j++] = STAR;
3767 if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3768 || d->tokens[i + 1] == PLUS)
3769 i++;
3770 have_achar = true;
3772 break;
3773 case BEGWORD:
3774 case ENDWORD:
3775 case LIMWORD:
3776 case NOTLIMWORD:
3777 if (d->localeinfo.multibyte)
3779 /* These constraints aren't supported in a multibyte locale.
3780 Ignore them in the superset DFA. */
3781 sup->tokens[j++] = EMPTY;
3782 break;
3784 FALLTHROUGH;
3785 default:
3786 sup->tokens[j++] = d->tokens[i];
3787 if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3788 || d->tokens[i] >= CSET)
3789 have_nchar = true;
3790 break;
3793 sup->tindex = j;
3795 if (have_nchar && (have_achar || d->localeinfo.multibyte))
3796 d->superset = sup;
3797 else
3799 dfafree (sup);
3800 free (sup);
3804 /* Parse a string S of length LEN into D (but skip this step if S is null).
3805 Then analyze D and build a matcher for it.
3806 SEARCHFLAG says whether to build a searching or an exact matcher. */
3807 void
3808 dfacomp (char const *s, idx_t len, struct dfa *d, bool searchflag)
3810 if (s != NULL)
3811 dfaparse (s, len, d);
3813 dfassbuild (d);
3815 if (dfasupported (d))
3817 maybe_disable_superset_dfa (d);
3818 dfaanalyze (d, searchflag);
3820 else
3822 d->dfaexec = dfaexec_noop;
3825 if (d->superset)
3827 d->fast = true;
3828 dfaanalyze (d->superset, searchflag);
3832 /* Free the storage held by the components of a dfa. */
3833 void
3834 dfafree (struct dfa *d)
3836 free (d->charclasses);
3837 free (d->tokens);
3839 if (d->localeinfo.multibyte)
3840 free_mbdata (d);
3842 free (d->constraints);
3843 free (d->separates);
3845 for (idx_t i = 0; i < d->sindex; i++)
3847 free (d->states[i].elems.elems);
3848 free (d->states[i].mbps.elems);
3850 free (d->states);
3852 if (d->follows)
3854 for (idx_t i = 0; i < d->tindex; i++)
3855 free (d->follows[i].elems);
3856 free (d->follows);
3859 if (d->trans)
3861 for (idx_t i = 0; i < d->tralloc; i++)
3863 free (d->trans[i]);
3864 free (d->fails[i]);
3867 free (d->trans - 2);
3868 free (d->fails);
3869 free (d->newlines);
3870 free (d->success);
3873 if (d->superset)
3875 dfafree (d->superset);
3876 free (d->superset);
3880 /* Having found the postfix representation of the regular expression,
3881 try to find a long sequence of characters that must appear in any line
3882 containing the r.e.
3883 Finding a "longest" sequence is beyond the scope here;
3884 we take an easy way out and hope for the best.
3885 (Take "(ab|a)b"--please.)
3887 We do a bottom-up calculation of sequences of characters that must appear
3888 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3889 representation:
3890 sequences that must appear at the left of the match ("left")
3891 sequences that must appear at the right of the match ("right")
3892 lists of sequences that must appear somewhere in the match ("in")
3893 sequences that must constitute the match ("is")
3895 When we get to the root of the tree, we use one of the longest of its
3896 calculated "in" sequences as our answer.
3898 The sequences calculated for the various types of node (in pseudo ANSI c)
3899 are shown below. "p" is the operand of unary operators (and the left-hand
3900 operand of binary operators); "q" is the right-hand operand of binary
3901 operators.
3903 "ZERO" means "a zero-length sequence" below.
3905 Type left right is in
3906 ---- ---- ----- -- --
3907 char c # c # c # c # c
3909 ANYCHAR ZERO ZERO ZERO ZERO
3911 MBCSET ZERO ZERO ZERO ZERO
3913 CSET ZERO ZERO ZERO ZERO
3915 STAR ZERO ZERO ZERO ZERO
3917 QMARK ZERO ZERO ZERO ZERO
3919 PLUS p->left p->right ZERO p->in
3921 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3922 p->left : q->right : q->is!=ZERO) ? q->in plus
3923 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3924 ZERO
3926 OR longest common longest common (do p->is and substrings common
3927 leading trailing to q->is have same p->in and
3928 (sub)sequence (sub)sequence q->in length and content) ?
3929 of p->left of p->right
3930 and q->left and q->right p->is : NULL
3932 If there's anything else we recognize in the tree, all four sequences get set
3933 to zero-length sequences. If there's something we don't recognize in the
3934 tree, we just return a zero-length sequence.
3936 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3937 'aaa')?
3939 And ... is it here or someplace that we might ponder "optimizations" such as
3940 egrep 'psi|epsilon' -> egrep 'psi'
3941 egrep 'pepsi|epsilon' -> egrep 'epsi'
3942 (Yes, we now find "epsi" as a "string
3943 that must occur", but we might also
3944 simplify the *entire* r.e. being sought)
3945 grep '[c]' -> grep 'c'
3946 grep '(ab|a)b' -> grep 'ab'
3947 grep 'ab*' -> grep 'a'
3948 grep 'a*b' -> grep 'b'
3950 There are several issues:
3952 Is optimization easy (enough)?
3954 Does optimization actually accomplish anything,
3955 or is the automaton you get from "psi|epsilon" (for example)
3956 the same as the one you get from "psi" (for example)?
3958 Are optimizable r.e.'s likely to be used in real-life situations
3959 (something like 'ab*' is probably unlikely; something like is
3960 'psi|epsilon' is likelier)? */
3962 static char *
3963 icatalloc (char *old, char const *new)
3965 idx_t newsize = strlen (new);
3966 if (newsize == 0)
3967 return old;
3968 idx_t oldsize = strlen (old);
3969 char *result = xrealloc (old, oldsize + newsize + 1);
3970 memcpy (result + oldsize, new, newsize + 1);
3971 return result;
3974 static void
3975 freelist (char **cpp)
3977 while (*cpp)
3978 free (*cpp++);
3981 static char **
3982 enlist (char **cpp, char *new, idx_t len)
3984 new = memcpy (xmalloc (len + 1), new, len);
3985 new[len] = '\0';
3986 /* Is there already something in the list that's new (or longer)? */
3987 idx_t i;
3988 for (i = 0; cpp[i] != NULL; i++)
3989 if (strstr (cpp[i], new) != NULL)
3991 free (new);
3992 return cpp;
3994 /* Eliminate any obsoleted strings. */
3995 for (idx_t j = 0; cpp[j] != NULL; )
3996 if (strstr (new, cpp[j]) == NULL)
3997 ++j;
3998 else
4000 free (cpp[j]);
4001 if (--i == j)
4002 break;
4003 cpp[j] = cpp[i];
4004 cpp[i] = NULL;
4006 /* Add the new string. */
4007 cpp = xnrealloc (cpp, i + 2, sizeof *cpp);
4008 cpp[i] = new;
4009 cpp[i + 1] = NULL;
4010 return cpp;
4013 /* Given pointers to two strings, return a pointer to an allocated
4014 list of their distinct common substrings. */
4015 static char **
4016 comsubs (char *left, char const *right)
4018 char **cpp = xzalloc (sizeof *cpp);
4020 for (char *lcp = left; *lcp != '\0'; lcp++)
4022 idx_t len = 0;
4023 char *rcp = strchr (right, *lcp);
4024 while (rcp != NULL)
4026 idx_t i;
4027 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
4028 continue;
4029 if (i > len)
4030 len = i;
4031 rcp = strchr (rcp + 1, *lcp);
4033 if (len != 0)
4034 cpp = enlist (cpp, lcp, len);
4036 return cpp;
4039 static char **
4040 addlists (char **old, char **new)
4042 for (; *new; new++)
4043 old = enlist (old, *new, strlen (*new));
4044 return old;
4047 /* Given two lists of substrings, return a new list giving substrings
4048 common to both. */
4049 static char **
4050 inboth (char **left, char **right)
4052 char **both = xzalloc (sizeof *both);
4054 for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
4056 for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
4058 char **temp = comsubs (left[lnum], right[rnum]);
4059 both = addlists (both, temp);
4060 freelist (temp);
4061 free (temp);
4064 return both;
4067 typedef struct must must;
4069 struct must
4071 char **in;
4072 char *left;
4073 char *right;
4074 char *is;
4075 bool begline;
4076 bool endline;
4077 must *prev;
4080 static must *
4081 allocmust (must *mp, idx_t size)
4083 must *new_mp = xmalloc (sizeof *new_mp);
4084 new_mp->in = xzalloc (sizeof *new_mp->in);
4085 new_mp->left = xzalloc (size);
4086 new_mp->right = xzalloc (size);
4087 new_mp->is = xzalloc (size);
4088 new_mp->begline = false;
4089 new_mp->endline = false;
4090 new_mp->prev = mp;
4091 return new_mp;
4094 static void
4095 resetmust (must *mp)
4097 freelist (mp->in);
4098 mp->in[0] = NULL;
4099 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4100 mp->begline = false;
4101 mp->endline = false;
4104 static void
4105 freemust (must *mp)
4107 freelist (mp->in);
4108 free (mp->in);
4109 free (mp->left);
4110 free (mp->right);
4111 free (mp->is);
4112 free (mp);
4115 struct dfamust *
4116 dfamust (struct dfa const *d)
4118 must *mp = NULL;
4119 char const *result = "";
4120 bool exact = false;
4121 bool begline = false;
4122 bool endline = false;
4123 bool need_begline = false;
4124 bool need_endline = false;
4125 bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
4127 for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
4129 token t = d->tokens[ri];
4130 switch (t)
4132 case BEGLINE:
4133 mp = allocmust (mp, 2);
4134 mp->begline = true;
4135 need_begline = true;
4136 break;
4137 case ENDLINE:
4138 mp = allocmust (mp, 2);
4139 mp->endline = true;
4140 need_endline = true;
4141 break;
4142 case LPAREN:
4143 case RPAREN:
4144 assert (!"neither LPAREN nor RPAREN may appear here");
4146 case EMPTY:
4147 case BEGWORD:
4148 case ENDWORD:
4149 case LIMWORD:
4150 case NOTLIMWORD:
4151 case BACKREF:
4152 case ANYCHAR:
4153 case MBCSET:
4154 mp = allocmust (mp, 2);
4155 break;
4157 case STAR:
4158 case QMARK:
4159 resetmust (mp);
4160 break;
4162 case OR:
4164 char **new;
4165 must *rmp = mp;
4166 must *lmp = mp = mp->prev;
4167 idx_t j, ln, rn, n;
4169 /* Guaranteed to be. Unlikely, but ... */
4170 if (streq (lmp->is, rmp->is))
4172 lmp->begline &= rmp->begline;
4173 lmp->endline &= rmp->endline;
4175 else
4177 lmp->is[0] = '\0';
4178 lmp->begline = false;
4179 lmp->endline = false;
4181 /* Left side--easy */
4182 idx_t i = 0;
4183 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4184 ++i;
4185 lmp->left[i] = '\0';
4186 /* Right side */
4187 ln = strlen (lmp->right);
4188 rn = strlen (rmp->right);
4189 n = ln;
4190 if (n > rn)
4191 n = rn;
4192 for (i = 0; i < n; ++i)
4193 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4194 break;
4195 for (j = 0; j < i; ++j)
4196 lmp->right[j] = lmp->right[(ln - i) + j];
4197 lmp->right[j] = '\0';
4198 new = inboth (lmp->in, rmp->in);
4199 freelist (lmp->in);
4200 free (lmp->in);
4201 lmp->in = new;
4202 freemust (rmp);
4204 break;
4206 case PLUS:
4207 mp->is[0] = '\0';
4208 break;
4210 case END:
4211 assert (!mp->prev);
4212 for (idx_t i = 0; mp->in[i] != NULL; i++)
4213 if (strlen (mp->in[i]) > strlen (result))
4214 result = mp->in[i];
4215 if (streq (result, mp->is))
4217 if ((!need_begline || mp->begline) && (!need_endline
4218 || mp->endline))
4219 exact = true;
4220 begline = mp->begline;
4221 endline = mp->endline;
4223 goto done;
4225 case CAT:
4227 must *rmp = mp;
4228 must *lmp = mp = mp->prev;
4230 /* In. Everything in left, plus everything in
4231 right, plus concatenation of
4232 left's right and right's left. */
4233 lmp->in = addlists (lmp->in, rmp->in);
4234 if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4236 idx_t lrlen = strlen (lmp->right);
4237 idx_t rllen = strlen (rmp->left);
4238 char *tp = xmalloc (lrlen + rllen);
4239 memcpy (tp, lmp->right, lrlen);
4240 memcpy (tp + lrlen, rmp->left, rllen);
4241 lmp->in = enlist (lmp->in, tp, lrlen + rllen);
4242 free (tp);
4244 /* Left-hand */
4245 if (lmp->is[0] != '\0')
4246 lmp->left = icatalloc (lmp->left, rmp->left);
4247 /* Right-hand */
4248 if (rmp->is[0] == '\0')
4249 lmp->right[0] = '\0';
4250 lmp->right = icatalloc (lmp->right, rmp->right);
4251 /* Guaranteed to be */
4252 if ((lmp->is[0] != '\0' || lmp->begline)
4253 && (rmp->is[0] != '\0' || rmp->endline))
4255 lmp->is = icatalloc (lmp->is, rmp->is);
4256 lmp->endline = rmp->endline;
4258 else
4260 lmp->is[0] = '\0';
4261 lmp->begline = false;
4262 lmp->endline = false;
4264 freemust (rmp);
4266 break;
4268 case '\0':
4269 /* Not on *my* shift. */
4270 goto done;
4272 default:
4273 if (CSET <= t)
4275 /* If T is a singleton, or if case-folding in a unibyte
4276 locale and T's members all case-fold to the same char,
4277 convert T to one of its members. Otherwise, do
4278 nothing further with T. */
4279 charclass *ccl = &d->charclasses[t - CSET];
4280 int j;
4281 for (j = 0; j < NOTCHAR; j++)
4282 if (tstbit (j, ccl))
4283 break;
4284 if (! (j < NOTCHAR))
4286 mp = allocmust (mp, 2);
4287 break;
4289 t = j;
4290 while (++j < NOTCHAR)
4291 if (tstbit (j, ccl)
4292 && ! (case_fold_unibyte
4293 && toupper (j) == toupper (t)))
4294 break;
4295 if (j < NOTCHAR)
4297 mp = allocmust (mp, 2);
4298 break;
4302 idx_t rj = ri + 2;
4303 if (d->tokens[ri + 1] == CAT)
4305 for (; rj < d->tindex - 1; rj += 2)
4307 if ((rj != ri && (d->tokens[rj] <= 0
4308 || NOTCHAR <= d->tokens[rj]))
4309 || d->tokens[rj + 1] != CAT)
4310 break;
4313 mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4314 mp->is[0] = mp->left[0] = mp->right[0]
4315 = case_fold_unibyte ? toupper (t) : t;
4317 idx_t i;
4318 for (i = 1; ri + 2 < rj; i++)
4320 ri += 2;
4321 t = d->tokens[ri];
4322 mp->is[i] = mp->left[i] = mp->right[i]
4323 = case_fold_unibyte ? toupper (t) : t;
4325 mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4326 mp->in = enlist (mp->in, mp->is, i);
4327 break;
4330 done:;
4332 struct dfamust *dm = NULL;
4333 if (*result)
4335 dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
4336 dm->exact = exact;
4337 dm->begline = begline;
4338 dm->endline = endline;
4339 strcpy (dm->must, result);
4342 while (mp)
4344 must *prev = mp->prev;
4345 freemust (mp);
4346 mp = prev;
4349 return dm;
4352 void
4353 dfamustfree (struct dfamust *dm)
4355 free (dm);
4358 struct dfa *
4359 dfaalloc (void)
4361 return xmalloc (sizeof (struct dfa));
4364 /* Initialize DFA. */
4365 void
4366 dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
4367 reg_syntax_t bits, int dfaopts)
4369 memset (dfa, 0, offsetof (struct dfa, dfaexec));
4370 dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4371 dfa->localeinfo = *linfo;
4373 dfa->fast = !dfa->localeinfo.multibyte;
4375 dfa->canychar = -1;
4376 dfa->syntax.syntax_bits_set = true;
4377 dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4378 dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
4379 dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4380 dfa->syntax.syntax_bits = bits;
4382 for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4384 unsigned char uc = i;
4386 dfa->syntax.sbit[uc] = char_context (dfa, uc);
4387 switch (dfa->syntax.sbit[uc])
4389 case CTX_LETTER:
4390 setbit (uc, &dfa->syntax.letters);
4391 break;
4392 case CTX_NEWLINE:
4393 setbit (uc, &dfa->syntax.newline);
4394 break;
4397 /* POSIX requires that the five bytes in "\n\r./" (including the
4398 terminating NUL) cannot occur inside a multibyte character. */
4399 dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4400 ? (uc & 0xc0) != 0x80
4401 : strchr ("\n\r./", uc) != NULL);
4405 /* Initialize TO by copying FROM's syntax settings. */
4406 void
4407 dfacopysyntax (struct dfa *to, struct dfa const *from)
4409 memset (to, 0, offsetof (struct dfa, syntax));
4410 to->canychar = -1;
4411 to->fast = from->fast;
4412 to->syntax = from->syntax;
4413 to->dfaexec = from->dfaexec;
4414 to->localeinfo = from->localeinfo;
4417 /* vim:set shiftwidth=2: */