unistr/u{8,16,32}-uctomb: Avoid possible trouble with huge strings.
[gnulib.git] / lib / dfa.c
blob1f0587a7a338b81cfdac8d4d32ff964846a53d7d
1 /* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2020 Free Software
3 Foundation, Inc.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc.,
18 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
20 /* Written June, 1988 by Mike Haertel
21 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
23 #include <config.h>
25 #include "dfa.h"
27 #include "flexmember.h"
29 #include <assert.h>
30 #include <ctype.h>
31 #include <stdint.h>
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <limits.h>
35 #include <string.h>
37 /* Another name for ptrdiff_t, for sizes of objects and nonnegative
38 indexes into objects. It is signed to help catch integer overflow.
39 It has its own name because it is for nonnegative values only. */
40 typedef ptrdiff_t idx_t;
41 static idx_t const IDX_MAX = PTRDIFF_MAX;
43 static bool
44 streq (char const *a, char const *b)
46 return strcmp (a, b) == 0;
49 static bool
50 isasciidigit (char c)
52 return '0' <= c && c <= '9';
55 #include "gettext.h"
56 #define _(str) gettext (str)
58 #include <wchar.h>
60 #include "intprops.h"
61 #include "xalloc.h"
62 #include "localeinfo.h"
64 #ifndef FALLTHROUGH
65 # if 201710L < __STDC_VERSION__
66 # define FALLTHROUGH [[__fallthrough__]]
67 # elif (__GNUC__ >= 7) || (__clang_major__ >= 10)
68 # define FALLTHROUGH __attribute__ ((__fallthrough__))
69 # else
70 # define FALLTHROUGH ((void) 0)
71 # endif
72 #endif
74 #ifndef MIN
75 # define MIN(a,b) ((a) < (b) ? (a) : (b))
76 #endif
78 /* HPUX defines these as macros in sys/param.h. */
79 #ifdef setbit
80 # undef setbit
81 #endif
82 #ifdef clrbit
83 # undef clrbit
84 #endif
86 /* For code that does not use Gnulib’s isblank module. */
87 #if !defined isblank && !defined HAVE_ISBLANK && !defined GNULIB_ISBLANK
88 # define isblank dfa_isblank
89 static int
90 isblank (int c)
92 return c == ' ' || c == '\t';
94 #endif
96 /* First integer value that is greater than any character code. */
97 enum { NOTCHAR = 1 << CHAR_BIT };
99 #ifdef UINT_LEAST64_MAX
101 /* Number of bits used in a charclass word. */
102 enum { CHARCLASS_WORD_BITS = 64 };
104 /* This represents part of a character class. It must be unsigned and
105 at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
106 typedef uint_least64_t charclass_word;
108 /* Part of a charclass initializer that represents 64 bits' worth of a
109 charclass, where LO and HI are the low and high-order 32 bits of
110 the 64-bit quantity. */
111 # define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
113 #else
114 /* Fallbacks for pre-C99 hosts that lack 64-bit integers. */
115 enum { CHARCLASS_WORD_BITS = 32 };
116 typedef unsigned long charclass_word;
117 # define CHARCLASS_PAIR(lo, hi) lo, hi
118 #endif
120 /* An initializer for a charclass whose 32-bit words are A through H. */
121 #define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \
122 {{ \
123 CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \
124 CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h) \
127 /* The maximum useful value of a charclass_word; all used bits are 1. */
128 static charclass_word const CHARCLASS_WORD_MASK
129 = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
131 /* Number of words required to hold a bit for every character. */
132 enum
134 CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
137 /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
138 typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
140 /* Convert a possibly-signed character to an unsigned character. This is
141 a bit safer than casting to unsigned char, since it catches some type
142 errors that the cast doesn't. */
143 static unsigned char
144 to_uchar (char ch)
146 return ch;
149 /* Contexts tell us whether a character is a newline or a word constituent.
150 Word-constituent characters are those that satisfy iswalnum, plus '_'.
151 Each character has a single CTX_* value; bitmasks of CTX_* values denote
152 a particular character class.
154 A state also stores a context value, which is a bitmask of CTX_* values.
155 A state's context represents a set of characters that the state's
156 predecessors must match. For example, a state whose context does not
157 include CTX_LETTER will never have transitions where the previous
158 character is a word constituent. A state whose context is CTX_ANY
159 might have transitions from any character. */
161 enum
163 CTX_NONE = 1,
164 CTX_LETTER = 2,
165 CTX_NEWLINE = 4,
166 CTX_ANY = 7
169 /* Sometimes characters can only be matched depending on the surrounding
170 context. Such context decisions depend on what the previous character
171 was, and the value of the current (lookahead) character. Context
172 dependent constraints are encoded as 9-bit integers. Each bit that
173 is set indicates that the constraint succeeds in the corresponding
174 context.
176 bit 6-8 - valid contexts when next character is CTX_NEWLINE
177 bit 3-5 - valid contexts when next character is CTX_LETTER
178 bit 0-2 - valid contexts when next character is CTX_NONE
180 succeeds_in_context determines whether a given constraint
181 succeeds in a particular context. Prev is a bitmask of possible
182 context values for the previous character, curr is the (single-bit)
183 context value for the lookahead character. */
184 static int
185 newline_constraint (int constraint)
187 return (constraint >> 6) & 7;
189 static int
190 letter_constraint (int constraint)
192 return (constraint >> 3) & 7;
194 static int
195 other_constraint (int constraint)
197 return constraint & 7;
200 static bool
201 succeeds_in_context (int constraint, int prev, int curr)
203 return !! (((curr & CTX_NONE ? other_constraint (constraint) : 0) \
204 | (curr & CTX_LETTER ? letter_constraint (constraint) : 0) \
205 | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
206 & prev);
209 /* The following describe what a constraint depends on. */
210 static bool
211 prev_newline_dependent (int constraint)
213 return ((constraint ^ constraint >> 2) & 0111) != 0;
215 static bool
216 prev_letter_dependent (int constraint)
218 return ((constraint ^ constraint >> 1) & 0111) != 0;
221 /* Tokens that match the empty string subject to some constraint actually
222 work by applying that constraint to determine what may follow them,
223 taking into account what has gone before. The following values are
224 the constraints corresponding to the special tokens previously defined. */
225 enum
227 NO_CONSTRAINT = 0777,
228 BEGLINE_CONSTRAINT = 0444,
229 ENDLINE_CONSTRAINT = 0700,
230 BEGWORD_CONSTRAINT = 0050,
231 ENDWORD_CONSTRAINT = 0202,
232 LIMWORD_CONSTRAINT = 0252,
233 NOTLIMWORD_CONSTRAINT = 0525
236 /* The regexp is parsed into an array of tokens in postfix form. Some tokens
237 are operators and others are terminal symbols. Most (but not all) of these
238 codes are returned by the lexical analyzer. */
240 typedef ptrdiff_t token;
241 static token const TOKEN_MAX = PTRDIFF_MAX;
243 /* States are indexed by state_num values. These are normally
244 nonnegative but -1 is used as a special value. */
245 typedef ptrdiff_t state_num;
247 /* Predefined token values. */
248 enum
250 END = -1, /* END is a terminal symbol that matches the
251 end of input; any value of END or less in
252 the parse tree is such a symbol. Accepting
253 states of the DFA are those that would have
254 a transition on END. This is -1, not some
255 more-negative value, to tweak the speed of
256 comparisons to END. */
258 /* Ordinary character values are terminal symbols that match themselves. */
260 /* CSET must come last in the following list of special tokens. Otherwise,
261 the list order matters only for performance. Related special tokens
262 should have nearby values so that code like (t == ANYCHAR || t == MBCSET
263 || CSET <= t) can be done with a single machine-level comparison. */
265 EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
266 the empty string. */
268 QMARK, /* QMARK is an operator of one argument that
269 matches zero or one occurrences of its
270 argument. */
272 STAR, /* STAR is an operator of one argument that
273 matches the Kleene closure (zero or more
274 occurrences) of its argument. */
276 PLUS, /* PLUS is an operator of one argument that
277 matches the positive closure (one or more
278 occurrences) of its argument. */
280 REPMN, /* REPMN is a lexical token corresponding
281 to the {m,n} construct. REPMN never
282 appears in the compiled token vector. */
284 CAT, /* CAT is an operator of two arguments that
285 matches the concatenation of its
286 arguments. CAT is never returned by the
287 lexical analyzer. */
289 OR, /* OR is an operator of two arguments that
290 matches either of its arguments. */
292 LPAREN, /* LPAREN never appears in the parse tree,
293 it is only a lexeme. */
295 RPAREN, /* RPAREN never appears in the parse tree. */
297 WCHAR, /* Only returned by lex. wctok contains
298 the wide character representation. */
300 ANYCHAR, /* ANYCHAR is a terminal symbol that matches
301 a valid multibyte (or single byte) character.
302 It is used only if MB_CUR_MAX > 1. */
304 BEG, /* BEG is an initial symbol that matches the
305 beginning of input. */
307 BEGLINE, /* BEGLINE is a terminal symbol that matches
308 the empty string at the beginning of a
309 line. */
311 ENDLINE, /* ENDLINE is a terminal symbol that matches
312 the empty string at the end of a line. */
314 BEGWORD, /* BEGWORD is a terminal symbol that matches
315 the empty string at the beginning of a
316 word. */
318 ENDWORD, /* ENDWORD is a terminal symbol that matches
319 the empty string at the end of a word. */
321 LIMWORD, /* LIMWORD is a terminal symbol that matches
322 the empty string at the beginning or the
323 end of a word. */
325 NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
326 matches the empty string not at
327 the beginning or end of a word. */
329 BACKREF, /* BACKREF is generated by \<digit>
330 or by any other construct that
331 is not completely handled. If the scanner
332 detects a transition on backref, it returns
333 a kind of "semi-success" indicating that
334 the match will have to be verified with
335 a backtracking matcher. */
337 MBCSET, /* MBCSET is similar to CSET, but for
338 multibyte characters. */
340 CSET /* CSET and (and any value greater) is a
341 terminal symbol that matches any of a
342 class of characters. */
346 /* States of the recognizer correspond to sets of positions in the parse
347 tree, together with the constraints under which they may be matched.
348 So a position is encoded as an index into the parse tree together with
349 a constraint. */
350 typedef struct
352 idx_t index; /* Index into the parse array. */
353 unsigned int constraint; /* Constraint for matching this position. */
354 } position;
356 /* Sets of positions are stored as arrays. */
357 typedef struct
359 position *elems; /* Elements of this position set. */
360 idx_t nelem; /* Number of elements in this set. */
361 idx_t alloc; /* Number of elements allocated in ELEMS. */
362 } position_set;
364 /* A state of the dfa consists of a set of positions, some flags,
365 and the token value of the lowest-numbered position of the state that
366 contains an END token. */
367 typedef struct
369 size_t hash; /* Hash of the positions of this state. */
370 position_set elems; /* Positions this state could match. */
371 unsigned char context; /* Context from previous state. */
372 unsigned short constraint; /* Constraint for this state to accept. */
373 token first_end; /* Token value of the first END in elems. */
374 position_set mbps; /* Positions which can match multibyte
375 characters or the follows, e.g., period.
376 Used only if MB_CUR_MAX > 1. */
377 state_num mb_trindex; /* Index of this state in MB_TRANS, or
378 negative if the state does not have
379 ANYCHAR. */
380 } dfa_state;
382 /* Maximum for any transition table count. This should be at least 3,
383 for the initial state setup. */
384 enum { MAX_TRCOUNT = 1024 };
386 /* A bracket operator.
387 e.g., [a-c], [[:alpha:]], etc. */
388 struct mb_char_classes
390 ptrdiff_t cset;
391 bool invert;
392 wchar_t *chars; /* Normal characters. */
393 idx_t nchars;
394 idx_t nchars_alloc;
397 struct regex_syntax
399 /* Syntax bits controlling the behavior of the lexical analyzer. */
400 reg_syntax_t syntax_bits;
401 bool syntax_bits_set;
403 /* Flag for case-folding letters into sets. */
404 bool case_fold;
406 /* True if ^ and $ match only the start and end of data, and do not match
407 end-of-line within data. */
408 bool anchor;
410 /* End-of-line byte in data. */
411 unsigned char eolbyte;
413 /* Cache of char-context values. */
414 char sbit[NOTCHAR];
416 /* If never_trail[B], the byte B cannot be a non-initial byte in a
417 multibyte character. */
418 bool never_trail[NOTCHAR];
420 /* Set of characters considered letters. */
421 charclass letters;
423 /* Set of characters that are newline. */
424 charclass newline;
427 /* Lexical analyzer. All the dross that deals with the obnoxious
428 GNU Regex syntax bits is located here. The poor, suffering
429 reader is referred to the GNU Regex documentation for the
430 meaning of the @#%!@#%^!@ syntax bits. */
431 struct lexer_state
433 char const *ptr; /* Pointer to next input character. */
434 idx_t left; /* Number of characters remaining. */
435 token lasttok; /* Previous token returned; initially END. */
436 idx_t parens; /* Count of outstanding left parens. */
437 int minrep, maxrep; /* Repeat counts for {m,n}. */
439 /* Wide character representation of the current multibyte character,
440 or WEOF if there was an encoding error. Used only if
441 MB_CUR_MAX > 1. */
442 wint_t wctok;
444 /* The most recently analyzed multibyte bracket expression. */
445 struct mb_char_classes brack;
447 /* We're separated from beginning or (, | only by zero-width characters. */
448 bool laststart;
451 /* Recursive descent parser for regular expressions. */
453 struct parser_state
455 token tok; /* Lookahead token. */
456 idx_t depth; /* Current depth of a hypothetical stack
457 holding deferred productions. This is
458 used to determine the depth that will be
459 required of the real stack later on in
460 dfaanalyze. */
463 /* A compiled regular expression. */
464 struct dfa
466 /* Fields filled by the scanner. */
467 charclass *charclasses; /* Array of character sets for CSET tokens. */
468 idx_t cindex; /* Index for adding new charclasses. */
469 idx_t calloc; /* Number of charclasses allocated. */
470 ptrdiff_t canychar; /* Index of anychar class, or -1. */
472 /* Scanner state */
473 struct lexer_state lex;
475 /* Parser state */
476 struct parser_state parse;
478 /* Fields filled by the parser. */
479 token *tokens; /* Postfix parse array. */
480 idx_t tindex; /* Index for adding new tokens. */
481 idx_t talloc; /* Number of tokens currently allocated. */
482 idx_t depth; /* Depth required of an evaluation stack
483 used for depth-first traversal of the
484 parse tree. */
485 idx_t nleaves; /* Number of leaves on the parse tree. */
486 idx_t nregexps; /* Count of parallel regexps being built
487 with dfaparse. */
488 bool fast; /* The DFA is fast. */
489 token utf8_anychar_classes[9]; /* To lower ANYCHAR in UTF-8 locales. */
490 mbstate_t mbs; /* Multibyte conversion state. */
492 /* The following are valid only if MB_CUR_MAX > 1. */
494 /* The value of multibyte_prop[i] is defined by following rule.
495 if tokens[i] < NOTCHAR
496 bit 0 : tokens[i] is the first byte of a character, including
497 single-byte characters.
498 bit 1 : tokens[i] is the last byte of a character, including
499 single-byte characters.
501 e.g.
502 tokens
503 = 'single_byte_a', 'multi_byte_A', single_byte_b'
504 = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
505 multibyte_prop
506 = 3 , 1 , 0 , 2 , 3
508 char *multibyte_prop;
510 /* Fields filled by the superset. */
511 struct dfa *superset; /* Hint of the dfa. */
513 /* Fields filled by the state builder. */
514 dfa_state *states; /* States of the dfa. */
515 state_num sindex; /* Index for adding new states. */
516 idx_t salloc; /* Number of states currently allocated. */
518 /* Fields filled by the parse tree->NFA conversion. */
519 position_set *follows; /* Array of follow sets, indexed by position
520 index. The follow of a position is the set
521 of positions containing characters that
522 could conceivably follow a character
523 matching the given position in a string
524 matching the regexp. Allocated to the
525 maximum possible position index. */
526 bool searchflag; /* We are supposed to build a searching
527 as opposed to an exact matcher. A searching
528 matcher finds the first and shortest string
529 matching a regexp anywhere in the buffer,
530 whereas an exact matcher finds the longest
531 string matching, but anchored to the
532 beginning of the buffer. */
534 /* Fields filled by dfaanalyze. */
535 int *constraints; /* Array of union of accepting constraints
536 in the follow of a position. */
537 int *separates; /* Array of contexts on follow of a
538 position. */
540 /* Fields filled by dfaexec. */
541 state_num tralloc; /* Number of transition tables that have
542 slots so far, not counting trans[-1] and
543 trans[-2]. */
544 int trcount; /* Number of transition tables that have
545 been built, other than for initial
546 states. */
547 int min_trcount; /* Number of initial states. Equivalently,
548 the minimum state number for which trcount
549 counts transitions. */
550 state_num **trans; /* Transition tables for states that can
551 never accept. If the transitions for a
552 state have not yet been computed, or the
553 state could possibly accept, its entry in
554 this table is NULL. This points to two
555 past the start of the allocated array,
556 and trans[-1] and trans[-2] are always
557 NULL. */
558 state_num **fails; /* Transition tables after failing to accept
559 on a state that potentially could do so.
560 If trans[i] is non-null, fails[i] must
561 be null. */
562 char *success; /* Table of acceptance conditions used in
563 dfaexec and computed in build_state. */
564 state_num *newlines; /* Transitions on newlines. The entry for a
565 newline in any transition table is always
566 -1 so we can count lines without wasting
567 too many cycles. The transition for a
568 newline is stored separately and handled
569 as a special case. Newline is also used
570 as a sentinel at the end of the buffer. */
571 state_num initstate_notbol; /* Initial state for CTX_LETTER and CTX_NONE
572 context in multibyte locales, in which we
573 do not distinguish between their contexts,
574 as not supported word. */
575 position_set mb_follows; /* Follow set added by ANYCHAR on demand. */
576 state_num **mb_trans; /* Transition tables for states with
577 ANYCHAR. */
578 state_num mb_trcount; /* Number of transition tables for states with
579 ANYCHAR that have actually been built. */
581 /* Syntax configuration. This is near the end so that dfacopysyntax
582 can memset up to here. */
583 struct regex_syntax syntax;
585 /* Information derived from the locale. This is at the end so that
586 a quick memset need not clear it specially. */
588 /* dfaexec implementation. */
589 char *(*dfaexec) (struct dfa *, char const *, char *,
590 bool, ptrdiff_t *, bool *);
592 /* Other cached information derived from the locale. */
593 struct localeinfo localeinfo;
596 /* User access to dfa internals. */
598 /* S could possibly be an accepting state of R. */
599 static bool
600 accepting (state_num s, struct dfa const *r)
602 return r->states[s].constraint != 0;
605 /* STATE accepts in the specified context. */
606 static bool
607 accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
609 return succeeds_in_context (dfa->states[state].constraint, prev, curr);
612 static void regexp (struct dfa *dfa);
614 /* Store into *PWC the result of converting the leading bytes of the
615 multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
616 and updating the conversion state in *D. On conversion error,
617 convert just a single byte, to WEOF. Return the number of bytes
618 converted.
620 This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
622 * PWC points to wint_t, not to wchar_t.
623 * The last arg is a dfa *D instead of merely a multibyte conversion
624 state D->mbs.
625 * N must be at least 1.
626 * S[N - 1] must be a sentinel byte.
627 * Shift encodings are not supported.
628 * The return value is always in the range 1..N.
629 * D->mbs is always valid afterwards.
630 * *PWC is always set to something. */
631 static int
632 mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
634 unsigned char uc = s[0];
635 wint_t wc = d->localeinfo.sbctowc[uc];
637 if (wc == WEOF)
639 wchar_t wch;
640 size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
641 if (0 < nbytes && nbytes < (size_t) -2)
643 *pwc = wch;
644 return nbytes;
646 memset (&d->mbs, 0, sizeof d->mbs);
649 *pwc = wc;
650 return 1;
653 #ifdef DEBUG
655 static void
656 prtok (token t)
658 if (t <= END)
659 fprintf (stderr, "END");
660 else if (0 <= t && t < NOTCHAR)
662 unsigned int ch = t;
663 fprintf (stderr, "0x%02x", ch);
665 else
667 char const *s;
668 switch (t)
670 case BEG:
671 s = "BEG";
672 break;
673 case EMPTY:
674 s = "EMPTY";
675 break;
676 case BACKREF:
677 s = "BACKREF";
678 break;
679 case BEGLINE:
680 s = "BEGLINE";
681 break;
682 case ENDLINE:
683 s = "ENDLINE";
684 break;
685 case BEGWORD:
686 s = "BEGWORD";
687 break;
688 case ENDWORD:
689 s = "ENDWORD";
690 break;
691 case LIMWORD:
692 s = "LIMWORD";
693 break;
694 case NOTLIMWORD:
695 s = "NOTLIMWORD";
696 break;
697 case QMARK:
698 s = "QMARK";
699 break;
700 case STAR:
701 s = "STAR";
702 break;
703 case PLUS:
704 s = "PLUS";
705 break;
706 case CAT:
707 s = "CAT";
708 break;
709 case OR:
710 s = "OR";
711 break;
712 case LPAREN:
713 s = "LPAREN";
714 break;
715 case RPAREN:
716 s = "RPAREN";
717 break;
718 case ANYCHAR:
719 s = "ANYCHAR";
720 break;
721 case MBCSET:
722 s = "MBCSET";
723 break;
724 default:
725 s = "CSET";
726 break;
728 fprintf (stderr, "%s", s);
731 #endif /* DEBUG */
733 /* Stuff pertaining to charclasses. */
735 static bool
736 tstbit (unsigned int b, charclass const *c)
738 return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
741 static void
742 setbit (unsigned int b, charclass *c)
744 charclass_word one = 1;
745 c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
748 static void
749 clrbit (unsigned int b, charclass *c)
751 charclass_word one = 1;
752 c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
755 static void
756 zeroset (charclass *s)
758 memset (s, 0, sizeof *s);
761 static void
762 fillset (charclass *s)
764 for (int i = 0; i < CHARCLASS_WORDS; i++)
765 s->w[i] = CHARCLASS_WORD_MASK;
768 static void
769 notset (charclass *s)
771 for (int i = 0; i < CHARCLASS_WORDS; ++i)
772 s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
775 static bool
776 equal (charclass const *s1, charclass const *s2)
778 charclass_word w = 0;
779 for (int i = 0; i < CHARCLASS_WORDS; i++)
780 w |= s1->w[i] ^ s2->w[i];
781 return w == 0;
784 static bool
785 emptyset (charclass const *s)
787 charclass_word w = 0;
788 for (int i = 0; i < CHARCLASS_WORDS; i++)
789 w |= s->w[i];
790 return w == 0;
793 /* Grow PA, which points to an array of *NITEMS items, and return the
794 location of the reallocated array, updating *NITEMS to reflect its
795 new size. The new array will contain at least NITEMS_INCR_MIN more
796 items, but will not contain more than NITEMS_MAX items total.
797 ITEM_SIZE is the size of each item, in bytes.
799 ITEM_SIZE and NITEMS_INCR_MIN must be positive. *NITEMS must be
800 nonnegative. If NITEMS_MAX is -1, it is treated as if it were
801 infinity.
803 If PA is null, then allocate a new array instead of reallocating
804 the old one.
806 Thus, to grow an array A without saving its old contents, do
807 { free (A); A = xpalloc (NULL, &AITEMS, ...); }. */
809 static void *
810 xpalloc (void *pa, idx_t *nitems, idx_t nitems_incr_min,
811 ptrdiff_t nitems_max, idx_t item_size)
813 idx_t n0 = *nitems;
815 /* The approximate size to use for initial small allocation
816 requests. This is the largest "small" request for the GNU C
817 library malloc. */
818 enum { DEFAULT_MXFAST = 64 * sizeof (size_t) / 4 };
820 /* If the array is tiny, grow it to about (but no greater than)
821 DEFAULT_MXFAST bytes. Otherwise, grow it by about 50%.
822 Adjust the growth according to three constraints: NITEMS_INCR_MIN,
823 NITEMS_MAX, and what the C language can represent safely. */
825 idx_t n, nbytes;
826 if (INT_ADD_WRAPV (n0, n0 >> 1, &n))
827 n = IDX_MAX;
828 if (0 <= nitems_max && nitems_max < n)
829 n = nitems_max;
831 idx_t adjusted_nbytes
832 = ((INT_MULTIPLY_WRAPV (n, item_size, &nbytes) || SIZE_MAX < nbytes)
833 ? MIN (IDX_MAX, SIZE_MAX)
834 : nbytes < DEFAULT_MXFAST ? DEFAULT_MXFAST : 0);
835 if (adjusted_nbytes)
837 n = adjusted_nbytes / item_size;
838 nbytes = adjusted_nbytes - adjusted_nbytes % item_size;
841 if (! pa)
842 *nitems = 0;
843 if (n - n0 < nitems_incr_min
844 && (INT_ADD_WRAPV (n0, nitems_incr_min, &n)
845 || (0 <= nitems_max && nitems_max < n)
846 || INT_MULTIPLY_WRAPV (n, item_size, &nbytes)))
847 xalloc_die ();
848 pa = xrealloc (pa, nbytes);
849 *nitems = n;
850 return pa;
853 /* Ensure that the array addressed by PA holds at least I + 1 items.
854 Either return PA, or reallocate the array and return its new address.
855 Although PA may be null, the returned value is never null.
857 The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
858 is updated on reallocation. If PA is null, *NITEMS must be zero.
859 Do not allocate more than NITEMS_MAX items total; -1 means no limit.
860 ITEM_SIZE is the size of one item; it must be positive.
861 Avoid O(N**2) behavior on arrays growing linearly. */
862 static void *
863 maybe_realloc (void *pa, idx_t i, idx_t *nitems,
864 ptrdiff_t nitems_max, idx_t item_size)
866 if (i < *nitems)
867 return pa;
868 return xpalloc (pa, nitems, 1, nitems_max, item_size);
871 /* In DFA D, find the index of charclass S, or allocate a new one. */
872 static idx_t
873 charclass_index (struct dfa *d, charclass const *s)
875 idx_t i;
877 for (i = 0; i < d->cindex; ++i)
878 if (equal (s, &d->charclasses[i]))
879 return i;
880 d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
881 TOKEN_MAX - CSET, sizeof *d->charclasses);
882 ++d->cindex;
883 d->charclasses[i] = *s;
884 return i;
887 static bool
888 unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
890 return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
893 static int
894 char_context (struct dfa const *dfa, unsigned char c)
896 if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
897 return CTX_NEWLINE;
898 if (unibyte_word_constituent (dfa, c))
899 return CTX_LETTER;
900 return CTX_NONE;
903 /* Set a bit in the charclass for the given wchar_t. Do nothing if WC
904 is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
905 this may happen when folding case in weird Turkish locales where
906 dotless i/dotted I are not included in the chosen character set.
907 Return whether a bit was set in the charclass. */
908 static bool
909 setbit_wc (wint_t wc, charclass *c)
911 int b = wctob (wc);
912 if (b < 0)
913 return false;
915 setbit (b, c);
916 return true;
919 /* Set a bit for B and its case variants in the charclass C.
920 MB_CUR_MAX must be 1. */
921 static void
922 setbit_case_fold_c (int b, charclass *c)
924 int ub = toupper (b);
925 for (int i = 0; i < NOTCHAR; i++)
926 if (toupper (i) == ub)
927 setbit (i, c);
930 /* Fetch the next lexical input character from the pattern. There
931 must at least one byte of pattern input. Set DFA->lex.wctok to the
932 value of the character or to WEOF depending on whether the input is
933 a valid multibyte character (possibly of length 1). Then return
934 the next input byte value, except return EOF if the input is a
935 multibyte character of length greater than 1. */
936 static int
937 fetch_wc (struct dfa *dfa)
939 int nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
940 dfa);
941 int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
942 dfa->lex.ptr += nbytes;
943 dfa->lex.left -= nbytes;
944 return c;
947 /* If there is no more input, report an error about unbalanced brackets.
948 Otherwise, behave as with fetch_wc (DFA). */
949 static int
950 bracket_fetch_wc (struct dfa *dfa)
952 if (! dfa->lex.left)
953 dfaerror (_("unbalanced ["));
954 return fetch_wc (dfa);
957 typedef int predicate (int);
959 /* The following list maps the names of the Posix named character classes
960 to predicate functions that determine whether a given character is in
961 the class. The leading [ has already been eaten by the lexical
962 analyzer. */
963 struct dfa_ctype
965 const char *name;
966 predicate *func;
967 bool single_byte_only;
970 static const struct dfa_ctype prednames[] = {
971 {"alpha", isalpha, false},
972 {"upper", isupper, false},
973 {"lower", islower, false},
974 {"digit", isdigit, true},
975 {"xdigit", isxdigit, false},
976 {"space", isspace, false},
977 {"punct", ispunct, false},
978 {"alnum", isalnum, false},
979 {"print", isprint, false},
980 {"graph", isgraph, false},
981 {"cntrl", iscntrl, false},
982 {"blank", isblank, false},
983 {NULL, NULL, false}
986 static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
987 find_pred (const char *str)
989 for (int i = 0; prednames[i].name; i++)
990 if (streq (str, prednames[i].name))
991 return &prednames[i];
992 return NULL;
995 /* Parse a bracket expression, which possibly includes multibyte
996 characters. */
997 static token
998 parse_bracket_exp (struct dfa *dfa)
1000 /* This is a bracket expression that dfaexec is known to
1001 process correctly. */
1002 bool known_bracket_exp = true;
1004 /* Used to warn about [:space:].
1005 Bit 0 = first character is a colon.
1006 Bit 1 = last character is a colon.
1007 Bit 2 = includes any other character but a colon.
1008 Bit 3 = includes ranges, char/equiv classes or collation elements. */
1009 int colon_warning_state;
1011 dfa->lex.brack.nchars = 0;
1012 charclass ccl;
1013 zeroset (&ccl);
1014 int c = bracket_fetch_wc (dfa);
1015 bool invert = c == '^';
1016 if (invert)
1018 c = bracket_fetch_wc (dfa);
1019 known_bracket_exp = dfa->localeinfo.simple;
1021 wint_t wc = dfa->lex.wctok;
1022 int c1;
1023 wint_t wc1;
1024 colon_warning_state = (c == ':');
1027 c1 = NOTCHAR; /* Mark c1 as not initialized. */
1028 colon_warning_state &= ~2;
1030 /* Note that if we're looking at some other [:...:] construct,
1031 we just treat it as a bunch of ordinary characters. We can do
1032 this because we assume regex has checked for syntax errors before
1033 dfa is ever called. */
1034 if (c == '[')
1036 c1 = bracket_fetch_wc (dfa);
1037 wc1 = dfa->lex.wctok;
1039 if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
1040 || c1 == '.' || c1 == '=')
1042 enum { MAX_BRACKET_STRING_LEN = 32 };
1043 char str[MAX_BRACKET_STRING_LEN + 1];
1044 int len = 0;
1045 for (;;)
1047 c = bracket_fetch_wc (dfa);
1048 if (dfa->lex.left == 0
1049 || (c == c1 && dfa->lex.ptr[0] == ']'))
1050 break;
1051 if (len < MAX_BRACKET_STRING_LEN)
1052 str[len++] = c;
1053 else
1054 /* This is in any case an invalid class name. */
1055 str[0] = '\0';
1057 str[len] = '\0';
1059 /* Fetch bracket. */
1060 c = bracket_fetch_wc (dfa);
1061 wc = dfa->lex.wctok;
1062 if (c1 == ':')
1063 /* Build character class. POSIX allows character
1064 classes to match multicharacter collating elements,
1065 but the regex code does not support that, so do not
1066 worry about that possibility. */
1068 char const *class
1069 = (dfa->syntax.case_fold && (streq (str, "upper")
1070 || streq (str, "lower"))
1071 ? "alpha" : str);
1072 const struct dfa_ctype *pred = find_pred (class);
1073 if (!pred)
1074 dfaerror (_("invalid character class"));
1076 if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1077 known_bracket_exp = false;
1078 else
1079 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1080 if (pred->func (c2))
1081 setbit (c2, &ccl);
1083 else
1084 known_bracket_exp = false;
1086 colon_warning_state |= 8;
1088 /* Fetch new lookahead character. */
1089 c1 = bracket_fetch_wc (dfa);
1090 wc1 = dfa->lex.wctok;
1091 continue;
1094 /* We treat '[' as a normal character here. c/c1/wc/wc1
1095 are already set up. */
1098 if (c == '\\'
1099 && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1101 c = bracket_fetch_wc (dfa);
1102 wc = dfa->lex.wctok;
1105 if (c1 == NOTCHAR)
1107 c1 = bracket_fetch_wc (dfa);
1108 wc1 = dfa->lex.wctok;
1111 if (c1 == '-')
1112 /* build range characters. */
1114 int c2 = bracket_fetch_wc (dfa);
1115 wint_t wc2 = dfa->lex.wctok;
1117 /* A bracket expression like [a-[.aa.]] matches an unknown set.
1118 Treat it like [-a[.aa.]] while parsing it, and
1119 remember that the set is unknown. */
1120 if (c2 == '[' && dfa->lex.ptr[0] == '.')
1122 known_bracket_exp = false;
1123 c2 = ']';
1126 if (c2 == ']')
1128 /* In the case [x-], the - is an ordinary hyphen,
1129 which is left in c1, the lookahead character. */
1130 dfa->lex.ptr--;
1131 dfa->lex.left++;
1133 else
1135 if (c2 == '\\' && (dfa->syntax.syntax_bits
1136 & RE_BACKSLASH_ESCAPE_IN_LISTS))
1138 c2 = bracket_fetch_wc (dfa);
1139 wc2 = dfa->lex.wctok;
1142 colon_warning_state |= 8;
1143 c1 = bracket_fetch_wc (dfa);
1144 wc1 = dfa->lex.wctok;
1146 /* Treat [x-y] as a range if x != y. */
1147 if (wc != wc2 || wc == WEOF)
1149 if (dfa->localeinfo.simple
1150 || (isasciidigit (c) & isasciidigit (c2)))
1152 for (int ci = c; ci <= c2; ci++)
1153 if (dfa->syntax.case_fold && isalpha (ci))
1154 setbit_case_fold_c (ci, &ccl);
1155 else
1156 setbit (ci, &ccl);
1158 else
1159 known_bracket_exp = false;
1161 continue;
1166 colon_warning_state |= (c == ':') ? 2 : 4;
1168 if (!dfa->localeinfo.multibyte)
1170 if (dfa->syntax.case_fold && isalpha (c))
1171 setbit_case_fold_c (c, &ccl);
1172 else
1173 setbit (c, &ccl);
1174 continue;
1177 if (wc == WEOF)
1178 known_bracket_exp = false;
1179 else
1181 wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1182 int n = (dfa->syntax.case_fold
1183 ? case_folded_counterparts (wc, folded + 1) + 1
1184 : 1);
1185 folded[0] = wc;
1186 for (int i = 0; i < n; i++)
1187 if (!setbit_wc (folded[i], &ccl))
1189 dfa->lex.brack.chars
1190 = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1191 &dfa->lex.brack.nchars_alloc, -1,
1192 sizeof *dfa->lex.brack.chars);
1193 dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1197 while ((wc = wc1, (c = c1) != ']'));
1199 if (colon_warning_state == 7)
1200 dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1202 if (! known_bracket_exp)
1203 return BACKREF;
1205 if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
1207 dfa->lex.brack.invert = invert;
1208 dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1209 return MBCSET;
1212 if (invert)
1214 notset (&ccl);
1215 if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1216 clrbit ('\n', &ccl);
1219 return CSET + charclass_index (dfa, &ccl);
1222 struct lexptr
1224 char const *ptr;
1225 idx_t left;
1228 static void
1229 push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
1231 ls->ptr = dfa->lex.ptr;
1232 ls->left = dfa->lex.left;
1233 dfa->lex.ptr = s;
1234 dfa->lex.left = strlen (s);
1237 static void
1238 pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
1240 dfa->lex.ptr = ls->ptr;
1241 dfa->lex.left = ls->left;
1244 static token
1245 lex (struct dfa *dfa)
1247 bool backslash = false;
1249 /* Basic plan: We fetch a character. If it's a backslash,
1250 we set the backslash flag and go through the loop again.
1251 On the plus side, this avoids having a duplicate of the
1252 main switch inside the backslash case. On the minus side,
1253 it means that just about every case begins with
1254 "if (backslash) ...". */
1255 for (int i = 0; i < 2; ++i)
1257 if (! dfa->lex.left)
1258 return dfa->lex.lasttok = END;
1259 int c = fetch_wc (dfa);
1261 switch (c)
1263 case '\\':
1264 if (backslash)
1265 goto normal_char;
1266 if (dfa->lex.left == 0)
1267 dfaerror (_("unfinished \\ escape"));
1268 backslash = true;
1269 break;
1271 case '^':
1272 if (backslash)
1273 goto normal_char;
1274 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1275 || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
1276 || dfa->lex.lasttok == OR)
1277 return dfa->lex.lasttok = BEGLINE;
1278 goto normal_char;
1280 case '$':
1281 if (backslash)
1282 goto normal_char;
1283 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1284 || dfa->lex.left == 0
1285 || ((dfa->lex.left
1286 > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1287 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1288 & (dfa->lex.ptr[0] == '\\')]
1289 == ')'))
1290 || ((dfa->lex.left
1291 > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1292 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1293 & (dfa->lex.ptr[0] == '\\')]
1294 == '|'))
1295 || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1296 && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1297 return dfa->lex.lasttok = ENDLINE;
1298 goto normal_char;
1300 case '1':
1301 case '2':
1302 case '3':
1303 case '4':
1304 case '5':
1305 case '6':
1306 case '7':
1307 case '8':
1308 case '9':
1309 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
1311 dfa->lex.laststart = false;
1312 return dfa->lex.lasttok = BACKREF;
1314 goto normal_char;
1316 case '`':
1317 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1319 /* FIXME: should be beginning of string */
1320 return dfa->lex.lasttok = BEGLINE;
1322 goto normal_char;
1324 case '\'':
1325 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1327 /* FIXME: should be end of string */
1328 return dfa->lex.lasttok = ENDLINE;
1330 goto normal_char;
1332 case '<':
1333 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1334 return dfa->lex.lasttok = BEGWORD;
1335 goto normal_char;
1337 case '>':
1338 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1339 return dfa->lex.lasttok = ENDWORD;
1340 goto normal_char;
1342 case 'b':
1343 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1344 return dfa->lex.lasttok = LIMWORD;
1345 goto normal_char;
1347 case 'B':
1348 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1349 return dfa->lex.lasttok = NOTLIMWORD;
1350 goto normal_char;
1352 case '?':
1353 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1354 goto normal_char;
1355 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1356 goto normal_char;
1357 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1358 && dfa->lex.laststart)
1359 goto normal_char;
1360 return dfa->lex.lasttok = QMARK;
1362 case '*':
1363 if (backslash)
1364 goto normal_char;
1365 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1366 && dfa->lex.laststart)
1367 goto normal_char;
1368 return dfa->lex.lasttok = STAR;
1370 case '+':
1371 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1372 goto normal_char;
1373 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1374 goto normal_char;
1375 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1376 && dfa->lex.laststart)
1377 goto normal_char;
1378 return dfa->lex.lasttok = PLUS;
1380 case '{':
1381 if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1382 goto normal_char;
1383 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1384 goto normal_char;
1385 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1386 && dfa->lex.laststart)
1387 goto normal_char;
1389 /* Cases:
1390 {M} - exact count
1391 {M,} - minimum count, maximum is infinity
1392 {,N} - 0 through N
1393 {,} - 0 to infinity (same as '*')
1394 {M,N} - M through N */
1396 char const *p = dfa->lex.ptr;
1397 char const *lim = p + dfa->lex.left;
1398 dfa->lex.minrep = dfa->lex.maxrep = -1;
1399 for (; p != lim && isasciidigit (*p); p++)
1400 dfa->lex.minrep = (dfa->lex.minrep < 0
1401 ? *p - '0'
1402 : MIN (RE_DUP_MAX + 1,
1403 dfa->lex.minrep * 10 + *p - '0'));
1404 if (p != lim)
1406 if (*p != ',')
1407 dfa->lex.maxrep = dfa->lex.minrep;
1408 else
1410 if (dfa->lex.minrep < 0)
1411 dfa->lex.minrep = 0;
1412 while (++p != lim && isasciidigit (*p))
1413 dfa->lex.maxrep
1414 = (dfa->lex.maxrep < 0
1415 ? *p - '0'
1416 : MIN (RE_DUP_MAX + 1,
1417 dfa->lex.maxrep * 10 + *p - '0'));
1420 if (! ((! backslash || (p != lim && *p++ == '\\'))
1421 && p != lim && *p++ == '}'
1422 && 0 <= dfa->lex.minrep
1423 && (dfa->lex.maxrep < 0
1424 || dfa->lex.minrep <= dfa->lex.maxrep)))
1426 if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
1427 goto normal_char;
1428 dfaerror (_("invalid content of \\{\\}"));
1430 if (RE_DUP_MAX < dfa->lex.maxrep)
1431 dfaerror (_("regular expression too big"));
1432 dfa->lex.ptr = p;
1433 dfa->lex.left = lim - p;
1435 dfa->lex.laststart = false;
1436 return dfa->lex.lasttok = REPMN;
1438 case '|':
1439 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1440 goto normal_char;
1441 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1442 goto normal_char;
1443 dfa->lex.laststart = true;
1444 return dfa->lex.lasttok = OR;
1446 case '\n':
1447 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
1448 || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1449 goto normal_char;
1450 dfa->lex.laststart = true;
1451 return dfa->lex.lasttok = OR;
1453 case '(':
1454 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1455 goto normal_char;
1456 dfa->lex.parens++;
1457 dfa->lex.laststart = true;
1458 return dfa->lex.lasttok = LPAREN;
1460 case ')':
1461 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1462 goto normal_char;
1463 if (dfa->lex.parens == 0
1464 && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1465 goto normal_char;
1466 dfa->lex.parens--;
1467 dfa->lex.laststart = false;
1468 return dfa->lex.lasttok = RPAREN;
1470 case '.':
1471 if (backslash)
1472 goto normal_char;
1473 if (dfa->canychar < 0)
1475 charclass ccl;
1476 fillset (&ccl);
1477 if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1478 clrbit ('\n', &ccl);
1479 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1480 clrbit ('\0', &ccl);
1481 if (dfa->localeinfo.multibyte)
1482 for (int c2 = 0; c2 < NOTCHAR; c2++)
1483 if (dfa->localeinfo.sbctowc[c2] == WEOF)
1484 clrbit (c2, &ccl);
1485 dfa->canychar = charclass_index (dfa, &ccl);
1487 dfa->lex.laststart = false;
1488 return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1489 ? ANYCHAR
1490 : CSET + dfa->canychar);
1492 case 's':
1493 case 'S':
1494 if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1495 goto normal_char;
1496 if (!dfa->localeinfo.multibyte)
1498 charclass ccl;
1499 zeroset (&ccl);
1500 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1501 if (isspace (c2))
1502 setbit (c2, &ccl);
1503 if (c == 'S')
1504 notset (&ccl);
1505 dfa->lex.laststart = false;
1506 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1509 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1510 add_utf8_anychar, makes sense. */
1512 /* \s and \S are documented to be equivalent to [[:space:]] and
1513 [^[:space:]] respectively, so tell the lexer to process those
1514 strings, each minus its "already processed" '['. */
1516 struct lexptr ls;
1517 push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1518 dfa->lex.lasttok = parse_bracket_exp (dfa);
1519 pop_lex_state (dfa, &ls);
1522 dfa->lex.laststart = false;
1523 return dfa->lex.lasttok;
1525 case 'w':
1526 case 'W':
1527 if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1528 goto normal_char;
1530 if (!dfa->localeinfo.multibyte)
1532 charclass ccl;
1533 zeroset (&ccl);
1534 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1535 if (dfa->syntax.sbit[c2] == CTX_LETTER)
1536 setbit (c2, &ccl);
1537 if (c == 'W')
1538 notset (&ccl);
1539 dfa->lex.laststart = false;
1540 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1543 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1544 add_utf8_anychar, makes sense. */
1546 /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1547 [^_[:alnum:]] respectively, so tell the lexer to process those
1548 strings, each minus its "already processed" '['. */
1550 struct lexptr ls;
1551 push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1552 dfa->lex.lasttok = parse_bracket_exp (dfa);
1553 pop_lex_state (dfa, &ls);
1556 dfa->lex.laststart = false;
1557 return dfa->lex.lasttok;
1559 case '[':
1560 if (backslash)
1561 goto normal_char;
1562 dfa->lex.laststart = false;
1563 return dfa->lex.lasttok = parse_bracket_exp (dfa);
1565 default:
1566 normal_char:
1567 dfa->lex.laststart = false;
1568 /* For multibyte character sets, folding is done in atom. Always
1569 return WCHAR. */
1570 if (dfa->localeinfo.multibyte)
1571 return dfa->lex.lasttok = WCHAR;
1573 if (dfa->syntax.case_fold && isalpha (c))
1575 charclass ccl;
1576 zeroset (&ccl);
1577 setbit_case_fold_c (c, &ccl);
1578 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1581 return dfa->lex.lasttok = c;
1585 /* The above loop should consume at most a backslash
1586 and some other character. */
1587 abort ();
1588 return END; /* keeps pedantic compilers happy. */
1591 static void
1592 addtok_mb (struct dfa *dfa, token t, char mbprop)
1594 if (dfa->talloc == dfa->tindex)
1596 dfa->tokens = xpalloc (dfa->tokens, &dfa->talloc, 1, -1,
1597 sizeof *dfa->tokens);
1598 if (dfa->localeinfo.multibyte)
1599 dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
1600 sizeof *dfa->multibyte_prop);
1602 if (dfa->localeinfo.multibyte)
1603 dfa->multibyte_prop[dfa->tindex] = mbprop;
1604 dfa->tokens[dfa->tindex++] = t;
1606 switch (t)
1608 case QMARK:
1609 case STAR:
1610 case PLUS:
1611 break;
1613 case CAT:
1614 case OR:
1615 dfa->parse.depth--;
1616 break;
1618 case BACKREF:
1619 dfa->fast = false;
1620 FALLTHROUGH;
1621 default:
1622 dfa->nleaves++;
1623 FALLTHROUGH;
1624 case EMPTY:
1625 dfa->parse.depth++;
1626 break;
1628 if (dfa->parse.depth > dfa->depth)
1629 dfa->depth = dfa->parse.depth;
1632 static void addtok_wc (struct dfa *dfa, wint_t wc);
1634 /* Add the given token to the parse tree, maintaining the depth count and
1635 updating the maximum depth if necessary. */
1636 static void
1637 addtok (struct dfa *dfa, token t)
1639 if (dfa->localeinfo.multibyte && t == MBCSET)
1641 bool need_or = false;
1643 /* Extract wide characters into alternations for better performance.
1644 This does not require UTF-8. */
1645 for (idx_t i = 0; i < dfa->lex.brack.nchars; i++)
1647 addtok_wc (dfa, dfa->lex.brack.chars[i]);
1648 if (need_or)
1649 addtok (dfa, OR);
1650 need_or = true;
1652 dfa->lex.brack.nchars = 0;
1654 /* Wide characters have been handled above, so it is possible
1655 that the set is empty now. Do nothing in that case. */
1656 if (dfa->lex.brack.cset != -1)
1658 addtok (dfa, CSET + dfa->lex.brack.cset);
1659 if (need_or)
1660 addtok (dfa, OR);
1663 else
1665 addtok_mb (dfa, t, 3);
1669 /* We treat a multibyte character as a single atom, so that DFA
1670 can treat a multibyte character as a single expression.
1672 e.g., we construct the following tree from "<mb1><mb2>".
1673 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1674 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1675 static void
1676 addtok_wc (struct dfa *dfa, wint_t wc)
1678 unsigned char buf[MB_LEN_MAX];
1679 mbstate_t s = { 0 };
1680 size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1681 int buflen;
1683 if (stored_bytes != (size_t) -1)
1684 buflen = stored_bytes;
1685 else
1687 /* This is merely stop-gap. buf[0] is undefined, yet skipping
1688 the addtok_mb call altogether can corrupt the heap. */
1689 buflen = 1;
1690 buf[0] = 0;
1693 addtok_mb (dfa, buf[0], buflen == 1 ? 3 : 1);
1694 for (int i = 1; i < buflen; i++)
1696 addtok_mb (dfa, buf[i], i == buflen - 1 ? 2 : 0);
1697 addtok (dfa, CAT);
1701 static void
1702 add_utf8_anychar (struct dfa *dfa)
1704 /* Since the Unicode Standard Version 4.0.0 (2003), a well-formed
1705 UTF-8 byte sequence has been defined as follows:
1707 ([\x00-\x7f]
1708 |[\xc2-\xdf][\x80-\xbf]
1709 |[\xe0][\xa0-\xbf][\x80-\xbf]
1710 |[\xe1-\xec\xee-\xef][\x80-\xbf][\x80-\xbf]
1711 |[\xed][\x80-\x9f][\x80-\xbf]
1712 |[\xf0][\x90-\xbf][\x80-\xbf][\x80-\xbf])
1713 |[\xf1-\xf3][\x80-\xbf][\x80-\xbf][\x80-\xbf]
1714 |[\xf4][\x80-\x8f][\x80-\xbf][\x80-\xbf])
1716 which I'll write more concisely "A|BC|DEC|FCC|GHC|IJCC|KCCC|LMCC",
1717 where A = [\x00-\x7f], B = [\xc2-\xdf], C = [\x80-\xbf],
1718 D = [\xe0], E = [\xa0-\xbf], F = [\xe1-\xec\xee-\xef], G = [\xed],
1719 H = [\x80-\x9f], I = [\xf0],
1720 J = [\x90-\xbf], K = [\xf1-\xf3], L = [\xf4], M = [\x80-\x8f].
1722 This can be refactored to "A|(B|DE|GH|(F|IJ|LM|KC)C)C". */
1724 /* Mnemonics for classes containing two or more bytes. */
1725 enum { A, B, C, E, F, H, J, K, M };
1727 /* Mnemonics for single-byte tokens. */
1728 enum { D_token = 0xe0, G_token = 0xed, I_token = 0xf0, L_token = 0xf4 };
1730 static charclass const utf8_classes[] = {
1731 /* A. 00-7f: 1-byte sequence. */
1732 CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1734 /* B. c2-df: 1st byte of a 2-byte sequence. */
1735 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1737 /* C. 80-bf: non-leading bytes. */
1738 CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1740 /* D. e0 (just a token). */
1742 /* E. a0-bf: 2nd byte of a "DEC" sequence. */
1743 CHARCLASS_INIT (0, 0, 0, 0, 0, 0xffffffff, 0, 0),
1745 /* F. e1-ec + ee-ef: 1st byte of an "FCC" sequence. */
1746 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xdffe),
1748 /* G. ed (just a token). */
1750 /* H. 80-9f: 2nd byte of a "GHC" sequence. */
1751 CHARCLASS_INIT (0, 0, 0, 0, 0xffff, 0, 0, 0),
1753 /* I. f0 (just a token). */
1755 /* J. 90-bf: 2nd byte of an "IJCC" sequence. */
1756 CHARCLASS_INIT (0, 0, 0, 0, 0xffff0000, 0xffffffff, 0, 0),
1758 /* K. f1-f3: 1st byte of a "KCCC" sequence. */
1759 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xe0000),
1761 /* L. f4 (just a token). */
1763 /* M. 80-8f: 2nd byte of a "LMCC" sequence. */
1764 CHARCLASS_INIT (0, 0, 0, 0, 0xff, 0, 0, 0),
1767 /* Define the character classes that are needed below. */
1768 if (dfa->utf8_anychar_classes[0] == 0)
1770 charclass c = utf8_classes[0];
1771 if (! (dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1772 clrbit ('\n', &c);
1773 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1774 clrbit ('\0', &c);
1775 dfa->utf8_anychar_classes[0] = CSET + charclass_index (dfa, &c);
1777 for (int i = 1; i < sizeof utf8_classes / sizeof *utf8_classes; i++)
1778 dfa->utf8_anychar_classes[i]
1779 = CSET + charclass_index (dfa, &utf8_classes[i]);
1782 /* Implement the "A|(B|DE|GH|(F|IJ|LM|KC)C)C" pattern mentioned above.
1783 The token buffer is in reverse Polish order, so we get
1784 "A B D E CAT OR G H CAT OR F I J CAT OR L M CAT OR K
1785 C CAT OR C CAT OR C CAT OR". */
1786 addtok (dfa, dfa->utf8_anychar_classes[A]);
1787 addtok (dfa, dfa->utf8_anychar_classes[B]);
1788 addtok (dfa, D_token);
1789 addtok (dfa, dfa->utf8_anychar_classes[E]);
1790 addtok (dfa, CAT);
1791 addtok (dfa, OR);
1792 addtok (dfa, G_token);
1793 addtok (dfa, dfa->utf8_anychar_classes[H]);
1794 addtok (dfa, CAT);
1795 addtok (dfa, OR);
1796 addtok (dfa, dfa->utf8_anychar_classes[F]);
1797 addtok (dfa, I_token);
1798 addtok (dfa, dfa->utf8_anychar_classes[J]);
1799 addtok (dfa, CAT);
1800 addtok (dfa, OR);
1801 addtok (dfa, L_token);
1802 addtok (dfa, dfa->utf8_anychar_classes[M]);
1803 addtok (dfa, CAT);
1804 addtok (dfa, OR);
1805 addtok (dfa, dfa->utf8_anychar_classes[K]);
1806 for (int i = 0; i < 3; i++)
1808 addtok (dfa, dfa->utf8_anychar_classes[C]);
1809 addtok (dfa, CAT);
1810 addtok (dfa, OR);
1814 /* The grammar understood by the parser is as follows.
1816 regexp:
1817 regexp OR branch
1818 branch
1820 branch:
1821 branch closure
1822 closure
1824 closure:
1825 closure QMARK
1826 closure STAR
1827 closure PLUS
1828 closure REPMN
1829 atom
1831 atom:
1832 <normal character>
1833 <multibyte character>
1834 ANYCHAR
1835 MBCSET
1836 CSET
1837 BACKREF
1838 BEGLINE
1839 ENDLINE
1840 BEGWORD
1841 ENDWORD
1842 LIMWORD
1843 NOTLIMWORD
1844 LPAREN regexp RPAREN
1845 <empty>
1847 The parser builds a parse tree in postfix form in an array of tokens. */
1849 static void
1850 atom (struct dfa *dfa)
1852 if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1853 || dfa->parse.tok >= CSET
1854 || dfa->parse.tok == BEG || dfa->parse.tok == BACKREF
1855 || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1856 || dfa->parse.tok == BEGWORD || dfa->parse.tok == ENDWORD
1857 || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD
1858 || dfa->parse.tok == ANYCHAR || dfa->parse.tok == MBCSET)
1860 if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1862 /* For UTF-8 expand the period to a series of CSETs that define a
1863 valid UTF-8 character. This avoids using the slow multibyte
1864 path. I'm pretty sure it would be both profitable and correct to
1865 do it for any encoding; however, the optimization must be done
1866 manually as it is done above in add_utf8_anychar. So, let's
1867 start with UTF-8: it is the most used, and the structure of the
1868 encoding makes the correctness more obvious. */
1869 add_utf8_anychar (dfa);
1871 else
1872 addtok (dfa, dfa->parse.tok);
1873 dfa->parse.tok = lex (dfa);
1875 else if (dfa->parse.tok == WCHAR)
1877 if (dfa->lex.wctok == WEOF)
1878 addtok (dfa, BACKREF);
1879 else
1881 addtok_wc (dfa, dfa->lex.wctok);
1883 if (dfa->syntax.case_fold)
1885 wchar_t folded[CASE_FOLDED_BUFSIZE];
1886 int n = case_folded_counterparts (dfa->lex.wctok, folded);
1887 for (int i = 0; i < n; i++)
1889 addtok_wc (dfa, folded[i]);
1890 addtok (dfa, OR);
1895 dfa->parse.tok = lex (dfa);
1897 else if (dfa->parse.tok == LPAREN)
1899 dfa->parse.tok = lex (dfa);
1900 regexp (dfa);
1901 if (dfa->parse.tok != RPAREN)
1902 dfaerror (_("unbalanced ("));
1903 dfa->parse.tok = lex (dfa);
1905 else
1906 addtok (dfa, EMPTY);
1909 /* Return the number of tokens in the given subexpression. */
1910 static idx_t _GL_ATTRIBUTE_PURE
1911 nsubtoks (struct dfa const *dfa, idx_t tindex)
1913 switch (dfa->tokens[tindex - 1])
1915 default:
1916 return 1;
1917 case QMARK:
1918 case STAR:
1919 case PLUS:
1920 return 1 + nsubtoks (dfa, tindex - 1);
1921 case CAT:
1922 case OR:
1924 idx_t ntoks1 = nsubtoks (dfa, tindex - 1);
1925 return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1930 /* Copy the given subexpression to the top of the tree. */
1931 static void
1932 copytoks (struct dfa *dfa, idx_t tindex, idx_t ntokens)
1934 if (dfa->localeinfo.multibyte)
1935 for (idx_t i = 0; i < ntokens; i++)
1936 addtok_mb (dfa, dfa->tokens[tindex + i],
1937 dfa->multibyte_prop[tindex + i]);
1938 else
1939 for (idx_t i = 0; i < ntokens; i++)
1940 addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1943 static void
1944 closure (struct dfa *dfa)
1946 atom (dfa);
1947 while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1948 || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1949 if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
1951 idx_t ntokens = nsubtoks (dfa, dfa->tindex);
1952 idx_t tindex = dfa->tindex - ntokens;
1953 if (dfa->lex.maxrep < 0)
1954 addtok (dfa, PLUS);
1955 if (dfa->lex.minrep == 0)
1956 addtok (dfa, QMARK);
1957 int i;
1958 for (i = 1; i < dfa->lex.minrep; i++)
1960 copytoks (dfa, tindex, ntokens);
1961 addtok (dfa, CAT);
1963 for (; i < dfa->lex.maxrep; i++)
1965 copytoks (dfa, tindex, ntokens);
1966 addtok (dfa, QMARK);
1967 addtok (dfa, CAT);
1969 dfa->parse.tok = lex (dfa);
1971 else if (dfa->parse.tok == REPMN)
1973 dfa->tindex -= nsubtoks (dfa, dfa->tindex);
1974 dfa->parse.tok = lex (dfa);
1975 closure (dfa);
1977 else
1979 addtok (dfa, dfa->parse.tok);
1980 dfa->parse.tok = lex (dfa);
1984 static void
1985 branch (struct dfa* dfa)
1987 closure (dfa);
1988 while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
1989 && dfa->parse.tok >= 0)
1991 closure (dfa);
1992 addtok (dfa, CAT);
1996 static void
1997 regexp (struct dfa *dfa)
1999 branch (dfa);
2000 while (dfa->parse.tok == OR)
2002 dfa->parse.tok = lex (dfa);
2003 branch (dfa);
2004 addtok (dfa, OR);
2008 /* Parse a string S of length LEN into D. S can include NUL characters.
2009 This is the main entry point for the parser. */
2010 void
2011 dfaparse (char const *s, idx_t len, struct dfa *d)
2013 d->lex.ptr = s;
2014 d->lex.left = len;
2015 d->lex.lasttok = END;
2016 d->lex.laststart = true;
2018 if (!d->syntax.syntax_bits_set)
2019 dfaerror (_("no syntax specified"));
2021 if (!d->nregexps)
2022 addtok (d, BEG);
2024 d->parse.tok = lex (d);
2025 d->parse.depth = d->depth;
2027 regexp (d);
2029 if (d->parse.tok != END)
2030 dfaerror (_("unbalanced )"));
2032 addtok (d, END - d->nregexps);
2033 addtok (d, CAT);
2035 if (d->nregexps)
2036 addtok (d, OR);
2038 ++d->nregexps;
2041 /* Some primitives for operating on sets of positions. */
2043 /* Copy one set to another. */
2044 static void
2045 copy (position_set const *src, position_set *dst)
2047 if (dst->alloc < src->nelem)
2049 free (dst->elems);
2050 dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
2051 sizeof *dst->elems);
2053 dst->nelem = src->nelem;
2054 if (src->nelem != 0)
2055 memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2058 static void
2059 alloc_position_set (position_set *s, idx_t size)
2061 s->elems = xnmalloc (size, sizeof *s->elems);
2062 s->alloc = size;
2063 s->nelem = 0;
2066 /* Insert position P in set S. S is maintained in sorted order on
2067 decreasing index. If there is already an entry in S with P.index
2068 then merge (logically-OR) P's constraints into the one in S.
2069 S->elems must point to an array large enough to hold the resulting set. */
2070 static void
2071 insert (position p, position_set *s)
2073 idx_t count = s->nelem;
2074 idx_t lo = 0, hi = count;
2075 while (lo < hi)
2077 idx_t mid = (lo + hi) >> 1;
2078 if (s->elems[mid].index < p.index)
2079 lo = mid + 1;
2080 else if (s->elems[mid].index == p.index)
2082 s->elems[mid].constraint |= p.constraint;
2083 return;
2085 else
2086 hi = mid;
2089 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2090 for (idx_t i = count; i > lo; i--)
2091 s->elems[i] = s->elems[i - 1];
2092 s->elems[lo] = p;
2093 ++s->nelem;
2096 static void
2097 append (position p, position_set *s)
2099 idx_t count = s->nelem;
2100 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2101 s->elems[s->nelem++] = p;
2104 /* Merge S1 and S2 (with the additional constraint C2) into M. The
2105 result is as if the positions of S1, and of S2 with the additional
2106 constraint C2, were inserted into an initially empty set. */
2107 static void
2108 merge_constrained (position_set const *s1, position_set const *s2,
2109 unsigned int c2, position_set *m)
2111 idx_t i = 0, j = 0;
2113 if (m->alloc - s1->nelem < s2->nelem)
2115 free (m->elems);
2116 m->alloc = s1->nelem;
2117 m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2119 m->nelem = 0;
2120 while (i < s1->nelem || j < s2->nelem)
2121 if (! (j < s2->nelem)
2122 || (i < s1->nelem && s1->elems[i].index <= s2->elems[j].index))
2124 unsigned int c = ((i < s1->nelem && j < s2->nelem
2125 && s1->elems[i].index == s2->elems[j].index)
2126 ? s2->elems[j++].constraint & c2
2127 : 0);
2128 m->elems[m->nelem].index = s1->elems[i].index;
2129 m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c;
2131 else
2133 if (s2->elems[j].constraint & c2)
2135 m->elems[m->nelem].index = s2->elems[j].index;
2136 m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2138 j++;
2142 /* Merge two sets of positions into a third. The result is exactly as if
2143 the positions of both sets were inserted into an initially empty set. */
2144 static void
2145 merge (position_set const *s1, position_set const *s2, position_set *m)
2147 merge_constrained (s1, s2, -1, m);
2150 static void
2151 merge2 (position_set *dst, position_set const *src, position_set *m)
2153 if (src->nelem < 4)
2155 for (idx_t i = 0; i < src->nelem; i++)
2156 insert (src->elems[i], dst);
2158 else
2160 merge (src, dst, m);
2161 copy (m, dst);
2165 /* Delete a position from a set. Return the nonzero constraint of the
2166 deleted position, or zero if there was no such position. */
2167 static unsigned int
2168 delete (idx_t del, position_set *s)
2170 idx_t count = s->nelem;
2171 idx_t lo = 0, hi = count;
2172 while (lo < hi)
2174 idx_t mid = (lo + hi) >> 1;
2175 if (s->elems[mid].index < del)
2176 lo = mid + 1;
2177 else if (s->elems[mid].index == del)
2179 unsigned int c = s->elems[mid].constraint;
2180 idx_t i;
2181 for (i = mid; i + 1 < count; i++)
2182 s->elems[i] = s->elems[i + 1];
2183 s->nelem = i;
2184 return c;
2186 else
2187 hi = mid;
2189 return 0;
2192 /* Replace a position with the followed set. */
2193 static void
2194 replace (position_set *dst, idx_t del, position_set *add,
2195 unsigned int constraint, position_set *tmp)
2197 unsigned int c = delete (del, dst) & constraint;
2199 if (c)
2201 copy (dst, tmp);
2202 merge_constrained (tmp, add, c, dst);
2206 /* Find the index of the state corresponding to the given position set with
2207 the given preceding context, or create a new state if there is no such
2208 state. Context tells whether we got here on a newline or letter. */
2209 static state_num
2210 state_index (struct dfa *d, position_set const *s, int context)
2212 size_t hash = 0;
2213 int constraint = 0;
2214 state_num i;
2215 token first_end = 0;
2217 for (i = 0; i < s->nelem; ++i)
2219 size_t ind = s->elems[i].index;
2220 hash ^= ind + s->elems[i].constraint;
2223 /* Try to find a state that exactly matches the proposed one. */
2224 for (i = 0; i < d->sindex; ++i)
2226 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2227 || context != d->states[i].context)
2228 continue;
2229 state_num j;
2230 for (j = 0; j < s->nelem; ++j)
2231 if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2232 || s->elems[j].index != d->states[i].elems.elems[j].index)
2233 break;
2234 if (j == s->nelem)
2235 return i;
2238 #ifdef DEBUG
2239 fprintf (stderr, "new state %td\n nextpos:", i);
2240 for (state_num j = 0; j < s->nelem; j++)
2242 fprintf (stderr, " %td:", s->elems[j].index);
2243 prtok (d->tokens[s->elems[j].index]);
2245 fprintf (stderr, "\n context:");
2246 if (context ^ CTX_ANY)
2248 if (context & CTX_NONE)
2249 fprintf (stderr, " CTX_NONE");
2250 if (context & CTX_LETTER)
2251 fprintf (stderr, " CTX_LETTER");
2252 if (context & CTX_NEWLINE)
2253 fprintf (stderr, " CTX_NEWLINE");
2255 else
2256 fprintf (stderr, " CTX_ANY");
2257 fprintf (stderr, "\n");
2258 #endif
2260 for (state_num j = 0; j < s->nelem; j++)
2262 int c = d->constraints[s->elems[j].index];
2264 if (c != 0)
2266 if (succeeds_in_context (c, context, CTX_ANY))
2267 constraint |= c;
2268 if (!first_end)
2269 first_end = d->tokens[s->elems[j].index];
2271 else if (d->tokens[s->elems[j].index] == BACKREF)
2272 constraint = NO_CONSTRAINT;
2276 /* Create a new state. */
2277 d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2278 sizeof *d->states);
2279 d->states[i].hash = hash;
2280 alloc_position_set (&d->states[i].elems, s->nelem);
2281 copy (s, &d->states[i].elems);
2282 d->states[i].context = context;
2283 d->states[i].constraint = constraint;
2284 d->states[i].first_end = first_end;
2285 d->states[i].mbps.nelem = 0;
2286 d->states[i].mbps.elems = NULL;
2287 d->states[i].mb_trindex = -1;
2289 ++d->sindex;
2291 return i;
2294 /* Find the epsilon closure of a set of positions. If any position of the set
2295 contains a symbol that matches the empty string in some context, replace
2296 that position with the elements of its follow labeled with an appropriate
2297 constraint. Repeat exhaustively until no funny positions are left.
2298 S->elems must be large enough to hold the result. */
2299 static void
2300 epsclosure (struct dfa const *d)
2302 position_set tmp;
2303 alloc_position_set (&tmp, d->nleaves);
2304 for (idx_t i = 0; i < d->tindex; i++)
2305 if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR
2306 && d->tokens[i] != BACKREF && d->tokens[i] != ANYCHAR
2307 && d->tokens[i] != MBCSET && d->tokens[i] < CSET)
2309 unsigned int constraint;
2310 switch (d->tokens[i])
2312 case BEGLINE:
2313 constraint = BEGLINE_CONSTRAINT;
2314 break;
2315 case ENDLINE:
2316 constraint = ENDLINE_CONSTRAINT;
2317 break;
2318 case BEGWORD:
2319 constraint = BEGWORD_CONSTRAINT;
2320 break;
2321 case ENDWORD:
2322 constraint = ENDWORD_CONSTRAINT;
2323 break;
2324 case LIMWORD:
2325 constraint = LIMWORD_CONSTRAINT;
2326 break;
2327 case NOTLIMWORD:
2328 constraint = NOTLIMWORD_CONSTRAINT;
2329 break;
2330 default:
2331 constraint = NO_CONSTRAINT;
2332 break;
2335 delete (i, &d->follows[i]);
2337 for (idx_t j = 0; j < d->tindex; j++)
2338 if (i != j && d->follows[j].nelem > 0)
2339 replace (&d->follows[j], i, &d->follows[i], constraint, &tmp);
2341 free (tmp.elems);
2344 /* Returns the set of contexts for which there is at least one
2345 character included in C. */
2347 static int
2348 charclass_context (struct dfa const *dfa, charclass const *c)
2350 int context = 0;
2352 for (int j = 0; j < CHARCLASS_WORDS; j++)
2354 if (c->w[j] & dfa->syntax.newline.w[j])
2355 context |= CTX_NEWLINE;
2356 if (c->w[j] & dfa->syntax.letters.w[j])
2357 context |= CTX_LETTER;
2358 if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
2359 context |= CTX_NONE;
2362 return context;
2365 /* Returns the contexts on which the position set S depends. Each context
2366 in the set of returned contexts (let's call it SC) may have a different
2367 follow set than other contexts in SC, and also different from the
2368 follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2369 in the complement set will have the same follow set. */
2371 static int _GL_ATTRIBUTE_PURE
2372 state_separate_contexts (struct dfa *d, position_set const *s)
2374 int separate_contexts = 0;
2376 for (idx_t j = 0; j < s->nelem; j++)
2377 separate_contexts |= d->separates[s->elems[j].index];
2379 return separate_contexts;
2382 enum
2384 /* Single token is repeated. It is distinguished from non-repeated. */
2385 OPT_REPEAT = (1 << 0),
2387 /* Multiple tokens are repeated. This flag is on at head of tokens. The
2388 node is not merged. */
2389 OPT_LPAREN = (1 << 1),
2391 /* Multiple branches are joined. The node is not merged. */
2392 OPT_RPAREN = (1 << 2),
2394 /* The node is walked. If the node is found in walking again, OPT_RPAREN
2395 flag is turned on. */
2396 OPT_WALKED = (1 << 3),
2398 /* The node is queued. The node is not queued again. */
2399 OPT_QUEUED = (1 << 4)
2402 static void
2403 merge_nfa_state (struct dfa *d, idx_t tindex, char *flags,
2404 position_set *merged)
2406 position_set *follows = d->follows;
2407 idx_t nelem = 0;
2409 d->constraints[tindex] = 0;
2411 for (idx_t i = 0; i < follows[tindex].nelem; i++)
2413 idx_t sindex = follows[tindex].elems[i].index;
2415 /* Skip the node as pruned in future. */
2416 unsigned int iconstraint = follows[tindex].elems[i].constraint;
2417 if (iconstraint == 0)
2418 continue;
2420 if (d->tokens[follows[tindex].elems[i].index] <= END)
2422 d->constraints[tindex] |= follows[tindex].elems[i].constraint;
2423 continue;
2426 if (!(flags[sindex] & (OPT_LPAREN | OPT_RPAREN)))
2428 idx_t j;
2430 for (j = 0; j < nelem; j++)
2432 idx_t dindex = follows[tindex].elems[j].index;
2434 if (follows[tindex].elems[j].constraint != iconstraint)
2435 continue;
2437 if (flags[dindex] & (OPT_LPAREN | OPT_RPAREN))
2438 continue;
2440 if (d->tokens[sindex] != d->tokens[dindex])
2441 continue;
2443 if ((flags[sindex] ^ flags[dindex]) & OPT_REPEAT)
2444 continue;
2446 if (flags[sindex] & OPT_REPEAT)
2447 delete (sindex, &follows[sindex]);
2449 merge2 (&follows[dindex], &follows[sindex], merged);
2451 break;
2454 if (j < nelem)
2455 continue;
2458 follows[tindex].elems[nelem++] = follows[tindex].elems[i];
2459 flags[sindex] |= OPT_QUEUED;
2462 follows[tindex].nelem = nelem;
2465 static int
2466 compare (const void *a, const void *b)
2468 position const *p = a, *q = b;
2469 return (p->index > q->index) - (p->index < q->index);
2472 static void
2473 reorder_tokens (struct dfa *d)
2475 idx_t nleaves;
2476 ptrdiff_t *map;
2477 token *tokens;
2478 position_set *follows;
2479 int *constraints;
2480 char *multibyte_prop;
2482 nleaves = 0;
2484 map = xnmalloc (d->tindex, sizeof *map);
2486 map[0] = nleaves++;
2488 for (idx_t i = 1; i < d->tindex; i++)
2489 map[i] = -1;
2491 tokens = xnmalloc (d->nleaves, sizeof *tokens);
2492 follows = xnmalloc (d->nleaves, sizeof *follows);
2493 constraints = xnmalloc (d->nleaves, sizeof *constraints);
2495 if (d->localeinfo.multibyte)
2496 multibyte_prop = xnmalloc (d->nleaves, sizeof *multibyte_prop);
2497 else
2498 multibyte_prop = NULL;
2500 for (idx_t i = 0; i < d->tindex; i++)
2502 if (map[i] == -1)
2504 free (d->follows[i].elems);
2505 d->follows[i].elems = NULL;
2506 d->follows[i].nelem = 0;
2507 continue;
2510 tokens[map[i]] = d->tokens[i];
2511 follows[map[i]] = d->follows[i];
2512 constraints[map[i]] = d->constraints[i];
2514 if (multibyte_prop != NULL)
2515 multibyte_prop[map[i]] = d->multibyte_prop[i];
2517 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2519 if (map[d->follows[i].elems[j].index] == -1)
2520 map[d->follows[i].elems[j].index] = nleaves++;
2522 d->follows[i].elems[j].index = map[d->follows[i].elems[j].index];
2525 qsort (d->follows[i].elems, d->follows[i].nelem,
2526 sizeof *d->follows[i].elems, compare);
2529 for (idx_t i = 0; i < nleaves; i++)
2531 d->tokens[i] = tokens[i];
2532 d->follows[i] = follows[i];
2533 d->constraints[i] = constraints[i];
2535 if (multibyte_prop != NULL)
2536 d->multibyte_prop[i] = multibyte_prop[i];
2539 d->tindex = d->nleaves = nleaves;
2541 free (tokens);
2542 free (follows);
2543 free (constraints);
2544 free (multibyte_prop);
2545 free (map);
2548 static void
2549 dfaoptimize (struct dfa *d)
2551 char *flags = xzalloc (d->tindex);
2553 for (idx_t i = 0; i < d->tindex; i++)
2555 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2557 if (d->follows[i].elems[j].index == i)
2558 flags[d->follows[i].elems[j].index] |= OPT_REPEAT;
2559 else if (d->follows[i].elems[j].index < i)
2560 flags[d->follows[i].elems[j].index] |= OPT_LPAREN;
2561 else if (flags[d->follows[i].elems[j].index] &= OPT_WALKED)
2562 flags[d->follows[i].elems[j].index] |= OPT_RPAREN;
2563 else
2564 flags[d->follows[i].elems[j].index] |= OPT_WALKED;
2568 flags[0] |= OPT_QUEUED;
2570 position_set merged0;
2571 position_set *merged = &merged0;
2572 alloc_position_set (merged, d->nleaves);
2574 d->constraints = xnmalloc (d->tindex, sizeof *d->constraints);
2576 for (idx_t i = 0; i < d->tindex; i++)
2577 if (flags[i] & OPT_QUEUED)
2578 merge_nfa_state (d, i, flags, merged);
2580 reorder_tokens (d);
2582 free (merged->elems);
2583 free (flags);
2586 /* Perform bottom-up analysis on the parse tree, computing various functions.
2587 Note that at this point, we're pretending constructs like \< are real
2588 characters rather than constraints on what can follow them.
2590 Nullable: A node is nullable if it is at the root of a regexp that can
2591 match the empty string.
2592 * EMPTY leaves are nullable.
2593 * No other leaf is nullable.
2594 * A QMARK or STAR node is nullable.
2595 * A PLUS node is nullable if its argument is nullable.
2596 * A CAT node is nullable if both its arguments are nullable.
2597 * An OR node is nullable if either argument is nullable.
2599 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2600 that could correspond to the first character of a string matching the
2601 regexp rooted at the given node.
2602 * EMPTY leaves have empty firstpos.
2603 * The firstpos of a nonempty leaf is that leaf itself.
2604 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2605 argument.
2606 * The firstpos of a CAT node is the firstpos of the left argument, union
2607 the firstpos of the right if the left argument is nullable.
2608 * The firstpos of an OR node is the union of firstpos of each argument.
2610 Lastpos: The lastpos of a node is the set of positions that could
2611 correspond to the last character of a string matching the regexp at
2612 the given node.
2613 * EMPTY leaves have empty lastpos.
2614 * The lastpos of a nonempty leaf is that leaf itself.
2615 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2616 argument.
2617 * The lastpos of a CAT node is the lastpos of its right argument, union
2618 the lastpos of the left if the right argument is nullable.
2619 * The lastpos of an OR node is the union of the lastpos of each argument.
2621 Follow: The follow of a position is the set of positions that could
2622 correspond to the character following a character matching the node in
2623 a string matching the regexp. At this point we consider special symbols
2624 that match the empty string in some context to be just normal characters.
2625 Later, if we find that a special symbol is in a follow set, we will
2626 replace it with the elements of its follow, labeled with an appropriate
2627 constraint.
2628 * Every node in the firstpos of the argument of a STAR or PLUS node is in
2629 the follow of every node in the lastpos.
2630 * Every node in the firstpos of the second argument of a CAT node is in
2631 the follow of every node in the lastpos of the first argument.
2633 Because of the postfix representation of the parse tree, the depth-first
2634 analysis is conveniently done by a linear scan with the aid of a stack.
2635 Sets are stored as arrays of the elements, obeying a stack-like allocation
2636 scheme; the number of elements in each set deeper in the stack can be
2637 used to determine the address of a particular set's array. */
2638 static void
2639 dfaanalyze (struct dfa *d, bool searchflag)
2641 /* Array allocated to hold position sets. */
2642 position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2643 /* Firstpos and lastpos elements. */
2644 position *firstpos = posalloc;
2645 position *lastpos = firstpos + d->nleaves;
2646 position pos;
2647 position_set tmp;
2649 /* Stack for element counts and nullable flags. */
2650 struct
2652 /* Whether the entry is nullable. */
2653 bool nullable;
2655 /* Counts of firstpos and lastpos sets. */
2656 idx_t nfirstpos;
2657 idx_t nlastpos;
2658 } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2660 position_set merged; /* Result of merging sets. */
2662 addtok (d, CAT);
2664 #ifdef DEBUG
2665 fprintf (stderr, "dfaanalyze:\n");
2666 for (idx_t i = 0; i < d->tindex; i++)
2668 fprintf (stderr, " %td:", i);
2669 prtok (d->tokens[i]);
2671 putc ('\n', stderr);
2672 #endif
2674 d->searchflag = searchflag;
2675 alloc_position_set (&merged, d->nleaves);
2676 d->follows = xcalloc (d->tindex, sizeof *d->follows);
2678 for (idx_t i = 0; i < d->tindex; i++)
2680 switch (d->tokens[i])
2682 case EMPTY:
2683 /* The empty set is nullable. */
2684 stk->nullable = true;
2686 /* The firstpos and lastpos of the empty leaf are both empty. */
2687 stk->nfirstpos = stk->nlastpos = 0;
2688 stk++;
2689 break;
2691 case STAR:
2692 case PLUS:
2693 /* Every element in the firstpos of the argument is in the follow
2694 of every element in the lastpos. */
2696 tmp.elems = firstpos - stk[-1].nfirstpos;
2697 tmp.nelem = stk[-1].nfirstpos;
2698 position *p = lastpos - stk[-1].nlastpos;
2699 for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2701 merge (&tmp, &d->follows[p[j].index], &merged);
2702 copy (&merged, &d->follows[p[j].index]);
2705 FALLTHROUGH;
2706 case QMARK:
2707 /* A QMARK or STAR node is automatically nullable. */
2708 if (d->tokens[i] != PLUS)
2709 stk[-1].nullable = true;
2710 break;
2712 case CAT:
2713 /* Every element in the firstpos of the second argument is in the
2714 follow of every element in the lastpos of the first argument. */
2716 tmp.nelem = stk[-1].nfirstpos;
2717 tmp.elems = firstpos - stk[-1].nfirstpos;
2718 position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2719 for (idx_t j = 0; j < stk[-2].nlastpos; j++)
2721 merge (&tmp, &d->follows[p[j].index], &merged);
2722 copy (&merged, &d->follows[p[j].index]);
2726 /* The firstpos of a CAT node is the firstpos of the first argument,
2727 union that of the second argument if the first is nullable. */
2728 if (stk[-2].nullable)
2729 stk[-2].nfirstpos += stk[-1].nfirstpos;
2730 else
2731 firstpos -= stk[-1].nfirstpos;
2733 /* The lastpos of a CAT node is the lastpos of the second argument,
2734 union that of the first argument if the second is nullable. */
2735 if (stk[-1].nullable)
2736 stk[-2].nlastpos += stk[-1].nlastpos;
2737 else
2739 position *p = lastpos - stk[-1].nlastpos - stk[-2].nlastpos;
2740 for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2741 p[j] = p[j + stk[-2].nlastpos];
2742 lastpos -= stk[-2].nlastpos;
2743 stk[-2].nlastpos = stk[-1].nlastpos;
2746 /* A CAT node is nullable if both arguments are nullable. */
2747 stk[-2].nullable &= stk[-1].nullable;
2748 stk--;
2749 break;
2751 case OR:
2752 /* The firstpos is the union of the firstpos of each argument. */
2753 stk[-2].nfirstpos += stk[-1].nfirstpos;
2755 /* The lastpos is the union of the lastpos of each argument. */
2756 stk[-2].nlastpos += stk[-1].nlastpos;
2758 /* An OR node is nullable if either argument is nullable. */
2759 stk[-2].nullable |= stk[-1].nullable;
2760 stk--;
2761 break;
2763 default:
2764 /* Anything else is a nonempty position. (Note that special
2765 constructs like \< are treated as nonempty strings here;
2766 an "epsilon closure" effectively makes them nullable later.
2767 Backreferences have to get a real position so we can detect
2768 transitions on them later. But they are nullable. */
2769 stk->nullable = d->tokens[i] == BACKREF;
2771 /* This position is in its own firstpos and lastpos. */
2772 stk->nfirstpos = stk->nlastpos = 1;
2773 stk++;
2775 firstpos->index = lastpos->index = i;
2776 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2777 firstpos++, lastpos++;
2779 break;
2781 #ifdef DEBUG
2782 /* ... balance the above nonsyntactic #ifdef goo... */
2783 fprintf (stderr, "node %td:", i);
2784 prtok (d->tokens[i]);
2785 putc ('\n', stderr);
2786 fprintf (stderr,
2787 stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2788 fprintf (stderr, " firstpos:");
2789 for (idx_t j = 0; j < stk[-1].nfirstpos; j++)
2791 fprintf (stderr, " %td:", firstpos[j - stk[-1].nfirstpos].index);
2792 prtok (d->tokens[firstpos[j - stk[-1].nfirstpos].index]);
2794 fprintf (stderr, "\n lastpos:");
2795 for (idx_t j = 0; j < stk[-1].nlastpos; j++)
2797 fprintf (stderr, " %td:", lastpos[j - stk[-1].nlastpos].index);
2798 prtok (d->tokens[lastpos[j - stk[-1].nlastpos].index]);
2800 putc ('\n', stderr);
2801 #endif
2804 /* For each follow set that is the follow set of a real position, replace
2805 it with its epsilon closure. */
2806 epsclosure (d);
2808 dfaoptimize (d);
2810 #ifdef DEBUG
2811 for (idx_t i = 0; i < d->tindex; i++)
2812 if (d->tokens[i] == BEG || d->tokens[i] < NOTCHAR
2813 || d->tokens[i] == BACKREF || d->tokens[i] == ANYCHAR
2814 || d->tokens[i] == MBCSET || d->tokens[i] >= CSET)
2816 fprintf (stderr, "follows(%td:", i);
2817 prtok (d->tokens[i]);
2818 fprintf (stderr, "):");
2819 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2821 fprintf (stderr, " %td:", d->follows[i].elems[j].index);
2822 prtok (d->tokens[d->follows[i].elems[j].index]);
2824 putc ('\n', stderr);
2826 #endif
2828 pos.index = 0;
2829 pos.constraint = NO_CONSTRAINT;
2831 alloc_position_set (&tmp, 1);
2833 append (pos, &tmp);
2835 d->separates = xnmalloc (d->tindex, sizeof *d->separates);
2837 for (idx_t i = 0; i < d->tindex; i++)
2839 d->separates[i] = 0;
2841 if (prev_newline_dependent (d->constraints[i]))
2842 d->separates[i] |= CTX_NEWLINE;
2843 if (prev_letter_dependent (d->constraints[i]))
2844 d->separates[i] |= CTX_LETTER;
2846 for (idx_t j = 0; j < d->follows[i].nelem; j++)
2848 if (prev_newline_dependent (d->follows[i].elems[j].constraint))
2849 d->separates[i] |= CTX_NEWLINE;
2850 if (prev_letter_dependent (d->follows[i].elems[j].constraint))
2851 d->separates[i] |= CTX_LETTER;
2855 /* Context wanted by some position. */
2856 int separate_contexts = state_separate_contexts (d, &tmp);
2858 /* Build the initial state. */
2859 if (separate_contexts & CTX_NEWLINE)
2860 state_index (d, &tmp, CTX_NEWLINE);
2861 d->initstate_notbol = d->min_trcount
2862 = state_index (d, &tmp, separate_contexts ^ CTX_ANY);
2863 if (separate_contexts & CTX_LETTER)
2864 d->min_trcount = state_index (d, &tmp, CTX_LETTER);
2865 d->min_trcount++;
2866 d->trcount = 0;
2868 free (posalloc);
2869 free (stkalloc);
2870 free (merged.elems);
2871 free (tmp.elems);
2874 /* Make sure D's state arrays are large enough to hold NEW_STATE. */
2875 static void
2876 realloc_trans_if_necessary (struct dfa *d)
2878 state_num oldalloc = d->tralloc;
2879 if (oldalloc < d->sindex)
2881 state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2882 idx_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2883 realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2884 -1, sizeof *realtrans);
2885 realtrans[0] = realtrans[1] = NULL;
2886 d->trans = realtrans + 2;
2887 idx_t newalloc = d->tralloc = newalloc1 - 2;
2888 d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
2889 d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
2890 d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
2891 if (d->localeinfo.multibyte)
2893 realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2894 realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
2895 if (oldalloc == 0)
2896 realtrans[0] = realtrans[1] = NULL;
2897 d->mb_trans = realtrans + 2;
2899 for (; oldalloc < newalloc; oldalloc++)
2901 d->trans[oldalloc] = NULL;
2902 d->fails[oldalloc] = NULL;
2903 if (d->localeinfo.multibyte)
2904 d->mb_trans[oldalloc] = NULL;
2910 Calculate the transition table for a new state derived from state s
2911 for a compiled dfa d after input character uc, and return the new
2912 state number.
2914 Do not worry about all possible input characters; calculate just the group
2915 of positions that match uc. Label it with the set of characters that
2916 every position in the group matches (taking into account, if necessary,
2917 preceding context information of s). Then find the union
2918 of these positions' follows, i.e., the set of positions of the
2919 new state. For each character in the group's label, set the transition
2920 on this character to be to a state corresponding to the set's positions,
2921 and its associated backward context information, if necessary.
2923 When building a searching matcher, include the positions of state
2924 0 in every state.
2926 The group is constructed by building an equivalence-class
2927 partition of the positions of s.
2929 For each position, find the set of characters C that it matches. Eliminate
2930 any characters from C that fail on grounds of backward context.
2932 Check whether the group's label L has nonempty
2933 intersection with C. If L - C is nonempty, create a new group labeled
2934 L - C and having the same positions as the current group, and set L to
2935 the intersection of L and C. Insert the position in the group, set
2936 C = C - L, and resume scanning.
2938 If after comparing with every group there are characters remaining in C,
2939 create a new group labeled with the characters of C and insert this
2940 position in that group. */
2942 static state_num
2943 build_state (state_num s, struct dfa *d, unsigned char uc)
2945 position_set follows; /* Union of the follows for each
2946 position of the current state. */
2947 position_set group; /* Positions that match the input char. */
2948 position_set tmp; /* Temporary space for merging sets. */
2949 state_num state; /* New state. */
2950 state_num state_newline; /* New state on a newline transition. */
2951 state_num state_letter; /* New state on a letter transition. */
2953 #ifdef DEBUG
2954 fprintf (stderr, "build state %td\n", s);
2955 #endif
2957 /* A pointer to the new transition table, and the table itself. */
2958 state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
2959 state_num *trans = *ptrans;
2961 if (!trans)
2963 /* MAX_TRCOUNT is an arbitrary upper limit on the number of
2964 transition tables that can exist at once, other than for
2965 initial states. Often-used transition tables are quickly
2966 rebuilt, whereas rarely-used ones are cleared away. */
2967 if (MAX_TRCOUNT <= d->trcount)
2969 for (state_num i = d->min_trcount; i < d->tralloc; i++)
2971 free (d->trans[i]);
2972 free (d->fails[i]);
2973 d->trans[i] = d->fails[i] = NULL;
2975 d->trcount = 0;
2978 d->trcount++;
2979 *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans);
2981 /* Fill transition table with a default value which means that the
2982 transited state has not been calculated yet. */
2983 for (int i = 0; i < NOTCHAR; i++)
2984 trans[i] = -2;
2987 /* Set up the success bits for this state. */
2988 d->success[s] = 0;
2989 if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
2990 d->success[s] |= CTX_NEWLINE;
2991 if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
2992 d->success[s] |= CTX_LETTER;
2993 if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
2994 d->success[s] |= CTX_NONE;
2996 alloc_position_set (&follows, d->nleaves);
2998 /* Find the union of the follows of the positions of the group.
2999 This is a hideously inefficient loop. Fix it someday. */
3000 for (idx_t j = 0; j < d->states[s].elems.nelem; j++)
3001 for (idx_t k = 0;
3002 k < d->follows[d->states[s].elems.elems[j].index].nelem; ++k)
3003 insert (d->follows[d->states[s].elems.elems[j].index].elems[k],
3004 &follows);
3006 /* Positions that match the input char. */
3007 alloc_position_set (&group, d->nleaves);
3009 /* The group's label. */
3010 charclass label;
3011 fillset (&label);
3013 for (idx_t i = 0; i < follows.nelem; i++)
3015 charclass matches; /* Set of matching characters. */
3016 position pos = follows.elems[i];
3017 bool matched = false;
3018 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
3020 zeroset (&matches);
3021 setbit (d->tokens[pos.index], &matches);
3022 if (d->tokens[pos.index] == uc)
3023 matched = true;
3025 else if (d->tokens[pos.index] >= CSET)
3027 matches = d->charclasses[d->tokens[pos.index] - CSET];
3028 if (tstbit (uc, &matches))
3029 matched = true;
3031 else if (d->tokens[pos.index] == ANYCHAR)
3033 matches = d->charclasses[d->canychar];
3034 if (tstbit (uc, &matches))
3035 matched = true;
3037 /* ANYCHAR must match with a single character, so we must put
3038 it to D->states[s].mbps which contains the positions which
3039 can match with a single character not a byte. If all
3040 positions which has ANYCHAR does not depend on context of
3041 next character, we put the follows instead of it to
3042 D->states[s].mbps to optimize. */
3043 if (succeeds_in_context (pos.constraint, d->states[s].context,
3044 CTX_NONE))
3046 if (d->states[s].mbps.nelem == 0)
3047 alloc_position_set (&d->states[s].mbps, 1);
3048 insert (pos, &d->states[s].mbps);
3051 else
3052 continue;
3054 /* Some characters may need to be eliminated from matches because
3055 they fail in the current context. */
3056 if (pos.constraint != NO_CONSTRAINT)
3058 if (!succeeds_in_context (pos.constraint,
3059 d->states[s].context, CTX_NEWLINE))
3060 for (int j = 0; j < CHARCLASS_WORDS; j++)
3061 matches.w[j] &= ~d->syntax.newline.w[j];
3062 if (!succeeds_in_context (pos.constraint,
3063 d->states[s].context, CTX_LETTER))
3064 for (int j = 0; j < CHARCLASS_WORDS; ++j)
3065 matches.w[j] &= ~d->syntax.letters.w[j];
3066 if (!succeeds_in_context (pos.constraint,
3067 d->states[s].context, CTX_NONE))
3068 for (int j = 0; j < CHARCLASS_WORDS; ++j)
3069 matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
3071 /* If there are no characters left, there's no point in going on. */
3072 if (emptyset (&matches))
3073 continue;
3075 /* If we have reset the bit that made us declare "matched", reset
3076 that indicator, too. This is required to avoid an infinite loop
3077 with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]' */
3078 if (!tstbit (uc, &matches))
3079 matched = false;
3082 #ifdef DEBUG
3083 fprintf (stderr, " nextpos %td:", pos.index);
3084 prtok (d->tokens[pos.index]);
3085 fprintf (stderr, " of");
3086 for (unsigned j = 0; j < NOTCHAR; j++)
3087 if (tstbit (j, &matches))
3088 fprintf (stderr, " 0x%02x", j);
3089 fprintf (stderr, "\n");
3090 #endif
3092 if (matched)
3094 for (int k = 0; k < CHARCLASS_WORDS; ++k)
3095 label.w[k] &= matches.w[k];
3096 append (pos, &group);
3098 else
3100 for (int k = 0; k < CHARCLASS_WORDS; ++k)
3101 label.w[k] &= ~matches.w[k];
3105 alloc_position_set (&tmp, d->nleaves);
3107 if (group.nelem > 0)
3109 /* If we are building a searching matcher, throw in the positions
3110 of state 0 as well, if possible. */
3111 if (d->searchflag)
3113 /* If a token in follows.elems is not 1st byte of a multibyte
3114 character, or the states of follows must accept the bytes
3115 which are not 1st byte of the multibyte character.
3116 Then, if a state of follows encounters a byte, it must not be
3117 a 1st byte of a multibyte character nor a single byte character.
3118 In this case, do not add state[0].follows to next state, because
3119 state[0] must accept 1st-byte.
3121 For example, suppose <sb a> is a certain single byte character,
3122 <mb A> is a certain multibyte character, and the codepoint of
3123 <sb a> equals the 2nd byte of the codepoint of <mb A>. When
3124 state[0] accepts <sb a>, state[i] transits to state[i+1] by
3125 accepting the 1st byte of <mb A>, and state[i+1] accepts the
3126 2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3127 <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3128 not add state[0]. */
3130 bool mergeit = !d->localeinfo.multibyte;
3131 if (!mergeit)
3133 mergeit = true;
3134 for (idx_t j = 0; mergeit && j < group.nelem; j++)
3135 mergeit &= d->multibyte_prop[group.elems[j].index];
3137 if (mergeit)
3139 merge (&d->states[0].elems, &group, &tmp);
3140 copy (&tmp, &group);
3144 /* Find out if the new state will want any context information,
3145 by calculating possible contexts that the group can match,
3146 and separate contexts that the new state wants to know. */
3147 int possible_contexts = charclass_context (d, &label);
3148 int separate_contexts = state_separate_contexts (d, &group);
3150 /* Find the state(s) corresponding to the union of the follows. */
3151 if (possible_contexts & ~separate_contexts)
3152 state = state_index (d, &group, separate_contexts ^ CTX_ANY);
3153 else
3154 state = -1;
3155 if (separate_contexts & possible_contexts & CTX_NEWLINE)
3156 state_newline = state_index (d, &group, CTX_NEWLINE);
3157 else
3158 state_newline = state;
3159 if (separate_contexts & possible_contexts & CTX_LETTER)
3160 state_letter = state_index (d, &group, CTX_LETTER);
3161 else
3162 state_letter = state;
3164 /* Reallocate now, to reallocate any newline transition properly. */
3165 realloc_trans_if_necessary (d);
3168 /* If we are a searching matcher, the default transition is to a state
3169 containing the positions of state 0, otherwise the default transition
3170 is to fail miserably. */
3171 else if (d->searchflag)
3173 state_newline = 0;
3174 state_letter = d->min_trcount - 1;
3175 state = d->initstate_notbol;
3177 else
3179 state_newline = -1;
3180 state_letter = -1;
3181 state = -1;
3184 /* Set the transitions for each character in the label. */
3185 for (int i = 0; i < NOTCHAR; i++)
3186 if (tstbit (i, &label))
3187 switch (d->syntax.sbit[i])
3189 case CTX_NEWLINE:
3190 trans[i] = state_newline;
3191 break;
3192 case CTX_LETTER:
3193 trans[i] = state_letter;
3194 break;
3195 default:
3196 trans[i] = state;
3197 break;
3200 #ifdef DEBUG
3201 fprintf (stderr, "trans table %td", s);
3202 for (int i = 0; i < NOTCHAR; ++i)
3204 if (!(i & 0xf))
3205 fprintf (stderr, "\n");
3206 fprintf (stderr, " %2td", trans[i]);
3208 fprintf (stderr, "\n");
3209 #endif
3211 free (group.elems);
3212 free (follows.elems);
3213 free (tmp.elems);
3215 /* Keep the newline transition in a special place so we can use it as
3216 a sentinel. */
3217 if (tstbit (d->syntax.eolbyte, &label))
3219 d->newlines[s] = trans[d->syntax.eolbyte];
3220 trans[d->syntax.eolbyte] = -1;
3223 return trans[uc];
3226 /* Multibyte character handling sub-routines for dfaexec. */
3228 /* Consume a single byte and transit state from 's' to '*next_state'.
3229 This function is almost same as the state transition routin in dfaexec.
3230 But state transition is done just once, otherwise matching succeed or
3231 reach the end of the buffer. */
3232 static state_num
3233 transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
3235 state_num *t;
3237 if (d->trans[s])
3238 t = d->trans[s];
3239 else if (d->fails[s])
3240 t = d->fails[s];
3241 else
3243 build_state (s, d, **pp);
3244 if (d->trans[s])
3245 t = d->trans[s];
3246 else
3248 t = d->fails[s];
3249 assert (t);
3253 if (t[**pp] == -2)
3254 build_state (s, d, **pp);
3256 return t[*(*pp)++];
3259 /* Transit state from s, then return new state and update the pointer of
3260 the buffer. This function is for a period operator which can match a
3261 multi-byte character. */
3262 static state_num
3263 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
3264 unsigned char const *end)
3266 wint_t wc;
3268 int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
3270 /* This state has some operators which can match a multibyte character. */
3271 d->mb_follows.nelem = 0;
3273 /* Calculate the state which can be reached from the state 's' by
3274 consuming 'mbclen' single bytes from the buffer. */
3275 state_num s1 = s;
3276 int mbci;
3277 for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++)
3278 s = transit_state_singlebyte (d, s, pp);
3279 *pp += mbclen - mbci;
3281 if (wc == WEOF)
3283 /* It is an invalid character, so ANYCHAR is not accepted. */
3284 return s;
3287 /* If all positions which have ANYCHAR do not depend on the context
3288 of the next character, calculate the next state with
3289 pre-calculated follows and cache the result. */
3290 if (d->states[s1].mb_trindex < 0)
3292 if (MAX_TRCOUNT <= d->mb_trcount)
3294 state_num s3;
3295 for (s3 = -1; s3 < d->tralloc; s3++)
3297 free (d->mb_trans[s3]);
3298 d->mb_trans[s3] = NULL;
3301 for (state_num i = 0; i < d->sindex; i++)
3302 d->states[i].mb_trindex = -1;
3303 d->mb_trcount = 0;
3305 d->states[s1].mb_trindex = d->mb_trcount++;
3308 if (! d->mb_trans[s])
3310 enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3311 enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3312 d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3313 for (int i = 0; i < MAX_TRCOUNT; i++)
3314 d->mb_trans[s][i] = -1;
3316 else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3317 return d->mb_trans[s][d->states[s1].mb_trindex];
3319 if (s == -1)
3320 copy (&d->states[s1].mbps, &d->mb_follows);
3321 else
3322 merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3324 int separate_contexts = state_separate_contexts (d, &d->mb_follows);
3325 state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3326 realloc_trans_if_necessary (d);
3328 d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3330 return s2;
3333 /* The initial state may encounter a byte which is not a single byte character
3334 nor the first byte of a multibyte character. But it is incorrect for the
3335 initial state to accept such a byte. For example, in Shift JIS the regular
3336 expression "\\" accepts the codepoint 0x5c, but should not accept the second
3337 byte of the codepoint 0x815c. Then the initial state must skip the bytes
3338 that are not a single byte character nor the first byte of a multibyte
3339 character.
3341 Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3342 or exceeds P, and return the advanced MBP. If WCP is non-NULL and
3343 the result is greater than P, set *WCP to the final wide character
3344 processed, or to WEOF if no wide character is processed. Otherwise,
3345 if WCP is non-NULL, *WCP may or may not be updated.
3347 Both P and MBP must be no larger than END. */
3348 static unsigned char const *
3349 skip_remains_mb (struct dfa *d, unsigned char const *p,
3350 unsigned char const *mbp, char const *end)
3352 if (d->syntax.never_trail[*p])
3353 return p;
3354 while (mbp < p)
3356 wint_t wc;
3357 mbp += mbs_to_wchar (&wc, (char const *) mbp,
3358 end - (char const *) mbp, d);
3360 return mbp;
3363 /* Search through a buffer looking for a match to the struct dfa *D.
3364 Find the first occurrence of a string matching the regexp in the
3365 buffer, and the shortest possible version thereof. Return a pointer to
3366 the first character after the match, or NULL if none is found. BEGIN
3367 points to the beginning of the buffer, and END points to the first byte
3368 after its end. Note however that we store a sentinel byte (usually
3369 newline) in *END, so the actual buffer must be one byte longer.
3370 When ALLOW_NL, newlines may appear in the matching string.
3371 If COUNT is non-NULL, increment *COUNT once for each newline processed.
3372 If MULTIBYTE, the input consists of multibyte characters and/or
3373 encoding-error bytes. Otherwise, it consists of single-byte characters.
3374 Here is the list of features that make this DFA matcher punt:
3375 - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3376 - [^...] in non-simple locale
3377 - [[=foo=]] or [[.foo.]]
3378 - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3379 - back-reference: (.)\1
3380 - word-delimiter in multibyte locale: \<, \>, \b, \B
3381 See struct localeinfo.simple for the definition of "simple locale". */
3383 static inline char *
3384 dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
3385 ptrdiff_t *count, bool multibyte)
3387 if (MAX_TRCOUNT <= d->sindex)
3389 for (state_num s = d->min_trcount; s < d->sindex; s++)
3391 free (d->states[s].elems.elems);
3392 free (d->states[s].mbps.elems);
3394 d->sindex = d->min_trcount;
3396 if (d->trans)
3398 for (state_num s = 0; s < d->tralloc; s++)
3400 free (d->trans[s]);
3401 free (d->fails[s]);
3402 d->trans[s] = d->fails[s] = NULL;
3404 d->trcount = 0;
3407 if (d->localeinfo.multibyte && d->mb_trans)
3409 for (state_num s = -1; s < d->tralloc; s++)
3411 free (d->mb_trans[s]);
3412 d->mb_trans[s] = NULL;
3414 for (state_num s = 0; s < d->min_trcount; s++)
3415 d->states[s].mb_trindex = -1;
3416 d->mb_trcount = 0;
3420 if (!d->tralloc)
3421 realloc_trans_if_necessary (d);
3423 /* Current state. */
3424 state_num s = 0, s1 = 0;
3426 /* Current input character. */
3427 unsigned char const *p = (unsigned char const *) begin;
3428 unsigned char const *mbp = p;
3430 /* Copy of d->trans so it can be optimized into a register. */
3431 state_num **trans = d->trans;
3432 unsigned char eol = d->syntax.eolbyte; /* Likewise for eolbyte. */
3433 unsigned char saved_end = *(unsigned char *) end;
3434 *end = eol;
3436 if (multibyte)
3438 memset (&d->mbs, 0, sizeof d->mbs);
3439 if (d->mb_follows.alloc == 0)
3440 alloc_position_set (&d->mb_follows, d->nleaves);
3443 idx_t nlcount = 0;
3444 for (;;)
3446 state_num *t;
3447 while ((t = trans[s]) != NULL)
3449 if (s < d->min_trcount)
3451 if (!multibyte || d->states[s].mbps.nelem == 0)
3453 while (t[*p] == s)
3454 p++;
3456 if (multibyte)
3457 p = mbp = skip_remains_mb (d, p, mbp, end);
3460 if (multibyte)
3462 s1 = s;
3464 if (d->states[s].mbps.nelem == 0
3465 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3467 /* If an input character does not match ANYCHAR, do it
3468 like a single-byte character. */
3469 s = t[*p++];
3471 else
3473 s = transit_state (d, s, &p, (unsigned char *) end);
3474 mbp = p;
3475 trans = d->trans;
3478 else
3480 s1 = t[*p++];
3481 t = trans[s1];
3482 if (! t)
3484 state_num tmp = s;
3485 s = s1;
3486 s1 = tmp; /* swap */
3487 break;
3489 if (s < d->min_trcount)
3491 while (t[*p] == s1)
3492 p++;
3494 s = t[*p++];
3498 if (s < 0)
3500 if (s == -2)
3502 s = build_state (s1, d, p[-1]);
3503 trans = d->trans;
3505 else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3507 /* The previous character was a newline. Count it, and skip
3508 checking of multibyte character boundary until here. */
3509 nlcount++;
3510 mbp = p;
3512 s = (allow_nl ? d->newlines[s1]
3513 : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3514 : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3515 : d->initstate_notbol);
3517 else
3519 p = NULL;
3520 goto done;
3523 else if (d->fails[s])
3525 if ((d->success[s] & d->syntax.sbit[*p])
3526 || ((char *) p == end
3527 && accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3528 d)))
3529 goto done;
3531 if (multibyte && s < d->min_trcount)
3532 p = mbp = skip_remains_mb (d, p, mbp, end);
3534 s1 = s;
3535 if (!multibyte || d->states[s].mbps.nelem == 0
3536 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3538 /* If a input character does not match ANYCHAR, do it
3539 like a single-byte character. */
3540 s = d->fails[s][*p++];
3542 else
3544 s = transit_state (d, s, &p, (unsigned char *) end);
3545 mbp = p;
3546 trans = d->trans;
3549 else
3551 build_state (s, d, p[0]);
3552 trans = d->trans;
3556 done:
3557 if (count)
3558 *count += nlcount;
3559 *end = saved_end;
3560 return (char *) p;
3563 /* Specialized versions of dfaexec for multibyte and single-byte cases.
3564 This is for performance, as dfaexec_main is an inline function. */
3566 static char *
3567 dfaexec_mb (struct dfa *d, char const *begin, char *end,
3568 bool allow_nl, ptrdiff_t *count, bool *backref)
3570 return dfaexec_main (d, begin, end, allow_nl, count, true);
3573 static char *
3574 dfaexec_sb (struct dfa *d, char const *begin, char *end,
3575 bool allow_nl, ptrdiff_t *count, bool *backref)
3577 return dfaexec_main (d, begin, end, allow_nl, count, false);
3580 /* Always set *BACKREF and return BEGIN. Use this wrapper for
3581 any regexp that uses a construct not supported by this code. */
3582 static char *
3583 dfaexec_noop (struct dfa *d, char const *begin, char *end,
3584 bool allow_nl, ptrdiff_t *count, bool *backref)
3586 *backref = true;
3587 return (char *) begin;
3590 /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3591 but faster and set *BACKREF if the DFA code does not support this
3592 regexp usage. */
3594 char *
3595 dfaexec (struct dfa *d, char const *begin, char *end,
3596 bool allow_nl, ptrdiff_t *count, bool *backref)
3598 return d->dfaexec (d, begin, end, allow_nl, count, backref);
3601 struct dfa *
3602 dfasuperset (struct dfa const *d)
3604 return d->superset;
3607 bool
3608 dfaisfast (struct dfa const *d)
3610 return d->fast;
3613 static void
3614 free_mbdata (struct dfa *d)
3616 free (d->multibyte_prop);
3617 free (d->lex.brack.chars);
3618 free (d->mb_follows.elems);
3620 if (d->mb_trans)
3622 state_num s;
3623 for (s = -1; s < d->tralloc; s++)
3624 free (d->mb_trans[s]);
3625 free (d->mb_trans - 2);
3629 /* Return true if every construct in D is supported by this DFA matcher. */
3630 static bool _GL_ATTRIBUTE_PURE
3631 dfa_supported (struct dfa const *d)
3633 for (idx_t i = 0; i < d->tindex; i++)
3635 switch (d->tokens[i])
3637 case BEGWORD:
3638 case ENDWORD:
3639 case LIMWORD:
3640 case NOTLIMWORD:
3641 if (!d->localeinfo.multibyte)
3642 continue;
3643 FALLTHROUGH;
3644 case BACKREF:
3645 case MBCSET:
3646 return false;
3649 return true;
3652 /* Disable use of the superset DFA if it is not likely to help
3653 performance. */
3654 static void
3655 maybe_disable_superset_dfa (struct dfa *d)
3657 if (!d->localeinfo.using_utf8)
3658 return;
3660 bool have_backref = false;
3661 for (idx_t i = 0; i < d->tindex; i++)
3663 switch (d->tokens[i])
3665 case ANYCHAR:
3666 /* Lowered. */
3667 abort ();
3668 case BACKREF:
3669 have_backref = true;
3670 break;
3671 case MBCSET:
3672 /* Requires multi-byte algorithm. */
3673 return;
3674 default:
3675 break;
3679 if (!have_backref && d->superset)
3681 /* The superset DFA is not likely to be much faster, so remove it. */
3682 dfafree (d->superset);
3683 free (d->superset);
3684 d->superset = NULL;
3687 free_mbdata (d);
3688 d->localeinfo.multibyte = false;
3689 d->dfaexec = dfaexec_sb;
3690 d->fast = true;
3693 static void
3694 dfassbuild (struct dfa *d)
3696 struct dfa *sup = dfaalloc ();
3698 *sup = *d;
3699 sup->localeinfo.multibyte = false;
3700 sup->dfaexec = dfaexec_sb;
3701 sup->multibyte_prop = NULL;
3702 sup->superset = NULL;
3703 sup->states = NULL;
3704 sup->sindex = 0;
3705 sup->constraints = NULL;
3706 sup->separates = NULL;
3707 sup->follows = NULL;
3708 sup->tralloc = 0;
3709 sup->trans = NULL;
3710 sup->fails = NULL;
3711 sup->success = NULL;
3712 sup->newlines = NULL;
3714 sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3715 if (d->cindex)
3717 memcpy (sup->charclasses, d->charclasses,
3718 d->cindex * sizeof *sup->charclasses);
3721 sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3722 sup->talloc = d->tindex * 2;
3724 bool have_achar = false;
3725 bool have_nchar = false;
3726 idx_t j;
3727 for (idx_t i = j = 0; i < d->tindex; i++)
3729 switch (d->tokens[i])
3731 case ANYCHAR:
3732 case MBCSET:
3733 case BACKREF:
3735 charclass ccl;
3736 fillset (&ccl);
3737 sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3738 sup->tokens[j++] = STAR;
3739 if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3740 || d->tokens[i + 1] == PLUS)
3741 i++;
3742 have_achar = true;
3744 break;
3745 case BEGWORD:
3746 case ENDWORD:
3747 case LIMWORD:
3748 case NOTLIMWORD:
3749 if (d->localeinfo.multibyte)
3751 /* These constraints aren't supported in a multibyte locale.
3752 Ignore them in the superset DFA. */
3753 sup->tokens[j++] = EMPTY;
3754 break;
3756 FALLTHROUGH;
3757 default:
3758 sup->tokens[j++] = d->tokens[i];
3759 if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3760 || d->tokens[i] >= CSET)
3761 have_nchar = true;
3762 break;
3765 sup->tindex = j;
3767 if (have_nchar && (have_achar || d->localeinfo.multibyte))
3768 d->superset = sup;
3769 else
3771 dfafree (sup);
3772 free (sup);
3776 /* Parse a string S of length LEN into D (but skip this step if S is null).
3777 Then analyze D and build a matcher for it.
3778 SEARCHFLAG says whether to build a searching or an exact matcher. */
3779 void
3780 dfacomp (char const *s, idx_t len, struct dfa *d, bool searchflag)
3782 if (s != NULL)
3783 dfaparse (s, len, d);
3785 dfassbuild (d);
3787 if (dfa_supported (d))
3789 maybe_disable_superset_dfa (d);
3790 dfaanalyze (d, searchflag);
3792 else
3794 d->dfaexec = dfaexec_noop;
3797 if (d->superset)
3799 d->fast = true;
3800 dfaanalyze (d->superset, searchflag);
3804 /* Free the storage held by the components of a dfa. */
3805 void
3806 dfafree (struct dfa *d)
3808 free (d->charclasses);
3809 free (d->tokens);
3811 if (d->localeinfo.multibyte)
3812 free_mbdata (d);
3814 free (d->constraints);
3815 free (d->separates);
3817 for (idx_t i = 0; i < d->sindex; i++)
3819 free (d->states[i].elems.elems);
3820 free (d->states[i].mbps.elems);
3822 free (d->states);
3824 if (d->follows)
3826 for (idx_t i = 0; i < d->tindex; i++)
3827 free (d->follows[i].elems);
3828 free (d->follows);
3831 if (d->trans)
3833 for (idx_t i = 0; i < d->tralloc; i++)
3835 free (d->trans[i]);
3836 free (d->fails[i]);
3839 free (d->trans - 2);
3840 free (d->fails);
3841 free (d->newlines);
3842 free (d->success);
3845 if (d->superset)
3847 dfafree (d->superset);
3848 free (d->superset);
3852 /* Having found the postfix representation of the regular expression,
3853 try to find a long sequence of characters that must appear in any line
3854 containing the r.e.
3855 Finding a "longest" sequence is beyond the scope here;
3856 we take an easy way out and hope for the best.
3857 (Take "(ab|a)b"--please.)
3859 We do a bottom-up calculation of sequences of characters that must appear
3860 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3861 representation:
3862 sequences that must appear at the left of the match ("left")
3863 sequences that must appear at the right of the match ("right")
3864 lists of sequences that must appear somewhere in the match ("in")
3865 sequences that must constitute the match ("is")
3867 When we get to the root of the tree, we use one of the longest of its
3868 calculated "in" sequences as our answer.
3870 The sequences calculated for the various types of node (in pseudo ANSI c)
3871 are shown below. "p" is the operand of unary operators (and the left-hand
3872 operand of binary operators); "q" is the right-hand operand of binary
3873 operators.
3875 "ZERO" means "a zero-length sequence" below.
3877 Type left right is in
3878 ---- ---- ----- -- --
3879 char c # c # c # c # c
3881 ANYCHAR ZERO ZERO ZERO ZERO
3883 MBCSET ZERO ZERO ZERO ZERO
3885 CSET ZERO ZERO ZERO ZERO
3887 STAR ZERO ZERO ZERO ZERO
3889 QMARK ZERO ZERO ZERO ZERO
3891 PLUS p->left p->right ZERO p->in
3893 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3894 p->left : q->right : q->is!=ZERO) ? q->in plus
3895 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3896 ZERO
3898 OR longest common longest common (do p->is and substrings common
3899 leading trailing to q->is have same p->in and
3900 (sub)sequence (sub)sequence q->in length and content) ?
3901 of p->left of p->right
3902 and q->left and q->right p->is : NULL
3904 If there's anything else we recognize in the tree, all four sequences get set
3905 to zero-length sequences. If there's something we don't recognize in the
3906 tree, we just return a zero-length sequence.
3908 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3909 'aaa')?
3911 And ... is it here or someplace that we might ponder "optimizations" such as
3912 egrep 'psi|epsilon' -> egrep 'psi'
3913 egrep 'pepsi|epsilon' -> egrep 'epsi'
3914 (Yes, we now find "epsi" as a "string
3915 that must occur", but we might also
3916 simplify the *entire* r.e. being sought)
3917 grep '[c]' -> grep 'c'
3918 grep '(ab|a)b' -> grep 'ab'
3919 grep 'ab*' -> grep 'a'
3920 grep 'a*b' -> grep 'b'
3922 There are several issues:
3924 Is optimization easy (enough)?
3926 Does optimization actually accomplish anything,
3927 or is the automaton you get from "psi|epsilon" (for example)
3928 the same as the one you get from "psi" (for example)?
3930 Are optimizable r.e.'s likely to be used in real-life situations
3931 (something like 'ab*' is probably unlikely; something like is
3932 'psi|epsilon' is likelier)? */
3934 static char *
3935 icatalloc (char *old, char const *new)
3937 idx_t newsize = strlen (new);
3938 if (newsize == 0)
3939 return old;
3940 idx_t oldsize = strlen (old);
3941 char *result = xrealloc (old, oldsize + newsize + 1);
3942 memcpy (result + oldsize, new, newsize + 1);
3943 return result;
3946 static void
3947 freelist (char **cpp)
3949 while (*cpp)
3950 free (*cpp++);
3953 static char **
3954 enlist (char **cpp, char *new, idx_t len)
3956 new = memcpy (xmalloc (len + 1), new, len);
3957 new[len] = '\0';
3958 /* Is there already something in the list that's new (or longer)? */
3959 idx_t i;
3960 for (i = 0; cpp[i] != NULL; i++)
3961 if (strstr (cpp[i], new) != NULL)
3963 free (new);
3964 return cpp;
3966 /* Eliminate any obsoleted strings. */
3967 for (idx_t j = 0; cpp[j] != NULL; )
3968 if (strstr (new, cpp[j]) == NULL)
3969 ++j;
3970 else
3972 free (cpp[j]);
3973 if (--i == j)
3974 break;
3975 cpp[j] = cpp[i];
3976 cpp[i] = NULL;
3978 /* Add the new string. */
3979 cpp = xnrealloc (cpp, i + 2, sizeof *cpp);
3980 cpp[i] = new;
3981 cpp[i + 1] = NULL;
3982 return cpp;
3985 /* Given pointers to two strings, return a pointer to an allocated
3986 list of their distinct common substrings. */
3987 static char **
3988 comsubs (char *left, char const *right)
3990 char **cpp = xzalloc (sizeof *cpp);
3992 for (char *lcp = left; *lcp != '\0'; lcp++)
3994 idx_t len = 0;
3995 char *rcp = strchr (right, *lcp);
3996 while (rcp != NULL)
3998 idx_t i;
3999 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
4000 continue;
4001 if (i > len)
4002 len = i;
4003 rcp = strchr (rcp + 1, *lcp);
4005 if (len != 0)
4006 cpp = enlist (cpp, lcp, len);
4008 return cpp;
4011 static char **
4012 addlists (char **old, char **new)
4014 for (; *new; new++)
4015 old = enlist (old, *new, strlen (*new));
4016 return old;
4019 /* Given two lists of substrings, return a new list giving substrings
4020 common to both. */
4021 static char **
4022 inboth (char **left, char **right)
4024 char **both = xzalloc (sizeof *both);
4026 for (idx_t lnum = 0; left[lnum] != NULL; lnum++)
4028 for (idx_t rnum = 0; right[rnum] != NULL; rnum++)
4030 char **temp = comsubs (left[lnum], right[rnum]);
4031 both = addlists (both, temp);
4032 freelist (temp);
4033 free (temp);
4036 return both;
4039 typedef struct must must;
4041 struct must
4043 char **in;
4044 char *left;
4045 char *right;
4046 char *is;
4047 bool begline;
4048 bool endline;
4049 must *prev;
4052 static must *
4053 allocmust (must *mp, idx_t size)
4055 must *new_mp = xmalloc (sizeof *new_mp);
4056 new_mp->in = xzalloc (sizeof *new_mp->in);
4057 new_mp->left = xzalloc (size);
4058 new_mp->right = xzalloc (size);
4059 new_mp->is = xzalloc (size);
4060 new_mp->begline = false;
4061 new_mp->endline = false;
4062 new_mp->prev = mp;
4063 return new_mp;
4066 static void
4067 resetmust (must *mp)
4069 freelist (mp->in);
4070 mp->in[0] = NULL;
4071 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
4072 mp->begline = false;
4073 mp->endline = false;
4076 static void
4077 freemust (must *mp)
4079 freelist (mp->in);
4080 free (mp->in);
4081 free (mp->left);
4082 free (mp->right);
4083 free (mp->is);
4084 free (mp);
4087 struct dfamust *
4088 dfamust (struct dfa const *d)
4090 must *mp = NULL;
4091 char const *result = "";
4092 bool exact = false;
4093 bool begline = false;
4094 bool endline = false;
4095 bool need_begline = false;
4096 bool need_endline = false;
4097 bool case_fold_unibyte = d->syntax.case_fold & !d->localeinfo.multibyte;
4099 for (idx_t ri = 1; ri + 1 < d->tindex; ri++)
4101 token t = d->tokens[ri];
4102 switch (t)
4104 case BEGLINE:
4105 mp = allocmust (mp, 2);
4106 mp->begline = true;
4107 need_begline = true;
4108 break;
4109 case ENDLINE:
4110 mp = allocmust (mp, 2);
4111 mp->endline = true;
4112 need_endline = true;
4113 break;
4114 case LPAREN:
4115 case RPAREN:
4116 assert (!"neither LPAREN nor RPAREN may appear here");
4118 case EMPTY:
4119 case BEGWORD:
4120 case ENDWORD:
4121 case LIMWORD:
4122 case NOTLIMWORD:
4123 case BACKREF:
4124 case ANYCHAR:
4125 case MBCSET:
4126 mp = allocmust (mp, 2);
4127 break;
4129 case STAR:
4130 case QMARK:
4131 resetmust (mp);
4132 break;
4134 case OR:
4136 char **new;
4137 must *rmp = mp;
4138 must *lmp = mp = mp->prev;
4139 idx_t j, ln, rn, n;
4141 /* Guaranteed to be. Unlikely, but ... */
4142 if (streq (lmp->is, rmp->is))
4144 lmp->begline &= rmp->begline;
4145 lmp->endline &= rmp->endline;
4147 else
4149 lmp->is[0] = '\0';
4150 lmp->begline = false;
4151 lmp->endline = false;
4153 /* Left side--easy */
4154 idx_t i = 0;
4155 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
4156 ++i;
4157 lmp->left[i] = '\0';
4158 /* Right side */
4159 ln = strlen (lmp->right);
4160 rn = strlen (rmp->right);
4161 n = ln;
4162 if (n > rn)
4163 n = rn;
4164 for (i = 0; i < n; ++i)
4165 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
4166 break;
4167 for (j = 0; j < i; ++j)
4168 lmp->right[j] = lmp->right[(ln - i) + j];
4169 lmp->right[j] = '\0';
4170 new = inboth (lmp->in, rmp->in);
4171 freelist (lmp->in);
4172 free (lmp->in);
4173 lmp->in = new;
4174 freemust (rmp);
4176 break;
4178 case PLUS:
4179 mp->is[0] = '\0';
4180 break;
4182 case END:
4183 assert (!mp->prev);
4184 for (idx_t i = 0; mp->in[i] != NULL; i++)
4185 if (strlen (mp->in[i]) > strlen (result))
4186 result = mp->in[i];
4187 if (streq (result, mp->is))
4189 if ((!need_begline || mp->begline) && (!need_endline
4190 || mp->endline))
4191 exact = true;
4192 begline = mp->begline;
4193 endline = mp->endline;
4195 goto done;
4197 case CAT:
4199 must *rmp = mp;
4200 must *lmp = mp = mp->prev;
4202 /* In. Everything in left, plus everything in
4203 right, plus concatenation of
4204 left's right and right's left. */
4205 lmp->in = addlists (lmp->in, rmp->in);
4206 if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
4208 idx_t lrlen = strlen (lmp->right);
4209 idx_t rllen = strlen (rmp->left);
4210 char *tp = xmalloc (lrlen + rllen);
4211 memcpy (tp, lmp->right, lrlen);
4212 memcpy (tp + lrlen, rmp->left, rllen);
4213 lmp->in = enlist (lmp->in, tp, lrlen + rllen);
4214 free (tp);
4216 /* Left-hand */
4217 if (lmp->is[0] != '\0')
4218 lmp->left = icatalloc (lmp->left, rmp->left);
4219 /* Right-hand */
4220 if (rmp->is[0] == '\0')
4221 lmp->right[0] = '\0';
4222 lmp->right = icatalloc (lmp->right, rmp->right);
4223 /* Guaranteed to be */
4224 if ((lmp->is[0] != '\0' || lmp->begline)
4225 && (rmp->is[0] != '\0' || rmp->endline))
4227 lmp->is = icatalloc (lmp->is, rmp->is);
4228 lmp->endline = rmp->endline;
4230 else
4232 lmp->is[0] = '\0';
4233 lmp->begline = false;
4234 lmp->endline = false;
4236 freemust (rmp);
4238 break;
4240 case '\0':
4241 /* Not on *my* shift. */
4242 goto done;
4244 default:
4245 if (CSET <= t)
4247 /* If T is a singleton, or if case-folding in a unibyte
4248 locale and T's members all case-fold to the same char,
4249 convert T to one of its members. Otherwise, do
4250 nothing further with T. */
4251 charclass *ccl = &d->charclasses[t - CSET];
4252 int j;
4253 for (j = 0; j < NOTCHAR; j++)
4254 if (tstbit (j, ccl))
4255 break;
4256 if (! (j < NOTCHAR))
4258 mp = allocmust (mp, 2);
4259 break;
4261 t = j;
4262 while (++j < NOTCHAR)
4263 if (tstbit (j, ccl)
4264 && ! (case_fold_unibyte
4265 && toupper (j) == toupper (t)))
4266 break;
4267 if (j < NOTCHAR)
4269 mp = allocmust (mp, 2);
4270 break;
4274 idx_t rj = ri + 2;
4275 if (d->tokens[ri + 1] == CAT)
4277 for (; rj < d->tindex - 1; rj += 2)
4279 if ((rj != ri && (d->tokens[rj] <= 0
4280 || NOTCHAR <= d->tokens[rj]))
4281 || d->tokens[rj + 1] != CAT)
4282 break;
4285 mp = allocmust (mp, ((rj - ri) >> 1) + 1);
4286 mp->is[0] = mp->left[0] = mp->right[0]
4287 = case_fold_unibyte ? toupper (t) : t;
4289 idx_t i;
4290 for (i = 1; ri + 2 < rj; i++)
4292 ri += 2;
4293 t = d->tokens[ri];
4294 mp->is[i] = mp->left[i] = mp->right[i]
4295 = case_fold_unibyte ? toupper (t) : t;
4297 mp->is[i] = mp->left[i] = mp->right[i] = '\0';
4298 mp->in = enlist (mp->in, mp->is, i);
4299 break;
4302 done:;
4304 struct dfamust *dm = NULL;
4305 if (*result)
4307 dm = xmalloc (FLEXSIZEOF (struct dfamust, must, strlen (result) + 1));
4308 dm->exact = exact;
4309 dm->begline = begline;
4310 dm->endline = endline;
4311 strcpy (dm->must, result);
4314 while (mp)
4316 must *prev = mp->prev;
4317 freemust (mp);
4318 mp = prev;
4321 return dm;
4324 void
4325 dfamustfree (struct dfamust *dm)
4327 free (dm);
4330 struct dfa *
4331 dfaalloc (void)
4333 return xmalloc (sizeof (struct dfa));
4336 /* Initialize DFA. */
4337 void
4338 dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
4339 reg_syntax_t bits, int dfaopts)
4341 memset (dfa, 0, offsetof (struct dfa, dfaexec));
4342 dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4343 dfa->localeinfo = *linfo;
4345 dfa->fast = !dfa->localeinfo.multibyte;
4347 dfa->canychar = -1;
4348 dfa->syntax.syntax_bits_set = true;
4349 dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4350 dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
4351 dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4352 dfa->syntax.syntax_bits = bits;
4354 for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4356 unsigned char uc = i;
4358 dfa->syntax.sbit[uc] = char_context (dfa, uc);
4359 switch (dfa->syntax.sbit[uc])
4361 case CTX_LETTER:
4362 setbit (uc, &dfa->syntax.letters);
4363 break;
4364 case CTX_NEWLINE:
4365 setbit (uc, &dfa->syntax.newline);
4366 break;
4369 /* POSIX requires that the five bytes in "\n\r./" (including the
4370 terminating NUL) cannot occur inside a multibyte character. */
4371 dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4372 ? (uc & 0xc0) != 0x80
4373 : strchr ("\n\r./", uc) != NULL);
4377 /* Initialize TO by copying FROM's syntax settings. */
4378 void
4379 dfacopysyntax (struct dfa *to, struct dfa const *from)
4381 memset (to, 0, offsetof (struct dfa, syntax));
4382 to->canychar = -1;
4383 to->fast = from->fast;
4384 to->syntax = from->syntax;
4385 to->dfaexec = from->dfaexec;
4386 to->localeinfo = from->localeinfo;
4389 /* vim:set shiftwidth=2: */