tests: fix 'invalid path dir' error
[gnulib.git] / lib / dfa.c
blob2b9c80e705d27c39929dc3d1d4af1e80bd6e5b5c
1 /* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2017 Free Software
3 Foundation, Inc.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc.,
18 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
20 /* Written June, 1988 by Mike Haertel
21 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
23 #include <config.h>
25 #include "dfa.h"
27 #include <assert.h>
28 #include <ctype.h>
29 #include <stdint.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <limits.h>
33 #include <string.h>
34 #include <locale.h>
36 static bool
37 streq (char const *a, char const *b)
39 return strcmp (a, b) == 0;
42 static bool
43 isasciidigit (char c)
45 return '0' <= c && c <= '9';
48 #include "gettext.h"
49 #define _(str) gettext (str)
51 #include <wchar.h>
53 #include "intprops.h"
54 #include "xalloc.h"
55 #include "localeinfo.h"
57 #ifndef FALLTHROUGH
58 # if __GNUC__ < 7
59 # define FALLTHROUGH ((void) 0)
60 # else
61 # define FALLTHROUGH __attribute__ ((__fallthrough__))
62 # endif
63 #endif
65 #ifndef MIN
66 # define MIN(a,b) ((a) < (b) ? (a) : (b))
67 #endif
69 /* HPUX defines these as macros in sys/param.h. */
70 #ifdef setbit
71 # undef setbit
72 #endif
73 #ifdef clrbit
74 # undef clrbit
75 #endif
77 /* First integer value that is greater than any character code. */
78 enum { NOTCHAR = 1 << CHAR_BIT };
80 /* This represents part of a character class. It must be unsigned and
81 at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
82 typedef unsigned long int charclass_word;
84 /* CHARCLASS_WORD_BITS is the number of bits used in a charclass word.
85 CHARCLASS_PAIR (LO, HI) is part of a charclass initializer, and
86 represents 64 bits' worth of a charclass, where LO and HI are the
87 low and high-order 32 bits of the 64-bit quantity. */
88 #if ULONG_MAX >> 31 >> 31 < 3
89 enum { CHARCLASS_WORD_BITS = 32 };
90 # define CHARCLASS_PAIR(lo, hi) lo, hi
91 #else
92 enum { CHARCLASS_WORD_BITS = 64 };
93 # define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
94 #endif
96 /* An initializer for a charclass whose 32-bit words are A through H. */
97 #define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \
98 {{ \
99 CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \
100 CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h) \
103 /* The maximum useful value of a charclass_word; all used bits are 1. */
104 static charclass_word const CHARCLASS_WORD_MASK
105 = ((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1;
107 /* Number of words required to hold a bit for every character. */
108 enum
110 CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
113 /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
114 typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
116 /* Convert a possibly-signed character to an unsigned character. This is
117 a bit safer than casting to unsigned char, since it catches some type
118 errors that the cast doesn't. */
119 static unsigned char
120 to_uchar (char ch)
122 return ch;
125 /* Contexts tell us whether a character is a newline or a word constituent.
126 Word-constituent characters are those that satisfy iswalnum, plus '_'.
127 Each character has a single CTX_* value; bitmasks of CTX_* values denote
128 a particular character class.
130 A state also stores a context value, which is a bitmask of CTX_* values.
131 A state's context represents a set of characters that the state's
132 predecessors must match. For example, a state whose context does not
133 include CTX_LETTER will never have transitions where the previous
134 character is a word constituent. A state whose context is CTX_ANY
135 might have transitions from any character. */
137 enum
139 CTX_NONE = 1,
140 CTX_LETTER = 2,
141 CTX_NEWLINE = 4,
142 CTX_ANY = 7
145 /* Sometimes characters can only be matched depending on the surrounding
146 context. Such context decisions depend on what the previous character
147 was, and the value of the current (lookahead) character. Context
148 dependent constraints are encoded as 9-bit integers. Each bit that
149 is set indicates that the constraint succeeds in the corresponding
150 context.
152 bit 6-8 - valid contexts when next character is CTX_NEWLINE
153 bit 3-5 - valid contexts when next character is CTX_LETTER
154 bit 0-2 - valid contexts when next character is CTX_NONE
156 succeeds_in_context determines whether a given constraint
157 succeeds in a particular context. Prev is a bitmask of possible
158 context values for the previous character, curr is the (single-bit)
159 context value for the lookahead character. */
160 static int
161 newline_constraint (int constraint)
163 return (constraint >> 6) & 7;
165 static int
166 letter_constraint (int constraint)
168 return (constraint >> 3) & 7;
170 static int
171 other_constraint (int constraint)
173 return constraint & 7;
176 static bool
177 succeeds_in_context (int constraint, int prev, int curr)
179 return !! (((curr & CTX_NONE ? other_constraint (constraint) : 0) \
180 | (curr & CTX_LETTER ? letter_constraint (constraint) : 0) \
181 | (curr & CTX_NEWLINE ? newline_constraint (constraint) : 0)) \
182 & prev);
185 /* The following describe what a constraint depends on. */
186 static bool
187 prev_newline_dependent (int constraint)
189 return ((constraint ^ constraint >> 2) & 0111) != 0;
191 static bool
192 prev_letter_dependent (int constraint)
194 return ((constraint ^ constraint >> 1) & 0111) != 0;
197 /* Tokens that match the empty string subject to some constraint actually
198 work by applying that constraint to determine what may follow them,
199 taking into account what has gone before. The following values are
200 the constraints corresponding to the special tokens previously defined. */
201 enum
203 NO_CONSTRAINT = 0777,
204 BEGLINE_CONSTRAINT = 0444,
205 ENDLINE_CONSTRAINT = 0700,
206 BEGWORD_CONSTRAINT = 0050,
207 ENDWORD_CONSTRAINT = 0202,
208 LIMWORD_CONSTRAINT = 0252,
209 NOTLIMWORD_CONSTRAINT = 0525
212 /* The regexp is parsed into an array of tokens in postfix form. Some tokens
213 are operators and others are terminal symbols. Most (but not all) of these
214 codes are returned by the lexical analyzer. */
216 typedef ptrdiff_t token;
217 static ptrdiff_t const TOKEN_MAX = PTRDIFF_MAX;
219 /* States are indexed by state_num values. These are normally
220 nonnegative but -1 is used as a special value. */
221 typedef ptrdiff_t state_num;
223 /* Predefined token values. */
224 enum
226 END = -1, /* END is a terminal symbol that matches the
227 end of input; any value of END or less in
228 the parse tree is such a symbol. Accepting
229 states of the DFA are those that would have
230 a transition on END. */
232 /* Ordinary character values are terminal symbols that match themselves. */
234 EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
235 the empty string. */
237 BACKREF, /* BACKREF is generated by \<digit>
238 or by any other construct that
239 is not completely handled. If the scanner
240 detects a transition on backref, it returns
241 a kind of "semi-success" indicating that
242 the match will have to be verified with
243 a backtracking matcher. */
245 BEGLINE, /* BEGLINE is a terminal symbol that matches
246 the empty string at the beginning of a
247 line. */
249 ENDLINE, /* ENDLINE is a terminal symbol that matches
250 the empty string at the end of a line. */
252 BEGWORD, /* BEGWORD is a terminal symbol that matches
253 the empty string at the beginning of a
254 word. */
256 ENDWORD, /* ENDWORD is a terminal symbol that matches
257 the empty string at the end of a word. */
259 LIMWORD, /* LIMWORD is a terminal symbol that matches
260 the empty string at the beginning or the
261 end of a word. */
263 NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
264 matches the empty string not at
265 the beginning or end of a word. */
267 QMARK, /* QMARK is an operator of one argument that
268 matches zero or one occurrences of its
269 argument. */
271 STAR, /* STAR is an operator of one argument that
272 matches the Kleene closure (zero or more
273 occurrences) of its argument. */
275 PLUS, /* PLUS is an operator of one argument that
276 matches the positive closure (one or more
277 occurrences) of its argument. */
279 REPMN, /* REPMN is a lexical token corresponding
280 to the {m,n} construct. REPMN never
281 appears in the compiled token vector. */
283 CAT, /* CAT is an operator of two arguments that
284 matches the concatenation of its
285 arguments. CAT is never returned by the
286 lexical analyzer. */
288 OR, /* OR is an operator of two arguments that
289 matches either of its arguments. */
291 LPAREN, /* LPAREN never appears in the parse tree,
292 it is only a lexeme. */
294 RPAREN, /* RPAREN never appears in the parse tree. */
296 ANYCHAR, /* ANYCHAR is a terminal symbol that matches
297 a valid multibyte (or single byte) character.
298 It is used only if MB_CUR_MAX > 1. */
300 MBCSET, /* MBCSET is similar to CSET, but for
301 multibyte characters. */
303 WCHAR, /* Only returned by lex. wctok contains
304 the wide character representation. */
306 CSET /* CSET and (and any value greater) is a
307 terminal symbol that matches any of a
308 class of characters. */
312 /* States of the recognizer correspond to sets of positions in the parse
313 tree, together with the constraints under which they may be matched.
314 So a position is encoded as an index into the parse tree together with
315 a constraint. */
316 typedef struct
318 size_t index; /* Index into the parse array. */
319 unsigned int constraint; /* Constraint for matching this position. */
320 } position;
322 /* Sets of positions are stored as arrays. */
323 typedef struct
325 position *elems; /* Elements of this position set. */
326 ptrdiff_t nelem; /* Number of elements in this set. */
327 ptrdiff_t alloc; /* Number of elements allocated in ELEMS. */
328 } position_set;
330 /* Sets of leaves are also stored as arrays. */
331 typedef struct
333 size_t *elems; /* Elements of this position set. */
334 size_t nelem; /* Number of elements in this set. */
335 } leaf_set;
337 /* A state of the dfa consists of a set of positions, some flags,
338 and the token value of the lowest-numbered position of the state that
339 contains an END token. */
340 typedef struct
342 size_t hash; /* Hash of the positions of this state. */
343 position_set elems; /* Positions this state could match. */
344 unsigned char context; /* Context from previous state. */
345 unsigned short constraint; /* Constraint for this state to accept. */
346 token first_end; /* Token value of the first END in elems. */
347 position_set mbps; /* Positions which can match multibyte
348 characters or the follows, e.g., period.
349 Used only if MB_CUR_MAX > 1. */
350 state_num mb_trindex; /* Index of this state in MB_TRANS, or
351 negative if the state does not have
352 ANYCHAR. */
353 } dfa_state;
355 /* Maximum for any transition table count. This should be at least 3,
356 for the initial state setup. */
357 enum { MAX_TRCOUNT = 1024 };
359 /* A bracket operator.
360 e.g., [a-c], [[:alpha:]], etc. */
361 struct mb_char_classes
363 ptrdiff_t cset;
364 bool invert;
365 wchar_t *chars; /* Normal characters. */
366 ptrdiff_t nchars;
367 ptrdiff_t nchars_alloc;
370 struct regex_syntax
372 /* Syntax bits controlling the behavior of the lexical analyzer. */
373 reg_syntax_t syntax_bits;
374 bool syntax_bits_set;
376 /* Flag for case-folding letters into sets. */
377 bool case_fold;
379 /* True if ^ and $ match only the start and end of data, and do not match
380 end-of-line within data. */
381 bool anchor;
383 /* End-of-line byte in data. */
384 unsigned char eolbyte;
386 /* Cache of char-context values. */
387 char sbit[NOTCHAR];
389 /* If never_trail[B], the byte B cannot be a non-initial byte in a
390 multibyte character. */
391 bool never_trail[NOTCHAR];
393 /* Set of characters considered letters. */
394 charclass letters;
396 /* Set of characters that are newline. */
397 charclass newline;
400 /* Lexical analyzer. All the dross that deals with the obnoxious
401 GNU Regex syntax bits is located here. The poor, suffering
402 reader is referred to the GNU Regex documentation for the
403 meaning of the @#%!@#%^!@ syntax bits. */
404 struct lexer_state
406 char const *ptr; /* Pointer to next input character. */
407 size_t left; /* Number of characters remaining. */
408 token lasttok; /* Previous token returned; initially END. */
409 size_t parens; /* Count of outstanding left parens. */
410 int minrep, maxrep; /* Repeat counts for {m,n}. */
412 /* Wide character representation of the current multibyte character,
413 or WEOF if there was an encoding error. Used only if
414 MB_CUR_MAX > 1. */
415 wint_t wctok;
417 /* Length of the multibyte representation of wctok. */
418 int cur_mb_len;
420 /* The most recently analyzed multibyte bracket expression. */
421 struct mb_char_classes brack;
423 /* We're separated from beginning or (, | only by zero-width characters. */
424 bool laststart;
427 /* Recursive descent parser for regular expressions. */
429 struct parser_state
431 token tok; /* Lookahead token. */
432 size_t depth; /* Current depth of a hypothetical stack
433 holding deferred productions. This is
434 used to determine the depth that will be
435 required of the real stack later on in
436 dfaanalyze. */
439 /* A compiled regular expression. */
440 struct dfa
442 /* Syntax configuration */
443 struct regex_syntax syntax;
445 /* Fields filled by the scanner. */
446 charclass *charclasses; /* Array of character sets for CSET tokens. */
447 ptrdiff_t cindex; /* Index for adding new charclasses. */
448 ptrdiff_t calloc; /* Number of charclasses allocated. */
449 size_t canychar; /* Index of anychar class, or (size_t) -1. */
451 /* Scanner state */
452 struct lexer_state lex;
454 /* Parser state */
455 struct parser_state parse;
457 /* Fields filled by the parser. */
458 token *tokens; /* Postfix parse array. */
459 size_t tindex; /* Index for adding new tokens. */
460 size_t talloc; /* Number of tokens currently allocated. */
461 size_t depth; /* Depth required of an evaluation stack
462 used for depth-first traversal of the
463 parse tree. */
464 size_t nleaves; /* Number of leaves on the parse tree. */
465 size_t nregexps; /* Count of parallel regexps being built
466 with dfaparse. */
467 bool fast; /* The DFA is fast. */
468 token utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
469 mbstate_t mbs; /* Multibyte conversion state. */
471 /* The following are valid only if MB_CUR_MAX > 1. */
473 /* The value of multibyte_prop[i] is defined by following rule.
474 if tokens[i] < NOTCHAR
475 bit 0 : tokens[i] is the first byte of a character, including
476 single-byte characters.
477 bit 1 : tokens[i] is the last byte of a character, including
478 single-byte characters.
480 e.g.
481 tokens
482 = 'single_byte_a', 'multi_byte_A', single_byte_b'
483 = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
484 multibyte_prop
485 = 3 , 1 , 0 , 2 , 3
487 char *multibyte_prop;
489 /* Fields filled by the superset. */
490 struct dfa *superset; /* Hint of the dfa. */
492 /* Fields filled by the state builder. */
493 dfa_state *states; /* States of the dfa. */
494 state_num sindex; /* Index for adding new states. */
495 ptrdiff_t salloc; /* Number of states currently allocated. */
497 /* Fields filled by the parse tree->NFA conversion. */
498 position_set *follows; /* Array of follow sets, indexed by position
499 index. The follow of a position is the set
500 of positions containing characters that
501 could conceivably follow a character
502 matching the given position in a string
503 matching the regexp. Allocated to the
504 maximum possible position index. */
505 bool searchflag; /* We are supposed to build a searching
506 as opposed to an exact matcher. A searching
507 matcher finds the first and shortest string
508 matching a regexp anywhere in the buffer,
509 whereas an exact matcher finds the longest
510 string matching, but anchored to the
511 beginning of the buffer. */
513 /* Fields filled by dfaexec. */
514 state_num tralloc; /* Number of transition tables that have
515 slots so far, not counting trans[-1] and
516 trans[-2]. */
517 int trcount; /* Number of transition tables that have
518 been built, other than for initial
519 states. */
520 int min_trcount; /* Number of initial states. Equivalently,
521 the minimum state number for which trcount
522 counts transitions. */
523 state_num **trans; /* Transition tables for states that can
524 never accept. If the transitions for a
525 state have not yet been computed, or the
526 state could possibly accept, its entry in
527 this table is NULL. This points to two
528 past the start of the allocated array,
529 and trans[-1] and trans[-2] are always
530 NULL. */
531 state_num **fails; /* Transition tables after failing to accept
532 on a state that potentially could do so.
533 If trans[i] is non-null, fails[i] must
534 be null. */
535 char *success; /* Table of acceptance conditions used in
536 dfaexec and computed in build_state. */
537 state_num *newlines; /* Transitions on newlines. The entry for a
538 newline in any transition table is always
539 -1 so we can count lines without wasting
540 too many cycles. The transition for a
541 newline is stored separately and handled
542 as a special case. Newline is also used
543 as a sentinel at the end of the buffer. */
544 state_num initstate_notbol; /* Initial state for CTX_LETTER and CTX_NONE
545 context in multibyte locales, in which we
546 do not distinguish between their contexts,
547 as not supported word. */
548 position_set mb_follows; /* Follow set added by ANYCHAR on demand. */
549 state_num **mb_trans; /* Transition tables for states with
550 ANYCHAR. */
551 state_num mb_trcount; /* Number of transition tables for states with
552 ANYCHAR that have actually been built. */
554 /* Information derived from the locale. This is at the end so that
555 a quick memset need not clear it specially. */
557 /* dfaexec implementation. */
558 char *(*dfaexec) (struct dfa *, char const *, char *,
559 bool, size_t *, bool *);
561 /* The locale is simple, like the C locale. These locales can be
562 processed more efficiently, as they are single-byte, their native
563 character set is in collating-sequence order, and they do not
564 have multi-character collating elements. */
565 bool simple_locale;
567 /* Other cached information derived from the locale. */
568 struct localeinfo localeinfo;
571 /* User access to dfa internals. */
573 /* S could possibly be an accepting state of R. */
574 static bool
575 accepting (state_num s, struct dfa const *r)
577 return r->states[s].constraint != 0;
580 /* STATE accepts in the specified context. */
581 static bool
582 accepts_in_context (int prev, int curr, state_num state, struct dfa const *dfa)
584 return succeeds_in_context (dfa->states[state].constraint, prev, curr);
587 static void regexp (struct dfa *dfa);
589 /* Store into *PWC the result of converting the leading bytes of the
590 multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
591 and updating the conversion state in *D. On conversion error,
592 convert just a single byte, to WEOF. Return the number of bytes
593 converted.
595 This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
597 * PWC points to wint_t, not to wchar_t.
598 * The last arg is a dfa *D instead of merely a multibyte conversion
599 state D->mbs.
600 * N must be at least 1.
601 * S[N - 1] must be a sentinel byte.
602 * Shift encodings are not supported.
603 * The return value is always in the range 1..N.
604 * D->mbs is always valid afterwards.
605 * *PWC is always set to something. */
606 static size_t
607 mbs_to_wchar (wint_t *pwc, char const *s, size_t n, struct dfa *d)
609 unsigned char uc = s[0];
610 wint_t wc = d->localeinfo.sbctowc[uc];
612 if (wc == WEOF)
614 wchar_t wch;
615 size_t nbytes = mbrtowc (&wch, s, n, &d->mbs);
616 if (0 < nbytes && nbytes < (size_t) -2)
618 *pwc = wch;
619 return nbytes;
621 memset (&d->mbs, 0, sizeof d->mbs);
624 *pwc = wc;
625 return 1;
628 #ifdef DEBUG
630 static void
631 prtok (token t)
633 if (t < 0)
634 fprintf (stderr, "END");
635 else if (t < NOTCHAR)
637 unsigned int ch = t;
638 fprintf (stderr, "0x%02x", ch);
640 else
642 char const *s;
643 switch (t)
645 case EMPTY:
646 s = "EMPTY";
647 break;
648 case BACKREF:
649 s = "BACKREF";
650 break;
651 case BEGLINE:
652 s = "BEGLINE";
653 break;
654 case ENDLINE:
655 s = "ENDLINE";
656 break;
657 case BEGWORD:
658 s = "BEGWORD";
659 break;
660 case ENDWORD:
661 s = "ENDWORD";
662 break;
663 case LIMWORD:
664 s = "LIMWORD";
665 break;
666 case NOTLIMWORD:
667 s = "NOTLIMWORD";
668 break;
669 case QMARK:
670 s = "QMARK";
671 break;
672 case STAR:
673 s = "STAR";
674 break;
675 case PLUS:
676 s = "PLUS";
677 break;
678 case CAT:
679 s = "CAT";
680 break;
681 case OR:
682 s = "OR";
683 break;
684 case LPAREN:
685 s = "LPAREN";
686 break;
687 case RPAREN:
688 s = "RPAREN";
689 break;
690 case ANYCHAR:
691 s = "ANYCHAR";
692 break;
693 case MBCSET:
694 s = "MBCSET";
695 break;
696 default:
697 s = "CSET";
698 break;
700 fprintf (stderr, "%s", s);
703 #endif /* DEBUG */
705 /* Stuff pertaining to charclasses. */
707 static bool
708 tstbit (unsigned int b, charclass const *c)
710 return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
713 static void
714 setbit (unsigned int b, charclass *c)
716 charclass_word one = 1;
717 c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
720 static void
721 clrbit (unsigned int b, charclass *c)
723 charclass_word one = 1;
724 c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
727 static void
728 zeroset (charclass *s)
730 memset (s, 0, sizeof *s);
733 static void
734 fillset (charclass *s)
736 for (int i = 0; i < CHARCLASS_WORDS; i++)
737 s->w[i] = CHARCLASS_WORD_MASK;
740 static void
741 notset (charclass *s)
743 for (int i = 0; i < CHARCLASS_WORDS; ++i)
744 s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
747 static bool
748 equal (charclass const *s1, charclass const *s2)
750 charclass_word w = 0;
751 for (int i = 0; i < CHARCLASS_WORDS; i++)
752 w |= s1->w[i] ^ s2->w[i];
753 return w == 0;
756 static bool
757 emptyset (charclass const *s)
759 charclass_word w = 0;
760 for (int i = 0; i < CHARCLASS_WORDS; i++)
761 w |= s->w[i];
762 return w == 0;
765 /* Grow PA, which points to an array of *NITEMS items, and return the
766 location of the reallocated array, updating *NITEMS to reflect its
767 new size. The new array will contain at least NITEMS_INCR_MIN more
768 items, but will not contain more than NITEMS_MAX items total.
769 ITEM_SIZE is the size of each item, in bytes.
771 ITEM_SIZE and NITEMS_INCR_MIN must be positive. *NITEMS must be
772 nonnegative. If NITEMS_MAX is -1, it is treated as if it were
773 infinity.
775 If PA is null, then allocate a new array instead of reallocating
776 the old one.
778 Thus, to grow an array A without saving its old contents, do
779 { free (A); A = xpalloc (NULL, &AITEMS, ...); }. */
781 static void *
782 xpalloc (void *pa, ptrdiff_t *nitems, ptrdiff_t nitems_incr_min,
783 ptrdiff_t nitems_max, ptrdiff_t item_size)
785 ptrdiff_t n0 = *nitems;
787 /* The approximate size to use for initial small allocation
788 requests. This is the largest "small" request for the GNU C
789 library malloc. */
790 enum { DEFAULT_MXFAST = 64 * sizeof (size_t) / 4 };
792 /* If the array is tiny, grow it to about (but no greater than)
793 DEFAULT_MXFAST bytes. Otherwise, grow it by about 50%.
794 Adjust the growth according to three constraints: NITEMS_INCR_MIN,
795 NITEMS_MAX, and what the C language can represent safely. */
797 ptrdiff_t n, nbytes;
798 if (INT_ADD_WRAPV (n0, n0 >> 1, &n))
799 n = PTRDIFF_MAX;
800 if (0 <= nitems_max && nitems_max < n)
801 n = nitems_max;
803 ptrdiff_t adjusted_nbytes
804 = ((INT_MULTIPLY_WRAPV (n, item_size, &nbytes) || SIZE_MAX < nbytes)
805 ? MIN (PTRDIFF_MAX, SIZE_MAX)
806 : nbytes < DEFAULT_MXFAST ? DEFAULT_MXFAST : 0);
807 if (adjusted_nbytes)
809 n = adjusted_nbytes / item_size;
810 nbytes = adjusted_nbytes - adjusted_nbytes % item_size;
813 if (! pa)
814 *nitems = 0;
815 if (n - n0 < nitems_incr_min
816 && (INT_ADD_WRAPV (n0, nitems_incr_min, &n)
817 || (0 <= nitems_max && nitems_max < n)
818 || INT_MULTIPLY_WRAPV (n, item_size, &nbytes)))
819 xalloc_die ();
820 pa = xrealloc (pa, nbytes);
821 *nitems = n;
822 return pa;
825 /* Ensure that the array addressed by PA holds at least I + 1 items.
826 Either return PA, or reallocate the array and return its new address.
827 Although PA may be null, the returned value is never null.
829 The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
830 is updated on reallocation. If PA is null, *NITEMS must be zero.
831 Do not allocate more than NITEMS_MAX items total; -1 means no limit.
832 ITEM_SIZE is the size of one item; it must be positive.
833 Avoid O(N**2) behavior on arrays growing linearly. */
834 static void *
835 maybe_realloc (void *pa, ptrdiff_t i, ptrdiff_t *nitems,
836 ptrdiff_t nitems_max, ptrdiff_t item_size)
838 if (i < *nitems)
839 return pa;
840 return xpalloc (pa, nitems, 1, nitems_max, item_size);
843 /* In DFA D, find the index of charclass S, or allocate a new one. */
844 static ptrdiff_t
845 charclass_index (struct dfa *d, charclass *s)
847 ptrdiff_t i;
849 for (i = 0; i < d->cindex; ++i)
850 if (equal (s, &d->charclasses[i]))
851 return i;
852 d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
853 TOKEN_MAX - CSET, sizeof *d->charclasses);
854 ++d->cindex;
855 d->charclasses[i] = *s;
856 return i;
859 static bool
860 unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
862 return dfa->localeinfo.sbctowc[c] != WEOF && (isalnum (c) || (c) == '_');
865 static int
866 char_context (struct dfa const *dfa, unsigned char c)
868 if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
869 return CTX_NEWLINE;
870 if (unibyte_word_constituent (dfa, c))
871 return CTX_LETTER;
872 return CTX_NONE;
875 /* Set a bit in the charclass for the given wchar_t. Do nothing if WC
876 is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
877 this may happen when folding case in weird Turkish locales where
878 dotless i/dotted I are not included in the chosen character set.
879 Return whether a bit was set in the charclass. */
880 static bool
881 setbit_wc (wint_t wc, charclass *c)
883 int b = wctob (wc);
884 if (b < 0)
885 return false;
887 setbit (b, c);
888 return true;
891 /* Set a bit for B and its case variants in the charclass C.
892 MB_CUR_MAX must be 1. */
893 static void
894 setbit_case_fold_c (int b, charclass *c)
896 int ub = toupper (b);
897 for (int i = 0; i < NOTCHAR; i++)
898 if (toupper (i) == ub)
899 setbit (i, c);
902 /* Return true if the locale compatible with the C locale. */
904 static bool
905 using_simple_locale (bool multibyte)
907 /* The native character set is known to be compatible with
908 the C locale. The following test isn't perfect, but it's good
909 enough in practice, as only ASCII and EBCDIC are in common use
910 and this test correctly accepts ASCII and rejects EBCDIC. */
911 enum { native_c_charset =
912 ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
913 && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
914 && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
915 && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
916 && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
917 && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
918 && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
919 && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
920 && '}' == 125 && '~' == 126)
923 if (!native_c_charset || multibyte)
924 return false;
925 else
927 /* Treat C and POSIX locales as being compatible. Also, treat
928 errors as compatible, as these are invariably from stubs. */
929 char const *loc = setlocale (LC_ALL, NULL);
930 return !loc || streq (loc, "C") || streq (loc, "POSIX");
934 /* Fetch the next lexical input character from the pattern. There
935 must at least one byte of pattern input. Set DFA->lex.wctok to the
936 value of the character or to WEOF depending on whether the input is
937 a valid multibyte character (possibly of length 1). Then return
938 the next input byte value, except return EOF if the input is a
939 multibyte character of length greater than 1. */
940 static int
941 fetch_wc (struct dfa *dfa)
943 size_t nbytes = mbs_to_wchar (&dfa->lex.wctok, dfa->lex.ptr, dfa->lex.left,
944 dfa);
945 dfa->lex.cur_mb_len = nbytes;
946 int c = nbytes == 1 ? to_uchar (dfa->lex.ptr[0]) : EOF;
947 dfa->lex.ptr += nbytes;
948 dfa->lex.left -= nbytes;
949 return c;
952 /* If there is no more input, report an error about unbalanced brackets.
953 Otherwise, behave as with fetch_wc (DFA). */
954 static int
955 bracket_fetch_wc (struct dfa *dfa)
957 if (! dfa->lex.left)
958 dfaerror (_("unbalanced ["));
959 return fetch_wc (dfa);
962 typedef int predicate (int);
964 /* The following list maps the names of the Posix named character classes
965 to predicate functions that determine whether a given character is in
966 the class. The leading [ has already been eaten by the lexical
967 analyzer. */
968 struct dfa_ctype
970 const char *name;
971 predicate *func;
972 bool single_byte_only;
975 static const struct dfa_ctype prednames[] = {
976 {"alpha", isalpha, false},
977 {"upper", isupper, false},
978 {"lower", islower, false},
979 {"digit", isdigit, true},
980 {"xdigit", isxdigit, false},
981 {"space", isspace, false},
982 {"punct", ispunct, false},
983 {"alnum", isalnum, false},
984 {"print", isprint, false},
985 {"graph", isgraph, false},
986 {"cntrl", iscntrl, false},
987 {"blank", isblank, false},
988 {NULL, NULL, false}
991 static const struct dfa_ctype *_GL_ATTRIBUTE_PURE
992 find_pred (const char *str)
994 for (unsigned int i = 0; prednames[i].name; ++i)
995 if (streq (str, prednames[i].name))
996 return &prednames[i];
997 return NULL;
1000 /* Parse a bracket expression, which possibly includes multibyte
1001 characters. */
1002 static token
1003 parse_bracket_exp (struct dfa *dfa)
1005 /* This is a bracket expression that dfaexec is known to
1006 process correctly. */
1007 bool known_bracket_exp = true;
1009 /* Used to warn about [:space:].
1010 Bit 0 = first character is a colon.
1011 Bit 1 = last character is a colon.
1012 Bit 2 = includes any other character but a colon.
1013 Bit 3 = includes ranges, char/equiv classes or collation elements. */
1014 int colon_warning_state;
1016 dfa->lex.brack.nchars = 0;
1017 charclass ccl;
1018 zeroset (&ccl);
1019 int c = bracket_fetch_wc (dfa);
1020 bool invert = c == '^';
1021 if (invert)
1023 c = bracket_fetch_wc (dfa);
1024 known_bracket_exp = dfa->simple_locale;
1026 wint_t wc = dfa->lex.wctok;
1027 int c1;
1028 wint_t wc1;
1029 colon_warning_state = (c == ':');
1032 c1 = NOTCHAR; /* Mark c1 as not initialized. */
1033 colon_warning_state &= ~2;
1035 /* Note that if we're looking at some other [:...:] construct,
1036 we just treat it as a bunch of ordinary characters. We can do
1037 this because we assume regex has checked for syntax errors before
1038 dfa is ever called. */
1039 if (c == '[')
1041 c1 = bracket_fetch_wc (dfa);
1042 wc1 = dfa->lex.wctok;
1044 if ((c1 == ':' && (dfa->syntax.syntax_bits & RE_CHAR_CLASSES))
1045 || c1 == '.' || c1 == '=')
1047 enum { MAX_BRACKET_STRING_LEN = 32 };
1048 char str[MAX_BRACKET_STRING_LEN + 1];
1049 size_t len = 0;
1050 for (;;)
1052 c = bracket_fetch_wc (dfa);
1053 if (dfa->lex.left == 0
1054 || (c == c1 && dfa->lex.ptr[0] == ']'))
1055 break;
1056 if (len < MAX_BRACKET_STRING_LEN)
1057 str[len++] = c;
1058 else
1059 /* This is in any case an invalid class name. */
1060 str[0] = '\0';
1062 str[len] = '\0';
1064 /* Fetch bracket. */
1065 c = bracket_fetch_wc (dfa);
1066 wc = dfa->lex.wctok;
1067 if (c1 == ':')
1068 /* Build character class. POSIX allows character
1069 classes to match multicharacter collating elements,
1070 but the regex code does not support that, so do not
1071 worry about that possibility. */
1073 char const *class
1074 = (dfa->syntax.case_fold && (streq (str, "upper")
1075 || streq (str, "lower"))
1076 ? "alpha" : str);
1077 const struct dfa_ctype *pred = find_pred (class);
1078 if (!pred)
1079 dfaerror (_("invalid character class"));
1081 if (dfa->localeinfo.multibyte && !pred->single_byte_only)
1082 known_bracket_exp = false;
1083 else
1084 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1085 if (pred->func (c2))
1086 setbit (c2, &ccl);
1088 else
1089 known_bracket_exp = false;
1091 colon_warning_state |= 8;
1093 /* Fetch new lookahead character. */
1094 c1 = bracket_fetch_wc (dfa);
1095 wc1 = dfa->lex.wctok;
1096 continue;
1099 /* We treat '[' as a normal character here. c/c1/wc/wc1
1100 are already set up. */
1103 if (c == '\\'
1104 && (dfa->syntax.syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
1106 c = bracket_fetch_wc (dfa);
1107 wc = dfa->lex.wctok;
1110 if (c1 == NOTCHAR)
1112 c1 = bracket_fetch_wc (dfa);
1113 wc1 = dfa->lex.wctok;
1116 if (c1 == '-')
1117 /* build range characters. */
1119 int c2 = bracket_fetch_wc (dfa);
1120 wint_t wc2 = dfa->lex.wctok;
1122 /* A bracket expression like [a-[.aa.]] matches an unknown set.
1123 Treat it like [-a[.aa.]] while parsing it, and
1124 remember that the set is unknown. */
1125 if (c2 == '[' && dfa->lex.ptr[0] == '.')
1127 known_bracket_exp = false;
1128 c2 = ']';
1131 if (c2 == ']')
1133 /* In the case [x-], the - is an ordinary hyphen,
1134 which is left in c1, the lookahead character. */
1135 dfa->lex.ptr -= dfa->lex.cur_mb_len;
1136 dfa->lex.left += dfa->lex.cur_mb_len;
1138 else
1140 if (c2 == '\\' && (dfa->syntax.syntax_bits
1141 & RE_BACKSLASH_ESCAPE_IN_LISTS))
1143 c2 = bracket_fetch_wc (dfa);
1144 wc2 = dfa->lex.wctok;
1147 colon_warning_state |= 8;
1148 c1 = bracket_fetch_wc (dfa);
1149 wc1 = dfa->lex.wctok;
1151 /* Treat [x-y] as a range if x != y. */
1152 if (wc != wc2 || wc == WEOF)
1154 if (dfa->simple_locale
1155 || (isasciidigit (c) & isasciidigit (c2)))
1157 for (int ci = c; ci <= c2; ci++)
1158 if (dfa->syntax.case_fold && isalpha (ci))
1159 setbit_case_fold_c (ci, &ccl);
1160 else
1161 setbit (ci, &ccl);
1163 else
1164 known_bracket_exp = false;
1166 continue;
1171 colon_warning_state |= (c == ':') ? 2 : 4;
1173 if (!dfa->localeinfo.multibyte)
1175 if (dfa->syntax.case_fold && isalpha (c))
1176 setbit_case_fold_c (c, &ccl);
1177 else
1178 setbit (c, &ccl);
1179 continue;
1182 if (wc == WEOF)
1183 known_bracket_exp = false;
1184 else
1186 wchar_t folded[CASE_FOLDED_BUFSIZE + 1];
1187 unsigned int n = (dfa->syntax.case_fold
1188 ? case_folded_counterparts (wc, folded + 1) + 1
1189 : 1);
1190 folded[0] = wc;
1191 for (unsigned int i = 0; i < n; i++)
1192 if (!setbit_wc (folded[i], &ccl))
1194 dfa->lex.brack.chars
1195 = maybe_realloc (dfa->lex.brack.chars, dfa->lex.brack.nchars,
1196 &dfa->lex.brack.nchars_alloc, -1,
1197 sizeof *dfa->lex.brack.chars);
1198 dfa->lex.brack.chars[dfa->lex.brack.nchars++] = folded[i];
1202 while ((wc = wc1, (c = c1) != ']'));
1204 if (colon_warning_state == 7)
1205 dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1207 if (! known_bracket_exp)
1208 return BACKREF;
1210 if (dfa->localeinfo.multibyte && (invert || dfa->lex.brack.nchars != 0))
1212 dfa->lex.brack.invert = invert;
1213 dfa->lex.brack.cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
1214 return MBCSET;
1217 if (invert)
1219 notset (&ccl);
1220 if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
1221 clrbit ('\n', &ccl);
1224 return CSET + charclass_index (dfa, &ccl);
1227 struct lexptr
1229 char const *ptr;
1230 size_t left;
1233 static void
1234 push_lex_state (struct dfa *dfa, struct lexptr *ls, char const *s)
1236 ls->ptr = dfa->lex.ptr;
1237 ls->left = dfa->lex.left;
1238 dfa->lex.ptr = s;
1239 dfa->lex.left = strlen (s);
1242 static void
1243 pop_lex_state (struct dfa *dfa, struct lexptr const *ls)
1245 dfa->lex.ptr = ls->ptr;
1246 dfa->lex.left = ls->left;
1249 static token
1250 lex (struct dfa *dfa)
1252 bool backslash = false;
1254 /* Basic plan: We fetch a character. If it's a backslash,
1255 we set the backslash flag and go through the loop again.
1256 On the plus side, this avoids having a duplicate of the
1257 main switch inside the backslash case. On the minus side,
1258 it means that just about every case begins with
1259 "if (backslash) ...". */
1260 for (int i = 0; i < 2; ++i)
1262 if (! dfa->lex.left)
1263 return dfa->lex.lasttok = END;
1264 int c = fetch_wc (dfa);
1266 switch (c)
1268 case '\\':
1269 if (backslash)
1270 goto normal_char;
1271 if (dfa->lex.left == 0)
1272 dfaerror (_("unfinished \\ escape"));
1273 backslash = true;
1274 break;
1276 case '^':
1277 if (backslash)
1278 goto normal_char;
1279 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1280 || dfa->lex.lasttok == END || dfa->lex.lasttok == LPAREN
1281 || dfa->lex.lasttok == OR)
1282 return dfa->lex.lasttok = BEGLINE;
1283 goto normal_char;
1285 case '$':
1286 if (backslash)
1287 goto normal_char;
1288 if (dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_ANCHORS
1289 || dfa->lex.left == 0
1290 || ((dfa->lex.left
1291 > !(dfa->syntax.syntax_bits & RE_NO_BK_PARENS))
1292 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_PARENS)
1293 & (dfa->lex.ptr[0] == '\\')]
1294 == ')'))
1295 || ((dfa->lex.left
1296 > !(dfa->syntax.syntax_bits & RE_NO_BK_VBAR))
1297 && (dfa->lex.ptr[!(dfa->syntax.syntax_bits & RE_NO_BK_VBAR)
1298 & (dfa->lex.ptr[0] == '\\')]
1299 == '|'))
1300 || ((dfa->syntax.syntax_bits & RE_NEWLINE_ALT)
1301 && dfa->lex.left > 0 && dfa->lex.ptr[0] == '\n'))
1302 return dfa->lex.lasttok = ENDLINE;
1303 goto normal_char;
1305 case '1':
1306 case '2':
1307 case '3':
1308 case '4':
1309 case '5':
1310 case '6':
1311 case '7':
1312 case '8':
1313 case '9':
1314 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_BK_REFS))
1316 dfa->lex.laststart = false;
1317 return dfa->lex.lasttok = BACKREF;
1319 goto normal_char;
1321 case '`':
1322 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1324 /* FIXME: should be beginning of string */
1325 return dfa->lex.lasttok = BEGLINE;
1327 goto normal_char;
1329 case '\'':
1330 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1332 /* FIXME: should be end of string */
1333 return dfa->lex.lasttok = ENDLINE;
1335 goto normal_char;
1337 case '<':
1338 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1339 return dfa->lex.lasttok = BEGWORD;
1340 goto normal_char;
1342 case '>':
1343 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1344 return dfa->lex.lasttok = ENDWORD;
1345 goto normal_char;
1347 case 'b':
1348 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1349 return dfa->lex.lasttok = LIMWORD;
1350 goto normal_char;
1352 case 'B':
1353 if (backslash && !(dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1354 return dfa->lex.lasttok = NOTLIMWORD;
1355 goto normal_char;
1357 case '?':
1358 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1359 goto normal_char;
1360 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1361 goto normal_char;
1362 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1363 && dfa->lex.laststart)
1364 goto normal_char;
1365 return dfa->lex.lasttok = QMARK;
1367 case '*':
1368 if (backslash)
1369 goto normal_char;
1370 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1371 && dfa->lex.laststart)
1372 goto normal_char;
1373 return dfa->lex.lasttok = STAR;
1375 case '+':
1376 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1377 goto normal_char;
1378 if (backslash != ((dfa->syntax.syntax_bits & RE_BK_PLUS_QM) != 0))
1379 goto normal_char;
1380 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1381 && dfa->lex.laststart)
1382 goto normal_char;
1383 return dfa->lex.lasttok = PLUS;
1385 case '{':
1386 if (!(dfa->syntax.syntax_bits & RE_INTERVALS))
1387 goto normal_char;
1388 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_BRACES) == 0))
1389 goto normal_char;
1390 if (!(dfa->syntax.syntax_bits & RE_CONTEXT_INDEP_OPS)
1391 && dfa->lex.laststart)
1392 goto normal_char;
1394 /* Cases:
1395 {M} - exact count
1396 {M,} - minimum count, maximum is infinity
1397 {,N} - 0 through N
1398 {,} - 0 to infinity (same as '*')
1399 {M,N} - M through N */
1401 char const *p = dfa->lex.ptr;
1402 char const *lim = p + dfa->lex.left;
1403 dfa->lex.minrep = dfa->lex.maxrep = -1;
1404 for (; p != lim && isasciidigit (*p); p++)
1405 dfa->lex.minrep = (dfa->lex.minrep < 0
1406 ? *p - '0'
1407 : MIN (RE_DUP_MAX + 1,
1408 dfa->lex.minrep * 10 + *p - '0'));
1409 if (p != lim)
1411 if (*p != ',')
1412 dfa->lex.maxrep = dfa->lex.minrep;
1413 else
1415 if (dfa->lex.minrep < 0)
1416 dfa->lex.minrep = 0;
1417 while (++p != lim && isasciidigit (*p))
1418 dfa->lex.maxrep
1419 = (dfa->lex.maxrep < 0
1420 ? *p - '0'
1421 : MIN (RE_DUP_MAX + 1,
1422 dfa->lex.maxrep * 10 + *p - '0'));
1425 if (! ((! backslash || (p != lim && *p++ == '\\'))
1426 && p != lim && *p++ == '}'
1427 && 0 <= dfa->lex.minrep
1428 && (dfa->lex.maxrep < 0
1429 || dfa->lex.minrep <= dfa->lex.maxrep)))
1431 if (dfa->syntax.syntax_bits & RE_INVALID_INTERVAL_ORD)
1432 goto normal_char;
1433 dfaerror (_("invalid content of \\{\\}"));
1435 if (RE_DUP_MAX < dfa->lex.maxrep)
1436 dfaerror (_("regular expression too big"));
1437 dfa->lex.ptr = p;
1438 dfa->lex.left = lim - p;
1440 dfa->lex.laststart = false;
1441 return dfa->lex.lasttok = REPMN;
1443 case '|':
1444 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS)
1445 goto normal_char;
1446 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_VBAR) == 0))
1447 goto normal_char;
1448 dfa->lex.laststart = true;
1449 return dfa->lex.lasttok = OR;
1451 case '\n':
1452 if (dfa->syntax.syntax_bits & RE_LIMITED_OPS
1453 || backslash || !(dfa->syntax.syntax_bits & RE_NEWLINE_ALT))
1454 goto normal_char;
1455 dfa->lex.laststart = true;
1456 return dfa->lex.lasttok = OR;
1458 case '(':
1459 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1460 goto normal_char;
1461 dfa->lex.parens++;
1462 dfa->lex.laststart = true;
1463 return dfa->lex.lasttok = LPAREN;
1465 case ')':
1466 if (backslash != ((dfa->syntax.syntax_bits & RE_NO_BK_PARENS) == 0))
1467 goto normal_char;
1468 if (dfa->lex.parens == 0
1469 && dfa->syntax.syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)
1470 goto normal_char;
1471 dfa->lex.parens--;
1472 dfa->lex.laststart = false;
1473 return dfa->lex.lasttok = RPAREN;
1475 case '.':
1476 if (backslash)
1477 goto normal_char;
1478 if (dfa->canychar == (size_t) -1)
1480 charclass ccl;
1481 fillset (&ccl);
1482 if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1483 clrbit ('\n', &ccl);
1484 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1485 clrbit ('\0', &ccl);
1486 if (dfa->localeinfo.multibyte)
1487 for (int c2 = 0; c2 < NOTCHAR; c2++)
1488 if (dfa->localeinfo.sbctowc[c2] == WEOF)
1489 clrbit (c2, &ccl);
1490 dfa->canychar = charclass_index (dfa, &ccl);
1492 dfa->lex.laststart = false;
1493 return dfa->lex.lasttok = (dfa->localeinfo.multibyte
1494 ? ANYCHAR
1495 : CSET + dfa->canychar);
1497 case 's':
1498 case 'S':
1499 if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1500 goto normal_char;
1501 if (!dfa->localeinfo.multibyte)
1503 charclass ccl;
1504 zeroset (&ccl);
1505 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1506 if (isspace (c2))
1507 setbit (c2, &ccl);
1508 if (c == 'S')
1509 notset (&ccl);
1510 dfa->lex.laststart = false;
1511 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1514 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1515 add_utf8_anychar, makes sense. */
1517 /* \s and \S are documented to be equivalent to [[:space:]] and
1518 [^[:space:]] respectively, so tell the lexer to process those
1519 strings, each minus its "already processed" '['. */
1521 struct lexptr ls;
1522 push_lex_state (dfa, &ls, &"^[:space:]]"[c == 's']);
1523 dfa->lex.lasttok = parse_bracket_exp (dfa);
1524 pop_lex_state (dfa, &ls);
1527 dfa->lex.laststart = false;
1528 return dfa->lex.lasttok;
1530 case 'w':
1531 case 'W':
1532 if (!backslash || (dfa->syntax.syntax_bits & RE_NO_GNU_OPS))
1533 goto normal_char;
1535 if (!dfa->localeinfo.multibyte)
1537 charclass ccl;
1538 zeroset (&ccl);
1539 for (int c2 = 0; c2 < NOTCHAR; ++c2)
1540 if (dfa->syntax.sbit[c2] == CTX_LETTER)
1541 setbit (c2, &ccl);
1542 if (c == 'W')
1543 notset (&ccl);
1544 dfa->lex.laststart = false;
1545 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1548 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1549 add_utf8_anychar, makes sense. */
1551 /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1552 [^_[:alnum:]] respectively, so tell the lexer to process those
1553 strings, each minus its "already processed" '['. */
1555 struct lexptr ls;
1556 push_lex_state (dfa, &ls, &"^_[:alnum:]]"[c == 'w']);
1557 dfa->lex.lasttok = parse_bracket_exp (dfa);
1558 pop_lex_state (dfa, &ls);
1561 dfa->lex.laststart = false;
1562 return dfa->lex.lasttok;
1564 case '[':
1565 if (backslash)
1566 goto normal_char;
1567 dfa->lex.laststart = false;
1568 return dfa->lex.lasttok = parse_bracket_exp (dfa);
1570 default:
1571 normal_char:
1572 dfa->lex.laststart = false;
1573 /* For multibyte character sets, folding is done in atom. Always
1574 return WCHAR. */
1575 if (dfa->localeinfo.multibyte)
1576 return dfa->lex.lasttok = WCHAR;
1578 if (dfa->syntax.case_fold && isalpha (c))
1580 charclass ccl;
1581 zeroset (&ccl);
1582 setbit_case_fold_c (c, &ccl);
1583 return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
1586 return dfa->lex.lasttok = c;
1590 /* The above loop should consume at most a backslash
1591 and some other character. */
1592 abort ();
1593 return END; /* keeps pedantic compilers happy. */
1596 static void
1597 addtok_mb (struct dfa *dfa, token t, char mbprop)
1599 if (dfa->talloc == dfa->tindex)
1601 dfa->tokens = x2nrealloc (dfa->tokens, &dfa->talloc,
1602 sizeof *dfa->tokens);
1603 if (dfa->localeinfo.multibyte)
1604 dfa->multibyte_prop = xnrealloc (dfa->multibyte_prop, dfa->talloc,
1605 sizeof *dfa->multibyte_prop);
1607 if (dfa->localeinfo.multibyte)
1608 dfa->multibyte_prop[dfa->tindex] = mbprop;
1609 dfa->tokens[dfa->tindex++] = t;
1611 switch (t)
1613 case QMARK:
1614 case STAR:
1615 case PLUS:
1616 break;
1618 case CAT:
1619 case OR:
1620 dfa->parse.depth--;
1621 break;
1623 case BACKREF:
1624 dfa->fast = false;
1625 FALLTHROUGH;
1626 default:
1627 dfa->nleaves++;
1628 FALLTHROUGH;
1629 case EMPTY:
1630 dfa->parse.depth++;
1631 break;
1633 if (dfa->parse.depth > dfa->depth)
1634 dfa->depth = dfa->parse.depth;
1637 static void addtok_wc (struct dfa *dfa, wint_t wc);
1639 /* Add the given token to the parse tree, maintaining the depth count and
1640 updating the maximum depth if necessary. */
1641 static void
1642 addtok (struct dfa *dfa, token t)
1644 if (dfa->localeinfo.multibyte && t == MBCSET)
1646 bool need_or = false;
1648 /* Extract wide characters into alternations for better performance.
1649 This does not require UTF-8. */
1650 for (ptrdiff_t i = 0; i < dfa->lex.brack.nchars; i++)
1652 addtok_wc (dfa, dfa->lex.brack.chars[i]);
1653 if (need_or)
1654 addtok (dfa, OR);
1655 need_or = true;
1657 dfa->lex.brack.nchars = 0;
1659 /* Wide characters have been handled above, so it is possible
1660 that the set is empty now. Do nothing in that case. */
1661 if (dfa->lex.brack.cset != -1)
1663 addtok (dfa, CSET + dfa->lex.brack.cset);
1664 if (need_or)
1665 addtok (dfa, OR);
1668 else
1670 addtok_mb (dfa, t, 3);
1674 /* We treat a multibyte character as a single atom, so that DFA
1675 can treat a multibyte character as a single expression.
1677 e.g., we construct the following tree from "<mb1><mb2>".
1678 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1679 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1680 static void
1681 addtok_wc (struct dfa *dfa, wint_t wc)
1683 unsigned char buf[MB_LEN_MAX];
1684 mbstate_t s = { 0 };
1685 size_t stored_bytes = wcrtomb ((char *) buf, wc, &s);
1687 if (stored_bytes != (size_t) -1)
1688 dfa->lex.cur_mb_len = stored_bytes;
1689 else
1691 /* This is merely stop-gap. buf[0] is undefined, yet skipping
1692 the addtok_mb call altogether can corrupt the heap. */
1693 dfa->lex.cur_mb_len = 1;
1694 buf[0] = 0;
1697 addtok_mb (dfa, buf[0], dfa->lex.cur_mb_len == 1 ? 3 : 1);
1698 for (int i = 1; i < dfa->lex.cur_mb_len; i++)
1700 addtok_mb (dfa, buf[i], i == dfa->lex.cur_mb_len - 1 ? 2 : 0);
1701 addtok (dfa, CAT);
1705 static void
1706 add_utf8_anychar (struct dfa *dfa)
1708 static charclass const utf8_classes[5] = {
1709 /* 80-bf: non-leading bytes. */
1710 CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1712 /* 00-7f: 1-byte sequence. */
1713 CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1715 /* c2-df: 2-byte sequence. */
1716 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1718 /* e0-ef: 3-byte sequence. */
1719 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xffff),
1721 /* f0-f7: 4-byte sequence. */
1722 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xff0000)
1724 const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
1726 /* Define the five character classes that are needed below. */
1727 if (dfa->utf8_anychar_classes[0] == 0)
1728 for (unsigned int i = 0; i < n; i++)
1730 charclass c = utf8_classes[i];
1731 if (i == 1)
1733 if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
1734 clrbit ('\n', &c);
1735 if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
1736 clrbit ('\0', &c);
1738 dfa->utf8_anychar_classes[i] = CSET + charclass_index (dfa, &c);
1741 /* A valid UTF-8 character is
1743 ([0x00-0x7f]
1744 |[0xc2-0xdf][0x80-0xbf]
1745 |[0xe0-0xef[0x80-0xbf][0x80-0xbf]
1746 |[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf])
1748 which I'll write more concisely "B|CA|DAA|EAAA". Factor the [0x00-0x7f]
1749 and you get "B|(C|(D|EA)A)A". And since the token buffer is in reverse
1750 Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR". */
1751 unsigned int i;
1752 for (i = 1; i < n; i++)
1753 addtok (dfa, dfa->utf8_anychar_classes[i]);
1754 while (--i > 1)
1756 addtok (dfa, dfa->utf8_anychar_classes[0]);
1757 addtok (dfa, CAT);
1758 addtok (dfa, OR);
1762 /* The grammar understood by the parser is as follows.
1764 regexp:
1765 regexp OR branch
1766 branch
1768 branch:
1769 branch closure
1770 closure
1772 closure:
1773 closure QMARK
1774 closure STAR
1775 closure PLUS
1776 closure REPMN
1777 atom
1779 atom:
1780 <normal character>
1781 <multibyte character>
1782 ANYCHAR
1783 MBCSET
1784 CSET
1785 BACKREF
1786 BEGLINE
1787 ENDLINE
1788 BEGWORD
1789 ENDWORD
1790 LIMWORD
1791 NOTLIMWORD
1792 LPAREN regexp RPAREN
1793 <empty>
1795 The parser builds a parse tree in postfix form in an array of tokens. */
1797 static void
1798 atom (struct dfa *dfa)
1800 if (dfa->parse.tok == WCHAR)
1802 if (dfa->lex.wctok == WEOF)
1803 addtok (dfa, BACKREF);
1804 else
1806 addtok_wc (dfa, dfa->lex.wctok);
1808 if (dfa->syntax.case_fold)
1810 wchar_t folded[CASE_FOLDED_BUFSIZE];
1811 unsigned int n = case_folded_counterparts (dfa->lex.wctok,
1812 folded);
1813 for (unsigned int i = 0; i < n; i++)
1815 addtok_wc (dfa, folded[i]);
1816 addtok (dfa, OR);
1821 dfa->parse.tok = lex (dfa);
1823 else if (dfa->parse.tok == ANYCHAR && dfa->localeinfo.using_utf8)
1825 /* For UTF-8 expand the period to a series of CSETs that define a valid
1826 UTF-8 character. This avoids using the slow multibyte path. I'm
1827 pretty sure it would be both profitable and correct to do it for
1828 any encoding; however, the optimization must be done manually as
1829 it is done above in add_utf8_anychar. So, let's start with
1830 UTF-8: it is the most used, and the structure of the encoding
1831 makes the correctness more obvious. */
1832 add_utf8_anychar (dfa);
1833 dfa->parse.tok = lex (dfa);
1835 else if ((0 <= dfa->parse.tok && dfa->parse.tok < NOTCHAR)
1836 || dfa->parse.tok >= CSET || dfa->parse.tok == BACKREF
1837 || dfa->parse.tok == BEGLINE || dfa->parse.tok == ENDLINE
1838 || dfa->parse.tok == BEGWORD || dfa->parse.tok == ANYCHAR
1839 || dfa->parse.tok == MBCSET || dfa->parse.tok == ENDWORD
1840 || dfa->parse.tok == LIMWORD || dfa->parse.tok == NOTLIMWORD)
1842 addtok (dfa, dfa->parse.tok);
1843 dfa->parse.tok = lex (dfa);
1845 else if (dfa->parse.tok == LPAREN)
1847 dfa->parse.tok = lex (dfa);
1848 regexp (dfa);
1849 if (dfa->parse.tok != RPAREN)
1850 dfaerror (_("unbalanced ("));
1851 dfa->parse.tok = lex (dfa);
1853 else
1854 addtok (dfa, EMPTY);
1857 /* Return the number of tokens in the given subexpression. */
1858 static size_t _GL_ATTRIBUTE_PURE
1859 nsubtoks (struct dfa const *dfa, size_t tindex)
1861 switch (dfa->tokens[tindex - 1])
1863 default:
1864 return 1;
1865 case QMARK:
1866 case STAR:
1867 case PLUS:
1868 return 1 + nsubtoks (dfa, tindex - 1);
1869 case CAT:
1870 case OR:
1872 size_t ntoks1 = nsubtoks (dfa, tindex - 1);
1873 return 1 + ntoks1 + nsubtoks (dfa, tindex - 1 - ntoks1);
1878 /* Copy the given subexpression to the top of the tree. */
1879 static void
1880 copytoks (struct dfa *dfa, size_t tindex, size_t ntokens)
1882 if (dfa->localeinfo.multibyte)
1883 for (size_t i = 0; i < ntokens; ++i)
1884 addtok_mb (dfa, dfa->tokens[tindex + i],
1885 dfa->multibyte_prop[tindex + i]);
1886 else
1887 for (size_t i = 0; i < ntokens; ++i)
1888 addtok_mb (dfa, dfa->tokens[tindex + i], 3);
1891 static void
1892 closure (struct dfa *dfa)
1894 atom (dfa);
1895 while (dfa->parse.tok == QMARK || dfa->parse.tok == STAR
1896 || dfa->parse.tok == PLUS || dfa->parse.tok == REPMN)
1897 if (dfa->parse.tok == REPMN && (dfa->lex.minrep || dfa->lex.maxrep))
1899 size_t ntokens = nsubtoks (dfa, dfa->tindex);
1900 size_t tindex = dfa->tindex - ntokens;
1901 if (dfa->lex.maxrep < 0)
1902 addtok (dfa, PLUS);
1903 if (dfa->lex.minrep == 0)
1904 addtok (dfa, QMARK);
1905 int i;
1906 for (i = 1; i < dfa->lex.minrep; i++)
1908 copytoks (dfa, tindex, ntokens);
1909 addtok (dfa, CAT);
1911 for (; i < dfa->lex.maxrep; i++)
1913 copytoks (dfa, tindex, ntokens);
1914 addtok (dfa, QMARK);
1915 addtok (dfa, CAT);
1917 dfa->parse.tok = lex (dfa);
1919 else if (dfa->parse.tok == REPMN)
1921 dfa->tindex -= nsubtoks (dfa, dfa->tindex);
1922 dfa->parse.tok = lex (dfa);
1923 closure (dfa);
1925 else
1927 addtok (dfa, dfa->parse.tok);
1928 dfa->parse.tok = lex (dfa);
1932 static void
1933 branch (struct dfa* dfa)
1935 closure (dfa);
1936 while (dfa->parse.tok != RPAREN && dfa->parse.tok != OR
1937 && dfa->parse.tok >= 0)
1939 closure (dfa);
1940 addtok (dfa, CAT);
1944 static void
1945 regexp (struct dfa *dfa)
1947 branch (dfa);
1948 while (dfa->parse.tok == OR)
1950 dfa->parse.tok = lex (dfa);
1951 branch (dfa);
1952 addtok (dfa, OR);
1956 /* Main entry point for the parser. S is a string to be parsed, len is the
1957 length of the string, so s can include NUL characters. D is a pointer to
1958 the struct dfa to parse into. */
1959 static void
1960 dfaparse (char const *s, size_t len, struct dfa *d)
1962 d->lex.ptr = s;
1963 d->lex.left = len;
1964 d->lex.lasttok = END;
1965 d->lex.laststart = true;
1967 if (!d->syntax.syntax_bits_set)
1968 dfaerror (_("no syntax specified"));
1970 d->parse.tok = lex (d);
1971 d->parse.depth = d->depth;
1973 regexp (d);
1975 if (d->parse.tok != END)
1976 dfaerror (_("unbalanced )"));
1978 addtok (d, END - d->nregexps);
1979 addtok (d, CAT);
1981 if (d->nregexps)
1982 addtok (d, OR);
1984 ++d->nregexps;
1987 /* Some primitives for operating on sets of positions. */
1989 /* Copy one set to another. */
1990 static void
1991 copy (position_set const *src, position_set *dst)
1993 if (dst->alloc < src->nelem)
1995 free (dst->elems);
1996 dst->elems = xpalloc (NULL, &dst->alloc, src->nelem - dst->alloc, -1,
1997 sizeof *dst->elems);
1999 dst->nelem = src->nelem;
2000 if (src->nelem != 0)
2001 memcpy (dst->elems, src->elems, src->nelem * sizeof *dst->elems);
2004 static void
2005 alloc_position_set (position_set *s, size_t size)
2007 s->elems = xnmalloc (size, sizeof *s->elems);
2008 s->alloc = size;
2009 s->nelem = 0;
2012 /* Insert position P in set S. S is maintained in sorted order on
2013 decreasing index. If there is already an entry in S with P.index
2014 then merge (logically-OR) P's constraints into the one in S.
2015 S->elems must point to an array large enough to hold the resulting set. */
2016 static void
2017 insert (position p, position_set *s)
2019 ptrdiff_t count = s->nelem;
2020 ptrdiff_t lo = 0, hi = count;
2021 while (lo < hi)
2023 ptrdiff_t mid = (lo + hi) >> 1;
2024 if (s->elems[mid].index > p.index)
2025 lo = mid + 1;
2026 else if (s->elems[mid].index == p.index)
2028 s->elems[mid].constraint |= p.constraint;
2029 return;
2031 else
2032 hi = mid;
2035 s->elems = maybe_realloc (s->elems, count, &s->alloc, -1, sizeof *s->elems);
2036 for (ptrdiff_t i = count; i > lo; i--)
2037 s->elems[i] = s->elems[i - 1];
2038 s->elems[lo] = p;
2039 ++s->nelem;
2042 /* Merge S1 and S2 (with the additional constraint C2) into M. The
2043 result is as if the positions of S1, and of S2 with the additional
2044 constraint C2, were inserted into an initially empty set. */
2045 static void
2046 merge_constrained (position_set const *s1, position_set const *s2,
2047 unsigned int c2, position_set *m)
2049 ptrdiff_t i = 0, j = 0;
2051 if (m->alloc - s1->nelem < s2->nelem)
2053 free (m->elems);
2054 m->alloc = s1->nelem;
2055 m->elems = xpalloc (NULL, &m->alloc, s2->nelem, -1, sizeof *m->elems);
2057 m->nelem = 0;
2058 while (i < s1->nelem || j < s2->nelem)
2059 if (! (j < s2->nelem)
2060 || (i < s1->nelem && s1->elems[i].index >= s2->elems[j].index))
2062 unsigned int c = ((i < s1->nelem && j < s2->nelem
2063 && s1->elems[i].index == s2->elems[j].index)
2064 ? s2->elems[j++].constraint & c2
2065 : 0);
2066 m->elems[m->nelem].index = s1->elems[i].index;
2067 m->elems[m->nelem++].constraint = s1->elems[i++].constraint | c;
2069 else
2071 if (s2->elems[j].constraint & c2)
2073 m->elems[m->nelem].index = s2->elems[j].index;
2074 m->elems[m->nelem++].constraint = s2->elems[j].constraint & c2;
2076 j++;
2080 /* Merge two sets of positions into a third. The result is exactly as if
2081 the positions of both sets were inserted into an initially empty set. */
2082 static void
2083 merge (position_set const *s1, position_set const *s2, position_set *m)
2085 merge_constrained (s1, s2, -1, m);
2088 /* Delete a position from a set. Return the nonzero constraint of the
2089 deleted position, or zero if there was no such position. */
2090 static unsigned int
2091 delete (size_t del, position_set *s)
2093 size_t count = s->nelem;
2094 size_t lo = 0, hi = count;
2095 while (lo < hi)
2097 size_t mid = (lo + hi) >> 1;
2098 if (s->elems[mid].index > del)
2099 lo = mid + 1;
2100 else if (s->elems[mid].index == del)
2102 unsigned int c = s->elems[mid].constraint;
2103 size_t i;
2104 for (i = mid; i + 1 < count; i++)
2105 s->elems[i] = s->elems[i + 1];
2106 s->nelem = i;
2107 return c;
2109 else
2110 hi = mid;
2112 return 0;
2115 /* Replace a position with the followed set. */
2116 static void
2117 replace (position_set *dst, size_t del, position_set *add,
2118 unsigned int constraint, position_set *tmp)
2120 unsigned int c = delete (del, dst) & constraint;
2122 if (c)
2124 copy (dst, tmp);
2125 merge_constrained (tmp, add, c, dst);
2129 /* Find the index of the state corresponding to the given position set with
2130 the given preceding context, or create a new state if there is no such
2131 state. Context tells whether we got here on a newline or letter. */
2132 static state_num
2133 state_index (struct dfa *d, position_set const *s, int context)
2135 size_t hash = 0;
2136 int constraint = 0;
2137 state_num i;
2138 token first_end = 0;
2140 for (i = 0; i < s->nelem; ++i)
2141 hash ^= s->elems[i].index + s->elems[i].constraint;
2143 /* Try to find a state that exactly matches the proposed one. */
2144 for (i = 0; i < d->sindex; ++i)
2146 if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem
2147 || context != d->states[i].context)
2148 continue;
2149 state_num j;
2150 for (j = 0; j < s->nelem; ++j)
2151 if (s->elems[j].constraint != d->states[i].elems.elems[j].constraint
2152 || s->elems[j].index != d->states[i].elems.elems[j].index)
2153 break;
2154 if (j == s->nelem)
2155 return i;
2158 #ifdef DEBUG
2159 fprintf (stderr, "new state %zd\n nextpos:", i);
2160 for (state_num j = 0; j < s->nelem; j++)
2162 fprintf (stderr, " %zu:", s->elems[j].index);
2163 prtok (d->tokens[s->elems[j].index]);
2165 fprintf (stderr, "\n context:");
2166 if (context ^ CTX_ANY)
2168 if (context & CTX_NONE)
2169 fprintf (stderr, " CTX_NONE");
2170 if (context & CTX_LETTER)
2171 fprintf (stderr, " CTX_LETTER");
2172 if (context & CTX_NEWLINE)
2173 fprintf (stderr, " CTX_NEWLINE");
2175 else
2176 fprintf (stderr, " CTX_ANY");
2177 fprintf (stderr, "\n");
2178 #endif
2180 for (state_num j = 0; j < s->nelem; j++)
2182 int c = s->elems[j].constraint;
2183 if (d->tokens[s->elems[j].index] < 0)
2185 if (succeeds_in_context (c, context, CTX_ANY))
2186 constraint |= c;
2187 if (!first_end)
2188 first_end = d->tokens[s->elems[j].index];
2190 else if (d->tokens[s->elems[j].index] == BACKREF)
2191 constraint = NO_CONSTRAINT;
2195 /* Create a new state. */
2196 d->states = maybe_realloc (d->states, d->sindex, &d->salloc, -1,
2197 sizeof *d->states);
2198 d->states[i].hash = hash;
2199 alloc_position_set (&d->states[i].elems, s->nelem);
2200 copy (s, &d->states[i].elems);
2201 d->states[i].context = context;
2202 d->states[i].constraint = constraint;
2203 d->states[i].first_end = first_end;
2204 d->states[i].mbps.nelem = 0;
2205 d->states[i].mbps.elems = NULL;
2206 d->states[i].mb_trindex = -1;
2208 ++d->sindex;
2210 return i;
2213 /* Find the epsilon closure of a set of positions. If any position of the set
2214 contains a symbol that matches the empty string in some context, replace
2215 that position with the elements of its follow labeled with an appropriate
2216 constraint. Repeat exhaustively until no funny positions are left.
2217 S->elems must be large enough to hold the result. */
2218 static void
2219 epsclosure (position_set *initial, struct dfa const *d)
2221 position_set tmp;
2222 alloc_position_set (&tmp, d->nleaves);
2223 for (size_t i = 0; i < d->tindex; ++i)
2224 if (d->follows[i].nelem > 0 && d->tokens[i] >= NOTCHAR
2225 && d->tokens[i] != BACKREF && d->tokens[i] != ANYCHAR
2226 && d->tokens[i] != MBCSET && d->tokens[i] < CSET)
2228 unsigned int constraint;
2229 switch (d->tokens[i])
2231 case BEGLINE:
2232 constraint = BEGLINE_CONSTRAINT;
2233 break;
2234 case ENDLINE:
2235 constraint = ENDLINE_CONSTRAINT;
2236 break;
2237 case BEGWORD:
2238 constraint = BEGWORD_CONSTRAINT;
2239 break;
2240 case ENDWORD:
2241 constraint = ENDWORD_CONSTRAINT;
2242 break;
2243 case LIMWORD:
2244 constraint = LIMWORD_CONSTRAINT;
2245 break;
2246 case NOTLIMWORD:
2247 constraint = NOTLIMWORD_CONSTRAINT;
2248 break;
2249 default:
2250 constraint = NO_CONSTRAINT;
2251 break;
2254 delete (i, &d->follows[i]);
2256 for (size_t j = 0; j < d->tindex; j++)
2257 if (i != j && d->follows[j].nelem > 0)
2258 replace (&d->follows[j], i, &d->follows[i], constraint, &tmp);
2260 replace (initial, i, &d->follows[i], constraint, &tmp);
2262 free (tmp.elems);
2265 /* Returns the set of contexts for which there is at least one
2266 character included in C. */
2268 static int
2269 charclass_context (struct dfa const *dfa, charclass const *c)
2271 int context = 0;
2273 for (unsigned int j = 0; j < CHARCLASS_WORDS; ++j)
2275 if (c->w[j] & dfa->syntax.newline.w[j])
2276 context |= CTX_NEWLINE;
2277 if (c->w[j] & dfa->syntax.letters.w[j])
2278 context |= CTX_LETTER;
2279 if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
2280 context |= CTX_NONE;
2283 return context;
2286 /* Returns the contexts on which the position set S depends. Each context
2287 in the set of returned contexts (let's call it SC) may have a different
2288 follow set than other contexts in SC, and also different from the
2289 follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2290 in the complement set will have the same follow set. */
2292 static int _GL_ATTRIBUTE_PURE
2293 state_separate_contexts (position_set const *s)
2295 int separate_contexts = 0;
2297 for (size_t j = 0; j < s->nelem; j++)
2299 if (prev_newline_dependent (s->elems[j].constraint))
2300 separate_contexts |= CTX_NEWLINE;
2301 if (prev_letter_dependent (s->elems[j].constraint))
2302 separate_contexts |= CTX_LETTER;
2305 return separate_contexts;
2309 /* Perform bottom-up analysis on the parse tree, computing various functions.
2310 Note that at this point, we're pretending constructs like \< are real
2311 characters rather than constraints on what can follow them.
2313 Nullable: A node is nullable if it is at the root of a regexp that can
2314 match the empty string.
2315 * EMPTY leaves are nullable.
2316 * No other leaf is nullable.
2317 * A QMARK or STAR node is nullable.
2318 * A PLUS node is nullable if its argument is nullable.
2319 * A CAT node is nullable if both its arguments are nullable.
2320 * An OR node is nullable if either argument is nullable.
2322 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2323 that could correspond to the first character of a string matching the
2324 regexp rooted at the given node.
2325 * EMPTY leaves have empty firstpos.
2326 * The firstpos of a nonempty leaf is that leaf itself.
2327 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2328 argument.
2329 * The firstpos of a CAT node is the firstpos of the left argument, union
2330 the firstpos of the right if the left argument is nullable.
2331 * The firstpos of an OR node is the union of firstpos of each argument.
2333 Lastpos: The lastpos of a node is the set of positions that could
2334 correspond to the last character of a string matching the regexp at
2335 the given node.
2336 * EMPTY leaves have empty lastpos.
2337 * The lastpos of a nonempty leaf is that leaf itself.
2338 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2339 argument.
2340 * The lastpos of a CAT node is the lastpos of its right argument, union
2341 the lastpos of the left if the right argument is nullable.
2342 * The lastpos of an OR node is the union of the lastpos of each argument.
2344 Follow: The follow of a position is the set of positions that could
2345 correspond to the character following a character matching the node in
2346 a string matching the regexp. At this point we consider special symbols
2347 that match the empty string in some context to be just normal characters.
2348 Later, if we find that a special symbol is in a follow set, we will
2349 replace it with the elements of its follow, labeled with an appropriate
2350 constraint.
2351 * Every node in the firstpos of the argument of a STAR or PLUS node is in
2352 the follow of every node in the lastpos.
2353 * Every node in the firstpos of the second argument of a CAT node is in
2354 the follow of every node in the lastpos of the first argument.
2356 Because of the postfix representation of the parse tree, the depth-first
2357 analysis is conveniently done by a linear scan with the aid of a stack.
2358 Sets are stored as arrays of the elements, obeying a stack-like allocation
2359 scheme; the number of elements in each set deeper in the stack can be
2360 used to determine the address of a particular set's array. */
2361 static void
2362 dfaanalyze (struct dfa *d, bool searchflag)
2364 /* Array allocated to hold position sets. */
2365 position *posalloc = xnmalloc (d->nleaves, 2 * sizeof *posalloc);
2366 /* Firstpos and lastpos elements. */
2367 position *firstpos = posalloc + d->nleaves;
2368 position *lastpos = firstpos + d->nleaves;
2370 /* Stack for element counts and nullable flags. */
2371 struct
2373 /* Whether the entry is nullable. */
2374 bool nullable;
2376 /* Counts of firstpos and lastpos sets. */
2377 size_t nfirstpos;
2378 size_t nlastpos;
2379 } *stkalloc = xnmalloc (d->depth, sizeof *stkalloc), *stk = stkalloc;
2381 position_set merged; /* Result of merging sets. */
2383 #ifdef DEBUG
2384 fprintf (stderr, "dfaanalyze:\n");
2385 for (size_t i = 0; i < d->tindex; ++i)
2387 fprintf (stderr, " %zu:", i);
2388 prtok (d->tokens[i]);
2390 putc ('\n', stderr);
2391 #endif
2393 d->searchflag = searchflag;
2394 alloc_position_set (&merged, d->nleaves);
2395 d->follows = xcalloc (d->tindex, sizeof *d->follows);
2397 for (size_t i = 0; i < d->tindex; ++i)
2399 switch (d->tokens[i])
2401 case EMPTY:
2402 /* The empty set is nullable. */
2403 stk->nullable = true;
2405 /* The firstpos and lastpos of the empty leaf are both empty. */
2406 stk->nfirstpos = stk->nlastpos = 0;
2407 stk++;
2408 break;
2410 case STAR:
2411 case PLUS:
2412 /* Every element in the firstpos of the argument is in the follow
2413 of every element in the lastpos. */
2415 position_set tmp;
2416 tmp.nelem = stk[-1].nfirstpos;
2417 tmp.elems = firstpos;
2418 position *pos = lastpos;
2419 for (size_t j = 0; j < stk[-1].nlastpos; j++)
2421 merge (&tmp, &d->follows[pos[j].index], &merged);
2422 copy (&merged, &d->follows[pos[j].index]);
2425 FALLTHROUGH;
2426 case QMARK:
2427 /* A QMARK or STAR node is automatically nullable. */
2428 if (d->tokens[i] != PLUS)
2429 stk[-1].nullable = true;
2430 break;
2432 case CAT:
2433 /* Every element in the firstpos of the second argument is in the
2434 follow of every element in the lastpos of the first argument. */
2436 position_set tmp;
2437 tmp.nelem = stk[-1].nfirstpos;
2438 tmp.elems = firstpos;
2439 position *pos = lastpos + stk[-1].nlastpos;
2440 for (size_t j = 0; j < stk[-2].nlastpos; j++)
2442 merge (&tmp, &d->follows[pos[j].index], &merged);
2443 copy (&merged, &d->follows[pos[j].index]);
2447 /* The firstpos of a CAT node is the firstpos of the first argument,
2448 union that of the second argument if the first is nullable. */
2449 if (stk[-2].nullable)
2450 stk[-2].nfirstpos += stk[-1].nfirstpos;
2451 else
2452 firstpos += stk[-1].nfirstpos;
2454 /* The lastpos of a CAT node is the lastpos of the second argument,
2455 union that of the first argument if the second is nullable. */
2456 if (stk[-1].nullable)
2457 stk[-2].nlastpos += stk[-1].nlastpos;
2458 else
2460 position *pos = lastpos + stk[-2].nlastpos;
2461 for (size_t j = stk[-1].nlastpos; j-- > 0;)
2462 pos[j] = lastpos[j];
2463 lastpos += stk[-2].nlastpos;
2464 stk[-2].nlastpos = stk[-1].nlastpos;
2467 /* A CAT node is nullable if both arguments are nullable. */
2468 stk[-2].nullable &= stk[-1].nullable;
2469 stk--;
2470 break;
2472 case OR:
2473 /* The firstpos is the union of the firstpos of each argument. */
2474 stk[-2].nfirstpos += stk[-1].nfirstpos;
2476 /* The lastpos is the union of the lastpos of each argument. */
2477 stk[-2].nlastpos += stk[-1].nlastpos;
2479 /* An OR node is nullable if either argument is nullable. */
2480 stk[-2].nullable |= stk[-1].nullable;
2481 stk--;
2482 break;
2484 default:
2485 /* Anything else is a nonempty position. (Note that special
2486 constructs like \< are treated as nonempty strings here;
2487 an "epsilon closure" effectively makes them nullable later.
2488 Backreferences have to get a real position so we can detect
2489 transitions on them later. But they are nullable. */
2490 stk->nullable = d->tokens[i] == BACKREF;
2492 /* This position is in its own firstpos and lastpos. */
2493 stk->nfirstpos = stk->nlastpos = 1;
2494 stk++;
2496 --firstpos, --lastpos;
2497 firstpos->index = lastpos->index = i;
2498 firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
2500 break;
2502 #ifdef DEBUG
2503 /* ... balance the above nonsyntactic #ifdef goo... */
2504 fprintf (stderr, "node %zu:", i);
2505 prtok (d->tokens[i]);
2506 putc ('\n', stderr);
2507 fprintf (stderr,
2508 stk[-1].nullable ? " nullable: yes\n" : " nullable: no\n");
2509 fprintf (stderr, " firstpos:");
2510 for (size_t j = stk[-1].nfirstpos; j-- > 0;)
2512 fprintf (stderr, " %zu:", firstpos[j].index);
2513 prtok (d->tokens[firstpos[j].index]);
2515 fprintf (stderr, "\n lastpos:");
2516 for (size_t j = stk[-1].nlastpos; j-- > 0;)
2518 fprintf (stderr, " %zu:", lastpos[j].index);
2519 prtok (d->tokens[lastpos[j].index]);
2521 putc ('\n', stderr);
2522 #endif
2525 #ifdef DEBUG
2526 for (size_t i = 0; i < d->tindex; ++i)
2527 if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
2528 || d->tokens[i] == ANYCHAR || d->tokens[i] == MBCSET
2529 || d->tokens[i] >= CSET)
2531 fprintf (stderr, "follows(%zu:", i);
2532 prtok (d->tokens[i]);
2533 fprintf (stderr, "):");
2534 for (size_t j = d->follows[i].nelem; j-- > 0;)
2536 fprintf (stderr, " %zu:", d->follows[i].elems[j].index);
2537 prtok (d->tokens[d->follows[i].elems[j].index]);
2539 putc ('\n', stderr);
2541 #endif
2543 /* Get the epsilon closure of the firstpos of the regexp. The result will
2544 be the set of positions of state 0. */
2545 merged.nelem = 0;
2546 for (size_t i = 0; i < stk[-1].nfirstpos; ++i)
2547 insert (firstpos[i], &merged);
2549 /* For each follow set that is the follow set of a real position, replace
2550 it with its epsilon closure. */
2551 epsclosure (&merged, d);
2553 /* Context wanted by some position. */
2554 int separate_contexts = state_separate_contexts (&merged);
2556 /* Build the initial state. */
2557 if (separate_contexts & CTX_NEWLINE)
2558 state_index (d, &merged, CTX_NEWLINE);
2559 d->initstate_notbol = d->min_trcount
2560 = state_index (d, &merged, separate_contexts ^ CTX_ANY);
2561 if (separate_contexts & CTX_LETTER)
2562 d->min_trcount = state_index (d, &merged, CTX_LETTER);
2563 d->min_trcount++;
2564 d->trcount = 0;
2566 free (posalloc);
2567 free (stkalloc);
2568 free (merged.elems);
2571 /* Make sure D's state arrays are large enough to hold NEW_STATE. */
2572 static void
2573 realloc_trans_if_necessary (struct dfa *d)
2575 state_num oldalloc = d->tralloc;
2576 if (oldalloc < d->sindex)
2578 state_num **realtrans = d->trans ? d->trans - 2 : NULL;
2579 ptrdiff_t newalloc1 = realtrans ? d->tralloc + 2 : 0;
2580 realtrans = xpalloc (realtrans, &newalloc1, d->sindex - oldalloc,
2581 -1, sizeof *realtrans);
2582 realtrans[0] = realtrans[1] = NULL;
2583 d->trans = realtrans + 2;
2584 ptrdiff_t newalloc = d->tralloc = newalloc1 - 2;
2585 d->fails = xnrealloc (d->fails, newalloc, sizeof *d->fails);
2586 d->success = xnrealloc (d->success, newalloc, sizeof *d->success);
2587 d->newlines = xnrealloc (d->newlines, newalloc, sizeof *d->newlines);
2588 if (d->localeinfo.multibyte)
2590 realtrans = d->mb_trans ? d->mb_trans - 2 : NULL;
2591 realtrans = xnrealloc (realtrans, newalloc1, sizeof *realtrans);
2592 if (oldalloc == 0)
2593 realtrans[0] = realtrans[1] = NULL;
2594 d->mb_trans = realtrans + 2;
2596 for (; oldalloc < newalloc; oldalloc++)
2598 d->trans[oldalloc] = NULL;
2599 d->fails[oldalloc] = NULL;
2600 if (d->localeinfo.multibyte)
2601 d->mb_trans[oldalloc] = NULL;
2607 Calculate the transition table for a new state derived from state s
2608 for a compiled dfa d after input character uc, and return the new
2609 state number.
2611 Do not worry about all possible input characters; calculate just the group
2612 of positions that match uc. Label it with the set of characters that
2613 every position in the group matches (taking into account, if necessary,
2614 preceding context information of s). Then find the union
2615 of these positions' follows, i.e., the set of positions of the
2616 new state. For each character in the group's label, set the transition
2617 on this character to be to a state corresponding to the set's positions,
2618 and its associated backward context information, if necessary.
2620 When building a searching matcher, include the positions of state
2621 0 in every state.
2623 The group is constructed by building an equivalence-class
2624 partition of the positions of s.
2626 For each position, find the set of characters C that it matches. Eliminate
2627 any characters from C that fail on grounds of backward context.
2629 Check whether the group's label L has nonempty
2630 intersection with C. If L - C is nonempty, create a new group labeled
2631 L - C and having the same positions as the current group, and set L to
2632 the intersection of L and C. Insert the position in the group, set
2633 C = C - L, and resume scanning.
2635 If after comparing with every group there are characters remaining in C,
2636 create a new group labeled with the characters of C and insert this
2637 position in that group. */
2639 static state_num
2640 build_state (state_num s, struct dfa *d, unsigned char uc)
2642 position_set follows; /* Union of the follows of the group. */
2643 position_set tmp; /* Temporary space for merging sets. */
2644 state_num state; /* New state. */
2645 state_num state_newline; /* New state on a newline transition. */
2646 state_num state_letter; /* New state on a letter transition. */
2648 #ifdef DEBUG
2649 fprintf (stderr, "build state %td\n", s);
2650 #endif
2652 /* A pointer to the new transition table, and the table itself. */
2653 state_num **ptrans = (accepting (s, d) ? d->fails : d->trans) + s;
2654 state_num *trans = *ptrans;
2656 if (!trans)
2658 /* MAX_TRCOUNT is an arbitrary upper limit on the number of
2659 transition tables that can exist at once, other than for
2660 initial states. Often-used transition tables are quickly
2661 rebuilt, whereas rarely-used ones are cleared away. */
2662 if (MAX_TRCOUNT <= d->trcount)
2664 for (state_num i = d->min_trcount; i < d->tralloc; i++)
2666 free (d->trans[i]);
2667 free (d->fails[i]);
2668 d->trans[i] = d->fails[i] = NULL;
2670 d->trcount = 0;
2673 d->trcount++;
2674 *ptrans = trans = xmalloc (NOTCHAR * sizeof *trans);
2676 /* Fill transition table with a default value which means that the
2677 transited state has not been calculated yet. */
2678 for (int i = 0; i < NOTCHAR; i++)
2679 trans[i] = -2;
2682 /* Set up the success bits for this state. */
2683 d->success[s] = 0;
2684 if (accepts_in_context (d->states[s].context, CTX_NEWLINE, s, d))
2685 d->success[s] |= CTX_NEWLINE;
2686 if (accepts_in_context (d->states[s].context, CTX_LETTER, s, d))
2687 d->success[s] |= CTX_LETTER;
2688 if (accepts_in_context (d->states[s].context, CTX_NONE, s, d))
2689 d->success[s] |= CTX_NONE;
2691 /* Positions that match the input char. */
2692 leaf_set group;
2693 group.elems = xnmalloc (d->nleaves, sizeof *group.elems);
2694 group.nelem = 0;
2696 /* The group's label. */
2697 charclass label;
2698 fillset (&label);
2700 for (size_t i = 0; i < d->states[s].elems.nelem; ++i)
2702 charclass matches; /* Set of matching characters. */
2703 position pos = d->states[s].elems.elems[i];
2704 bool matched = false;
2705 if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
2707 zeroset (&matches);
2708 setbit (d->tokens[pos.index], &matches);
2709 if (d->tokens[pos.index] == uc)
2710 matched = true;
2712 else if (d->tokens[pos.index] >= CSET)
2714 matches = d->charclasses[d->tokens[pos.index] - CSET];
2715 if (tstbit (uc, &matches))
2716 matched = true;
2718 else if (d->tokens[pos.index] == ANYCHAR)
2720 matches = d->charclasses[d->canychar];
2721 if (tstbit (uc, &matches))
2722 matched = true;
2724 /* ANYCHAR must match with a single character, so we must put
2725 it to D->states[s].mbps which contains the positions which
2726 can match with a single character not a byte. If all
2727 positions which has ANYCHAR does not depend on context of
2728 next character, we put the follows instead of it to
2729 D->states[s].mbps to optimize. */
2730 if (succeeds_in_context (pos.constraint, d->states[s].context,
2731 CTX_NONE))
2733 if (d->states[s].mbps.nelem == 0)
2734 alloc_position_set (&d->states[s].mbps,
2735 d->follows[pos.index].nelem);
2736 for (size_t j = 0; j < d->follows[pos.index].nelem; j++)
2737 insert (d->follows[pos.index].elems[j], &d->states[s].mbps);
2740 else
2741 continue;
2743 /* Some characters may need to be eliminated from matches because
2744 they fail in the current context. */
2745 if (pos.constraint != NO_CONSTRAINT)
2747 if (!succeeds_in_context (pos.constraint,
2748 d->states[s].context, CTX_NEWLINE))
2749 for (size_t j = 0; j < CHARCLASS_WORDS; ++j)
2750 matches.w[j] &= ~d->syntax.newline.w[j];
2751 if (!succeeds_in_context (pos.constraint,
2752 d->states[s].context, CTX_LETTER))
2753 for (size_t j = 0; j < CHARCLASS_WORDS; ++j)
2754 matches.w[j] &= ~d->syntax.letters.w[j];
2755 if (!succeeds_in_context (pos.constraint,
2756 d->states[s].context, CTX_NONE))
2757 for (size_t j = 0; j < CHARCLASS_WORDS; ++j)
2758 matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
2760 /* If there are no characters left, there's no point in going on. */
2761 if (emptyset (&matches))
2762 continue;
2764 /* If we have reset the bit that made us declare "matched", reset
2765 that indicator, too. This is required to avoid an infinite loop
2766 with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]' */
2767 if (!tstbit (uc, &matches))
2768 matched = false;
2771 #ifdef DEBUG
2772 fprintf (stderr, " nextpos %zu:", pos.index);
2773 prtok (d->tokens[pos.index]);
2774 fprintf (stderr, " of");
2775 for (size_t j = 0; j < NOTCHAR; j++)
2776 if (tstbit (j, &matches))
2777 fprintf (stderr, " 0x%02zx", j);
2778 fprintf (stderr, "\n");
2779 #endif
2781 if (matched)
2783 for (size_t k = 0; k < CHARCLASS_WORDS; ++k)
2784 label.w[k] &= matches.w[k];
2785 group.elems[group.nelem++] = pos.index;
2787 else
2789 for (size_t k = 0; k < CHARCLASS_WORDS; ++k)
2790 label.w[k] &= ~matches.w[k];
2794 alloc_position_set (&follows, d->nleaves);
2795 alloc_position_set (&tmp, d->nleaves);
2797 if (group.nelem > 0)
2799 follows.nelem = 0;
2801 /* Find the union of the follows of the positions of the group.
2802 This is a hideously inefficient loop. Fix it someday. */
2803 for (size_t j = 0; j < group.nelem; ++j)
2804 for (size_t k = 0; k < d->follows[group.elems[j]].nelem; ++k)
2805 insert (d->follows[group.elems[j]].elems[k], &follows);
2807 /* If we are building a searching matcher, throw in the positions
2808 of state 0 as well, if possible. */
2809 if (d->searchflag)
2811 /* If a token in follows.elems is not 1st byte of a multibyte
2812 character, or the states of follows must accept the bytes
2813 which are not 1st byte of the multibyte character.
2814 Then, if a state of follows encounters a byte, it must not be
2815 a 1st byte of a multibyte character nor a single byte character.
2816 In this case, do not add state[0].follows to next state, because
2817 state[0] must accept 1st-byte.
2819 For example, suppose <sb a> is a certain single byte character,
2820 <mb A> is a certain multibyte character, and the codepoint of
2821 <sb a> equals the 2nd byte of the codepoint of <mb A>. When
2822 state[0] accepts <sb a>, state[i] transits to state[i+1] by
2823 accepting the 1st byte of <mb A>, and state[i+1] accepts the
2824 2nd byte of <mb A>, if state[i+1] encounters the codepoint of
2825 <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
2826 not add state[0]. */
2828 bool mergeit = !d->localeinfo.multibyte;
2829 if (!mergeit)
2831 mergeit = true;
2832 for (size_t j = 0; mergeit && j < follows.nelem; j++)
2833 mergeit &= d->multibyte_prop[follows.elems[j].index];
2835 if (mergeit)
2837 merge (&d->states[0].elems, &follows, &tmp);
2838 copy (&tmp, &follows);
2842 /* Find out if the new state will want any context information,
2843 by calculating possible contexts that the group can match,
2844 and separate contexts that the new state wants to know. */
2845 int possible_contexts = charclass_context (d, &label);
2846 int separate_contexts = state_separate_contexts (&follows);
2848 /* Find the state(s) corresponding to the union of the follows. */
2849 if (possible_contexts & ~separate_contexts)
2850 state = state_index (d, &follows, separate_contexts ^ CTX_ANY);
2851 else
2852 state = -1;
2853 if (separate_contexts & possible_contexts & CTX_NEWLINE)
2854 state_newline = state_index (d, &follows, CTX_NEWLINE);
2855 else
2856 state_newline = state;
2857 if (separate_contexts & possible_contexts & CTX_LETTER)
2858 state_letter = state_index (d, &follows, CTX_LETTER);
2859 else
2860 state_letter = state;
2862 /* Reallocate now, to reallocate any newline transition properly. */
2863 realloc_trans_if_necessary (d);
2866 /* If we are a searching matcher, the default transition is to a state
2867 containing the positions of state 0, otherwise the default transition
2868 is to fail miserably. */
2869 else if (d->searchflag)
2871 state_newline = 0;
2872 state_letter = d->min_trcount - 1;
2873 state = d->initstate_notbol;
2875 else
2877 state_newline = -1;
2878 state_letter = -1;
2879 state = -1;
2882 /* Set the transitions for each character in the label. */
2883 for (size_t i = 0; i < NOTCHAR; i++)
2884 if (tstbit (i, &label))
2885 switch (d->syntax.sbit[i])
2887 case CTX_NEWLINE:
2888 trans[i] = state_newline;
2889 break;
2890 case CTX_LETTER:
2891 trans[i] = state_letter;
2892 break;
2893 default:
2894 trans[i] = state;
2895 break;
2898 #ifdef DEBUG
2899 fprintf (stderr, "trans table %td", s);
2900 for (size_t i = 0; i < NOTCHAR; ++i)
2902 if (!(i & 0xf))
2903 fprintf (stderr, "\n");
2904 fprintf (stderr, " %2td", trans[i]);
2906 fprintf (stderr, "\n");
2907 #endif
2909 free (group.elems);
2910 free (follows.elems);
2911 free (tmp.elems);
2913 /* Keep the newline transition in a special place so we can use it as
2914 a sentinel. */
2915 if (tstbit (d->syntax.eolbyte, &label))
2917 d->newlines[s] = trans[d->syntax.eolbyte];
2918 trans[d->syntax.eolbyte] = -1;
2921 return trans[uc];
2924 /* Multibyte character handling sub-routines for dfaexec. */
2926 /* Consume a single byte and transit state from 's' to '*next_state'.
2927 This function is almost same as the state transition routin in dfaexec.
2928 But state transition is done just once, otherwise matching succeed or
2929 reach the end of the buffer. */
2930 static state_num
2931 transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const **pp)
2933 state_num *t;
2935 if (d->trans[s])
2936 t = d->trans[s];
2937 else if (d->fails[s])
2938 t = d->fails[s];
2939 else
2941 build_state (s, d, **pp);
2942 if (d->trans[s])
2943 t = d->trans[s];
2944 else
2946 t = d->fails[s];
2947 assert (t);
2951 if (t[**pp] == -2)
2952 build_state (s, d, **pp);
2954 return t[*(*pp)++];
2957 /* Transit state from s, then return new state and update the pointer of
2958 the buffer. This function is for a period operator which can match a
2959 multi-byte character. */
2960 static state_num
2961 transit_state (struct dfa *d, state_num s, unsigned char const **pp,
2962 unsigned char const *end)
2964 wint_t wc;
2966 int mbclen = mbs_to_wchar (&wc, (char const *) *pp, end - *pp, d);
2968 /* This state has some operators which can match a multibyte character. */
2969 d->mb_follows.nelem = 0;
2971 /* Calculate the state which can be reached from the state 's' by
2972 consuming 'mbclen' single bytes from the buffer. */
2973 state_num s1 = s;
2974 int mbci;
2975 for (mbci = 0; mbci < mbclen && (mbci == 0 || d->min_trcount <= s); mbci++)
2976 s = transit_state_singlebyte (d, s, pp);
2977 *pp += mbclen - mbci;
2979 if (wc == WEOF)
2981 /* It is an invalid character, so ANYCHAR is not accepted. */
2982 return s;
2985 /* If all positions which have ANYCHAR do not depend on the context
2986 of the next character, calculate the next state with
2987 pre-calculated follows and cache the result. */
2988 if (d->states[s1].mb_trindex < 0)
2990 if (MAX_TRCOUNT <= d->mb_trcount)
2992 state_num s3;
2993 for (s3 = -1; s3 < d->tralloc; s3++)
2995 free (d->mb_trans[s3]);
2996 d->mb_trans[s3] = NULL;
2999 for (state_num i = 0; i < d->sindex; i++)
3000 d->states[i].mb_trindex = -1;
3001 d->mb_trcount = 0;
3003 d->states[s1].mb_trindex = d->mb_trcount++;
3006 if (! d->mb_trans[s])
3008 enum { TRANSPTR_SIZE = sizeof *d->mb_trans[s] };
3009 enum { TRANSALLOC_SIZE = MAX_TRCOUNT * TRANSPTR_SIZE };
3010 d->mb_trans[s] = xmalloc (TRANSALLOC_SIZE);
3011 for (int i = 0; i < MAX_TRCOUNT; i++)
3012 d->mb_trans[s][i] = -1;
3014 else if (d->mb_trans[s][d->states[s1].mb_trindex] >= 0)
3015 return d->mb_trans[s][d->states[s1].mb_trindex];
3017 if (s == -1)
3018 copy (&d->states[s1].mbps, &d->mb_follows);
3019 else
3020 merge (&d->states[s1].mbps, &d->states[s].elems, &d->mb_follows);
3022 int separate_contexts = state_separate_contexts (&d->mb_follows);
3023 state_num s2 = state_index (d, &d->mb_follows, separate_contexts ^ CTX_ANY);
3024 realloc_trans_if_necessary (d);
3026 d->mb_trans[s][d->states[s1].mb_trindex] = s2;
3028 return s2;
3031 /* The initial state may encounter a byte which is not a single byte character
3032 nor the first byte of a multibyte character. But it is incorrect for the
3033 initial state to accept such a byte. For example, in Shift JIS the regular
3034 expression "\\" accepts the codepoint 0x5c, but should not accept the second
3035 byte of the codepoint 0x815c. Then the initial state must skip the bytes
3036 that are not a single byte character nor the first byte of a multibyte
3037 character.
3039 Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3040 or exceeds P, and return the advanced MBP. If WCP is non-NULL and
3041 the result is greater than P, set *WCP to the final wide character
3042 processed, or to WEOF if no wide character is processed. Otherwise,
3043 if WCP is non-NULL, *WCP may or may not be updated.
3045 Both P and MBP must be no larger than END. */
3046 static unsigned char const *
3047 skip_remains_mb (struct dfa *d, unsigned char const *p,
3048 unsigned char const *mbp, char const *end)
3050 if (d->syntax.never_trail[*p])
3051 return p;
3052 while (mbp < p)
3054 wint_t wc;
3055 mbp += mbs_to_wchar (&wc, (char const *) mbp,
3056 end - (char const *) mbp, d);
3058 return mbp;
3061 /* Search through a buffer looking for a match to the struct dfa *D.
3062 Find the first occurrence of a string matching the regexp in the
3063 buffer, and the shortest possible version thereof. Return a pointer to
3064 the first character after the match, or NULL if none is found. BEGIN
3065 points to the beginning of the buffer, and END points to the first byte
3066 after its end. Note however that we store a sentinel byte (usually
3067 newline) in *END, so the actual buffer must be one byte longer.
3068 When ALLOW_NL, newlines may appear in the matching string.
3069 If COUNT is non-NULL, increment *COUNT once for each newline processed.
3070 If MULTIBYTE, the input consists of multibyte characters and/or
3071 encoding-error bytes. Otherwise, it consists of single-byte characters.
3072 Here is the list of features that make this DFA matcher punt:
3073 - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3074 - [^...] in non-simple locale
3075 - [[=foo=]] or [[.foo.]]
3076 - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3077 - back-reference: (.)\1
3078 - word-delimiter in multibyte locale: \<, \>, \b, \B
3079 See using_simple_locale for the definition of "simple locale". */
3081 static inline char *
3082 dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
3083 size_t *count, bool multibyte)
3085 if (MAX_TRCOUNT <= d->sindex)
3087 for (state_num s = d->min_trcount; s < d->sindex; s++)
3089 free (d->states[s].elems.elems);
3090 free (d->states[s].mbps.elems);
3092 d->sindex = d->min_trcount;
3094 if (d->trans)
3096 for (state_num s = 0; s < d->tralloc; s++)
3098 free (d->trans[s]);
3099 free (d->fails[s]);
3100 d->trans[s] = d->fails[s] = NULL;
3102 d->trcount = 0;
3105 if (d->localeinfo.multibyte && d->mb_trans)
3107 for (state_num s = -1; s < d->tralloc; s++)
3109 free (d->mb_trans[s]);
3110 d->mb_trans[s] = NULL;
3112 for (state_num s = 0; s < d->min_trcount; s++)
3113 d->states[s].mb_trindex = -1;
3114 d->mb_trcount = 0;
3118 if (!d->tralloc)
3119 realloc_trans_if_necessary (d);
3121 /* Current state. */
3122 state_num s = 0, s1 = 0;
3124 /* Current input character. */
3125 unsigned char const *p = (unsigned char const *) begin;
3126 unsigned char const *mbp = p;
3128 /* Copy of d->trans so it can be optimized into a register. */
3129 state_num **trans = d->trans;
3130 unsigned char eol = d->syntax.eolbyte; /* Likewise for eolbyte. */
3131 unsigned char saved_end = *(unsigned char *) end;
3132 *end = eol;
3134 if (multibyte)
3136 memset (&d->mbs, 0, sizeof d->mbs);
3137 if (d->mb_follows.alloc == 0)
3138 alloc_position_set (&d->mb_follows, d->nleaves);
3141 size_t nlcount = 0;
3142 for (;;)
3144 state_num *t;
3145 while ((t = trans[s]) != NULL)
3147 if (s < d->min_trcount)
3149 if (!multibyte || d->states[s].mbps.nelem == 0)
3151 while (t[*p] == s)
3152 p++;
3154 if (multibyte)
3155 p = mbp = skip_remains_mb (d, p, mbp, end);
3158 if (multibyte)
3160 s1 = s;
3162 if (d->states[s].mbps.nelem == 0
3163 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3165 /* If an input character does not match ANYCHAR, do it
3166 like a single-byte character. */
3167 s = t[*p++];
3169 else
3171 s = transit_state (d, s, &p, (unsigned char *) end);
3172 mbp = p;
3173 trans = d->trans;
3176 else
3178 s1 = t[*p++];
3179 t = trans[s1];
3180 if (! t)
3182 state_num tmp = s;
3183 s = s1;
3184 s1 = tmp; /* swap */
3185 break;
3187 if (s < d->min_trcount)
3189 while (t[*p] == s1)
3190 p++;
3192 s = t[*p++];
3196 if (s < 0)
3198 if (s == -2)
3200 s = build_state (s1, d, p[-1]);
3201 trans = d->trans;
3203 else if ((char *) p <= end && p[-1] == eol && 0 <= d->newlines[s1])
3205 /* The previous character was a newline. Count it, and skip
3206 checking of multibyte character boundary until here. */
3207 nlcount++;
3208 mbp = p;
3210 s = (allow_nl ? d->newlines[s1]
3211 : d->syntax.sbit[eol] == CTX_NEWLINE ? 0
3212 : d->syntax.sbit[eol] == CTX_LETTER ? d->min_trcount - 1
3213 : d->initstate_notbol);
3215 else
3217 p = NULL;
3218 goto done;
3221 else if (d->fails[s])
3223 if ((d->success[s] & d->syntax.sbit[*p])
3224 || ((char *) p == end
3225 && accepts_in_context (d->states[s].context, CTX_NEWLINE, s,
3226 d)))
3227 goto done;
3229 if (multibyte && s < d->min_trcount)
3230 p = mbp = skip_remains_mb (d, p, mbp, end);
3232 s1 = s;
3233 if (!multibyte || d->states[s].mbps.nelem == 0
3234 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
3236 /* If a input character does not match ANYCHAR, do it
3237 like a single-byte character. */
3238 s = d->fails[s][*p++];
3240 else
3242 s = transit_state (d, s, &p, (unsigned char *) end);
3243 mbp = p;
3244 trans = d->trans;
3247 else
3249 build_state (s, d, p[0]);
3250 trans = d->trans;
3254 done:
3255 if (count)
3256 *count += nlcount;
3257 *end = saved_end;
3258 return (char *) p;
3261 /* Specialized versions of dfaexec for multibyte and single-byte cases.
3262 This is for performance, as dfaexec_main is an inline function. */
3264 static char *
3265 dfaexec_mb (struct dfa *d, char const *begin, char *end,
3266 bool allow_nl, size_t *count, bool *backref)
3268 return dfaexec_main (d, begin, end, allow_nl, count, true);
3271 static char *
3272 dfaexec_sb (struct dfa *d, char const *begin, char *end,
3273 bool allow_nl, size_t *count, bool *backref)
3275 return dfaexec_main (d, begin, end, allow_nl, count, false);
3278 /* Always set *BACKREF and return BEGIN. Use this wrapper for
3279 any regexp that uses a construct not supported by this code. */
3280 static char *
3281 dfaexec_noop (struct dfa *d, char const *begin, char *end,
3282 bool allow_nl, size_t *count, bool *backref)
3284 *backref = true;
3285 return (char *) begin;
3288 /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3289 but faster and set *BACKREF if the DFA code does not support this
3290 regexp usage. */
3292 char *
3293 dfaexec (struct dfa *d, char const *begin, char *end,
3294 bool allow_nl, size_t *count, bool *backref)
3296 return d->dfaexec (d, begin, end, allow_nl, count, backref);
3299 struct dfa *
3300 dfasuperset (struct dfa const *d)
3302 return d->superset;
3305 bool
3306 dfaisfast (struct dfa const *d)
3308 return d->fast;
3311 static void
3312 free_mbdata (struct dfa *d)
3314 free (d->multibyte_prop);
3315 free (d->lex.brack.chars);
3316 free (d->mb_follows.elems);
3318 if (d->mb_trans)
3320 state_num s;
3321 for (s = -1; s < d->tralloc; s++)
3322 free (d->mb_trans[s]);
3323 free (d->mb_trans - 2);
3327 /* Return true if every construct in D is supported by this DFA matcher. */
3328 static bool _GL_ATTRIBUTE_PURE
3329 dfa_supported (struct dfa const *d)
3331 for (size_t i = 0; i < d->tindex; i++)
3333 switch (d->tokens[i])
3335 case BEGWORD:
3336 case ENDWORD:
3337 case LIMWORD:
3338 case NOTLIMWORD:
3339 if (!d->localeinfo.multibyte)
3340 continue;
3341 FALLTHROUGH;
3342 case BACKREF:
3343 case MBCSET:
3344 return false;
3347 return true;
3350 static void
3351 dfaoptimize (struct dfa *d)
3353 if (!d->localeinfo.using_utf8)
3354 return;
3356 bool have_backref = false;
3357 for (size_t i = 0; i < d->tindex; ++i)
3359 switch (d->tokens[i])
3361 case ANYCHAR:
3362 /* Lowered. */
3363 abort ();
3364 case BACKREF:
3365 have_backref = true;
3366 break;
3367 case MBCSET:
3368 /* Requires multi-byte algorithm. */
3369 return;
3370 default:
3371 break;
3375 if (!have_backref && d->superset)
3377 /* The superset DFA is not likely to be much faster, so remove it. */
3378 dfafree (d->superset);
3379 free (d->superset);
3380 d->superset = NULL;
3383 free_mbdata (d);
3384 d->localeinfo.multibyte = false;
3385 d->dfaexec = dfaexec_sb;
3386 d->fast = true;
3389 static void
3390 dfassbuild (struct dfa *d)
3392 struct dfa *sup = dfaalloc ();
3394 *sup = *d;
3395 sup->localeinfo.multibyte = false;
3396 sup->dfaexec = dfaexec_sb;
3397 sup->multibyte_prop = NULL;
3398 sup->superset = NULL;
3399 sup->states = NULL;
3400 sup->sindex = 0;
3401 sup->follows = NULL;
3402 sup->tralloc = 0;
3403 sup->trans = NULL;
3404 sup->fails = NULL;
3405 sup->success = NULL;
3406 sup->newlines = NULL;
3408 sup->charclasses = xnmalloc (sup->calloc, sizeof *sup->charclasses);
3409 if (d->cindex)
3411 memcpy (sup->charclasses, d->charclasses,
3412 d->cindex * sizeof *sup->charclasses);
3415 sup->tokens = xnmalloc (d->tindex, 2 * sizeof *sup->tokens);
3416 sup->talloc = d->tindex * 2;
3418 bool have_achar = false;
3419 bool have_nchar = false;
3420 size_t j;
3421 for (size_t i = j = 0; i < d->tindex; i++)
3423 switch (d->tokens[i])
3425 case ANYCHAR:
3426 case MBCSET:
3427 case BACKREF:
3429 charclass ccl;
3430 fillset (&ccl);
3431 sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
3432 sup->tokens[j++] = STAR;
3433 if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
3434 || d->tokens[i + 1] == PLUS)
3435 i++;
3436 have_achar = true;
3438 break;
3439 case BEGWORD:
3440 case ENDWORD:
3441 case LIMWORD:
3442 case NOTLIMWORD:
3443 if (d->localeinfo.multibyte)
3445 /* These constraints aren't supported in a multibyte locale.
3446 Ignore them in the superset DFA. */
3447 sup->tokens[j++] = EMPTY;
3448 break;
3450 FALLTHROUGH;
3451 default:
3452 sup->tokens[j++] = d->tokens[i];
3453 if ((0 <= d->tokens[i] && d->tokens[i] < NOTCHAR)
3454 || d->tokens[i] >= CSET)
3455 have_nchar = true;
3456 break;
3459 sup->tindex = j;
3461 if (have_nchar && (have_achar || d->localeinfo.multibyte))
3462 d->superset = sup;
3463 else
3465 dfafree (sup);
3466 free (sup);
3470 /* Parse and analyze a single string of the given length. */
3471 void
3472 dfacomp (char const *s, size_t len, struct dfa *d, bool searchflag)
3474 dfaparse (s, len, d);
3475 dfassbuild (d);
3477 if (dfa_supported (d))
3479 dfaoptimize (d);
3480 dfaanalyze (d, searchflag);
3482 else
3484 d->dfaexec = dfaexec_noop;
3487 if (d->superset)
3489 d->fast = true;
3490 dfaanalyze (d->superset, searchflag);
3494 /* Free the storage held by the components of a dfa. */
3495 void
3496 dfafree (struct dfa *d)
3498 free (d->charclasses);
3499 free (d->tokens);
3501 if (d->localeinfo.multibyte)
3502 free_mbdata (d);
3504 for (size_t i = 0; i < d->sindex; ++i)
3506 free (d->states[i].elems.elems);
3507 free (d->states[i].mbps.elems);
3509 free (d->states);
3511 if (d->follows)
3513 for (size_t i = 0; i < d->tindex; ++i)
3514 free (d->follows[i].elems);
3515 free (d->follows);
3518 if (d->trans)
3520 for (size_t i = 0; i < d->tralloc; ++i)
3522 free (d->trans[i]);
3523 free (d->fails[i]);
3526 free (d->trans - 2);
3527 free (d->fails);
3528 free (d->newlines);
3529 free (d->success);
3532 if (d->superset)
3533 dfafree (d->superset);
3536 /* Having found the postfix representation of the regular expression,
3537 try to find a long sequence of characters that must appear in any line
3538 containing the r.e.
3539 Finding a "longest" sequence is beyond the scope here;
3540 we take an easy way out and hope for the best.
3541 (Take "(ab|a)b"--please.)
3543 We do a bottom-up calculation of sequences of characters that must appear
3544 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3545 representation:
3546 sequences that must appear at the left of the match ("left")
3547 sequences that must appear at the right of the match ("right")
3548 lists of sequences that must appear somewhere in the match ("in")
3549 sequences that must constitute the match ("is")
3551 When we get to the root of the tree, we use one of the longest of its
3552 calculated "in" sequences as our answer.
3554 The sequences calculated for the various types of node (in pseudo ANSI c)
3555 are shown below. "p" is the operand of unary operators (and the left-hand
3556 operand of binary operators); "q" is the right-hand operand of binary
3557 operators.
3559 "ZERO" means "a zero-length sequence" below.
3561 Type left right is in
3562 ---- ---- ----- -- --
3563 char c # c # c # c # c
3565 ANYCHAR ZERO ZERO ZERO ZERO
3567 MBCSET ZERO ZERO ZERO ZERO
3569 CSET ZERO ZERO ZERO ZERO
3571 STAR ZERO ZERO ZERO ZERO
3573 QMARK ZERO ZERO ZERO ZERO
3575 PLUS p->left p->right ZERO p->in
3577 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3578 p->left : q->right : q->is!=ZERO) ? q->in plus
3579 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3580 ZERO
3582 OR longest common longest common (do p->is and substrings common
3583 leading trailing to q->is have same p->in and
3584 (sub)sequence (sub)sequence q->in length and content) ?
3585 of p->left of p->right
3586 and q->left and q->right p->is : NULL
3588 If there's anything else we recognize in the tree, all four sequences get set
3589 to zero-length sequences. If there's something we don't recognize in the
3590 tree, we just return a zero-length sequence.
3592 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3593 'aaa')?
3595 And ... is it here or someplace that we might ponder "optimizations" such as
3596 egrep 'psi|epsilon' -> egrep 'psi'
3597 egrep 'pepsi|epsilon' -> egrep 'epsi'
3598 (Yes, we now find "epsi" as a "string
3599 that must occur", but we might also
3600 simplify the *entire* r.e. being sought)
3601 grep '[c]' -> grep 'c'
3602 grep '(ab|a)b' -> grep 'ab'
3603 grep 'ab*' -> grep 'a'
3604 grep 'a*b' -> grep 'b'
3606 There are several issues:
3608 Is optimization easy (enough)?
3610 Does optimization actually accomplish anything,
3611 or is the automaton you get from "psi|epsilon" (for example)
3612 the same as the one you get from "psi" (for example)?
3614 Are optimizable r.e.'s likely to be used in real-life situations
3615 (something like 'ab*' is probably unlikely; something like is
3616 'psi|epsilon' is likelier)? */
3618 static char *
3619 icatalloc (char *old, char const *new)
3621 size_t newsize = strlen (new);
3622 if (newsize == 0)
3623 return old;
3624 size_t oldsize = strlen (old);
3625 char *result = xrealloc (old, oldsize + newsize + 1);
3626 memcpy (result + oldsize, new, newsize + 1);
3627 return result;
3630 static void
3631 freelist (char **cpp)
3633 while (*cpp)
3634 free (*cpp++);
3637 static char **
3638 enlist (char **cpp, char *new, size_t len)
3640 new = memcpy (xmalloc (len + 1), new, len);
3641 new[len] = '\0';
3642 /* Is there already something in the list that's new (or longer)? */
3643 size_t i;
3644 for (i = 0; cpp[i] != NULL; ++i)
3645 if (strstr (cpp[i], new) != NULL)
3647 free (new);
3648 return cpp;
3650 /* Eliminate any obsoleted strings. */
3651 for (size_t j = 0; cpp[j] != NULL; )
3652 if (strstr (new, cpp[j]) == NULL)
3653 ++j;
3654 else
3656 free (cpp[j]);
3657 if (--i == j)
3658 break;
3659 cpp[j] = cpp[i];
3660 cpp[i] = NULL;
3662 /* Add the new string. */
3663 cpp = xnrealloc (cpp, i + 2, sizeof *cpp);
3664 cpp[i] = new;
3665 cpp[i + 1] = NULL;
3666 return cpp;
3669 /* Given pointers to two strings, return a pointer to an allocated
3670 list of their distinct common substrings. */
3671 static char **
3672 comsubs (char *left, char const *right)
3674 char **cpp = xzalloc (sizeof *cpp);
3676 for (char *lcp = left; *lcp != '\0'; lcp++)
3678 size_t len = 0;
3679 char *rcp = strchr (right, *lcp);
3680 while (rcp != NULL)
3682 size_t i;
3683 for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
3684 continue;
3685 if (i > len)
3686 len = i;
3687 rcp = strchr (rcp + 1, *lcp);
3689 if (len != 0)
3690 cpp = enlist (cpp, lcp, len);
3692 return cpp;
3695 static char **
3696 addlists (char **old, char **new)
3698 for (; *new; new++)
3699 old = enlist (old, *new, strlen (*new));
3700 return old;
3703 /* Given two lists of substrings, return a new list giving substrings
3704 common to both. */
3705 static char **
3706 inboth (char **left, char **right)
3708 char **both = xzalloc (sizeof *both);
3710 for (size_t lnum = 0; left[lnum] != NULL; ++lnum)
3712 for (size_t rnum = 0; right[rnum] != NULL; ++rnum)
3714 char **temp = comsubs (left[lnum], right[rnum]);
3715 both = addlists (both, temp);
3716 freelist (temp);
3717 free (temp);
3720 return both;
3723 typedef struct must must;
3725 struct must
3727 char **in;
3728 char *left;
3729 char *right;
3730 char *is;
3731 bool begline;
3732 bool endline;
3733 must *prev;
3736 static must *
3737 allocmust (must *mp, size_t size)
3739 must *new_mp = xmalloc (sizeof *new_mp);
3740 new_mp->in = xzalloc (sizeof *new_mp->in);
3741 new_mp->left = xzalloc (size);
3742 new_mp->right = xzalloc (size);
3743 new_mp->is = xzalloc (size);
3744 new_mp->begline = false;
3745 new_mp->endline = false;
3746 new_mp->prev = mp;
3747 return new_mp;
3750 static void
3751 resetmust (must *mp)
3753 freelist (mp->in);
3754 mp->in[0] = NULL;
3755 mp->left[0] = mp->right[0] = mp->is[0] = '\0';
3756 mp->begline = false;
3757 mp->endline = false;
3760 static void
3761 freemust (must *mp)
3763 freelist (mp->in);
3764 free (mp->in);
3765 free (mp->left);
3766 free (mp->right);
3767 free (mp->is);
3768 free (mp);
3771 struct dfamust *
3772 dfamust (struct dfa const *d)
3774 must *mp = NULL;
3775 char const *result = "";
3776 bool exact = false;
3777 bool begline = false;
3778 bool endline = false;
3779 bool need_begline = false;
3780 bool need_endline = false;
3781 bool case_fold_unibyte = d->syntax.case_fold && MB_CUR_MAX == 1;
3783 for (size_t ri = 0; ri < d->tindex; ++ri)
3785 token t = d->tokens[ri];
3786 switch (t)
3788 case BEGLINE:
3789 mp = allocmust (mp, 2);
3790 mp->begline = true;
3791 need_begline = true;
3792 break;
3793 case ENDLINE:
3794 mp = allocmust (mp, 2);
3795 mp->endline = true;
3796 need_endline = true;
3797 break;
3798 case LPAREN:
3799 case RPAREN:
3800 assert (!"neither LPAREN nor RPAREN may appear here");
3802 case EMPTY:
3803 case BEGWORD:
3804 case ENDWORD:
3805 case LIMWORD:
3806 case NOTLIMWORD:
3807 case BACKREF:
3808 case ANYCHAR:
3809 case MBCSET:
3810 mp = allocmust (mp, 2);
3811 break;
3813 case STAR:
3814 case QMARK:
3815 resetmust (mp);
3816 break;
3818 case OR:
3820 char **new;
3821 must *rmp = mp;
3822 must *lmp = mp = mp->prev;
3823 size_t j, ln, rn, n;
3825 /* Guaranteed to be. Unlikely, but ... */
3826 if (streq (lmp->is, rmp->is))
3828 lmp->begline &= rmp->begline;
3829 lmp->endline &= rmp->endline;
3831 else
3833 lmp->is[0] = '\0';
3834 lmp->begline = false;
3835 lmp->endline = false;
3837 /* Left side--easy */
3838 size_t i = 0;
3839 while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i])
3840 ++i;
3841 lmp->left[i] = '\0';
3842 /* Right side */
3843 ln = strlen (lmp->right);
3844 rn = strlen (rmp->right);
3845 n = ln;
3846 if (n > rn)
3847 n = rn;
3848 for (i = 0; i < n; ++i)
3849 if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1])
3850 break;
3851 for (j = 0; j < i; ++j)
3852 lmp->right[j] = lmp->right[(ln - i) + j];
3853 lmp->right[j] = '\0';
3854 new = inboth (lmp->in, rmp->in);
3855 freelist (lmp->in);
3856 free (lmp->in);
3857 lmp->in = new;
3858 freemust (rmp);
3860 break;
3862 case PLUS:
3863 mp->is[0] = '\0';
3864 break;
3866 case END:
3867 assert (!mp->prev);
3868 for (size_t i = 0; mp->in[i] != NULL; ++i)
3869 if (strlen (mp->in[i]) > strlen (result))
3870 result = mp->in[i];
3871 if (streq (result, mp->is))
3873 if ((!need_begline || mp->begline) && (!need_endline
3874 || mp->endline))
3875 exact = true;
3876 begline = mp->begline;
3877 endline = mp->endline;
3879 goto done;
3881 case CAT:
3883 must *rmp = mp;
3884 must *lmp = mp = mp->prev;
3886 /* In. Everything in left, plus everything in
3887 right, plus concatenation of
3888 left's right and right's left. */
3889 lmp->in = addlists (lmp->in, rmp->in);
3890 if (lmp->right[0] != '\0' && rmp->left[0] != '\0')
3892 size_t lrlen = strlen (lmp->right);
3893 size_t rllen = strlen (rmp->left);
3894 char *tp = xmalloc (lrlen + rllen);
3895 memcpy (tp, lmp->right, lrlen);
3896 memcpy (tp + lrlen, rmp->left, rllen);
3897 lmp->in = enlist (lmp->in, tp, lrlen + rllen);
3898 free (tp);
3900 /* Left-hand */
3901 if (lmp->is[0] != '\0')
3902 lmp->left = icatalloc (lmp->left, rmp->left);
3903 /* Right-hand */
3904 if (rmp->is[0] == '\0')
3905 lmp->right[0] = '\0';
3906 lmp->right = icatalloc (lmp->right, rmp->right);
3907 /* Guaranteed to be */
3908 if ((lmp->is[0] != '\0' || lmp->begline)
3909 && (rmp->is[0] != '\0' || rmp->endline))
3911 lmp->is = icatalloc (lmp->is, rmp->is);
3912 lmp->endline = rmp->endline;
3914 else
3916 lmp->is[0] = '\0';
3917 lmp->begline = false;
3918 lmp->endline = false;
3920 freemust (rmp);
3922 break;
3924 case '\0':
3925 /* Not on *my* shift. */
3926 goto done;
3928 default:
3929 if (CSET <= t)
3931 /* If T is a singleton, or if case-folding in a unibyte
3932 locale and T's members all case-fold to the same char,
3933 convert T to one of its members. Otherwise, do
3934 nothing further with T. */
3935 charclass *ccl = &d->charclasses[t - CSET];
3936 int j;
3937 for (j = 0; j < NOTCHAR; j++)
3938 if (tstbit (j, ccl))
3939 break;
3940 if (! (j < NOTCHAR))
3942 mp = allocmust (mp, 2);
3943 break;
3945 t = j;
3946 while (++j < NOTCHAR)
3947 if (tstbit (j, ccl)
3948 && ! (case_fold_unibyte
3949 && toupper (j) == toupper (t)))
3950 break;
3951 if (j < NOTCHAR)
3953 mp = allocmust (mp, 2);
3954 break;
3958 size_t rj = ri + 2;
3959 if (d->tokens[ri + 1] == CAT)
3961 for (; rj < d->tindex - 1; rj += 2)
3963 if ((rj != ri && (d->tokens[rj] <= 0
3964 || NOTCHAR <= d->tokens[rj]))
3965 || d->tokens[rj + 1] != CAT)
3966 break;
3969 mp = allocmust (mp, ((rj - ri) >> 1) + 1);
3970 mp->is[0] = mp->left[0] = mp->right[0]
3971 = case_fold_unibyte ? toupper (t) : t;
3973 size_t i;
3974 for (i = 1; ri + 2 < rj; i++)
3976 ri += 2;
3977 t = d->tokens[ri];
3978 mp->is[i] = mp->left[i] = mp->right[i]
3979 = case_fold_unibyte ? toupper (t) : t;
3981 mp->is[i] = mp->left[i] = mp->right[i] = '\0';
3982 mp->in = enlist (mp->in, mp->is, i);
3983 break;
3986 done:;
3988 struct dfamust *dm = NULL;
3989 if (*result)
3991 dm = xmalloc (sizeof *dm);
3992 dm->exact = exact;
3993 dm->begline = begline;
3994 dm->endline = endline;
3995 dm->must = xstrdup (result);
3998 while (mp)
4000 must *prev = mp->prev;
4001 freemust (mp);
4002 mp = prev;
4005 return dm;
4008 void
4009 dfamustfree (struct dfamust *dm)
4011 free (dm->must);
4012 free (dm);
4015 struct dfa *
4016 dfaalloc (void)
4018 return xmalloc (sizeof (struct dfa));
4021 /* Initialize DFA. */
4022 void
4023 dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
4024 reg_syntax_t bits, int dfaopts)
4026 memset (dfa, 0, offsetof (struct dfa, dfaexec));
4027 dfa->dfaexec = linfo->multibyte ? dfaexec_mb : dfaexec_sb;
4028 dfa->simple_locale = using_simple_locale (linfo->multibyte);
4029 dfa->localeinfo = *linfo;
4031 dfa->fast = !dfa->localeinfo.multibyte;
4033 dfa->canychar = -1;
4034 dfa->lex.cur_mb_len = 1;
4035 dfa->syntax.syntax_bits_set = true;
4036 dfa->syntax.case_fold = (bits & RE_ICASE) != 0;
4037 dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
4038 dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
4039 dfa->syntax.syntax_bits = bits;
4041 for (int i = CHAR_MIN; i <= CHAR_MAX; ++i)
4043 unsigned char uc = i;
4045 dfa->syntax.sbit[uc] = char_context (dfa, uc);
4046 switch (dfa->syntax.sbit[uc])
4048 case CTX_LETTER:
4049 setbit (uc, &dfa->syntax.letters);
4050 break;
4051 case CTX_NEWLINE:
4052 setbit (uc, &dfa->syntax.newline);
4053 break;
4056 /* POSIX requires that the five bytes in "\n\r./" (including the
4057 terminating NUL) cannot occur inside a multibyte character. */
4058 dfa->syntax.never_trail[uc] = (dfa->localeinfo.using_utf8
4059 ? (uc & 0xc0) != 0x80
4060 : strchr ("\n\r./", uc) != NULL);
4064 /* vim:set shiftwidth=2: */