1 /* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2018 Free Software
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
18 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
20 /* Written June, 1988 by Mike Haertel
21 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
37 streq (char const *a
, char const *b
)
39 return strcmp (a
, b
) == 0;
45 return '0' <= c
&& c
<= '9';
49 #define _(str) gettext (str)
55 #include "localeinfo.h"
59 # define FALLTHROUGH ((void) 0)
61 # define FALLTHROUGH __attribute__ ((__fallthrough__))
66 # define MIN(a,b) ((a) < (b) ? (a) : (b))
69 /* HPUX defines these as macros in sys/param.h. */
77 /* First integer value that is greater than any character code. */
78 enum { NOTCHAR
= 1 << CHAR_BIT
};
80 /* This represents part of a character class. It must be unsigned and
81 at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
82 typedef unsigned long int charclass_word
;
84 /* CHARCLASS_WORD_BITS is the number of bits used in a charclass word.
85 CHARCLASS_PAIR (LO, HI) is part of a charclass initializer, and
86 represents 64 bits' worth of a charclass, where LO and HI are the
87 low and high-order 32 bits of the 64-bit quantity. */
88 #if ULONG_MAX >> 31 >> 31 < 3
89 enum { CHARCLASS_WORD_BITS
= 32 };
90 # define CHARCLASS_PAIR(lo, hi) lo, hi
92 enum { CHARCLASS_WORD_BITS
= 64 };
93 # define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
96 /* An initializer for a charclass whose 32-bit words are A through H. */
97 #define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \
99 CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \
100 CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h) \
103 /* The maximum useful value of a charclass_word; all used bits are 1. */
104 static charclass_word
const CHARCLASS_WORD_MASK
105 = ((charclass_word
) 1 << (CHARCLASS_WORD_BITS
- 1) << 1) - 1;
107 /* Number of words required to hold a bit for every character. */
110 CHARCLASS_WORDS
= (NOTCHAR
+ CHARCLASS_WORD_BITS
- 1) / CHARCLASS_WORD_BITS
113 /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
114 typedef struct { charclass_word w
[CHARCLASS_WORDS
]; } charclass
;
116 /* Convert a possibly-signed character to an unsigned character. This is
117 a bit safer than casting to unsigned char, since it catches some type
118 errors that the cast doesn't. */
125 /* Contexts tell us whether a character is a newline or a word constituent.
126 Word-constituent characters are those that satisfy iswalnum, plus '_'.
127 Each character has a single CTX_* value; bitmasks of CTX_* values denote
128 a particular character class.
130 A state also stores a context value, which is a bitmask of CTX_* values.
131 A state's context represents a set of characters that the state's
132 predecessors must match. For example, a state whose context does not
133 include CTX_LETTER will never have transitions where the previous
134 character is a word constituent. A state whose context is CTX_ANY
135 might have transitions from any character. */
145 /* Sometimes characters can only be matched depending on the surrounding
146 context. Such context decisions depend on what the previous character
147 was, and the value of the current (lookahead) character. Context
148 dependent constraints are encoded as 9-bit integers. Each bit that
149 is set indicates that the constraint succeeds in the corresponding
152 bit 6-8 - valid contexts when next character is CTX_NEWLINE
153 bit 3-5 - valid contexts when next character is CTX_LETTER
154 bit 0-2 - valid contexts when next character is CTX_NONE
156 succeeds_in_context determines whether a given constraint
157 succeeds in a particular context. Prev is a bitmask of possible
158 context values for the previous character, curr is the (single-bit)
159 context value for the lookahead character. */
161 newline_constraint (int constraint
)
163 return (constraint
>> 6) & 7;
166 letter_constraint (int constraint
)
168 return (constraint
>> 3) & 7;
171 other_constraint (int constraint
)
173 return constraint
& 7;
177 succeeds_in_context (int constraint
, int prev
, int curr
)
179 return !! (((curr
& CTX_NONE
? other_constraint (constraint
) : 0) \
180 | (curr
& CTX_LETTER
? letter_constraint (constraint
) : 0) \
181 | (curr
& CTX_NEWLINE
? newline_constraint (constraint
) : 0)) \
185 /* The following describe what a constraint depends on. */
187 prev_newline_dependent (int constraint
)
189 return ((constraint
^ constraint
>> 2) & 0111) != 0;
192 prev_letter_dependent (int constraint
)
194 return ((constraint
^ constraint
>> 1) & 0111) != 0;
197 /* Tokens that match the empty string subject to some constraint actually
198 work by applying that constraint to determine what may follow them,
199 taking into account what has gone before. The following values are
200 the constraints corresponding to the special tokens previously defined. */
203 NO_CONSTRAINT
= 0777,
204 BEGLINE_CONSTRAINT
= 0444,
205 ENDLINE_CONSTRAINT
= 0700,
206 BEGWORD_CONSTRAINT
= 0050,
207 ENDWORD_CONSTRAINT
= 0202,
208 LIMWORD_CONSTRAINT
= 0252,
209 NOTLIMWORD_CONSTRAINT
= 0525
212 /* The regexp is parsed into an array of tokens in postfix form. Some tokens
213 are operators and others are terminal symbols. Most (but not all) of these
214 codes are returned by the lexical analyzer. */
216 typedef ptrdiff_t token
;
217 static ptrdiff_t const TOKEN_MAX
= PTRDIFF_MAX
;
219 /* States are indexed by state_num values. These are normally
220 nonnegative but -1 is used as a special value. */
221 typedef ptrdiff_t state_num
;
223 /* Predefined token values. */
226 END
= -1, /* END is a terminal symbol that matches the
227 end of input; any value of END or less in
228 the parse tree is such a symbol. Accepting
229 states of the DFA are those that would have
230 a transition on END. */
232 /* Ordinary character values are terminal symbols that match themselves. */
234 EMPTY
= NOTCHAR
, /* EMPTY is a terminal symbol that matches
237 BACKREF
, /* BACKREF is generated by \<digit>
238 or by any other construct that
239 is not completely handled. If the scanner
240 detects a transition on backref, it returns
241 a kind of "semi-success" indicating that
242 the match will have to be verified with
243 a backtracking matcher. */
245 BEGLINE
, /* BEGLINE is a terminal symbol that matches
246 the empty string at the beginning of a
249 ENDLINE
, /* ENDLINE is a terminal symbol that matches
250 the empty string at the end of a line. */
252 BEGWORD
, /* BEGWORD is a terminal symbol that matches
253 the empty string at the beginning of a
256 ENDWORD
, /* ENDWORD is a terminal symbol that matches
257 the empty string at the end of a word. */
259 LIMWORD
, /* LIMWORD is a terminal symbol that matches
260 the empty string at the beginning or the
263 NOTLIMWORD
, /* NOTLIMWORD is a terminal symbol that
264 matches the empty string not at
265 the beginning or end of a word. */
267 QMARK
, /* QMARK is an operator of one argument that
268 matches zero or one occurrences of its
271 STAR
, /* STAR is an operator of one argument that
272 matches the Kleene closure (zero or more
273 occurrences) of its argument. */
275 PLUS
, /* PLUS is an operator of one argument that
276 matches the positive closure (one or more
277 occurrences) of its argument. */
279 REPMN
, /* REPMN is a lexical token corresponding
280 to the {m,n} construct. REPMN never
281 appears in the compiled token vector. */
283 CAT
, /* CAT is an operator of two arguments that
284 matches the concatenation of its
285 arguments. CAT is never returned by the
288 OR
, /* OR is an operator of two arguments that
289 matches either of its arguments. */
291 LPAREN
, /* LPAREN never appears in the parse tree,
292 it is only a lexeme. */
294 RPAREN
, /* RPAREN never appears in the parse tree. */
296 ANYCHAR
, /* ANYCHAR is a terminal symbol that matches
297 a valid multibyte (or single byte) character.
298 It is used only if MB_CUR_MAX > 1. */
300 MBCSET
, /* MBCSET is similar to CSET, but for
301 multibyte characters. */
303 WCHAR
, /* Only returned by lex. wctok contains
304 the wide character representation. */
306 CSET
/* CSET and (and any value greater) is a
307 terminal symbol that matches any of a
308 class of characters. */
312 /* States of the recognizer correspond to sets of positions in the parse
313 tree, together with the constraints under which they may be matched.
314 So a position is encoded as an index into the parse tree together with
318 size_t index
; /* Index into the parse array. */
319 unsigned int constraint
; /* Constraint for matching this position. */
322 /* Sets of positions are stored as arrays. */
325 position
*elems
; /* Elements of this position set. */
326 ptrdiff_t nelem
; /* Number of elements in this set. */
327 ptrdiff_t alloc
; /* Number of elements allocated in ELEMS. */
330 /* Sets of leaves are also stored as arrays. */
333 size_t *elems
; /* Elements of this position set. */
334 size_t nelem
; /* Number of elements in this set. */
337 /* A state of the dfa consists of a set of positions, some flags,
338 and the token value of the lowest-numbered position of the state that
339 contains an END token. */
342 size_t hash
; /* Hash of the positions of this state. */
343 position_set elems
; /* Positions this state could match. */
344 unsigned char context
; /* Context from previous state. */
345 unsigned short constraint
; /* Constraint for this state to accept. */
346 token first_end
; /* Token value of the first END in elems. */
347 position_set mbps
; /* Positions which can match multibyte
348 characters or the follows, e.g., period.
349 Used only if MB_CUR_MAX > 1. */
350 state_num mb_trindex
; /* Index of this state in MB_TRANS, or
351 negative if the state does not have
355 /* Maximum for any transition table count. This should be at least 3,
356 for the initial state setup. */
357 enum { MAX_TRCOUNT
= 1024 };
359 /* A bracket operator.
360 e.g., [a-c], [[:alpha:]], etc. */
361 struct mb_char_classes
365 wchar_t *chars
; /* Normal characters. */
367 ptrdiff_t nchars_alloc
;
372 /* Syntax bits controlling the behavior of the lexical analyzer. */
373 reg_syntax_t syntax_bits
;
374 bool syntax_bits_set
;
376 /* Flag for case-folding letters into sets. */
379 /* True if ^ and $ match only the start and end of data, and do not match
380 end-of-line within data. */
383 /* End-of-line byte in data. */
384 unsigned char eolbyte
;
386 /* Cache of char-context values. */
389 /* If never_trail[B], the byte B cannot be a non-initial byte in a
390 multibyte character. */
391 bool never_trail
[NOTCHAR
];
393 /* Set of characters considered letters. */
396 /* Set of characters that are newline. */
400 /* Lexical analyzer. All the dross that deals with the obnoxious
401 GNU Regex syntax bits is located here. The poor, suffering
402 reader is referred to the GNU Regex documentation for the
403 meaning of the @#%!@#%^!@ syntax bits. */
406 char const *ptr
; /* Pointer to next input character. */
407 size_t left
; /* Number of characters remaining. */
408 token lasttok
; /* Previous token returned; initially END. */
409 size_t parens
; /* Count of outstanding left parens. */
410 int minrep
, maxrep
; /* Repeat counts for {m,n}. */
412 /* Wide character representation of the current multibyte character,
413 or WEOF if there was an encoding error. Used only if
417 /* Length of the multibyte representation of wctok. */
420 /* The most recently analyzed multibyte bracket expression. */
421 struct mb_char_classes brack
;
423 /* We're separated from beginning or (, | only by zero-width characters. */
427 /* Recursive descent parser for regular expressions. */
431 token tok
; /* Lookahead token. */
432 size_t depth
; /* Current depth of a hypothetical stack
433 holding deferred productions. This is
434 used to determine the depth that will be
435 required of the real stack later on in
439 /* A compiled regular expression. */
442 /* Syntax configuration */
443 struct regex_syntax syntax
;
445 /* Fields filled by the scanner. */
446 charclass
*charclasses
; /* Array of character sets for CSET tokens. */
447 ptrdiff_t cindex
; /* Index for adding new charclasses. */
448 ptrdiff_t calloc
; /* Number of charclasses allocated. */
449 size_t canychar
; /* Index of anychar class, or (size_t) -1. */
452 struct lexer_state lex
;
455 struct parser_state parse
;
457 /* Fields filled by the parser. */
458 token
*tokens
; /* Postfix parse array. */
459 size_t tindex
; /* Index for adding new tokens. */
460 size_t talloc
; /* Number of tokens currently allocated. */
461 size_t depth
; /* Depth required of an evaluation stack
462 used for depth-first traversal of the
464 size_t nleaves
; /* Number of leaves on the parse tree. */
465 size_t nregexps
; /* Count of parallel regexps being built
467 bool fast
; /* The DFA is fast. */
468 token utf8_anychar_classes
[5]; /* To lower ANYCHAR in UTF-8 locales. */
469 mbstate_t mbs
; /* Multibyte conversion state. */
471 /* The following are valid only if MB_CUR_MAX > 1. */
473 /* The value of multibyte_prop[i] is defined by following rule.
474 if tokens[i] < NOTCHAR
475 bit 0 : tokens[i] is the first byte of a character, including
476 single-byte characters.
477 bit 1 : tokens[i] is the last byte of a character, including
478 single-byte characters.
482 = 'single_byte_a', 'multi_byte_A', single_byte_b'
483 = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
487 char *multibyte_prop
;
489 /* Fields filled by the superset. */
490 struct dfa
*superset
; /* Hint of the dfa. */
492 /* Fields filled by the state builder. */
493 dfa_state
*states
; /* States of the dfa. */
494 state_num sindex
; /* Index for adding new states. */
495 ptrdiff_t salloc
; /* Number of states currently allocated. */
497 /* Fields filled by the parse tree->NFA conversion. */
498 position_set
*follows
; /* Array of follow sets, indexed by position
499 index. The follow of a position is the set
500 of positions containing characters that
501 could conceivably follow a character
502 matching the given position in a string
503 matching the regexp. Allocated to the
504 maximum possible position index. */
505 bool searchflag
; /* We are supposed to build a searching
506 as opposed to an exact matcher. A searching
507 matcher finds the first and shortest string
508 matching a regexp anywhere in the buffer,
509 whereas an exact matcher finds the longest
510 string matching, but anchored to the
511 beginning of the buffer. */
513 /* Fields filled by dfaexec. */
514 state_num tralloc
; /* Number of transition tables that have
515 slots so far, not counting trans[-1] and
517 int trcount
; /* Number of transition tables that have
518 been built, other than for initial
520 int min_trcount
; /* Number of initial states. Equivalently,
521 the minimum state number for which trcount
522 counts transitions. */
523 state_num
**trans
; /* Transition tables for states that can
524 never accept. If the transitions for a
525 state have not yet been computed, or the
526 state could possibly accept, its entry in
527 this table is NULL. This points to two
528 past the start of the allocated array,
529 and trans[-1] and trans[-2] are always
531 state_num
**fails
; /* Transition tables after failing to accept
532 on a state that potentially could do so.
533 If trans[i] is non-null, fails[i] must
535 char *success
; /* Table of acceptance conditions used in
536 dfaexec and computed in build_state. */
537 state_num
*newlines
; /* Transitions on newlines. The entry for a
538 newline in any transition table is always
539 -1 so we can count lines without wasting
540 too many cycles. The transition for a
541 newline is stored separately and handled
542 as a special case. Newline is also used
543 as a sentinel at the end of the buffer. */
544 state_num initstate_notbol
; /* Initial state for CTX_LETTER and CTX_NONE
545 context in multibyte locales, in which we
546 do not distinguish between their contexts,
547 as not supported word. */
548 position_set mb_follows
; /* Follow set added by ANYCHAR on demand. */
549 state_num
**mb_trans
; /* Transition tables for states with
551 state_num mb_trcount
; /* Number of transition tables for states with
552 ANYCHAR that have actually been built. */
554 /* Information derived from the locale. This is at the end so that
555 a quick memset need not clear it specially. */
557 /* dfaexec implementation. */
558 char *(*dfaexec
) (struct dfa
*, char const *, char *,
559 bool, size_t *, bool *);
561 /* The locale is simple, like the C locale. These locales can be
562 processed more efficiently, as they are single-byte, their native
563 character set is in collating-sequence order, and they do not
564 have multi-character collating elements. */
567 /* Other cached information derived from the locale. */
568 struct localeinfo localeinfo
;
571 /* User access to dfa internals. */
573 /* S could possibly be an accepting state of R. */
575 accepting (state_num s
, struct dfa
const *r
)
577 return r
->states
[s
].constraint
!= 0;
580 /* STATE accepts in the specified context. */
582 accepts_in_context (int prev
, int curr
, state_num state
, struct dfa
const *dfa
)
584 return succeeds_in_context (dfa
->states
[state
].constraint
, prev
, curr
);
587 static void regexp (struct dfa
*dfa
);
589 /* Store into *PWC the result of converting the leading bytes of the
590 multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
591 and updating the conversion state in *D. On conversion error,
592 convert just a single byte, to WEOF. Return the number of bytes
595 This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
597 * PWC points to wint_t, not to wchar_t.
598 * The last arg is a dfa *D instead of merely a multibyte conversion
600 * N must be at least 1.
601 * S[N - 1] must be a sentinel byte.
602 * Shift encodings are not supported.
603 * The return value is always in the range 1..N.
604 * D->mbs is always valid afterwards.
605 * *PWC is always set to something. */
607 mbs_to_wchar (wint_t *pwc
, char const *s
, size_t n
, struct dfa
*d
)
609 unsigned char uc
= s
[0];
610 wint_t wc
= d
->localeinfo
.sbctowc
[uc
];
615 size_t nbytes
= mbrtowc (&wch
, s
, n
, &d
->mbs
);
616 if (0 < nbytes
&& nbytes
< (size_t) -2)
621 memset (&d
->mbs
, 0, sizeof d
->mbs
);
634 fprintf (stderr
, "END");
635 else if (t
< NOTCHAR
)
638 fprintf (stderr
, "0x%02x", ch
);
700 fprintf (stderr
, "%s", s
);
705 /* Stuff pertaining to charclasses. */
708 tstbit (unsigned int b
, charclass
const *c
)
710 return c
->w
[b
/ CHARCLASS_WORD_BITS
] >> b
% CHARCLASS_WORD_BITS
& 1;
714 setbit (unsigned int b
, charclass
*c
)
716 charclass_word one
= 1;
717 c
->w
[b
/ CHARCLASS_WORD_BITS
] |= one
<< b
% CHARCLASS_WORD_BITS
;
721 clrbit (unsigned int b
, charclass
*c
)
723 charclass_word one
= 1;
724 c
->w
[b
/ CHARCLASS_WORD_BITS
] &= ~(one
<< b
% CHARCLASS_WORD_BITS
);
728 zeroset (charclass
*s
)
730 memset (s
, 0, sizeof *s
);
734 fillset (charclass
*s
)
736 for (int i
= 0; i
< CHARCLASS_WORDS
; i
++)
737 s
->w
[i
] = CHARCLASS_WORD_MASK
;
741 notset (charclass
*s
)
743 for (int i
= 0; i
< CHARCLASS_WORDS
; ++i
)
744 s
->w
[i
] = CHARCLASS_WORD_MASK
& ~s
->w
[i
];
748 equal (charclass
const *s1
, charclass
const *s2
)
750 charclass_word w
= 0;
751 for (int i
= 0; i
< CHARCLASS_WORDS
; i
++)
752 w
|= s1
->w
[i
] ^ s2
->w
[i
];
757 emptyset (charclass
const *s
)
759 charclass_word w
= 0;
760 for (int i
= 0; i
< CHARCLASS_WORDS
; i
++)
765 /* Grow PA, which points to an array of *NITEMS items, and return the
766 location of the reallocated array, updating *NITEMS to reflect its
767 new size. The new array will contain at least NITEMS_INCR_MIN more
768 items, but will not contain more than NITEMS_MAX items total.
769 ITEM_SIZE is the size of each item, in bytes.
771 ITEM_SIZE and NITEMS_INCR_MIN must be positive. *NITEMS must be
772 nonnegative. If NITEMS_MAX is -1, it is treated as if it were
775 If PA is null, then allocate a new array instead of reallocating
778 Thus, to grow an array A without saving its old contents, do
779 { free (A); A = xpalloc (NULL, &AITEMS, ...); }. */
782 xpalloc (void *pa
, ptrdiff_t *nitems
, ptrdiff_t nitems_incr_min
,
783 ptrdiff_t nitems_max
, ptrdiff_t item_size
)
785 ptrdiff_t n0
= *nitems
;
787 /* The approximate size to use for initial small allocation
788 requests. This is the largest "small" request for the GNU C
790 enum { DEFAULT_MXFAST
= 64 * sizeof (size_t) / 4 };
792 /* If the array is tiny, grow it to about (but no greater than)
793 DEFAULT_MXFAST bytes. Otherwise, grow it by about 50%.
794 Adjust the growth according to three constraints: NITEMS_INCR_MIN,
795 NITEMS_MAX, and what the C language can represent safely. */
798 if (INT_ADD_WRAPV (n0
, n0
>> 1, &n
))
800 if (0 <= nitems_max
&& nitems_max
< n
)
803 ptrdiff_t adjusted_nbytes
804 = ((INT_MULTIPLY_WRAPV (n
, item_size
, &nbytes
) || SIZE_MAX
< nbytes
)
805 ? MIN (PTRDIFF_MAX
, SIZE_MAX
)
806 : nbytes
< DEFAULT_MXFAST
? DEFAULT_MXFAST
: 0);
809 n
= adjusted_nbytes
/ item_size
;
810 nbytes
= adjusted_nbytes
- adjusted_nbytes
% item_size
;
815 if (n
- n0
< nitems_incr_min
816 && (INT_ADD_WRAPV (n0
, nitems_incr_min
, &n
)
817 || (0 <= nitems_max
&& nitems_max
< n
)
818 || INT_MULTIPLY_WRAPV (n
, item_size
, &nbytes
)))
820 pa
= xrealloc (pa
, nbytes
);
825 /* Ensure that the array addressed by PA holds at least I + 1 items.
826 Either return PA, or reallocate the array and return its new address.
827 Although PA may be null, the returned value is never null.
829 The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
830 is updated on reallocation. If PA is null, *NITEMS must be zero.
831 Do not allocate more than NITEMS_MAX items total; -1 means no limit.
832 ITEM_SIZE is the size of one item; it must be positive.
833 Avoid O(N**2) behavior on arrays growing linearly. */
835 maybe_realloc (void *pa
, ptrdiff_t i
, ptrdiff_t *nitems
,
836 ptrdiff_t nitems_max
, ptrdiff_t item_size
)
840 return xpalloc (pa
, nitems
, 1, nitems_max
, item_size
);
843 /* In DFA D, find the index of charclass S, or allocate a new one. */
845 charclass_index (struct dfa
*d
, charclass
*s
)
849 for (i
= 0; i
< d
->cindex
; ++i
)
850 if (equal (s
, &d
->charclasses
[i
]))
852 d
->charclasses
= maybe_realloc (d
->charclasses
, d
->cindex
, &d
->calloc
,
853 TOKEN_MAX
- CSET
, sizeof *d
->charclasses
);
855 d
->charclasses
[i
] = *s
;
860 unibyte_word_constituent (struct dfa
const *dfa
, unsigned char c
)
862 return dfa
->localeinfo
.sbctowc
[c
] != WEOF
&& (isalnum (c
) || (c
) == '_');
866 char_context (struct dfa
const *dfa
, unsigned char c
)
868 if (c
== dfa
->syntax
.eolbyte
&& !dfa
->syntax
.anchor
)
870 if (unibyte_word_constituent (dfa
, c
))
875 /* Set a bit in the charclass for the given wchar_t. Do nothing if WC
876 is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
877 this may happen when folding case in weird Turkish locales where
878 dotless i/dotted I are not included in the chosen character set.
879 Return whether a bit was set in the charclass. */
881 setbit_wc (wint_t wc
, charclass
*c
)
891 /* Set a bit for B and its case variants in the charclass C.
892 MB_CUR_MAX must be 1. */
894 setbit_case_fold_c (int b
, charclass
*c
)
896 int ub
= toupper (b
);
897 for (int i
= 0; i
< NOTCHAR
; i
++)
898 if (toupper (i
) == ub
)
902 /* Return true if the locale compatible with the C locale. */
905 using_simple_locale (bool multibyte
)
907 /* The native character set is known to be compatible with
908 the C locale. The following test isn't perfect, but it's good
909 enough in practice, as only ASCII and EBCDIC are in common use
910 and this test correctly accepts ASCII and rejects EBCDIC. */
911 enum { native_c_charset
=
912 ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
913 && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
914 && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
915 && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
916 && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
917 && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
918 && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
919 && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
920 && '}' == 125 && '~' == 126)
923 if (!native_c_charset
|| multibyte
)
927 /* Treat C and POSIX locales as being compatible. Also, treat
928 errors as compatible, as these are invariably from stubs. */
929 char const *loc
= setlocale (LC_ALL
, NULL
);
930 return !loc
|| streq (loc
, "C") || streq (loc
, "POSIX");
934 /* Fetch the next lexical input character from the pattern. There
935 must at least one byte of pattern input. Set DFA->lex.wctok to the
936 value of the character or to WEOF depending on whether the input is
937 a valid multibyte character (possibly of length 1). Then return
938 the next input byte value, except return EOF if the input is a
939 multibyte character of length greater than 1. */
941 fetch_wc (struct dfa
*dfa
)
943 size_t nbytes
= mbs_to_wchar (&dfa
->lex
.wctok
, dfa
->lex
.ptr
, dfa
->lex
.left
,
945 dfa
->lex
.cur_mb_len
= nbytes
;
946 int c
= nbytes
== 1 ? to_uchar (dfa
->lex
.ptr
[0]) : EOF
;
947 dfa
->lex
.ptr
+= nbytes
;
948 dfa
->lex
.left
-= nbytes
;
952 /* If there is no more input, report an error about unbalanced brackets.
953 Otherwise, behave as with fetch_wc (DFA). */
955 bracket_fetch_wc (struct dfa
*dfa
)
958 dfaerror (_("unbalanced ["));
959 return fetch_wc (dfa
);
962 typedef int predicate (int);
964 /* The following list maps the names of the Posix named character classes
965 to predicate functions that determine whether a given character is in
966 the class. The leading [ has already been eaten by the lexical
972 bool single_byte_only
;
975 static const struct dfa_ctype prednames
[] = {
976 {"alpha", isalpha
, false},
977 {"upper", isupper
, false},
978 {"lower", islower
, false},
979 {"digit", isdigit
, true},
980 {"xdigit", isxdigit
, false},
981 {"space", isspace
, false},
982 {"punct", ispunct
, false},
983 {"alnum", isalnum
, false},
984 {"print", isprint
, false},
985 {"graph", isgraph
, false},
986 {"cntrl", iscntrl
, false},
987 {"blank", isblank
, false},
991 static const struct dfa_ctype
*_GL_ATTRIBUTE_PURE
992 find_pred (const char *str
)
994 for (unsigned int i
= 0; prednames
[i
].name
; ++i
)
995 if (streq (str
, prednames
[i
].name
))
996 return &prednames
[i
];
1000 /* Parse a bracket expression, which possibly includes multibyte
1003 parse_bracket_exp (struct dfa
*dfa
)
1005 /* This is a bracket expression that dfaexec is known to
1006 process correctly. */
1007 bool known_bracket_exp
= true;
1009 /* Used to warn about [:space:].
1010 Bit 0 = first character is a colon.
1011 Bit 1 = last character is a colon.
1012 Bit 2 = includes any other character but a colon.
1013 Bit 3 = includes ranges, char/equiv classes or collation elements. */
1014 int colon_warning_state
;
1016 dfa
->lex
.brack
.nchars
= 0;
1019 int c
= bracket_fetch_wc (dfa
);
1020 bool invert
= c
== '^';
1023 c
= bracket_fetch_wc (dfa
);
1024 known_bracket_exp
= dfa
->simple_locale
;
1026 wint_t wc
= dfa
->lex
.wctok
;
1029 colon_warning_state
= (c
== ':');
1032 c1
= NOTCHAR
; /* Mark c1 as not initialized. */
1033 colon_warning_state
&= ~2;
1035 /* Note that if we're looking at some other [:...:] construct,
1036 we just treat it as a bunch of ordinary characters. We can do
1037 this because we assume regex has checked for syntax errors before
1038 dfa is ever called. */
1041 c1
= bracket_fetch_wc (dfa
);
1042 wc1
= dfa
->lex
.wctok
;
1044 if ((c1
== ':' && (dfa
->syntax
.syntax_bits
& RE_CHAR_CLASSES
))
1045 || c1
== '.' || c1
== '=')
1047 enum { MAX_BRACKET_STRING_LEN
= 32 };
1048 char str
[MAX_BRACKET_STRING_LEN
+ 1];
1052 c
= bracket_fetch_wc (dfa
);
1053 if (dfa
->lex
.left
== 0
1054 || (c
== c1
&& dfa
->lex
.ptr
[0] == ']'))
1056 if (len
< MAX_BRACKET_STRING_LEN
)
1059 /* This is in any case an invalid class name. */
1064 /* Fetch bracket. */
1065 c
= bracket_fetch_wc (dfa
);
1066 wc
= dfa
->lex
.wctok
;
1068 /* Build character class. POSIX allows character
1069 classes to match multicharacter collating elements,
1070 but the regex code does not support that, so do not
1071 worry about that possibility. */
1074 = (dfa
->syntax
.case_fold
&& (streq (str
, "upper")
1075 || streq (str
, "lower"))
1077 const struct dfa_ctype
*pred
= find_pred (class);
1079 dfaerror (_("invalid character class"));
1081 if (dfa
->localeinfo
.multibyte
&& !pred
->single_byte_only
)
1082 known_bracket_exp
= false;
1084 for (int c2
= 0; c2
< NOTCHAR
; ++c2
)
1085 if (pred
->func (c2
))
1089 known_bracket_exp
= false;
1091 colon_warning_state
|= 8;
1093 /* Fetch new lookahead character. */
1094 c1
= bracket_fetch_wc (dfa
);
1095 wc1
= dfa
->lex
.wctok
;
1099 /* We treat '[' as a normal character here. c/c1/wc/wc1
1100 are already set up. */
1104 && (dfa
->syntax
.syntax_bits
& RE_BACKSLASH_ESCAPE_IN_LISTS
))
1106 c
= bracket_fetch_wc (dfa
);
1107 wc
= dfa
->lex
.wctok
;
1112 c1
= bracket_fetch_wc (dfa
);
1113 wc1
= dfa
->lex
.wctok
;
1117 /* build range characters. */
1119 int c2
= bracket_fetch_wc (dfa
);
1120 wint_t wc2
= dfa
->lex
.wctok
;
1122 /* A bracket expression like [a-[.aa.]] matches an unknown set.
1123 Treat it like [-a[.aa.]] while parsing it, and
1124 remember that the set is unknown. */
1125 if (c2
== '[' && dfa
->lex
.ptr
[0] == '.')
1127 known_bracket_exp
= false;
1133 /* In the case [x-], the - is an ordinary hyphen,
1134 which is left in c1, the lookahead character. */
1135 dfa
->lex
.ptr
-= dfa
->lex
.cur_mb_len
;
1136 dfa
->lex
.left
+= dfa
->lex
.cur_mb_len
;
1140 if (c2
== '\\' && (dfa
->syntax
.syntax_bits
1141 & RE_BACKSLASH_ESCAPE_IN_LISTS
))
1143 c2
= bracket_fetch_wc (dfa
);
1144 wc2
= dfa
->lex
.wctok
;
1147 colon_warning_state
|= 8;
1148 c1
= bracket_fetch_wc (dfa
);
1149 wc1
= dfa
->lex
.wctok
;
1151 /* Treat [x-y] as a range if x != y. */
1152 if (wc
!= wc2
|| wc
== WEOF
)
1154 if (dfa
->simple_locale
1155 || (isasciidigit (c
) & isasciidigit (c2
)))
1157 for (int ci
= c
; ci
<= c2
; ci
++)
1158 if (dfa
->syntax
.case_fold
&& isalpha (ci
))
1159 setbit_case_fold_c (ci
, &ccl
);
1164 known_bracket_exp
= false;
1171 colon_warning_state
|= (c
== ':') ? 2 : 4;
1173 if (!dfa
->localeinfo
.multibyte
)
1175 if (dfa
->syntax
.case_fold
&& isalpha (c
))
1176 setbit_case_fold_c (c
, &ccl
);
1183 known_bracket_exp
= false;
1186 wchar_t folded
[CASE_FOLDED_BUFSIZE
+ 1];
1187 unsigned int n
= (dfa
->syntax
.case_fold
1188 ? case_folded_counterparts (wc
, folded
+ 1) + 1
1191 for (unsigned int i
= 0; i
< n
; i
++)
1192 if (!setbit_wc (folded
[i
], &ccl
))
1194 dfa
->lex
.brack
.chars
1195 = maybe_realloc (dfa
->lex
.brack
.chars
, dfa
->lex
.brack
.nchars
,
1196 &dfa
->lex
.brack
.nchars_alloc
, -1,
1197 sizeof *dfa
->lex
.brack
.chars
);
1198 dfa
->lex
.brack
.chars
[dfa
->lex
.brack
.nchars
++] = folded
[i
];
1202 while ((wc
= wc1
, (c
= c1
) != ']'));
1204 if (colon_warning_state
== 7)
1205 dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1207 if (! known_bracket_exp
)
1210 if (dfa
->localeinfo
.multibyte
&& (invert
|| dfa
->lex
.brack
.nchars
!= 0))
1212 dfa
->lex
.brack
.invert
= invert
;
1213 dfa
->lex
.brack
.cset
= emptyset (&ccl
) ? -1 : charclass_index (dfa
, &ccl
);
1220 if (dfa
->syntax
.syntax_bits
& RE_HAT_LISTS_NOT_NEWLINE
)
1221 clrbit ('\n', &ccl
);
1224 return CSET
+ charclass_index (dfa
, &ccl
);
1234 push_lex_state (struct dfa
*dfa
, struct lexptr
*ls
, char const *s
)
1236 ls
->ptr
= dfa
->lex
.ptr
;
1237 ls
->left
= dfa
->lex
.left
;
1239 dfa
->lex
.left
= strlen (s
);
1243 pop_lex_state (struct dfa
*dfa
, struct lexptr
const *ls
)
1245 dfa
->lex
.ptr
= ls
->ptr
;
1246 dfa
->lex
.left
= ls
->left
;
1250 lex (struct dfa
*dfa
)
1252 bool backslash
= false;
1254 /* Basic plan: We fetch a character. If it's a backslash,
1255 we set the backslash flag and go through the loop again.
1256 On the plus side, this avoids having a duplicate of the
1257 main switch inside the backslash case. On the minus side,
1258 it means that just about every case begins with
1259 "if (backslash) ...". */
1260 for (int i
= 0; i
< 2; ++i
)
1262 if (! dfa
->lex
.left
)
1263 return dfa
->lex
.lasttok
= END
;
1264 int c
= fetch_wc (dfa
);
1271 if (dfa
->lex
.left
== 0)
1272 dfaerror (_("unfinished \\ escape"));
1279 if (dfa
->syntax
.syntax_bits
& RE_CONTEXT_INDEP_ANCHORS
1280 || dfa
->lex
.lasttok
== END
|| dfa
->lex
.lasttok
== LPAREN
1281 || dfa
->lex
.lasttok
== OR
)
1282 return dfa
->lex
.lasttok
= BEGLINE
;
1288 if (dfa
->syntax
.syntax_bits
& RE_CONTEXT_INDEP_ANCHORS
1289 || dfa
->lex
.left
== 0
1291 > !(dfa
->syntax
.syntax_bits
& RE_NO_BK_PARENS
))
1292 && (dfa
->lex
.ptr
[!(dfa
->syntax
.syntax_bits
& RE_NO_BK_PARENS
)
1293 & (dfa
->lex
.ptr
[0] == '\\')]
1296 > !(dfa
->syntax
.syntax_bits
& RE_NO_BK_VBAR
))
1297 && (dfa
->lex
.ptr
[!(dfa
->syntax
.syntax_bits
& RE_NO_BK_VBAR
)
1298 & (dfa
->lex
.ptr
[0] == '\\')]
1300 || ((dfa
->syntax
.syntax_bits
& RE_NEWLINE_ALT
)
1301 && dfa
->lex
.left
> 0 && dfa
->lex
.ptr
[0] == '\n'))
1302 return dfa
->lex
.lasttok
= ENDLINE
;
1314 if (backslash
&& !(dfa
->syntax
.syntax_bits
& RE_NO_BK_REFS
))
1316 dfa
->lex
.laststart
= false;
1317 return dfa
->lex
.lasttok
= BACKREF
;
1322 if (backslash
&& !(dfa
->syntax
.syntax_bits
& RE_NO_GNU_OPS
))
1324 /* FIXME: should be beginning of string */
1325 return dfa
->lex
.lasttok
= BEGLINE
;
1330 if (backslash
&& !(dfa
->syntax
.syntax_bits
& RE_NO_GNU_OPS
))
1332 /* FIXME: should be end of string */
1333 return dfa
->lex
.lasttok
= ENDLINE
;
1338 if (backslash
&& !(dfa
->syntax
.syntax_bits
& RE_NO_GNU_OPS
))
1339 return dfa
->lex
.lasttok
= BEGWORD
;
1343 if (backslash
&& !(dfa
->syntax
.syntax_bits
& RE_NO_GNU_OPS
))
1344 return dfa
->lex
.lasttok
= ENDWORD
;
1348 if (backslash
&& !(dfa
->syntax
.syntax_bits
& RE_NO_GNU_OPS
))
1349 return dfa
->lex
.lasttok
= LIMWORD
;
1353 if (backslash
&& !(dfa
->syntax
.syntax_bits
& RE_NO_GNU_OPS
))
1354 return dfa
->lex
.lasttok
= NOTLIMWORD
;
1358 if (dfa
->syntax
.syntax_bits
& RE_LIMITED_OPS
)
1360 if (backslash
!= ((dfa
->syntax
.syntax_bits
& RE_BK_PLUS_QM
) != 0))
1362 if (!(dfa
->syntax
.syntax_bits
& RE_CONTEXT_INDEP_OPS
)
1363 && dfa
->lex
.laststart
)
1365 return dfa
->lex
.lasttok
= QMARK
;
1370 if (!(dfa
->syntax
.syntax_bits
& RE_CONTEXT_INDEP_OPS
)
1371 && dfa
->lex
.laststart
)
1373 return dfa
->lex
.lasttok
= STAR
;
1376 if (dfa
->syntax
.syntax_bits
& RE_LIMITED_OPS
)
1378 if (backslash
!= ((dfa
->syntax
.syntax_bits
& RE_BK_PLUS_QM
) != 0))
1380 if (!(dfa
->syntax
.syntax_bits
& RE_CONTEXT_INDEP_OPS
)
1381 && dfa
->lex
.laststart
)
1383 return dfa
->lex
.lasttok
= PLUS
;
1386 if (!(dfa
->syntax
.syntax_bits
& RE_INTERVALS
))
1388 if (backslash
!= ((dfa
->syntax
.syntax_bits
& RE_NO_BK_BRACES
) == 0))
1390 if (!(dfa
->syntax
.syntax_bits
& RE_CONTEXT_INDEP_OPS
)
1391 && dfa
->lex
.laststart
)
1396 {M,} - minimum count, maximum is infinity
1398 {,} - 0 to infinity (same as '*')
1399 {M,N} - M through N */
1401 char const *p
= dfa
->lex
.ptr
;
1402 char const *lim
= p
+ dfa
->lex
.left
;
1403 dfa
->lex
.minrep
= dfa
->lex
.maxrep
= -1;
1404 for (; p
!= lim
&& isasciidigit (*p
); p
++)
1405 dfa
->lex
.minrep
= (dfa
->lex
.minrep
< 0
1407 : MIN (RE_DUP_MAX
+ 1,
1408 dfa
->lex
.minrep
* 10 + *p
- '0'));
1412 dfa
->lex
.maxrep
= dfa
->lex
.minrep
;
1415 if (dfa
->lex
.minrep
< 0)
1416 dfa
->lex
.minrep
= 0;
1417 while (++p
!= lim
&& isasciidigit (*p
))
1419 = (dfa
->lex
.maxrep
< 0
1421 : MIN (RE_DUP_MAX
+ 1,
1422 dfa
->lex
.maxrep
* 10 + *p
- '0'));
1425 if (! ((! backslash
|| (p
!= lim
&& *p
++ == '\\'))
1426 && p
!= lim
&& *p
++ == '}'
1427 && 0 <= dfa
->lex
.minrep
1428 && (dfa
->lex
.maxrep
< 0
1429 || dfa
->lex
.minrep
<= dfa
->lex
.maxrep
)))
1431 if (dfa
->syntax
.syntax_bits
& RE_INVALID_INTERVAL_ORD
)
1433 dfaerror (_("invalid content of \\{\\}"));
1435 if (RE_DUP_MAX
< dfa
->lex
.maxrep
)
1436 dfaerror (_("regular expression too big"));
1438 dfa
->lex
.left
= lim
- p
;
1440 dfa
->lex
.laststart
= false;
1441 return dfa
->lex
.lasttok
= REPMN
;
1444 if (dfa
->syntax
.syntax_bits
& RE_LIMITED_OPS
)
1446 if (backslash
!= ((dfa
->syntax
.syntax_bits
& RE_NO_BK_VBAR
) == 0))
1448 dfa
->lex
.laststart
= true;
1449 return dfa
->lex
.lasttok
= OR
;
1452 if (dfa
->syntax
.syntax_bits
& RE_LIMITED_OPS
1453 || backslash
|| !(dfa
->syntax
.syntax_bits
& RE_NEWLINE_ALT
))
1455 dfa
->lex
.laststart
= true;
1456 return dfa
->lex
.lasttok
= OR
;
1459 if (backslash
!= ((dfa
->syntax
.syntax_bits
& RE_NO_BK_PARENS
) == 0))
1462 dfa
->lex
.laststart
= true;
1463 return dfa
->lex
.lasttok
= LPAREN
;
1466 if (backslash
!= ((dfa
->syntax
.syntax_bits
& RE_NO_BK_PARENS
) == 0))
1468 if (dfa
->lex
.parens
== 0
1469 && dfa
->syntax
.syntax_bits
& RE_UNMATCHED_RIGHT_PAREN_ORD
)
1472 dfa
->lex
.laststart
= false;
1473 return dfa
->lex
.lasttok
= RPAREN
;
1478 if (dfa
->canychar
== (size_t) -1)
1482 if (!(dfa
->syntax
.syntax_bits
& RE_DOT_NEWLINE
))
1483 clrbit ('\n', &ccl
);
1484 if (dfa
->syntax
.syntax_bits
& RE_DOT_NOT_NULL
)
1485 clrbit ('\0', &ccl
);
1486 if (dfa
->localeinfo
.multibyte
)
1487 for (int c2
= 0; c2
< NOTCHAR
; c2
++)
1488 if (dfa
->localeinfo
.sbctowc
[c2
] == WEOF
)
1490 dfa
->canychar
= charclass_index (dfa
, &ccl
);
1492 dfa
->lex
.laststart
= false;
1493 return dfa
->lex
.lasttok
= (dfa
->localeinfo
.multibyte
1495 : CSET
+ dfa
->canychar
);
1499 if (!backslash
|| (dfa
->syntax
.syntax_bits
& RE_NO_GNU_OPS
))
1501 if (!dfa
->localeinfo
.multibyte
)
1505 for (int c2
= 0; c2
< NOTCHAR
; ++c2
)
1510 dfa
->lex
.laststart
= false;
1511 return dfa
->lex
.lasttok
= CSET
+ charclass_index (dfa
, &ccl
);
1514 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1515 add_utf8_anychar, makes sense. */
1517 /* \s and \S are documented to be equivalent to [[:space:]] and
1518 [^[:space:]] respectively, so tell the lexer to process those
1519 strings, each minus its "already processed" '['. */
1522 push_lex_state (dfa
, &ls
, &"^[:space:]]"[c
== 's']);
1523 dfa
->lex
.lasttok
= parse_bracket_exp (dfa
);
1524 pop_lex_state (dfa
, &ls
);
1527 dfa
->lex
.laststart
= false;
1528 return dfa
->lex
.lasttok
;
1532 if (!backslash
|| (dfa
->syntax
.syntax_bits
& RE_NO_GNU_OPS
))
1535 if (!dfa
->localeinfo
.multibyte
)
1539 for (int c2
= 0; c2
< NOTCHAR
; ++c2
)
1540 if (dfa
->syntax
.sbit
[c2
] == CTX_LETTER
)
1544 dfa
->lex
.laststart
= false;
1545 return dfa
->lex
.lasttok
= CSET
+ charclass_index (dfa
, &ccl
);
1548 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1549 add_utf8_anychar, makes sense. */
1551 /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1552 [^_[:alnum:]] respectively, so tell the lexer to process those
1553 strings, each minus its "already processed" '['. */
1556 push_lex_state (dfa
, &ls
, &"^_[:alnum:]]"[c
== 'w']);
1557 dfa
->lex
.lasttok
= parse_bracket_exp (dfa
);
1558 pop_lex_state (dfa
, &ls
);
1561 dfa
->lex
.laststart
= false;
1562 return dfa
->lex
.lasttok
;
1567 dfa
->lex
.laststart
= false;
1568 return dfa
->lex
.lasttok
= parse_bracket_exp (dfa
);
1572 dfa
->lex
.laststart
= false;
1573 /* For multibyte character sets, folding is done in atom. Always
1575 if (dfa
->localeinfo
.multibyte
)
1576 return dfa
->lex
.lasttok
= WCHAR
;
1578 if (dfa
->syntax
.case_fold
&& isalpha (c
))
1582 setbit_case_fold_c (c
, &ccl
);
1583 return dfa
->lex
.lasttok
= CSET
+ charclass_index (dfa
, &ccl
);
1586 return dfa
->lex
.lasttok
= c
;
1590 /* The above loop should consume at most a backslash
1591 and some other character. */
1593 return END
; /* keeps pedantic compilers happy. */
1597 addtok_mb (struct dfa
*dfa
, token t
, char mbprop
)
1599 if (dfa
->talloc
== dfa
->tindex
)
1601 dfa
->tokens
= x2nrealloc (dfa
->tokens
, &dfa
->talloc
,
1602 sizeof *dfa
->tokens
);
1603 if (dfa
->localeinfo
.multibyte
)
1604 dfa
->multibyte_prop
= xnrealloc (dfa
->multibyte_prop
, dfa
->talloc
,
1605 sizeof *dfa
->multibyte_prop
);
1607 if (dfa
->localeinfo
.multibyte
)
1608 dfa
->multibyte_prop
[dfa
->tindex
] = mbprop
;
1609 dfa
->tokens
[dfa
->tindex
++] = t
;
1633 if (dfa
->parse
.depth
> dfa
->depth
)
1634 dfa
->depth
= dfa
->parse
.depth
;
1637 static void addtok_wc (struct dfa
*dfa
, wint_t wc
);
1639 /* Add the given token to the parse tree, maintaining the depth count and
1640 updating the maximum depth if necessary. */
1642 addtok (struct dfa
*dfa
, token t
)
1644 if (dfa
->localeinfo
.multibyte
&& t
== MBCSET
)
1646 bool need_or
= false;
1648 /* Extract wide characters into alternations for better performance.
1649 This does not require UTF-8. */
1650 for (ptrdiff_t i
= 0; i
< dfa
->lex
.brack
.nchars
; i
++)
1652 addtok_wc (dfa
, dfa
->lex
.brack
.chars
[i
]);
1657 dfa
->lex
.brack
.nchars
= 0;
1659 /* Wide characters have been handled above, so it is possible
1660 that the set is empty now. Do nothing in that case. */
1661 if (dfa
->lex
.brack
.cset
!= -1)
1663 addtok (dfa
, CSET
+ dfa
->lex
.brack
.cset
);
1670 addtok_mb (dfa
, t
, 3);
1674 /* We treat a multibyte character as a single atom, so that DFA
1675 can treat a multibyte character as a single expression.
1677 e.g., we construct the following tree from "<mb1><mb2>".
1678 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1679 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1681 addtok_wc (struct dfa
*dfa
, wint_t wc
)
1683 unsigned char buf
[MB_LEN_MAX
];
1684 mbstate_t s
= { 0 };
1685 size_t stored_bytes
= wcrtomb ((char *) buf
, wc
, &s
);
1687 if (stored_bytes
!= (size_t) -1)
1688 dfa
->lex
.cur_mb_len
= stored_bytes
;
1691 /* This is merely stop-gap. buf[0] is undefined, yet skipping
1692 the addtok_mb call altogether can corrupt the heap. */
1693 dfa
->lex
.cur_mb_len
= 1;
1697 addtok_mb (dfa
, buf
[0], dfa
->lex
.cur_mb_len
== 1 ? 3 : 1);
1698 for (int i
= 1; i
< dfa
->lex
.cur_mb_len
; i
++)
1700 addtok_mb (dfa
, buf
[i
], i
== dfa
->lex
.cur_mb_len
- 1 ? 2 : 0);
1706 add_utf8_anychar (struct dfa
*dfa
)
1708 static charclass
const utf8_classes
[5] = {
1709 /* 80-bf: non-leading bytes. */
1710 CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1712 /* 00-7f: 1-byte sequence. */
1713 CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1715 /* c2-df: 2-byte sequence. */
1716 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1718 /* e0-ef: 3-byte sequence. */
1719 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xffff),
1721 /* f0-f7: 4-byte sequence. */
1722 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xff0000)
1724 const unsigned int n
= sizeof (utf8_classes
) / sizeof (utf8_classes
[0]);
1726 /* Define the five character classes that are needed below. */
1727 if (dfa
->utf8_anychar_classes
[0] == 0)
1728 for (unsigned int i
= 0; i
< n
; i
++)
1730 charclass c
= utf8_classes
[i
];
1733 if (!(dfa
->syntax
.syntax_bits
& RE_DOT_NEWLINE
))
1735 if (dfa
->syntax
.syntax_bits
& RE_DOT_NOT_NULL
)
1738 dfa
->utf8_anychar_classes
[i
] = CSET
+ charclass_index (dfa
, &c
);
1741 /* A valid UTF-8 character is
1744 |[0xc2-0xdf][0x80-0xbf]
1745 |[0xe0-0xef[0x80-0xbf][0x80-0xbf]
1746 |[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf])
1748 which I'll write more concisely "B|CA|DAA|EAAA". Factor the [0x00-0x7f]
1749 and you get "B|(C|(D|EA)A)A". And since the token buffer is in reverse
1750 Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR". */
1752 for (i
= 1; i
< n
; i
++)
1753 addtok (dfa
, dfa
->utf8_anychar_classes
[i
]);
1756 addtok (dfa
, dfa
->utf8_anychar_classes
[0]);
1762 /* The grammar understood by the parser is as follows.
1781 <multibyte character>
1792 LPAREN regexp RPAREN
1795 The parser builds a parse tree in postfix form in an array of tokens. */
1798 atom (struct dfa
*dfa
)
1800 if (dfa
->parse
.tok
== WCHAR
)
1802 if (dfa
->lex
.wctok
== WEOF
)
1803 addtok (dfa
, BACKREF
);
1806 addtok_wc (dfa
, dfa
->lex
.wctok
);
1808 if (dfa
->syntax
.case_fold
)
1810 wchar_t folded
[CASE_FOLDED_BUFSIZE
];
1811 unsigned int n
= case_folded_counterparts (dfa
->lex
.wctok
,
1813 for (unsigned int i
= 0; i
< n
; i
++)
1815 addtok_wc (dfa
, folded
[i
]);
1821 dfa
->parse
.tok
= lex (dfa
);
1823 else if (dfa
->parse
.tok
== ANYCHAR
&& dfa
->localeinfo
.using_utf8
)
1825 /* For UTF-8 expand the period to a series of CSETs that define a valid
1826 UTF-8 character. This avoids using the slow multibyte path. I'm
1827 pretty sure it would be both profitable and correct to do it for
1828 any encoding; however, the optimization must be done manually as
1829 it is done above in add_utf8_anychar. So, let's start with
1830 UTF-8: it is the most used, and the structure of the encoding
1831 makes the correctness more obvious. */
1832 add_utf8_anychar (dfa
);
1833 dfa
->parse
.tok
= lex (dfa
);
1835 else if ((0 <= dfa
->parse
.tok
&& dfa
->parse
.tok
< NOTCHAR
)
1836 || dfa
->parse
.tok
>= CSET
|| dfa
->parse
.tok
== BACKREF
1837 || dfa
->parse
.tok
== BEGLINE
|| dfa
->parse
.tok
== ENDLINE
1838 || dfa
->parse
.tok
== BEGWORD
|| dfa
->parse
.tok
== ANYCHAR
1839 || dfa
->parse
.tok
== MBCSET
|| dfa
->parse
.tok
== ENDWORD
1840 || dfa
->parse
.tok
== LIMWORD
|| dfa
->parse
.tok
== NOTLIMWORD
)
1842 addtok (dfa
, dfa
->parse
.tok
);
1843 dfa
->parse
.tok
= lex (dfa
);
1845 else if (dfa
->parse
.tok
== LPAREN
)
1847 dfa
->parse
.tok
= lex (dfa
);
1849 if (dfa
->parse
.tok
!= RPAREN
)
1850 dfaerror (_("unbalanced ("));
1851 dfa
->parse
.tok
= lex (dfa
);
1854 addtok (dfa
, EMPTY
);
1857 /* Return the number of tokens in the given subexpression. */
1858 static size_t _GL_ATTRIBUTE_PURE
1859 nsubtoks (struct dfa
const *dfa
, size_t tindex
)
1861 switch (dfa
->tokens
[tindex
- 1])
1868 return 1 + nsubtoks (dfa
, tindex
- 1);
1872 size_t ntoks1
= nsubtoks (dfa
, tindex
- 1);
1873 return 1 + ntoks1
+ nsubtoks (dfa
, tindex
- 1 - ntoks1
);
1878 /* Copy the given subexpression to the top of the tree. */
1880 copytoks (struct dfa
*dfa
, size_t tindex
, size_t ntokens
)
1882 if (dfa
->localeinfo
.multibyte
)
1883 for (size_t i
= 0; i
< ntokens
; ++i
)
1884 addtok_mb (dfa
, dfa
->tokens
[tindex
+ i
],
1885 dfa
->multibyte_prop
[tindex
+ i
]);
1887 for (size_t i
= 0; i
< ntokens
; ++i
)
1888 addtok_mb (dfa
, dfa
->tokens
[tindex
+ i
], 3);
1892 closure (struct dfa
*dfa
)
1895 while (dfa
->parse
.tok
== QMARK
|| dfa
->parse
.tok
== STAR
1896 || dfa
->parse
.tok
== PLUS
|| dfa
->parse
.tok
== REPMN
)
1897 if (dfa
->parse
.tok
== REPMN
&& (dfa
->lex
.minrep
|| dfa
->lex
.maxrep
))
1899 size_t ntokens
= nsubtoks (dfa
, dfa
->tindex
);
1900 size_t tindex
= dfa
->tindex
- ntokens
;
1901 if (dfa
->lex
.maxrep
< 0)
1903 if (dfa
->lex
.minrep
== 0)
1904 addtok (dfa
, QMARK
);
1906 for (i
= 1; i
< dfa
->lex
.minrep
; i
++)
1908 copytoks (dfa
, tindex
, ntokens
);
1911 for (; i
< dfa
->lex
.maxrep
; i
++)
1913 copytoks (dfa
, tindex
, ntokens
);
1914 addtok (dfa
, QMARK
);
1917 dfa
->parse
.tok
= lex (dfa
);
1919 else if (dfa
->parse
.tok
== REPMN
)
1921 dfa
->tindex
-= nsubtoks (dfa
, dfa
->tindex
);
1922 dfa
->parse
.tok
= lex (dfa
);
1927 addtok (dfa
, dfa
->parse
.tok
);
1928 dfa
->parse
.tok
= lex (dfa
);
1933 branch (struct dfa
* dfa
)
1936 while (dfa
->parse
.tok
!= RPAREN
&& dfa
->parse
.tok
!= OR
1937 && dfa
->parse
.tok
>= 0)
1945 regexp (struct dfa
*dfa
)
1948 while (dfa
->parse
.tok
== OR
)
1950 dfa
->parse
.tok
= lex (dfa
);
1956 /* Main entry point for the parser. S is a string to be parsed, len is the
1957 length of the string, so s can include NUL characters. D is a pointer to
1958 the struct dfa to parse into. */
1960 dfaparse (char const *s
, size_t len
, struct dfa
*d
)
1964 d
->lex
.lasttok
= END
;
1965 d
->lex
.laststart
= true;
1967 if (!d
->syntax
.syntax_bits_set
)
1968 dfaerror (_("no syntax specified"));
1970 d
->parse
.tok
= lex (d
);
1971 d
->parse
.depth
= d
->depth
;
1975 if (d
->parse
.tok
!= END
)
1976 dfaerror (_("unbalanced )"));
1978 addtok (d
, END
- d
->nregexps
);
1987 /* Some primitives for operating on sets of positions. */
1989 /* Copy one set to another. */
1991 copy (position_set
const *src
, position_set
*dst
)
1993 if (dst
->alloc
< src
->nelem
)
1996 dst
->elems
= xpalloc (NULL
, &dst
->alloc
, src
->nelem
- dst
->alloc
, -1,
1997 sizeof *dst
->elems
);
1999 dst
->nelem
= src
->nelem
;
2000 if (src
->nelem
!= 0)
2001 memcpy (dst
->elems
, src
->elems
, src
->nelem
* sizeof *dst
->elems
);
2005 alloc_position_set (position_set
*s
, size_t size
)
2007 s
->elems
= xnmalloc (size
, sizeof *s
->elems
);
2012 /* Insert position P in set S. S is maintained in sorted order on
2013 decreasing index. If there is already an entry in S with P.index
2014 then merge (logically-OR) P's constraints into the one in S.
2015 S->elems must point to an array large enough to hold the resulting set. */
2017 insert (position p
, position_set
*s
)
2019 ptrdiff_t count
= s
->nelem
;
2020 ptrdiff_t lo
= 0, hi
= count
;
2023 ptrdiff_t mid
= (lo
+ hi
) >> 1;
2024 if (s
->elems
[mid
].index
> p
.index
)
2026 else if (s
->elems
[mid
].index
== p
.index
)
2028 s
->elems
[mid
].constraint
|= p
.constraint
;
2035 s
->elems
= maybe_realloc (s
->elems
, count
, &s
->alloc
, -1, sizeof *s
->elems
);
2036 for (ptrdiff_t i
= count
; i
> lo
; i
--)
2037 s
->elems
[i
] = s
->elems
[i
- 1];
2042 /* Merge S1 and S2 (with the additional constraint C2) into M. The
2043 result is as if the positions of S1, and of S2 with the additional
2044 constraint C2, were inserted into an initially empty set. */
2046 merge_constrained (position_set
const *s1
, position_set
const *s2
,
2047 unsigned int c2
, position_set
*m
)
2049 ptrdiff_t i
= 0, j
= 0;
2051 if (m
->alloc
- s1
->nelem
< s2
->nelem
)
2054 m
->alloc
= s1
->nelem
;
2055 m
->elems
= xpalloc (NULL
, &m
->alloc
, s2
->nelem
, -1, sizeof *m
->elems
);
2058 while (i
< s1
->nelem
|| j
< s2
->nelem
)
2059 if (! (j
< s2
->nelem
)
2060 || (i
< s1
->nelem
&& s1
->elems
[i
].index
>= s2
->elems
[j
].index
))
2062 unsigned int c
= ((i
< s1
->nelem
&& j
< s2
->nelem
2063 && s1
->elems
[i
].index
== s2
->elems
[j
].index
)
2064 ? s2
->elems
[j
++].constraint
& c2
2066 m
->elems
[m
->nelem
].index
= s1
->elems
[i
].index
;
2067 m
->elems
[m
->nelem
++].constraint
= s1
->elems
[i
++].constraint
| c
;
2071 if (s2
->elems
[j
].constraint
& c2
)
2073 m
->elems
[m
->nelem
].index
= s2
->elems
[j
].index
;
2074 m
->elems
[m
->nelem
++].constraint
= s2
->elems
[j
].constraint
& c2
;
2080 /* Merge two sets of positions into a third. The result is exactly as if
2081 the positions of both sets were inserted into an initially empty set. */
2083 merge (position_set
const *s1
, position_set
const *s2
, position_set
*m
)
2085 merge_constrained (s1
, s2
, -1, m
);
2088 /* Delete a position from a set. Return the nonzero constraint of the
2089 deleted position, or zero if there was no such position. */
2091 delete (size_t del
, position_set
*s
)
2093 size_t count
= s
->nelem
;
2094 size_t lo
= 0, hi
= count
;
2097 size_t mid
= (lo
+ hi
) >> 1;
2098 if (s
->elems
[mid
].index
> del
)
2100 else if (s
->elems
[mid
].index
== del
)
2102 unsigned int c
= s
->elems
[mid
].constraint
;
2104 for (i
= mid
; i
+ 1 < count
; i
++)
2105 s
->elems
[i
] = s
->elems
[i
+ 1];
2115 /* Replace a position with the followed set. */
2117 replace (position_set
*dst
, size_t del
, position_set
*add
,
2118 unsigned int constraint
, position_set
*tmp
)
2120 unsigned int c
= delete (del
, dst
) & constraint
;
2125 merge_constrained (tmp
, add
, c
, dst
);
2129 /* Find the index of the state corresponding to the given position set with
2130 the given preceding context, or create a new state if there is no such
2131 state. Context tells whether we got here on a newline or letter. */
2133 state_index (struct dfa
*d
, position_set
const *s
, int context
)
2138 token first_end
= 0;
2140 for (i
= 0; i
< s
->nelem
; ++i
)
2141 hash
^= s
->elems
[i
].index
+ s
->elems
[i
].constraint
;
2143 /* Try to find a state that exactly matches the proposed one. */
2144 for (i
= 0; i
< d
->sindex
; ++i
)
2146 if (hash
!= d
->states
[i
].hash
|| s
->nelem
!= d
->states
[i
].elems
.nelem
2147 || context
!= d
->states
[i
].context
)
2150 for (j
= 0; j
< s
->nelem
; ++j
)
2151 if (s
->elems
[j
].constraint
!= d
->states
[i
].elems
.elems
[j
].constraint
2152 || s
->elems
[j
].index
!= d
->states
[i
].elems
.elems
[j
].index
)
2159 fprintf (stderr
, "new state %zd\n nextpos:", i
);
2160 for (state_num j
= 0; j
< s
->nelem
; j
++)
2162 fprintf (stderr
, " %zu:", s
->elems
[j
].index
);
2163 prtok (d
->tokens
[s
->elems
[j
].index
]);
2165 fprintf (stderr
, "\n context:");
2166 if (context
^ CTX_ANY
)
2168 if (context
& CTX_NONE
)
2169 fprintf (stderr
, " CTX_NONE");
2170 if (context
& CTX_LETTER
)
2171 fprintf (stderr
, " CTX_LETTER");
2172 if (context
& CTX_NEWLINE
)
2173 fprintf (stderr
, " CTX_NEWLINE");
2176 fprintf (stderr
, " CTX_ANY");
2177 fprintf (stderr
, "\n");
2180 for (state_num j
= 0; j
< s
->nelem
; j
++)
2182 int c
= s
->elems
[j
].constraint
;
2183 if (d
->tokens
[s
->elems
[j
].index
] < 0)
2185 if (succeeds_in_context (c
, context
, CTX_ANY
))
2188 first_end
= d
->tokens
[s
->elems
[j
].index
];
2190 else if (d
->tokens
[s
->elems
[j
].index
] == BACKREF
)
2191 constraint
= NO_CONSTRAINT
;
2195 /* Create a new state. */
2196 d
->states
= maybe_realloc (d
->states
, d
->sindex
, &d
->salloc
, -1,
2198 d
->states
[i
].hash
= hash
;
2199 alloc_position_set (&d
->states
[i
].elems
, s
->nelem
);
2200 copy (s
, &d
->states
[i
].elems
);
2201 d
->states
[i
].context
= context
;
2202 d
->states
[i
].constraint
= constraint
;
2203 d
->states
[i
].first_end
= first_end
;
2204 d
->states
[i
].mbps
.nelem
= 0;
2205 d
->states
[i
].mbps
.elems
= NULL
;
2206 d
->states
[i
].mb_trindex
= -1;
2213 /* Find the epsilon closure of a set of positions. If any position of the set
2214 contains a symbol that matches the empty string in some context, replace
2215 that position with the elements of its follow labeled with an appropriate
2216 constraint. Repeat exhaustively until no funny positions are left.
2217 S->elems must be large enough to hold the result. */
2219 epsclosure (position_set
*initial
, struct dfa
const *d
)
2222 alloc_position_set (&tmp
, d
->nleaves
);
2223 for (size_t i
= 0; i
< d
->tindex
; ++i
)
2224 if (d
->follows
[i
].nelem
> 0 && d
->tokens
[i
] >= NOTCHAR
2225 && d
->tokens
[i
] != BACKREF
&& d
->tokens
[i
] != ANYCHAR
2226 && d
->tokens
[i
] != MBCSET
&& d
->tokens
[i
] < CSET
)
2228 unsigned int constraint
;
2229 switch (d
->tokens
[i
])
2232 constraint
= BEGLINE_CONSTRAINT
;
2235 constraint
= ENDLINE_CONSTRAINT
;
2238 constraint
= BEGWORD_CONSTRAINT
;
2241 constraint
= ENDWORD_CONSTRAINT
;
2244 constraint
= LIMWORD_CONSTRAINT
;
2247 constraint
= NOTLIMWORD_CONSTRAINT
;
2250 constraint
= NO_CONSTRAINT
;
2254 delete (i
, &d
->follows
[i
]);
2256 for (size_t j
= 0; j
< d
->tindex
; j
++)
2257 if (i
!= j
&& d
->follows
[j
].nelem
> 0)
2258 replace (&d
->follows
[j
], i
, &d
->follows
[i
], constraint
, &tmp
);
2260 replace (initial
, i
, &d
->follows
[i
], constraint
, &tmp
);
2265 /* Returns the set of contexts for which there is at least one
2266 character included in C. */
2269 charclass_context (struct dfa
const *dfa
, charclass
const *c
)
2273 for (unsigned int j
= 0; j
< CHARCLASS_WORDS
; ++j
)
2275 if (c
->w
[j
] & dfa
->syntax
.newline
.w
[j
])
2276 context
|= CTX_NEWLINE
;
2277 if (c
->w
[j
] & dfa
->syntax
.letters
.w
[j
])
2278 context
|= CTX_LETTER
;
2279 if (c
->w
[j
] & ~(dfa
->syntax
.letters
.w
[j
] | dfa
->syntax
.newline
.w
[j
]))
2280 context
|= CTX_NONE
;
2286 /* Returns the contexts on which the position set S depends. Each context
2287 in the set of returned contexts (let's call it SC) may have a different
2288 follow set than other contexts in SC, and also different from the
2289 follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2290 in the complement set will have the same follow set. */
2292 static int _GL_ATTRIBUTE_PURE
2293 state_separate_contexts (position_set
const *s
)
2295 int separate_contexts
= 0;
2297 for (size_t j
= 0; j
< s
->nelem
; j
++)
2299 if (prev_newline_dependent (s
->elems
[j
].constraint
))
2300 separate_contexts
|= CTX_NEWLINE
;
2301 if (prev_letter_dependent (s
->elems
[j
].constraint
))
2302 separate_contexts
|= CTX_LETTER
;
2305 return separate_contexts
;
2309 /* Perform bottom-up analysis on the parse tree, computing various functions.
2310 Note that at this point, we're pretending constructs like \< are real
2311 characters rather than constraints on what can follow them.
2313 Nullable: A node is nullable if it is at the root of a regexp that can
2314 match the empty string.
2315 * EMPTY leaves are nullable.
2316 * No other leaf is nullable.
2317 * A QMARK or STAR node is nullable.
2318 * A PLUS node is nullable if its argument is nullable.
2319 * A CAT node is nullable if both its arguments are nullable.
2320 * An OR node is nullable if either argument is nullable.
2322 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2323 that could correspond to the first character of a string matching the
2324 regexp rooted at the given node.
2325 * EMPTY leaves have empty firstpos.
2326 * The firstpos of a nonempty leaf is that leaf itself.
2327 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2329 * The firstpos of a CAT node is the firstpos of the left argument, union
2330 the firstpos of the right if the left argument is nullable.
2331 * The firstpos of an OR node is the union of firstpos of each argument.
2333 Lastpos: The lastpos of a node is the set of positions that could
2334 correspond to the last character of a string matching the regexp at
2336 * EMPTY leaves have empty lastpos.
2337 * The lastpos of a nonempty leaf is that leaf itself.
2338 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2340 * The lastpos of a CAT node is the lastpos of its right argument, union
2341 the lastpos of the left if the right argument is nullable.
2342 * The lastpos of an OR node is the union of the lastpos of each argument.
2344 Follow: The follow of a position is the set of positions that could
2345 correspond to the character following a character matching the node in
2346 a string matching the regexp. At this point we consider special symbols
2347 that match the empty string in some context to be just normal characters.
2348 Later, if we find that a special symbol is in a follow set, we will
2349 replace it with the elements of its follow, labeled with an appropriate
2351 * Every node in the firstpos of the argument of a STAR or PLUS node is in
2352 the follow of every node in the lastpos.
2353 * Every node in the firstpos of the second argument of a CAT node is in
2354 the follow of every node in the lastpos of the first argument.
2356 Because of the postfix representation of the parse tree, the depth-first
2357 analysis is conveniently done by a linear scan with the aid of a stack.
2358 Sets are stored as arrays of the elements, obeying a stack-like allocation
2359 scheme; the number of elements in each set deeper in the stack can be
2360 used to determine the address of a particular set's array. */
2362 dfaanalyze (struct dfa
*d
, bool searchflag
)
2364 /* Array allocated to hold position sets. */
2365 position
*posalloc
= xnmalloc (d
->nleaves
, 2 * sizeof *posalloc
);
2366 /* Firstpos and lastpos elements. */
2367 position
*firstpos
= posalloc
+ d
->nleaves
;
2368 position
*lastpos
= firstpos
+ d
->nleaves
;
2370 /* Stack for element counts and nullable flags. */
2373 /* Whether the entry is nullable. */
2376 /* Counts of firstpos and lastpos sets. */
2379 } *stkalloc
= xnmalloc (d
->depth
, sizeof *stkalloc
), *stk
= stkalloc
;
2381 position_set merged
; /* Result of merging sets. */
2384 fprintf (stderr
, "dfaanalyze:\n");
2385 for (size_t i
= 0; i
< d
->tindex
; ++i
)
2387 fprintf (stderr
, " %zu:", i
);
2388 prtok (d
->tokens
[i
]);
2390 putc ('\n', stderr
);
2393 d
->searchflag
= searchflag
;
2394 alloc_position_set (&merged
, d
->nleaves
);
2395 d
->follows
= xcalloc (d
->tindex
, sizeof *d
->follows
);
2397 for (size_t i
= 0; i
< d
->tindex
; ++i
)
2399 switch (d
->tokens
[i
])
2402 /* The empty set is nullable. */
2403 stk
->nullable
= true;
2405 /* The firstpos and lastpos of the empty leaf are both empty. */
2406 stk
->nfirstpos
= stk
->nlastpos
= 0;
2412 /* Every element in the firstpos of the argument is in the follow
2413 of every element in the lastpos. */
2416 tmp
.nelem
= stk
[-1].nfirstpos
;
2417 tmp
.elems
= firstpos
;
2418 position
*pos
= lastpos
;
2419 for (size_t j
= 0; j
< stk
[-1].nlastpos
; j
++)
2421 merge (&tmp
, &d
->follows
[pos
[j
].index
], &merged
);
2422 copy (&merged
, &d
->follows
[pos
[j
].index
]);
2427 /* A QMARK or STAR node is automatically nullable. */
2428 if (d
->tokens
[i
] != PLUS
)
2429 stk
[-1].nullable
= true;
2433 /* Every element in the firstpos of the second argument is in the
2434 follow of every element in the lastpos of the first argument. */
2437 tmp
.nelem
= stk
[-1].nfirstpos
;
2438 tmp
.elems
= firstpos
;
2439 position
*pos
= lastpos
+ stk
[-1].nlastpos
;
2440 for (size_t j
= 0; j
< stk
[-2].nlastpos
; j
++)
2442 merge (&tmp
, &d
->follows
[pos
[j
].index
], &merged
);
2443 copy (&merged
, &d
->follows
[pos
[j
].index
]);
2447 /* The firstpos of a CAT node is the firstpos of the first argument,
2448 union that of the second argument if the first is nullable. */
2449 if (stk
[-2].nullable
)
2450 stk
[-2].nfirstpos
+= stk
[-1].nfirstpos
;
2452 firstpos
+= stk
[-1].nfirstpos
;
2454 /* The lastpos of a CAT node is the lastpos of the second argument,
2455 union that of the first argument if the second is nullable. */
2456 if (stk
[-1].nullable
)
2457 stk
[-2].nlastpos
+= stk
[-1].nlastpos
;
2460 position
*pos
= lastpos
+ stk
[-2].nlastpos
;
2461 for (size_t j
= stk
[-1].nlastpos
; j
-- > 0;)
2462 pos
[j
] = lastpos
[j
];
2463 lastpos
+= stk
[-2].nlastpos
;
2464 stk
[-2].nlastpos
= stk
[-1].nlastpos
;
2467 /* A CAT node is nullable if both arguments are nullable. */
2468 stk
[-2].nullable
&= stk
[-1].nullable
;
2473 /* The firstpos is the union of the firstpos of each argument. */
2474 stk
[-2].nfirstpos
+= stk
[-1].nfirstpos
;
2476 /* The lastpos is the union of the lastpos of each argument. */
2477 stk
[-2].nlastpos
+= stk
[-1].nlastpos
;
2479 /* An OR node is nullable if either argument is nullable. */
2480 stk
[-2].nullable
|= stk
[-1].nullable
;
2485 /* Anything else is a nonempty position. (Note that special
2486 constructs like \< are treated as nonempty strings here;
2487 an "epsilon closure" effectively makes them nullable later.
2488 Backreferences have to get a real position so we can detect
2489 transitions on them later. But they are nullable. */
2490 stk
->nullable
= d
->tokens
[i
] == BACKREF
;
2492 /* This position is in its own firstpos and lastpos. */
2493 stk
->nfirstpos
= stk
->nlastpos
= 1;
2496 --firstpos
, --lastpos
;
2497 firstpos
->index
= lastpos
->index
= i
;
2498 firstpos
->constraint
= lastpos
->constraint
= NO_CONSTRAINT
;
2503 /* ... balance the above nonsyntactic #ifdef goo... */
2504 fprintf (stderr
, "node %zu:", i
);
2505 prtok (d
->tokens
[i
]);
2506 putc ('\n', stderr
);
2508 stk
[-1].nullable
? " nullable: yes\n" : " nullable: no\n");
2509 fprintf (stderr
, " firstpos:");
2510 for (size_t j
= stk
[-1].nfirstpos
; j
-- > 0;)
2512 fprintf (stderr
, " %zu:", firstpos
[j
].index
);
2513 prtok (d
->tokens
[firstpos
[j
].index
]);
2515 fprintf (stderr
, "\n lastpos:");
2516 for (size_t j
= stk
[-1].nlastpos
; j
-- > 0;)
2518 fprintf (stderr
, " %zu:", lastpos
[j
].index
);
2519 prtok (d
->tokens
[lastpos
[j
].index
]);
2521 putc ('\n', stderr
);
2526 for (size_t i
= 0; i
< d
->tindex
; ++i
)
2527 if (d
->tokens
[i
] < NOTCHAR
|| d
->tokens
[i
] == BACKREF
2528 || d
->tokens
[i
] == ANYCHAR
|| d
->tokens
[i
] == MBCSET
2529 || d
->tokens
[i
] >= CSET
)
2531 fprintf (stderr
, "follows(%zu:", i
);
2532 prtok (d
->tokens
[i
]);
2533 fprintf (stderr
, "):");
2534 for (size_t j
= d
->follows
[i
].nelem
; j
-- > 0;)
2536 fprintf (stderr
, " %zu:", d
->follows
[i
].elems
[j
].index
);
2537 prtok (d
->tokens
[d
->follows
[i
].elems
[j
].index
]);
2539 putc ('\n', stderr
);
2543 /* Get the epsilon closure of the firstpos of the regexp. The result will
2544 be the set of positions of state 0. */
2546 for (size_t i
= 0; i
< stk
[-1].nfirstpos
; ++i
)
2547 insert (firstpos
[i
], &merged
);
2549 /* For each follow set that is the follow set of a real position, replace
2550 it with its epsilon closure. */
2551 epsclosure (&merged
, d
);
2553 /* Context wanted by some position. */
2554 int separate_contexts
= state_separate_contexts (&merged
);
2556 /* Build the initial state. */
2557 if (separate_contexts
& CTX_NEWLINE
)
2558 state_index (d
, &merged
, CTX_NEWLINE
);
2559 d
->initstate_notbol
= d
->min_trcount
2560 = state_index (d
, &merged
, separate_contexts
^ CTX_ANY
);
2561 if (separate_contexts
& CTX_LETTER
)
2562 d
->min_trcount
= state_index (d
, &merged
, CTX_LETTER
);
2568 free (merged
.elems
);
2571 /* Make sure D's state arrays are large enough to hold NEW_STATE. */
2573 realloc_trans_if_necessary (struct dfa
*d
)
2575 state_num oldalloc
= d
->tralloc
;
2576 if (oldalloc
< d
->sindex
)
2578 state_num
**realtrans
= d
->trans
? d
->trans
- 2 : NULL
;
2579 ptrdiff_t newalloc1
= realtrans
? d
->tralloc
+ 2 : 0;
2580 realtrans
= xpalloc (realtrans
, &newalloc1
, d
->sindex
- oldalloc
,
2581 -1, sizeof *realtrans
);
2582 realtrans
[0] = realtrans
[1] = NULL
;
2583 d
->trans
= realtrans
+ 2;
2584 ptrdiff_t newalloc
= d
->tralloc
= newalloc1
- 2;
2585 d
->fails
= xnrealloc (d
->fails
, newalloc
, sizeof *d
->fails
);
2586 d
->success
= xnrealloc (d
->success
, newalloc
, sizeof *d
->success
);
2587 d
->newlines
= xnrealloc (d
->newlines
, newalloc
, sizeof *d
->newlines
);
2588 if (d
->localeinfo
.multibyte
)
2590 realtrans
= d
->mb_trans
? d
->mb_trans
- 2 : NULL
;
2591 realtrans
= xnrealloc (realtrans
, newalloc1
, sizeof *realtrans
);
2593 realtrans
[0] = realtrans
[1] = NULL
;
2594 d
->mb_trans
= realtrans
+ 2;
2596 for (; oldalloc
< newalloc
; oldalloc
++)
2598 d
->trans
[oldalloc
] = NULL
;
2599 d
->fails
[oldalloc
] = NULL
;
2600 if (d
->localeinfo
.multibyte
)
2601 d
->mb_trans
[oldalloc
] = NULL
;
2607 Calculate the transition table for a new state derived from state s
2608 for a compiled dfa d after input character uc, and return the new
2611 Do not worry about all possible input characters; calculate just the group
2612 of positions that match uc. Label it with the set of characters that
2613 every position in the group matches (taking into account, if necessary,
2614 preceding context information of s). Then find the union
2615 of these positions' follows, i.e., the set of positions of the
2616 new state. For each character in the group's label, set the transition
2617 on this character to be to a state corresponding to the set's positions,
2618 and its associated backward context information, if necessary.
2620 When building a searching matcher, include the positions of state
2623 The group is constructed by building an equivalence-class
2624 partition of the positions of s.
2626 For each position, find the set of characters C that it matches. Eliminate
2627 any characters from C that fail on grounds of backward context.
2629 Check whether the group's label L has nonempty
2630 intersection with C. If L - C is nonempty, create a new group labeled
2631 L - C and having the same positions as the current group, and set L to
2632 the intersection of L and C. Insert the position in the group, set
2633 C = C - L, and resume scanning.
2635 If after comparing with every group there are characters remaining in C,
2636 create a new group labeled with the characters of C and insert this
2637 position in that group. */
2640 build_state (state_num s
, struct dfa
*d
, unsigned char uc
)
2642 position_set follows
; /* Union of the follows of the group. */
2643 position_set tmp
; /* Temporary space for merging sets. */
2644 state_num state
; /* New state. */
2645 state_num state_newline
; /* New state on a newline transition. */
2646 state_num state_letter
; /* New state on a letter transition. */
2649 fprintf (stderr
, "build state %td\n", s
);
2652 /* A pointer to the new transition table, and the table itself. */
2653 state_num
**ptrans
= (accepting (s
, d
) ? d
->fails
: d
->trans
) + s
;
2654 state_num
*trans
= *ptrans
;
2658 /* MAX_TRCOUNT is an arbitrary upper limit on the number of
2659 transition tables that can exist at once, other than for
2660 initial states. Often-used transition tables are quickly
2661 rebuilt, whereas rarely-used ones are cleared away. */
2662 if (MAX_TRCOUNT
<= d
->trcount
)
2664 for (state_num i
= d
->min_trcount
; i
< d
->tralloc
; i
++)
2668 d
->trans
[i
] = d
->fails
[i
] = NULL
;
2674 *ptrans
= trans
= xmalloc (NOTCHAR
* sizeof *trans
);
2676 /* Fill transition table with a default value which means that the
2677 transited state has not been calculated yet. */
2678 for (int i
= 0; i
< NOTCHAR
; i
++)
2682 /* Set up the success bits for this state. */
2684 if (accepts_in_context (d
->states
[s
].context
, CTX_NEWLINE
, s
, d
))
2685 d
->success
[s
] |= CTX_NEWLINE
;
2686 if (accepts_in_context (d
->states
[s
].context
, CTX_LETTER
, s
, d
))
2687 d
->success
[s
] |= CTX_LETTER
;
2688 if (accepts_in_context (d
->states
[s
].context
, CTX_NONE
, s
, d
))
2689 d
->success
[s
] |= CTX_NONE
;
2691 /* Positions that match the input char. */
2693 group
.elems
= xnmalloc (d
->nleaves
, sizeof *group
.elems
);
2696 /* The group's label. */
2700 for (size_t i
= 0; i
< d
->states
[s
].elems
.nelem
; ++i
)
2702 charclass matches
; /* Set of matching characters. */
2703 position pos
= d
->states
[s
].elems
.elems
[i
];
2704 bool matched
= false;
2705 if (d
->tokens
[pos
.index
] >= 0 && d
->tokens
[pos
.index
] < NOTCHAR
)
2708 setbit (d
->tokens
[pos
.index
], &matches
);
2709 if (d
->tokens
[pos
.index
] == uc
)
2712 else if (d
->tokens
[pos
.index
] >= CSET
)
2714 matches
= d
->charclasses
[d
->tokens
[pos
.index
] - CSET
];
2715 if (tstbit (uc
, &matches
))
2718 else if (d
->tokens
[pos
.index
] == ANYCHAR
)
2720 matches
= d
->charclasses
[d
->canychar
];
2721 if (tstbit (uc
, &matches
))
2724 /* ANYCHAR must match with a single character, so we must put
2725 it to D->states[s].mbps which contains the positions which
2726 can match with a single character not a byte. If all
2727 positions which has ANYCHAR does not depend on context of
2728 next character, we put the follows instead of it to
2729 D->states[s].mbps to optimize. */
2730 if (succeeds_in_context (pos
.constraint
, d
->states
[s
].context
,
2733 if (d
->states
[s
].mbps
.nelem
== 0)
2734 alloc_position_set (&d
->states
[s
].mbps
,
2735 d
->follows
[pos
.index
].nelem
);
2736 for (size_t j
= 0; j
< d
->follows
[pos
.index
].nelem
; j
++)
2737 insert (d
->follows
[pos
.index
].elems
[j
], &d
->states
[s
].mbps
);
2743 /* Some characters may need to be eliminated from matches because
2744 they fail in the current context. */
2745 if (pos
.constraint
!= NO_CONSTRAINT
)
2747 if (!succeeds_in_context (pos
.constraint
,
2748 d
->states
[s
].context
, CTX_NEWLINE
))
2749 for (size_t j
= 0; j
< CHARCLASS_WORDS
; ++j
)
2750 matches
.w
[j
] &= ~d
->syntax
.newline
.w
[j
];
2751 if (!succeeds_in_context (pos
.constraint
,
2752 d
->states
[s
].context
, CTX_LETTER
))
2753 for (size_t j
= 0; j
< CHARCLASS_WORDS
; ++j
)
2754 matches
.w
[j
] &= ~d
->syntax
.letters
.w
[j
];
2755 if (!succeeds_in_context (pos
.constraint
,
2756 d
->states
[s
].context
, CTX_NONE
))
2757 for (size_t j
= 0; j
< CHARCLASS_WORDS
; ++j
)
2758 matches
.w
[j
] &= d
->syntax
.letters
.w
[j
] | d
->syntax
.newline
.w
[j
];
2760 /* If there are no characters left, there's no point in going on. */
2761 if (emptyset (&matches
))
2764 /* If we have reset the bit that made us declare "matched", reset
2765 that indicator, too. This is required to avoid an infinite loop
2766 with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]' */
2767 if (!tstbit (uc
, &matches
))
2772 fprintf (stderr
, " nextpos %zu:", pos
.index
);
2773 prtok (d
->tokens
[pos
.index
]);
2774 fprintf (stderr
, " of");
2775 for (size_t j
= 0; j
< NOTCHAR
; j
++)
2776 if (tstbit (j
, &matches
))
2777 fprintf (stderr
, " 0x%02zx", j
);
2778 fprintf (stderr
, "\n");
2783 for (size_t k
= 0; k
< CHARCLASS_WORDS
; ++k
)
2784 label
.w
[k
] &= matches
.w
[k
];
2785 group
.elems
[group
.nelem
++] = pos
.index
;
2789 for (size_t k
= 0; k
< CHARCLASS_WORDS
; ++k
)
2790 label
.w
[k
] &= ~matches
.w
[k
];
2794 alloc_position_set (&follows
, d
->nleaves
);
2795 alloc_position_set (&tmp
, d
->nleaves
);
2797 if (group
.nelem
> 0)
2801 /* Find the union of the follows of the positions of the group.
2802 This is a hideously inefficient loop. Fix it someday. */
2803 for (size_t j
= 0; j
< group
.nelem
; ++j
)
2804 for (size_t k
= 0; k
< d
->follows
[group
.elems
[j
]].nelem
; ++k
)
2805 insert (d
->follows
[group
.elems
[j
]].elems
[k
], &follows
);
2807 /* If we are building a searching matcher, throw in the positions
2808 of state 0 as well, if possible. */
2811 /* If a token in follows.elems is not 1st byte of a multibyte
2812 character, or the states of follows must accept the bytes
2813 which are not 1st byte of the multibyte character.
2814 Then, if a state of follows encounters a byte, it must not be
2815 a 1st byte of a multibyte character nor a single byte character.
2816 In this case, do not add state[0].follows to next state, because
2817 state[0] must accept 1st-byte.
2819 For example, suppose <sb a> is a certain single byte character,
2820 <mb A> is a certain multibyte character, and the codepoint of
2821 <sb a> equals the 2nd byte of the codepoint of <mb A>. When
2822 state[0] accepts <sb a>, state[i] transits to state[i+1] by
2823 accepting the 1st byte of <mb A>, and state[i+1] accepts the
2824 2nd byte of <mb A>, if state[i+1] encounters the codepoint of
2825 <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
2826 not add state[0]. */
2828 bool mergeit
= !d
->localeinfo
.multibyte
;
2832 for (size_t j
= 0; mergeit
&& j
< follows
.nelem
; j
++)
2833 mergeit
&= d
->multibyte_prop
[follows
.elems
[j
].index
];
2837 merge (&d
->states
[0].elems
, &follows
, &tmp
);
2838 copy (&tmp
, &follows
);
2842 /* Find out if the new state will want any context information,
2843 by calculating possible contexts that the group can match,
2844 and separate contexts that the new state wants to know. */
2845 int possible_contexts
= charclass_context (d
, &label
);
2846 int separate_contexts
= state_separate_contexts (&follows
);
2848 /* Find the state(s) corresponding to the union of the follows. */
2849 if (possible_contexts
& ~separate_contexts
)
2850 state
= state_index (d
, &follows
, separate_contexts
^ CTX_ANY
);
2853 if (separate_contexts
& possible_contexts
& CTX_NEWLINE
)
2854 state_newline
= state_index (d
, &follows
, CTX_NEWLINE
);
2856 state_newline
= state
;
2857 if (separate_contexts
& possible_contexts
& CTX_LETTER
)
2858 state_letter
= state_index (d
, &follows
, CTX_LETTER
);
2860 state_letter
= state
;
2862 /* Reallocate now, to reallocate any newline transition properly. */
2863 realloc_trans_if_necessary (d
);
2866 /* If we are a searching matcher, the default transition is to a state
2867 containing the positions of state 0, otherwise the default transition
2868 is to fail miserably. */
2869 else if (d
->searchflag
)
2872 state_letter
= d
->min_trcount
- 1;
2873 state
= d
->initstate_notbol
;
2882 /* Set the transitions for each character in the label. */
2883 for (size_t i
= 0; i
< NOTCHAR
; i
++)
2884 if (tstbit (i
, &label
))
2885 switch (d
->syntax
.sbit
[i
])
2888 trans
[i
] = state_newline
;
2891 trans
[i
] = state_letter
;
2899 fprintf (stderr
, "trans table %td", s
);
2900 for (size_t i
= 0; i
< NOTCHAR
; ++i
)
2903 fprintf (stderr
, "\n");
2904 fprintf (stderr
, " %2td", trans
[i
]);
2906 fprintf (stderr
, "\n");
2910 free (follows
.elems
);
2913 /* Keep the newline transition in a special place so we can use it as
2915 if (tstbit (d
->syntax
.eolbyte
, &label
))
2917 d
->newlines
[s
] = trans
[d
->syntax
.eolbyte
];
2918 trans
[d
->syntax
.eolbyte
] = -1;
2924 /* Multibyte character handling sub-routines for dfaexec. */
2926 /* Consume a single byte and transit state from 's' to '*next_state'.
2927 This function is almost same as the state transition routin in dfaexec.
2928 But state transition is done just once, otherwise matching succeed or
2929 reach the end of the buffer. */
2931 transit_state_singlebyte (struct dfa
*d
, state_num s
, unsigned char const **pp
)
2937 else if (d
->fails
[s
])
2941 build_state (s
, d
, **pp
);
2952 build_state (s
, d
, **pp
);
2957 /* Transit state from s, then return new state and update the pointer of
2958 the buffer. This function is for a period operator which can match a
2959 multi-byte character. */
2961 transit_state (struct dfa
*d
, state_num s
, unsigned char const **pp
,
2962 unsigned char const *end
)
2966 int mbclen
= mbs_to_wchar (&wc
, (char const *) *pp
, end
- *pp
, d
);
2968 /* This state has some operators which can match a multibyte character. */
2969 d
->mb_follows
.nelem
= 0;
2971 /* Calculate the state which can be reached from the state 's' by
2972 consuming 'mbclen' single bytes from the buffer. */
2975 for (mbci
= 0; mbci
< mbclen
&& (mbci
== 0 || d
->min_trcount
<= s
); mbci
++)
2976 s
= transit_state_singlebyte (d
, s
, pp
);
2977 *pp
+= mbclen
- mbci
;
2981 /* It is an invalid character, so ANYCHAR is not accepted. */
2985 /* If all positions which have ANYCHAR do not depend on the context
2986 of the next character, calculate the next state with
2987 pre-calculated follows and cache the result. */
2988 if (d
->states
[s1
].mb_trindex
< 0)
2990 if (MAX_TRCOUNT
<= d
->mb_trcount
)
2993 for (s3
= -1; s3
< d
->tralloc
; s3
++)
2995 free (d
->mb_trans
[s3
]);
2996 d
->mb_trans
[s3
] = NULL
;
2999 for (state_num i
= 0; i
< d
->sindex
; i
++)
3000 d
->states
[i
].mb_trindex
= -1;
3003 d
->states
[s1
].mb_trindex
= d
->mb_trcount
++;
3006 if (! d
->mb_trans
[s
])
3008 enum { TRANSPTR_SIZE
= sizeof *d
->mb_trans
[s
] };
3009 enum { TRANSALLOC_SIZE
= MAX_TRCOUNT
* TRANSPTR_SIZE
};
3010 d
->mb_trans
[s
] = xmalloc (TRANSALLOC_SIZE
);
3011 for (int i
= 0; i
< MAX_TRCOUNT
; i
++)
3012 d
->mb_trans
[s
][i
] = -1;
3014 else if (d
->mb_trans
[s
][d
->states
[s1
].mb_trindex
] >= 0)
3015 return d
->mb_trans
[s
][d
->states
[s1
].mb_trindex
];
3018 copy (&d
->states
[s1
].mbps
, &d
->mb_follows
);
3020 merge (&d
->states
[s1
].mbps
, &d
->states
[s
].elems
, &d
->mb_follows
);
3022 int separate_contexts
= state_separate_contexts (&d
->mb_follows
);
3023 state_num s2
= state_index (d
, &d
->mb_follows
, separate_contexts
^ CTX_ANY
);
3024 realloc_trans_if_necessary (d
);
3026 d
->mb_trans
[s
][d
->states
[s1
].mb_trindex
] = s2
;
3031 /* The initial state may encounter a byte which is not a single byte character
3032 nor the first byte of a multibyte character. But it is incorrect for the
3033 initial state to accept such a byte. For example, in Shift JIS the regular
3034 expression "\\" accepts the codepoint 0x5c, but should not accept the second
3035 byte of the codepoint 0x815c. Then the initial state must skip the bytes
3036 that are not a single byte character nor the first byte of a multibyte
3039 Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3040 or exceeds P, and return the advanced MBP. If WCP is non-NULL and
3041 the result is greater than P, set *WCP to the final wide character
3042 processed, or to WEOF if no wide character is processed. Otherwise,
3043 if WCP is non-NULL, *WCP may or may not be updated.
3045 Both P and MBP must be no larger than END. */
3046 static unsigned char const *
3047 skip_remains_mb (struct dfa
*d
, unsigned char const *p
,
3048 unsigned char const *mbp
, char const *end
)
3050 if (d
->syntax
.never_trail
[*p
])
3055 mbp
+= mbs_to_wchar (&wc
, (char const *) mbp
,
3056 end
- (char const *) mbp
, d
);
3061 /* Search through a buffer looking for a match to the struct dfa *D.
3062 Find the first occurrence of a string matching the regexp in the
3063 buffer, and the shortest possible version thereof. Return a pointer to
3064 the first character after the match, or NULL if none is found. BEGIN
3065 points to the beginning of the buffer, and END points to the first byte
3066 after its end. Note however that we store a sentinel byte (usually
3067 newline) in *END, so the actual buffer must be one byte longer.
3068 When ALLOW_NL, newlines may appear in the matching string.
3069 If COUNT is non-NULL, increment *COUNT once for each newline processed.
3070 If MULTIBYTE, the input consists of multibyte characters and/or
3071 encoding-error bytes. Otherwise, it consists of single-byte characters.
3072 Here is the list of features that make this DFA matcher punt:
3073 - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3074 - [^...] in non-simple locale
3075 - [[=foo=]] or [[.foo.]]
3076 - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3077 - back-reference: (.)\1
3078 - word-delimiter in multibyte locale: \<, \>, \b, \B
3079 See using_simple_locale for the definition of "simple locale". */
3081 static inline char *
3082 dfaexec_main (struct dfa
*d
, char const *begin
, char *end
, bool allow_nl
,
3083 size_t *count
, bool multibyte
)
3085 if (MAX_TRCOUNT
<= d
->sindex
)
3087 for (state_num s
= d
->min_trcount
; s
< d
->sindex
; s
++)
3089 free (d
->states
[s
].elems
.elems
);
3090 free (d
->states
[s
].mbps
.elems
);
3092 d
->sindex
= d
->min_trcount
;
3096 for (state_num s
= 0; s
< d
->tralloc
; s
++)
3100 d
->trans
[s
] = d
->fails
[s
] = NULL
;
3105 if (d
->localeinfo
.multibyte
&& d
->mb_trans
)
3107 for (state_num s
= -1; s
< d
->tralloc
; s
++)
3109 free (d
->mb_trans
[s
]);
3110 d
->mb_trans
[s
] = NULL
;
3112 for (state_num s
= 0; s
< d
->min_trcount
; s
++)
3113 d
->states
[s
].mb_trindex
= -1;
3119 realloc_trans_if_necessary (d
);
3121 /* Current state. */
3122 state_num s
= 0, s1
= 0;
3124 /* Current input character. */
3125 unsigned char const *p
= (unsigned char const *) begin
;
3126 unsigned char const *mbp
= p
;
3128 /* Copy of d->trans so it can be optimized into a register. */
3129 state_num
**trans
= d
->trans
;
3130 unsigned char eol
= d
->syntax
.eolbyte
; /* Likewise for eolbyte. */
3131 unsigned char saved_end
= *(unsigned char *) end
;
3136 memset (&d
->mbs
, 0, sizeof d
->mbs
);
3137 if (d
->mb_follows
.alloc
== 0)
3138 alloc_position_set (&d
->mb_follows
, d
->nleaves
);
3145 while ((t
= trans
[s
]) != NULL
)
3147 if (s
< d
->min_trcount
)
3149 if (!multibyte
|| d
->states
[s
].mbps
.nelem
== 0)
3155 p
= mbp
= skip_remains_mb (d
, p
, mbp
, end
);
3162 if (d
->states
[s
].mbps
.nelem
== 0
3163 || d
->localeinfo
.sbctowc
[*p
] != WEOF
|| (char *) p
>= end
)
3165 /* If an input character does not match ANYCHAR, do it
3166 like a single-byte character. */
3171 s
= transit_state (d
, s
, &p
, (unsigned char *) end
);
3184 s1
= tmp
; /* swap */
3187 if (s
< d
->min_trcount
)
3200 s
= build_state (s1
, d
, p
[-1]);
3203 else if ((char *) p
<= end
&& p
[-1] == eol
&& 0 <= d
->newlines
[s1
])
3205 /* The previous character was a newline. Count it, and skip
3206 checking of multibyte character boundary until here. */
3210 s
= (allow_nl
? d
->newlines
[s1
]
3211 : d
->syntax
.sbit
[eol
] == CTX_NEWLINE
? 0
3212 : d
->syntax
.sbit
[eol
] == CTX_LETTER
? d
->min_trcount
- 1
3213 : d
->initstate_notbol
);
3221 else if (d
->fails
[s
])
3223 if ((d
->success
[s
] & d
->syntax
.sbit
[*p
])
3224 || ((char *) p
== end
3225 && accepts_in_context (d
->states
[s
].context
, CTX_NEWLINE
, s
,
3229 if (multibyte
&& s
< d
->min_trcount
)
3230 p
= mbp
= skip_remains_mb (d
, p
, mbp
, end
);
3233 if (!multibyte
|| d
->states
[s
].mbps
.nelem
== 0
3234 || d
->localeinfo
.sbctowc
[*p
] != WEOF
|| (char *) p
>= end
)
3236 /* If a input character does not match ANYCHAR, do it
3237 like a single-byte character. */
3238 s
= d
->fails
[s
][*p
++];
3242 s
= transit_state (d
, s
, &p
, (unsigned char *) end
);
3249 build_state (s
, d
, p
[0]);
3261 /* Specialized versions of dfaexec for multibyte and single-byte cases.
3262 This is for performance, as dfaexec_main is an inline function. */
3265 dfaexec_mb (struct dfa
*d
, char const *begin
, char *end
,
3266 bool allow_nl
, size_t *count
, bool *backref
)
3268 return dfaexec_main (d
, begin
, end
, allow_nl
, count
, true);
3272 dfaexec_sb (struct dfa
*d
, char const *begin
, char *end
,
3273 bool allow_nl
, size_t *count
, bool *backref
)
3275 return dfaexec_main (d
, begin
, end
, allow_nl
, count
, false);
3278 /* Always set *BACKREF and return BEGIN. Use this wrapper for
3279 any regexp that uses a construct not supported by this code. */
3281 dfaexec_noop (struct dfa
*d
, char const *begin
, char *end
,
3282 bool allow_nl
, size_t *count
, bool *backref
)
3285 return (char *) begin
;
3288 /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3289 but faster and set *BACKREF if the DFA code does not support this
3293 dfaexec (struct dfa
*d
, char const *begin
, char *end
,
3294 bool allow_nl
, size_t *count
, bool *backref
)
3296 return d
->dfaexec (d
, begin
, end
, allow_nl
, count
, backref
);
3300 dfasuperset (struct dfa
const *d
)
3306 dfaisfast (struct dfa
const *d
)
3312 free_mbdata (struct dfa
*d
)
3314 free (d
->multibyte_prop
);
3315 free (d
->lex
.brack
.chars
);
3316 free (d
->mb_follows
.elems
);
3321 for (s
= -1; s
< d
->tralloc
; s
++)
3322 free (d
->mb_trans
[s
]);
3323 free (d
->mb_trans
- 2);
3327 /* Return true if every construct in D is supported by this DFA matcher. */
3328 static bool _GL_ATTRIBUTE_PURE
3329 dfa_supported (struct dfa
const *d
)
3331 for (size_t i
= 0; i
< d
->tindex
; i
++)
3333 switch (d
->tokens
[i
])
3339 if (!d
->localeinfo
.multibyte
)
3351 dfaoptimize (struct dfa
*d
)
3353 if (!d
->localeinfo
.using_utf8
)
3356 bool have_backref
= false;
3357 for (size_t i
= 0; i
< d
->tindex
; ++i
)
3359 switch (d
->tokens
[i
])
3365 have_backref
= true;
3368 /* Requires multi-byte algorithm. */
3375 if (!have_backref
&& d
->superset
)
3377 /* The superset DFA is not likely to be much faster, so remove it. */
3378 dfafree (d
->superset
);
3384 d
->localeinfo
.multibyte
= false;
3385 d
->dfaexec
= dfaexec_sb
;
3390 dfassbuild (struct dfa
*d
)
3392 struct dfa
*sup
= dfaalloc ();
3395 sup
->localeinfo
.multibyte
= false;
3396 sup
->dfaexec
= dfaexec_sb
;
3397 sup
->multibyte_prop
= NULL
;
3398 sup
->superset
= NULL
;
3401 sup
->follows
= NULL
;
3405 sup
->success
= NULL
;
3406 sup
->newlines
= NULL
;
3408 sup
->charclasses
= xnmalloc (sup
->calloc
, sizeof *sup
->charclasses
);
3411 memcpy (sup
->charclasses
, d
->charclasses
,
3412 d
->cindex
* sizeof *sup
->charclasses
);
3415 sup
->tokens
= xnmalloc (d
->tindex
, 2 * sizeof *sup
->tokens
);
3416 sup
->talloc
= d
->tindex
* 2;
3418 bool have_achar
= false;
3419 bool have_nchar
= false;
3421 for (size_t i
= j
= 0; i
< d
->tindex
; i
++)
3423 switch (d
->tokens
[i
])
3431 sup
->tokens
[j
++] = CSET
+ charclass_index (sup
, &ccl
);
3432 sup
->tokens
[j
++] = STAR
;
3433 if (d
->tokens
[i
+ 1] == QMARK
|| d
->tokens
[i
+ 1] == STAR
3434 || d
->tokens
[i
+ 1] == PLUS
)
3443 if (d
->localeinfo
.multibyte
)
3445 /* These constraints aren't supported in a multibyte locale.
3446 Ignore them in the superset DFA. */
3447 sup
->tokens
[j
++] = EMPTY
;
3452 sup
->tokens
[j
++] = d
->tokens
[i
];
3453 if ((0 <= d
->tokens
[i
] && d
->tokens
[i
] < NOTCHAR
)
3454 || d
->tokens
[i
] >= CSET
)
3461 if (have_nchar
&& (have_achar
|| d
->localeinfo
.multibyte
))
3470 /* Parse and analyze a single string of the given length. */
3472 dfacomp (char const *s
, size_t len
, struct dfa
*d
, bool searchflag
)
3474 dfaparse (s
, len
, d
);
3477 if (dfa_supported (d
))
3480 dfaanalyze (d
, searchflag
);
3484 d
->dfaexec
= dfaexec_noop
;
3490 dfaanalyze (d
->superset
, searchflag
);
3494 /* Free the storage held by the components of a dfa. */
3496 dfafree (struct dfa
*d
)
3498 free (d
->charclasses
);
3501 if (d
->localeinfo
.multibyte
)
3504 for (size_t i
= 0; i
< d
->sindex
; ++i
)
3506 free (d
->states
[i
].elems
.elems
);
3507 free (d
->states
[i
].mbps
.elems
);
3513 for (size_t i
= 0; i
< d
->tindex
; ++i
)
3514 free (d
->follows
[i
].elems
);
3520 for (size_t i
= 0; i
< d
->tralloc
; ++i
)
3526 free (d
->trans
- 2);
3533 dfafree (d
->superset
);
3536 /* Having found the postfix representation of the regular expression,
3537 try to find a long sequence of characters that must appear in any line
3539 Finding a "longest" sequence is beyond the scope here;
3540 we take an easy way out and hope for the best.
3541 (Take "(ab|a)b"--please.)
3543 We do a bottom-up calculation of sequences of characters that must appear
3544 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3546 sequences that must appear at the left of the match ("left")
3547 sequences that must appear at the right of the match ("right")
3548 lists of sequences that must appear somewhere in the match ("in")
3549 sequences that must constitute the match ("is")
3551 When we get to the root of the tree, we use one of the longest of its
3552 calculated "in" sequences as our answer.
3554 The sequences calculated for the various types of node (in pseudo ANSI c)
3555 are shown below. "p" is the operand of unary operators (and the left-hand
3556 operand of binary operators); "q" is the right-hand operand of binary
3559 "ZERO" means "a zero-length sequence" below.
3561 Type left right is in
3562 ---- ---- ----- -- --
3563 char c # c # c # c # c
3565 ANYCHAR ZERO ZERO ZERO ZERO
3567 MBCSET ZERO ZERO ZERO ZERO
3569 CSET ZERO ZERO ZERO ZERO
3571 STAR ZERO ZERO ZERO ZERO
3573 QMARK ZERO ZERO ZERO ZERO
3575 PLUS p->left p->right ZERO p->in
3577 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3578 p->left : q->right : q->is!=ZERO) ? q->in plus
3579 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3582 OR longest common longest common (do p->is and substrings common
3583 leading trailing to q->is have same p->in and
3584 (sub)sequence (sub)sequence q->in length and content) ?
3585 of p->left of p->right
3586 and q->left and q->right p->is : NULL
3588 If there's anything else we recognize in the tree, all four sequences get set
3589 to zero-length sequences. If there's something we don't recognize in the
3590 tree, we just return a zero-length sequence.
3592 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3595 And ... is it here or someplace that we might ponder "optimizations" such as
3596 egrep 'psi|epsilon' -> egrep 'psi'
3597 egrep 'pepsi|epsilon' -> egrep 'epsi'
3598 (Yes, we now find "epsi" as a "string
3599 that must occur", but we might also
3600 simplify the *entire* r.e. being sought)
3601 grep '[c]' -> grep 'c'
3602 grep '(ab|a)b' -> grep 'ab'
3603 grep 'ab*' -> grep 'a'
3604 grep 'a*b' -> grep 'b'
3606 There are several issues:
3608 Is optimization easy (enough)?
3610 Does optimization actually accomplish anything,
3611 or is the automaton you get from "psi|epsilon" (for example)
3612 the same as the one you get from "psi" (for example)?
3614 Are optimizable r.e.'s likely to be used in real-life situations
3615 (something like 'ab*' is probably unlikely; something like is
3616 'psi|epsilon' is likelier)? */
3619 icatalloc (char *old
, char const *new)
3621 size_t newsize
= strlen (new);
3624 size_t oldsize
= strlen (old
);
3625 char *result
= xrealloc (old
, oldsize
+ newsize
+ 1);
3626 memcpy (result
+ oldsize
, new, newsize
+ 1);
3631 freelist (char **cpp
)
3638 enlist (char **cpp
, char *new, size_t len
)
3640 new = memcpy (xmalloc (len
+ 1), new, len
);
3642 /* Is there already something in the list that's new (or longer)? */
3644 for (i
= 0; cpp
[i
] != NULL
; ++i
)
3645 if (strstr (cpp
[i
], new) != NULL
)
3650 /* Eliminate any obsoleted strings. */
3651 for (size_t j
= 0; cpp
[j
] != NULL
; )
3652 if (strstr (new, cpp
[j
]) == NULL
)
3662 /* Add the new string. */
3663 cpp
= xnrealloc (cpp
, i
+ 2, sizeof *cpp
);
3669 /* Given pointers to two strings, return a pointer to an allocated
3670 list of their distinct common substrings. */
3672 comsubs (char *left
, char const *right
)
3674 char **cpp
= xzalloc (sizeof *cpp
);
3676 for (char *lcp
= left
; *lcp
!= '\0'; lcp
++)
3679 char *rcp
= strchr (right
, *lcp
);
3683 for (i
= 1; lcp
[i
] != '\0' && lcp
[i
] == rcp
[i
]; ++i
)
3687 rcp
= strchr (rcp
+ 1, *lcp
);
3690 cpp
= enlist (cpp
, lcp
, len
);
3696 addlists (char **old
, char **new)
3699 old
= enlist (old
, *new, strlen (*new));
3703 /* Given two lists of substrings, return a new list giving substrings
3706 inboth (char **left
, char **right
)
3708 char **both
= xzalloc (sizeof *both
);
3710 for (size_t lnum
= 0; left
[lnum
] != NULL
; ++lnum
)
3712 for (size_t rnum
= 0; right
[rnum
] != NULL
; ++rnum
)
3714 char **temp
= comsubs (left
[lnum
], right
[rnum
]);
3715 both
= addlists (both
, temp
);
3723 typedef struct must must
;
3737 allocmust (must
*mp
, size_t size
)
3739 must
*new_mp
= xmalloc (sizeof *new_mp
);
3740 new_mp
->in
= xzalloc (sizeof *new_mp
->in
);
3741 new_mp
->left
= xzalloc (size
);
3742 new_mp
->right
= xzalloc (size
);
3743 new_mp
->is
= xzalloc (size
);
3744 new_mp
->begline
= false;
3745 new_mp
->endline
= false;
3751 resetmust (must
*mp
)
3755 mp
->left
[0] = mp
->right
[0] = mp
->is
[0] = '\0';
3756 mp
->begline
= false;
3757 mp
->endline
= false;
3772 dfamust (struct dfa
const *d
)
3775 char const *result
= "";
3777 bool begline
= false;
3778 bool endline
= false;
3779 bool need_begline
= false;
3780 bool need_endline
= false;
3781 bool case_fold_unibyte
= d
->syntax
.case_fold
&& MB_CUR_MAX
== 1;
3783 for (size_t ri
= 0; ri
< d
->tindex
; ++ri
)
3785 token t
= d
->tokens
[ri
];
3789 mp
= allocmust (mp
, 2);
3791 need_begline
= true;
3794 mp
= allocmust (mp
, 2);
3796 need_endline
= true;
3800 assert (!"neither LPAREN nor RPAREN may appear here");
3810 mp
= allocmust (mp
, 2);
3822 must
*lmp
= mp
= mp
->prev
;
3823 size_t j
, ln
, rn
, n
;
3825 /* Guaranteed to be. Unlikely, but ... */
3826 if (streq (lmp
->is
, rmp
->is
))
3828 lmp
->begline
&= rmp
->begline
;
3829 lmp
->endline
&= rmp
->endline
;
3834 lmp
->begline
= false;
3835 lmp
->endline
= false;
3837 /* Left side--easy */
3839 while (lmp
->left
[i
] != '\0' && lmp
->left
[i
] == rmp
->left
[i
])
3841 lmp
->left
[i
] = '\0';
3843 ln
= strlen (lmp
->right
);
3844 rn
= strlen (rmp
->right
);
3848 for (i
= 0; i
< n
; ++i
)
3849 if (lmp
->right
[ln
- i
- 1] != rmp
->right
[rn
- i
- 1])
3851 for (j
= 0; j
< i
; ++j
)
3852 lmp
->right
[j
] = lmp
->right
[(ln
- i
) + j
];
3853 lmp
->right
[j
] = '\0';
3854 new = inboth (lmp
->in
, rmp
->in
);
3868 for (size_t i
= 0; mp
->in
[i
] != NULL
; ++i
)
3869 if (strlen (mp
->in
[i
]) > strlen (result
))
3871 if (streq (result
, mp
->is
))
3873 if ((!need_begline
|| mp
->begline
) && (!need_endline
3876 begline
= mp
->begline
;
3877 endline
= mp
->endline
;
3884 must
*lmp
= mp
= mp
->prev
;
3886 /* In. Everything in left, plus everything in
3887 right, plus concatenation of
3888 left's right and right's left. */
3889 lmp
->in
= addlists (lmp
->in
, rmp
->in
);
3890 if (lmp
->right
[0] != '\0' && rmp
->left
[0] != '\0')
3892 size_t lrlen
= strlen (lmp
->right
);
3893 size_t rllen
= strlen (rmp
->left
);
3894 char *tp
= xmalloc (lrlen
+ rllen
);
3895 memcpy (tp
, lmp
->right
, lrlen
);
3896 memcpy (tp
+ lrlen
, rmp
->left
, rllen
);
3897 lmp
->in
= enlist (lmp
->in
, tp
, lrlen
+ rllen
);
3901 if (lmp
->is
[0] != '\0')
3902 lmp
->left
= icatalloc (lmp
->left
, rmp
->left
);
3904 if (rmp
->is
[0] == '\0')
3905 lmp
->right
[0] = '\0';
3906 lmp
->right
= icatalloc (lmp
->right
, rmp
->right
);
3907 /* Guaranteed to be */
3908 if ((lmp
->is
[0] != '\0' || lmp
->begline
)
3909 && (rmp
->is
[0] != '\0' || rmp
->endline
))
3911 lmp
->is
= icatalloc (lmp
->is
, rmp
->is
);
3912 lmp
->endline
= rmp
->endline
;
3917 lmp
->begline
= false;
3918 lmp
->endline
= false;
3925 /* Not on *my* shift. */
3931 /* If T is a singleton, or if case-folding in a unibyte
3932 locale and T's members all case-fold to the same char,
3933 convert T to one of its members. Otherwise, do
3934 nothing further with T. */
3935 charclass
*ccl
= &d
->charclasses
[t
- CSET
];
3937 for (j
= 0; j
< NOTCHAR
; j
++)
3938 if (tstbit (j
, ccl
))
3940 if (! (j
< NOTCHAR
))
3942 mp
= allocmust (mp
, 2);
3946 while (++j
< NOTCHAR
)
3948 && ! (case_fold_unibyte
3949 && toupper (j
) == toupper (t
)))
3953 mp
= allocmust (mp
, 2);
3959 if (d
->tokens
[ri
+ 1] == CAT
)
3961 for (; rj
< d
->tindex
- 1; rj
+= 2)
3963 if ((rj
!= ri
&& (d
->tokens
[rj
] <= 0
3964 || NOTCHAR
<= d
->tokens
[rj
]))
3965 || d
->tokens
[rj
+ 1] != CAT
)
3969 mp
= allocmust (mp
, ((rj
- ri
) >> 1) + 1);
3970 mp
->is
[0] = mp
->left
[0] = mp
->right
[0]
3971 = case_fold_unibyte
? toupper (t
) : t
;
3974 for (i
= 1; ri
+ 2 < rj
; i
++)
3978 mp
->is
[i
] = mp
->left
[i
] = mp
->right
[i
]
3979 = case_fold_unibyte
? toupper (t
) : t
;
3981 mp
->is
[i
] = mp
->left
[i
] = mp
->right
[i
] = '\0';
3982 mp
->in
= enlist (mp
->in
, mp
->is
, i
);
3988 struct dfamust
*dm
= NULL
;
3991 dm
= xmalloc (sizeof *dm
);
3993 dm
->begline
= begline
;
3994 dm
->endline
= endline
;
3995 dm
->must
= xstrdup (result
);
4000 must
*prev
= mp
->prev
;
4009 dfamustfree (struct dfamust
*dm
)
4018 return xmalloc (sizeof (struct dfa
));
4021 /* Initialize DFA. */
4023 dfasyntax (struct dfa
*dfa
, struct localeinfo
const *linfo
,
4024 reg_syntax_t bits
, int dfaopts
)
4026 memset (dfa
, 0, offsetof (struct dfa
, dfaexec
));
4027 dfa
->dfaexec
= linfo
->multibyte
? dfaexec_mb
: dfaexec_sb
;
4028 dfa
->simple_locale
= using_simple_locale (linfo
->multibyte
);
4029 dfa
->localeinfo
= *linfo
;
4031 dfa
->fast
= !dfa
->localeinfo
.multibyte
;
4034 dfa
->lex
.cur_mb_len
= 1;
4035 dfa
->syntax
.syntax_bits_set
= true;
4036 dfa
->syntax
.case_fold
= (bits
& RE_ICASE
) != 0;
4037 dfa
->syntax
.anchor
= (dfaopts
& DFA_ANCHOR
) != 0;
4038 dfa
->syntax
.eolbyte
= dfaopts
& DFA_EOL_NUL
? '\0' : '\n';
4039 dfa
->syntax
.syntax_bits
= bits
;
4041 for (int i
= CHAR_MIN
; i
<= CHAR_MAX
; ++i
)
4043 unsigned char uc
= i
;
4045 dfa
->syntax
.sbit
[uc
] = char_context (dfa
, uc
);
4046 switch (dfa
->syntax
.sbit
[uc
])
4049 setbit (uc
, &dfa
->syntax
.letters
);
4052 setbit (uc
, &dfa
->syntax
.newline
);
4056 /* POSIX requires that the five bytes in "\n\r./" (including the
4057 terminating NUL) cannot occur inside a multibyte character. */
4058 dfa
->syntax
.never_trail
[uc
] = (dfa
->localeinfo
.using_utf8
4059 ? (uc
& 0xc0) != 0x80
4060 : strchr ("\n\r./", uc
) != NULL
);
4064 /* vim:set shiftwidth=2: */