1 /* dfa.c - deterministic extended regexp routines for GNU
2 Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2019 Free Software
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
18 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
20 /* Written June, 1988 by Mike Haertel
21 Modified July, 1988 by Arthur David Olson to assist BMG speedups */
37 streq (char const *a
, char const *b
)
39 return strcmp (a
, b
) == 0;
45 return '0' <= c
&& c
<= '9';
49 #define _(str) gettext (str)
55 #include "localeinfo.h"
59 # define FALLTHROUGH ((void) 0)
61 # define FALLTHROUGH __attribute__ ((__fallthrough__))
66 # define MIN(a,b) ((a) < (b) ? (a) : (b))
69 /* HPUX defines these as macros in sys/param.h. */
77 /* First integer value that is greater than any character code. */
78 enum { NOTCHAR
= 1 << CHAR_BIT
};
80 /* This represents part of a character class. It must be unsigned and
81 at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
82 typedef unsigned long int charclass_word
;
84 /* CHARCLASS_WORD_BITS is the number of bits used in a charclass word.
85 CHARCLASS_PAIR (LO, HI) is part of a charclass initializer, and
86 represents 64 bits' worth of a charclass, where LO and HI are the
87 low and high-order 32 bits of the 64-bit quantity. */
88 #if ULONG_MAX >> 31 >> 31 < 3
89 enum { CHARCLASS_WORD_BITS
= 32 };
90 # define CHARCLASS_PAIR(lo, hi) lo, hi
92 enum { CHARCLASS_WORD_BITS
= 64 };
93 # define CHARCLASS_PAIR(lo, hi) (((charclass_word) (hi) << 32) + (lo))
96 /* An initializer for a charclass whose 32-bit words are A through H. */
97 #define CHARCLASS_INIT(a, b, c, d, e, f, g, h) \
99 CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d), \
100 CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h) \
103 /* The maximum useful value of a charclass_word; all used bits are 1. */
104 static charclass_word
const CHARCLASS_WORD_MASK
105 = ((charclass_word
) 1 << (CHARCLASS_WORD_BITS
- 1) << 1) - 1;
107 /* Number of words required to hold a bit for every character. */
110 CHARCLASS_WORDS
= (NOTCHAR
+ CHARCLASS_WORD_BITS
- 1) / CHARCLASS_WORD_BITS
113 /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
114 typedef struct { charclass_word w
[CHARCLASS_WORDS
]; } charclass
;
116 /* Convert a possibly-signed character to an unsigned character. This is
117 a bit safer than casting to unsigned char, since it catches some type
118 errors that the cast doesn't. */
125 /* Contexts tell us whether a character is a newline or a word constituent.
126 Word-constituent characters are those that satisfy iswalnum, plus '_'.
127 Each character has a single CTX_* value; bitmasks of CTX_* values denote
128 a particular character class.
130 A state also stores a context value, which is a bitmask of CTX_* values.
131 A state's context represents a set of characters that the state's
132 predecessors must match. For example, a state whose context does not
133 include CTX_LETTER will never have transitions where the previous
134 character is a word constituent. A state whose context is CTX_ANY
135 might have transitions from any character. */
145 /* Sometimes characters can only be matched depending on the surrounding
146 context. Such context decisions depend on what the previous character
147 was, and the value of the current (lookahead) character. Context
148 dependent constraints are encoded as 9-bit integers. Each bit that
149 is set indicates that the constraint succeeds in the corresponding
152 bit 6-8 - valid contexts when next character is CTX_NEWLINE
153 bit 3-5 - valid contexts when next character is CTX_LETTER
154 bit 0-2 - valid contexts when next character is CTX_NONE
156 succeeds_in_context determines whether a given constraint
157 succeeds in a particular context. Prev is a bitmask of possible
158 context values for the previous character, curr is the (single-bit)
159 context value for the lookahead character. */
161 newline_constraint (int constraint
)
163 return (constraint
>> 6) & 7;
166 letter_constraint (int constraint
)
168 return (constraint
>> 3) & 7;
171 other_constraint (int constraint
)
173 return constraint
& 7;
177 succeeds_in_context (int constraint
, int prev
, int curr
)
179 return !! (((curr
& CTX_NONE
? other_constraint (constraint
) : 0) \
180 | (curr
& CTX_LETTER
? letter_constraint (constraint
) : 0) \
181 | (curr
& CTX_NEWLINE
? newline_constraint (constraint
) : 0)) \
185 /* The following describe what a constraint depends on. */
187 prev_newline_dependent (int constraint
)
189 return ((constraint
^ constraint
>> 2) & 0111) != 0;
192 prev_letter_dependent (int constraint
)
194 return ((constraint
^ constraint
>> 1) & 0111) != 0;
197 /* Tokens that match the empty string subject to some constraint actually
198 work by applying that constraint to determine what may follow them,
199 taking into account what has gone before. The following values are
200 the constraints corresponding to the special tokens previously defined. */
203 NO_CONSTRAINT
= 0777,
204 BEGLINE_CONSTRAINT
= 0444,
205 ENDLINE_CONSTRAINT
= 0700,
206 BEGWORD_CONSTRAINT
= 0050,
207 ENDWORD_CONSTRAINT
= 0202,
208 LIMWORD_CONSTRAINT
= 0252,
209 NOTLIMWORD_CONSTRAINT
= 0525
212 /* The regexp is parsed into an array of tokens in postfix form. Some tokens
213 are operators and others are terminal symbols. Most (but not all) of these
214 codes are returned by the lexical analyzer. */
216 typedef ptrdiff_t token
;
217 static ptrdiff_t const TOKEN_MAX
= PTRDIFF_MAX
;
219 /* States are indexed by state_num values. These are normally
220 nonnegative but -1 is used as a special value. */
221 typedef ptrdiff_t state_num
;
223 /* Predefined token values. */
226 END
= -1, /* END is a terminal symbol that matches the
227 end of input; any value of END or less in
228 the parse tree is such a symbol. Accepting
229 states of the DFA are those that would have
230 a transition on END. This is -1, not some
231 more-negative value, to tweak the speed of
232 comparisons to END. */
234 /* Ordinary character values are terminal symbols that match themselves. */
236 /* CSET must come last in the following list of special tokens. Otherwise,
237 the list order matters only for performance. Related special tokens
238 should have nearby values so that code like (t == ANYCHAR || t == MBCSET
239 || CSET <= t) can be done with a single machine-level comparison. */
241 EMPTY
= NOTCHAR
, /* EMPTY is a terminal symbol that matches
244 QMARK
, /* QMARK is an operator of one argument that
245 matches zero or one occurrences of its
248 STAR
, /* STAR is an operator of one argument that
249 matches the Kleene closure (zero or more
250 occurrences) of its argument. */
252 PLUS
, /* PLUS is an operator of one argument that
253 matches the positive closure (one or more
254 occurrences) of its argument. */
256 REPMN
, /* REPMN is a lexical token corresponding
257 to the {m,n} construct. REPMN never
258 appears in the compiled token vector. */
260 CAT
, /* CAT is an operator of two arguments that
261 matches the concatenation of its
262 arguments. CAT is never returned by the
265 OR
, /* OR is an operator of two arguments that
266 matches either of its arguments. */
268 LPAREN
, /* LPAREN never appears in the parse tree,
269 it is only a lexeme. */
271 RPAREN
, /* RPAREN never appears in the parse tree. */
273 WCHAR
, /* Only returned by lex. wctok contains
274 the wide character representation. */
276 ANYCHAR
, /* ANYCHAR is a terminal symbol that matches
277 a valid multibyte (or single byte) character.
278 It is used only if MB_CUR_MAX > 1. */
280 BEG
, /* BEG is an initial symbol that matches the
281 beginning of input. */
283 BEGLINE
, /* BEGLINE is a terminal symbol that matches
284 the empty string at the beginning of a
287 ENDLINE
, /* ENDLINE is a terminal symbol that matches
288 the empty string at the end of a line. */
290 BEGWORD
, /* BEGWORD is a terminal symbol that matches
291 the empty string at the beginning of a
294 ENDWORD
, /* ENDWORD is a terminal symbol that matches
295 the empty string at the end of a word. */
297 LIMWORD
, /* LIMWORD is a terminal symbol that matches
298 the empty string at the beginning or the
301 NOTLIMWORD
, /* NOTLIMWORD is a terminal symbol that
302 matches the empty string not at
303 the beginning or end of a word. */
305 BACKREF
, /* BACKREF is generated by \<digit>
306 or by any other construct that
307 is not completely handled. If the scanner
308 detects a transition on backref, it returns
309 a kind of "semi-success" indicating that
310 the match will have to be verified with
311 a backtracking matcher. */
313 MBCSET
, /* MBCSET is similar to CSET, but for
314 multibyte characters. */
316 CSET
/* CSET and (and any value greater) is a
317 terminal symbol that matches any of a
318 class of characters. */
322 /* States of the recognizer correspond to sets of positions in the parse
323 tree, together with the constraints under which they may be matched.
324 So a position is encoded as an index into the parse tree together with
328 size_t index
; /* Index into the parse array. */
329 unsigned int constraint
; /* Constraint for matching this position. */
332 /* Sets of positions are stored as arrays. */
335 position
*elems
; /* Elements of this position set. */
336 ptrdiff_t nelem
; /* Number of elements in this set. */
337 ptrdiff_t alloc
; /* Number of elements allocated in ELEMS. */
340 /* A state of the dfa consists of a set of positions, some flags,
341 and the token value of the lowest-numbered position of the state that
342 contains an END token. */
345 size_t hash
; /* Hash of the positions of this state. */
346 position_set elems
; /* Positions this state could match. */
347 unsigned char context
; /* Context from previous state. */
348 unsigned short constraint
; /* Constraint for this state to accept. */
349 token first_end
; /* Token value of the first END in elems. */
350 position_set mbps
; /* Positions which can match multibyte
351 characters or the follows, e.g., period.
352 Used only if MB_CUR_MAX > 1. */
353 state_num mb_trindex
; /* Index of this state in MB_TRANS, or
354 negative if the state does not have
358 /* Maximum for any transition table count. This should be at least 3,
359 for the initial state setup. */
360 enum { MAX_TRCOUNT
= 1024 };
362 /* A bracket operator.
363 e.g., [a-c], [[:alpha:]], etc. */
364 struct mb_char_classes
368 wchar_t *chars
; /* Normal characters. */
370 ptrdiff_t nchars_alloc
;
375 /* Syntax bits controlling the behavior of the lexical analyzer. */
376 reg_syntax_t syntax_bits
;
377 bool syntax_bits_set
;
379 /* Flag for case-folding letters into sets. */
382 /* True if ^ and $ match only the start and end of data, and do not match
383 end-of-line within data. */
386 /* End-of-line byte in data. */
387 unsigned char eolbyte
;
389 /* Cache of char-context values. */
392 /* If never_trail[B], the byte B cannot be a non-initial byte in a
393 multibyte character. */
394 bool never_trail
[NOTCHAR
];
396 /* Set of characters considered letters. */
399 /* Set of characters that are newline. */
403 /* Lexical analyzer. All the dross that deals with the obnoxious
404 GNU Regex syntax bits is located here. The poor, suffering
405 reader is referred to the GNU Regex documentation for the
406 meaning of the @#%!@#%^!@ syntax bits. */
409 char const *ptr
; /* Pointer to next input character. */
410 size_t left
; /* Number of characters remaining. */
411 token lasttok
; /* Previous token returned; initially END. */
412 size_t parens
; /* Count of outstanding left parens. */
413 int minrep
, maxrep
; /* Repeat counts for {m,n}. */
415 /* Wide character representation of the current multibyte character,
416 or WEOF if there was an encoding error. Used only if
420 /* Length of the multibyte representation of wctok. */
423 /* The most recently analyzed multibyte bracket expression. */
424 struct mb_char_classes brack
;
426 /* We're separated from beginning or (, | only by zero-width characters. */
430 /* Recursive descent parser for regular expressions. */
434 token tok
; /* Lookahead token. */
435 size_t depth
; /* Current depth of a hypothetical stack
436 holding deferred productions. This is
437 used to determine the depth that will be
438 required of the real stack later on in
442 /* A compiled regular expression. */
445 /* Syntax configuration */
446 struct regex_syntax syntax
;
448 /* Fields filled by the scanner. */
449 charclass
*charclasses
; /* Array of character sets for CSET tokens. */
450 ptrdiff_t cindex
; /* Index for adding new charclasses. */
451 ptrdiff_t calloc
; /* Number of charclasses allocated. */
452 size_t canychar
; /* Index of anychar class, or (size_t) -1. */
455 struct lexer_state lex
;
458 struct parser_state parse
;
460 /* Fields filled by the parser. */
461 token
*tokens
; /* Postfix parse array. */
462 size_t tindex
; /* Index for adding new tokens. */
463 size_t talloc
; /* Number of tokens currently allocated. */
464 size_t depth
; /* Depth required of an evaluation stack
465 used for depth-first traversal of the
467 size_t nleaves
; /* Number of leaves on the parse tree. */
468 size_t nregexps
; /* Count of parallel regexps being built
470 bool fast
; /* The DFA is fast. */
471 token utf8_anychar_classes
[5]; /* To lower ANYCHAR in UTF-8 locales. */
472 mbstate_t mbs
; /* Multibyte conversion state. */
474 /* The following are valid only if MB_CUR_MAX > 1. */
476 /* The value of multibyte_prop[i] is defined by following rule.
477 if tokens[i] < NOTCHAR
478 bit 0 : tokens[i] is the first byte of a character, including
479 single-byte characters.
480 bit 1 : tokens[i] is the last byte of a character, including
481 single-byte characters.
485 = 'single_byte_a', 'multi_byte_A', single_byte_b'
486 = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
490 char *multibyte_prop
;
492 /* Fields filled by the superset. */
493 struct dfa
*superset
; /* Hint of the dfa. */
495 /* Fields filled by the state builder. */
496 dfa_state
*states
; /* States of the dfa. */
497 state_num sindex
; /* Index for adding new states. */
498 ptrdiff_t salloc
; /* Number of states currently allocated. */
500 /* Fields filled by the parse tree->NFA conversion. */
501 position_set
*follows
; /* Array of follow sets, indexed by position
502 index. The follow of a position is the set
503 of positions containing characters that
504 could conceivably follow a character
505 matching the given position in a string
506 matching the regexp. Allocated to the
507 maximum possible position index. */
508 bool searchflag
; /* We are supposed to build a searching
509 as opposed to an exact matcher. A searching
510 matcher finds the first and shortest string
511 matching a regexp anywhere in the buffer,
512 whereas an exact matcher finds the longest
513 string matching, but anchored to the
514 beginning of the buffer. */
516 /* Fields filled by dfaanalyze. */
517 int *constraints
; /* Array of union of accepting constraints
518 in the follow of a position. */
519 int *separates
; /* Array of contexts on follow of a
522 /* Fields filled by dfaexec. */
523 state_num tralloc
; /* Number of transition tables that have
524 slots so far, not counting trans[-1] and
526 int trcount
; /* Number of transition tables that have
527 been built, other than for initial
529 int min_trcount
; /* Number of initial states. Equivalently,
530 the minimum state number for which trcount
531 counts transitions. */
532 state_num
**trans
; /* Transition tables for states that can
533 never accept. If the transitions for a
534 state have not yet been computed, or the
535 state could possibly accept, its entry in
536 this table is NULL. This points to two
537 past the start of the allocated array,
538 and trans[-1] and trans[-2] are always
540 state_num
**fails
; /* Transition tables after failing to accept
541 on a state that potentially could do so.
542 If trans[i] is non-null, fails[i] must
544 char *success
; /* Table of acceptance conditions used in
545 dfaexec and computed in build_state. */
546 state_num
*newlines
; /* Transitions on newlines. The entry for a
547 newline in any transition table is always
548 -1 so we can count lines without wasting
549 too many cycles. The transition for a
550 newline is stored separately and handled
551 as a special case. Newline is also used
552 as a sentinel at the end of the buffer. */
553 state_num initstate_notbol
; /* Initial state for CTX_LETTER and CTX_NONE
554 context in multibyte locales, in which we
555 do not distinguish between their contexts,
556 as not supported word. */
557 position_set mb_follows
; /* Follow set added by ANYCHAR on demand. */
558 state_num
**mb_trans
; /* Transition tables for states with
560 state_num mb_trcount
; /* Number of transition tables for states with
561 ANYCHAR that have actually been built. */
563 /* Information derived from the locale. This is at the end so that
564 a quick memset need not clear it specially. */
566 /* dfaexec implementation. */
567 char *(*dfaexec
) (struct dfa
*, char const *, char *,
568 bool, size_t *, bool *);
570 /* The locale is simple, like the C locale. These locales can be
571 processed more efficiently, as they are single-byte, their native
572 character set is in collating-sequence order, and they do not
573 have multi-character collating elements. */
576 /* Other cached information derived from the locale. */
577 struct localeinfo localeinfo
;
580 /* User access to dfa internals. */
582 /* S could possibly be an accepting state of R. */
584 accepting (state_num s
, struct dfa
const *r
)
586 return r
->states
[s
].constraint
!= 0;
589 /* STATE accepts in the specified context. */
591 accepts_in_context (int prev
, int curr
, state_num state
, struct dfa
const *dfa
)
593 return succeeds_in_context (dfa
->states
[state
].constraint
, prev
, curr
);
596 static void regexp (struct dfa
*dfa
);
598 /* Store into *PWC the result of converting the leading bytes of the
599 multibyte buffer S of length N bytes, using D->localeinfo.sbctowc
600 and updating the conversion state in *D. On conversion error,
601 convert just a single byte, to WEOF. Return the number of bytes
604 This differs from mbrtowc (PWC, S, N, &D->mbs) as follows:
606 * PWC points to wint_t, not to wchar_t.
607 * The last arg is a dfa *D instead of merely a multibyte conversion
609 * N must be at least 1.
610 * S[N - 1] must be a sentinel byte.
611 * Shift encodings are not supported.
612 * The return value is always in the range 1..N.
613 * D->mbs is always valid afterwards.
614 * *PWC is always set to something. */
616 mbs_to_wchar (wint_t *pwc
, char const *s
, size_t n
, struct dfa
*d
)
618 unsigned char uc
= s
[0];
619 wint_t wc
= d
->localeinfo
.sbctowc
[uc
];
624 size_t nbytes
= mbrtowc (&wch
, s
, n
, &d
->mbs
);
625 if (0 < nbytes
&& nbytes
< (size_t) -2)
630 memset (&d
->mbs
, 0, sizeof d
->mbs
);
643 fprintf (stderr
, "END");
644 else if (0 <= t
&& t
< NOTCHAR
)
647 fprintf (stderr
, "0x%02x", ch
);
712 fprintf (stderr
, "%s", s
);
717 /* Stuff pertaining to charclasses. */
720 tstbit (unsigned int b
, charclass
const *c
)
722 return c
->w
[b
/ CHARCLASS_WORD_BITS
] >> b
% CHARCLASS_WORD_BITS
& 1;
726 setbit (unsigned int b
, charclass
*c
)
728 charclass_word one
= 1;
729 c
->w
[b
/ CHARCLASS_WORD_BITS
] |= one
<< b
% CHARCLASS_WORD_BITS
;
733 clrbit (unsigned int b
, charclass
*c
)
735 charclass_word one
= 1;
736 c
->w
[b
/ CHARCLASS_WORD_BITS
] &= ~(one
<< b
% CHARCLASS_WORD_BITS
);
740 zeroset (charclass
*s
)
742 memset (s
, 0, sizeof *s
);
746 fillset (charclass
*s
)
748 for (int i
= 0; i
< CHARCLASS_WORDS
; i
++)
749 s
->w
[i
] = CHARCLASS_WORD_MASK
;
753 notset (charclass
*s
)
755 for (int i
= 0; i
< CHARCLASS_WORDS
; ++i
)
756 s
->w
[i
] = CHARCLASS_WORD_MASK
& ~s
->w
[i
];
760 equal (charclass
const *s1
, charclass
const *s2
)
762 charclass_word w
= 0;
763 for (int i
= 0; i
< CHARCLASS_WORDS
; i
++)
764 w
|= s1
->w
[i
] ^ s2
->w
[i
];
769 emptyset (charclass
const *s
)
771 charclass_word w
= 0;
772 for (int i
= 0; i
< CHARCLASS_WORDS
; i
++)
777 /* Grow PA, which points to an array of *NITEMS items, and return the
778 location of the reallocated array, updating *NITEMS to reflect its
779 new size. The new array will contain at least NITEMS_INCR_MIN more
780 items, but will not contain more than NITEMS_MAX items total.
781 ITEM_SIZE is the size of each item, in bytes.
783 ITEM_SIZE and NITEMS_INCR_MIN must be positive. *NITEMS must be
784 nonnegative. If NITEMS_MAX is -1, it is treated as if it were
787 If PA is null, then allocate a new array instead of reallocating
790 Thus, to grow an array A without saving its old contents, do
791 { free (A); A = xpalloc (NULL, &AITEMS, ...); }. */
794 xpalloc (void *pa
, ptrdiff_t *nitems
, ptrdiff_t nitems_incr_min
,
795 ptrdiff_t nitems_max
, ptrdiff_t item_size
)
797 ptrdiff_t n0
= *nitems
;
799 /* The approximate size to use for initial small allocation
800 requests. This is the largest "small" request for the GNU C
802 enum { DEFAULT_MXFAST
= 64 * sizeof (size_t) / 4 };
804 /* If the array is tiny, grow it to about (but no greater than)
805 DEFAULT_MXFAST bytes. Otherwise, grow it by about 50%.
806 Adjust the growth according to three constraints: NITEMS_INCR_MIN,
807 NITEMS_MAX, and what the C language can represent safely. */
810 if (INT_ADD_WRAPV (n0
, n0
>> 1, &n
))
812 if (0 <= nitems_max
&& nitems_max
< n
)
815 ptrdiff_t adjusted_nbytes
816 = ((INT_MULTIPLY_WRAPV (n
, item_size
, &nbytes
) || SIZE_MAX
< nbytes
)
817 ? MIN (PTRDIFF_MAX
, SIZE_MAX
)
818 : nbytes
< DEFAULT_MXFAST
? DEFAULT_MXFAST
: 0);
821 n
= adjusted_nbytes
/ item_size
;
822 nbytes
= adjusted_nbytes
- adjusted_nbytes
% item_size
;
827 if (n
- n0
< nitems_incr_min
828 && (INT_ADD_WRAPV (n0
, nitems_incr_min
, &n
)
829 || (0 <= nitems_max
&& nitems_max
< n
)
830 || INT_MULTIPLY_WRAPV (n
, item_size
, &nbytes
)))
832 pa
= xrealloc (pa
, nbytes
);
837 /* Ensure that the array addressed by PA holds at least I + 1 items.
838 Either return PA, or reallocate the array and return its new address.
839 Although PA may be null, the returned value is never null.
841 The array holds *NITEMS items, where 0 <= I <= *NITEMS; *NITEMS
842 is updated on reallocation. If PA is null, *NITEMS must be zero.
843 Do not allocate more than NITEMS_MAX items total; -1 means no limit.
844 ITEM_SIZE is the size of one item; it must be positive.
845 Avoid O(N**2) behavior on arrays growing linearly. */
847 maybe_realloc (void *pa
, ptrdiff_t i
, ptrdiff_t *nitems
,
848 ptrdiff_t nitems_max
, ptrdiff_t item_size
)
852 return xpalloc (pa
, nitems
, 1, nitems_max
, item_size
);
855 /* In DFA D, find the index of charclass S, or allocate a new one. */
857 charclass_index (struct dfa
*d
, charclass
*s
)
861 for (i
= 0; i
< d
->cindex
; ++i
)
862 if (equal (s
, &d
->charclasses
[i
]))
864 d
->charclasses
= maybe_realloc (d
->charclasses
, d
->cindex
, &d
->calloc
,
865 TOKEN_MAX
- CSET
, sizeof *d
->charclasses
);
867 d
->charclasses
[i
] = *s
;
872 unibyte_word_constituent (struct dfa
const *dfa
, unsigned char c
)
874 return dfa
->localeinfo
.sbctowc
[c
] != WEOF
&& (isalnum (c
) || (c
) == '_');
878 char_context (struct dfa
const *dfa
, unsigned char c
)
880 if (c
== dfa
->syntax
.eolbyte
&& !dfa
->syntax
.anchor
)
882 if (unibyte_word_constituent (dfa
, c
))
887 /* Set a bit in the charclass for the given wchar_t. Do nothing if WC
888 is represented by a multi-byte sequence. Even for MB_CUR_MAX == 1,
889 this may happen when folding case in weird Turkish locales where
890 dotless i/dotted I are not included in the chosen character set.
891 Return whether a bit was set in the charclass. */
893 setbit_wc (wint_t wc
, charclass
*c
)
903 /* Set a bit for B and its case variants in the charclass C.
904 MB_CUR_MAX must be 1. */
906 setbit_case_fold_c (int b
, charclass
*c
)
908 int ub
= toupper (b
);
909 for (int i
= 0; i
< NOTCHAR
; i
++)
910 if (toupper (i
) == ub
)
914 /* Return true if the locale compatible with the C locale. */
917 using_simple_locale (bool multibyte
)
919 /* The native character set is known to be compatible with
920 the C locale. The following test isn't perfect, but it's good
921 enough in practice, as only ASCII and EBCDIC are in common use
922 and this test correctly accepts ASCII and rejects EBCDIC. */
923 enum { native_c_charset
=
924 ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
925 && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
926 && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
927 && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
928 && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
929 && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
930 && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
931 && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
932 && '}' == 125 && '~' == 126)
935 if (!native_c_charset
|| multibyte
)
939 /* Treat C and POSIX locales as being compatible. Also, treat
940 errors as compatible, as these are invariably from stubs. */
941 char const *loc
= setlocale (LC_ALL
, NULL
);
942 return !loc
|| streq (loc
, "C") || streq (loc
, "POSIX");
946 /* Fetch the next lexical input character from the pattern. There
947 must at least one byte of pattern input. Set DFA->lex.wctok to the
948 value of the character or to WEOF depending on whether the input is
949 a valid multibyte character (possibly of length 1). Then return
950 the next input byte value, except return EOF if the input is a
951 multibyte character of length greater than 1. */
953 fetch_wc (struct dfa
*dfa
)
955 size_t nbytes
= mbs_to_wchar (&dfa
->lex
.wctok
, dfa
->lex
.ptr
, dfa
->lex
.left
,
957 dfa
->lex
.cur_mb_len
= nbytes
;
958 int c
= nbytes
== 1 ? to_uchar (dfa
->lex
.ptr
[0]) : EOF
;
959 dfa
->lex
.ptr
+= nbytes
;
960 dfa
->lex
.left
-= nbytes
;
964 /* If there is no more input, report an error about unbalanced brackets.
965 Otherwise, behave as with fetch_wc (DFA). */
967 bracket_fetch_wc (struct dfa
*dfa
)
970 dfaerror (_("unbalanced ["));
971 return fetch_wc (dfa
);
974 typedef int predicate (int);
976 /* The following list maps the names of the Posix named character classes
977 to predicate functions that determine whether a given character is in
978 the class. The leading [ has already been eaten by the lexical
984 bool single_byte_only
;
987 static const struct dfa_ctype prednames
[] = {
988 {"alpha", isalpha
, false},
989 {"upper", isupper
, false},
990 {"lower", islower
, false},
991 {"digit", isdigit
, true},
992 {"xdigit", isxdigit
, false},
993 {"space", isspace
, false},
994 {"punct", ispunct
, false},
995 {"alnum", isalnum
, false},
996 {"print", isprint
, false},
997 {"graph", isgraph
, false},
998 {"cntrl", iscntrl
, false},
999 {"blank", isblank
, false},
1003 static const struct dfa_ctype
*_GL_ATTRIBUTE_PURE
1004 find_pred (const char *str
)
1006 for (unsigned int i
= 0; prednames
[i
].name
; ++i
)
1007 if (streq (str
, prednames
[i
].name
))
1008 return &prednames
[i
];
1012 /* Parse a bracket expression, which possibly includes multibyte
1015 parse_bracket_exp (struct dfa
*dfa
)
1017 /* This is a bracket expression that dfaexec is known to
1018 process correctly. */
1019 bool known_bracket_exp
= true;
1021 /* Used to warn about [:space:].
1022 Bit 0 = first character is a colon.
1023 Bit 1 = last character is a colon.
1024 Bit 2 = includes any other character but a colon.
1025 Bit 3 = includes ranges, char/equiv classes or collation elements. */
1026 int colon_warning_state
;
1028 dfa
->lex
.brack
.nchars
= 0;
1031 int c
= bracket_fetch_wc (dfa
);
1032 bool invert
= c
== '^';
1035 c
= bracket_fetch_wc (dfa
);
1036 known_bracket_exp
= dfa
->simple_locale
;
1038 wint_t wc
= dfa
->lex
.wctok
;
1041 colon_warning_state
= (c
== ':');
1044 c1
= NOTCHAR
; /* Mark c1 as not initialized. */
1045 colon_warning_state
&= ~2;
1047 /* Note that if we're looking at some other [:...:] construct,
1048 we just treat it as a bunch of ordinary characters. We can do
1049 this because we assume regex has checked for syntax errors before
1050 dfa is ever called. */
1053 c1
= bracket_fetch_wc (dfa
);
1054 wc1
= dfa
->lex
.wctok
;
1056 if ((c1
== ':' && (dfa
->syntax
.syntax_bits
& RE_CHAR_CLASSES
))
1057 || c1
== '.' || c1
== '=')
1059 enum { MAX_BRACKET_STRING_LEN
= 32 };
1060 char str
[MAX_BRACKET_STRING_LEN
+ 1];
1064 c
= bracket_fetch_wc (dfa
);
1065 if (dfa
->lex
.left
== 0
1066 || (c
== c1
&& dfa
->lex
.ptr
[0] == ']'))
1068 if (len
< MAX_BRACKET_STRING_LEN
)
1071 /* This is in any case an invalid class name. */
1076 /* Fetch bracket. */
1077 c
= bracket_fetch_wc (dfa
);
1078 wc
= dfa
->lex
.wctok
;
1080 /* Build character class. POSIX allows character
1081 classes to match multicharacter collating elements,
1082 but the regex code does not support that, so do not
1083 worry about that possibility. */
1086 = (dfa
->syntax
.case_fold
&& (streq (str
, "upper")
1087 || streq (str
, "lower"))
1089 const struct dfa_ctype
*pred
= find_pred (class);
1091 dfaerror (_("invalid character class"));
1093 if (dfa
->localeinfo
.multibyte
&& !pred
->single_byte_only
)
1094 known_bracket_exp
= false;
1096 for (int c2
= 0; c2
< NOTCHAR
; ++c2
)
1097 if (pred
->func (c2
))
1101 known_bracket_exp
= false;
1103 colon_warning_state
|= 8;
1105 /* Fetch new lookahead character. */
1106 c1
= bracket_fetch_wc (dfa
);
1107 wc1
= dfa
->lex
.wctok
;
1111 /* We treat '[' as a normal character here. c/c1/wc/wc1
1112 are already set up. */
1116 && (dfa
->syntax
.syntax_bits
& RE_BACKSLASH_ESCAPE_IN_LISTS
))
1118 c
= bracket_fetch_wc (dfa
);
1119 wc
= dfa
->lex
.wctok
;
1124 c1
= bracket_fetch_wc (dfa
);
1125 wc1
= dfa
->lex
.wctok
;
1129 /* build range characters. */
1131 int c2
= bracket_fetch_wc (dfa
);
1132 wint_t wc2
= dfa
->lex
.wctok
;
1134 /* A bracket expression like [a-[.aa.]] matches an unknown set.
1135 Treat it like [-a[.aa.]] while parsing it, and
1136 remember that the set is unknown. */
1137 if (c2
== '[' && dfa
->lex
.ptr
[0] == '.')
1139 known_bracket_exp
= false;
1145 /* In the case [x-], the - is an ordinary hyphen,
1146 which is left in c1, the lookahead character. */
1147 dfa
->lex
.ptr
-= dfa
->lex
.cur_mb_len
;
1148 dfa
->lex
.left
+= dfa
->lex
.cur_mb_len
;
1152 if (c2
== '\\' && (dfa
->syntax
.syntax_bits
1153 & RE_BACKSLASH_ESCAPE_IN_LISTS
))
1155 c2
= bracket_fetch_wc (dfa
);
1156 wc2
= dfa
->lex
.wctok
;
1159 colon_warning_state
|= 8;
1160 c1
= bracket_fetch_wc (dfa
);
1161 wc1
= dfa
->lex
.wctok
;
1163 /* Treat [x-y] as a range if x != y. */
1164 if (wc
!= wc2
|| wc
== WEOF
)
1166 if (dfa
->simple_locale
1167 || (isasciidigit (c
) & isasciidigit (c2
)))
1169 for (int ci
= c
; ci
<= c2
; ci
++)
1170 if (dfa
->syntax
.case_fold
&& isalpha (ci
))
1171 setbit_case_fold_c (ci
, &ccl
);
1176 known_bracket_exp
= false;
1183 colon_warning_state
|= (c
== ':') ? 2 : 4;
1185 if (!dfa
->localeinfo
.multibyte
)
1187 if (dfa
->syntax
.case_fold
&& isalpha (c
))
1188 setbit_case_fold_c (c
, &ccl
);
1195 known_bracket_exp
= false;
1198 wchar_t folded
[CASE_FOLDED_BUFSIZE
+ 1];
1199 unsigned int n
= (dfa
->syntax
.case_fold
1200 ? case_folded_counterparts (wc
, folded
+ 1) + 1
1203 for (unsigned int i
= 0; i
< n
; i
++)
1204 if (!setbit_wc (folded
[i
], &ccl
))
1206 dfa
->lex
.brack
.chars
1207 = maybe_realloc (dfa
->lex
.brack
.chars
, dfa
->lex
.brack
.nchars
,
1208 &dfa
->lex
.brack
.nchars_alloc
, -1,
1209 sizeof *dfa
->lex
.brack
.chars
);
1210 dfa
->lex
.brack
.chars
[dfa
->lex
.brack
.nchars
++] = folded
[i
];
1214 while ((wc
= wc1
, (c
= c1
) != ']'));
1216 if (colon_warning_state
== 7)
1217 dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
1219 if (! known_bracket_exp
)
1222 if (dfa
->localeinfo
.multibyte
&& (invert
|| dfa
->lex
.brack
.nchars
!= 0))
1224 dfa
->lex
.brack
.invert
= invert
;
1225 dfa
->lex
.brack
.cset
= emptyset (&ccl
) ? -1 : charclass_index (dfa
, &ccl
);
1232 if (dfa
->syntax
.syntax_bits
& RE_HAT_LISTS_NOT_NEWLINE
)
1233 clrbit ('\n', &ccl
);
1236 return CSET
+ charclass_index (dfa
, &ccl
);
1246 push_lex_state (struct dfa
*dfa
, struct lexptr
*ls
, char const *s
)
1248 ls
->ptr
= dfa
->lex
.ptr
;
1249 ls
->left
= dfa
->lex
.left
;
1251 dfa
->lex
.left
= strlen (s
);
1255 pop_lex_state (struct dfa
*dfa
, struct lexptr
const *ls
)
1257 dfa
->lex
.ptr
= ls
->ptr
;
1258 dfa
->lex
.left
= ls
->left
;
1262 lex (struct dfa
*dfa
)
1264 bool backslash
= false;
1266 /* Basic plan: We fetch a character. If it's a backslash,
1267 we set the backslash flag and go through the loop again.
1268 On the plus side, this avoids having a duplicate of the
1269 main switch inside the backslash case. On the minus side,
1270 it means that just about every case begins with
1271 "if (backslash) ...". */
1272 for (int i
= 0; i
< 2; ++i
)
1274 if (! dfa
->lex
.left
)
1275 return dfa
->lex
.lasttok
= END
;
1276 int c
= fetch_wc (dfa
);
1283 if (dfa
->lex
.left
== 0)
1284 dfaerror (_("unfinished \\ escape"));
1291 if (dfa
->syntax
.syntax_bits
& RE_CONTEXT_INDEP_ANCHORS
1292 || dfa
->lex
.lasttok
== END
|| dfa
->lex
.lasttok
== LPAREN
1293 || dfa
->lex
.lasttok
== OR
)
1294 return dfa
->lex
.lasttok
= BEGLINE
;
1300 if (dfa
->syntax
.syntax_bits
& RE_CONTEXT_INDEP_ANCHORS
1301 || dfa
->lex
.left
== 0
1303 > !(dfa
->syntax
.syntax_bits
& RE_NO_BK_PARENS
))
1304 && (dfa
->lex
.ptr
[!(dfa
->syntax
.syntax_bits
& RE_NO_BK_PARENS
)
1305 & (dfa
->lex
.ptr
[0] == '\\')]
1308 > !(dfa
->syntax
.syntax_bits
& RE_NO_BK_VBAR
))
1309 && (dfa
->lex
.ptr
[!(dfa
->syntax
.syntax_bits
& RE_NO_BK_VBAR
)
1310 & (dfa
->lex
.ptr
[0] == '\\')]
1312 || ((dfa
->syntax
.syntax_bits
& RE_NEWLINE_ALT
)
1313 && dfa
->lex
.left
> 0 && dfa
->lex
.ptr
[0] == '\n'))
1314 return dfa
->lex
.lasttok
= ENDLINE
;
1326 if (backslash
&& !(dfa
->syntax
.syntax_bits
& RE_NO_BK_REFS
))
1328 dfa
->lex
.laststart
= false;
1329 return dfa
->lex
.lasttok
= BACKREF
;
1334 if (backslash
&& !(dfa
->syntax
.syntax_bits
& RE_NO_GNU_OPS
))
1336 /* FIXME: should be beginning of string */
1337 return dfa
->lex
.lasttok
= BEGLINE
;
1342 if (backslash
&& !(dfa
->syntax
.syntax_bits
& RE_NO_GNU_OPS
))
1344 /* FIXME: should be end of string */
1345 return dfa
->lex
.lasttok
= ENDLINE
;
1350 if (backslash
&& !(dfa
->syntax
.syntax_bits
& RE_NO_GNU_OPS
))
1351 return dfa
->lex
.lasttok
= BEGWORD
;
1355 if (backslash
&& !(dfa
->syntax
.syntax_bits
& RE_NO_GNU_OPS
))
1356 return dfa
->lex
.lasttok
= ENDWORD
;
1360 if (backslash
&& !(dfa
->syntax
.syntax_bits
& RE_NO_GNU_OPS
))
1361 return dfa
->lex
.lasttok
= LIMWORD
;
1365 if (backslash
&& !(dfa
->syntax
.syntax_bits
& RE_NO_GNU_OPS
))
1366 return dfa
->lex
.lasttok
= NOTLIMWORD
;
1370 if (dfa
->syntax
.syntax_bits
& RE_LIMITED_OPS
)
1372 if (backslash
!= ((dfa
->syntax
.syntax_bits
& RE_BK_PLUS_QM
) != 0))
1374 if (!(dfa
->syntax
.syntax_bits
& RE_CONTEXT_INDEP_OPS
)
1375 && dfa
->lex
.laststart
)
1377 return dfa
->lex
.lasttok
= QMARK
;
1382 if (!(dfa
->syntax
.syntax_bits
& RE_CONTEXT_INDEP_OPS
)
1383 && dfa
->lex
.laststart
)
1385 return dfa
->lex
.lasttok
= STAR
;
1388 if (dfa
->syntax
.syntax_bits
& RE_LIMITED_OPS
)
1390 if (backslash
!= ((dfa
->syntax
.syntax_bits
& RE_BK_PLUS_QM
) != 0))
1392 if (!(dfa
->syntax
.syntax_bits
& RE_CONTEXT_INDEP_OPS
)
1393 && dfa
->lex
.laststart
)
1395 return dfa
->lex
.lasttok
= PLUS
;
1398 if (!(dfa
->syntax
.syntax_bits
& RE_INTERVALS
))
1400 if (backslash
!= ((dfa
->syntax
.syntax_bits
& RE_NO_BK_BRACES
) == 0))
1402 if (!(dfa
->syntax
.syntax_bits
& RE_CONTEXT_INDEP_OPS
)
1403 && dfa
->lex
.laststart
)
1408 {M,} - minimum count, maximum is infinity
1410 {,} - 0 to infinity (same as '*')
1411 {M,N} - M through N */
1413 char const *p
= dfa
->lex
.ptr
;
1414 char const *lim
= p
+ dfa
->lex
.left
;
1415 dfa
->lex
.minrep
= dfa
->lex
.maxrep
= -1;
1416 for (; p
!= lim
&& isasciidigit (*p
); p
++)
1417 dfa
->lex
.minrep
= (dfa
->lex
.minrep
< 0
1419 : MIN (RE_DUP_MAX
+ 1,
1420 dfa
->lex
.minrep
* 10 + *p
- '0'));
1424 dfa
->lex
.maxrep
= dfa
->lex
.minrep
;
1427 if (dfa
->lex
.minrep
< 0)
1428 dfa
->lex
.minrep
= 0;
1429 while (++p
!= lim
&& isasciidigit (*p
))
1431 = (dfa
->lex
.maxrep
< 0
1433 : MIN (RE_DUP_MAX
+ 1,
1434 dfa
->lex
.maxrep
* 10 + *p
- '0'));
1437 if (! ((! backslash
|| (p
!= lim
&& *p
++ == '\\'))
1438 && p
!= lim
&& *p
++ == '}'
1439 && 0 <= dfa
->lex
.minrep
1440 && (dfa
->lex
.maxrep
< 0
1441 || dfa
->lex
.minrep
<= dfa
->lex
.maxrep
)))
1443 if (dfa
->syntax
.syntax_bits
& RE_INVALID_INTERVAL_ORD
)
1445 dfaerror (_("invalid content of \\{\\}"));
1447 if (RE_DUP_MAX
< dfa
->lex
.maxrep
)
1448 dfaerror (_("regular expression too big"));
1450 dfa
->lex
.left
= lim
- p
;
1452 dfa
->lex
.laststart
= false;
1453 return dfa
->lex
.lasttok
= REPMN
;
1456 if (dfa
->syntax
.syntax_bits
& RE_LIMITED_OPS
)
1458 if (backslash
!= ((dfa
->syntax
.syntax_bits
& RE_NO_BK_VBAR
) == 0))
1460 dfa
->lex
.laststart
= true;
1461 return dfa
->lex
.lasttok
= OR
;
1464 if (dfa
->syntax
.syntax_bits
& RE_LIMITED_OPS
1465 || backslash
|| !(dfa
->syntax
.syntax_bits
& RE_NEWLINE_ALT
))
1467 dfa
->lex
.laststart
= true;
1468 return dfa
->lex
.lasttok
= OR
;
1471 if (backslash
!= ((dfa
->syntax
.syntax_bits
& RE_NO_BK_PARENS
) == 0))
1474 dfa
->lex
.laststart
= true;
1475 return dfa
->lex
.lasttok
= LPAREN
;
1478 if (backslash
!= ((dfa
->syntax
.syntax_bits
& RE_NO_BK_PARENS
) == 0))
1480 if (dfa
->lex
.parens
== 0
1481 && dfa
->syntax
.syntax_bits
& RE_UNMATCHED_RIGHT_PAREN_ORD
)
1484 dfa
->lex
.laststart
= false;
1485 return dfa
->lex
.lasttok
= RPAREN
;
1490 if (dfa
->canychar
== (size_t) -1)
1494 if (!(dfa
->syntax
.syntax_bits
& RE_DOT_NEWLINE
))
1495 clrbit ('\n', &ccl
);
1496 if (dfa
->syntax
.syntax_bits
& RE_DOT_NOT_NULL
)
1497 clrbit ('\0', &ccl
);
1498 if (dfa
->localeinfo
.multibyte
)
1499 for (int c2
= 0; c2
< NOTCHAR
; c2
++)
1500 if (dfa
->localeinfo
.sbctowc
[c2
] == WEOF
)
1502 dfa
->canychar
= charclass_index (dfa
, &ccl
);
1504 dfa
->lex
.laststart
= false;
1505 return dfa
->lex
.lasttok
= (dfa
->localeinfo
.multibyte
1507 : CSET
+ dfa
->canychar
);
1511 if (!backslash
|| (dfa
->syntax
.syntax_bits
& RE_NO_GNU_OPS
))
1513 if (!dfa
->localeinfo
.multibyte
)
1517 for (int c2
= 0; c2
< NOTCHAR
; ++c2
)
1522 dfa
->lex
.laststart
= false;
1523 return dfa
->lex
.lasttok
= CSET
+ charclass_index (dfa
, &ccl
);
1526 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1527 add_utf8_anychar, makes sense. */
1529 /* \s and \S are documented to be equivalent to [[:space:]] and
1530 [^[:space:]] respectively, so tell the lexer to process those
1531 strings, each minus its "already processed" '['. */
1534 push_lex_state (dfa
, &ls
, &"^[:space:]]"[c
== 's']);
1535 dfa
->lex
.lasttok
= parse_bracket_exp (dfa
);
1536 pop_lex_state (dfa
, &ls
);
1539 dfa
->lex
.laststart
= false;
1540 return dfa
->lex
.lasttok
;
1544 if (!backslash
|| (dfa
->syntax
.syntax_bits
& RE_NO_GNU_OPS
))
1547 if (!dfa
->localeinfo
.multibyte
)
1551 for (int c2
= 0; c2
< NOTCHAR
; ++c2
)
1552 if (dfa
->syntax
.sbit
[c2
] == CTX_LETTER
)
1556 dfa
->lex
.laststart
= false;
1557 return dfa
->lex
.lasttok
= CSET
+ charclass_index (dfa
, &ccl
);
1560 /* FIXME: see if optimizing this, as is done with ANYCHAR and
1561 add_utf8_anychar, makes sense. */
1563 /* \w and \W are documented to be equivalent to [_[:alnum:]] and
1564 [^_[:alnum:]] respectively, so tell the lexer to process those
1565 strings, each minus its "already processed" '['. */
1568 push_lex_state (dfa
, &ls
, &"^_[:alnum:]]"[c
== 'w']);
1569 dfa
->lex
.lasttok
= parse_bracket_exp (dfa
);
1570 pop_lex_state (dfa
, &ls
);
1573 dfa
->lex
.laststart
= false;
1574 return dfa
->lex
.lasttok
;
1579 dfa
->lex
.laststart
= false;
1580 return dfa
->lex
.lasttok
= parse_bracket_exp (dfa
);
1584 dfa
->lex
.laststart
= false;
1585 /* For multibyte character sets, folding is done in atom. Always
1587 if (dfa
->localeinfo
.multibyte
)
1588 return dfa
->lex
.lasttok
= WCHAR
;
1590 if (dfa
->syntax
.case_fold
&& isalpha (c
))
1594 setbit_case_fold_c (c
, &ccl
);
1595 return dfa
->lex
.lasttok
= CSET
+ charclass_index (dfa
, &ccl
);
1598 return dfa
->lex
.lasttok
= c
;
1602 /* The above loop should consume at most a backslash
1603 and some other character. */
1605 return END
; /* keeps pedantic compilers happy. */
1609 addtok_mb (struct dfa
*dfa
, token t
, char mbprop
)
1611 if (dfa
->talloc
== dfa
->tindex
)
1613 dfa
->tokens
= x2nrealloc (dfa
->tokens
, &dfa
->talloc
,
1614 sizeof *dfa
->tokens
);
1615 if (dfa
->localeinfo
.multibyte
)
1616 dfa
->multibyte_prop
= xnrealloc (dfa
->multibyte_prop
, dfa
->talloc
,
1617 sizeof *dfa
->multibyte_prop
);
1619 if (dfa
->localeinfo
.multibyte
)
1620 dfa
->multibyte_prop
[dfa
->tindex
] = mbprop
;
1621 dfa
->tokens
[dfa
->tindex
++] = t
;
1645 if (dfa
->parse
.depth
> dfa
->depth
)
1646 dfa
->depth
= dfa
->parse
.depth
;
1649 static void addtok_wc (struct dfa
*dfa
, wint_t wc
);
1651 /* Add the given token to the parse tree, maintaining the depth count and
1652 updating the maximum depth if necessary. */
1654 addtok (struct dfa
*dfa
, token t
)
1656 if (dfa
->localeinfo
.multibyte
&& t
== MBCSET
)
1658 bool need_or
= false;
1660 /* Extract wide characters into alternations for better performance.
1661 This does not require UTF-8. */
1662 for (ptrdiff_t i
= 0; i
< dfa
->lex
.brack
.nchars
; i
++)
1664 addtok_wc (dfa
, dfa
->lex
.brack
.chars
[i
]);
1669 dfa
->lex
.brack
.nchars
= 0;
1671 /* Wide characters have been handled above, so it is possible
1672 that the set is empty now. Do nothing in that case. */
1673 if (dfa
->lex
.brack
.cset
!= -1)
1675 addtok (dfa
, CSET
+ dfa
->lex
.brack
.cset
);
1682 addtok_mb (dfa
, t
, 3);
1686 /* We treat a multibyte character as a single atom, so that DFA
1687 can treat a multibyte character as a single expression.
1689 e.g., we construct the following tree from "<mb1><mb2>".
1690 <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
1691 <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
1693 addtok_wc (struct dfa
*dfa
, wint_t wc
)
1695 unsigned char buf
[MB_LEN_MAX
];
1696 mbstate_t s
= { 0 };
1697 size_t stored_bytes
= wcrtomb ((char *) buf
, wc
, &s
);
1699 if (stored_bytes
!= (size_t) -1)
1700 dfa
->lex
.cur_mb_len
= stored_bytes
;
1703 /* This is merely stop-gap. buf[0] is undefined, yet skipping
1704 the addtok_mb call altogether can corrupt the heap. */
1705 dfa
->lex
.cur_mb_len
= 1;
1709 addtok_mb (dfa
, buf
[0], dfa
->lex
.cur_mb_len
== 1 ? 3 : 1);
1710 for (int i
= 1; i
< dfa
->lex
.cur_mb_len
; i
++)
1712 addtok_mb (dfa
, buf
[i
], i
== dfa
->lex
.cur_mb_len
- 1 ? 2 : 0);
1718 add_utf8_anychar (struct dfa
*dfa
)
1720 static charclass
const utf8_classes
[5] = {
1721 /* 80-bf: non-leading bytes. */
1722 CHARCLASS_INIT (0, 0, 0, 0, 0xffffffff, 0xffffffff, 0, 0),
1724 /* 00-7f: 1-byte sequence. */
1725 CHARCLASS_INIT (0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0, 0, 0, 0),
1727 /* c2-df: 2-byte sequence. */
1728 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0xfffffffc, 0),
1730 /* e0-ef: 3-byte sequence. */
1731 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xffff),
1733 /* f0-f7: 4-byte sequence. */
1734 CHARCLASS_INIT (0, 0, 0, 0, 0, 0, 0, 0xff0000)
1736 const unsigned int n
= sizeof (utf8_classes
) / sizeof (utf8_classes
[0]);
1738 /* Define the five character classes that are needed below. */
1739 if (dfa
->utf8_anychar_classes
[0] == 0)
1740 for (unsigned int i
= 0; i
< n
; i
++)
1742 charclass c
= utf8_classes
[i
];
1745 if (!(dfa
->syntax
.syntax_bits
& RE_DOT_NEWLINE
))
1747 if (dfa
->syntax
.syntax_bits
& RE_DOT_NOT_NULL
)
1750 dfa
->utf8_anychar_classes
[i
] = CSET
+ charclass_index (dfa
, &c
);
1753 /* A valid UTF-8 character is
1756 |[0xc2-0xdf][0x80-0xbf]
1757 |[0xe0-0xef[0x80-0xbf][0x80-0xbf]
1758 |[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf])
1760 which I'll write more concisely "B|CA|DAA|EAAA". Factor the [0x00-0x7f]
1761 and you get "B|(C|(D|EA)A)A". And since the token buffer is in reverse
1762 Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR". */
1764 for (i
= 1; i
< n
; i
++)
1765 addtok (dfa
, dfa
->utf8_anychar_classes
[i
]);
1768 addtok (dfa
, dfa
->utf8_anychar_classes
[0]);
1774 /* The grammar understood by the parser is as follows.
1793 <multibyte character>
1804 LPAREN regexp RPAREN
1807 The parser builds a parse tree in postfix form in an array of tokens. */
1810 atom (struct dfa
*dfa
)
1812 if ((0 <= dfa
->parse
.tok
&& dfa
->parse
.tok
< NOTCHAR
)
1813 || dfa
->parse
.tok
>= CSET
1814 || dfa
->parse
.tok
== BEG
|| dfa
->parse
.tok
== BACKREF
1815 || dfa
->parse
.tok
== BEGLINE
|| dfa
->parse
.tok
== ENDLINE
1816 || dfa
->parse
.tok
== BEGWORD
|| dfa
->parse
.tok
== ENDWORD
1817 || dfa
->parse
.tok
== LIMWORD
|| dfa
->parse
.tok
== NOTLIMWORD
1818 || dfa
->parse
.tok
== ANYCHAR
|| dfa
->parse
.tok
== MBCSET
)
1820 if (dfa
->parse
.tok
== ANYCHAR
&& dfa
->localeinfo
.using_utf8
)
1822 /* For UTF-8 expand the period to a series of CSETs that define a
1823 valid UTF-8 character. This avoids using the slow multibyte
1824 path. I'm pretty sure it would be both profitable and correct to
1825 do it for any encoding; however, the optimization must be done
1826 manually as it is done above in add_utf8_anychar. So, let's
1827 start with UTF-8: it is the most used, and the structure of the
1828 encoding makes the correctness more obvious. */
1829 add_utf8_anychar (dfa
);
1832 addtok (dfa
, dfa
->parse
.tok
);
1833 dfa
->parse
.tok
= lex (dfa
);
1835 else if (dfa
->parse
.tok
== WCHAR
)
1837 if (dfa
->lex
.wctok
== WEOF
)
1838 addtok (dfa
, BACKREF
);
1841 addtok_wc (dfa
, dfa
->lex
.wctok
);
1843 if (dfa
->syntax
.case_fold
)
1845 wchar_t folded
[CASE_FOLDED_BUFSIZE
];
1846 unsigned int n
= case_folded_counterparts (dfa
->lex
.wctok
,
1848 for (unsigned int i
= 0; i
< n
; i
++)
1850 addtok_wc (dfa
, folded
[i
]);
1856 dfa
->parse
.tok
= lex (dfa
);
1858 else if (dfa
->parse
.tok
== LPAREN
)
1860 dfa
->parse
.tok
= lex (dfa
);
1862 if (dfa
->parse
.tok
!= RPAREN
)
1863 dfaerror (_("unbalanced ("));
1864 dfa
->parse
.tok
= lex (dfa
);
1867 addtok (dfa
, EMPTY
);
1870 /* Return the number of tokens in the given subexpression. */
1871 static size_t _GL_ATTRIBUTE_PURE
1872 nsubtoks (struct dfa
const *dfa
, size_t tindex
)
1874 switch (dfa
->tokens
[tindex
- 1])
1881 return 1 + nsubtoks (dfa
, tindex
- 1);
1885 size_t ntoks1
= nsubtoks (dfa
, tindex
- 1);
1886 return 1 + ntoks1
+ nsubtoks (dfa
, tindex
- 1 - ntoks1
);
1891 /* Copy the given subexpression to the top of the tree. */
1893 copytoks (struct dfa
*dfa
, size_t tindex
, size_t ntokens
)
1895 if (dfa
->localeinfo
.multibyte
)
1896 for (size_t i
= 0; i
< ntokens
; ++i
)
1897 addtok_mb (dfa
, dfa
->tokens
[tindex
+ i
],
1898 dfa
->multibyte_prop
[tindex
+ i
]);
1900 for (size_t i
= 0; i
< ntokens
; ++i
)
1901 addtok_mb (dfa
, dfa
->tokens
[tindex
+ i
], 3);
1905 closure (struct dfa
*dfa
)
1908 while (dfa
->parse
.tok
== QMARK
|| dfa
->parse
.tok
== STAR
1909 || dfa
->parse
.tok
== PLUS
|| dfa
->parse
.tok
== REPMN
)
1910 if (dfa
->parse
.tok
== REPMN
&& (dfa
->lex
.minrep
|| dfa
->lex
.maxrep
))
1912 size_t ntokens
= nsubtoks (dfa
, dfa
->tindex
);
1913 size_t tindex
= dfa
->tindex
- ntokens
;
1914 if (dfa
->lex
.maxrep
< 0)
1916 if (dfa
->lex
.minrep
== 0)
1917 addtok (dfa
, QMARK
);
1919 for (i
= 1; i
< dfa
->lex
.minrep
; i
++)
1921 copytoks (dfa
, tindex
, ntokens
);
1924 for (; i
< dfa
->lex
.maxrep
; i
++)
1926 copytoks (dfa
, tindex
, ntokens
);
1927 addtok (dfa
, QMARK
);
1930 dfa
->parse
.tok
= lex (dfa
);
1932 else if (dfa
->parse
.tok
== REPMN
)
1934 dfa
->tindex
-= nsubtoks (dfa
, dfa
->tindex
);
1935 dfa
->parse
.tok
= lex (dfa
);
1940 addtok (dfa
, dfa
->parse
.tok
);
1941 dfa
->parse
.tok
= lex (dfa
);
1946 branch (struct dfa
* dfa
)
1949 while (dfa
->parse
.tok
!= RPAREN
&& dfa
->parse
.tok
!= OR
1950 && dfa
->parse
.tok
>= 0)
1958 regexp (struct dfa
*dfa
)
1961 while (dfa
->parse
.tok
== OR
)
1963 dfa
->parse
.tok
= lex (dfa
);
1969 /* Main entry point for the parser. S is a string to be parsed, len is the
1970 length of the string, so s can include NUL characters. D is a pointer to
1971 the struct dfa to parse into. */
1973 dfaparse (char const *s
, size_t len
, struct dfa
*d
)
1977 d
->lex
.lasttok
= END
;
1978 d
->lex
.laststart
= true;
1980 if (!d
->syntax
.syntax_bits_set
)
1981 dfaerror (_("no syntax specified"));
1986 d
->parse
.tok
= lex (d
);
1987 d
->parse
.depth
= d
->depth
;
1991 if (d
->parse
.tok
!= END
)
1992 dfaerror (_("unbalanced )"));
1994 addtok (d
, END
- d
->nregexps
);
2003 /* Some primitives for operating on sets of positions. */
2005 /* Copy one set to another. */
2007 copy (position_set
const *src
, position_set
*dst
)
2009 if (dst
->alloc
< src
->nelem
)
2012 dst
->elems
= xpalloc (NULL
, &dst
->alloc
, src
->nelem
- dst
->alloc
, -1,
2013 sizeof *dst
->elems
);
2015 dst
->nelem
= src
->nelem
;
2016 if (src
->nelem
!= 0)
2017 memcpy (dst
->elems
, src
->elems
, src
->nelem
* sizeof *dst
->elems
);
2021 alloc_position_set (position_set
*s
, size_t size
)
2023 s
->elems
= xnmalloc (size
, sizeof *s
->elems
);
2028 /* Insert position P in set S. S is maintained in sorted order on
2029 decreasing index. If there is already an entry in S with P.index
2030 then merge (logically-OR) P's constraints into the one in S.
2031 S->elems must point to an array large enough to hold the resulting set. */
2033 insert (position p
, position_set
*s
)
2035 ptrdiff_t count
= s
->nelem
;
2036 ptrdiff_t lo
= 0, hi
= count
;
2039 ptrdiff_t mid
= (lo
+ hi
) >> 1;
2040 if (s
->elems
[mid
].index
< p
.index
)
2042 else if (s
->elems
[mid
].index
== p
.index
)
2044 s
->elems
[mid
].constraint
|= p
.constraint
;
2051 s
->elems
= maybe_realloc (s
->elems
, count
, &s
->alloc
, -1, sizeof *s
->elems
);
2052 for (ptrdiff_t i
= count
; i
> lo
; i
--)
2053 s
->elems
[i
] = s
->elems
[i
- 1];
2059 append (position p
, position_set
*s
)
2061 ptrdiff_t count
= s
->nelem
;
2062 s
->elems
= maybe_realloc (s
->elems
, count
, &s
->alloc
, -1, sizeof *s
->elems
);
2063 s
->elems
[s
->nelem
++] = p
;
2066 /* Merge S1 and S2 (with the additional constraint C2) into M. The
2067 result is as if the positions of S1, and of S2 with the additional
2068 constraint C2, were inserted into an initially empty set. */
2070 merge_constrained (position_set
const *s1
, position_set
const *s2
,
2071 unsigned int c2
, position_set
*m
)
2073 ptrdiff_t i
= 0, j
= 0;
2075 if (m
->alloc
- s1
->nelem
< s2
->nelem
)
2078 m
->alloc
= s1
->nelem
;
2079 m
->elems
= xpalloc (NULL
, &m
->alloc
, s2
->nelem
, -1, sizeof *m
->elems
);
2082 while (i
< s1
->nelem
|| j
< s2
->nelem
)
2083 if (! (j
< s2
->nelem
)
2084 || (i
< s1
->nelem
&& s1
->elems
[i
].index
<= s2
->elems
[j
].index
))
2086 unsigned int c
= ((i
< s1
->nelem
&& j
< s2
->nelem
2087 && s1
->elems
[i
].index
== s2
->elems
[j
].index
)
2088 ? s2
->elems
[j
++].constraint
& c2
2090 m
->elems
[m
->nelem
].index
= s1
->elems
[i
].index
;
2091 m
->elems
[m
->nelem
++].constraint
= s1
->elems
[i
++].constraint
| c
;
2095 if (s2
->elems
[j
].constraint
& c2
)
2097 m
->elems
[m
->nelem
].index
= s2
->elems
[j
].index
;
2098 m
->elems
[m
->nelem
++].constraint
= s2
->elems
[j
].constraint
& c2
;
2104 /* Merge two sets of positions into a third. The result is exactly as if
2105 the positions of both sets were inserted into an initially empty set. */
2107 merge (position_set
const *s1
, position_set
const *s2
, position_set
*m
)
2109 merge_constrained (s1
, s2
, -1, m
);
2113 merge2 (position_set
*dst
, position_set
const *src
, position_set
*m
)
2117 for (ptrdiff_t i
= 0; i
< src
->nelem
; ++i
)
2118 insert (src
->elems
[i
], dst
);
2122 merge (src
, dst
, m
);
2127 /* Delete a position from a set. Return the nonzero constraint of the
2128 deleted position, or zero if there was no such position. */
2130 delete (size_t del
, position_set
*s
)
2132 size_t count
= s
->nelem
;
2133 size_t lo
= 0, hi
= count
;
2136 size_t mid
= (lo
+ hi
) >> 1;
2137 if (s
->elems
[mid
].index
< del
)
2139 else if (s
->elems
[mid
].index
== del
)
2141 unsigned int c
= s
->elems
[mid
].constraint
;
2143 for (i
= mid
; i
+ 1 < count
; i
++)
2144 s
->elems
[i
] = s
->elems
[i
+ 1];
2154 /* Replace a position with the followed set. */
2156 replace (position_set
*dst
, size_t del
, position_set
*add
,
2157 unsigned int constraint
, position_set
*tmp
)
2159 unsigned int c
= delete (del
, dst
) & constraint
;
2164 merge_constrained (tmp
, add
, c
, dst
);
2168 /* Find the index of the state corresponding to the given position set with
2169 the given preceding context, or create a new state if there is no such
2170 state. Context tells whether we got here on a newline or letter. */
2172 state_index (struct dfa
*d
, position_set
const *s
, int context
)
2177 token first_end
= 0;
2179 for (i
= 0; i
< s
->nelem
; ++i
)
2180 hash
^= s
->elems
[i
].index
+ s
->elems
[i
].constraint
;
2182 /* Try to find a state that exactly matches the proposed one. */
2183 for (i
= 0; i
< d
->sindex
; ++i
)
2185 if (hash
!= d
->states
[i
].hash
|| s
->nelem
!= d
->states
[i
].elems
.nelem
2186 || context
!= d
->states
[i
].context
)
2189 for (j
= 0; j
< s
->nelem
; ++j
)
2190 if (s
->elems
[j
].constraint
!= d
->states
[i
].elems
.elems
[j
].constraint
2191 || s
->elems
[j
].index
!= d
->states
[i
].elems
.elems
[j
].index
)
2198 fprintf (stderr
, "new state %zd\n nextpos:", i
);
2199 for (state_num j
= 0; j
< s
->nelem
; j
++)
2201 fprintf (stderr
, " %zu:", s
->elems
[j
].index
);
2202 prtok (d
->tokens
[s
->elems
[j
].index
]);
2204 fprintf (stderr
, "\n context:");
2205 if (context
^ CTX_ANY
)
2207 if (context
& CTX_NONE
)
2208 fprintf (stderr
, " CTX_NONE");
2209 if (context
& CTX_LETTER
)
2210 fprintf (stderr
, " CTX_LETTER");
2211 if (context
& CTX_NEWLINE
)
2212 fprintf (stderr
, " CTX_NEWLINE");
2215 fprintf (stderr
, " CTX_ANY");
2216 fprintf (stderr
, "\n");
2219 for (state_num j
= 0; j
< s
->nelem
; j
++)
2221 int c
= d
->constraints
[s
->elems
[j
].index
];
2225 if (succeeds_in_context (c
, context
, CTX_ANY
))
2228 first_end
= d
->tokens
[s
->elems
[j
].index
];
2230 else if (d
->tokens
[s
->elems
[j
].index
] == BACKREF
)
2231 constraint
= NO_CONSTRAINT
;
2235 /* Create a new state. */
2236 d
->states
= maybe_realloc (d
->states
, d
->sindex
, &d
->salloc
, -1,
2238 d
->states
[i
].hash
= hash
;
2239 alloc_position_set (&d
->states
[i
].elems
, s
->nelem
);
2240 copy (s
, &d
->states
[i
].elems
);
2241 d
->states
[i
].context
= context
;
2242 d
->states
[i
].constraint
= constraint
;
2243 d
->states
[i
].first_end
= first_end
;
2244 d
->states
[i
].mbps
.nelem
= 0;
2245 d
->states
[i
].mbps
.elems
= NULL
;
2246 d
->states
[i
].mb_trindex
= -1;
2253 /* Find the epsilon closure of a set of positions. If any position of the set
2254 contains a symbol that matches the empty string in some context, replace
2255 that position with the elements of its follow labeled with an appropriate
2256 constraint. Repeat exhaustively until no funny positions are left.
2257 S->elems must be large enough to hold the result. */
2259 epsclosure (struct dfa
const *d
)
2262 alloc_position_set (&tmp
, d
->nleaves
);
2263 for (size_t i
= 0; i
< d
->tindex
; ++i
)
2264 if (d
->follows
[i
].nelem
> 0 && d
->tokens
[i
] >= NOTCHAR
2265 && d
->tokens
[i
] != BACKREF
&& d
->tokens
[i
] != ANYCHAR
2266 && d
->tokens
[i
] != MBCSET
&& d
->tokens
[i
] < CSET
)
2268 unsigned int constraint
;
2269 switch (d
->tokens
[i
])
2272 constraint
= BEGLINE_CONSTRAINT
;
2275 constraint
= ENDLINE_CONSTRAINT
;
2278 constraint
= BEGWORD_CONSTRAINT
;
2281 constraint
= ENDWORD_CONSTRAINT
;
2284 constraint
= LIMWORD_CONSTRAINT
;
2287 constraint
= NOTLIMWORD_CONSTRAINT
;
2290 constraint
= NO_CONSTRAINT
;
2294 delete (i
, &d
->follows
[i
]);
2296 for (size_t j
= 0; j
< d
->tindex
; j
++)
2297 if (i
!= j
&& d
->follows
[j
].nelem
> 0)
2298 replace (&d
->follows
[j
], i
, &d
->follows
[i
], constraint
, &tmp
);
2303 /* Returns the set of contexts for which there is at least one
2304 character included in C. */
2307 charclass_context (struct dfa
const *dfa
, charclass
const *c
)
2311 for (unsigned int j
= 0; j
< CHARCLASS_WORDS
; ++j
)
2313 if (c
->w
[j
] & dfa
->syntax
.newline
.w
[j
])
2314 context
|= CTX_NEWLINE
;
2315 if (c
->w
[j
] & dfa
->syntax
.letters
.w
[j
])
2316 context
|= CTX_LETTER
;
2317 if (c
->w
[j
] & ~(dfa
->syntax
.letters
.w
[j
] | dfa
->syntax
.newline
.w
[j
]))
2318 context
|= CTX_NONE
;
2324 /* Returns the contexts on which the position set S depends. Each context
2325 in the set of returned contexts (let's call it SC) may have a different
2326 follow set than other contexts in SC, and also different from the
2327 follow set of the complement set (sc ^ CTX_ANY). However, all contexts
2328 in the complement set will have the same follow set. */
2330 static int _GL_ATTRIBUTE_PURE
2331 state_separate_contexts (struct dfa
*d
, position_set
const *s
)
2333 int separate_contexts
= 0;
2335 for (size_t j
= 0; j
< s
->nelem
; j
++)
2336 separate_contexts
|= d
->separates
[s
->elems
[j
].index
];
2338 return separate_contexts
;
2343 /* Single token is repeated. It is distinguished from non-repeated. */
2344 OPT_REPEAT
= (1 << 0),
2346 /* Multiple tokens are repeated. This flag is on at head of tokens. The
2347 node is not merged. */
2348 OPT_LPAREN
= (1 << 1),
2350 /* Multiple branches are joined. The node is not merged. */
2351 OPT_RPAREN
= (1 << 2),
2353 /* The node is walked. If the node is found in walking again, OPT_RPAREN
2354 flag is turned on. */
2355 OPT_WALKED
= (1 << 3),
2357 /* The node is queued. The node is not queued again. */
2358 OPT_QUEUED
= (1 << 4)
2362 merge_nfa_state (struct dfa
*d
, size_t tindex
, char *flags
,
2363 position_set
*merged
)
2365 position_set
*follows
= d
->follows
;
2366 ptrdiff_t nelem
= 0;
2368 d
->constraints
[tindex
] = 0;
2370 for (ptrdiff_t i
= 0; i
< follows
[tindex
].nelem
; i
++)
2372 size_t sindex
= follows
[tindex
].elems
[i
].index
;
2374 /* Skip the node as pruned in future. */
2375 unsigned int iconstraint
= follows
[tindex
].elems
[i
].constraint
;
2376 if (iconstraint
== 0)
2379 if (d
->tokens
[follows
[tindex
].elems
[i
].index
] <= END
)
2381 d
->constraints
[tindex
] |= follows
[tindex
].elems
[i
].constraint
;
2385 if (!(flags
[sindex
] & (OPT_LPAREN
| OPT_RPAREN
)))
2389 for (j
= 0; j
< nelem
; j
++)
2391 size_t dindex
= follows
[tindex
].elems
[j
].index
;
2393 if (follows
[tindex
].elems
[j
].constraint
!= iconstraint
)
2396 if (flags
[dindex
] & (OPT_LPAREN
| OPT_RPAREN
))
2399 if (d
->tokens
[sindex
] != d
->tokens
[dindex
])
2402 if ((flags
[sindex
] ^ flags
[dindex
]) & OPT_REPEAT
)
2405 if (flags
[sindex
] & OPT_REPEAT
)
2406 delete (sindex
, &follows
[sindex
]);
2408 merge2 (&follows
[dindex
], &follows
[sindex
], merged
);
2417 follows
[tindex
].elems
[nelem
++] = follows
[tindex
].elems
[i
];
2418 flags
[sindex
] |= OPT_QUEUED
;
2421 follows
[tindex
].nelem
= nelem
;
2425 compare (const void *a
, const void *b
)
2430 aindex
= (int) ((position
*) a
)->index
;
2431 bindex
= (int) ((position
*) b
)->index
;
2433 return aindex
- bindex
;
2437 reorder_tokens (struct dfa
*d
)
2442 position_set
*follows
;
2444 char *multibyte_prop
;
2448 map
= xnmalloc (d
->tindex
, sizeof *map
);
2452 for (ptrdiff_t i
= 1; i
< d
->tindex
; i
++)
2455 tokens
= xnmalloc (d
->nleaves
, sizeof *tokens
);
2456 follows
= xnmalloc (d
->nleaves
, sizeof *follows
);
2457 constraints
= xnmalloc (d
->nleaves
, sizeof *constraints
);
2459 if (d
->localeinfo
.multibyte
)
2460 multibyte_prop
= xnmalloc (d
->nleaves
, sizeof *multibyte_prop
);
2462 multibyte_prop
= NULL
;
2464 for (ptrdiff_t i
= 0; i
< d
->tindex
; i
++)
2468 free (d
->follows
[i
].elems
);
2469 d
->follows
[i
].elems
= NULL
;
2470 d
->follows
[i
].nelem
= 0;
2474 tokens
[map
[i
]] = d
->tokens
[i
];
2475 follows
[map
[i
]] = d
->follows
[i
];
2476 constraints
[map
[i
]] = d
->constraints
[i
];
2478 if (multibyte_prop
!= NULL
)
2479 multibyte_prop
[map
[i
]] = d
->multibyte_prop
[i
];
2481 for (ptrdiff_t j
= 0; j
< d
->follows
[i
].nelem
; j
++)
2483 if (map
[d
->follows
[i
].elems
[j
].index
] == -1)
2484 map
[d
->follows
[i
].elems
[j
].index
] = nleaves
++;
2486 d
->follows
[i
].elems
[j
].index
= map
[d
->follows
[i
].elems
[j
].index
];
2489 qsort (d
->follows
[i
].elems
, d
->follows
[i
].nelem
,
2490 sizeof *d
->follows
[i
].elems
, compare
);
2493 for (ptrdiff_t i
= 0; i
< nleaves
; i
++)
2495 d
->tokens
[i
] = tokens
[i
];
2496 d
->follows
[i
] = follows
[i
];
2497 d
->constraints
[i
] = constraints
[i
];
2499 if (multibyte_prop
!= NULL
)
2500 d
->multibyte_prop
[i
] = multibyte_prop
[i
];
2503 d
->tindex
= d
->nleaves
= nleaves
;
2508 free (multibyte_prop
);
2513 dfaoptimize (struct dfa
*d
)
2516 position_set merged0
;
2517 position_set
*merged
;
2519 flags
= xmalloc (d
->tindex
* sizeof *flags
);
2520 memset (flags
, 0, d
->tindex
* sizeof *flags
);
2522 for (size_t i
= 0; i
< d
->tindex
; i
++)
2524 for (ptrdiff_t j
= 0; j
< d
->follows
[i
].nelem
; j
++)
2526 if (d
->follows
[i
].elems
[j
].index
== i
)
2527 flags
[d
->follows
[i
].elems
[j
].index
] |= OPT_REPEAT
;
2528 else if (d
->follows
[i
].elems
[j
].index
< i
)
2529 flags
[d
->follows
[i
].elems
[j
].index
] |= OPT_LPAREN
;
2530 else if (flags
[d
->follows
[i
].elems
[j
].index
] &= OPT_WALKED
)
2531 flags
[d
->follows
[i
].elems
[j
].index
] |= OPT_RPAREN
;
2533 flags
[d
->follows
[i
].elems
[j
].index
] |= OPT_WALKED
;
2537 flags
[0] |= OPT_QUEUED
;
2540 alloc_position_set (merged
, d
->nleaves
);
2542 d
->constraints
= xnmalloc (d
->tindex
, sizeof *d
->constraints
);
2544 for (ptrdiff_t i
= 0; i
< d
->tindex
; i
++)
2545 if (flags
[i
] & OPT_QUEUED
)
2546 merge_nfa_state (d
, i
, flags
, merged
);
2550 free (merged
->elems
);
2554 /* Perform bottom-up analysis on the parse tree, computing various functions.
2555 Note that at this point, we're pretending constructs like \< are real
2556 characters rather than constraints on what can follow them.
2558 Nullable: A node is nullable if it is at the root of a regexp that can
2559 match the empty string.
2560 * EMPTY leaves are nullable.
2561 * No other leaf is nullable.
2562 * A QMARK or STAR node is nullable.
2563 * A PLUS node is nullable if its argument is nullable.
2564 * A CAT node is nullable if both its arguments are nullable.
2565 * An OR node is nullable if either argument is nullable.
2567 Firstpos: The firstpos of a node is the set of positions (nonempty leaves)
2568 that could correspond to the first character of a string matching the
2569 regexp rooted at the given node.
2570 * EMPTY leaves have empty firstpos.
2571 * The firstpos of a nonempty leaf is that leaf itself.
2572 * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its
2574 * The firstpos of a CAT node is the firstpos of the left argument, union
2575 the firstpos of the right if the left argument is nullable.
2576 * The firstpos of an OR node is the union of firstpos of each argument.
2578 Lastpos: The lastpos of a node is the set of positions that could
2579 correspond to the last character of a string matching the regexp at
2581 * EMPTY leaves have empty lastpos.
2582 * The lastpos of a nonempty leaf is that leaf itself.
2583 * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its
2585 * The lastpos of a CAT node is the lastpos of its right argument, union
2586 the lastpos of the left if the right argument is nullable.
2587 * The lastpos of an OR node is the union of the lastpos of each argument.
2589 Follow: The follow of a position is the set of positions that could
2590 correspond to the character following a character matching the node in
2591 a string matching the regexp. At this point we consider special symbols
2592 that match the empty string in some context to be just normal characters.
2593 Later, if we find that a special symbol is in a follow set, we will
2594 replace it with the elements of its follow, labeled with an appropriate
2596 * Every node in the firstpos of the argument of a STAR or PLUS node is in
2597 the follow of every node in the lastpos.
2598 * Every node in the firstpos of the second argument of a CAT node is in
2599 the follow of every node in the lastpos of the first argument.
2601 Because of the postfix representation of the parse tree, the depth-first
2602 analysis is conveniently done by a linear scan with the aid of a stack.
2603 Sets are stored as arrays of the elements, obeying a stack-like allocation
2604 scheme; the number of elements in each set deeper in the stack can be
2605 used to determine the address of a particular set's array. */
2607 dfaanalyze (struct dfa
*d
, bool searchflag
)
2609 /* Array allocated to hold position sets. */
2610 position
*posalloc
= xnmalloc (d
->nleaves
, 2 * sizeof *posalloc
);
2611 /* Firstpos and lastpos elements. */
2612 position
*firstpos
= posalloc
;
2613 position
*lastpos
= firstpos
+ d
->nleaves
;
2617 /* Stack for element counts and nullable flags. */
2620 /* Whether the entry is nullable. */
2623 /* Counts of firstpos and lastpos sets. */
2626 } *stkalloc
= xnmalloc (d
->depth
, sizeof *stkalloc
), *stk
= stkalloc
;
2628 position_set merged
; /* Result of merging sets. */
2633 fprintf (stderr
, "dfaanalyze:\n");
2634 for (size_t i
= 0; i
< d
->tindex
; ++i
)
2636 fprintf (stderr
, " %zu:", i
);
2637 prtok (d
->tokens
[i
]);
2639 putc ('\n', stderr
);
2642 d
->searchflag
= searchflag
;
2643 alloc_position_set (&merged
, d
->nleaves
);
2644 d
->follows
= xcalloc (d
->tindex
, sizeof *d
->follows
);
2646 for (size_t i
= 0; i
< d
->tindex
; ++i
)
2648 switch (d
->tokens
[i
])
2651 /* The empty set is nullable. */
2652 stk
->nullable
= true;
2654 /* The firstpos and lastpos of the empty leaf are both empty. */
2655 stk
->nfirstpos
= stk
->nlastpos
= 0;
2661 /* Every element in the firstpos of the argument is in the follow
2662 of every element in the lastpos. */
2664 tmp
.elems
= firstpos
- stk
[-1].nfirstpos
;
2665 tmp
.nelem
= stk
[-1].nfirstpos
;
2666 position
*p
= lastpos
- stk
[-1].nlastpos
;
2667 for (size_t j
= 0; j
< stk
[-1].nlastpos
; j
++)
2669 merge (&tmp
, &d
->follows
[p
[j
].index
], &merged
);
2670 copy (&merged
, &d
->follows
[p
[j
].index
]);
2675 /* A QMARK or STAR node is automatically nullable. */
2676 if (d
->tokens
[i
] != PLUS
)
2677 stk
[-1].nullable
= true;
2681 /* Every element in the firstpos of the second argument is in the
2682 follow of every element in the lastpos of the first argument. */
2684 tmp
.nelem
= stk
[-1].nfirstpos
;
2685 tmp
.elems
= firstpos
- stk
[-1].nfirstpos
;
2686 position
*p
= lastpos
- stk
[-1].nlastpos
- stk
[-2].nlastpos
;
2687 for (size_t j
= 0; j
< stk
[-2].nlastpos
; j
++)
2689 merge (&tmp
, &d
->follows
[p
[j
].index
], &merged
);
2690 copy (&merged
, &d
->follows
[p
[j
].index
]);
2694 /* The firstpos of a CAT node is the firstpos of the first argument,
2695 union that of the second argument if the first is nullable. */
2696 if (stk
[-2].nullable
)
2697 stk
[-2].nfirstpos
+= stk
[-1].nfirstpos
;
2699 firstpos
-= stk
[-1].nfirstpos
;
2701 /* The lastpos of a CAT node is the lastpos of the second argument,
2702 union that of the first argument if the second is nullable. */
2703 if (stk
[-1].nullable
)
2704 stk
[-2].nlastpos
+= stk
[-1].nlastpos
;
2707 position
*p
= lastpos
- stk
[-1].nlastpos
- stk
[-2].nlastpos
;
2708 for (size_t j
= 0; j
< stk
[-1].nlastpos
; j
++)
2709 p
[j
] = p
[j
+ stk
[-2].nlastpos
];
2710 lastpos
-= stk
[-2].nlastpos
;
2711 stk
[-2].nlastpos
= stk
[-1].nlastpos
;
2714 /* A CAT node is nullable if both arguments are nullable. */
2715 stk
[-2].nullable
&= stk
[-1].nullable
;
2720 /* The firstpos is the union of the firstpos of each argument. */
2721 stk
[-2].nfirstpos
+= stk
[-1].nfirstpos
;
2723 /* The lastpos is the union of the lastpos of each argument. */
2724 stk
[-2].nlastpos
+= stk
[-1].nlastpos
;
2726 /* An OR node is nullable if either argument is nullable. */
2727 stk
[-2].nullable
|= stk
[-1].nullable
;
2732 /* Anything else is a nonempty position. (Note that special
2733 constructs like \< are treated as nonempty strings here;
2734 an "epsilon closure" effectively makes them nullable later.
2735 Backreferences have to get a real position so we can detect
2736 transitions on them later. But they are nullable. */
2737 stk
->nullable
= d
->tokens
[i
] == BACKREF
;
2739 /* This position is in its own firstpos and lastpos. */
2740 stk
->nfirstpos
= stk
->nlastpos
= 1;
2743 firstpos
->index
= lastpos
->index
= i
;
2744 firstpos
->constraint
= lastpos
->constraint
= NO_CONSTRAINT
;
2745 firstpos
++, lastpos
++;
2750 /* ... balance the above nonsyntactic #ifdef goo... */
2751 fprintf (stderr
, "node %zu:", i
);
2752 prtok (d
->tokens
[i
]);
2753 putc ('\n', stderr
);
2755 stk
[-1].nullable
? " nullable: yes\n" : " nullable: no\n");
2756 fprintf (stderr
, " firstpos:");
2757 for (size_t j
= 0; j
< stk
[-1].nfirstpos
; j
++)
2759 fprintf (stderr
, " %zu:", firstpos
[j
- stk
[-1].nfirstpos
].index
);
2760 prtok (d
->tokens
[firstpos
[j
- stk
[-1].nfirstpos
].index
]);
2762 fprintf (stderr
, "\n lastpos:");
2763 for (size_t j
= 0; j
< stk
[-1].nlastpos
; j
++)
2765 fprintf (stderr
, " %zu:", lastpos
[j
- stk
[-1].nlastpos
].index
);
2766 prtok (d
->tokens
[lastpos
[j
- stk
[-1].nlastpos
].index
]);
2768 putc ('\n', stderr
);
2772 /* For each follow set that is the follow set of a real position, replace
2773 it with its epsilon closure. */
2779 for (size_t i
= 0; i
< d
->tindex
; ++i
)
2780 if (d
->tokens
[i
] == BEG
|| d
->tokens
[i
] < NOTCHAR
2781 || d
->tokens
[i
] == BACKREF
|| d
->tokens
[i
] == ANYCHAR
2782 || d
->tokens
[i
] == MBCSET
|| d
->tokens
[i
] >= CSET
)
2784 fprintf (stderr
, "follows(%zu:", i
);
2785 prtok (d
->tokens
[i
]);
2786 fprintf (stderr
, "):");
2787 for (size_t j
= 0; j
< d
->follows
[i
].nelem
; j
++)
2789 fprintf (stderr
, " %zu:", d
->follows
[i
].elems
[j
].index
);
2790 prtok (d
->tokens
[d
->follows
[i
].elems
[j
].index
]);
2792 putc ('\n', stderr
);
2797 pos
.constraint
= NO_CONSTRAINT
;
2799 alloc_position_set (&tmp
, 1);
2803 d
->separates
= xnmalloc (d
->tindex
, sizeof *d
->separates
);
2805 for (ptrdiff_t i
= 0; i
< d
->tindex
; i
++)
2807 d
->separates
[i
] = 0;
2809 if (prev_newline_dependent (d
->constraints
[i
]))
2810 d
->separates
[i
] |= CTX_NEWLINE
;
2811 if (prev_letter_dependent (d
->constraints
[i
]))
2812 d
->separates
[i
] |= CTX_LETTER
;
2814 for (ptrdiff_t j
= 0; j
< d
->follows
[i
].nelem
; j
++)
2816 if (prev_newline_dependent (d
->follows
[i
].elems
[j
].constraint
))
2817 d
->separates
[i
] |= CTX_NEWLINE
;
2818 if (prev_letter_dependent (d
->follows
[i
].elems
[j
].constraint
))
2819 d
->separates
[i
] |= CTX_LETTER
;
2823 /* Context wanted by some position. */
2824 int separate_contexts
= state_separate_contexts (d
, &tmp
);
2826 /* Build the initial state. */
2827 if (separate_contexts
& CTX_NEWLINE
)
2828 state_index (d
, &tmp
, CTX_NEWLINE
);
2829 d
->initstate_notbol
= d
->min_trcount
2830 = state_index (d
, &tmp
, separate_contexts
^ CTX_ANY
);
2831 if (separate_contexts
& CTX_LETTER
)
2832 d
->min_trcount
= state_index (d
, &tmp
, CTX_LETTER
);
2838 free (merged
.elems
);
2842 /* Make sure D's state arrays are large enough to hold NEW_STATE. */
2844 realloc_trans_if_necessary (struct dfa
*d
)
2846 state_num oldalloc
= d
->tralloc
;
2847 if (oldalloc
< d
->sindex
)
2849 state_num
**realtrans
= d
->trans
? d
->trans
- 2 : NULL
;
2850 ptrdiff_t newalloc1
= realtrans
? d
->tralloc
+ 2 : 0;
2851 realtrans
= xpalloc (realtrans
, &newalloc1
, d
->sindex
- oldalloc
,
2852 -1, sizeof *realtrans
);
2853 realtrans
[0] = realtrans
[1] = NULL
;
2854 d
->trans
= realtrans
+ 2;
2855 ptrdiff_t newalloc
= d
->tralloc
= newalloc1
- 2;
2856 d
->fails
= xnrealloc (d
->fails
, newalloc
, sizeof *d
->fails
);
2857 d
->success
= xnrealloc (d
->success
, newalloc
, sizeof *d
->success
);
2858 d
->newlines
= xnrealloc (d
->newlines
, newalloc
, sizeof *d
->newlines
);
2859 if (d
->localeinfo
.multibyte
)
2861 realtrans
= d
->mb_trans
? d
->mb_trans
- 2 : NULL
;
2862 realtrans
= xnrealloc (realtrans
, newalloc1
, sizeof *realtrans
);
2864 realtrans
[0] = realtrans
[1] = NULL
;
2865 d
->mb_trans
= realtrans
+ 2;
2867 for (; oldalloc
< newalloc
; oldalloc
++)
2869 d
->trans
[oldalloc
] = NULL
;
2870 d
->fails
[oldalloc
] = NULL
;
2871 if (d
->localeinfo
.multibyte
)
2872 d
->mb_trans
[oldalloc
] = NULL
;
2878 Calculate the transition table for a new state derived from state s
2879 for a compiled dfa d after input character uc, and return the new
2882 Do not worry about all possible input characters; calculate just the group
2883 of positions that match uc. Label it with the set of characters that
2884 every position in the group matches (taking into account, if necessary,
2885 preceding context information of s). Then find the union
2886 of these positions' follows, i.e., the set of positions of the
2887 new state. For each character in the group's label, set the transition
2888 on this character to be to a state corresponding to the set's positions,
2889 and its associated backward context information, if necessary.
2891 When building a searching matcher, include the positions of state
2894 The group is constructed by building an equivalence-class
2895 partition of the positions of s.
2897 For each position, find the set of characters C that it matches. Eliminate
2898 any characters from C that fail on grounds of backward context.
2900 Check whether the group's label L has nonempty
2901 intersection with C. If L - C is nonempty, create a new group labeled
2902 L - C and having the same positions as the current group, and set L to
2903 the intersection of L and C. Insert the position in the group, set
2904 C = C - L, and resume scanning.
2906 If after comparing with every group there are characters remaining in C,
2907 create a new group labeled with the characters of C and insert this
2908 position in that group. */
2911 build_state (state_num s
, struct dfa
*d
, unsigned char uc
)
2913 position_set follows
; /* Union of the follows for each
2914 position of the current state. */
2915 position_set group
; /* Positions that match the input char. */
2916 position_set tmp
; /* Temporary space for merging sets. */
2917 state_num state
; /* New state. */
2918 state_num state_newline
; /* New state on a newline transition. */
2919 state_num state_letter
; /* New state on a letter transition. */
2922 fprintf (stderr
, "build state %td\n", s
);
2925 /* A pointer to the new transition table, and the table itself. */
2926 state_num
**ptrans
= (accepting (s
, d
) ? d
->fails
: d
->trans
) + s
;
2927 state_num
*trans
= *ptrans
;
2931 /* MAX_TRCOUNT is an arbitrary upper limit on the number of
2932 transition tables that can exist at once, other than for
2933 initial states. Often-used transition tables are quickly
2934 rebuilt, whereas rarely-used ones are cleared away. */
2935 if (MAX_TRCOUNT
<= d
->trcount
)
2937 for (state_num i
= d
->min_trcount
; i
< d
->tralloc
; i
++)
2941 d
->trans
[i
] = d
->fails
[i
] = NULL
;
2947 *ptrans
= trans
= xmalloc (NOTCHAR
* sizeof *trans
);
2949 /* Fill transition table with a default value which means that the
2950 transited state has not been calculated yet. */
2951 for (int i
= 0; i
< NOTCHAR
; i
++)
2955 /* Set up the success bits for this state. */
2957 if (accepts_in_context (d
->states
[s
].context
, CTX_NEWLINE
, s
, d
))
2958 d
->success
[s
] |= CTX_NEWLINE
;
2959 if (accepts_in_context (d
->states
[s
].context
, CTX_LETTER
, s
, d
))
2960 d
->success
[s
] |= CTX_LETTER
;
2961 if (accepts_in_context (d
->states
[s
].context
, CTX_NONE
, s
, d
))
2962 d
->success
[s
] |= CTX_NONE
;
2964 alloc_position_set (&follows
, d
->nleaves
);
2966 /* Find the union of the follows of the positions of the group.
2967 This is a hideously inefficient loop. Fix it someday. */
2968 for (size_t j
= 0; j
< d
->states
[s
].elems
.nelem
; ++j
)
2970 k
< d
->follows
[d
->states
[s
].elems
.elems
[j
].index
].nelem
; ++k
)
2971 insert (d
->follows
[d
->states
[s
].elems
.elems
[j
].index
].elems
[k
],
2974 /* Positions that match the input char. */
2975 alloc_position_set (&group
, d
->nleaves
);
2977 /* The group's label. */
2981 for (size_t i
= 0; i
< follows
.nelem
; ++i
)
2983 charclass matches
; /* Set of matching characters. */
2984 position pos
= follows
.elems
[i
];
2985 bool matched
= false;
2986 if (d
->tokens
[pos
.index
] >= 0 && d
->tokens
[pos
.index
] < NOTCHAR
)
2989 setbit (d
->tokens
[pos
.index
], &matches
);
2990 if (d
->tokens
[pos
.index
] == uc
)
2993 else if (d
->tokens
[pos
.index
] >= CSET
)
2995 matches
= d
->charclasses
[d
->tokens
[pos
.index
] - CSET
];
2996 if (tstbit (uc
, &matches
))
2999 else if (d
->tokens
[pos
.index
] == ANYCHAR
)
3001 matches
= d
->charclasses
[d
->canychar
];
3002 if (tstbit (uc
, &matches
))
3005 /* ANYCHAR must match with a single character, so we must put
3006 it to D->states[s].mbps which contains the positions which
3007 can match with a single character not a byte. If all
3008 positions which has ANYCHAR does not depend on context of
3009 next character, we put the follows instead of it to
3010 D->states[s].mbps to optimize. */
3011 if (succeeds_in_context (pos
.constraint
, d
->states
[s
].context
,
3014 if (d
->states
[s
].mbps
.nelem
== 0)
3015 alloc_position_set (&d
->states
[s
].mbps
, 1);
3016 insert (pos
, &d
->states
[s
].mbps
);
3022 /* Some characters may need to be eliminated from matches because
3023 they fail in the current context. */
3024 if (pos
.constraint
!= NO_CONSTRAINT
)
3026 if (!succeeds_in_context (pos
.constraint
,
3027 d
->states
[s
].context
, CTX_NEWLINE
))
3028 for (size_t j
= 0; j
< CHARCLASS_WORDS
; ++j
)
3029 matches
.w
[j
] &= ~d
->syntax
.newline
.w
[j
];
3030 if (!succeeds_in_context (pos
.constraint
,
3031 d
->states
[s
].context
, CTX_LETTER
))
3032 for (size_t j
= 0; j
< CHARCLASS_WORDS
; ++j
)
3033 matches
.w
[j
] &= ~d
->syntax
.letters
.w
[j
];
3034 if (!succeeds_in_context (pos
.constraint
,
3035 d
->states
[s
].context
, CTX_NONE
))
3036 for (size_t j
= 0; j
< CHARCLASS_WORDS
; ++j
)
3037 matches
.w
[j
] &= d
->syntax
.letters
.w
[j
] | d
->syntax
.newline
.w
[j
];
3039 /* If there are no characters left, there's no point in going on. */
3040 if (emptyset (&matches
))
3043 /* If we have reset the bit that made us declare "matched", reset
3044 that indicator, too. This is required to avoid an infinite loop
3045 with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]' */
3046 if (!tstbit (uc
, &matches
))
3051 fprintf (stderr
, " nextpos %zu:", pos
.index
);
3052 prtok (d
->tokens
[pos
.index
]);
3053 fprintf (stderr
, " of");
3054 for (size_t j
= 0; j
< NOTCHAR
; j
++)
3055 if (tstbit (j
, &matches
))
3056 fprintf (stderr
, " 0x%02zx", j
);
3057 fprintf (stderr
, "\n");
3062 for (size_t k
= 0; k
< CHARCLASS_WORDS
; ++k
)
3063 label
.w
[k
] &= matches
.w
[k
];
3064 append (pos
, &group
);
3068 for (size_t k
= 0; k
< CHARCLASS_WORDS
; ++k
)
3069 label
.w
[k
] &= ~matches
.w
[k
];
3073 alloc_position_set (&tmp
, d
->nleaves
);
3075 if (group
.nelem
> 0)
3077 /* If we are building a searching matcher, throw in the positions
3078 of state 0 as well, if possible. */
3081 /* If a token in follows.elems is not 1st byte of a multibyte
3082 character, or the states of follows must accept the bytes
3083 which are not 1st byte of the multibyte character.
3084 Then, if a state of follows encounters a byte, it must not be
3085 a 1st byte of a multibyte character nor a single byte character.
3086 In this case, do not add state[0].follows to next state, because
3087 state[0] must accept 1st-byte.
3089 For example, suppose <sb a> is a certain single byte character,
3090 <mb A> is a certain multibyte character, and the codepoint of
3091 <sb a> equals the 2nd byte of the codepoint of <mb A>. When
3092 state[0] accepts <sb a>, state[i] transits to state[i+1] by
3093 accepting the 1st byte of <mb A>, and state[i+1] accepts the
3094 2nd byte of <mb A>, if state[i+1] encounters the codepoint of
3095 <sb a>, it must not be <sb a> but the 2nd byte of <mb A>, so do
3096 not add state[0]. */
3098 bool mergeit
= !d
->localeinfo
.multibyte
;
3102 for (size_t j
= 0; mergeit
&& j
< group
.nelem
; j
++)
3103 mergeit
&= d
->multibyte_prop
[group
.elems
[j
].index
];
3107 merge (&d
->states
[0].elems
, &group
, &tmp
);
3108 copy (&tmp
, &group
);
3112 /* Find out if the new state will want any context information,
3113 by calculating possible contexts that the group can match,
3114 and separate contexts that the new state wants to know. */
3115 int possible_contexts
= charclass_context (d
, &label
);
3116 int separate_contexts
= state_separate_contexts (d
, &group
);
3118 /* Find the state(s) corresponding to the union of the follows. */
3119 if (possible_contexts
& ~separate_contexts
)
3120 state
= state_index (d
, &group
, separate_contexts
^ CTX_ANY
);
3123 if (separate_contexts
& possible_contexts
& CTX_NEWLINE
)
3124 state_newline
= state_index (d
, &group
, CTX_NEWLINE
);
3126 state_newline
= state
;
3127 if (separate_contexts
& possible_contexts
& CTX_LETTER
)
3128 state_letter
= state_index (d
, &group
, CTX_LETTER
);
3130 state_letter
= state
;
3132 /* Reallocate now, to reallocate any newline transition properly. */
3133 realloc_trans_if_necessary (d
);
3136 /* If we are a searching matcher, the default transition is to a state
3137 containing the positions of state 0, otherwise the default transition
3138 is to fail miserably. */
3139 else if (d
->searchflag
)
3142 state_letter
= d
->min_trcount
- 1;
3143 state
= d
->initstate_notbol
;
3152 /* Set the transitions for each character in the label. */
3153 for (size_t i
= 0; i
< NOTCHAR
; i
++)
3154 if (tstbit (i
, &label
))
3155 switch (d
->syntax
.sbit
[i
])
3158 trans
[i
] = state_newline
;
3161 trans
[i
] = state_letter
;
3169 fprintf (stderr
, "trans table %td", s
);
3170 for (size_t i
= 0; i
< NOTCHAR
; ++i
)
3173 fprintf (stderr
, "\n");
3174 fprintf (stderr
, " %2td", trans
[i
]);
3176 fprintf (stderr
, "\n");
3180 free (follows
.elems
);
3183 /* Keep the newline transition in a special place so we can use it as
3185 if (tstbit (d
->syntax
.eolbyte
, &label
))
3187 d
->newlines
[s
] = trans
[d
->syntax
.eolbyte
];
3188 trans
[d
->syntax
.eolbyte
] = -1;
3194 /* Multibyte character handling sub-routines for dfaexec. */
3196 /* Consume a single byte and transit state from 's' to '*next_state'.
3197 This function is almost same as the state transition routin in dfaexec.
3198 But state transition is done just once, otherwise matching succeed or
3199 reach the end of the buffer. */
3201 transit_state_singlebyte (struct dfa
*d
, state_num s
, unsigned char const **pp
)
3207 else if (d
->fails
[s
])
3211 build_state (s
, d
, **pp
);
3222 build_state (s
, d
, **pp
);
3227 /* Transit state from s, then return new state and update the pointer of
3228 the buffer. This function is for a period operator which can match a
3229 multi-byte character. */
3231 transit_state (struct dfa
*d
, state_num s
, unsigned char const **pp
,
3232 unsigned char const *end
)
3236 int mbclen
= mbs_to_wchar (&wc
, (char const *) *pp
, end
- *pp
, d
);
3238 /* This state has some operators which can match a multibyte character. */
3239 d
->mb_follows
.nelem
= 0;
3241 /* Calculate the state which can be reached from the state 's' by
3242 consuming 'mbclen' single bytes from the buffer. */
3245 for (mbci
= 0; mbci
< mbclen
&& (mbci
== 0 || d
->min_trcount
<= s
); mbci
++)
3246 s
= transit_state_singlebyte (d
, s
, pp
);
3247 *pp
+= mbclen
- mbci
;
3251 /* It is an invalid character, so ANYCHAR is not accepted. */
3255 /* If all positions which have ANYCHAR do not depend on the context
3256 of the next character, calculate the next state with
3257 pre-calculated follows and cache the result. */
3258 if (d
->states
[s1
].mb_trindex
< 0)
3260 if (MAX_TRCOUNT
<= d
->mb_trcount
)
3263 for (s3
= -1; s3
< d
->tralloc
; s3
++)
3265 free (d
->mb_trans
[s3
]);
3266 d
->mb_trans
[s3
] = NULL
;
3269 for (state_num i
= 0; i
< d
->sindex
; i
++)
3270 d
->states
[i
].mb_trindex
= -1;
3273 d
->states
[s1
].mb_trindex
= d
->mb_trcount
++;
3276 if (! d
->mb_trans
[s
])
3278 enum { TRANSPTR_SIZE
= sizeof *d
->mb_trans
[s
] };
3279 enum { TRANSALLOC_SIZE
= MAX_TRCOUNT
* TRANSPTR_SIZE
};
3280 d
->mb_trans
[s
] = xmalloc (TRANSALLOC_SIZE
);
3281 for (int i
= 0; i
< MAX_TRCOUNT
; i
++)
3282 d
->mb_trans
[s
][i
] = -1;
3284 else if (d
->mb_trans
[s
][d
->states
[s1
].mb_trindex
] >= 0)
3285 return d
->mb_trans
[s
][d
->states
[s1
].mb_trindex
];
3288 copy (&d
->states
[s1
].mbps
, &d
->mb_follows
);
3290 merge (&d
->states
[s1
].mbps
, &d
->states
[s
].elems
, &d
->mb_follows
);
3292 int separate_contexts
= state_separate_contexts (d
, &d
->mb_follows
);
3293 state_num s2
= state_index (d
, &d
->mb_follows
, separate_contexts
^ CTX_ANY
);
3294 realloc_trans_if_necessary (d
);
3296 d
->mb_trans
[s
][d
->states
[s1
].mb_trindex
] = s2
;
3301 /* The initial state may encounter a byte which is not a single byte character
3302 nor the first byte of a multibyte character. But it is incorrect for the
3303 initial state to accept such a byte. For example, in Shift JIS the regular
3304 expression "\\" accepts the codepoint 0x5c, but should not accept the second
3305 byte of the codepoint 0x815c. Then the initial state must skip the bytes
3306 that are not a single byte character nor the first byte of a multibyte
3309 Given DFA state d, use mbs_to_wchar to advance MBP until it reaches
3310 or exceeds P, and return the advanced MBP. If WCP is non-NULL and
3311 the result is greater than P, set *WCP to the final wide character
3312 processed, or to WEOF if no wide character is processed. Otherwise,
3313 if WCP is non-NULL, *WCP may or may not be updated.
3315 Both P and MBP must be no larger than END. */
3316 static unsigned char const *
3317 skip_remains_mb (struct dfa
*d
, unsigned char const *p
,
3318 unsigned char const *mbp
, char const *end
)
3320 if (d
->syntax
.never_trail
[*p
])
3325 mbp
+= mbs_to_wchar (&wc
, (char const *) mbp
,
3326 end
- (char const *) mbp
, d
);
3331 /* Search through a buffer looking for a match to the struct dfa *D.
3332 Find the first occurrence of a string matching the regexp in the
3333 buffer, and the shortest possible version thereof. Return a pointer to
3334 the first character after the match, or NULL if none is found. BEGIN
3335 points to the beginning of the buffer, and END points to the first byte
3336 after its end. Note however that we store a sentinel byte (usually
3337 newline) in *END, so the actual buffer must be one byte longer.
3338 When ALLOW_NL, newlines may appear in the matching string.
3339 If COUNT is non-NULL, increment *COUNT once for each newline processed.
3340 If MULTIBYTE, the input consists of multibyte characters and/or
3341 encoding-error bytes. Otherwise, it consists of single-byte characters.
3342 Here is the list of features that make this DFA matcher punt:
3343 - [M-N] range in non-simple locale: regex is up to 25% faster on [a-z]
3344 - [^...] in non-simple locale
3345 - [[=foo=]] or [[.foo.]]
3346 - [[:alpha:]] etc. in multibyte locale (except [[:digit:]] works OK)
3347 - back-reference: (.)\1
3348 - word-delimiter in multibyte locale: \<, \>, \b, \B
3349 See using_simple_locale for the definition of "simple locale". */
3351 static inline char *
3352 dfaexec_main (struct dfa
*d
, char const *begin
, char *end
, bool allow_nl
,
3353 size_t *count
, bool multibyte
)
3355 if (MAX_TRCOUNT
<= d
->sindex
)
3357 for (state_num s
= d
->min_trcount
; s
< d
->sindex
; s
++)
3359 free (d
->states
[s
].elems
.elems
);
3360 free (d
->states
[s
].mbps
.elems
);
3362 d
->sindex
= d
->min_trcount
;
3366 for (state_num s
= 0; s
< d
->tralloc
; s
++)
3370 d
->trans
[s
] = d
->fails
[s
] = NULL
;
3375 if (d
->localeinfo
.multibyte
&& d
->mb_trans
)
3377 for (state_num s
= -1; s
< d
->tralloc
; s
++)
3379 free (d
->mb_trans
[s
]);
3380 d
->mb_trans
[s
] = NULL
;
3382 for (state_num s
= 0; s
< d
->min_trcount
; s
++)
3383 d
->states
[s
].mb_trindex
= -1;
3389 realloc_trans_if_necessary (d
);
3391 /* Current state. */
3392 state_num s
= 0, s1
= 0;
3394 /* Current input character. */
3395 unsigned char const *p
= (unsigned char const *) begin
;
3396 unsigned char const *mbp
= p
;
3398 /* Copy of d->trans so it can be optimized into a register. */
3399 state_num
**trans
= d
->trans
;
3400 unsigned char eol
= d
->syntax
.eolbyte
; /* Likewise for eolbyte. */
3401 unsigned char saved_end
= *(unsigned char *) end
;
3406 memset (&d
->mbs
, 0, sizeof d
->mbs
);
3407 if (d
->mb_follows
.alloc
== 0)
3408 alloc_position_set (&d
->mb_follows
, d
->nleaves
);
3415 while ((t
= trans
[s
]) != NULL
)
3417 if (s
< d
->min_trcount
)
3419 if (!multibyte
|| d
->states
[s
].mbps
.nelem
== 0)
3425 p
= mbp
= skip_remains_mb (d
, p
, mbp
, end
);
3432 if (d
->states
[s
].mbps
.nelem
== 0
3433 || d
->localeinfo
.sbctowc
[*p
] != WEOF
|| (char *) p
>= end
)
3435 /* If an input character does not match ANYCHAR, do it
3436 like a single-byte character. */
3441 s
= transit_state (d
, s
, &p
, (unsigned char *) end
);
3454 s1
= tmp
; /* swap */
3457 if (s
< d
->min_trcount
)
3470 s
= build_state (s1
, d
, p
[-1]);
3473 else if ((char *) p
<= end
&& p
[-1] == eol
&& 0 <= d
->newlines
[s1
])
3475 /* The previous character was a newline. Count it, and skip
3476 checking of multibyte character boundary until here. */
3480 s
= (allow_nl
? d
->newlines
[s1
]
3481 : d
->syntax
.sbit
[eol
] == CTX_NEWLINE
? 0
3482 : d
->syntax
.sbit
[eol
] == CTX_LETTER
? d
->min_trcount
- 1
3483 : d
->initstate_notbol
);
3491 else if (d
->fails
[s
])
3493 if ((d
->success
[s
] & d
->syntax
.sbit
[*p
])
3494 || ((char *) p
== end
3495 && accepts_in_context (d
->states
[s
].context
, CTX_NEWLINE
, s
,
3499 if (multibyte
&& s
< d
->min_trcount
)
3500 p
= mbp
= skip_remains_mb (d
, p
, mbp
, end
);
3503 if (!multibyte
|| d
->states
[s
].mbps
.nelem
== 0
3504 || d
->localeinfo
.sbctowc
[*p
] != WEOF
|| (char *) p
>= end
)
3506 /* If a input character does not match ANYCHAR, do it
3507 like a single-byte character. */
3508 s
= d
->fails
[s
][*p
++];
3512 s
= transit_state (d
, s
, &p
, (unsigned char *) end
);
3519 build_state (s
, d
, p
[0]);
3531 /* Specialized versions of dfaexec for multibyte and single-byte cases.
3532 This is for performance, as dfaexec_main is an inline function. */
3535 dfaexec_mb (struct dfa
*d
, char const *begin
, char *end
,
3536 bool allow_nl
, size_t *count
, bool *backref
)
3538 return dfaexec_main (d
, begin
, end
, allow_nl
, count
, true);
3542 dfaexec_sb (struct dfa
*d
, char const *begin
, char *end
,
3543 bool allow_nl
, size_t *count
, bool *backref
)
3545 return dfaexec_main (d
, begin
, end
, allow_nl
, count
, false);
3548 /* Always set *BACKREF and return BEGIN. Use this wrapper for
3549 any regexp that uses a construct not supported by this code. */
3551 dfaexec_noop (struct dfa
*d
, char const *begin
, char *end
,
3552 bool allow_nl
, size_t *count
, bool *backref
)
3555 return (char *) begin
;
3558 /* Like dfaexec_main (D, BEGIN, END, ALLOW_NL, COUNT, D->localeinfo.multibyte),
3559 but faster and set *BACKREF if the DFA code does not support this
3563 dfaexec (struct dfa
*d
, char const *begin
, char *end
,
3564 bool allow_nl
, size_t *count
, bool *backref
)
3566 return d
->dfaexec (d
, begin
, end
, allow_nl
, count
, backref
);
3570 dfasuperset (struct dfa
const *d
)
3576 dfaisfast (struct dfa
const *d
)
3582 free_mbdata (struct dfa
*d
)
3584 free (d
->multibyte_prop
);
3585 free (d
->lex
.brack
.chars
);
3586 free (d
->mb_follows
.elems
);
3591 for (s
= -1; s
< d
->tralloc
; s
++)
3592 free (d
->mb_trans
[s
]);
3593 free (d
->mb_trans
- 2);
3597 /* Return true if every construct in D is supported by this DFA matcher. */
3598 static bool _GL_ATTRIBUTE_PURE
3599 dfa_supported (struct dfa
const *d
)
3601 for (size_t i
= 0; i
< d
->tindex
; i
++)
3603 switch (d
->tokens
[i
])
3609 if (!d
->localeinfo
.multibyte
)
3620 /* Disable use of the superset DFA if it is not likely to help
3623 maybe_disable_superset_dfa (struct dfa
*d
)
3625 if (!d
->localeinfo
.using_utf8
)
3628 bool have_backref
= false;
3629 for (size_t i
= 0; i
< d
->tindex
; ++i
)
3631 switch (d
->tokens
[i
])
3637 have_backref
= true;
3640 /* Requires multi-byte algorithm. */
3647 if (!have_backref
&& d
->superset
)
3649 /* The superset DFA is not likely to be much faster, so remove it. */
3650 dfafree (d
->superset
);
3656 d
->localeinfo
.multibyte
= false;
3657 d
->dfaexec
= dfaexec_sb
;
3662 dfassbuild (struct dfa
*d
)
3664 struct dfa
*sup
= dfaalloc ();
3667 sup
->localeinfo
.multibyte
= false;
3668 sup
->dfaexec
= dfaexec_sb
;
3669 sup
->multibyte_prop
= NULL
;
3670 sup
->superset
= NULL
;
3673 sup
->constraints
= NULL
;
3674 sup
->separates
= NULL
;
3675 sup
->follows
= NULL
;
3679 sup
->success
= NULL
;
3680 sup
->newlines
= NULL
;
3682 sup
->charclasses
= xnmalloc (sup
->calloc
, sizeof *sup
->charclasses
);
3685 memcpy (sup
->charclasses
, d
->charclasses
,
3686 d
->cindex
* sizeof *sup
->charclasses
);
3689 sup
->tokens
= xnmalloc (d
->tindex
, 2 * sizeof *sup
->tokens
);
3690 sup
->talloc
= d
->tindex
* 2;
3692 bool have_achar
= false;
3693 bool have_nchar
= false;
3695 for (size_t i
= j
= 0; i
< d
->tindex
; i
++)
3697 switch (d
->tokens
[i
])
3705 sup
->tokens
[j
++] = CSET
+ charclass_index (sup
, &ccl
);
3706 sup
->tokens
[j
++] = STAR
;
3707 if (d
->tokens
[i
+ 1] == QMARK
|| d
->tokens
[i
+ 1] == STAR
3708 || d
->tokens
[i
+ 1] == PLUS
)
3717 if (d
->localeinfo
.multibyte
)
3719 /* These constraints aren't supported in a multibyte locale.
3720 Ignore them in the superset DFA. */
3721 sup
->tokens
[j
++] = EMPTY
;
3726 sup
->tokens
[j
++] = d
->tokens
[i
];
3727 if ((0 <= d
->tokens
[i
] && d
->tokens
[i
] < NOTCHAR
)
3728 || d
->tokens
[i
] >= CSET
)
3735 if (have_nchar
&& (have_achar
|| d
->localeinfo
.multibyte
))
3744 /* Parse and analyze a single string of the given length. */
3746 dfacomp (char const *s
, size_t len
, struct dfa
*d
, bool searchflag
)
3748 dfaparse (s
, len
, d
);
3751 if (dfa_supported (d
))
3753 maybe_disable_superset_dfa (d
);
3754 dfaanalyze (d
, searchflag
);
3758 d
->dfaexec
= dfaexec_noop
;
3764 dfaanalyze (d
->superset
, searchflag
);
3768 /* Free the storage held by the components of a dfa. */
3770 dfafree (struct dfa
*d
)
3772 free (d
->charclasses
);
3775 if (d
->localeinfo
.multibyte
)
3778 free (d
->constraints
);
3779 free (d
->separates
);
3781 for (size_t i
= 0; i
< d
->sindex
; ++i
)
3783 free (d
->states
[i
].elems
.elems
);
3784 free (d
->states
[i
].mbps
.elems
);
3790 for (size_t i
= 0; i
< d
->tindex
; ++i
)
3791 free (d
->follows
[i
].elems
);
3797 for (size_t i
= 0; i
< d
->tralloc
; ++i
)
3803 free (d
->trans
- 2);
3811 dfafree (d
->superset
);
3816 /* Having found the postfix representation of the regular expression,
3817 try to find a long sequence of characters that must appear in any line
3819 Finding a "longest" sequence is beyond the scope here;
3820 we take an easy way out and hope for the best.
3821 (Take "(ab|a)b"--please.)
3823 We do a bottom-up calculation of sequences of characters that must appear
3824 in matches of r.e.'s represented by trees rooted at the nodes of the postfix
3826 sequences that must appear at the left of the match ("left")
3827 sequences that must appear at the right of the match ("right")
3828 lists of sequences that must appear somewhere in the match ("in")
3829 sequences that must constitute the match ("is")
3831 When we get to the root of the tree, we use one of the longest of its
3832 calculated "in" sequences as our answer.
3834 The sequences calculated for the various types of node (in pseudo ANSI c)
3835 are shown below. "p" is the operand of unary operators (and the left-hand
3836 operand of binary operators); "q" is the right-hand operand of binary
3839 "ZERO" means "a zero-length sequence" below.
3841 Type left right is in
3842 ---- ---- ----- -- --
3843 char c # c # c # c # c
3845 ANYCHAR ZERO ZERO ZERO ZERO
3847 MBCSET ZERO ZERO ZERO ZERO
3849 CSET ZERO ZERO ZERO ZERO
3851 STAR ZERO ZERO ZERO ZERO
3853 QMARK ZERO ZERO ZERO ZERO
3855 PLUS p->left p->right ZERO p->in
3857 CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus
3858 p->left : q->right : q->is!=ZERO) ? q->in plus
3859 p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
3862 OR longest common longest common (do p->is and substrings common
3863 leading trailing to q->is have same p->in and
3864 (sub)sequence (sub)sequence q->in length and content) ?
3865 of p->left of p->right
3866 and q->left and q->right p->is : NULL
3868 If there's anything else we recognize in the tree, all four sequences get set
3869 to zero-length sequences. If there's something we don't recognize in the
3870 tree, we just return a zero-length sequence.
3872 Break ties in favor of infrequent letters (choosing 'zzz' in preference to
3875 And ... is it here or someplace that we might ponder "optimizations" such as
3876 egrep 'psi|epsilon' -> egrep 'psi'
3877 egrep 'pepsi|epsilon' -> egrep 'epsi'
3878 (Yes, we now find "epsi" as a "string
3879 that must occur", but we might also
3880 simplify the *entire* r.e. being sought)
3881 grep '[c]' -> grep 'c'
3882 grep '(ab|a)b' -> grep 'ab'
3883 grep 'ab*' -> grep 'a'
3884 grep 'a*b' -> grep 'b'
3886 There are several issues:
3888 Is optimization easy (enough)?
3890 Does optimization actually accomplish anything,
3891 or is the automaton you get from "psi|epsilon" (for example)
3892 the same as the one you get from "psi" (for example)?
3894 Are optimizable r.e.'s likely to be used in real-life situations
3895 (something like 'ab*' is probably unlikely; something like is
3896 'psi|epsilon' is likelier)? */
3899 icatalloc (char *old
, char const *new)
3901 size_t newsize
= strlen (new);
3904 size_t oldsize
= strlen (old
);
3905 char *result
= xrealloc (old
, oldsize
+ newsize
+ 1);
3906 memcpy (result
+ oldsize
, new, newsize
+ 1);
3911 freelist (char **cpp
)
3918 enlist (char **cpp
, char *new, size_t len
)
3920 new = memcpy (xmalloc (len
+ 1), new, len
);
3922 /* Is there already something in the list that's new (or longer)? */
3924 for (i
= 0; cpp
[i
] != NULL
; ++i
)
3925 if (strstr (cpp
[i
], new) != NULL
)
3930 /* Eliminate any obsoleted strings. */
3931 for (size_t j
= 0; cpp
[j
] != NULL
; )
3932 if (strstr (new, cpp
[j
]) == NULL
)
3942 /* Add the new string. */
3943 cpp
= xnrealloc (cpp
, i
+ 2, sizeof *cpp
);
3949 /* Given pointers to two strings, return a pointer to an allocated
3950 list of their distinct common substrings. */
3952 comsubs (char *left
, char const *right
)
3954 char **cpp
= xzalloc (sizeof *cpp
);
3956 for (char *lcp
= left
; *lcp
!= '\0'; lcp
++)
3959 char *rcp
= strchr (right
, *lcp
);
3963 for (i
= 1; lcp
[i
] != '\0' && lcp
[i
] == rcp
[i
]; ++i
)
3967 rcp
= strchr (rcp
+ 1, *lcp
);
3970 cpp
= enlist (cpp
, lcp
, len
);
3976 addlists (char **old
, char **new)
3979 old
= enlist (old
, *new, strlen (*new));
3983 /* Given two lists of substrings, return a new list giving substrings
3986 inboth (char **left
, char **right
)
3988 char **both
= xzalloc (sizeof *both
);
3990 for (size_t lnum
= 0; left
[lnum
] != NULL
; ++lnum
)
3992 for (size_t rnum
= 0; right
[rnum
] != NULL
; ++rnum
)
3994 char **temp
= comsubs (left
[lnum
], right
[rnum
]);
3995 both
= addlists (both
, temp
);
4003 typedef struct must must
;
4017 allocmust (must
*mp
, size_t size
)
4019 must
*new_mp
= xmalloc (sizeof *new_mp
);
4020 new_mp
->in
= xzalloc (sizeof *new_mp
->in
);
4021 new_mp
->left
= xzalloc (size
);
4022 new_mp
->right
= xzalloc (size
);
4023 new_mp
->is
= xzalloc (size
);
4024 new_mp
->begline
= false;
4025 new_mp
->endline
= false;
4031 resetmust (must
*mp
)
4035 mp
->left
[0] = mp
->right
[0] = mp
->is
[0] = '\0';
4036 mp
->begline
= false;
4037 mp
->endline
= false;
4052 dfamust (struct dfa
const *d
)
4055 char const *result
= "";
4057 bool begline
= false;
4058 bool endline
= false;
4059 bool need_begline
= false;
4060 bool need_endline
= false;
4061 bool case_fold_unibyte
= d
->syntax
.case_fold
&& MB_CUR_MAX
== 1;
4063 for (size_t ri
= 1; ri
+ 1 < d
->tindex
; ri
++)
4065 token t
= d
->tokens
[ri
];
4069 mp
= allocmust (mp
, 2);
4071 need_begline
= true;
4074 mp
= allocmust (mp
, 2);
4076 need_endline
= true;
4080 assert (!"neither LPAREN nor RPAREN may appear here");
4090 mp
= allocmust (mp
, 2);
4102 must
*lmp
= mp
= mp
->prev
;
4103 size_t j
, ln
, rn
, n
;
4105 /* Guaranteed to be. Unlikely, but ... */
4106 if (streq (lmp
->is
, rmp
->is
))
4108 lmp
->begline
&= rmp
->begline
;
4109 lmp
->endline
&= rmp
->endline
;
4114 lmp
->begline
= false;
4115 lmp
->endline
= false;
4117 /* Left side--easy */
4119 while (lmp
->left
[i
] != '\0' && lmp
->left
[i
] == rmp
->left
[i
])
4121 lmp
->left
[i
] = '\0';
4123 ln
= strlen (lmp
->right
);
4124 rn
= strlen (rmp
->right
);
4128 for (i
= 0; i
< n
; ++i
)
4129 if (lmp
->right
[ln
- i
- 1] != rmp
->right
[rn
- i
- 1])
4131 for (j
= 0; j
< i
; ++j
)
4132 lmp
->right
[j
] = lmp
->right
[(ln
- i
) + j
];
4133 lmp
->right
[j
] = '\0';
4134 new = inboth (lmp
->in
, rmp
->in
);
4148 for (size_t i
= 0; mp
->in
[i
] != NULL
; ++i
)
4149 if (strlen (mp
->in
[i
]) > strlen (result
))
4151 if (streq (result
, mp
->is
))
4153 if ((!need_begline
|| mp
->begline
) && (!need_endline
4156 begline
= mp
->begline
;
4157 endline
= mp
->endline
;
4164 must
*lmp
= mp
= mp
->prev
;
4166 /* In. Everything in left, plus everything in
4167 right, plus concatenation of
4168 left's right and right's left. */
4169 lmp
->in
= addlists (lmp
->in
, rmp
->in
);
4170 if (lmp
->right
[0] != '\0' && rmp
->left
[0] != '\0')
4172 size_t lrlen
= strlen (lmp
->right
);
4173 size_t rllen
= strlen (rmp
->left
);
4174 char *tp
= xmalloc (lrlen
+ rllen
);
4175 memcpy (tp
, lmp
->right
, lrlen
);
4176 memcpy (tp
+ lrlen
, rmp
->left
, rllen
);
4177 lmp
->in
= enlist (lmp
->in
, tp
, lrlen
+ rllen
);
4181 if (lmp
->is
[0] != '\0')
4182 lmp
->left
= icatalloc (lmp
->left
, rmp
->left
);
4184 if (rmp
->is
[0] == '\0')
4185 lmp
->right
[0] = '\0';
4186 lmp
->right
= icatalloc (lmp
->right
, rmp
->right
);
4187 /* Guaranteed to be */
4188 if ((lmp
->is
[0] != '\0' || lmp
->begline
)
4189 && (rmp
->is
[0] != '\0' || rmp
->endline
))
4191 lmp
->is
= icatalloc (lmp
->is
, rmp
->is
);
4192 lmp
->endline
= rmp
->endline
;
4197 lmp
->begline
= false;
4198 lmp
->endline
= false;
4205 /* Not on *my* shift. */
4211 /* If T is a singleton, or if case-folding in a unibyte
4212 locale and T's members all case-fold to the same char,
4213 convert T to one of its members. Otherwise, do
4214 nothing further with T. */
4215 charclass
*ccl
= &d
->charclasses
[t
- CSET
];
4217 for (j
= 0; j
< NOTCHAR
; j
++)
4218 if (tstbit (j
, ccl
))
4220 if (! (j
< NOTCHAR
))
4222 mp
= allocmust (mp
, 2);
4226 while (++j
< NOTCHAR
)
4228 && ! (case_fold_unibyte
4229 && toupper (j
) == toupper (t
)))
4233 mp
= allocmust (mp
, 2);
4239 if (d
->tokens
[ri
+ 1] == CAT
)
4241 for (; rj
< d
->tindex
- 1; rj
+= 2)
4243 if ((rj
!= ri
&& (d
->tokens
[rj
] <= 0
4244 || NOTCHAR
<= d
->tokens
[rj
]))
4245 || d
->tokens
[rj
+ 1] != CAT
)
4249 mp
= allocmust (mp
, ((rj
- ri
) >> 1) + 1);
4250 mp
->is
[0] = mp
->left
[0] = mp
->right
[0]
4251 = case_fold_unibyte
? toupper (t
) : t
;
4254 for (i
= 1; ri
+ 2 < rj
; i
++)
4258 mp
->is
[i
] = mp
->left
[i
] = mp
->right
[i
]
4259 = case_fold_unibyte
? toupper (t
) : t
;
4261 mp
->is
[i
] = mp
->left
[i
] = mp
->right
[i
] = '\0';
4262 mp
->in
= enlist (mp
->in
, mp
->is
, i
);
4268 struct dfamust
*dm
= NULL
;
4271 dm
= xmalloc (sizeof *dm
);
4273 dm
->begline
= begline
;
4274 dm
->endline
= endline
;
4275 dm
->must
= xstrdup (result
);
4280 must
*prev
= mp
->prev
;
4289 dfamustfree (struct dfamust
*dm
)
4298 return xmalloc (sizeof (struct dfa
));
4301 /* Initialize DFA. */
4303 dfasyntax (struct dfa
*dfa
, struct localeinfo
const *linfo
,
4304 reg_syntax_t bits
, int dfaopts
)
4306 memset (dfa
, 0, offsetof (struct dfa
, dfaexec
));
4307 dfa
->dfaexec
= linfo
->multibyte
? dfaexec_mb
: dfaexec_sb
;
4308 dfa
->simple_locale
= using_simple_locale (linfo
->multibyte
);
4309 dfa
->localeinfo
= *linfo
;
4311 dfa
->fast
= !dfa
->localeinfo
.multibyte
;
4314 dfa
->lex
.cur_mb_len
= 1;
4315 dfa
->syntax
.syntax_bits_set
= true;
4316 dfa
->syntax
.case_fold
= (bits
& RE_ICASE
) != 0;
4317 dfa
->syntax
.anchor
= (dfaopts
& DFA_ANCHOR
) != 0;
4318 dfa
->syntax
.eolbyte
= dfaopts
& DFA_EOL_NUL
? '\0' : '\n';
4319 dfa
->syntax
.syntax_bits
= bits
;
4321 for (int i
= CHAR_MIN
; i
<= CHAR_MAX
; ++i
)
4323 unsigned char uc
= i
;
4325 dfa
->syntax
.sbit
[uc
] = char_context (dfa
, uc
);
4326 switch (dfa
->syntax
.sbit
[uc
])
4329 setbit (uc
, &dfa
->syntax
.letters
);
4332 setbit (uc
, &dfa
->syntax
.newline
);
4336 /* POSIX requires that the five bytes in "\n\r./" (including the
4337 terminating NUL) cannot occur inside a multibyte character. */
4338 dfa
->syntax
.never_trail
[uc
] = (dfa
->localeinfo
.using_utf8
4339 ? (uc
& 0xc0) != 0x80
4340 : strchr ("\n\r./", uc
) != NULL
);
4344 /* vim:set shiftwidth=2: */