Merge mktime, timegm from upstream Gnulib
[glibc.git] / posix / regcomp.c
blobe81652f229b69d365203e9b3c7d8c19429fe3321
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
20 #ifdef _LIBC
21 # include <locale/weight.h>
22 #endif
24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 size_t length, reg_syntax_t syntax);
26 static void re_compile_fastmap_iter (regex_t *bufp,
27 const re_dfastate_t *init_state,
28 char *fastmap);
29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30 #ifdef RE_ENABLE_I18N
31 static void free_charset (re_charset_t *cset);
32 #endif /* RE_ENABLE_I18N */
33 static void free_workarea_compile (regex_t *preg);
34 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35 #ifdef RE_ENABLE_I18N
36 static void optimize_utf8 (re_dfa_t *dfa);
37 #endif
38 static reg_errcode_t analyze (regex_t *preg);
39 static reg_errcode_t preorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42 static reg_errcode_t postorder (bin_tree_t *root,
43 reg_errcode_t (fn (void *, bin_tree_t *)),
44 void *extra);
45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 bin_tree_t *node);
49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54 unsigned int constraint);
55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
57 Idx node, bool root);
58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59 static Idx fetch_number (re_string_t *input, re_token_t *token,
60 reg_syntax_t syntax);
61 static int peek_token (re_token_t *token, re_string_t *input,
62 reg_syntax_t syntax);
63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
64 reg_syntax_t syntax, reg_errcode_t *err);
65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 Idx nest, reg_errcode_t *err);
68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 Idx nest, reg_errcode_t *err);
71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 Idx nest, reg_errcode_t *err);
74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
75 re_token_t *token, reg_syntax_t syntax,
76 Idx nest, reg_errcode_t *err);
77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
78 re_dfa_t *dfa, re_token_t *token,
79 reg_syntax_t syntax, reg_errcode_t *err);
80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
81 re_token_t *token, reg_syntax_t syntax,
82 reg_errcode_t *err);
83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84 re_string_t *regexp,
85 re_token_t *token, int token_len,
86 re_dfa_t *dfa,
87 reg_syntax_t syntax,
88 bool accept_hyphen);
89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 re_string_t *regexp,
91 re_token_t *token);
92 #ifdef RE_ENABLE_I18N
93 static reg_errcode_t build_equiv_class (bitset_t sbcset,
94 re_charset_t *mbcset,
95 Idx *equiv_class_alloc,
96 const unsigned char *name);
97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98 bitset_t sbcset,
99 re_charset_t *mbcset,
100 Idx *char_class_alloc,
101 const char *class_name,
102 reg_syntax_t syntax);
103 #else /* not RE_ENABLE_I18N */
104 static reg_errcode_t build_equiv_class (bitset_t sbcset,
105 const unsigned char *name);
106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107 bitset_t sbcset,
108 const char *class_name,
109 reg_syntax_t syntax);
110 #endif /* not RE_ENABLE_I18N */
111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
112 RE_TRANSLATE_TYPE trans,
113 const char *class_name,
114 const char *extra,
115 bool non_match, reg_errcode_t *err);
116 static bin_tree_t *create_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 re_token_type_t type);
119 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
120 bin_tree_t *left, bin_tree_t *right,
121 const re_token_t *token);
122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
123 static void free_token (re_token_t *node);
124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
127 /* This table gives an error message for each of the error codes listed
128 in regex.h. Obviously the order here has to be same as there.
129 POSIX doesn't require that we do anything for REG_NOERROR,
130 but why not be nice? */
132 static const char __re_error_msgid[] =
134 #define REG_NOERROR_IDX 0
135 gettext_noop ("Success") /* REG_NOERROR */
136 "\0"
137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138 gettext_noop ("No match") /* REG_NOMATCH */
139 "\0"
140 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
142 "\0"
143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
145 "\0"
146 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
148 "\0"
149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
151 "\0"
152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
154 "\0"
155 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
156 gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
157 "\0"
158 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
159 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
160 "\0"
161 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
163 "\0"
164 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
166 "\0"
167 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168 gettext_noop ("Invalid range end") /* REG_ERANGE */
169 "\0"
170 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171 gettext_noop ("Memory exhausted") /* REG_ESPACE */
172 "\0"
173 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
175 "\0"
176 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177 gettext_noop ("Premature end of regular expression") /* REG_EEND */
178 "\0"
179 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180 gettext_noop ("Regular expression too big") /* REG_ESIZE */
181 "\0"
182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
186 static const size_t __re_error_msgid_idx[] =
188 REG_NOERROR_IDX,
189 REG_NOMATCH_IDX,
190 REG_BADPAT_IDX,
191 REG_ECOLLATE_IDX,
192 REG_ECTYPE_IDX,
193 REG_EESCAPE_IDX,
194 REG_ESUBREG_IDX,
195 REG_EBRACK_IDX,
196 REG_EPAREN_IDX,
197 REG_EBRACE_IDX,
198 REG_BADBR_IDX,
199 REG_ERANGE_IDX,
200 REG_ESPACE_IDX,
201 REG_BADRPT_IDX,
202 REG_EEND_IDX,
203 REG_ESIZE_IDX,
204 REG_ERPAREN_IDX
207 /* Entry points for GNU code. */
209 /* re_compile_pattern is the GNU regular expression compiler: it
210 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211 Returns 0 if the pattern was valid, otherwise an error string.
213 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214 are set in BUFP on entry. */
216 const char *
217 re_compile_pattern (const char *pattern, size_t length,
218 struct re_pattern_buffer *bufp)
220 reg_errcode_t ret;
222 /* And GNU code determines whether or not to get register information
223 by passing null for the REGS argument to re_match, etc., not by
224 setting no_sub, unless RE_NO_SUB is set. */
225 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
227 /* Match anchors at newline. */
228 bufp->newline_anchor = 1;
230 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
232 if (!ret)
233 return NULL;
234 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
236 #ifdef _LIBC
237 weak_alias (__re_compile_pattern, re_compile_pattern)
238 #endif
240 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
241 also be assigned to arbitrarily: each pattern buffer stores its own
242 syntax, so it can be changed between regex compilations. */
243 /* This has no initializer because initialized variables in Emacs
244 become read-only after dumping. */
245 reg_syntax_t re_syntax_options;
248 /* Specify the precise syntax of regexps for compilation. This provides
249 for compatibility for various utilities which historically have
250 different, incompatible syntaxes.
252 The argument SYNTAX is a bit mask comprised of the various bits
253 defined in regex.h. We return the old syntax. */
255 reg_syntax_t
256 re_set_syntax (reg_syntax_t syntax)
258 reg_syntax_t ret = re_syntax_options;
260 re_syntax_options = syntax;
261 return ret;
263 #ifdef _LIBC
264 weak_alias (__re_set_syntax, re_set_syntax)
265 #endif
268 re_compile_fastmap (struct re_pattern_buffer *bufp)
270 re_dfa_t *dfa = bufp->buffer;
271 char *fastmap = bufp->fastmap;
273 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
274 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
275 if (dfa->init_state != dfa->init_state_word)
276 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
277 if (dfa->init_state != dfa->init_state_nl)
278 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
279 if (dfa->init_state != dfa->init_state_begbuf)
280 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
281 bufp->fastmap_accurate = 1;
282 return 0;
284 #ifdef _LIBC
285 weak_alias (__re_compile_fastmap, re_compile_fastmap)
286 #endif
288 static inline void
289 __attribute__ ((always_inline))
290 re_set_fastmap (char *fastmap, bool icase, int ch)
292 fastmap[ch] = 1;
293 if (icase)
294 fastmap[tolower (ch)] = 1;
297 /* Helper function for re_compile_fastmap.
298 Compile fastmap for the initial_state INIT_STATE. */
300 static void
301 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
302 char *fastmap)
304 re_dfa_t *dfa = bufp->buffer;
305 Idx node_cnt;
306 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
307 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
309 Idx node = init_state->nodes.elems[node_cnt];
310 re_token_type_t type = dfa->nodes[node].type;
312 if (type == CHARACTER)
314 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
315 #ifdef RE_ENABLE_I18N
316 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
318 unsigned char buf[MB_LEN_MAX];
319 unsigned char *p;
320 wchar_t wc;
321 mbstate_t state;
323 p = buf;
324 *p++ = dfa->nodes[node].opr.c;
325 while (++node < dfa->nodes_len
326 && dfa->nodes[node].type == CHARACTER
327 && dfa->nodes[node].mb_partial)
328 *p++ = dfa->nodes[node].opr.c;
329 memset (&state, '\0', sizeof (state));
330 if (__mbrtowc (&wc, (const char *) buf, p - buf,
331 &state) == p - buf
332 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
333 != (size_t) -1))
334 re_set_fastmap (fastmap, false, buf[0]);
336 #endif
338 else if (type == SIMPLE_BRACKET)
340 int i, ch;
341 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
343 int j;
344 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
345 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
346 if (w & ((bitset_word_t) 1 << j))
347 re_set_fastmap (fastmap, icase, ch);
350 #ifdef RE_ENABLE_I18N
351 else if (type == COMPLEX_BRACKET)
353 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
354 Idx i;
356 # ifdef _LIBC
357 /* See if we have to try all bytes which start multiple collation
358 elements.
359 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
360 collation element, and don't catch 'b' since 'b' is
361 the only collation element which starts from 'b' (and
362 it is caught by SIMPLE_BRACKET). */
363 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
364 && (cset->ncoll_syms || cset->nranges))
366 const int32_t *table = (const int32_t *)
367 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
368 for (i = 0; i < SBC_MAX; ++i)
369 if (table[i] < 0)
370 re_set_fastmap (fastmap, icase, i);
372 # endif /* _LIBC */
374 /* See if we have to start the match at all multibyte characters,
375 i.e. where we would not find an invalid sequence. This only
376 applies to multibyte character sets; for single byte character
377 sets, the SIMPLE_BRACKET again suffices. */
378 if (dfa->mb_cur_max > 1
379 && (cset->nchar_classes || cset->non_match || cset->nranges
380 # ifdef _LIBC
381 || cset->nequiv_classes
382 # endif /* _LIBC */
385 unsigned char c = 0;
388 mbstate_t mbs;
389 memset (&mbs, 0, sizeof (mbs));
390 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
391 re_set_fastmap (fastmap, false, (int) c);
393 while (++c != 0);
396 else
398 /* ... Else catch all bytes which can start the mbchars. */
399 for (i = 0; i < cset->nmbchars; ++i)
401 char buf[256];
402 mbstate_t state;
403 memset (&state, '\0', sizeof (state));
404 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
405 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
406 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
408 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
409 != (size_t) -1)
410 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
415 #endif /* RE_ENABLE_I18N */
416 else if (type == OP_PERIOD
417 #ifdef RE_ENABLE_I18N
418 || type == OP_UTF8_PERIOD
419 #endif /* RE_ENABLE_I18N */
420 || type == END_OF_RE)
422 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
423 if (type == END_OF_RE)
424 bufp->can_be_null = 1;
425 return;
430 /* Entry point for POSIX code. */
431 /* regcomp takes a regular expression as a string and compiles it.
433 PREG is a regex_t *. We do not expect any fields to be initialized,
434 since POSIX says we shouldn't. Thus, we set
436 'buffer' to the compiled pattern;
437 'used' to the length of the compiled pattern;
438 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
439 REG_EXTENDED bit in CFLAGS is set; otherwise, to
440 RE_SYNTAX_POSIX_BASIC;
441 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
442 'fastmap' to an allocated space for the fastmap;
443 'fastmap_accurate' to zero;
444 're_nsub' to the number of subexpressions in PATTERN.
446 PATTERN is the address of the pattern string.
448 CFLAGS is a series of bits which affect compilation.
450 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
451 use POSIX basic syntax.
453 If REG_NEWLINE is set, then . and [^...] don't match newline.
454 Also, regexec will try a match beginning after every newline.
456 If REG_ICASE is set, then we considers upper- and lowercase
457 versions of letters to be equivalent when matching.
459 If REG_NOSUB is set, then when PREG is passed to regexec, that
460 routine will report only success or failure, and nothing about the
461 registers.
463 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
464 the return codes and their meanings.) */
467 regcomp (regex_t *_Restrict_ preg, const char *_Restrict_ pattern, int cflags)
469 reg_errcode_t ret;
470 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
471 : RE_SYNTAX_POSIX_BASIC);
473 preg->buffer = NULL;
474 preg->allocated = 0;
475 preg->used = 0;
477 /* Try to allocate space for the fastmap. */
478 preg->fastmap = re_malloc (char, SBC_MAX);
479 if (BE (preg->fastmap == NULL, 0))
480 return REG_ESPACE;
482 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
484 /* If REG_NEWLINE is set, newlines are treated differently. */
485 if (cflags & REG_NEWLINE)
486 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
487 syntax &= ~RE_DOT_NEWLINE;
488 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
489 /* It also changes the matching behavior. */
490 preg->newline_anchor = 1;
492 else
493 preg->newline_anchor = 0;
494 preg->no_sub = !!(cflags & REG_NOSUB);
495 preg->translate = NULL;
497 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
499 /* POSIX doesn't distinguish between an unmatched open-group and an
500 unmatched close-group: both are REG_EPAREN. */
501 if (ret == REG_ERPAREN)
502 ret = REG_EPAREN;
504 /* We have already checked preg->fastmap != NULL. */
505 if (BE (ret == REG_NOERROR, 1))
506 /* Compute the fastmap now, since regexec cannot modify the pattern
507 buffer. This function never fails in this implementation. */
508 (void) re_compile_fastmap (preg);
509 else
511 /* Some error occurred while compiling the expression. */
512 re_free (preg->fastmap);
513 preg->fastmap = NULL;
516 return (int) ret;
518 #ifdef _LIBC
519 libc_hidden_def (__regcomp)
520 weak_alias (__regcomp, regcomp)
521 #endif
523 /* Returns a message corresponding to an error code, ERRCODE, returned
524 from either regcomp or regexec. We don't use PREG here. */
526 size_t
527 regerror (int errcode, const regex_t *_Restrict_ preg, char *_Restrict_ errbuf,
528 size_t errbuf_size)
530 const char *msg;
531 size_t msg_size;
533 if (BE (errcode < 0
534 || errcode >= (int) (sizeof (__re_error_msgid_idx)
535 / sizeof (__re_error_msgid_idx[0])), 0))
536 /* Only error codes returned by the rest of the code should be passed
537 to this routine. If we are given anything else, or if other regex
538 code generates an invalid error code, then the program has a bug.
539 Dump core so we can fix it. */
540 abort ();
542 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
544 msg_size = strlen (msg) + 1; /* Includes the null. */
546 if (BE (errbuf_size != 0, 1))
548 size_t cpy_size = msg_size;
549 if (BE (msg_size > errbuf_size, 0))
551 cpy_size = errbuf_size - 1;
552 errbuf[cpy_size] = '\0';
554 memcpy (errbuf, msg, cpy_size);
557 return msg_size;
559 #ifdef _LIBC
560 weak_alias (__regerror, regerror)
561 #endif
564 #ifdef RE_ENABLE_I18N
565 /* This static array is used for the map to single-byte characters when
566 UTF-8 is used. Otherwise we would allocate memory just to initialize
567 it the same all the time. UTF-8 is the preferred encoding so this is
568 a worthwhile optimization. */
569 static const bitset_t utf8_sb_map =
571 /* Set the first 128 bits. */
572 # if defined __GNUC__ && !defined __STRICT_ANSI__
573 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
574 # else
575 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
576 # error "bitset_word_t is narrower than 32 bits"
577 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
578 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
579 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
580 BITSET_WORD_MAX, BITSET_WORD_MAX,
581 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
582 BITSET_WORD_MAX,
583 # endif
584 (BITSET_WORD_MAX
585 >> (SBC_MAX % BITSET_WORD_BITS == 0
587 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
588 # endif
590 #endif
593 static void
594 free_dfa_content (re_dfa_t *dfa)
596 Idx i, j;
598 if (dfa->nodes)
599 for (i = 0; i < dfa->nodes_len; ++i)
600 free_token (dfa->nodes + i);
601 re_free (dfa->nexts);
602 for (i = 0; i < dfa->nodes_len; ++i)
604 if (dfa->eclosures != NULL)
605 re_node_set_free (dfa->eclosures + i);
606 if (dfa->inveclosures != NULL)
607 re_node_set_free (dfa->inveclosures + i);
608 if (dfa->edests != NULL)
609 re_node_set_free (dfa->edests + i);
611 re_free (dfa->edests);
612 re_free (dfa->eclosures);
613 re_free (dfa->inveclosures);
614 re_free (dfa->nodes);
616 if (dfa->state_table)
617 for (i = 0; i <= dfa->state_hash_mask; ++i)
619 struct re_state_table_entry *entry = dfa->state_table + i;
620 for (j = 0; j < entry->num; ++j)
622 re_dfastate_t *state = entry->array[j];
623 free_state (state);
625 re_free (entry->array);
627 re_free (dfa->state_table);
628 #ifdef RE_ENABLE_I18N
629 if (dfa->sb_char != utf8_sb_map)
630 re_free (dfa->sb_char);
631 #endif
632 re_free (dfa->subexp_map);
633 #ifdef DEBUG
634 re_free (dfa->re_str);
635 #endif
637 re_free (dfa);
641 /* Free dynamically allocated space used by PREG. */
643 void
644 regfree (regex_t *preg)
646 re_dfa_t *dfa = preg->buffer;
647 if (BE (dfa != NULL, 1))
649 lock_fini (dfa->lock);
650 free_dfa_content (dfa);
652 preg->buffer = NULL;
653 preg->allocated = 0;
655 re_free (preg->fastmap);
656 preg->fastmap = NULL;
658 re_free (preg->translate);
659 preg->translate = NULL;
661 #ifdef _LIBC
662 libc_hidden_def (__regfree)
663 weak_alias (__regfree, regfree)
664 #endif
666 /* Entry points compatible with 4.2 BSD regex library. We don't define
667 them unless specifically requested. */
669 #if defined _REGEX_RE_COMP || defined _LIBC
671 /* BSD has one and only one pattern buffer. */
672 static struct re_pattern_buffer re_comp_buf;
674 char *
675 # ifdef _LIBC
676 /* Make these definitions weak in libc, so POSIX programs can redefine
677 these names if they don't use our functions, and still use
678 regcomp/regexec above without link errors. */
679 weak_function
680 # endif
681 re_comp (const char *s)
683 reg_errcode_t ret;
684 char *fastmap;
686 if (!s)
688 if (!re_comp_buf.buffer)
689 return gettext ("No previous regular expression");
690 return 0;
693 if (re_comp_buf.buffer)
695 fastmap = re_comp_buf.fastmap;
696 re_comp_buf.fastmap = NULL;
697 __regfree (&re_comp_buf);
698 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
699 re_comp_buf.fastmap = fastmap;
702 if (re_comp_buf.fastmap == NULL)
704 re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
705 if (re_comp_buf.fastmap == NULL)
706 return (char *) gettext (__re_error_msgid
707 + __re_error_msgid_idx[(int) REG_ESPACE]);
710 /* Since 're_exec' always passes NULL for the 'regs' argument, we
711 don't need to initialize the pattern buffer fields which affect it. */
713 /* Match anchors at newlines. */
714 re_comp_buf.newline_anchor = 1;
716 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
718 if (!ret)
719 return NULL;
721 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
722 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
725 #ifdef _LIBC
726 libc_freeres_fn (free_mem)
728 __regfree (&re_comp_buf);
730 #endif
732 #endif /* _REGEX_RE_COMP */
734 /* Internal entry point.
735 Compile the regular expression PATTERN, whose length is LENGTH.
736 SYNTAX indicate regular expression's syntax. */
738 static reg_errcode_t
739 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
740 reg_syntax_t syntax)
742 reg_errcode_t err = REG_NOERROR;
743 re_dfa_t *dfa;
744 re_string_t regexp;
746 /* Initialize the pattern buffer. */
747 preg->fastmap_accurate = 0;
748 preg->syntax = syntax;
749 preg->not_bol = preg->not_eol = 0;
750 preg->used = 0;
751 preg->re_nsub = 0;
752 preg->can_be_null = 0;
753 preg->regs_allocated = REGS_UNALLOCATED;
755 /* Initialize the dfa. */
756 dfa = preg->buffer;
757 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
759 /* If zero allocated, but buffer is non-null, try to realloc
760 enough space. This loses if buffer's address is bogus, but
761 that is the user's responsibility. If ->buffer is NULL this
762 is a simple allocation. */
763 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
764 if (dfa == NULL)
765 return REG_ESPACE;
766 preg->allocated = sizeof (re_dfa_t);
767 preg->buffer = dfa;
769 preg->used = sizeof (re_dfa_t);
771 err = init_dfa (dfa, length);
772 if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
773 err = REG_ESPACE;
774 if (BE (err != REG_NOERROR, 0))
776 free_dfa_content (dfa);
777 preg->buffer = NULL;
778 preg->allocated = 0;
779 return err;
781 #ifdef DEBUG
782 /* Note: length+1 will not overflow since it is checked in init_dfa. */
783 dfa->re_str = re_malloc (char, length + 1);
784 strncpy (dfa->re_str, pattern, length + 1);
785 #endif
787 err = re_string_construct (&regexp, pattern, length, preg->translate,
788 (syntax & RE_ICASE) != 0, dfa);
789 if (BE (err != REG_NOERROR, 0))
791 re_compile_internal_free_return:
792 free_workarea_compile (preg);
793 re_string_destruct (&regexp);
794 lock_fini (dfa->lock);
795 free_dfa_content (dfa);
796 preg->buffer = NULL;
797 preg->allocated = 0;
798 return err;
801 /* Parse the regular expression, and build a structure tree. */
802 preg->re_nsub = 0;
803 dfa->str_tree = parse (&regexp, preg, syntax, &err);
804 if (BE (dfa->str_tree == NULL, 0))
805 goto re_compile_internal_free_return;
807 /* Analyze the tree and create the nfa. */
808 err = analyze (preg);
809 if (BE (err != REG_NOERROR, 0))
810 goto re_compile_internal_free_return;
812 #ifdef RE_ENABLE_I18N
813 /* If possible, do searching in single byte encoding to speed things up. */
814 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
815 optimize_utf8 (dfa);
816 #endif
818 /* Then create the initial state of the dfa. */
819 err = create_initial_state (dfa);
821 /* Release work areas. */
822 free_workarea_compile (preg);
823 re_string_destruct (&regexp);
825 if (BE (err != REG_NOERROR, 0))
827 lock_fini (dfa->lock);
828 free_dfa_content (dfa);
829 preg->buffer = NULL;
830 preg->allocated = 0;
833 return err;
836 /* Initialize DFA. We use the length of the regular expression PAT_LEN
837 as the initial length of some arrays. */
839 static reg_errcode_t
840 init_dfa (re_dfa_t *dfa, size_t pat_len)
842 __re_size_t table_size;
843 #ifndef _LIBC
844 const char *codeset_name;
845 #endif
846 #ifdef RE_ENABLE_I18N
847 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
848 #else
849 size_t max_i18n_object_size = 0;
850 #endif
851 size_t max_object_size =
852 MAX (sizeof (struct re_state_table_entry),
853 MAX (sizeof (re_token_t),
854 MAX (sizeof (re_node_set),
855 MAX (sizeof (regmatch_t),
856 max_i18n_object_size))));
858 memset (dfa, '\0', sizeof (re_dfa_t));
860 /* Force allocation of str_tree_storage the first time. */
861 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
863 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
864 calculation below, and for similar doubling calculations
865 elsewhere. And it's <= rather than <, because some of the
866 doubling calculations add 1 afterwards. */
867 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
868 return REG_ESPACE;
870 dfa->nodes_alloc = pat_len + 1;
871 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
873 /* table_size = 2 ^ ceil(log pat_len) */
874 for (table_size = 1; ; table_size <<= 1)
875 if (table_size > pat_len)
876 break;
878 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
879 dfa->state_hash_mask = table_size - 1;
881 dfa->mb_cur_max = MB_CUR_MAX;
882 #ifdef _LIBC
883 if (dfa->mb_cur_max == 6
884 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
885 dfa->is_utf8 = 1;
886 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
887 != 0);
888 #else
889 codeset_name = nl_langinfo (CODESET);
890 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
891 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
892 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
893 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
894 dfa->is_utf8 = 1;
896 /* We check exhaustively in the loop below if this charset is a
897 superset of ASCII. */
898 dfa->map_notascii = 0;
899 #endif
901 #ifdef RE_ENABLE_I18N
902 if (dfa->mb_cur_max > 1)
904 if (dfa->is_utf8)
905 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
906 else
908 int i, j, ch;
910 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
911 if (BE (dfa->sb_char == NULL, 0))
912 return REG_ESPACE;
914 /* Set the bits corresponding to single byte chars. */
915 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
916 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
918 wint_t wch = __btowc (ch);
919 if (wch != WEOF)
920 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
921 # ifndef _LIBC
922 if (isascii (ch) && wch != ch)
923 dfa->map_notascii = 1;
924 # endif
928 #endif
930 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
931 return REG_ESPACE;
932 return REG_NOERROR;
935 /* Initialize WORD_CHAR table, which indicate which character is
936 "word". In this case "word" means that it is the word construction
937 character used by some operators like "\<", "\>", etc. */
939 static void
940 init_word_char (re_dfa_t *dfa)
942 int i = 0;
943 int j;
944 int ch = 0;
945 dfa->word_ops_used = 1;
946 if (BE (dfa->map_notascii == 0, 1))
948 /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
949 them, an issue when this code is used in Gnulib. */
950 bitset_word_t bits0 = 0x00000000;
951 bitset_word_t bits1 = 0x03ff0000;
952 bitset_word_t bits2 = 0x87fffffe;
953 bitset_word_t bits3 = 0x07fffffe;
954 if (BITSET_WORD_BITS == 64)
956 /* Pacify gcc -Woverflow on 32-bit platformns. */
957 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
958 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
959 i = 2;
961 else if (BITSET_WORD_BITS == 32)
963 dfa->word_char[0] = bits0;
964 dfa->word_char[1] = bits1;
965 dfa->word_char[2] = bits2;
966 dfa->word_char[3] = bits3;
967 i = 4;
969 else
970 goto general_case;
971 ch = 128;
973 if (BE (dfa->is_utf8, 1))
975 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
976 return;
980 general_case:
981 for (; i < BITSET_WORDS; ++i)
982 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
983 if (isalnum (ch) || ch == '_')
984 dfa->word_char[i] |= (bitset_word_t) 1 << j;
987 /* Free the work area which are only used while compiling. */
989 static void
990 free_workarea_compile (regex_t *preg)
992 re_dfa_t *dfa = preg->buffer;
993 bin_tree_storage_t *storage, *next;
994 for (storage = dfa->str_tree_storage; storage; storage = next)
996 next = storage->next;
997 re_free (storage);
999 dfa->str_tree_storage = NULL;
1000 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1001 dfa->str_tree = NULL;
1002 re_free (dfa->org_indices);
1003 dfa->org_indices = NULL;
1006 /* Create initial states for all contexts. */
1008 static reg_errcode_t
1009 create_initial_state (re_dfa_t *dfa)
1011 Idx first, i;
1012 reg_errcode_t err;
1013 re_node_set init_nodes;
1015 /* Initial states have the epsilon closure of the node which is
1016 the first node of the regular expression. */
1017 first = dfa->str_tree->first->node_idx;
1018 dfa->init_node = first;
1019 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1020 if (BE (err != REG_NOERROR, 0))
1021 return err;
1023 /* The back-references which are in initial states can epsilon transit,
1024 since in this case all of the subexpressions can be null.
1025 Then we add epsilon closures of the nodes which are the next nodes of
1026 the back-references. */
1027 if (dfa->nbackref > 0)
1028 for (i = 0; i < init_nodes.nelem; ++i)
1030 Idx node_idx = init_nodes.elems[i];
1031 re_token_type_t type = dfa->nodes[node_idx].type;
1033 Idx clexp_idx;
1034 if (type != OP_BACK_REF)
1035 continue;
1036 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1038 re_token_t *clexp_node;
1039 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1040 if (clexp_node->type == OP_CLOSE_SUBEXP
1041 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1042 break;
1044 if (clexp_idx == init_nodes.nelem)
1045 continue;
1047 if (type == OP_BACK_REF)
1049 Idx dest_idx = dfa->edests[node_idx].elems[0];
1050 if (!re_node_set_contains (&init_nodes, dest_idx))
1052 reg_errcode_t merge_err
1053 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1054 if (merge_err != REG_NOERROR)
1055 return merge_err;
1056 i = 0;
1061 /* It must be the first time to invoke acquire_state. */
1062 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1063 /* We don't check ERR here, since the initial state must not be NULL. */
1064 if (BE (dfa->init_state == NULL, 0))
1065 return err;
1066 if (dfa->init_state->has_constraint)
1068 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1069 CONTEXT_WORD);
1070 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1071 CONTEXT_NEWLINE);
1072 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1073 &init_nodes,
1074 CONTEXT_NEWLINE
1075 | CONTEXT_BEGBUF);
1076 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1077 || dfa->init_state_begbuf == NULL, 0))
1078 return err;
1080 else
1081 dfa->init_state_word = dfa->init_state_nl
1082 = dfa->init_state_begbuf = dfa->init_state;
1084 re_node_set_free (&init_nodes);
1085 return REG_NOERROR;
1088 #ifdef RE_ENABLE_I18N
1089 /* If it is possible to do searching in single byte encoding instead of UTF-8
1090 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1091 DFA nodes where needed. */
1093 static void
1094 optimize_utf8 (re_dfa_t *dfa)
1096 Idx node;
1097 int i;
1098 bool mb_chars = false;
1099 bool has_period = false;
1101 for (node = 0; node < dfa->nodes_len; ++node)
1102 switch (dfa->nodes[node].type)
1104 case CHARACTER:
1105 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1106 mb_chars = true;
1107 break;
1108 case ANCHOR:
1109 switch (dfa->nodes[node].opr.ctx_type)
1111 case LINE_FIRST:
1112 case LINE_LAST:
1113 case BUF_FIRST:
1114 case BUF_LAST:
1115 break;
1116 default:
1117 /* Word anchors etc. cannot be handled. It's okay to test
1118 opr.ctx_type since constraints (for all DFA nodes) are
1119 created by ORing one or more opr.ctx_type values. */
1120 return;
1122 break;
1123 case OP_PERIOD:
1124 has_period = true;
1125 break;
1126 case OP_BACK_REF:
1127 case OP_ALT:
1128 case END_OF_RE:
1129 case OP_DUP_ASTERISK:
1130 case OP_OPEN_SUBEXP:
1131 case OP_CLOSE_SUBEXP:
1132 break;
1133 case COMPLEX_BRACKET:
1134 return;
1135 case SIMPLE_BRACKET:
1136 /* Just double check. */
1138 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1140 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1141 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1143 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1144 return;
1145 rshift = 0;
1148 break;
1149 default:
1150 abort ();
1153 if (mb_chars || has_period)
1154 for (node = 0; node < dfa->nodes_len; ++node)
1156 if (dfa->nodes[node].type == CHARACTER
1157 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1158 dfa->nodes[node].mb_partial = 0;
1159 else if (dfa->nodes[node].type == OP_PERIOD)
1160 dfa->nodes[node].type = OP_UTF8_PERIOD;
1163 /* The search can be in single byte locale. */
1164 dfa->mb_cur_max = 1;
1165 dfa->is_utf8 = 0;
1166 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1168 #endif
1170 /* Analyze the structure tree, and calculate "first", "next", "edest",
1171 "eclosure", and "inveclosure". */
1173 static reg_errcode_t
1174 analyze (regex_t *preg)
1176 re_dfa_t *dfa = preg->buffer;
1177 reg_errcode_t ret;
1179 /* Allocate arrays. */
1180 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1181 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1182 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1183 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1184 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1185 || dfa->eclosures == NULL, 0))
1186 return REG_ESPACE;
1188 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1189 if (dfa->subexp_map != NULL)
1191 Idx i;
1192 for (i = 0; i < preg->re_nsub; i++)
1193 dfa->subexp_map[i] = i;
1194 preorder (dfa->str_tree, optimize_subexps, dfa);
1195 for (i = 0; i < preg->re_nsub; i++)
1196 if (dfa->subexp_map[i] != i)
1197 break;
1198 if (i == preg->re_nsub)
1200 re_free (dfa->subexp_map);
1201 dfa->subexp_map = NULL;
1205 ret = postorder (dfa->str_tree, lower_subexps, preg);
1206 if (BE (ret != REG_NOERROR, 0))
1207 return ret;
1208 ret = postorder (dfa->str_tree, calc_first, dfa);
1209 if (BE (ret != REG_NOERROR, 0))
1210 return ret;
1211 preorder (dfa->str_tree, calc_next, dfa);
1212 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1213 if (BE (ret != REG_NOERROR, 0))
1214 return ret;
1215 ret = calc_eclosure (dfa);
1216 if (BE (ret != REG_NOERROR, 0))
1217 return ret;
1219 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1220 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1221 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1222 || dfa->nbackref)
1224 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1225 if (BE (dfa->inveclosures == NULL, 0))
1226 return REG_ESPACE;
1227 ret = calc_inveclosure (dfa);
1230 return ret;
1233 /* Our parse trees are very unbalanced, so we cannot use a stack to
1234 implement parse tree visits. Instead, we use parent pointers and
1235 some hairy code in these two functions. */
1236 static reg_errcode_t
1237 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1238 void *extra)
1240 bin_tree_t *node, *prev;
1242 for (node = root; ; )
1244 /* Descend down the tree, preferably to the left (or to the right
1245 if that's the only child). */
1246 while (node->left || node->right)
1247 if (node->left)
1248 node = node->left;
1249 else
1250 node = node->right;
1254 reg_errcode_t err = fn (extra, node);
1255 if (BE (err != REG_NOERROR, 0))
1256 return err;
1257 if (node->parent == NULL)
1258 return REG_NOERROR;
1259 prev = node;
1260 node = node->parent;
1262 /* Go up while we have a node that is reached from the right. */
1263 while (node->right == prev || node->right == NULL);
1264 node = node->right;
1268 static reg_errcode_t
1269 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1270 void *extra)
1272 bin_tree_t *node;
1274 for (node = root; ; )
1276 reg_errcode_t err = fn (extra, node);
1277 if (BE (err != REG_NOERROR, 0))
1278 return err;
1280 /* Go to the left node, or up and to the right. */
1281 if (node->left)
1282 node = node->left;
1283 else
1285 bin_tree_t *prev = NULL;
1286 while (node->right == prev || node->right == NULL)
1288 prev = node;
1289 node = node->parent;
1290 if (!node)
1291 return REG_NOERROR;
1293 node = node->right;
1298 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1299 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1300 backreferences as well. Requires a preorder visit. */
1301 static reg_errcode_t
1302 optimize_subexps (void *extra, bin_tree_t *node)
1304 re_dfa_t *dfa = (re_dfa_t *) extra;
1306 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1308 int idx = node->token.opr.idx;
1309 node->token.opr.idx = dfa->subexp_map[idx];
1310 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1313 else if (node->token.type == SUBEXP
1314 && node->left && node->left->token.type == SUBEXP)
1316 Idx other_idx = node->left->token.opr.idx;
1318 node->left = node->left->left;
1319 if (node->left)
1320 node->left->parent = node;
1322 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1323 if (other_idx < BITSET_WORD_BITS)
1324 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1327 return REG_NOERROR;
1330 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1331 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1332 static reg_errcode_t
1333 lower_subexps (void *extra, bin_tree_t *node)
1335 regex_t *preg = (regex_t *) extra;
1336 reg_errcode_t err = REG_NOERROR;
1338 if (node->left && node->left->token.type == SUBEXP)
1340 node->left = lower_subexp (&err, preg, node->left);
1341 if (node->left)
1342 node->left->parent = node;
1344 if (node->right && node->right->token.type == SUBEXP)
1346 node->right = lower_subexp (&err, preg, node->right);
1347 if (node->right)
1348 node->right->parent = node;
1351 return err;
1354 static bin_tree_t *
1355 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1357 re_dfa_t *dfa = preg->buffer;
1358 bin_tree_t *body = node->left;
1359 bin_tree_t *op, *cls, *tree1, *tree;
1361 if (preg->no_sub
1362 /* We do not optimize empty subexpressions, because otherwise we may
1363 have bad CONCAT nodes with NULL children. This is obviously not
1364 very common, so we do not lose much. An example that triggers
1365 this case is the sed "script" /\(\)/x. */
1366 && node->left != NULL
1367 && (node->token.opr.idx >= BITSET_WORD_BITS
1368 || !(dfa->used_bkref_map
1369 & ((bitset_word_t) 1 << node->token.opr.idx))))
1370 return node->left;
1372 /* Convert the SUBEXP node to the concatenation of an
1373 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1374 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1375 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1376 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1377 tree = create_tree (dfa, op, tree1, CONCAT);
1378 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1380 *err = REG_ESPACE;
1381 return NULL;
1384 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1385 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1386 return tree;
1389 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1390 nodes. Requires a postorder visit. */
1391 static reg_errcode_t
1392 calc_first (void *extra, bin_tree_t *node)
1394 re_dfa_t *dfa = (re_dfa_t *) extra;
1395 if (node->token.type == CONCAT)
1397 node->first = node->left->first;
1398 node->node_idx = node->left->node_idx;
1400 else
1402 node->first = node;
1403 node->node_idx = re_dfa_add_node (dfa, node->token);
1404 if (BE (node->node_idx == -1, 0))
1405 return REG_ESPACE;
1406 if (node->token.type == ANCHOR)
1407 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1409 return REG_NOERROR;
1412 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1413 static reg_errcode_t
1414 calc_next (void *extra, bin_tree_t *node)
1416 switch (node->token.type)
1418 case OP_DUP_ASTERISK:
1419 node->left->next = node;
1420 break;
1421 case CONCAT:
1422 node->left->next = node->right->first;
1423 node->right->next = node->next;
1424 break;
1425 default:
1426 if (node->left)
1427 node->left->next = node->next;
1428 if (node->right)
1429 node->right->next = node->next;
1430 break;
1432 return REG_NOERROR;
1435 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1436 static reg_errcode_t
1437 link_nfa_nodes (void *extra, bin_tree_t *node)
1439 re_dfa_t *dfa = (re_dfa_t *) extra;
1440 Idx idx = node->node_idx;
1441 reg_errcode_t err = REG_NOERROR;
1443 switch (node->token.type)
1445 case CONCAT:
1446 break;
1448 case END_OF_RE:
1449 assert (node->next == NULL);
1450 break;
1452 case OP_DUP_ASTERISK:
1453 case OP_ALT:
1455 Idx left, right;
1456 dfa->has_plural_match = 1;
1457 if (node->left != NULL)
1458 left = node->left->first->node_idx;
1459 else
1460 left = node->next->node_idx;
1461 if (node->right != NULL)
1462 right = node->right->first->node_idx;
1463 else
1464 right = node->next->node_idx;
1465 assert (left > -1);
1466 assert (right > -1);
1467 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1469 break;
1471 case ANCHOR:
1472 case OP_OPEN_SUBEXP:
1473 case OP_CLOSE_SUBEXP:
1474 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1475 break;
1477 case OP_BACK_REF:
1478 dfa->nexts[idx] = node->next->node_idx;
1479 if (node->token.type == OP_BACK_REF)
1480 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1481 break;
1483 default:
1484 assert (!IS_EPSILON_NODE (node->token.type));
1485 dfa->nexts[idx] = node->next->node_idx;
1486 break;
1489 return err;
1492 /* Duplicate the epsilon closure of the node ROOT_NODE.
1493 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1494 to their own constraint. */
1496 static reg_errcode_t
1497 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1498 Idx root_node, unsigned int init_constraint)
1500 Idx org_node, clone_node;
1501 bool ok;
1502 unsigned int constraint = init_constraint;
1503 for (org_node = top_org_node, clone_node = top_clone_node;;)
1505 Idx org_dest, clone_dest;
1506 if (dfa->nodes[org_node].type == OP_BACK_REF)
1508 /* If the back reference epsilon-transit, its destination must
1509 also have the constraint. Then duplicate the epsilon closure
1510 of the destination of the back reference, and store it in
1511 edests of the back reference. */
1512 org_dest = dfa->nexts[org_node];
1513 re_node_set_empty (dfa->edests + clone_node);
1514 clone_dest = duplicate_node (dfa, org_dest, constraint);
1515 if (BE (clone_dest == -1, 0))
1516 return REG_ESPACE;
1517 dfa->nexts[clone_node] = dfa->nexts[org_node];
1518 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1519 if (BE (! ok, 0))
1520 return REG_ESPACE;
1522 else if (dfa->edests[org_node].nelem == 0)
1524 /* In case of the node can't epsilon-transit, don't duplicate the
1525 destination and store the original destination as the
1526 destination of the node. */
1527 dfa->nexts[clone_node] = dfa->nexts[org_node];
1528 break;
1530 else if (dfa->edests[org_node].nelem == 1)
1532 /* In case of the node can epsilon-transit, and it has only one
1533 destination. */
1534 org_dest = dfa->edests[org_node].elems[0];
1535 re_node_set_empty (dfa->edests + clone_node);
1536 /* If the node is root_node itself, it means the epsilon closure
1537 has a loop. Then tie it to the destination of the root_node. */
1538 if (org_node == root_node && clone_node != org_node)
1540 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1541 if (BE (! ok, 0))
1542 return REG_ESPACE;
1543 break;
1545 /* In case the node has another constraint, append it. */
1546 constraint |= dfa->nodes[org_node].constraint;
1547 clone_dest = duplicate_node (dfa, org_dest, constraint);
1548 if (BE (clone_dest == -1, 0))
1549 return REG_ESPACE;
1550 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1551 if (BE (! ok, 0))
1552 return REG_ESPACE;
1554 else /* dfa->edests[org_node].nelem == 2 */
1556 /* In case of the node can epsilon-transit, and it has two
1557 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1558 org_dest = dfa->edests[org_node].elems[0];
1559 re_node_set_empty (dfa->edests + clone_node);
1560 /* Search for a duplicated node which satisfies the constraint. */
1561 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1562 if (clone_dest == -1)
1564 /* There is no such duplicated node, create a new one. */
1565 reg_errcode_t err;
1566 clone_dest = duplicate_node (dfa, org_dest, constraint);
1567 if (BE (clone_dest == -1, 0))
1568 return REG_ESPACE;
1569 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1570 if (BE (! ok, 0))
1571 return REG_ESPACE;
1572 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1573 root_node, constraint);
1574 if (BE (err != REG_NOERROR, 0))
1575 return err;
1577 else
1579 /* There is a duplicated node which satisfies the constraint,
1580 use it to avoid infinite loop. */
1581 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1582 if (BE (! ok, 0))
1583 return REG_ESPACE;
1586 org_dest = dfa->edests[org_node].elems[1];
1587 clone_dest = duplicate_node (dfa, org_dest, constraint);
1588 if (BE (clone_dest == -1, 0))
1589 return REG_ESPACE;
1590 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1591 if (BE (! ok, 0))
1592 return REG_ESPACE;
1594 org_node = org_dest;
1595 clone_node = clone_dest;
1597 return REG_NOERROR;
1600 /* Search for a node which is duplicated from the node ORG_NODE, and
1601 satisfies the constraint CONSTRAINT. */
1603 static Idx
1604 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1605 unsigned int constraint)
1607 Idx idx;
1608 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1610 if (org_node == dfa->org_indices[idx]
1611 && constraint == dfa->nodes[idx].constraint)
1612 return idx; /* Found. */
1614 return -1; /* Not found. */
1617 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1618 Return the index of the new node, or -1 if insufficient storage is
1619 available. */
1621 static Idx
1622 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1624 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1625 if (BE (dup_idx != -1, 1))
1627 dfa->nodes[dup_idx].constraint = constraint;
1628 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1629 dfa->nodes[dup_idx].duplicated = 1;
1631 /* Store the index of the original node. */
1632 dfa->org_indices[dup_idx] = org_idx;
1634 return dup_idx;
1637 static reg_errcode_t
1638 calc_inveclosure (re_dfa_t *dfa)
1640 Idx src, idx;
1641 bool ok;
1642 for (idx = 0; idx < dfa->nodes_len; ++idx)
1643 re_node_set_init_empty (dfa->inveclosures + idx);
1645 for (src = 0; src < dfa->nodes_len; ++src)
1647 Idx *elems = dfa->eclosures[src].elems;
1648 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1650 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1651 if (BE (! ok, 0))
1652 return REG_ESPACE;
1656 return REG_NOERROR;
1659 /* Calculate "eclosure" for all the node in DFA. */
1661 static reg_errcode_t
1662 calc_eclosure (re_dfa_t *dfa)
1664 Idx node_idx;
1665 bool incomplete;
1666 #ifdef DEBUG
1667 assert (dfa->nodes_len > 0);
1668 #endif
1669 incomplete = false;
1670 /* For each nodes, calculate epsilon closure. */
1671 for (node_idx = 0; ; ++node_idx)
1673 reg_errcode_t err;
1674 re_node_set eclosure_elem;
1675 if (node_idx == dfa->nodes_len)
1677 if (!incomplete)
1678 break;
1679 incomplete = false;
1680 node_idx = 0;
1683 #ifdef DEBUG
1684 assert (dfa->eclosures[node_idx].nelem != -1);
1685 #endif
1687 /* If we have already calculated, skip it. */
1688 if (dfa->eclosures[node_idx].nelem != 0)
1689 continue;
1690 /* Calculate epsilon closure of 'node_idx'. */
1691 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1692 if (BE (err != REG_NOERROR, 0))
1693 return err;
1695 if (dfa->eclosures[node_idx].nelem == 0)
1697 incomplete = true;
1698 re_node_set_free (&eclosure_elem);
1701 return REG_NOERROR;
1704 /* Calculate epsilon closure of NODE. */
1706 static reg_errcode_t
1707 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1709 reg_errcode_t err;
1710 Idx i;
1711 re_node_set eclosure;
1712 bool ok;
1713 bool incomplete = false;
1714 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1715 if (BE (err != REG_NOERROR, 0))
1716 return err;
1718 /* This indicates that we are calculating this node now.
1719 We reference this value to avoid infinite loop. */
1720 dfa->eclosures[node].nelem = -1;
1722 /* If the current node has constraints, duplicate all nodes
1723 since they must inherit the constraints. */
1724 if (dfa->nodes[node].constraint
1725 && dfa->edests[node].nelem
1726 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1728 err = duplicate_node_closure (dfa, node, node, node,
1729 dfa->nodes[node].constraint);
1730 if (BE (err != REG_NOERROR, 0))
1731 return err;
1734 /* Expand each epsilon destination nodes. */
1735 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1736 for (i = 0; i < dfa->edests[node].nelem; ++i)
1738 re_node_set eclosure_elem;
1739 Idx edest = dfa->edests[node].elems[i];
1740 /* If calculating the epsilon closure of 'edest' is in progress,
1741 return intermediate result. */
1742 if (dfa->eclosures[edest].nelem == -1)
1744 incomplete = true;
1745 continue;
1747 /* If we haven't calculated the epsilon closure of 'edest' yet,
1748 calculate now. Otherwise use calculated epsilon closure. */
1749 if (dfa->eclosures[edest].nelem == 0)
1751 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1752 if (BE (err != REG_NOERROR, 0))
1753 return err;
1755 else
1756 eclosure_elem = dfa->eclosures[edest];
1757 /* Merge the epsilon closure of 'edest'. */
1758 err = re_node_set_merge (&eclosure, &eclosure_elem);
1759 if (BE (err != REG_NOERROR, 0))
1760 return err;
1761 /* If the epsilon closure of 'edest' is incomplete,
1762 the epsilon closure of this node is also incomplete. */
1763 if (dfa->eclosures[edest].nelem == 0)
1765 incomplete = true;
1766 re_node_set_free (&eclosure_elem);
1770 /* An epsilon closure includes itself. */
1771 ok = re_node_set_insert (&eclosure, node);
1772 if (BE (! ok, 0))
1773 return REG_ESPACE;
1774 if (incomplete && !root)
1775 dfa->eclosures[node].nelem = 0;
1776 else
1777 dfa->eclosures[node] = eclosure;
1778 *new_set = eclosure;
1779 return REG_NOERROR;
1782 /* Functions for token which are used in the parser. */
1784 /* Fetch a token from INPUT.
1785 We must not use this function inside bracket expressions. */
1787 static void
1788 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1790 re_string_skip_bytes (input, peek_token (result, input, syntax));
1793 /* Peek a token from INPUT, and return the length of the token.
1794 We must not use this function inside bracket expressions. */
1796 static int
1797 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1799 unsigned char c;
1801 if (re_string_eoi (input))
1803 token->type = END_OF_RE;
1804 return 0;
1807 c = re_string_peek_byte (input, 0);
1808 token->opr.c = c;
1810 token->word_char = 0;
1811 #ifdef RE_ENABLE_I18N
1812 token->mb_partial = 0;
1813 if (input->mb_cur_max > 1 &&
1814 !re_string_first_byte (input, re_string_cur_idx (input)))
1816 token->type = CHARACTER;
1817 token->mb_partial = 1;
1818 return 1;
1820 #endif
1821 if (c == '\\')
1823 unsigned char c2;
1824 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1826 token->type = BACK_SLASH;
1827 return 1;
1830 c2 = re_string_peek_byte_case (input, 1);
1831 token->opr.c = c2;
1832 token->type = CHARACTER;
1833 #ifdef RE_ENABLE_I18N
1834 if (input->mb_cur_max > 1)
1836 wint_t wc = re_string_wchar_at (input,
1837 re_string_cur_idx (input) + 1);
1838 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1840 else
1841 #endif
1842 token->word_char = IS_WORD_CHAR (c2) != 0;
1844 switch (c2)
1846 case '|':
1847 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1848 token->type = OP_ALT;
1849 break;
1850 case '1': case '2': case '3': case '4': case '5':
1851 case '6': case '7': case '8': case '9':
1852 if (!(syntax & RE_NO_BK_REFS))
1854 token->type = OP_BACK_REF;
1855 token->opr.idx = c2 - '1';
1857 break;
1858 case '<':
1859 if (!(syntax & RE_NO_GNU_OPS))
1861 token->type = ANCHOR;
1862 token->opr.ctx_type = WORD_FIRST;
1864 break;
1865 case '>':
1866 if (!(syntax & RE_NO_GNU_OPS))
1868 token->type = ANCHOR;
1869 token->opr.ctx_type = WORD_LAST;
1871 break;
1872 case 'b':
1873 if (!(syntax & RE_NO_GNU_OPS))
1875 token->type = ANCHOR;
1876 token->opr.ctx_type = WORD_DELIM;
1878 break;
1879 case 'B':
1880 if (!(syntax & RE_NO_GNU_OPS))
1882 token->type = ANCHOR;
1883 token->opr.ctx_type = NOT_WORD_DELIM;
1885 break;
1886 case 'w':
1887 if (!(syntax & RE_NO_GNU_OPS))
1888 token->type = OP_WORD;
1889 break;
1890 case 'W':
1891 if (!(syntax & RE_NO_GNU_OPS))
1892 token->type = OP_NOTWORD;
1893 break;
1894 case 's':
1895 if (!(syntax & RE_NO_GNU_OPS))
1896 token->type = OP_SPACE;
1897 break;
1898 case 'S':
1899 if (!(syntax & RE_NO_GNU_OPS))
1900 token->type = OP_NOTSPACE;
1901 break;
1902 case '`':
1903 if (!(syntax & RE_NO_GNU_OPS))
1905 token->type = ANCHOR;
1906 token->opr.ctx_type = BUF_FIRST;
1908 break;
1909 case '\'':
1910 if (!(syntax & RE_NO_GNU_OPS))
1912 token->type = ANCHOR;
1913 token->opr.ctx_type = BUF_LAST;
1915 break;
1916 case '(':
1917 if (!(syntax & RE_NO_BK_PARENS))
1918 token->type = OP_OPEN_SUBEXP;
1919 break;
1920 case ')':
1921 if (!(syntax & RE_NO_BK_PARENS))
1922 token->type = OP_CLOSE_SUBEXP;
1923 break;
1924 case '+':
1925 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1926 token->type = OP_DUP_PLUS;
1927 break;
1928 case '?':
1929 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1930 token->type = OP_DUP_QUESTION;
1931 break;
1932 case '{':
1933 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1934 token->type = OP_OPEN_DUP_NUM;
1935 break;
1936 case '}':
1937 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1938 token->type = OP_CLOSE_DUP_NUM;
1939 break;
1940 default:
1941 break;
1943 return 2;
1946 token->type = CHARACTER;
1947 #ifdef RE_ENABLE_I18N
1948 if (input->mb_cur_max > 1)
1950 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1951 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1953 else
1954 #endif
1955 token->word_char = IS_WORD_CHAR (token->opr.c);
1957 switch (c)
1959 case '\n':
1960 if (syntax & RE_NEWLINE_ALT)
1961 token->type = OP_ALT;
1962 break;
1963 case '|':
1964 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1965 token->type = OP_ALT;
1966 break;
1967 case '*':
1968 token->type = OP_DUP_ASTERISK;
1969 break;
1970 case '+':
1971 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1972 token->type = OP_DUP_PLUS;
1973 break;
1974 case '?':
1975 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1976 token->type = OP_DUP_QUESTION;
1977 break;
1978 case '{':
1979 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1980 token->type = OP_OPEN_DUP_NUM;
1981 break;
1982 case '}':
1983 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1984 token->type = OP_CLOSE_DUP_NUM;
1985 break;
1986 case '(':
1987 if (syntax & RE_NO_BK_PARENS)
1988 token->type = OP_OPEN_SUBEXP;
1989 break;
1990 case ')':
1991 if (syntax & RE_NO_BK_PARENS)
1992 token->type = OP_CLOSE_SUBEXP;
1993 break;
1994 case '[':
1995 token->type = OP_OPEN_BRACKET;
1996 break;
1997 case '.':
1998 token->type = OP_PERIOD;
1999 break;
2000 case '^':
2001 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2002 re_string_cur_idx (input) != 0)
2004 char prev = re_string_peek_byte (input, -1);
2005 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2006 break;
2008 token->type = ANCHOR;
2009 token->opr.ctx_type = LINE_FIRST;
2010 break;
2011 case '$':
2012 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2013 re_string_cur_idx (input) + 1 != re_string_length (input))
2015 re_token_t next;
2016 re_string_skip_bytes (input, 1);
2017 peek_token (&next, input, syntax);
2018 re_string_skip_bytes (input, -1);
2019 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2020 break;
2022 token->type = ANCHOR;
2023 token->opr.ctx_type = LINE_LAST;
2024 break;
2025 default:
2026 break;
2028 return 1;
2031 /* Peek a token from INPUT, and return the length of the token.
2032 We must not use this function out of bracket expressions. */
2034 static int
2035 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2037 unsigned char c;
2038 if (re_string_eoi (input))
2040 token->type = END_OF_RE;
2041 return 0;
2043 c = re_string_peek_byte (input, 0);
2044 token->opr.c = c;
2046 #ifdef RE_ENABLE_I18N
2047 if (input->mb_cur_max > 1 &&
2048 !re_string_first_byte (input, re_string_cur_idx (input)))
2050 token->type = CHARACTER;
2051 return 1;
2053 #endif /* RE_ENABLE_I18N */
2055 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2056 && re_string_cur_idx (input) + 1 < re_string_length (input))
2058 /* In this case, '\' escape a character. */
2059 unsigned char c2;
2060 re_string_skip_bytes (input, 1);
2061 c2 = re_string_peek_byte (input, 0);
2062 token->opr.c = c2;
2063 token->type = CHARACTER;
2064 return 1;
2066 if (c == '[') /* '[' is a special char in a bracket exps. */
2068 unsigned char c2;
2069 int token_len;
2070 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2071 c2 = re_string_peek_byte (input, 1);
2072 else
2073 c2 = 0;
2074 token->opr.c = c2;
2075 token_len = 2;
2076 switch (c2)
2078 case '.':
2079 token->type = OP_OPEN_COLL_ELEM;
2080 break;
2082 case '=':
2083 token->type = OP_OPEN_EQUIV_CLASS;
2084 break;
2086 case ':':
2087 if (syntax & RE_CHAR_CLASSES)
2089 token->type = OP_OPEN_CHAR_CLASS;
2090 break;
2092 FALLTHROUGH;
2093 default:
2094 token->type = CHARACTER;
2095 token->opr.c = c;
2096 token_len = 1;
2097 break;
2099 return token_len;
2101 switch (c)
2103 case '-':
2104 token->type = OP_CHARSET_RANGE;
2105 break;
2106 case ']':
2107 token->type = OP_CLOSE_BRACKET;
2108 break;
2109 case '^':
2110 token->type = OP_NON_MATCH_LIST;
2111 break;
2112 default:
2113 token->type = CHARACTER;
2115 return 1;
2118 /* Functions for parser. */
2120 /* Entry point of the parser.
2121 Parse the regular expression REGEXP and return the structure tree.
2122 If an error occurs, ERR is set by error code, and return NULL.
2123 This function build the following tree, from regular expression <reg_exp>:
2127 <reg_exp> EOR
2129 CAT means concatenation.
2130 EOR means end of regular expression. */
2132 static bin_tree_t *
2133 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2134 reg_errcode_t *err)
2136 re_dfa_t *dfa = preg->buffer;
2137 bin_tree_t *tree, *eor, *root;
2138 re_token_t current_token;
2139 dfa->syntax = syntax;
2140 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2141 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2142 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2143 return NULL;
2144 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2145 if (tree != NULL)
2146 root = create_tree (dfa, tree, eor, CONCAT);
2147 else
2148 root = eor;
2149 if (BE (eor == NULL || root == NULL, 0))
2151 *err = REG_ESPACE;
2152 return NULL;
2154 return root;
2157 /* This function build the following tree, from regular expression
2158 <branch1>|<branch2>:
2162 <branch1> <branch2>
2164 ALT means alternative, which represents the operator '|'. */
2166 static bin_tree_t *
2167 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2168 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2170 re_dfa_t *dfa = preg->buffer;
2171 bin_tree_t *tree, *branch = NULL;
2172 bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2173 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2174 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2175 return NULL;
2177 while (token->type == OP_ALT)
2179 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2180 if (token->type != OP_ALT && token->type != END_OF_RE
2181 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2183 bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2184 dfa->completed_bkref_map = initial_bkref_map;
2185 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2186 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2188 if (tree != NULL)
2189 postorder (tree, free_tree, NULL);
2190 return NULL;
2192 dfa->completed_bkref_map |= accumulated_bkref_map;
2194 else
2195 branch = NULL;
2196 tree = create_tree (dfa, tree, branch, OP_ALT);
2197 if (BE (tree == NULL, 0))
2199 *err = REG_ESPACE;
2200 return NULL;
2203 return tree;
2206 /* This function build the following tree, from regular expression
2207 <exp1><exp2>:
2211 <exp1> <exp2>
2213 CAT means concatenation. */
2215 static bin_tree_t *
2216 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2217 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2219 bin_tree_t *tree, *expr;
2220 re_dfa_t *dfa = preg->buffer;
2221 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2222 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2223 return NULL;
2225 while (token->type != OP_ALT && token->type != END_OF_RE
2226 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2228 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2229 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2231 if (tree != NULL)
2232 postorder (tree, free_tree, NULL);
2233 return NULL;
2235 if (tree != NULL && expr != NULL)
2237 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2238 if (newtree == NULL)
2240 postorder (expr, free_tree, NULL);
2241 postorder (tree, free_tree, NULL);
2242 *err = REG_ESPACE;
2243 return NULL;
2245 tree = newtree;
2247 else if (tree == NULL)
2248 tree = expr;
2249 /* Otherwise expr == NULL, we don't need to create new tree. */
2251 return tree;
2254 /* This function build the following tree, from regular expression a*:
2260 static bin_tree_t *
2261 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2262 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2264 re_dfa_t *dfa = preg->buffer;
2265 bin_tree_t *tree;
2266 switch (token->type)
2268 case CHARACTER:
2269 tree = create_token_tree (dfa, NULL, NULL, token);
2270 if (BE (tree == NULL, 0))
2272 *err = REG_ESPACE;
2273 return NULL;
2275 #ifdef RE_ENABLE_I18N
2276 if (dfa->mb_cur_max > 1)
2278 while (!re_string_eoi (regexp)
2279 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2281 bin_tree_t *mbc_remain;
2282 fetch_token (token, regexp, syntax);
2283 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2284 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2285 if (BE (mbc_remain == NULL || tree == NULL, 0))
2287 *err = REG_ESPACE;
2288 return NULL;
2292 #endif
2293 break;
2295 case OP_OPEN_SUBEXP:
2296 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2297 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2298 return NULL;
2299 break;
2301 case OP_OPEN_BRACKET:
2302 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2303 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2304 return NULL;
2305 break;
2307 case OP_BACK_REF:
2308 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2310 *err = REG_ESUBREG;
2311 return NULL;
2313 dfa->used_bkref_map |= 1 << token->opr.idx;
2314 tree = create_token_tree (dfa, NULL, NULL, token);
2315 if (BE (tree == NULL, 0))
2317 *err = REG_ESPACE;
2318 return NULL;
2320 ++dfa->nbackref;
2321 dfa->has_mb_node = 1;
2322 break;
2324 case OP_OPEN_DUP_NUM:
2325 if (syntax & RE_CONTEXT_INVALID_DUP)
2327 *err = REG_BADRPT;
2328 return NULL;
2330 FALLTHROUGH;
2331 case OP_DUP_ASTERISK:
2332 case OP_DUP_PLUS:
2333 case OP_DUP_QUESTION:
2334 if (syntax & RE_CONTEXT_INVALID_OPS)
2336 *err = REG_BADRPT;
2337 return NULL;
2339 else if (syntax & RE_CONTEXT_INDEP_OPS)
2341 fetch_token (token, regexp, syntax);
2342 return parse_expression (regexp, preg, token, syntax, nest, err);
2344 FALLTHROUGH;
2345 case OP_CLOSE_SUBEXP:
2346 if ((token->type == OP_CLOSE_SUBEXP) &&
2347 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2349 *err = REG_ERPAREN;
2350 return NULL;
2352 FALLTHROUGH;
2353 case OP_CLOSE_DUP_NUM:
2354 /* We treat it as a normal character. */
2356 /* Then we can these characters as normal characters. */
2357 token->type = CHARACTER;
2358 /* mb_partial and word_char bits should be initialized already
2359 by peek_token. */
2360 tree = create_token_tree (dfa, NULL, NULL, token);
2361 if (BE (tree == NULL, 0))
2363 *err = REG_ESPACE;
2364 return NULL;
2366 break;
2368 case ANCHOR:
2369 if ((token->opr.ctx_type
2370 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2371 && dfa->word_ops_used == 0)
2372 init_word_char (dfa);
2373 if (token->opr.ctx_type == WORD_DELIM
2374 || token->opr.ctx_type == NOT_WORD_DELIM)
2376 bin_tree_t *tree_first, *tree_last;
2377 if (token->opr.ctx_type == WORD_DELIM)
2379 token->opr.ctx_type = WORD_FIRST;
2380 tree_first = create_token_tree (dfa, NULL, NULL, token);
2381 token->opr.ctx_type = WORD_LAST;
2383 else
2385 token->opr.ctx_type = INSIDE_WORD;
2386 tree_first = create_token_tree (dfa, NULL, NULL, token);
2387 token->opr.ctx_type = INSIDE_NOTWORD;
2389 tree_last = create_token_tree (dfa, NULL, NULL, token);
2390 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2391 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2393 *err = REG_ESPACE;
2394 return NULL;
2397 else
2399 tree = create_token_tree (dfa, NULL, NULL, token);
2400 if (BE (tree == NULL, 0))
2402 *err = REG_ESPACE;
2403 return NULL;
2406 /* We must return here, since ANCHORs can't be followed
2407 by repetition operators.
2408 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2409 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2410 fetch_token (token, regexp, syntax);
2411 return tree;
2413 case OP_PERIOD:
2414 tree = create_token_tree (dfa, NULL, NULL, token);
2415 if (BE (tree == NULL, 0))
2417 *err = REG_ESPACE;
2418 return NULL;
2420 if (dfa->mb_cur_max > 1)
2421 dfa->has_mb_node = 1;
2422 break;
2424 case OP_WORD:
2425 case OP_NOTWORD:
2426 tree = build_charclass_op (dfa, regexp->trans,
2427 "alnum",
2428 "_",
2429 token->type == OP_NOTWORD, err);
2430 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2431 return NULL;
2432 break;
2434 case OP_SPACE:
2435 case OP_NOTSPACE:
2436 tree = build_charclass_op (dfa, regexp->trans,
2437 "space",
2439 token->type == OP_NOTSPACE, err);
2440 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2441 return NULL;
2442 break;
2444 case OP_ALT:
2445 case END_OF_RE:
2446 return NULL;
2448 case BACK_SLASH:
2449 *err = REG_EESCAPE;
2450 return NULL;
2452 default:
2453 /* Must not happen? */
2454 #ifdef DEBUG
2455 assert (0);
2456 #endif
2457 return NULL;
2459 fetch_token (token, regexp, syntax);
2461 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2462 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2464 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2465 syntax, err);
2466 if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2468 if (tree != NULL)
2469 postorder (tree, free_tree, NULL);
2470 return NULL;
2472 tree = dup_tree;
2473 /* In BRE consecutive duplications are not allowed. */
2474 if ((syntax & RE_CONTEXT_INVALID_DUP)
2475 && (token->type == OP_DUP_ASTERISK
2476 || token->type == OP_OPEN_DUP_NUM))
2478 if (tree != NULL)
2479 postorder (tree, free_tree, NULL);
2480 *err = REG_BADRPT;
2481 return NULL;
2485 return tree;
2488 /* This function build the following tree, from regular expression
2489 (<reg_exp>):
2490 SUBEXP
2492 <reg_exp>
2495 static bin_tree_t *
2496 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2497 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2499 re_dfa_t *dfa = preg->buffer;
2500 bin_tree_t *tree;
2501 size_t cur_nsub;
2502 cur_nsub = preg->re_nsub++;
2504 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2506 /* The subexpression may be a null string. */
2507 if (token->type == OP_CLOSE_SUBEXP)
2508 tree = NULL;
2509 else
2511 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2512 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2514 if (tree != NULL)
2515 postorder (tree, free_tree, NULL);
2516 *err = REG_EPAREN;
2518 if (BE (*err != REG_NOERROR, 0))
2519 return NULL;
2522 if (cur_nsub <= '9' - '1')
2523 dfa->completed_bkref_map |= 1 << cur_nsub;
2525 tree = create_tree (dfa, tree, NULL, SUBEXP);
2526 if (BE (tree == NULL, 0))
2528 *err = REG_ESPACE;
2529 return NULL;
2531 tree->token.opr.idx = cur_nsub;
2532 return tree;
2535 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2537 static bin_tree_t *
2538 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2539 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2541 bin_tree_t *tree = NULL, *old_tree = NULL;
2542 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2543 re_token_t start_token = *token;
2545 if (token->type == OP_OPEN_DUP_NUM)
2547 end = 0;
2548 start = fetch_number (regexp, token, syntax);
2549 if (start == -1)
2551 if (token->type == CHARACTER && token->opr.c == ',')
2552 start = 0; /* We treat "{,m}" as "{0,m}". */
2553 else
2555 *err = REG_BADBR; /* <re>{} is invalid. */
2556 return NULL;
2559 if (BE (start != -2, 1))
2561 /* We treat "{n}" as "{n,n}". */
2562 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2563 : ((token->type == CHARACTER && token->opr.c == ',')
2564 ? fetch_number (regexp, token, syntax) : -2));
2566 if (BE (start == -2 || end == -2, 0))
2568 /* Invalid sequence. */
2569 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2571 if (token->type == END_OF_RE)
2572 *err = REG_EBRACE;
2573 else
2574 *err = REG_BADBR;
2576 return NULL;
2579 /* If the syntax bit is set, rollback. */
2580 re_string_set_index (regexp, start_idx);
2581 *token = start_token;
2582 token->type = CHARACTER;
2583 /* mb_partial and word_char bits should be already initialized by
2584 peek_token. */
2585 return elem;
2588 if (BE ((end != -1 && start > end)
2589 || token->type != OP_CLOSE_DUP_NUM, 0))
2591 /* First number greater than second. */
2592 *err = REG_BADBR;
2593 return NULL;
2596 if (BE (RE_DUP_MAX < (end == -1 ? start : end), 0))
2598 *err = REG_ESIZE;
2599 return NULL;
2602 else
2604 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2605 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2608 fetch_token (token, regexp, syntax);
2610 if (BE (elem == NULL, 0))
2611 return NULL;
2612 if (BE (start == 0 && end == 0, 0))
2614 postorder (elem, free_tree, NULL);
2615 return NULL;
2618 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2619 if (BE (start > 0, 0))
2621 tree = elem;
2622 for (i = 2; i <= start; ++i)
2624 elem = duplicate_tree (elem, dfa);
2625 tree = create_tree (dfa, tree, elem, CONCAT);
2626 if (BE (elem == NULL || tree == NULL, 0))
2627 goto parse_dup_op_espace;
2630 if (start == end)
2631 return tree;
2633 /* Duplicate ELEM before it is marked optional. */
2634 elem = duplicate_tree (elem, dfa);
2635 if (BE (elem == NULL, 0))
2636 goto parse_dup_op_espace;
2637 old_tree = tree;
2639 else
2640 old_tree = NULL;
2642 if (elem->token.type == SUBEXP)
2644 uintptr_t subidx = elem->token.opr.idx;
2645 postorder (elem, mark_opt_subexp, (void *) subidx);
2648 tree = create_tree (dfa, elem, NULL,
2649 (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2650 if (BE (tree == NULL, 0))
2651 goto parse_dup_op_espace;
2653 /* This loop is actually executed only when end != -1,
2654 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2655 already created the start+1-th copy. */
2656 if (TYPE_SIGNED (Idx) || end != -1)
2657 for (i = start + 2; i <= end; ++i)
2659 elem = duplicate_tree (elem, dfa);
2660 tree = create_tree (dfa, tree, elem, CONCAT);
2661 if (BE (elem == NULL || tree == NULL, 0))
2662 goto parse_dup_op_espace;
2664 tree = create_tree (dfa, tree, NULL, OP_ALT);
2665 if (BE (tree == NULL, 0))
2666 goto parse_dup_op_espace;
2669 if (old_tree)
2670 tree = create_tree (dfa, old_tree, tree, CONCAT);
2672 return tree;
2674 parse_dup_op_espace:
2675 *err = REG_ESPACE;
2676 return NULL;
2679 /* Size of the names for collating symbol/equivalence_class/character_class.
2680 I'm not sure, but maybe enough. */
2681 #define BRACKET_NAME_BUF_SIZE 32
2683 #ifndef _LIBC
2685 # ifdef RE_ENABLE_I18N
2686 /* Convert the byte B to the corresponding wide character. In a
2687 unibyte locale, treat B as itself. In a multibyte locale, return
2688 WEOF if B is an encoding error. */
2689 static wint_t
2690 parse_byte (unsigned char b, re_charset_t *mbcset)
2692 return mbcset == NULL ? b : __btowc (b);
2694 # endif
2696 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2697 Build the range expression which starts from START_ELEM, and ends
2698 at END_ELEM. The result are written to MBCSET and SBCSET.
2699 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2700 mbcset->range_ends, is a pointer argument since we may
2701 update it. */
2703 static reg_errcode_t
2704 # ifdef RE_ENABLE_I18N
2705 build_range_exp (const reg_syntax_t syntax,
2706 bitset_t sbcset,
2707 re_charset_t *mbcset,
2708 Idx *range_alloc,
2709 const bracket_elem_t *start_elem,
2710 const bracket_elem_t *end_elem)
2711 # else /* not RE_ENABLE_I18N */
2712 build_range_exp (const reg_syntax_t syntax,
2713 bitset_t sbcset,
2714 const bracket_elem_t *start_elem,
2715 const bracket_elem_t *end_elem)
2716 # endif /* not RE_ENABLE_I18N */
2718 unsigned int start_ch, end_ch;
2719 /* Equivalence Classes and Character Classes can't be a range start/end. */
2720 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2721 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2723 return REG_ERANGE;
2725 /* We can handle no multi character collating elements without libc
2726 support. */
2727 if (BE ((start_elem->type == COLL_SYM
2728 && strlen ((char *) start_elem->opr.name) > 1)
2729 || (end_elem->type == COLL_SYM
2730 && strlen ((char *) end_elem->opr.name) > 1), 0))
2731 return REG_ECOLLATE;
2733 # ifdef RE_ENABLE_I18N
2735 wchar_t wc;
2736 wint_t start_wc;
2737 wint_t end_wc;
2739 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2740 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2741 : 0));
2742 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2743 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2744 : 0));
2745 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2746 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2747 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2748 ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2749 if (start_wc == WEOF || end_wc == WEOF)
2750 return REG_ECOLLATE;
2751 else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2752 return REG_ERANGE;
2754 /* Got valid collation sequence values, add them as a new entry.
2755 However, for !_LIBC we have no collation elements: if the
2756 character set is single byte, the single byte character set
2757 that we build below suffices. parse_bracket_exp passes
2758 no MBCSET if dfa->mb_cur_max == 1. */
2759 if (mbcset)
2761 /* Check the space of the arrays. */
2762 if (BE (*range_alloc == mbcset->nranges, 0))
2764 /* There is not enough space, need realloc. */
2765 wchar_t *new_array_start, *new_array_end;
2766 Idx new_nranges;
2768 /* +1 in case of mbcset->nranges is 0. */
2769 new_nranges = 2 * mbcset->nranges + 1;
2770 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2771 are NULL if *range_alloc == 0. */
2772 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2773 new_nranges);
2774 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2775 new_nranges);
2777 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2779 re_free (new_array_start);
2780 re_free (new_array_end);
2781 return REG_ESPACE;
2784 mbcset->range_starts = new_array_start;
2785 mbcset->range_ends = new_array_end;
2786 *range_alloc = new_nranges;
2789 mbcset->range_starts[mbcset->nranges] = start_wc;
2790 mbcset->range_ends[mbcset->nranges++] = end_wc;
2793 /* Build the table for single byte characters. */
2794 for (wc = 0; wc < SBC_MAX; ++wc)
2796 if (start_wc <= wc && wc <= end_wc)
2797 bitset_set (sbcset, wc);
2800 # else /* not RE_ENABLE_I18N */
2802 unsigned int ch;
2803 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2804 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2805 : 0));
2806 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2807 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2808 : 0));
2809 if (start_ch > end_ch)
2810 return REG_ERANGE;
2811 /* Build the table for single byte characters. */
2812 for (ch = 0; ch < SBC_MAX; ++ch)
2813 if (start_ch <= ch && ch <= end_ch)
2814 bitset_set (sbcset, ch);
2816 # endif /* not RE_ENABLE_I18N */
2817 return REG_NOERROR;
2819 #endif /* not _LIBC */
2821 #ifndef _LIBC
2822 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2823 Build the collating element which is represented by NAME.
2824 The result are written to MBCSET and SBCSET.
2825 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2826 pointer argument since we may update it. */
2828 static reg_errcode_t
2829 # ifdef RE_ENABLE_I18N
2830 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2831 Idx *coll_sym_alloc, const unsigned char *name)
2832 # else /* not RE_ENABLE_I18N */
2833 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2834 # endif /* not RE_ENABLE_I18N */
2836 size_t name_len = strlen ((const char *) name);
2837 if (BE (name_len != 1, 0))
2838 return REG_ECOLLATE;
2839 else
2841 bitset_set (sbcset, name[0]);
2842 return REG_NOERROR;
2845 #endif /* not _LIBC */
2847 /* This function parse bracket expression like "[abc]", "[a-c]",
2848 "[[.a-a.]]" etc. */
2850 static bin_tree_t *
2851 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2852 reg_syntax_t syntax, reg_errcode_t *err)
2854 #ifdef _LIBC
2855 const unsigned char *collseqmb;
2856 const char *collseqwc;
2857 uint32_t nrules;
2858 int32_t table_size;
2859 const int32_t *symb_table;
2860 const unsigned char *extra;
2862 /* Local function for parse_bracket_exp used in _LIBC environment.
2863 Seek the collating symbol entry corresponding to NAME.
2864 Return the index of the symbol in the SYMB_TABLE,
2865 or -1 if not found. */
2867 auto inline int32_t
2868 __attribute__ ((always_inline))
2869 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2871 int32_t elem;
2873 for (elem = 0; elem < table_size; elem++)
2874 if (symb_table[2 * elem] != 0)
2876 int32_t idx = symb_table[2 * elem + 1];
2877 /* Skip the name of collating element name. */
2878 idx += 1 + extra[idx];
2879 if (/* Compare the length of the name. */
2880 name_len == extra[idx]
2881 /* Compare the name. */
2882 && memcmp (name, &extra[idx + 1], name_len) == 0)
2883 /* Yep, this is the entry. */
2884 return elem;
2886 return -1;
2889 /* Local function for parse_bracket_exp used in _LIBC environment.
2890 Look up the collation sequence value of BR_ELEM.
2891 Return the value if succeeded, UINT_MAX otherwise. */
2893 auto inline unsigned int
2894 __attribute__ ((always_inline))
2895 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2897 if (br_elem->type == SB_CHAR)
2900 if (MB_CUR_MAX == 1)
2902 if (nrules == 0)
2903 return collseqmb[br_elem->opr.ch];
2904 else
2906 wint_t wc = __btowc (br_elem->opr.ch);
2907 return __collseq_table_lookup (collseqwc, wc);
2910 else if (br_elem->type == MB_CHAR)
2912 if (nrules != 0)
2913 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2915 else if (br_elem->type == COLL_SYM)
2917 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2918 if (nrules != 0)
2920 int32_t elem, idx;
2921 elem = seek_collating_symbol_entry (br_elem->opr.name,
2922 sym_name_len);
2923 if (elem != -1)
2925 /* We found the entry. */
2926 idx = symb_table[2 * elem + 1];
2927 /* Skip the name of collating element name. */
2928 idx += 1 + extra[idx];
2929 /* Skip the byte sequence of the collating element. */
2930 idx += 1 + extra[idx];
2931 /* Adjust for the alignment. */
2932 idx = (idx + 3) & ~3;
2933 /* Skip the multibyte collation sequence value. */
2934 idx += sizeof (unsigned int);
2935 /* Skip the wide char sequence of the collating element. */
2936 idx += sizeof (unsigned int) *
2937 (1 + *(unsigned int *) (extra + idx));
2938 /* Return the collation sequence value. */
2939 return *(unsigned int *) (extra + idx);
2941 else if (sym_name_len == 1)
2943 /* No valid character. Match it as a single byte
2944 character. */
2945 return collseqmb[br_elem->opr.name[0]];
2948 else if (sym_name_len == 1)
2949 return collseqmb[br_elem->opr.name[0]];
2951 return UINT_MAX;
2954 /* Local function for parse_bracket_exp used in _LIBC environment.
2955 Build the range expression which starts from START_ELEM, and ends
2956 at END_ELEM. The result are written to MBCSET and SBCSET.
2957 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2958 mbcset->range_ends, is a pointer argument since we may
2959 update it. */
2961 auto inline reg_errcode_t
2962 __attribute__ ((always_inline))
2963 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2964 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2966 unsigned int ch;
2967 uint32_t start_collseq;
2968 uint32_t end_collseq;
2970 /* Equivalence Classes and Character Classes can't be a range
2971 start/end. */
2972 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2973 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2975 return REG_ERANGE;
2977 /* FIXME: Implement rational ranges here, too. */
2978 start_collseq = lookup_collation_sequence_value (start_elem);
2979 end_collseq = lookup_collation_sequence_value (end_elem);
2980 /* Check start/end collation sequence values. */
2981 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2982 return REG_ECOLLATE;
2983 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2984 return REG_ERANGE;
2986 /* Got valid collation sequence values, add them as a new entry.
2987 However, if we have no collation elements, and the character set
2988 is single byte, the single byte character set that we
2989 build below suffices. */
2990 if (nrules > 0 || dfa->mb_cur_max > 1)
2992 /* Check the space of the arrays. */
2993 if (BE (*range_alloc == mbcset->nranges, 0))
2995 /* There is not enough space, need realloc. */
2996 uint32_t *new_array_start;
2997 uint32_t *new_array_end;
2998 Idx new_nranges;
3000 /* +1 in case of mbcset->nranges is 0. */
3001 new_nranges = 2 * mbcset->nranges + 1;
3002 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
3003 new_nranges);
3004 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3005 new_nranges);
3007 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3008 return REG_ESPACE;
3010 mbcset->range_starts = new_array_start;
3011 mbcset->range_ends = new_array_end;
3012 *range_alloc = new_nranges;
3015 mbcset->range_starts[mbcset->nranges] = start_collseq;
3016 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3019 /* Build the table for single byte characters. */
3020 for (ch = 0; ch < SBC_MAX; ch++)
3022 uint32_t ch_collseq;
3024 if (MB_CUR_MAX == 1)
3026 if (nrules == 0)
3027 ch_collseq = collseqmb[ch];
3028 else
3029 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3030 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3031 bitset_set (sbcset, ch);
3033 return REG_NOERROR;
3036 /* Local function for parse_bracket_exp used in _LIBC environment.
3037 Build the collating element which is represented by NAME.
3038 The result are written to MBCSET and SBCSET.
3039 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3040 pointer argument since we may update it. */
3042 auto inline reg_errcode_t
3043 __attribute__ ((always_inline))
3044 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3045 Idx *coll_sym_alloc, const unsigned char *name)
3047 int32_t elem, idx;
3048 size_t name_len = strlen ((const char *) name);
3049 if (nrules != 0)
3051 elem = seek_collating_symbol_entry (name, name_len);
3052 if (elem != -1)
3054 /* We found the entry. */
3055 idx = symb_table[2 * elem + 1];
3056 /* Skip the name of collating element name. */
3057 idx += 1 + extra[idx];
3059 else if (name_len == 1)
3061 /* No valid character, treat it as a normal
3062 character. */
3063 bitset_set (sbcset, name[0]);
3064 return REG_NOERROR;
3066 else
3067 return REG_ECOLLATE;
3069 /* Got valid collation sequence, add it as a new entry. */
3070 /* Check the space of the arrays. */
3071 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3073 /* Not enough, realloc it. */
3074 /* +1 in case of mbcset->ncoll_syms is 0. */
3075 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3076 /* Use realloc since mbcset->coll_syms is NULL
3077 if *alloc == 0. */
3078 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3079 new_coll_sym_alloc);
3080 if (BE (new_coll_syms == NULL, 0))
3081 return REG_ESPACE;
3082 mbcset->coll_syms = new_coll_syms;
3083 *coll_sym_alloc = new_coll_sym_alloc;
3085 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3086 return REG_NOERROR;
3088 else
3090 if (BE (name_len != 1, 0))
3091 return REG_ECOLLATE;
3092 else
3094 bitset_set (sbcset, name[0]);
3095 return REG_NOERROR;
3099 #endif
3101 re_token_t br_token;
3102 re_bitset_ptr_t sbcset;
3103 #ifdef RE_ENABLE_I18N
3104 re_charset_t *mbcset;
3105 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3106 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3107 #endif /* not RE_ENABLE_I18N */
3108 bool non_match = false;
3109 bin_tree_t *work_tree;
3110 int token_len;
3111 bool first_round = true;
3112 #ifdef _LIBC
3113 collseqmb = (const unsigned char *)
3114 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3115 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3116 if (nrules)
3119 if (MB_CUR_MAX > 1)
3121 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3122 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3123 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3124 _NL_COLLATE_SYMB_TABLEMB);
3125 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3126 _NL_COLLATE_SYMB_EXTRAMB);
3128 #endif
3129 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3130 #ifdef RE_ENABLE_I18N
3131 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3132 #endif /* RE_ENABLE_I18N */
3133 #ifdef RE_ENABLE_I18N
3134 if (BE (sbcset == NULL || mbcset == NULL, 0))
3135 #else
3136 if (BE (sbcset == NULL, 0))
3137 #endif /* RE_ENABLE_I18N */
3139 re_free (sbcset);
3140 #ifdef RE_ENABLE_I18N
3141 re_free (mbcset);
3142 #endif
3143 *err = REG_ESPACE;
3144 return NULL;
3147 token_len = peek_token_bracket (token, regexp, syntax);
3148 if (BE (token->type == END_OF_RE, 0))
3150 *err = REG_BADPAT;
3151 goto parse_bracket_exp_free_return;
3153 if (token->type == OP_NON_MATCH_LIST)
3155 #ifdef RE_ENABLE_I18N
3156 mbcset->non_match = 1;
3157 #endif /* not RE_ENABLE_I18N */
3158 non_match = true;
3159 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3160 bitset_set (sbcset, '\n');
3161 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3162 token_len = peek_token_bracket (token, regexp, syntax);
3163 if (BE (token->type == END_OF_RE, 0))
3165 *err = REG_BADPAT;
3166 goto parse_bracket_exp_free_return;
3170 /* We treat the first ']' as a normal character. */
3171 if (token->type == OP_CLOSE_BRACKET)
3172 token->type = CHARACTER;
3174 while (1)
3176 bracket_elem_t start_elem, end_elem;
3177 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3178 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3179 reg_errcode_t ret;
3180 int token_len2 = 0;
3181 bool is_range_exp = false;
3182 re_token_t token2;
3184 start_elem.opr.name = start_name_buf;
3185 start_elem.type = COLL_SYM;
3186 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3187 syntax, first_round);
3188 if (BE (ret != REG_NOERROR, 0))
3190 *err = ret;
3191 goto parse_bracket_exp_free_return;
3193 first_round = false;
3195 /* Get information about the next token. We need it in any case. */
3196 token_len = peek_token_bracket (token, regexp, syntax);
3198 /* Do not check for ranges if we know they are not allowed. */
3199 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3201 if (BE (token->type == END_OF_RE, 0))
3203 *err = REG_EBRACK;
3204 goto parse_bracket_exp_free_return;
3206 if (token->type == OP_CHARSET_RANGE)
3208 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3209 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3210 if (BE (token2.type == END_OF_RE, 0))
3212 *err = REG_EBRACK;
3213 goto parse_bracket_exp_free_return;
3215 if (token2.type == OP_CLOSE_BRACKET)
3217 /* We treat the last '-' as a normal character. */
3218 re_string_skip_bytes (regexp, -token_len);
3219 token->type = CHARACTER;
3221 else
3222 is_range_exp = true;
3226 if (is_range_exp == true)
3228 end_elem.opr.name = end_name_buf;
3229 end_elem.type = COLL_SYM;
3230 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3231 dfa, syntax, true);
3232 if (BE (ret != REG_NOERROR, 0))
3234 *err = ret;
3235 goto parse_bracket_exp_free_return;
3238 token_len = peek_token_bracket (token, regexp, syntax);
3240 #ifdef _LIBC
3241 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3242 &start_elem, &end_elem);
3243 #else
3244 # ifdef RE_ENABLE_I18N
3245 *err = build_range_exp (syntax, sbcset,
3246 dfa->mb_cur_max > 1 ? mbcset : NULL,
3247 &range_alloc, &start_elem, &end_elem);
3248 # else
3249 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3250 # endif
3251 #endif /* RE_ENABLE_I18N */
3252 if (BE (*err != REG_NOERROR, 0))
3253 goto parse_bracket_exp_free_return;
3255 else
3257 switch (start_elem.type)
3259 case SB_CHAR:
3260 bitset_set (sbcset, start_elem.opr.ch);
3261 break;
3262 #ifdef RE_ENABLE_I18N
3263 case MB_CHAR:
3264 /* Check whether the array has enough space. */
3265 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3267 wchar_t *new_mbchars;
3268 /* Not enough, realloc it. */
3269 /* +1 in case of mbcset->nmbchars is 0. */
3270 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3271 /* Use realloc since array is NULL if *alloc == 0. */
3272 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3273 mbchar_alloc);
3274 if (BE (new_mbchars == NULL, 0))
3275 goto parse_bracket_exp_espace;
3276 mbcset->mbchars = new_mbchars;
3278 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3279 break;
3280 #endif /* RE_ENABLE_I18N */
3281 case EQUIV_CLASS:
3282 *err = build_equiv_class (sbcset,
3283 #ifdef RE_ENABLE_I18N
3284 mbcset, &equiv_class_alloc,
3285 #endif /* RE_ENABLE_I18N */
3286 start_elem.opr.name);
3287 if (BE (*err != REG_NOERROR, 0))
3288 goto parse_bracket_exp_free_return;
3289 break;
3290 case COLL_SYM:
3291 *err = build_collating_symbol (sbcset,
3292 #ifdef RE_ENABLE_I18N
3293 mbcset, &coll_sym_alloc,
3294 #endif /* RE_ENABLE_I18N */
3295 start_elem.opr.name);
3296 if (BE (*err != REG_NOERROR, 0))
3297 goto parse_bracket_exp_free_return;
3298 break;
3299 case CHAR_CLASS:
3300 *err = build_charclass (regexp->trans, sbcset,
3301 #ifdef RE_ENABLE_I18N
3302 mbcset, &char_class_alloc,
3303 #endif /* RE_ENABLE_I18N */
3304 (const char *) start_elem.opr.name,
3305 syntax);
3306 if (BE (*err != REG_NOERROR, 0))
3307 goto parse_bracket_exp_free_return;
3308 break;
3309 default:
3310 assert (0);
3311 break;
3314 if (BE (token->type == END_OF_RE, 0))
3316 *err = REG_EBRACK;
3317 goto parse_bracket_exp_free_return;
3319 if (token->type == OP_CLOSE_BRACKET)
3320 break;
3323 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3325 /* If it is non-matching list. */
3326 if (non_match)
3327 bitset_not (sbcset);
3329 #ifdef RE_ENABLE_I18N
3330 /* Ensure only single byte characters are set. */
3331 if (dfa->mb_cur_max > 1)
3332 bitset_mask (sbcset, dfa->sb_char);
3334 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3335 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3336 || mbcset->non_match)))
3338 bin_tree_t *mbc_tree;
3339 int sbc_idx;
3340 /* Build a tree for complex bracket. */
3341 dfa->has_mb_node = 1;
3342 br_token.type = COMPLEX_BRACKET;
3343 br_token.opr.mbcset = mbcset;
3344 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3345 if (BE (mbc_tree == NULL, 0))
3346 goto parse_bracket_exp_espace;
3347 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3348 if (sbcset[sbc_idx])
3349 break;
3350 /* If there are no bits set in sbcset, there is no point
3351 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3352 if (sbc_idx < BITSET_WORDS)
3354 /* Build a tree for simple bracket. */
3355 br_token.type = SIMPLE_BRACKET;
3356 br_token.opr.sbcset = sbcset;
3357 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3358 if (BE (work_tree == NULL, 0))
3359 goto parse_bracket_exp_espace;
3361 /* Then join them by ALT node. */
3362 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3363 if (BE (work_tree == NULL, 0))
3364 goto parse_bracket_exp_espace;
3366 else
3368 re_free (sbcset);
3369 work_tree = mbc_tree;
3372 else
3373 #endif /* not RE_ENABLE_I18N */
3375 #ifdef RE_ENABLE_I18N
3376 free_charset (mbcset);
3377 #endif
3378 /* Build a tree for simple bracket. */
3379 br_token.type = SIMPLE_BRACKET;
3380 br_token.opr.sbcset = sbcset;
3381 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3382 if (BE (work_tree == NULL, 0))
3383 goto parse_bracket_exp_espace;
3385 return work_tree;
3387 parse_bracket_exp_espace:
3388 *err = REG_ESPACE;
3389 parse_bracket_exp_free_return:
3390 re_free (sbcset);
3391 #ifdef RE_ENABLE_I18N
3392 free_charset (mbcset);
3393 #endif /* RE_ENABLE_I18N */
3394 return NULL;
3397 /* Parse an element in the bracket expression. */
3399 static reg_errcode_t
3400 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3401 re_token_t *token, int token_len, re_dfa_t *dfa,
3402 reg_syntax_t syntax, bool accept_hyphen)
3404 #ifdef RE_ENABLE_I18N
3405 int cur_char_size;
3406 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3407 if (cur_char_size > 1)
3409 elem->type = MB_CHAR;
3410 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3411 re_string_skip_bytes (regexp, cur_char_size);
3412 return REG_NOERROR;
3414 #endif /* RE_ENABLE_I18N */
3415 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3416 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3417 || token->type == OP_OPEN_EQUIV_CLASS)
3418 return parse_bracket_symbol (elem, regexp, token);
3419 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3421 /* A '-' must only appear as anything but a range indicator before
3422 the closing bracket. Everything else is an error. */
3423 re_token_t token2;
3424 (void) peek_token_bracket (&token2, regexp, syntax);
3425 if (token2.type != OP_CLOSE_BRACKET)
3426 /* The actual error value is not standardized since this whole
3427 case is undefined. But ERANGE makes good sense. */
3428 return REG_ERANGE;
3430 elem->type = SB_CHAR;
3431 elem->opr.ch = token->opr.c;
3432 return REG_NOERROR;
3435 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3436 such as [:<character_class>:], [.<collating_element>.], and
3437 [=<equivalent_class>=]. */
3439 static reg_errcode_t
3440 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3441 re_token_t *token)
3443 unsigned char ch, delim = token->opr.c;
3444 int i = 0;
3445 if (re_string_eoi(regexp))
3446 return REG_EBRACK;
3447 for (;; ++i)
3449 if (i >= BRACKET_NAME_BUF_SIZE)
3450 return REG_EBRACK;
3451 if (token->type == OP_OPEN_CHAR_CLASS)
3452 ch = re_string_fetch_byte_case (regexp);
3453 else
3454 ch = re_string_fetch_byte (regexp);
3455 if (re_string_eoi(regexp))
3456 return REG_EBRACK;
3457 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3458 break;
3459 elem->opr.name[i] = ch;
3461 re_string_skip_bytes (regexp, 1);
3462 elem->opr.name[i] = '\0';
3463 switch (token->type)
3465 case OP_OPEN_COLL_ELEM:
3466 elem->type = COLL_SYM;
3467 break;
3468 case OP_OPEN_EQUIV_CLASS:
3469 elem->type = EQUIV_CLASS;
3470 break;
3471 case OP_OPEN_CHAR_CLASS:
3472 elem->type = CHAR_CLASS;
3473 break;
3474 default:
3475 break;
3477 return REG_NOERROR;
3480 /* Helper function for parse_bracket_exp.
3481 Build the equivalence class which is represented by NAME.
3482 The result are written to MBCSET and SBCSET.
3483 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3484 is a pointer argument since we may update it. */
3486 static reg_errcode_t
3487 #ifdef RE_ENABLE_I18N
3488 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3489 Idx *equiv_class_alloc, const unsigned char *name)
3490 #else /* not RE_ENABLE_I18N */
3491 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3492 #endif /* not RE_ENABLE_I18N */
3494 #ifdef _LIBC
3495 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3496 if (nrules != 0)
3498 const int32_t *table, *indirect;
3499 const unsigned char *weights, *extra, *cp;
3500 unsigned char char_buf[2];
3501 int32_t idx1, idx2;
3502 unsigned int ch;
3503 size_t len;
3504 /* Calculate the index for equivalence class. */
3505 cp = name;
3506 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3507 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3508 _NL_COLLATE_WEIGHTMB);
3509 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3510 _NL_COLLATE_EXTRAMB);
3511 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3512 _NL_COLLATE_INDIRECTMB);
3513 idx1 = findidx (table, indirect, extra, &cp, -1);
3514 if (BE (idx1 == 0 || *cp != '\0', 0))
3515 /* This isn't a valid character. */
3516 return REG_ECOLLATE;
3518 /* Build single byte matching table for this equivalence class. */
3519 len = weights[idx1 & 0xffffff];
3520 for (ch = 0; ch < SBC_MAX; ++ch)
3522 char_buf[0] = ch;
3523 cp = char_buf;
3524 idx2 = findidx (table, indirect, extra, &cp, 1);
3526 idx2 = table[ch];
3528 if (idx2 == 0)
3529 /* This isn't a valid character. */
3530 continue;
3531 /* Compare only if the length matches and the collation rule
3532 index is the same. */
3533 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3534 && memcmp (weights + (idx1 & 0xffffff) + 1,
3535 weights + (idx2 & 0xffffff) + 1, len) == 0)
3536 bitset_set (sbcset, ch);
3538 /* Check whether the array has enough space. */
3539 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3541 /* Not enough, realloc it. */
3542 /* +1 in case of mbcset->nequiv_classes is 0. */
3543 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3544 /* Use realloc since the array is NULL if *alloc == 0. */
3545 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3546 int32_t,
3547 new_equiv_class_alloc);
3548 if (BE (new_equiv_classes == NULL, 0))
3549 return REG_ESPACE;
3550 mbcset->equiv_classes = new_equiv_classes;
3551 *equiv_class_alloc = new_equiv_class_alloc;
3553 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3555 else
3556 #endif /* _LIBC */
3558 if (BE (strlen ((const char *) name) != 1, 0))
3559 return REG_ECOLLATE;
3560 bitset_set (sbcset, *name);
3562 return REG_NOERROR;
3565 /* Helper function for parse_bracket_exp.
3566 Build the character class which is represented by NAME.
3567 The result are written to MBCSET and SBCSET.
3568 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3569 is a pointer argument since we may update it. */
3571 static reg_errcode_t
3572 #ifdef RE_ENABLE_I18N
3573 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3574 re_charset_t *mbcset, Idx *char_class_alloc,
3575 const char *class_name, reg_syntax_t syntax)
3576 #else /* not RE_ENABLE_I18N */
3577 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3578 const char *class_name, reg_syntax_t syntax)
3579 #endif /* not RE_ENABLE_I18N */
3581 int i;
3582 const char *name = class_name;
3584 /* In case of REG_ICASE "upper" and "lower" match the both of
3585 upper and lower cases. */
3586 if ((syntax & RE_ICASE)
3587 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3588 name = "alpha";
3590 #ifdef RE_ENABLE_I18N
3591 /* Check the space of the arrays. */
3592 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3594 /* Not enough, realloc it. */
3595 /* +1 in case of mbcset->nchar_classes is 0. */
3596 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3597 /* Use realloc since array is NULL if *alloc == 0. */
3598 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3599 new_char_class_alloc);
3600 if (BE (new_char_classes == NULL, 0))
3601 return REG_ESPACE;
3602 mbcset->char_classes = new_char_classes;
3603 *char_class_alloc = new_char_class_alloc;
3605 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3606 #endif /* RE_ENABLE_I18N */
3608 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3609 do { \
3610 if (BE (trans != NULL, 0)) \
3612 for (i = 0; i < SBC_MAX; ++i) \
3613 if (ctype_func (i)) \
3614 bitset_set (sbcset, trans[i]); \
3616 else \
3618 for (i = 0; i < SBC_MAX; ++i) \
3619 if (ctype_func (i)) \
3620 bitset_set (sbcset, i); \
3622 } while (0)
3624 if (strcmp (name, "alnum") == 0)
3625 BUILD_CHARCLASS_LOOP (isalnum);
3626 else if (strcmp (name, "cntrl") == 0)
3627 BUILD_CHARCLASS_LOOP (iscntrl);
3628 else if (strcmp (name, "lower") == 0)
3629 BUILD_CHARCLASS_LOOP (islower);
3630 else if (strcmp (name, "space") == 0)
3631 BUILD_CHARCLASS_LOOP (isspace);
3632 else if (strcmp (name, "alpha") == 0)
3633 BUILD_CHARCLASS_LOOP (isalpha);
3634 else if (strcmp (name, "digit") == 0)
3635 BUILD_CHARCLASS_LOOP (isdigit);
3636 else if (strcmp (name, "print") == 0)
3637 BUILD_CHARCLASS_LOOP (isprint);
3638 else if (strcmp (name, "upper") == 0)
3639 BUILD_CHARCLASS_LOOP (isupper);
3640 else if (strcmp (name, "blank") == 0)
3641 BUILD_CHARCLASS_LOOP (isblank);
3642 else if (strcmp (name, "graph") == 0)
3643 BUILD_CHARCLASS_LOOP (isgraph);
3644 else if (strcmp (name, "punct") == 0)
3645 BUILD_CHARCLASS_LOOP (ispunct);
3646 else if (strcmp (name, "xdigit") == 0)
3647 BUILD_CHARCLASS_LOOP (isxdigit);
3648 else
3649 return REG_ECTYPE;
3651 return REG_NOERROR;
3654 static bin_tree_t *
3655 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3656 const char *class_name,
3657 const char *extra, bool non_match,
3658 reg_errcode_t *err)
3660 re_bitset_ptr_t sbcset;
3661 #ifdef RE_ENABLE_I18N
3662 re_charset_t *mbcset;
3663 Idx alloc = 0;
3664 #endif /* not RE_ENABLE_I18N */
3665 reg_errcode_t ret;
3666 re_token_t br_token;
3667 bin_tree_t *tree;
3669 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3670 if (BE (sbcset == NULL, 0))
3672 *err = REG_ESPACE;
3673 return NULL;
3675 #ifdef RE_ENABLE_I18N
3676 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3677 if (BE (mbcset == NULL, 0))
3679 re_free (sbcset);
3680 *err = REG_ESPACE;
3681 return NULL;
3683 mbcset->non_match = non_match;
3684 #endif /* RE_ENABLE_I18N */
3686 /* We don't care the syntax in this case. */
3687 ret = build_charclass (trans, sbcset,
3688 #ifdef RE_ENABLE_I18N
3689 mbcset, &alloc,
3690 #endif /* RE_ENABLE_I18N */
3691 class_name, 0);
3693 if (BE (ret != REG_NOERROR, 0))
3695 re_free (sbcset);
3696 #ifdef RE_ENABLE_I18N
3697 free_charset (mbcset);
3698 #endif /* RE_ENABLE_I18N */
3699 *err = ret;
3700 return NULL;
3702 /* \w match '_' also. */
3703 for (; *extra; extra++)
3704 bitset_set (sbcset, *extra);
3706 /* If it is non-matching list. */
3707 if (non_match)
3708 bitset_not (sbcset);
3710 #ifdef RE_ENABLE_I18N
3711 /* Ensure only single byte characters are set. */
3712 if (dfa->mb_cur_max > 1)
3713 bitset_mask (sbcset, dfa->sb_char);
3714 #endif
3716 /* Build a tree for simple bracket. */
3717 #if defined GCC_LINT || defined lint
3718 memset (&br_token, 0, sizeof br_token);
3719 #endif
3720 br_token.type = SIMPLE_BRACKET;
3721 br_token.opr.sbcset = sbcset;
3722 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3723 if (BE (tree == NULL, 0))
3724 goto build_word_op_espace;
3726 #ifdef RE_ENABLE_I18N
3727 if (dfa->mb_cur_max > 1)
3729 bin_tree_t *mbc_tree;
3730 /* Build a tree for complex bracket. */
3731 br_token.type = COMPLEX_BRACKET;
3732 br_token.opr.mbcset = mbcset;
3733 dfa->has_mb_node = 1;
3734 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3735 if (BE (mbc_tree == NULL, 0))
3736 goto build_word_op_espace;
3737 /* Then join them by ALT node. */
3738 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3739 if (BE (mbc_tree != NULL, 1))
3740 return tree;
3742 else
3744 free_charset (mbcset);
3745 return tree;
3747 #else /* not RE_ENABLE_I18N */
3748 return tree;
3749 #endif /* not RE_ENABLE_I18N */
3751 build_word_op_espace:
3752 re_free (sbcset);
3753 #ifdef RE_ENABLE_I18N
3754 free_charset (mbcset);
3755 #endif /* RE_ENABLE_I18N */
3756 *err = REG_ESPACE;
3757 return NULL;
3760 /* This is intended for the expressions like "a{1,3}".
3761 Fetch a number from 'input', and return the number.
3762 Return -1 if the number field is empty like "{,1}".
3763 Return RE_DUP_MAX + 1 if the number field is too large.
3764 Return -2 if an error occurred. */
3766 static Idx
3767 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3769 Idx num = -1;
3770 unsigned char c;
3771 while (1)
3773 fetch_token (token, input, syntax);
3774 c = token->opr.c;
3775 if (BE (token->type == END_OF_RE, 0))
3776 return -2;
3777 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3778 break;
3779 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3780 ? -2
3781 : num == -1
3782 ? c - '0'
3783 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3785 return num;
3788 #ifdef RE_ENABLE_I18N
3789 static void
3790 free_charset (re_charset_t *cset)
3792 re_free (cset->mbchars);
3793 # ifdef _LIBC
3794 re_free (cset->coll_syms);
3795 re_free (cset->equiv_classes);
3796 # endif
3797 re_free (cset->range_starts);
3798 re_free (cset->range_ends);
3799 re_free (cset->char_classes);
3800 re_free (cset);
3802 #endif /* RE_ENABLE_I18N */
3804 /* Functions for binary tree operation. */
3806 /* Create a tree node. */
3808 static bin_tree_t *
3809 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3810 re_token_type_t type)
3812 re_token_t t;
3813 #if defined GCC_LINT || defined lint
3814 memset (&t, 0, sizeof t);
3815 #endif
3816 t.type = type;
3817 return create_token_tree (dfa, left, right, &t);
3820 static bin_tree_t *
3821 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3822 const re_token_t *token)
3824 bin_tree_t *tree;
3825 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3827 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3829 if (storage == NULL)
3830 return NULL;
3831 storage->next = dfa->str_tree_storage;
3832 dfa->str_tree_storage = storage;
3833 dfa->str_tree_storage_idx = 0;
3835 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3837 tree->parent = NULL;
3838 tree->left = left;
3839 tree->right = right;
3840 tree->token = *token;
3841 tree->token.duplicated = 0;
3842 tree->token.opt_subexp = 0;
3843 tree->first = NULL;
3844 tree->next = NULL;
3845 tree->node_idx = -1;
3847 if (left != NULL)
3848 left->parent = tree;
3849 if (right != NULL)
3850 right->parent = tree;
3851 return tree;
3854 /* Mark the tree SRC as an optional subexpression.
3855 To be called from preorder or postorder. */
3857 static reg_errcode_t
3858 mark_opt_subexp (void *extra, bin_tree_t *node)
3860 Idx idx = (uintptr_t) extra;
3861 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3862 node->token.opt_subexp = 1;
3864 return REG_NOERROR;
3867 /* Free the allocated memory inside NODE. */
3869 static void
3870 free_token (re_token_t *node)
3872 #ifdef RE_ENABLE_I18N
3873 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3874 free_charset (node->opr.mbcset);
3875 else
3876 #endif /* RE_ENABLE_I18N */
3877 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3878 re_free (node->opr.sbcset);
3881 /* Worker function for tree walking. Free the allocated memory inside NODE
3882 and its children. */
3884 static reg_errcode_t
3885 free_tree (void *extra, bin_tree_t *node)
3887 free_token (&node->token);
3888 return REG_NOERROR;
3892 /* Duplicate the node SRC, and return new node. This is a preorder
3893 visit similar to the one implemented by the generic visitor, but
3894 we need more infrastructure to maintain two parallel trees --- so,
3895 it's easier to duplicate. */
3897 static bin_tree_t *
3898 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3900 const bin_tree_t *node;
3901 bin_tree_t *dup_root;
3902 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3904 for (node = root; ; )
3906 /* Create a new tree and link it back to the current parent. */
3907 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3908 if (*p_new == NULL)
3909 return NULL;
3910 (*p_new)->parent = dup_node;
3911 (*p_new)->token.duplicated = 1;
3912 dup_node = *p_new;
3914 /* Go to the left node, or up and to the right. */
3915 if (node->left)
3917 node = node->left;
3918 p_new = &dup_node->left;
3920 else
3922 const bin_tree_t *prev = NULL;
3923 while (node->right == prev || node->right == NULL)
3925 prev = node;
3926 node = node->parent;
3927 dup_node = dup_node->parent;
3928 if (!node)
3929 return dup_root;
3931 node = node->right;
3932 p_new = &dup_node->right;