exp2l: Work around a NetBSD 10.0/i386 bug.
[gnulib.git] / lib / regcomp.c
blob696cf813f3c5c87456cda5ad1294b5982699f50b
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
20 #ifdef _LIBC
21 # include <locale/weight.h>
22 #endif
24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 size_t length, reg_syntax_t syntax);
26 static void re_compile_fastmap_iter (regex_t *bufp,
27 const re_dfastate_t *init_state,
28 char *fastmap);
29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30 static void free_charset (re_charset_t *cset);
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 static void optimize_utf8 (re_dfa_t *dfa);
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
37 void *extra);
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 bin_tree_t *node);
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 Idx node, bool root);
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static Idx fetch_number (re_string_t *input, re_token_t *token,
56 reg_syntax_t syntax);
57 static int peek_token (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax);
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 Idx nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
78 reg_errcode_t *err);
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 re_string_t *regexp,
81 re_token_t *token, int token_len,
82 re_dfa_t *dfa,
83 reg_syntax_t syntax,
84 bool accept_hyphen);
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86 re_string_t *regexp,
87 re_token_t *token);
88 static reg_errcode_t build_equiv_class (bitset_t sbcset,
89 re_charset_t *mbcset,
90 Idx *equiv_class_alloc,
91 const unsigned char *name);
92 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
93 bitset_t sbcset,
94 re_charset_t *mbcset,
95 Idx *char_class_alloc,
96 const char *class_name,
97 reg_syntax_t syntax);
98 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
99 RE_TRANSLATE_TYPE trans,
100 const char *class_name,
101 const char *extra,
102 bool non_match, reg_errcode_t *err);
103 static bin_tree_t *create_tree (re_dfa_t *dfa,
104 bin_tree_t *left, bin_tree_t *right,
105 re_token_type_t type);
106 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
107 bin_tree_t *left, bin_tree_t *right,
108 const re_token_t *token);
109 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
110 static void free_token (re_token_t *node);
111 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
112 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
114 /* This table gives an error message for each of the error codes listed
115 in regex.h. Obviously the order here has to be same as there.
116 POSIX doesn't require that we do anything for REG_NOERROR,
117 but why not be nice? */
119 static const char __re_error_msgid[] =
121 #define REG_NOERROR_IDX 0
122 gettext_noop ("Success") /* REG_NOERROR */
123 "\0"
124 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
125 gettext_noop ("No match") /* REG_NOMATCH */
126 "\0"
127 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
128 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
129 "\0"
130 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
131 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
132 "\0"
133 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
134 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
135 "\0"
136 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
137 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
138 "\0"
139 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
140 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
141 "\0"
142 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
143 gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
144 "\0"
145 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
146 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
147 "\0"
148 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
149 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
150 "\0"
151 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
152 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
153 "\0"
154 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
155 gettext_noop ("Invalid range end") /* REG_ERANGE */
156 "\0"
157 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
158 gettext_noop ("Memory exhausted") /* REG_ESPACE */
159 "\0"
160 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
161 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
162 "\0"
163 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
164 gettext_noop ("Premature end of regular expression") /* REG_EEND */
165 "\0"
166 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
167 gettext_noop ("Regular expression too big") /* REG_ESIZE */
168 "\0"
169 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
170 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
173 static const size_t __re_error_msgid_idx[] =
175 REG_NOERROR_IDX,
176 REG_NOMATCH_IDX,
177 REG_BADPAT_IDX,
178 REG_ECOLLATE_IDX,
179 REG_ECTYPE_IDX,
180 REG_EESCAPE_IDX,
181 REG_ESUBREG_IDX,
182 REG_EBRACK_IDX,
183 REG_EPAREN_IDX,
184 REG_EBRACE_IDX,
185 REG_BADBR_IDX,
186 REG_ERANGE_IDX,
187 REG_ESPACE_IDX,
188 REG_BADRPT_IDX,
189 REG_EEND_IDX,
190 REG_ESIZE_IDX,
191 REG_ERPAREN_IDX
194 /* Entry points for GNU code. */
196 /* re_compile_pattern is the GNU regular expression compiler: it
197 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
198 Returns 0 if the pattern was valid, otherwise an error string.
200 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
201 are set in BUFP on entry. */
203 const char *
204 re_compile_pattern (const char *pattern, size_t length,
205 struct re_pattern_buffer *bufp)
207 reg_errcode_t ret;
209 /* And GNU code determines whether or not to get register information
210 by passing null for the REGS argument to re_match, etc., not by
211 setting no_sub, unless RE_NO_SUB is set. */
212 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
214 /* Match anchors at newline. */
215 bufp->newline_anchor = 1;
217 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
219 if (!ret)
220 return NULL;
221 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
223 weak_alias (__re_compile_pattern, re_compile_pattern)
225 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
226 also be assigned to arbitrarily: each pattern buffer stores its own
227 syntax, so it can be changed between regex compilations. */
228 /* This has no initializer because initialized variables in Emacs
229 become read-only after dumping. */
230 reg_syntax_t re_syntax_options;
233 /* Specify the precise syntax of regexps for compilation. This provides
234 for compatibility for various utilities which historically have
235 different, incompatible syntaxes.
237 The argument SYNTAX is a bit mask comprised of the various bits
238 defined in regex.h. We return the old syntax. */
240 reg_syntax_t
241 re_set_syntax (reg_syntax_t syntax)
243 reg_syntax_t ret = re_syntax_options;
245 re_syntax_options = syntax;
246 return ret;
248 weak_alias (__re_set_syntax, re_set_syntax)
251 re_compile_fastmap (struct re_pattern_buffer *bufp)
253 re_dfa_t *dfa = bufp->buffer;
254 char *fastmap = bufp->fastmap;
256 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
257 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
258 if (dfa->init_state != dfa->init_state_word)
259 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
260 if (dfa->init_state != dfa->init_state_nl)
261 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
262 if (dfa->init_state != dfa->init_state_begbuf)
263 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
264 bufp->fastmap_accurate = 1;
265 return 0;
267 weak_alias (__re_compile_fastmap, re_compile_fastmap)
269 static __always_inline void
270 re_set_fastmap (char *fastmap, bool icase, int ch)
272 fastmap[ch] = 1;
273 if (icase)
274 fastmap[tolower (ch)] = 1;
277 /* Helper function for re_compile_fastmap.
278 Compile fastmap for the initial_state INIT_STATE. */
280 static void
281 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
282 char *fastmap)
284 re_dfa_t *dfa = bufp->buffer;
285 Idx node_cnt;
286 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
287 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
289 Idx node = init_state->nodes.elems[node_cnt];
290 re_token_type_t type = dfa->nodes[node].type;
292 if (type == CHARACTER)
294 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
295 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
297 unsigned char buf[MB_LEN_MAX];
298 unsigned char *p;
299 wchar_t wc;
300 mbstate_t state;
302 p = buf;
303 *p++ = dfa->nodes[node].opr.c;
304 while (++node < dfa->nodes_len
305 && dfa->nodes[node].type == CHARACTER
306 && dfa->nodes[node].mb_partial)
307 *p++ = dfa->nodes[node].opr.c;
308 memset (&state, '\0', sizeof (state));
309 if (__mbrtowc (&wc, (const char *) buf, p - buf,
310 &state) == p - buf
311 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
312 != (size_t) -1))
313 re_set_fastmap (fastmap, false, buf[0]);
316 else if (type == SIMPLE_BRACKET)
318 int i, ch;
319 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
321 int j;
322 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
323 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
324 if (w & ((bitset_word_t) 1 << j))
325 re_set_fastmap (fastmap, icase, ch);
328 else if (type == COMPLEX_BRACKET)
330 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
331 Idx i;
333 #ifdef _LIBC
334 /* See if we have to try all bytes which start multiple collation
335 elements.
336 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
337 collation element, and don't catch 'b' since 'b' is
338 the only collation element which starts from 'b' (and
339 it is caught by SIMPLE_BRACKET). */
340 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
341 && (cset->ncoll_syms || cset->nranges))
343 const int32_t *table = (const int32_t *)
344 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
345 for (i = 0; i < SBC_MAX; ++i)
346 if (table[i] < 0)
347 re_set_fastmap (fastmap, icase, i);
349 #endif /* _LIBC */
351 /* See if we have to start the match at all multibyte characters,
352 i.e. where we would not find an invalid sequence. This only
353 applies to multibyte character sets; for single byte character
354 sets, the SIMPLE_BRACKET again suffices. */
355 if (dfa->mb_cur_max > 1
356 && (cset->nchar_classes || cset->non_match || cset->nranges
357 #ifdef _LIBC
358 || cset->nequiv_classes
359 #endif /* _LIBC */
362 unsigned char c = 0;
365 mbstate_t mbs;
366 memset (&mbs, 0, sizeof (mbs));
367 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
368 re_set_fastmap (fastmap, false, (int) c);
370 while (++c != 0);
373 else
375 /* ... Else catch all bytes which can start the mbchars. */
376 for (i = 0; i < cset->nmbchars; ++i)
378 char buf[256];
379 mbstate_t state;
380 memset (&state, '\0', sizeof (state));
381 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
382 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
383 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
385 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
386 != (size_t) -1)
387 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
392 else if (type == OP_PERIOD || type == OP_UTF8_PERIOD || type == END_OF_RE)
394 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
395 if (type == END_OF_RE)
396 bufp->can_be_null = 1;
397 return;
402 /* Entry point for POSIX code. */
403 /* regcomp takes a regular expression as a string and compiles it.
405 PREG is a regex_t *. We do not expect any fields to be initialized,
406 since POSIX says we shouldn't. Thus, we set
408 'buffer' to the compiled pattern;
409 'used' to the length of the compiled pattern;
410 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
411 REG_EXTENDED bit in CFLAGS is set; otherwise, to
412 RE_SYNTAX_POSIX_BASIC;
413 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
414 'fastmap' to an allocated space for the fastmap;
415 'fastmap_accurate' to zero;
416 're_nsub' to the number of subexpressions in PATTERN.
418 PATTERN is the address of the pattern string.
420 CFLAGS is a series of bits which affect compilation.
422 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
423 use POSIX basic syntax.
425 If REG_NEWLINE is set, then . and [^...] don't match newline.
426 Also, regexec will try a match beginning after every newline.
428 If REG_ICASE is set, then we considers upper- and lowercase
429 versions of letters to be equivalent when matching.
431 If REG_NOSUB is set, then when PREG is passed to regexec, that
432 routine will report only success or failure, and nothing about the
433 registers.
435 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
436 the return codes and their meanings.) */
439 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
441 reg_errcode_t ret;
442 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
443 : RE_SYNTAX_POSIX_BASIC);
445 preg->buffer = NULL;
446 preg->allocated = 0;
447 preg->used = 0;
449 /* Try to allocate space for the fastmap. */
450 preg->fastmap = re_malloc (char, SBC_MAX);
451 if (__glibc_unlikely (preg->fastmap == NULL))
452 return REG_ESPACE;
454 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
456 /* If REG_NEWLINE is set, newlines are treated differently. */
457 if (cflags & REG_NEWLINE)
458 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
459 syntax &= ~RE_DOT_NEWLINE;
460 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
461 /* It also changes the matching behavior. */
462 preg->newline_anchor = 1;
464 else
465 preg->newline_anchor = 0;
466 preg->no_sub = !!(cflags & REG_NOSUB);
467 preg->translate = NULL;
469 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
471 /* POSIX doesn't distinguish between an unmatched open-group and an
472 unmatched close-group: both are REG_EPAREN. */
473 if (ret == REG_ERPAREN)
474 ret = REG_EPAREN;
476 /* We have already checked preg->fastmap != NULL. */
477 if (__glibc_likely (ret == REG_NOERROR))
478 /* Compute the fastmap now, since regexec cannot modify the pattern
479 buffer. This function never fails in this implementation. */
480 (void) re_compile_fastmap (preg);
481 else
483 /* Some error occurred while compiling the expression. */
484 re_free (preg->fastmap);
485 preg->fastmap = NULL;
488 return (int) ret;
490 libc_hidden_def (__regcomp)
491 weak_alias (__regcomp, regcomp)
493 /* Returns a message corresponding to an error code, ERRCODE, returned
494 from either regcomp or regexec. We don't use PREG here. */
496 size_t
497 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
498 size_t errbuf_size)
500 const char *msg;
501 size_t msg_size;
502 int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
504 if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
505 /* Only error codes returned by the rest of the code should be passed
506 to this routine. If we are given anything else, or if other regex
507 code generates an invalid error code, then the program has a bug.
508 Dump core so we can fix it. */
509 abort ();
511 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
513 msg_size = strlen (msg) + 1; /* Includes the null. */
515 if (__glibc_likely (errbuf_size != 0))
517 size_t cpy_size = msg_size;
518 if (__glibc_unlikely (msg_size > errbuf_size))
520 cpy_size = errbuf_size - 1;
521 errbuf[cpy_size] = '\0';
523 memcpy (errbuf, msg, cpy_size);
526 return msg_size;
528 weak_alias (__regerror, regerror)
531 /* This static array is used for the map to single-byte characters when
532 UTF-8 is used. Otherwise we would allocate memory just to initialize
533 it the same all the time. UTF-8 is the preferred encoding so this is
534 a worthwhile optimization. */
535 static const bitset_t utf8_sb_map =
537 /* Set the first 128 bits. */
538 #if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
539 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
540 #else
541 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
542 # error "bitset_word_t is narrower than 32 bits"
543 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
544 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
545 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
546 BITSET_WORD_MAX, BITSET_WORD_MAX,
547 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
548 BITSET_WORD_MAX,
549 # endif
550 (BITSET_WORD_MAX
551 >> (SBC_MAX % BITSET_WORD_BITS == 0
553 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
554 #endif
558 static void
559 free_dfa_content (re_dfa_t *dfa)
561 Idx i, j;
563 if (dfa->nodes)
564 for (i = 0; i < dfa->nodes_len; ++i)
565 free_token (dfa->nodes + i);
566 re_free (dfa->nexts);
567 for (i = 0; i < dfa->nodes_len; ++i)
569 if (dfa->eclosures != NULL)
570 re_node_set_free (dfa->eclosures + i);
571 if (dfa->inveclosures != NULL)
572 re_node_set_free (dfa->inveclosures + i);
573 if (dfa->edests != NULL)
574 re_node_set_free (dfa->edests + i);
576 re_free (dfa->edests);
577 re_free (dfa->eclosures);
578 re_free (dfa->inveclosures);
579 re_free (dfa->nodes);
581 if (dfa->state_table)
582 for (i = 0; i <= dfa->state_hash_mask; ++i)
584 struct re_state_table_entry *entry = dfa->state_table + i;
585 for (j = 0; j < entry->num; ++j)
587 re_dfastate_t *state = entry->array[j];
588 free_state (state);
590 re_free (entry->array);
592 re_free (dfa->state_table);
593 if (dfa->sb_char != utf8_sb_map)
594 re_free (dfa->sb_char);
595 re_free (dfa->subexp_map);
596 #ifdef DEBUG
597 re_free (dfa->re_str);
598 #endif
600 re_free (dfa);
604 /* Free dynamically allocated space used by PREG. */
606 void
607 regfree (regex_t *preg)
609 re_dfa_t *dfa = preg->buffer;
610 if (__glibc_likely (dfa != NULL))
612 lock_fini (dfa->lock);
613 free_dfa_content (dfa);
615 preg->buffer = NULL;
616 preg->allocated = 0;
618 re_free (preg->fastmap);
619 preg->fastmap = NULL;
621 re_free (preg->translate);
622 preg->translate = NULL;
624 libc_hidden_def (__regfree)
625 weak_alias (__regfree, regfree)
627 /* Entry points compatible with 4.2 BSD regex library. We don't define
628 them unless specifically requested. */
630 #if defined _REGEX_RE_COMP || defined _LIBC
632 /* BSD has one and only one pattern buffer. */
633 static struct re_pattern_buffer re_comp_buf;
635 char *
636 # ifdef _LIBC
637 /* Make these definitions weak in libc, so POSIX programs can redefine
638 these names if they don't use our functions, and still use
639 regcomp/regexec above without link errors. */
640 weak_function
641 # endif
642 re_comp (const char *s)
644 reg_errcode_t ret;
645 char *fastmap;
647 if (!s)
649 if (!re_comp_buf.buffer)
650 return gettext ("No previous regular expression");
651 return 0;
654 if (re_comp_buf.buffer)
656 fastmap = re_comp_buf.fastmap;
657 re_comp_buf.fastmap = NULL;
658 __regfree (&re_comp_buf);
659 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
660 re_comp_buf.fastmap = fastmap;
663 if (re_comp_buf.fastmap == NULL)
665 re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
666 if (re_comp_buf.fastmap == NULL)
667 return (char *) gettext (__re_error_msgid
668 + __re_error_msgid_idx[(int) REG_ESPACE]);
671 /* Since 're_exec' always passes NULL for the 'regs' argument, we
672 don't need to initialize the pattern buffer fields which affect it. */
674 /* Match anchors at newlines. */
675 re_comp_buf.newline_anchor = 1;
677 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
679 if (!ret)
680 return NULL;
682 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
683 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
686 #ifdef _LIBC
687 libc_freeres_fn (free_mem)
689 __regfree (&re_comp_buf);
691 #endif
693 #endif /* _REGEX_RE_COMP */
695 /* Internal entry point.
696 Compile the regular expression PATTERN, whose length is LENGTH.
697 SYNTAX indicate regular expression's syntax. */
699 static reg_errcode_t
700 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
701 reg_syntax_t syntax)
703 reg_errcode_t err = REG_NOERROR;
704 re_dfa_t *dfa;
705 re_string_t regexp;
707 /* Initialize the pattern buffer. */
708 preg->fastmap_accurate = 0;
709 preg->syntax = syntax;
710 preg->not_bol = preg->not_eol = 0;
711 preg->used = 0;
712 preg->re_nsub = 0;
713 preg->can_be_null = 0;
714 preg->regs_allocated = REGS_UNALLOCATED;
716 /* Initialize the dfa. */
717 dfa = preg->buffer;
718 if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
720 /* If zero allocated, but buffer is non-null, try to realloc
721 enough space. This loses if buffer's address is bogus, but
722 that is the user's responsibility. If ->buffer is NULL this
723 is a simple allocation. */
724 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
725 if (dfa == NULL)
726 return REG_ESPACE;
727 preg->allocated = sizeof (re_dfa_t);
728 preg->buffer = dfa;
730 preg->used = sizeof (re_dfa_t);
732 err = init_dfa (dfa, length);
733 if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
734 err = REG_ESPACE;
735 if (__glibc_unlikely (err != REG_NOERROR))
737 free_dfa_content (dfa);
738 preg->buffer = NULL;
739 preg->allocated = 0;
740 return err;
742 #ifdef DEBUG
743 /* Note: length+1 will not overflow since it is checked in init_dfa. */
744 dfa->re_str = re_malloc (char, length + 1);
745 strncpy (dfa->re_str, pattern, length + 1);
746 #endif
748 err = re_string_construct (&regexp, pattern, length, preg->translate,
749 (syntax & RE_ICASE) != 0, dfa);
750 if (__glibc_unlikely (err != REG_NOERROR))
752 re_compile_internal_free_return:
753 free_workarea_compile (preg);
754 re_string_destruct (&regexp);
755 lock_fini (dfa->lock);
756 free_dfa_content (dfa);
757 preg->buffer = NULL;
758 preg->allocated = 0;
759 return err;
762 /* Parse the regular expression, and build a structure tree. */
763 preg->re_nsub = 0;
764 dfa->str_tree = parse (&regexp, preg, syntax, &err);
765 if (__glibc_unlikely (dfa->str_tree == NULL))
766 goto re_compile_internal_free_return;
768 /* Analyze the tree and create the nfa. */
769 err = analyze (preg);
770 if (__glibc_unlikely (err != REG_NOERROR))
771 goto re_compile_internal_free_return;
773 /* If possible, do searching in single byte encoding to speed things up. */
774 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
775 optimize_utf8 (dfa);
777 /* Then create the initial state of the dfa. */
778 err = create_initial_state (dfa);
780 /* Release work areas. */
781 free_workarea_compile (preg);
782 re_string_destruct (&regexp);
784 if (__glibc_unlikely (err != REG_NOERROR))
786 lock_fini (dfa->lock);
787 free_dfa_content (dfa);
788 preg->buffer = NULL;
789 preg->allocated = 0;
792 return err;
795 /* Initialize DFA. We use the length of the regular expression PAT_LEN
796 as the initial length of some arrays. */
798 static reg_errcode_t
799 init_dfa (re_dfa_t *dfa, size_t pat_len)
801 __re_size_t table_size;
802 #ifndef _LIBC
803 const char *codeset_name;
804 #endif
805 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
806 size_t max_object_size =
807 MAX (sizeof (struct re_state_table_entry),
808 MAX (sizeof (re_token_t),
809 MAX (sizeof (re_node_set),
810 MAX (sizeof (regmatch_t),
811 max_i18n_object_size))));
813 memset (dfa, '\0', sizeof (re_dfa_t));
815 /* Force allocation of str_tree_storage the first time. */
816 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
818 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
819 calculation below, and for similar doubling calculations
820 elsewhere. And it's <= rather than <, because some of the
821 doubling calculations add 1 afterwards. */
822 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
823 <= pat_len))
824 return REG_ESPACE;
826 dfa->nodes_alloc = pat_len + 1;
827 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
829 /* table_size = 2 ^ ceil(log pat_len) */
830 for (table_size = 1; ; table_size <<= 1)
831 if (table_size > pat_len)
832 break;
834 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
835 dfa->state_hash_mask = table_size - 1;
837 dfa->mb_cur_max = MB_CUR_MAX;
838 #ifdef _LIBC
839 if (dfa->mb_cur_max == 6
840 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
841 dfa->is_utf8 = 1;
842 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
843 != 0);
844 #else
845 codeset_name = nl_langinfo (CODESET);
846 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
847 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
848 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
849 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
850 dfa->is_utf8 = 1;
852 /* We check exhaustively in the loop below if this charset is a
853 superset of ASCII. */
854 dfa->map_notascii = 0;
855 #endif
857 if (dfa->mb_cur_max > 1)
859 if (dfa->is_utf8)
860 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
861 else
863 int i, j, ch;
865 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
866 if (__glibc_unlikely (dfa->sb_char == NULL))
867 return REG_ESPACE;
869 /* Set the bits corresponding to single byte chars. */
870 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
871 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
873 wint_t wch = __btowc (ch);
874 if (wch != WEOF)
875 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
876 #ifndef _LIBC
877 if (isascii (ch) && wch != ch)
878 dfa->map_notascii = 1;
879 #endif
884 if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
885 return REG_ESPACE;
886 return REG_NOERROR;
889 /* Initialize WORD_CHAR table, which indicate which character is
890 "word". In this case "word" means that it is the word construction
891 character used by some operators like "\<", "\>", etc. */
893 static void
894 init_word_char (re_dfa_t *dfa)
896 int i = 0;
897 int j;
898 int ch = 0;
899 dfa->word_ops_used = 1;
900 if (__glibc_likely (dfa->map_notascii == 0))
902 bitset_word_t bits0 = 0x00000000;
903 bitset_word_t bits1 = 0x03ff0000;
904 bitset_word_t bits2 = 0x87fffffe;
905 bitset_word_t bits3 = 0x07fffffe;
906 if (BITSET_WORD_BITS == 64)
908 /* Pacify gcc -Woverflow on 32-bit platforms. */
909 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
910 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
911 i = 2;
913 else if (BITSET_WORD_BITS == 32)
915 dfa->word_char[0] = bits0;
916 dfa->word_char[1] = bits1;
917 dfa->word_char[2] = bits2;
918 dfa->word_char[3] = bits3;
919 i = 4;
921 else
922 goto general_case;
923 ch = 128;
925 if (__glibc_likely (dfa->is_utf8))
927 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
928 return;
932 general_case:
933 for (; i < BITSET_WORDS; ++i)
934 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
935 if (isalnum (ch) || ch == '_')
936 dfa->word_char[i] |= (bitset_word_t) 1 << j;
939 /* Free the work area which are only used while compiling. */
941 static void
942 free_workarea_compile (regex_t *preg)
944 re_dfa_t *dfa = preg->buffer;
945 bin_tree_storage_t *storage, *next;
946 for (storage = dfa->str_tree_storage; storage; storage = next)
948 next = storage->next;
949 re_free (storage);
951 dfa->str_tree_storage = NULL;
952 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
953 dfa->str_tree = NULL;
954 re_free (dfa->org_indices);
955 dfa->org_indices = NULL;
958 /* Create initial states for all contexts. */
960 static reg_errcode_t
961 create_initial_state (re_dfa_t *dfa)
963 Idx first, i;
964 reg_errcode_t err;
965 re_node_set init_nodes;
967 /* Initial states have the epsilon closure of the node which is
968 the first node of the regular expression. */
969 first = dfa->str_tree->first->node_idx;
970 dfa->init_node = first;
971 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
972 if (__glibc_unlikely (err != REG_NOERROR))
973 return err;
975 /* The back-references which are in initial states can epsilon transit,
976 since in this case all of the subexpressions can be null.
977 Then we add epsilon closures of the nodes which are the next nodes of
978 the back-references. */
979 if (dfa->nbackref > 0)
980 for (i = 0; i < init_nodes.nelem; ++i)
982 Idx node_idx = init_nodes.elems[i];
983 re_token_type_t type = dfa->nodes[node_idx].type;
985 Idx clexp_idx;
986 if (type != OP_BACK_REF)
987 continue;
988 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
990 re_token_t *clexp_node;
991 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
992 if (clexp_node->type == OP_CLOSE_SUBEXP
993 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
994 break;
996 if (clexp_idx == init_nodes.nelem)
997 continue;
999 if (type == OP_BACK_REF)
1001 Idx dest_idx = dfa->edests[node_idx].elems[0];
1002 if (!re_node_set_contains (&init_nodes, dest_idx))
1004 reg_errcode_t merge_err
1005 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1006 if (merge_err != REG_NOERROR)
1007 return merge_err;
1008 i = 0;
1013 /* It must be the first time to invoke acquire_state. */
1014 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1015 /* We don't check ERR here, since the initial state must not be NULL. */
1016 if (__glibc_unlikely (dfa->init_state == NULL))
1017 return err;
1018 if (dfa->init_state->has_constraint)
1020 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1021 CONTEXT_WORD);
1022 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1023 CONTEXT_NEWLINE);
1024 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1025 &init_nodes,
1026 CONTEXT_NEWLINE
1027 | CONTEXT_BEGBUF);
1028 if (__glibc_unlikely (dfa->init_state_word == NULL
1029 || dfa->init_state_nl == NULL
1030 || dfa->init_state_begbuf == NULL))
1031 return err;
1033 else
1034 dfa->init_state_word = dfa->init_state_nl
1035 = dfa->init_state_begbuf = dfa->init_state;
1037 re_node_set_free (&init_nodes);
1038 return REG_NOERROR;
1041 /* If it is possible to do searching in single byte encoding instead of UTF-8
1042 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1043 DFA nodes where needed. */
1045 static void
1046 optimize_utf8 (re_dfa_t *dfa)
1048 Idx node;
1049 int i;
1050 bool mb_chars = false;
1051 bool has_period = false;
1053 for (node = 0; node < dfa->nodes_len; ++node)
1054 switch (dfa->nodes[node].type)
1056 case CHARACTER:
1057 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1058 mb_chars = true;
1059 break;
1060 case ANCHOR:
1061 switch (dfa->nodes[node].opr.ctx_type)
1063 case LINE_FIRST:
1064 case LINE_LAST:
1065 case BUF_FIRST:
1066 case BUF_LAST:
1067 break;
1068 default:
1069 /* Word anchors etc. cannot be handled. It's okay to test
1070 opr.ctx_type since constraints (for all DFA nodes) are
1071 created by ORing one or more opr.ctx_type values. */
1072 return;
1074 break;
1075 case OP_PERIOD:
1076 has_period = true;
1077 break;
1078 case OP_BACK_REF:
1079 case OP_ALT:
1080 case END_OF_RE:
1081 case OP_DUP_ASTERISK:
1082 case OP_OPEN_SUBEXP:
1083 case OP_CLOSE_SUBEXP:
1084 break;
1085 case COMPLEX_BRACKET:
1086 return;
1087 case SIMPLE_BRACKET:
1088 /* Just double check. */
1090 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1092 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1093 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1095 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1096 return;
1097 rshift = 0;
1100 break;
1101 default:
1102 abort ();
1105 if (mb_chars || has_period)
1106 for (node = 0; node < dfa->nodes_len; ++node)
1108 if (dfa->nodes[node].type == CHARACTER
1109 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1110 dfa->nodes[node].mb_partial = 0;
1111 else if (dfa->nodes[node].type == OP_PERIOD)
1112 dfa->nodes[node].type = OP_UTF8_PERIOD;
1115 /* The search can be in single byte locale. */
1116 dfa->mb_cur_max = 1;
1117 dfa->is_utf8 = 0;
1118 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1121 /* Analyze the structure tree, and calculate "first", "next", "edest",
1122 "eclosure", and "inveclosure". */
1124 static reg_errcode_t
1125 analyze (regex_t *preg)
1127 re_dfa_t *dfa = preg->buffer;
1128 reg_errcode_t ret;
1130 /* Allocate arrays. */
1131 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1132 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1133 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1134 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1135 if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1136 || dfa->edests == NULL || dfa->eclosures == NULL))
1137 return REG_ESPACE;
1139 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1140 if (dfa->subexp_map != NULL)
1142 Idx i;
1143 for (i = 0; i < preg->re_nsub; i++)
1144 dfa->subexp_map[i] = i;
1145 preorder (dfa->str_tree, optimize_subexps, dfa);
1146 for (i = 0; i < preg->re_nsub; i++)
1147 if (dfa->subexp_map[i] != i)
1148 break;
1149 if (i == preg->re_nsub)
1151 re_free (dfa->subexp_map);
1152 dfa->subexp_map = NULL;
1156 ret = postorder (dfa->str_tree, lower_subexps, preg);
1157 if (__glibc_unlikely (ret != REG_NOERROR))
1158 return ret;
1159 ret = postorder (dfa->str_tree, calc_first, dfa);
1160 if (__glibc_unlikely (ret != REG_NOERROR))
1161 return ret;
1162 preorder (dfa->str_tree, calc_next, dfa);
1163 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1164 if (__glibc_unlikely (ret != REG_NOERROR))
1165 return ret;
1166 ret = calc_eclosure (dfa);
1167 if (__glibc_unlikely (ret != REG_NOERROR))
1168 return ret;
1170 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1171 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1172 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1173 || dfa->nbackref)
1175 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1176 if (__glibc_unlikely (dfa->inveclosures == NULL))
1177 return REG_ESPACE;
1178 ret = calc_inveclosure (dfa);
1181 return ret;
1184 /* Our parse trees are very unbalanced, so we cannot use a stack to
1185 implement parse tree visits. Instead, we use parent pointers and
1186 some hairy code in these two functions. */
1187 static reg_errcode_t
1188 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1189 void *extra)
1191 bin_tree_t *node, *prev;
1193 for (node = root; ; )
1195 /* Descend down the tree, preferably to the left (or to the right
1196 if that's the only child). */
1197 while (node->left || node->right)
1198 if (node->left)
1199 node = node->left;
1200 else
1201 node = node->right;
1205 reg_errcode_t err = fn (extra, node);
1206 if (__glibc_unlikely (err != REG_NOERROR))
1207 return err;
1208 if (node->parent == NULL)
1209 return REG_NOERROR;
1210 prev = node;
1211 node = node->parent;
1213 /* Go up while we have a node that is reached from the right. */
1214 while (node->right == prev || node->right == NULL);
1215 node = node->right;
1219 static reg_errcode_t
1220 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1221 void *extra)
1223 bin_tree_t *node;
1225 for (node = root; ; )
1227 reg_errcode_t err = fn (extra, node);
1228 if (__glibc_unlikely (err != REG_NOERROR))
1229 return err;
1231 /* Go to the left node, or up and to the right. */
1232 if (node->left)
1233 node = node->left;
1234 else
1236 bin_tree_t *prev = NULL;
1237 while (node->right == prev || node->right == NULL)
1239 prev = node;
1240 node = node->parent;
1241 if (!node)
1242 return REG_NOERROR;
1244 node = node->right;
1249 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1250 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1251 backreferences as well. Requires a preorder visit. */
1252 static reg_errcode_t
1253 optimize_subexps (void *extra, bin_tree_t *node)
1255 re_dfa_t *dfa = (re_dfa_t *) extra;
1257 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1259 int idx = node->token.opr.idx;
1260 node->token.opr.idx = dfa->subexp_map[idx];
1261 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1264 else if (node->token.type == SUBEXP
1265 && node->left && node->left->token.type == SUBEXP)
1267 Idx other_idx = node->left->token.opr.idx;
1269 node->left = node->left->left;
1270 if (node->left)
1271 node->left->parent = node;
1273 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1274 if (other_idx < BITSET_WORD_BITS)
1275 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1278 return REG_NOERROR;
1281 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1282 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1283 static reg_errcode_t
1284 lower_subexps (void *extra, bin_tree_t *node)
1286 regex_t *preg = (regex_t *) extra;
1287 reg_errcode_t err = REG_NOERROR;
1289 if (node->left && node->left->token.type == SUBEXP)
1291 node->left = lower_subexp (&err, preg, node->left);
1292 if (node->left)
1293 node->left->parent = node;
1295 if (node->right && node->right->token.type == SUBEXP)
1297 node->right = lower_subexp (&err, preg, node->right);
1298 if (node->right)
1299 node->right->parent = node;
1302 return err;
1305 static bin_tree_t *
1306 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1308 re_dfa_t *dfa = preg->buffer;
1309 bin_tree_t *body = node->left;
1310 bin_tree_t *op, *cls, *tree1, *tree;
1312 if (preg->no_sub
1313 /* We do not optimize empty subexpressions, because otherwise we may
1314 have bad CONCAT nodes with NULL children. This is obviously not
1315 very common, so we do not lose much. An example that triggers
1316 this case is the sed "script" /\(\)/x. */
1317 && node->left != NULL
1318 && (node->token.opr.idx >= BITSET_WORD_BITS
1319 || !(dfa->used_bkref_map
1320 & ((bitset_word_t) 1 << node->token.opr.idx))))
1321 return node->left;
1323 /* Convert the SUBEXP node to the concatenation of an
1324 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1325 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1326 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1327 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1328 tree = create_tree (dfa, op, tree1, CONCAT);
1329 if (__glibc_unlikely (tree == NULL || tree1 == NULL
1330 || op == NULL || cls == NULL))
1332 *err = REG_ESPACE;
1333 return NULL;
1336 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1337 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1338 return tree;
1341 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1342 nodes. Requires a postorder visit. */
1343 static reg_errcode_t
1344 calc_first (void *extra, bin_tree_t *node)
1346 re_dfa_t *dfa = (re_dfa_t *) extra;
1347 if (node->token.type == CONCAT)
1349 node->first = node->left->first;
1350 node->node_idx = node->left->node_idx;
1352 else
1354 node->first = node;
1355 node->node_idx = re_dfa_add_node (dfa, node->token);
1356 if (__glibc_unlikely (node->node_idx == -1))
1357 return REG_ESPACE;
1358 if (node->token.type == ANCHOR)
1359 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1361 return REG_NOERROR;
1364 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1365 static reg_errcode_t
1366 calc_next (void *extra, bin_tree_t *node)
1368 switch (node->token.type)
1370 case OP_DUP_ASTERISK:
1371 node->left->next = node;
1372 break;
1373 case CONCAT:
1374 node->left->next = node->right->first;
1375 node->right->next = node->next;
1376 break;
1377 default:
1378 if (node->left)
1379 node->left->next = node->next;
1380 if (node->right)
1381 node->right->next = node->next;
1382 break;
1384 return REG_NOERROR;
1387 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1388 static reg_errcode_t
1389 link_nfa_nodes (void *extra, bin_tree_t *node)
1391 re_dfa_t *dfa = (re_dfa_t *) extra;
1392 Idx idx = node->node_idx;
1393 reg_errcode_t err = REG_NOERROR;
1395 switch (node->token.type)
1397 case CONCAT:
1398 break;
1400 case END_OF_RE:
1401 DEBUG_ASSERT (node->next == NULL);
1402 break;
1404 case OP_DUP_ASTERISK:
1405 case OP_ALT:
1407 Idx left, right;
1408 dfa->has_plural_match = 1;
1409 if (node->left != NULL)
1410 left = node->left->first->node_idx;
1411 else
1412 left = node->next->node_idx;
1413 if (node->right != NULL)
1414 right = node->right->first->node_idx;
1415 else
1416 right = node->next->node_idx;
1417 DEBUG_ASSERT (left > -1);
1418 DEBUG_ASSERT (right > -1);
1419 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1421 break;
1423 case ANCHOR:
1424 case OP_OPEN_SUBEXP:
1425 case OP_CLOSE_SUBEXP:
1426 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1427 break;
1429 case OP_BACK_REF:
1430 dfa->nexts[idx] = node->next->node_idx;
1431 if (node->token.type == OP_BACK_REF)
1432 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1433 break;
1435 default:
1436 DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1437 dfa->nexts[idx] = node->next->node_idx;
1438 break;
1441 return err;
1444 /* Duplicate the epsilon closure of the node ROOT_NODE.
1445 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1446 to their own constraint. */
1448 static reg_errcode_t
1449 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1450 Idx root_node, unsigned int init_constraint)
1452 Idx org_node, clone_node;
1453 bool ok;
1454 unsigned int constraint = init_constraint;
1455 for (org_node = top_org_node, clone_node = top_clone_node;;)
1457 Idx org_dest, clone_dest;
1458 if (dfa->nodes[org_node].type == OP_BACK_REF)
1460 /* If the back reference epsilon-transit, its destination must
1461 also have the constraint. Then duplicate the epsilon closure
1462 of the destination of the back reference, and store it in
1463 edests of the back reference. */
1464 org_dest = dfa->nexts[org_node];
1465 re_node_set_empty (dfa->edests + clone_node);
1466 clone_dest = duplicate_node (dfa, org_dest, constraint);
1467 if (__glibc_unlikely (clone_dest == -1))
1468 return REG_ESPACE;
1469 dfa->nexts[clone_node] = dfa->nexts[org_node];
1470 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1471 if (__glibc_unlikely (! ok))
1472 return REG_ESPACE;
1474 else if (dfa->edests[org_node].nelem == 0)
1476 /* In case of the node can't epsilon-transit, don't duplicate the
1477 destination and store the original destination as the
1478 destination of the node. */
1479 dfa->nexts[clone_node] = dfa->nexts[org_node];
1480 break;
1482 else if (dfa->edests[org_node].nelem == 1)
1484 /* In case of the node can epsilon-transit, and it has only one
1485 destination. */
1486 org_dest = dfa->edests[org_node].elems[0];
1487 re_node_set_empty (dfa->edests + clone_node);
1488 /* If the node is root_node itself, it means the epsilon closure
1489 has a loop. Then tie it to the destination of the root_node. */
1490 if (org_node == root_node && clone_node != org_node)
1492 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1493 if (__glibc_unlikely (! ok))
1494 return REG_ESPACE;
1495 break;
1497 /* In case the node has another constraint, append it. */
1498 constraint |= dfa->nodes[org_node].constraint;
1499 clone_dest = duplicate_node (dfa, org_dest, constraint);
1500 if (__glibc_unlikely (clone_dest == -1))
1501 return REG_ESPACE;
1502 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1503 if (__glibc_unlikely (! ok))
1504 return REG_ESPACE;
1506 else /* dfa->edests[org_node].nelem == 2 */
1508 /* In case of the node can epsilon-transit, and it has two
1509 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1510 org_dest = dfa->edests[org_node].elems[0];
1511 re_node_set_empty (dfa->edests + clone_node);
1512 /* Search for a duplicated node which satisfies the constraint. */
1513 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1514 if (clone_dest == -1)
1516 /* There is no such duplicated node, create a new one. */
1517 reg_errcode_t err;
1518 clone_dest = duplicate_node (dfa, org_dest, constraint);
1519 if (__glibc_unlikely (clone_dest == -1))
1520 return REG_ESPACE;
1521 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1522 if (__glibc_unlikely (! ok))
1523 return REG_ESPACE;
1524 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1525 root_node, constraint);
1526 if (__glibc_unlikely (err != REG_NOERROR))
1527 return err;
1529 else
1531 /* There is a duplicated node which satisfies the constraint,
1532 use it to avoid infinite loop. */
1533 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1534 if (__glibc_unlikely (! ok))
1535 return REG_ESPACE;
1538 org_dest = dfa->edests[org_node].elems[1];
1539 clone_dest = duplicate_node (dfa, org_dest, constraint);
1540 if (__glibc_unlikely (clone_dest == -1))
1541 return REG_ESPACE;
1542 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1543 if (__glibc_unlikely (! ok))
1544 return REG_ESPACE;
1546 org_node = org_dest;
1547 clone_node = clone_dest;
1549 return REG_NOERROR;
1552 /* Search for a node which is duplicated from the node ORG_NODE, and
1553 satisfies the constraint CONSTRAINT. */
1555 static Idx
1556 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1557 unsigned int constraint)
1559 Idx idx;
1560 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1562 if (org_node == dfa->org_indices[idx]
1563 && constraint == dfa->nodes[idx].constraint)
1564 return idx; /* Found. */
1566 return -1; /* Not found. */
1569 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1570 Return the index of the new node, or -1 if insufficient storage is
1571 available. */
1573 static Idx
1574 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1576 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1577 if (__glibc_likely (dup_idx != -1))
1579 dfa->nodes[dup_idx].constraint = constraint;
1580 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1581 dfa->nodes[dup_idx].duplicated = 1;
1583 /* Store the index of the original node. */
1584 dfa->org_indices[dup_idx] = org_idx;
1586 return dup_idx;
1589 static reg_errcode_t
1590 calc_inveclosure (re_dfa_t *dfa)
1592 Idx src, idx;
1593 bool ok;
1594 for (idx = 0; idx < dfa->nodes_len; ++idx)
1595 re_node_set_init_empty (dfa->inveclosures + idx);
1597 for (src = 0; src < dfa->nodes_len; ++src)
1599 Idx *elems = dfa->eclosures[src].elems;
1600 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1602 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1603 if (__glibc_unlikely (! ok))
1604 return REG_ESPACE;
1608 return REG_NOERROR;
1611 /* Calculate "eclosure" for all the node in DFA. */
1613 static reg_errcode_t
1614 calc_eclosure (re_dfa_t *dfa)
1616 Idx node_idx;
1617 bool incomplete;
1618 DEBUG_ASSERT (dfa->nodes_len > 0);
1619 incomplete = false;
1620 /* For each nodes, calculate epsilon closure. */
1621 for (node_idx = 0; ; ++node_idx)
1623 reg_errcode_t err;
1624 re_node_set eclosure_elem;
1625 if (node_idx == dfa->nodes_len)
1627 if (!incomplete)
1628 break;
1629 incomplete = false;
1630 node_idx = 0;
1633 DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
1635 /* If we have already calculated, skip it. */
1636 if (dfa->eclosures[node_idx].nelem != 0)
1637 continue;
1638 /* Calculate epsilon closure of 'node_idx'. */
1639 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1640 if (__glibc_unlikely (err != REG_NOERROR))
1641 return err;
1643 if (dfa->eclosures[node_idx].nelem == 0)
1645 incomplete = true;
1646 re_node_set_free (&eclosure_elem);
1649 return REG_NOERROR;
1652 /* Calculate epsilon closure of NODE. */
1654 static reg_errcode_t
1655 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1657 reg_errcode_t err;
1658 Idx i;
1659 re_node_set eclosure;
1660 bool incomplete = false;
1661 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1662 if (__glibc_unlikely (err != REG_NOERROR))
1663 return err;
1665 /* An epsilon closure includes itself. */
1666 eclosure.elems[eclosure.nelem++] = node;
1668 /* This indicates that we are calculating this node now.
1669 We reference this value to avoid infinite loop. */
1670 dfa->eclosures[node].nelem = -1;
1672 /* If the current node has constraints, duplicate all nodes
1673 since they must inherit the constraints. */
1674 if (dfa->nodes[node].constraint
1675 && dfa->edests[node].nelem
1676 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1678 err = duplicate_node_closure (dfa, node, node, node,
1679 dfa->nodes[node].constraint);
1680 if (__glibc_unlikely (err != REG_NOERROR))
1681 return err;
1684 /* Expand each epsilon destination nodes. */
1685 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1686 for (i = 0; i < dfa->edests[node].nelem; ++i)
1688 re_node_set eclosure_elem;
1689 Idx edest = dfa->edests[node].elems[i];
1690 /* If calculating the epsilon closure of 'edest' is in progress,
1691 return intermediate result. */
1692 if (dfa->eclosures[edest].nelem == -1)
1694 incomplete = true;
1695 continue;
1697 /* If we haven't calculated the epsilon closure of 'edest' yet,
1698 calculate now. Otherwise use calculated epsilon closure. */
1699 if (dfa->eclosures[edest].nelem == 0)
1701 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1702 if (__glibc_unlikely (err != REG_NOERROR))
1703 return err;
1705 else
1706 eclosure_elem = dfa->eclosures[edest];
1707 /* Merge the epsilon closure of 'edest'. */
1708 err = re_node_set_merge (&eclosure, &eclosure_elem);
1709 if (__glibc_unlikely (err != REG_NOERROR))
1710 return err;
1711 /* If the epsilon closure of 'edest' is incomplete,
1712 the epsilon closure of this node is also incomplete. */
1713 if (dfa->eclosures[edest].nelem == 0)
1715 incomplete = true;
1716 re_node_set_free (&eclosure_elem);
1720 if (incomplete && !root)
1721 dfa->eclosures[node].nelem = 0;
1722 else
1723 dfa->eclosures[node] = eclosure;
1724 *new_set = eclosure;
1725 return REG_NOERROR;
1728 /* Functions for token which are used in the parser. */
1730 /* Fetch a token from INPUT.
1731 We must not use this function inside bracket expressions. */
1733 static void
1734 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1736 re_string_skip_bytes (input, peek_token (result, input, syntax));
1739 /* Peek a token from INPUT, and return the length of the token.
1740 We must not use this function inside bracket expressions. */
1742 static int
1743 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1745 unsigned char c;
1747 if (re_string_eoi (input))
1749 token->type = END_OF_RE;
1750 return 0;
1753 c = re_string_peek_byte (input, 0);
1754 token->opr.c = c;
1756 token->word_char = 0;
1757 token->mb_partial = 0;
1758 if (input->mb_cur_max > 1
1759 && !re_string_first_byte (input, re_string_cur_idx (input)))
1761 token->type = CHARACTER;
1762 token->mb_partial = 1;
1763 return 1;
1765 if (c == '\\')
1767 unsigned char c2;
1768 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1770 token->type = BACK_SLASH;
1771 return 1;
1774 c2 = re_string_peek_byte_case (input, 1);
1775 token->opr.c = c2;
1776 token->type = CHARACTER;
1777 if (input->mb_cur_max > 1)
1779 wint_t wc = re_string_wchar_at (input,
1780 re_string_cur_idx (input) + 1);
1781 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1783 else
1784 token->word_char = IS_WORD_CHAR (c2) != 0;
1786 switch (c2)
1788 case '|':
1789 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1790 token->type = OP_ALT;
1791 break;
1792 case '1': case '2': case '3': case '4': case '5':
1793 case '6': case '7': case '8': case '9':
1794 if (!(syntax & RE_NO_BK_REFS))
1796 token->type = OP_BACK_REF;
1797 token->opr.idx = c2 - '1';
1799 break;
1800 case '<':
1801 if (!(syntax & RE_NO_GNU_OPS))
1803 token->type = ANCHOR;
1804 token->opr.ctx_type = WORD_FIRST;
1806 break;
1807 case '>':
1808 if (!(syntax & RE_NO_GNU_OPS))
1810 token->type = ANCHOR;
1811 token->opr.ctx_type = WORD_LAST;
1813 break;
1814 case 'b':
1815 if (!(syntax & RE_NO_GNU_OPS))
1817 token->type = ANCHOR;
1818 token->opr.ctx_type = WORD_DELIM;
1820 break;
1821 case 'B':
1822 if (!(syntax & RE_NO_GNU_OPS))
1824 token->type = ANCHOR;
1825 token->opr.ctx_type = NOT_WORD_DELIM;
1827 break;
1828 case 'w':
1829 if (!(syntax & RE_NO_GNU_OPS))
1830 token->type = OP_WORD;
1831 break;
1832 case 'W':
1833 if (!(syntax & RE_NO_GNU_OPS))
1834 token->type = OP_NOTWORD;
1835 break;
1836 case 's':
1837 if (!(syntax & RE_NO_GNU_OPS))
1838 token->type = OP_SPACE;
1839 break;
1840 case 'S':
1841 if (!(syntax & RE_NO_GNU_OPS))
1842 token->type = OP_NOTSPACE;
1843 break;
1844 case '`':
1845 if (!(syntax & RE_NO_GNU_OPS))
1847 token->type = ANCHOR;
1848 token->opr.ctx_type = BUF_FIRST;
1850 break;
1851 case '\'':
1852 if (!(syntax & RE_NO_GNU_OPS))
1854 token->type = ANCHOR;
1855 token->opr.ctx_type = BUF_LAST;
1857 break;
1858 case '(':
1859 if (!(syntax & RE_NO_BK_PARENS))
1860 token->type = OP_OPEN_SUBEXP;
1861 break;
1862 case ')':
1863 if (!(syntax & RE_NO_BK_PARENS))
1864 token->type = OP_CLOSE_SUBEXP;
1865 break;
1866 case '+':
1867 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1868 token->type = OP_DUP_PLUS;
1869 break;
1870 case '?':
1871 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1872 token->type = OP_DUP_QUESTION;
1873 break;
1874 case '{':
1875 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1876 token->type = OP_OPEN_DUP_NUM;
1877 break;
1878 case '}':
1879 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1880 token->type = OP_CLOSE_DUP_NUM;
1881 break;
1882 default:
1883 break;
1885 return 2;
1888 token->type = CHARACTER;
1889 if (input->mb_cur_max > 1)
1891 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1892 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1894 else
1895 token->word_char = IS_WORD_CHAR (token->opr.c);
1897 switch (c)
1899 case '\n':
1900 if (syntax & RE_NEWLINE_ALT)
1901 token->type = OP_ALT;
1902 break;
1903 case '|':
1904 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1905 token->type = OP_ALT;
1906 break;
1907 case '*':
1908 token->type = OP_DUP_ASTERISK;
1909 break;
1910 case '+':
1911 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1912 token->type = OP_DUP_PLUS;
1913 break;
1914 case '?':
1915 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1916 token->type = OP_DUP_QUESTION;
1917 break;
1918 case '{':
1919 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1920 token->type = OP_OPEN_DUP_NUM;
1921 break;
1922 case '}':
1923 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1924 token->type = OP_CLOSE_DUP_NUM;
1925 break;
1926 case '(':
1927 if (syntax & RE_NO_BK_PARENS)
1928 token->type = OP_OPEN_SUBEXP;
1929 break;
1930 case ')':
1931 if (syntax & RE_NO_BK_PARENS)
1932 token->type = OP_CLOSE_SUBEXP;
1933 break;
1934 case '[':
1935 token->type = OP_OPEN_BRACKET;
1936 break;
1937 case '.':
1938 token->type = OP_PERIOD;
1939 break;
1940 case '^':
1941 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1942 && re_string_cur_idx (input) != 0)
1944 char prev = re_string_peek_byte (input, -1);
1945 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1946 break;
1948 token->type = ANCHOR;
1949 token->opr.ctx_type = LINE_FIRST;
1950 break;
1951 case '$':
1952 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
1953 && re_string_cur_idx (input) + 1 != re_string_length (input))
1955 re_token_t next;
1956 re_string_skip_bytes (input, 1);
1957 peek_token (&next, input, syntax);
1958 re_string_skip_bytes (input, -1);
1959 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1960 break;
1962 token->type = ANCHOR;
1963 token->opr.ctx_type = LINE_LAST;
1964 break;
1965 default:
1966 break;
1968 return 1;
1971 /* Peek a token from INPUT, and return the length of the token.
1972 We must not use this function out of bracket expressions. */
1974 static int
1975 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1977 unsigned char c;
1978 if (re_string_eoi (input))
1980 token->type = END_OF_RE;
1981 return 0;
1983 c = re_string_peek_byte (input, 0);
1984 token->opr.c = c;
1986 if (input->mb_cur_max > 1
1987 && !re_string_first_byte (input, re_string_cur_idx (input)))
1989 token->type = CHARACTER;
1990 return 1;
1993 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1994 && re_string_cur_idx (input) + 1 < re_string_length (input))
1996 /* In this case, '\' escape a character. */
1997 unsigned char c2;
1998 re_string_skip_bytes (input, 1);
1999 c2 = re_string_peek_byte (input, 0);
2000 token->opr.c = c2;
2001 token->type = CHARACTER;
2002 return 1;
2004 if (c == '[') /* '[' is a special char in a bracket exps. */
2006 unsigned char c2;
2007 int token_len;
2008 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2009 c2 = re_string_peek_byte (input, 1);
2010 else
2011 c2 = 0;
2012 token->opr.c = c2;
2013 token_len = 2;
2014 switch (c2)
2016 case '.':
2017 token->type = OP_OPEN_COLL_ELEM;
2018 break;
2020 case '=':
2021 token->type = OP_OPEN_EQUIV_CLASS;
2022 break;
2024 case ':':
2025 if (syntax & RE_CHAR_CLASSES)
2027 token->type = OP_OPEN_CHAR_CLASS;
2028 break;
2030 FALLTHROUGH;
2031 default:
2032 token->type = CHARACTER;
2033 token->opr.c = c;
2034 token_len = 1;
2035 break;
2037 return token_len;
2039 switch (c)
2041 case ']':
2042 token->type = OP_CLOSE_BRACKET;
2043 break;
2044 case '^':
2045 token->type = OP_NON_MATCH_LIST;
2046 break;
2047 case '-':
2048 /* In V7 Unix grep and Unix awk and mawk, [...---...]
2049 (3 adjacent minus signs) stands for a single minus sign.
2050 Support that without breaking anything else. */
2051 if (! (re_string_cur_idx (input) + 2 < re_string_length (input)
2052 && re_string_peek_byte (input, 1) == '-'
2053 && re_string_peek_byte (input, 2) == '-'))
2055 token->type = OP_CHARSET_RANGE;
2056 break;
2058 re_string_skip_bytes (input, 2);
2059 FALLTHROUGH;
2060 default:
2061 token->type = CHARACTER;
2063 return 1;
2066 /* Functions for parser. */
2068 /* Entry point of the parser.
2069 Parse the regular expression REGEXP and return the structure tree.
2070 If an error occurs, ERR is set by error code, and return NULL.
2071 This function build the following tree, from regular expression <reg_exp>:
2075 <reg_exp> EOR
2077 CAT means concatenation.
2078 EOR means end of regular expression. */
2080 static bin_tree_t *
2081 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2082 reg_errcode_t *err)
2084 re_dfa_t *dfa = preg->buffer;
2085 bin_tree_t *tree, *eor, *root;
2086 re_token_t current_token;
2087 dfa->syntax = syntax;
2088 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2089 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2090 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2091 return NULL;
2092 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2093 if (tree != NULL)
2094 root = create_tree (dfa, tree, eor, CONCAT);
2095 else
2096 root = eor;
2097 if (__glibc_unlikely (eor == NULL || root == NULL))
2099 *err = REG_ESPACE;
2100 return NULL;
2102 return root;
2105 /* This function build the following tree, from regular expression
2106 <branch1>|<branch2>:
2110 <branch1> <branch2>
2112 ALT means alternative, which represents the operator '|'. */
2114 static bin_tree_t *
2115 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2116 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2118 re_dfa_t *dfa = preg->buffer;
2119 bin_tree_t *tree, *branch = NULL;
2120 bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2121 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2122 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2123 return NULL;
2125 while (token->type == OP_ALT)
2127 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2128 if (token->type != OP_ALT && token->type != END_OF_RE
2129 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2131 bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2132 dfa->completed_bkref_map = initial_bkref_map;
2133 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2134 if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2136 if (tree != NULL)
2137 postorder (tree, free_tree, NULL);
2138 return NULL;
2140 dfa->completed_bkref_map |= accumulated_bkref_map;
2142 else
2143 branch = NULL;
2144 tree = create_tree (dfa, tree, branch, OP_ALT);
2145 if (__glibc_unlikely (tree == NULL))
2147 *err = REG_ESPACE;
2148 return NULL;
2151 return tree;
2154 /* This function build the following tree, from regular expression
2155 <exp1><exp2>:
2159 <exp1> <exp2>
2161 CAT means concatenation. */
2163 static bin_tree_t *
2164 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2165 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2167 bin_tree_t *tree, *expr;
2168 re_dfa_t *dfa = preg->buffer;
2169 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2170 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2171 return NULL;
2173 while (token->type != OP_ALT && token->type != END_OF_RE
2174 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2176 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2177 if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2179 if (tree != NULL)
2180 postorder (tree, free_tree, NULL);
2181 return NULL;
2183 if (tree != NULL && expr != NULL)
2185 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2186 if (newtree == NULL)
2188 postorder (expr, free_tree, NULL);
2189 postorder (tree, free_tree, NULL);
2190 *err = REG_ESPACE;
2191 return NULL;
2193 tree = newtree;
2195 else if (tree == NULL)
2196 tree = expr;
2197 /* Otherwise expr == NULL, we don't need to create new tree. */
2199 return tree;
2202 /* This function build the following tree, from regular expression a*:
2208 static bin_tree_t *
2209 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2210 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2212 re_dfa_t *dfa = preg->buffer;
2213 bin_tree_t *tree;
2214 switch (token->type)
2216 case CHARACTER:
2217 tree = create_token_tree (dfa, NULL, NULL, token);
2218 if (__glibc_unlikely (tree == NULL))
2220 *err = REG_ESPACE;
2221 return NULL;
2223 if (dfa->mb_cur_max > 1)
2225 while (!re_string_eoi (regexp)
2226 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2228 bin_tree_t *mbc_remain;
2229 fetch_token (token, regexp, syntax);
2230 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2231 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2232 if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
2234 *err = REG_ESPACE;
2235 return NULL;
2239 break;
2241 case OP_OPEN_SUBEXP:
2242 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2243 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2244 return NULL;
2245 break;
2247 case OP_OPEN_BRACKET:
2248 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2249 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2250 return NULL;
2251 break;
2253 case OP_BACK_REF:
2254 if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2256 *err = REG_ESUBREG;
2257 return NULL;
2259 dfa->used_bkref_map |= 1 << token->opr.idx;
2260 tree = create_token_tree (dfa, NULL, NULL, token);
2261 if (__glibc_unlikely (tree == NULL))
2263 *err = REG_ESPACE;
2264 return NULL;
2266 ++dfa->nbackref;
2267 dfa->has_mb_node = 1;
2268 break;
2270 case OP_OPEN_DUP_NUM:
2271 if (syntax & RE_CONTEXT_INVALID_DUP)
2273 *err = REG_BADRPT;
2274 return NULL;
2276 FALLTHROUGH;
2277 case OP_DUP_ASTERISK:
2278 case OP_DUP_PLUS:
2279 case OP_DUP_QUESTION:
2280 if (syntax & RE_CONTEXT_INVALID_OPS)
2282 *err = REG_BADRPT;
2283 return NULL;
2285 else if (syntax & RE_CONTEXT_INDEP_OPS)
2287 fetch_token (token, regexp, syntax);
2288 return parse_expression (regexp, preg, token, syntax, nest, err);
2290 FALLTHROUGH;
2291 case OP_CLOSE_SUBEXP:
2292 if ((token->type == OP_CLOSE_SUBEXP)
2293 && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2295 *err = REG_ERPAREN;
2296 return NULL;
2298 FALLTHROUGH;
2299 case OP_CLOSE_DUP_NUM:
2300 /* We treat it as a normal character. */
2302 /* Then we can these characters as normal characters. */
2303 token->type = CHARACTER;
2304 /* mb_partial and word_char bits should be initialized already
2305 by peek_token. */
2306 tree = create_token_tree (dfa, NULL, NULL, token);
2307 if (__glibc_unlikely (tree == NULL))
2309 *err = REG_ESPACE;
2310 return NULL;
2312 break;
2314 case ANCHOR:
2315 if ((token->opr.ctx_type
2316 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2317 && dfa->word_ops_used == 0)
2318 init_word_char (dfa);
2319 if (token->opr.ctx_type == WORD_DELIM
2320 || token->opr.ctx_type == NOT_WORD_DELIM)
2322 bin_tree_t *tree_first, *tree_last;
2323 if (token->opr.ctx_type == WORD_DELIM)
2325 token->opr.ctx_type = WORD_FIRST;
2326 tree_first = create_token_tree (dfa, NULL, NULL, token);
2327 token->opr.ctx_type = WORD_LAST;
2329 else
2331 token->opr.ctx_type = INSIDE_WORD;
2332 tree_first = create_token_tree (dfa, NULL, NULL, token);
2333 token->opr.ctx_type = INSIDE_NOTWORD;
2335 tree_last = create_token_tree (dfa, NULL, NULL, token);
2336 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2337 if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2338 || tree == NULL))
2340 *err = REG_ESPACE;
2341 return NULL;
2344 else
2346 tree = create_token_tree (dfa, NULL, NULL, token);
2347 if (__glibc_unlikely (tree == NULL))
2349 *err = REG_ESPACE;
2350 return NULL;
2353 /* We must return here, since ANCHORs can't be followed
2354 by repetition operators.
2355 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2356 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2357 fetch_token (token, regexp, syntax);
2358 return tree;
2360 case OP_PERIOD:
2361 tree = create_token_tree (dfa, NULL, NULL, token);
2362 if (__glibc_unlikely (tree == NULL))
2364 *err = REG_ESPACE;
2365 return NULL;
2367 if (dfa->mb_cur_max > 1)
2368 dfa->has_mb_node = 1;
2369 break;
2371 case OP_WORD:
2372 case OP_NOTWORD:
2373 tree = build_charclass_op (dfa, regexp->trans,
2374 "alnum",
2375 "_",
2376 token->type == OP_NOTWORD, err);
2377 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2378 return NULL;
2379 break;
2381 case OP_SPACE:
2382 case OP_NOTSPACE:
2383 tree = build_charclass_op (dfa, regexp->trans,
2384 "space",
2386 token->type == OP_NOTSPACE, err);
2387 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2388 return NULL;
2389 break;
2391 case OP_ALT:
2392 case END_OF_RE:
2393 return NULL;
2395 case BACK_SLASH:
2396 *err = REG_EESCAPE;
2397 return NULL;
2399 default:
2400 /* Must not happen? */
2401 DEBUG_ASSERT (false);
2402 return NULL;
2404 fetch_token (token, regexp, syntax);
2406 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2407 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2409 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2410 syntax, err);
2411 if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2413 if (tree != NULL)
2414 postorder (tree, free_tree, NULL);
2415 return NULL;
2417 tree = dup_tree;
2418 /* In BRE consecutive duplications are not allowed. */
2419 if ((syntax & RE_CONTEXT_INVALID_DUP)
2420 && (token->type == OP_DUP_ASTERISK
2421 || token->type == OP_OPEN_DUP_NUM))
2423 if (tree != NULL)
2424 postorder (tree, free_tree, NULL);
2425 *err = REG_BADRPT;
2426 return NULL;
2430 return tree;
2433 /* This function build the following tree, from regular expression
2434 (<reg_exp>):
2435 SUBEXP
2437 <reg_exp>
2440 static bin_tree_t *
2441 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2442 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2444 re_dfa_t *dfa = preg->buffer;
2445 bin_tree_t *tree;
2446 size_t cur_nsub;
2447 cur_nsub = preg->re_nsub++;
2449 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2451 /* The subexpression may be a null string. */
2452 if (token->type == OP_CLOSE_SUBEXP)
2453 tree = NULL;
2454 else
2456 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2457 if (__glibc_unlikely (*err == REG_NOERROR
2458 && token->type != OP_CLOSE_SUBEXP))
2460 if (tree != NULL)
2461 postorder (tree, free_tree, NULL);
2462 *err = REG_EPAREN;
2464 if (__glibc_unlikely (*err != REG_NOERROR))
2465 return NULL;
2468 if (cur_nsub <= '9' - '1')
2469 dfa->completed_bkref_map |= 1 << cur_nsub;
2471 tree = create_tree (dfa, tree, NULL, SUBEXP);
2472 if (__glibc_unlikely (tree == NULL))
2474 *err = REG_ESPACE;
2475 return NULL;
2477 tree->token.opr.idx = cur_nsub;
2478 return tree;
2481 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2483 static bin_tree_t *
2484 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2485 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2487 bin_tree_t *tree = NULL, *old_tree = NULL;
2488 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2489 re_token_t start_token = *token;
2491 if (token->type == OP_OPEN_DUP_NUM)
2493 end = 0;
2494 start = fetch_number (regexp, token, syntax);
2495 if (start == -1)
2497 if (token->type == CHARACTER && token->opr.c == ',')
2498 start = 0; /* We treat "{,m}" as "{0,m}". */
2499 else
2501 *err = REG_BADBR; /* <re>{} is invalid. */
2502 return NULL;
2505 if (__glibc_likely (start != -2))
2507 /* We treat "{n}" as "{n,n}". */
2508 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2509 : ((token->type == CHARACTER && token->opr.c == ',')
2510 ? fetch_number (regexp, token, syntax) : -2));
2512 if (__glibc_unlikely (start == -2 || end == -2))
2514 /* Invalid sequence. */
2515 if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2517 if (token->type == END_OF_RE)
2518 *err = REG_EBRACE;
2519 else
2520 *err = REG_BADBR;
2522 return NULL;
2525 /* If the syntax bit is set, rollback. */
2526 re_string_set_index (regexp, start_idx);
2527 *token = start_token;
2528 token->type = CHARACTER;
2529 /* mb_partial and word_char bits should be already initialized by
2530 peek_token. */
2531 return elem;
2534 if (__glibc_unlikely ((end != -1 && start > end)
2535 || token->type != OP_CLOSE_DUP_NUM))
2537 /* First number greater than second. */
2538 *err = REG_BADBR;
2539 return NULL;
2542 if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2544 *err = REG_ESIZE;
2545 return NULL;
2548 else
2550 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2551 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2554 fetch_token (token, regexp, syntax);
2556 if (__glibc_unlikely (elem == NULL))
2557 return NULL;
2558 if (__glibc_unlikely (start == 0 && end == 0))
2560 postorder (elem, free_tree, NULL);
2561 return NULL;
2564 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2565 if (__glibc_unlikely (start > 0))
2567 tree = elem;
2568 for (i = 2; i <= start; ++i)
2570 elem = duplicate_tree (elem, dfa);
2571 tree = create_tree (dfa, tree, elem, CONCAT);
2572 if (__glibc_unlikely (elem == NULL || tree == NULL))
2573 goto parse_dup_op_espace;
2576 if (start == end)
2577 return tree;
2579 /* Duplicate ELEM before it is marked optional. */
2580 elem = duplicate_tree (elem, dfa);
2581 if (__glibc_unlikely (elem == NULL))
2582 goto parse_dup_op_espace;
2583 old_tree = tree;
2585 else
2586 old_tree = NULL;
2588 if (elem->token.type == SUBEXP)
2590 uintptr_t subidx = elem->token.opr.idx;
2591 postorder (elem, mark_opt_subexp, (void *) subidx);
2594 tree = create_tree (dfa, elem, NULL,
2595 (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2596 if (__glibc_unlikely (tree == NULL))
2597 goto parse_dup_op_espace;
2599 /* This loop is actually executed only when end != -1,
2600 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2601 already created the start+1-th copy. */
2602 if (TYPE_SIGNED (Idx) || end != -1)
2603 for (i = start + 2; i <= end; ++i)
2605 elem = duplicate_tree (elem, dfa);
2606 tree = create_tree (dfa, tree, elem, CONCAT);
2607 if (__glibc_unlikely (elem == NULL || tree == NULL))
2608 goto parse_dup_op_espace;
2610 tree = create_tree (dfa, tree, NULL, OP_ALT);
2611 if (__glibc_unlikely (tree == NULL))
2612 goto parse_dup_op_espace;
2615 if (old_tree)
2616 tree = create_tree (dfa, old_tree, tree, CONCAT);
2618 return tree;
2620 parse_dup_op_espace:
2621 *err = REG_ESPACE;
2622 return NULL;
2625 /* Size of the names for collating symbol/equivalence_class/character_class.
2626 I'm not sure, but maybe enough. */
2627 #define BRACKET_NAME_BUF_SIZE 32
2629 #ifndef _LIBC
2631 /* Convert the byte B to the corresponding wide character. In a
2632 unibyte locale, treat B as itself. In a multibyte locale, return
2633 WEOF if B is an encoding error. */
2634 static wint_t
2635 parse_byte (unsigned char b, re_dfa_t const *dfa)
2637 return dfa->mb_cur_max > 1 ? __btowc (b) : b;
2640 /* Local function for parse_bracket_exp used in _LIBC environment.
2641 Build the range expression which starts from START_ELEM, and ends
2642 at END_ELEM. The result are written to MBCSET and SBCSET.
2643 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2644 mbcset->range_ends, is a pointer argument since we may
2645 update it. */
2647 static reg_errcode_t
2648 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2649 bracket_elem_t *start_elem, bracket_elem_t *end_elem,
2650 re_dfa_t *dfa, reg_syntax_t syntax, uint_fast32_t nrules,
2651 const unsigned char *collseqmb, const char *collseqwc,
2652 int_fast32_t table_size, const void *symb_table,
2653 const unsigned char *extra)
2655 /* Equivalence Classes and Character Classes can't be a range start/end. */
2656 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2657 || start_elem->type == CHAR_CLASS
2658 || end_elem->type == EQUIV_CLASS
2659 || end_elem->type == CHAR_CLASS))
2660 return REG_ERANGE;
2662 /* We can handle no multi character collating elements without libc
2663 support. */
2664 if (__glibc_unlikely ((start_elem->type == COLL_SYM
2665 && strlen ((char *) start_elem->opr.name) > 1)
2666 || (end_elem->type == COLL_SYM
2667 && strlen ((char *) end_elem->opr.name) > 1)))
2668 return REG_ECOLLATE;
2670 unsigned int
2671 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2672 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2673 : 0)),
2674 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2675 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2676 : 0));
2677 wint_t
2678 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2679 ? parse_byte (start_ch, dfa) : start_elem->opr.wch),
2680 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2681 ? parse_byte (end_ch, dfa) : end_elem->opr.wch);
2683 if (start_wc == WEOF || end_wc == WEOF)
2684 return REG_ECOLLATE;
2685 else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2686 && start_wc > end_wc))
2687 return REG_ERANGE;
2689 /* Got valid collation sequence values, add them as a new entry.
2690 However, for !_LIBC we have no collation elements: if the
2691 character set is single byte, the single byte character set
2692 that we build below suffices. parse_bracket_exp passes
2693 no MBCSET if dfa->mb_cur_max == 1. */
2694 if (dfa->mb_cur_max > 1)
2696 /* Check the space of the arrays. */
2697 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2699 /* There is not enough space, need realloc. */
2700 wchar_t *new_array_start, *new_array_end;
2701 Idx new_nranges;
2703 /* +1 in case of mbcset->nranges is 0. */
2704 new_nranges = 2 * mbcset->nranges + 1;
2705 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2706 are NULL if *range_alloc == 0. */
2707 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2708 new_nranges);
2709 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2710 new_nranges);
2712 if (__glibc_unlikely (new_array_start == NULL
2713 || new_array_end == NULL))
2715 re_free (new_array_start);
2716 re_free (new_array_end);
2717 return REG_ESPACE;
2720 mbcset->range_starts = new_array_start;
2721 mbcset->range_ends = new_array_end;
2722 *range_alloc = new_nranges;
2725 mbcset->range_starts[mbcset->nranges] = start_wc;
2726 mbcset->range_ends[mbcset->nranges++] = end_wc;
2729 /* Build the table for single byte characters. */
2730 for (wchar_t wc = 0; wc < SBC_MAX; ++wc)
2732 if (start_wc <= wc && wc <= end_wc)
2733 bitset_set (sbcset, wc);
2736 return REG_NOERROR;
2738 #endif /* not _LIBC */
2740 #ifndef _LIBC
2741 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC.
2742 Build the collating element which is represented by NAME.
2743 The result are written to MBCSET and SBCSET.
2744 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2745 pointer argument since we may update it. */
2747 static reg_errcode_t
2748 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2749 Idx *coll_sym_alloc, const unsigned char *name,
2750 uint_fast32_t nrules, int_fast32_t table_size,
2751 const void *symb_table, const unsigned char *extra)
2753 size_t name_len = strlen ((const char *) name);
2754 if (__glibc_unlikely (name_len != 1))
2755 return REG_ECOLLATE;
2756 else
2758 bitset_set (sbcset, name[0]);
2759 return REG_NOERROR;
2762 #endif /* not _LIBC */
2764 #ifdef _LIBC
2765 /* Local function for parse_bracket_exp used in _LIBC environment.
2766 Seek the collating symbol entry corresponding to NAME.
2767 Return the index of the symbol in the SYMB_TABLE,
2768 or -1 if not found. */
2770 static __always_inline int32_t
2771 seek_collating_symbol_entry (const unsigned char *name, size_t name_len,
2772 const int32_t *symb_table,
2773 int_fast32_t table_size,
2774 const unsigned char *extra)
2776 int_fast32_t elem;
2778 for (elem = 0; elem < table_size; elem++)
2779 if (symb_table[2 * elem] != 0)
2781 int32_t idx = symb_table[2 * elem + 1];
2782 /* Skip the name of collating element name. */
2783 idx += 1 + extra[idx];
2784 if (/* Compare the length of the name. */
2785 name_len == extra[idx]
2786 /* Compare the name. */
2787 && memcmp (name, &extra[idx + 1], name_len) == 0)
2788 /* Yep, this is the entry. */
2789 return elem;
2791 return -1;
2794 /* Local function for parse_bracket_exp used in _LIBC environment.
2795 Look up the collation sequence value of BR_ELEM.
2796 Return the value if succeeded, UINT_MAX otherwise. */
2798 static __always_inline unsigned int
2799 lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules,
2800 const unsigned char *collseqmb,
2801 const char *collseqwc,
2802 int_fast32_t table_size,
2803 const int32_t *symb_table,
2804 const unsigned char *extra)
2806 if (br_elem->type == SB_CHAR)
2808 /* if (MB_CUR_MAX == 1) */
2809 if (nrules == 0)
2810 return collseqmb[br_elem->opr.ch];
2811 else
2813 wint_t wc = __btowc (br_elem->opr.ch);
2814 return __collseq_table_lookup (collseqwc, wc);
2817 else if (br_elem->type == MB_CHAR)
2819 if (nrules != 0)
2820 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2822 else if (br_elem->type == COLL_SYM)
2824 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2825 if (nrules != 0)
2827 int32_t elem, idx;
2828 elem = seek_collating_symbol_entry (br_elem->opr.name,
2829 sym_name_len,
2830 symb_table, table_size,
2831 extra);
2832 if (elem != -1)
2834 /* We found the entry. */
2835 idx = symb_table[2 * elem + 1];
2836 /* Skip the name of collating element name. */
2837 idx += 1 + extra[idx];
2838 /* Skip the byte sequence of the collating element. */
2839 idx += 1 + extra[idx];
2840 /* Adjust for the alignment. */
2841 idx = (idx + 3) & ~3;
2842 /* Skip the multibyte collation sequence value. */
2843 idx += sizeof (unsigned int);
2844 /* Skip the wide char sequence of the collating element. */
2845 idx += sizeof (unsigned int) *
2846 (1 + *(unsigned int *) (extra + idx));
2847 /* Return the collation sequence value. */
2848 return *(unsigned int *) (extra + idx);
2850 else if (sym_name_len == 1)
2852 /* No valid character. Match it as a single byte
2853 character. */
2854 return collseqmb[br_elem->opr.name[0]];
2857 else if (sym_name_len == 1)
2858 return collseqmb[br_elem->opr.name[0]];
2860 return UINT_MAX;
2863 /* Local function for parse_bracket_exp used in _LIBC environment.
2864 Build the range expression which starts from START_ELEM, and ends
2865 at END_ELEM. The result are written to MBCSET and SBCSET.
2866 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2867 mbcset->range_ends, is a pointer argument since we may
2868 update it. */
2870 static __always_inline reg_errcode_t
2871 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2872 bracket_elem_t *start_elem, bracket_elem_t *end_elem,
2873 re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules,
2874 const unsigned char *collseqmb, const char *collseqwc,
2875 int_fast32_t table_size, const int32_t *symb_table,
2876 const unsigned char *extra)
2878 unsigned int ch;
2879 uint32_t start_collseq;
2880 uint32_t end_collseq;
2882 /* Equivalence Classes and Character Classes can't be a range
2883 start/end. */
2884 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2885 || start_elem->type == CHAR_CLASS
2886 || end_elem->type == EQUIV_CLASS
2887 || end_elem->type == CHAR_CLASS))
2888 return REG_ERANGE;
2890 /* FIXME: Implement rational ranges here, too. */
2891 start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc,
2892 table_size, symb_table, extra);
2893 end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc,
2894 table_size, symb_table, extra);
2895 /* Check start/end collation sequence values. */
2896 if (__glibc_unlikely (start_collseq == UINT_MAX
2897 || end_collseq == UINT_MAX))
2898 return REG_ECOLLATE;
2899 if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2900 && start_collseq > end_collseq))
2901 return REG_ERANGE;
2903 /* Got valid collation sequence values, add them as a new entry.
2904 However, if we have no collation elements, and the character set
2905 is single byte, the single byte character set that we
2906 build below suffices. */
2907 if (nrules > 0 || dfa->mb_cur_max > 1)
2909 /* Check the space of the arrays. */
2910 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2912 /* There is not enough space, need realloc. */
2913 uint32_t *new_array_start;
2914 uint32_t *new_array_end;
2915 int new_nranges;
2917 /* +1 in case of mbcset->nranges is 0. */
2918 new_nranges = 2 * mbcset->nranges + 1;
2919 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2920 new_nranges);
2921 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2922 new_nranges);
2924 if (__glibc_unlikely (new_array_start == NULL
2925 || new_array_end == NULL))
2926 return REG_ESPACE;
2928 mbcset->range_starts = new_array_start;
2929 mbcset->range_ends = new_array_end;
2930 *range_alloc = new_nranges;
2933 mbcset->range_starts[mbcset->nranges] = start_collseq;
2934 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2937 /* Build the table for single byte characters. */
2938 for (ch = 0; ch < SBC_MAX; ch++)
2940 uint32_t ch_collseq;
2941 /* if (MB_CUR_MAX == 1) */
2942 if (nrules == 0)
2943 ch_collseq = collseqmb[ch];
2944 else
2945 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2946 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2947 bitset_set (sbcset, ch);
2949 return REG_NOERROR;
2952 /* Local function for parse_bracket_exp used in _LIBC environment.
2953 Build the collating element which is represented by NAME.
2954 The result are written to MBCSET and SBCSET.
2955 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2956 pointer argument since we may update it. */
2958 static __always_inline reg_errcode_t
2959 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2960 Idx *coll_sym_alloc, const unsigned char *name,
2961 uint_fast32_t nrules, int_fast32_t table_size,
2962 const int32_t *symb_table, const unsigned char *extra)
2964 int32_t elem, idx;
2965 size_t name_len = strlen ((const char *) name);
2966 if (nrules != 0)
2968 elem = seek_collating_symbol_entry (name, name_len, symb_table,
2969 table_size, extra);
2970 if (elem != -1)
2972 /* We found the entry. */
2973 idx = symb_table[2 * elem + 1];
2974 /* Skip the name of collating element name. */
2975 idx += 1 + extra[idx];
2977 else if (name_len == 1)
2979 /* No valid character, treat it as a normal
2980 character. */
2981 bitset_set (sbcset, name[0]);
2982 return REG_NOERROR;
2984 else
2985 return REG_ECOLLATE;
2987 /* Got valid collation sequence, add it as a new entry. */
2988 /* Check the space of the arrays. */
2989 if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
2991 /* Not enough, realloc it. */
2992 /* +1 in case of mbcset->ncoll_syms is 0. */
2993 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2994 /* Use realloc since mbcset->coll_syms is NULL
2995 if *alloc == 0. */
2996 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2997 new_coll_sym_alloc);
2998 if (__glibc_unlikely (new_coll_syms == NULL))
2999 return REG_ESPACE;
3000 mbcset->coll_syms = new_coll_syms;
3001 *coll_sym_alloc = new_coll_sym_alloc;
3003 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3004 return REG_NOERROR;
3006 else
3008 if (__glibc_unlikely (name_len != 1))
3009 return REG_ECOLLATE;
3010 else
3012 bitset_set (sbcset, name[0]);
3013 return REG_NOERROR;
3017 #endif /* _LIBC */
3019 /* This function parse bracket expression like "[abc]", "[a-c]",
3020 "[[.a-a.]]" etc. */
3022 static bin_tree_t *
3023 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
3024 reg_syntax_t syntax, reg_errcode_t *err)
3026 const unsigned char *collseqmb = NULL;
3027 const char *collseqwc = NULL;
3028 uint_fast32_t nrules = 0;
3029 int_fast32_t table_size = 0;
3030 const void *symb_table = NULL;
3031 const unsigned char *extra = NULL;
3033 re_token_t br_token;
3034 re_bitset_ptr_t sbcset;
3035 re_charset_t *mbcset;
3036 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3037 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3038 bool non_match = false;
3039 bin_tree_t *work_tree;
3040 int token_len;
3041 bool first_round = true;
3042 #ifdef _LIBC
3043 collseqmb = (const unsigned char *)
3044 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3045 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3046 if (nrules)
3049 if (MB_CUR_MAX > 1)
3051 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3052 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3053 symb_table = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_TABLEMB);
3054 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3055 _NL_COLLATE_SYMB_EXTRAMB);
3057 #endif
3058 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3059 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3060 if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
3062 re_free (sbcset);
3063 re_free (mbcset);
3064 *err = REG_ESPACE;
3065 return NULL;
3068 token_len = peek_token_bracket (token, regexp, syntax);
3069 if (__glibc_unlikely (token->type == END_OF_RE))
3071 *err = REG_BADPAT;
3072 goto parse_bracket_exp_free_return;
3074 if (token->type == OP_NON_MATCH_LIST)
3076 mbcset->non_match = 1;
3077 non_match = true;
3078 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3079 bitset_set (sbcset, '\n');
3080 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3081 token_len = peek_token_bracket (token, regexp, syntax);
3082 if (__glibc_unlikely (token->type == END_OF_RE))
3084 *err = REG_BADPAT;
3085 goto parse_bracket_exp_free_return;
3089 /* We treat the first ']' as a normal character. */
3090 if (token->type == OP_CLOSE_BRACKET)
3091 token->type = CHARACTER;
3093 while (1)
3095 bracket_elem_t start_elem, end_elem;
3096 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3097 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3098 reg_errcode_t ret;
3099 int token_len2 = 0;
3100 bool is_range_exp = false;
3101 re_token_t token2;
3103 start_elem.opr.name = start_name_buf;
3104 start_elem.type = COLL_SYM;
3105 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3106 syntax, first_round);
3107 if (__glibc_unlikely (ret != REG_NOERROR))
3109 *err = ret;
3110 goto parse_bracket_exp_free_return;
3112 first_round = false;
3114 /* Get information about the next token. We need it in any case. */
3115 token_len = peek_token_bracket (token, regexp, syntax);
3117 /* Do not check for ranges if we know they are not allowed. */
3118 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3120 if (__glibc_unlikely (token->type == END_OF_RE))
3122 *err = REG_EBRACK;
3123 goto parse_bracket_exp_free_return;
3125 if (token->type == OP_CHARSET_RANGE)
3127 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3128 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3129 if (__glibc_unlikely (token2.type == END_OF_RE))
3131 *err = REG_EBRACK;
3132 goto parse_bracket_exp_free_return;
3134 if (token2.type == OP_CLOSE_BRACKET)
3136 /* We treat the last '-' as a normal character. */
3137 re_string_skip_bytes (regexp, -token_len);
3138 token->type = CHARACTER;
3140 else
3141 is_range_exp = true;
3145 if (is_range_exp == true)
3147 end_elem.opr.name = end_name_buf;
3148 end_elem.type = COLL_SYM;
3149 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3150 dfa, syntax, true);
3151 if (__glibc_unlikely (ret != REG_NOERROR))
3153 *err = ret;
3154 goto parse_bracket_exp_free_return;
3157 token_len = peek_token_bracket (token, regexp, syntax);
3159 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3160 &start_elem, &end_elem,
3161 dfa, syntax, nrules, collseqmb, collseqwc,
3162 table_size, symb_table, extra);
3163 if (__glibc_unlikely (*err != REG_NOERROR))
3164 goto parse_bracket_exp_free_return;
3166 else
3168 switch (start_elem.type)
3170 case SB_CHAR:
3171 bitset_set (sbcset, start_elem.opr.ch);
3172 break;
3173 case MB_CHAR:
3174 /* Check whether the array has enough space. */
3175 if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3177 wchar_t *new_mbchars;
3178 /* Not enough, realloc it. */
3179 /* +1 in case of mbcset->nmbchars is 0. */
3180 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3181 /* Use realloc since array is NULL if *alloc == 0. */
3182 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3183 mbchar_alloc);
3184 if (__glibc_unlikely (new_mbchars == NULL))
3185 goto parse_bracket_exp_espace;
3186 mbcset->mbchars = new_mbchars;
3188 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3189 break;
3190 case EQUIV_CLASS:
3191 *err = build_equiv_class (sbcset,
3192 mbcset, &equiv_class_alloc,
3193 start_elem.opr.name);
3194 if (__glibc_unlikely (*err != REG_NOERROR))
3195 goto parse_bracket_exp_free_return;
3196 break;
3197 case COLL_SYM:
3198 *err = build_collating_symbol (sbcset,
3199 mbcset, &coll_sym_alloc,
3200 start_elem.opr.name,
3201 nrules, table_size, symb_table, extra);
3202 if (__glibc_unlikely (*err != REG_NOERROR))
3203 goto parse_bracket_exp_free_return;
3204 break;
3205 case CHAR_CLASS:
3206 *err = build_charclass (regexp->trans, sbcset,
3207 mbcset, &char_class_alloc,
3208 (const char *) start_elem.opr.name,
3209 syntax);
3210 if (__glibc_unlikely (*err != REG_NOERROR))
3211 goto parse_bracket_exp_free_return;
3212 break;
3213 default:
3214 DEBUG_ASSERT (false);
3215 break;
3218 if (__glibc_unlikely (token->type == END_OF_RE))
3220 *err = REG_EBRACK;
3221 goto parse_bracket_exp_free_return;
3223 if (token->type == OP_CLOSE_BRACKET)
3224 break;
3227 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3229 /* If it is non-matching list. */
3230 if (non_match)
3231 bitset_not (sbcset);
3233 /* Ensure only single byte characters are set. */
3234 if (dfa->mb_cur_max > 1)
3235 bitset_mask (sbcset, dfa->sb_char);
3237 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3238 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3239 || mbcset->non_match)))
3241 bin_tree_t *mbc_tree;
3242 int sbc_idx;
3243 /* Build a tree for complex bracket. */
3244 dfa->has_mb_node = 1;
3245 br_token.type = COMPLEX_BRACKET;
3246 br_token.opr.mbcset = mbcset;
3247 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3248 if (__glibc_unlikely (mbc_tree == NULL))
3249 goto parse_bracket_exp_espace;
3250 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3251 if (sbcset[sbc_idx])
3252 break;
3253 /* If there are no bits set in sbcset, there is no point
3254 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3255 if (sbc_idx < BITSET_WORDS)
3257 /* Build a tree for simple bracket. */
3258 br_token.type = SIMPLE_BRACKET;
3259 br_token.opr.sbcset = sbcset;
3260 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3261 if (__glibc_unlikely (work_tree == NULL))
3262 goto parse_bracket_exp_espace;
3264 /* Then join them by ALT node. */
3265 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3266 if (__glibc_unlikely (work_tree == NULL))
3267 goto parse_bracket_exp_espace;
3269 else
3271 re_free (sbcset);
3272 work_tree = mbc_tree;
3275 else
3277 free_charset (mbcset);
3278 /* Build a tree for simple bracket. */
3279 br_token.type = SIMPLE_BRACKET;
3280 br_token.opr.sbcset = sbcset;
3281 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3282 if (__glibc_unlikely (work_tree == NULL))
3283 goto parse_bracket_exp_espace;
3285 return work_tree;
3287 parse_bracket_exp_espace:
3288 *err = REG_ESPACE;
3289 parse_bracket_exp_free_return:
3290 re_free (sbcset);
3291 free_charset (mbcset);
3292 return NULL;
3295 /* Parse an element in the bracket expression. */
3297 static reg_errcode_t
3298 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3299 re_token_t *token, int token_len, re_dfa_t *dfa,
3300 reg_syntax_t syntax, bool accept_hyphen)
3302 int cur_char_size;
3303 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3304 if (cur_char_size > 1)
3306 elem->type = MB_CHAR;
3307 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3308 re_string_skip_bytes (regexp, cur_char_size);
3309 return REG_NOERROR;
3311 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3312 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3313 || token->type == OP_OPEN_EQUIV_CLASS)
3314 return parse_bracket_symbol (elem, regexp, token);
3315 if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3317 /* A '-' must only appear as anything but a range indicator before
3318 the closing bracket. Everything else is an error. */
3319 re_token_t token2;
3320 (void) peek_token_bracket (&token2, regexp, syntax);
3321 if (token2.type != OP_CLOSE_BRACKET)
3322 /* The actual error value is not standardized since this whole
3323 case is undefined. But ERANGE makes good sense. */
3324 return REG_ERANGE;
3326 elem->type = SB_CHAR;
3327 elem->opr.ch = token->opr.c;
3328 return REG_NOERROR;
3331 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3332 such as [:<character_class>:], [.<collating_element>.], and
3333 [=<equivalent_class>=]. */
3335 static reg_errcode_t
3336 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3337 re_token_t *token)
3339 unsigned char ch, delim = token->opr.c;
3340 int i = 0;
3341 if (re_string_eoi(regexp))
3342 return REG_EBRACK;
3343 for (;; ++i)
3345 if (i >= BRACKET_NAME_BUF_SIZE)
3346 return REG_EBRACK;
3347 if (token->type == OP_OPEN_CHAR_CLASS)
3348 ch = re_string_fetch_byte_case (regexp);
3349 else
3350 ch = re_string_fetch_byte (regexp);
3351 if (re_string_eoi(regexp))
3352 return REG_EBRACK;
3353 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3354 break;
3355 elem->opr.name[i] = ch;
3357 re_string_skip_bytes (regexp, 1);
3358 elem->opr.name[i] = '\0';
3359 switch (token->type)
3361 case OP_OPEN_COLL_ELEM:
3362 elem->type = COLL_SYM;
3363 break;
3364 case OP_OPEN_EQUIV_CLASS:
3365 elem->type = EQUIV_CLASS;
3366 break;
3367 case OP_OPEN_CHAR_CLASS:
3368 elem->type = CHAR_CLASS;
3369 break;
3370 default:
3371 break;
3373 return REG_NOERROR;
3376 /* Helper function for parse_bracket_exp.
3377 Build the equivalence class which is represented by NAME.
3378 The result are written to MBCSET and SBCSET.
3379 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3380 is a pointer argument since we may update it. */
3382 static reg_errcode_t
3383 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3384 Idx *equiv_class_alloc, const unsigned char *name)
3386 #ifdef _LIBC
3387 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3388 if (nrules != 0)
3390 const int32_t *table, *indirect;
3391 const unsigned char *weights, *extra, *cp;
3392 unsigned char char_buf[2];
3393 int32_t idx1, idx2;
3394 unsigned int ch;
3395 size_t len;
3396 /* Calculate the index for equivalence class. */
3397 cp = name;
3398 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3399 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3400 _NL_COLLATE_WEIGHTMB);
3401 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3402 _NL_COLLATE_EXTRAMB);
3403 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3404 _NL_COLLATE_INDIRECTMB);
3405 idx1 = findidx (table, indirect, extra, &cp, -1);
3406 if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
3407 /* This isn't a valid character. */
3408 return REG_ECOLLATE;
3410 /* Build single byte matching table for this equivalence class. */
3411 len = weights[idx1 & 0xffffff];
3412 for (ch = 0; ch < SBC_MAX; ++ch)
3414 char_buf[0] = ch;
3415 cp = char_buf;
3416 idx2 = findidx (table, indirect, extra, &cp, 1);
3418 idx2 = table[ch];
3420 if (idx2 == 0)
3421 /* This isn't a valid character. */
3422 continue;
3423 /* Compare only if the length matches and the collation rule
3424 index is the same. */
3425 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3426 && memcmp (weights + (idx1 & 0xffffff) + 1,
3427 weights + (idx2 & 0xffffff) + 1, len) == 0)
3428 bitset_set (sbcset, ch);
3430 /* Check whether the array has enough space. */
3431 if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3433 /* Not enough, realloc it. */
3434 /* +1 in case of mbcset->nequiv_classes is 0. */
3435 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3436 /* Use realloc since the array is NULL if *alloc == 0. */
3437 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3438 int32_t,
3439 new_equiv_class_alloc);
3440 if (__glibc_unlikely (new_equiv_classes == NULL))
3441 return REG_ESPACE;
3442 mbcset->equiv_classes = new_equiv_classes;
3443 *equiv_class_alloc = new_equiv_class_alloc;
3445 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3447 else
3448 #endif /* _LIBC */
3450 if (__glibc_unlikely (strlen ((const char *) name) != 1))
3451 return REG_ECOLLATE;
3452 bitset_set (sbcset, *name);
3454 return REG_NOERROR;
3457 /* Helper function for parse_bracket_exp.
3458 Build the character class which is represented by NAME.
3459 The result are written to MBCSET and SBCSET.
3460 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3461 is a pointer argument since we may update it. */
3463 static reg_errcode_t
3464 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3465 re_charset_t *mbcset, Idx *char_class_alloc,
3466 const char *class_name, reg_syntax_t syntax)
3468 int i;
3469 const char *name = class_name;
3471 /* In case of REG_ICASE "upper" and "lower" match the both of
3472 upper and lower cases. */
3473 if ((syntax & RE_ICASE)
3474 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3475 name = "alpha";
3477 /* Check the space of the arrays. */
3478 if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3480 /* Not enough, realloc it. */
3481 /* +1 in case of mbcset->nchar_classes is 0. */
3482 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3483 /* Use realloc since array is NULL if *alloc == 0. */
3484 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3485 new_char_class_alloc);
3486 if (__glibc_unlikely (new_char_classes == NULL))
3487 return REG_ESPACE;
3488 mbcset->char_classes = new_char_classes;
3489 *char_class_alloc = new_char_class_alloc;
3491 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3493 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3494 do { \
3495 if (__glibc_unlikely (trans != NULL)) \
3497 for (i = 0; i < SBC_MAX; ++i) \
3498 if (ctype_func (i)) \
3499 bitset_set (sbcset, trans[i]); \
3501 else \
3503 for (i = 0; i < SBC_MAX; ++i) \
3504 if (ctype_func (i)) \
3505 bitset_set (sbcset, i); \
3507 } while (0)
3509 if (strcmp (name, "alnum") == 0)
3510 BUILD_CHARCLASS_LOOP (isalnum);
3511 else if (strcmp (name, "cntrl") == 0)
3512 BUILD_CHARCLASS_LOOP (iscntrl);
3513 else if (strcmp (name, "lower") == 0)
3514 BUILD_CHARCLASS_LOOP (islower);
3515 else if (strcmp (name, "space") == 0)
3516 BUILD_CHARCLASS_LOOP (isspace);
3517 else if (strcmp (name, "alpha") == 0)
3518 BUILD_CHARCLASS_LOOP (isalpha);
3519 else if (strcmp (name, "digit") == 0)
3520 BUILD_CHARCLASS_LOOP (isdigit);
3521 else if (strcmp (name, "print") == 0)
3522 BUILD_CHARCLASS_LOOP (isprint);
3523 else if (strcmp (name, "upper") == 0)
3524 BUILD_CHARCLASS_LOOP (isupper);
3525 else if (strcmp (name, "blank") == 0)
3526 BUILD_CHARCLASS_LOOP (isblank);
3527 else if (strcmp (name, "graph") == 0)
3528 BUILD_CHARCLASS_LOOP (isgraph);
3529 else if (strcmp (name, "punct") == 0)
3530 BUILD_CHARCLASS_LOOP (ispunct);
3531 else if (strcmp (name, "xdigit") == 0)
3532 BUILD_CHARCLASS_LOOP (isxdigit);
3533 else
3534 return REG_ECTYPE;
3536 return REG_NOERROR;
3539 static bin_tree_t *
3540 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3541 const char *class_name,
3542 const char *extra, bool non_match,
3543 reg_errcode_t *err)
3545 re_bitset_ptr_t sbcset;
3546 re_charset_t *mbcset;
3547 Idx alloc = 0;
3548 reg_errcode_t ret;
3549 bin_tree_t *tree;
3551 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3552 if (__glibc_unlikely (sbcset == NULL))
3554 *err = REG_ESPACE;
3555 return NULL;
3557 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3558 if (__glibc_unlikely (mbcset == NULL))
3560 re_free (sbcset);
3561 *err = REG_ESPACE;
3562 return NULL;
3564 mbcset->non_match = non_match;
3566 /* We don't care the syntax in this case. */
3567 ret = build_charclass (trans, sbcset, mbcset, &alloc, class_name, 0);
3569 if (__glibc_unlikely (ret != REG_NOERROR))
3571 re_free (sbcset);
3572 free_charset (mbcset);
3573 *err = ret;
3574 return NULL;
3576 /* \w match '_' also. */
3577 for (; *extra; extra++)
3578 bitset_set (sbcset, *extra);
3580 /* If it is non-matching list. */
3581 if (non_match)
3582 bitset_not (sbcset);
3584 /* Ensure only single byte characters are set. */
3585 if (dfa->mb_cur_max > 1)
3586 bitset_mask (sbcset, dfa->sb_char);
3588 /* Build a tree for simple bracket. */
3589 re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3590 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3591 if (__glibc_unlikely (tree == NULL))
3592 goto build_word_op_espace;
3594 if (dfa->mb_cur_max > 1)
3596 bin_tree_t *mbc_tree;
3597 /* Build a tree for complex bracket. */
3598 br_token.type = COMPLEX_BRACKET;
3599 br_token.opr.mbcset = mbcset;
3600 dfa->has_mb_node = 1;
3601 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3602 if (__glibc_unlikely (mbc_tree == NULL))
3603 goto build_word_op_espace;
3604 /* Then join them by ALT node. */
3605 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3606 if (__glibc_likely (mbc_tree != NULL))
3607 return tree;
3609 else
3611 free_charset (mbcset);
3612 return tree;
3615 build_word_op_espace:
3616 re_free (sbcset);
3617 free_charset (mbcset);
3618 *err = REG_ESPACE;
3619 return NULL;
3622 /* This is intended for the expressions like "a{1,3}".
3623 Fetch a number from 'input', and return the number.
3624 Return -1 if the number field is empty like "{,1}".
3625 Return RE_DUP_MAX + 1 if the number field is too large.
3626 Return -2 if an error occurred. */
3628 static Idx
3629 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3631 Idx num = -1;
3632 unsigned char c;
3633 while (1)
3635 fetch_token (token, input, syntax);
3636 c = token->opr.c;
3637 if (__glibc_unlikely (token->type == END_OF_RE))
3638 return -2;
3639 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3640 break;
3641 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3642 ? -2
3643 : num == -1
3644 ? c - '0'
3645 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3647 return num;
3650 static void
3651 free_charset (re_charset_t *cset)
3653 re_free (cset->mbchars);
3654 #ifdef _LIBC
3655 re_free (cset->coll_syms);
3656 re_free (cset->equiv_classes);
3657 #endif
3658 re_free (cset->range_starts);
3659 re_free (cset->range_ends);
3660 re_free (cset->char_classes);
3661 re_free (cset);
3664 /* Functions for binary tree operation. */
3666 /* Create a tree node. */
3668 static bin_tree_t *
3669 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3670 re_token_type_t type)
3672 re_token_t t = { .type = type };
3673 return create_token_tree (dfa, left, right, &t);
3676 static bin_tree_t *
3677 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3678 const re_token_t *token)
3680 bin_tree_t *tree;
3681 if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3683 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3685 if (storage == NULL)
3686 return NULL;
3687 storage->next = dfa->str_tree_storage;
3688 dfa->str_tree_storage = storage;
3689 dfa->str_tree_storage_idx = 0;
3691 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3693 tree->parent = NULL;
3694 tree->left = left;
3695 tree->right = right;
3696 tree->token = *token;
3697 tree->token.duplicated = 0;
3698 tree->token.opt_subexp = 0;
3699 tree->first = NULL;
3700 tree->next = NULL;
3701 tree->node_idx = -1;
3703 if (left != NULL)
3704 left->parent = tree;
3705 if (right != NULL)
3706 right->parent = tree;
3707 return tree;
3710 /* Mark the tree SRC as an optional subexpression.
3711 To be called from preorder or postorder. */
3713 static reg_errcode_t
3714 mark_opt_subexp (void *extra, bin_tree_t *node)
3716 Idx idx = (uintptr_t) extra;
3717 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3718 node->token.opt_subexp = 1;
3720 return REG_NOERROR;
3723 /* Free the allocated memory inside NODE. */
3725 static void
3726 free_token (re_token_t *node)
3728 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3729 free_charset (node->opr.mbcset);
3730 else if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3731 re_free (node->opr.sbcset);
3734 /* Worker function for tree walking. Free the allocated memory inside NODE
3735 and its children. */
3737 static reg_errcode_t
3738 free_tree (void *extra, bin_tree_t *node)
3740 free_token (&node->token);
3741 return REG_NOERROR;
3745 /* Duplicate the node SRC, and return new node. This is a preorder
3746 visit similar to the one implemented by the generic visitor, but
3747 we need more infrastructure to maintain two parallel trees --- so,
3748 it's easier to duplicate. */
3750 static bin_tree_t *
3751 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3753 const bin_tree_t *node;
3754 bin_tree_t *dup_root;
3755 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3757 for (node = root; ; )
3759 /* Create a new tree and link it back to the current parent. */
3760 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3761 if (*p_new == NULL)
3762 return NULL;
3763 (*p_new)->parent = dup_node;
3764 (*p_new)->token.duplicated = 1;
3765 dup_node = *p_new;
3767 /* Go to the left node, or up and to the right. */
3768 if (node->left)
3770 node = node->left;
3771 p_new = &dup_node->left;
3773 else
3775 const bin_tree_t *prev = NULL;
3776 while (node->right == prev || node->right == NULL)
3778 prev = node;
3779 node = node->parent;
3780 dup_node = dup_node->parent;
3781 if (!node)
3782 return dup_root;
3784 node = node->right;
3785 p_new = &dup_node->right;