NEWS: Add advisories.
[glibc.git] / posix / regcomp.c
blob5380d3c7b9d5139d544f06bbaaba717dc36e14e7
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
20 #ifdef _LIBC
21 # include <locale/weight.h>
22 #endif
24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 size_t length, reg_syntax_t syntax);
26 static void re_compile_fastmap_iter (regex_t *bufp,
27 const re_dfastate_t *init_state,
28 char *fastmap);
29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30 #ifdef RE_ENABLE_I18N
31 static void free_charset (re_charset_t *cset);
32 #endif /* RE_ENABLE_I18N */
33 static void free_workarea_compile (regex_t *preg);
34 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35 #ifdef RE_ENABLE_I18N
36 static void optimize_utf8 (re_dfa_t *dfa);
37 #endif
38 static reg_errcode_t analyze (regex_t *preg);
39 static reg_errcode_t preorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42 static reg_errcode_t postorder (bin_tree_t *root,
43 reg_errcode_t (fn (void *, bin_tree_t *)),
44 void *extra);
45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 bin_tree_t *node);
49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54 unsigned int constraint);
55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
57 Idx node, bool root);
58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59 static Idx fetch_number (re_string_t *input, re_token_t *token,
60 reg_syntax_t syntax);
61 static int peek_token (re_token_t *token, re_string_t *input,
62 reg_syntax_t syntax);
63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
64 reg_syntax_t syntax, reg_errcode_t *err);
65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 Idx nest, reg_errcode_t *err);
68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 Idx nest, reg_errcode_t *err);
71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 Idx nest, reg_errcode_t *err);
74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
75 re_token_t *token, reg_syntax_t syntax,
76 Idx nest, reg_errcode_t *err);
77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
78 re_dfa_t *dfa, re_token_t *token,
79 reg_syntax_t syntax, reg_errcode_t *err);
80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
81 re_token_t *token, reg_syntax_t syntax,
82 reg_errcode_t *err);
83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84 re_string_t *regexp,
85 re_token_t *token, int token_len,
86 re_dfa_t *dfa,
87 reg_syntax_t syntax,
88 bool accept_hyphen);
89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 re_string_t *regexp,
91 re_token_t *token);
92 #ifdef RE_ENABLE_I18N
93 static reg_errcode_t build_equiv_class (bitset_t sbcset,
94 re_charset_t *mbcset,
95 Idx *equiv_class_alloc,
96 const unsigned char *name);
97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98 bitset_t sbcset,
99 re_charset_t *mbcset,
100 Idx *char_class_alloc,
101 const char *class_name,
102 reg_syntax_t syntax);
103 #else /* not RE_ENABLE_I18N */
104 static reg_errcode_t build_equiv_class (bitset_t sbcset,
105 const unsigned char *name);
106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107 bitset_t sbcset,
108 const char *class_name,
109 reg_syntax_t syntax);
110 #endif /* not RE_ENABLE_I18N */
111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
112 RE_TRANSLATE_TYPE trans,
113 const char *class_name,
114 const char *extra,
115 bool non_match, reg_errcode_t *err);
116 static bin_tree_t *create_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 re_token_type_t type);
119 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
120 bin_tree_t *left, bin_tree_t *right,
121 const re_token_t *token);
122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
123 static void free_token (re_token_t *node);
124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
127 /* This table gives an error message for each of the error codes listed
128 in regex.h. Obviously the order here has to be same as there.
129 POSIX doesn't require that we do anything for REG_NOERROR,
130 but why not be nice? */
132 static const char __re_error_msgid[] =
134 #define REG_NOERROR_IDX 0
135 gettext_noop ("Success") /* REG_NOERROR */
136 "\0"
137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138 gettext_noop ("No match") /* REG_NOMATCH */
139 "\0"
140 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
142 "\0"
143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
145 "\0"
146 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
148 "\0"
149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
151 "\0"
152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
154 "\0"
155 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
156 gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
157 "\0"
158 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
159 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
160 "\0"
161 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
163 "\0"
164 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
166 "\0"
167 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168 gettext_noop ("Invalid range end") /* REG_ERANGE */
169 "\0"
170 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171 gettext_noop ("Memory exhausted") /* REG_ESPACE */
172 "\0"
173 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
175 "\0"
176 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177 gettext_noop ("Premature end of regular expression") /* REG_EEND */
178 "\0"
179 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180 gettext_noop ("Regular expression too big") /* REG_ESIZE */
181 "\0"
182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
186 static const size_t __re_error_msgid_idx[] =
188 REG_NOERROR_IDX,
189 REG_NOMATCH_IDX,
190 REG_BADPAT_IDX,
191 REG_ECOLLATE_IDX,
192 REG_ECTYPE_IDX,
193 REG_EESCAPE_IDX,
194 REG_ESUBREG_IDX,
195 REG_EBRACK_IDX,
196 REG_EPAREN_IDX,
197 REG_EBRACE_IDX,
198 REG_BADBR_IDX,
199 REG_ERANGE_IDX,
200 REG_ESPACE_IDX,
201 REG_BADRPT_IDX,
202 REG_EEND_IDX,
203 REG_ESIZE_IDX,
204 REG_ERPAREN_IDX
207 /* Entry points for GNU code. */
209 /* re_compile_pattern is the GNU regular expression compiler: it
210 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211 Returns 0 if the pattern was valid, otherwise an error string.
213 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214 are set in BUFP on entry. */
216 const char *
217 re_compile_pattern (const char *pattern, size_t length,
218 struct re_pattern_buffer *bufp)
220 reg_errcode_t ret;
222 /* And GNU code determines whether or not to get register information
223 by passing null for the REGS argument to re_match, etc., not by
224 setting no_sub, unless RE_NO_SUB is set. */
225 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
227 /* Match anchors at newline. */
228 bufp->newline_anchor = 1;
230 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
232 if (!ret)
233 return NULL;
234 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
236 weak_alias (__re_compile_pattern, re_compile_pattern)
238 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
239 also be assigned to arbitrarily: each pattern buffer stores its own
240 syntax, so it can be changed between regex compilations. */
241 /* This has no initializer because initialized variables in Emacs
242 become read-only after dumping. */
243 reg_syntax_t re_syntax_options;
246 /* Specify the precise syntax of regexps for compilation. This provides
247 for compatibility for various utilities which historically have
248 different, incompatible syntaxes.
250 The argument SYNTAX is a bit mask comprised of the various bits
251 defined in regex.h. We return the old syntax. */
253 reg_syntax_t
254 re_set_syntax (reg_syntax_t syntax)
256 reg_syntax_t ret = re_syntax_options;
258 re_syntax_options = syntax;
259 return ret;
261 weak_alias (__re_set_syntax, re_set_syntax)
264 re_compile_fastmap (struct re_pattern_buffer *bufp)
266 re_dfa_t *dfa = bufp->buffer;
267 char *fastmap = bufp->fastmap;
269 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
270 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271 if (dfa->init_state != dfa->init_state_word)
272 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273 if (dfa->init_state != dfa->init_state_nl)
274 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275 if (dfa->init_state != dfa->init_state_begbuf)
276 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277 bufp->fastmap_accurate = 1;
278 return 0;
280 weak_alias (__re_compile_fastmap, re_compile_fastmap)
282 static inline void
283 __attribute__ ((always_inline))
284 re_set_fastmap (char *fastmap, bool icase, int ch)
286 fastmap[ch] = 1;
287 if (icase)
288 fastmap[tolower (ch)] = 1;
291 /* Helper function for re_compile_fastmap.
292 Compile fastmap for the initial_state INIT_STATE. */
294 static void
295 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
296 char *fastmap)
298 re_dfa_t *dfa = bufp->buffer;
299 Idx node_cnt;
300 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
301 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
303 Idx node = init_state->nodes.elems[node_cnt];
304 re_token_type_t type = dfa->nodes[node].type;
306 if (type == CHARACTER)
308 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
309 #ifdef RE_ENABLE_I18N
310 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
312 unsigned char buf[MB_LEN_MAX];
313 unsigned char *p;
314 wchar_t wc;
315 mbstate_t state;
317 p = buf;
318 *p++ = dfa->nodes[node].opr.c;
319 while (++node < dfa->nodes_len
320 && dfa->nodes[node].type == CHARACTER
321 && dfa->nodes[node].mb_partial)
322 *p++ = dfa->nodes[node].opr.c;
323 memset (&state, '\0', sizeof (state));
324 if (__mbrtowc (&wc, (const char *) buf, p - buf,
325 &state) == p - buf
326 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
327 != (size_t) -1))
328 re_set_fastmap (fastmap, false, buf[0]);
330 #endif
332 else if (type == SIMPLE_BRACKET)
334 int i, ch;
335 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
337 int j;
338 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
339 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
340 if (w & ((bitset_word_t) 1 << j))
341 re_set_fastmap (fastmap, icase, ch);
344 #ifdef RE_ENABLE_I18N
345 else if (type == COMPLEX_BRACKET)
347 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
348 Idx i;
350 # ifdef _LIBC
351 /* See if we have to try all bytes which start multiple collation
352 elements.
353 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
354 collation element, and don't catch 'b' since 'b' is
355 the only collation element which starts from 'b' (and
356 it is caught by SIMPLE_BRACKET). */
357 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
358 && (cset->ncoll_syms || cset->nranges))
360 const int32_t *table = (const int32_t *)
361 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
362 for (i = 0; i < SBC_MAX; ++i)
363 if (table[i] < 0)
364 re_set_fastmap (fastmap, icase, i);
366 # endif /* _LIBC */
368 /* See if we have to start the match at all multibyte characters,
369 i.e. where we would not find an invalid sequence. This only
370 applies to multibyte character sets; for single byte character
371 sets, the SIMPLE_BRACKET again suffices. */
372 if (dfa->mb_cur_max > 1
373 && (cset->nchar_classes || cset->non_match || cset->nranges
374 # ifdef _LIBC
375 || cset->nequiv_classes
376 # endif /* _LIBC */
379 unsigned char c = 0;
382 mbstate_t mbs;
383 memset (&mbs, 0, sizeof (mbs));
384 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
385 re_set_fastmap (fastmap, false, (int) c);
387 while (++c != 0);
390 else
392 /* ... Else catch all bytes which can start the mbchars. */
393 for (i = 0; i < cset->nmbchars; ++i)
395 char buf[256];
396 mbstate_t state;
397 memset (&state, '\0', sizeof (state));
398 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
399 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
400 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
402 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
403 != (size_t) -1)
404 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
409 #endif /* RE_ENABLE_I18N */
410 else if (type == OP_PERIOD
411 #ifdef RE_ENABLE_I18N
412 || type == OP_UTF8_PERIOD
413 #endif /* RE_ENABLE_I18N */
414 || type == END_OF_RE)
416 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
417 if (type == END_OF_RE)
418 bufp->can_be_null = 1;
419 return;
424 /* Entry point for POSIX code. */
425 /* regcomp takes a regular expression as a string and compiles it.
427 PREG is a regex_t *. We do not expect any fields to be initialized,
428 since POSIX says we shouldn't. Thus, we set
430 'buffer' to the compiled pattern;
431 'used' to the length of the compiled pattern;
432 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
433 REG_EXTENDED bit in CFLAGS is set; otherwise, to
434 RE_SYNTAX_POSIX_BASIC;
435 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
436 'fastmap' to an allocated space for the fastmap;
437 'fastmap_accurate' to zero;
438 're_nsub' to the number of subexpressions in PATTERN.
440 PATTERN is the address of the pattern string.
442 CFLAGS is a series of bits which affect compilation.
444 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
445 use POSIX basic syntax.
447 If REG_NEWLINE is set, then . and [^...] don't match newline.
448 Also, regexec will try a match beginning after every newline.
450 If REG_ICASE is set, then we considers upper- and lowercase
451 versions of letters to be equivalent when matching.
453 If REG_NOSUB is set, then when PREG is passed to regexec, that
454 routine will report only success or failure, and nothing about the
455 registers.
457 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
458 the return codes and their meanings.) */
461 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
463 reg_errcode_t ret;
464 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
465 : RE_SYNTAX_POSIX_BASIC);
467 preg->buffer = NULL;
468 preg->allocated = 0;
469 preg->used = 0;
471 /* Try to allocate space for the fastmap. */
472 preg->fastmap = re_malloc (char, SBC_MAX);
473 if (__glibc_unlikely (preg->fastmap == NULL))
474 return REG_ESPACE;
476 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
478 /* If REG_NEWLINE is set, newlines are treated differently. */
479 if (cflags & REG_NEWLINE)
480 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
481 syntax &= ~RE_DOT_NEWLINE;
482 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
483 /* It also changes the matching behavior. */
484 preg->newline_anchor = 1;
486 else
487 preg->newline_anchor = 0;
488 preg->no_sub = !!(cflags & REG_NOSUB);
489 preg->translate = NULL;
491 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
493 /* POSIX doesn't distinguish between an unmatched open-group and an
494 unmatched close-group: both are REG_EPAREN. */
495 if (ret == REG_ERPAREN)
496 ret = REG_EPAREN;
498 /* We have already checked preg->fastmap != NULL. */
499 if (__glibc_likely (ret == REG_NOERROR))
500 /* Compute the fastmap now, since regexec cannot modify the pattern
501 buffer. This function never fails in this implementation. */
502 (void) re_compile_fastmap (preg);
503 else
505 /* Some error occurred while compiling the expression. */
506 re_free (preg->fastmap);
507 preg->fastmap = NULL;
510 return (int) ret;
512 libc_hidden_def (__regcomp)
513 weak_alias (__regcomp, regcomp)
515 /* Returns a message corresponding to an error code, ERRCODE, returned
516 from either regcomp or regexec. We don't use PREG here. */
518 size_t
519 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
520 size_t errbuf_size)
522 const char *msg;
523 size_t msg_size;
524 int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
526 if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
527 /* Only error codes returned by the rest of the code should be passed
528 to this routine. If we are given anything else, or if other regex
529 code generates an invalid error code, then the program has a bug.
530 Dump core so we can fix it. */
531 abort ();
533 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
535 msg_size = strlen (msg) + 1; /* Includes the null. */
537 if (__glibc_likely (errbuf_size != 0))
539 size_t cpy_size = msg_size;
540 if (__glibc_unlikely (msg_size > errbuf_size))
542 cpy_size = errbuf_size - 1;
543 errbuf[cpy_size] = '\0';
545 memcpy (errbuf, msg, cpy_size);
548 return msg_size;
550 weak_alias (__regerror, regerror)
553 #ifdef RE_ENABLE_I18N
554 /* This static array is used for the map to single-byte characters when
555 UTF-8 is used. Otherwise we would allocate memory just to initialize
556 it the same all the time. UTF-8 is the preferred encoding so this is
557 a worthwhile optimization. */
558 static const bitset_t utf8_sb_map =
560 /* Set the first 128 bits. */
561 # if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
562 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
563 # else
564 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
565 # error "bitset_word_t is narrower than 32 bits"
566 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
567 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
568 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
569 BITSET_WORD_MAX, BITSET_WORD_MAX,
570 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
571 BITSET_WORD_MAX,
572 # endif
573 (BITSET_WORD_MAX
574 >> (SBC_MAX % BITSET_WORD_BITS == 0
576 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
577 # endif
579 #endif
582 static void
583 free_dfa_content (re_dfa_t *dfa)
585 Idx i, j;
587 if (dfa->nodes)
588 for (i = 0; i < dfa->nodes_len; ++i)
589 free_token (dfa->nodes + i);
590 re_free (dfa->nexts);
591 for (i = 0; i < dfa->nodes_len; ++i)
593 if (dfa->eclosures != NULL)
594 re_node_set_free (dfa->eclosures + i);
595 if (dfa->inveclosures != NULL)
596 re_node_set_free (dfa->inveclosures + i);
597 if (dfa->edests != NULL)
598 re_node_set_free (dfa->edests + i);
600 re_free (dfa->edests);
601 re_free (dfa->eclosures);
602 re_free (dfa->inveclosures);
603 re_free (dfa->nodes);
605 if (dfa->state_table)
606 for (i = 0; i <= dfa->state_hash_mask; ++i)
608 struct re_state_table_entry *entry = dfa->state_table + i;
609 for (j = 0; j < entry->num; ++j)
611 re_dfastate_t *state = entry->array[j];
612 free_state (state);
614 re_free (entry->array);
616 re_free (dfa->state_table);
617 #ifdef RE_ENABLE_I18N
618 if (dfa->sb_char != utf8_sb_map)
619 re_free (dfa->sb_char);
620 #endif
621 re_free (dfa->subexp_map);
622 #ifdef DEBUG
623 re_free (dfa->re_str);
624 #endif
626 re_free (dfa);
630 /* Free dynamically allocated space used by PREG. */
632 void
633 regfree (regex_t *preg)
635 re_dfa_t *dfa = preg->buffer;
636 if (__glibc_likely (dfa != NULL))
638 lock_fini (dfa->lock);
639 free_dfa_content (dfa);
641 preg->buffer = NULL;
642 preg->allocated = 0;
644 re_free (preg->fastmap);
645 preg->fastmap = NULL;
647 re_free (preg->translate);
648 preg->translate = NULL;
650 libc_hidden_def (__regfree)
651 weak_alias (__regfree, regfree)
653 /* Entry points compatible with 4.2 BSD regex library. We don't define
654 them unless specifically requested. */
656 #if defined _REGEX_RE_COMP || defined _LIBC
658 /* BSD has one and only one pattern buffer. */
659 static struct re_pattern_buffer re_comp_buf;
661 char *
662 # ifdef _LIBC
663 /* Make these definitions weak in libc, so POSIX programs can redefine
664 these names if they don't use our functions, and still use
665 regcomp/regexec above without link errors. */
666 weak_function
667 # endif
668 re_comp (const char *s)
670 reg_errcode_t ret;
671 char *fastmap;
673 if (!s)
675 if (!re_comp_buf.buffer)
676 return gettext ("No previous regular expression");
677 return 0;
680 if (re_comp_buf.buffer)
682 fastmap = re_comp_buf.fastmap;
683 re_comp_buf.fastmap = NULL;
684 __regfree (&re_comp_buf);
685 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
686 re_comp_buf.fastmap = fastmap;
689 if (re_comp_buf.fastmap == NULL)
691 re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
692 if (re_comp_buf.fastmap == NULL)
693 return (char *) gettext (__re_error_msgid
694 + __re_error_msgid_idx[(int) REG_ESPACE]);
697 /* Since 're_exec' always passes NULL for the 'regs' argument, we
698 don't need to initialize the pattern buffer fields which affect it. */
700 /* Match anchors at newlines. */
701 re_comp_buf.newline_anchor = 1;
703 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
705 if (!ret)
706 return NULL;
708 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
709 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
712 #ifdef _LIBC
713 void
714 __libc_regcomp_freemem (void)
716 __regfree (&re_comp_buf);
718 #endif
720 #endif /* _REGEX_RE_COMP */
722 /* Internal entry point.
723 Compile the regular expression PATTERN, whose length is LENGTH.
724 SYNTAX indicate regular expression's syntax. */
726 static reg_errcode_t
727 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
728 reg_syntax_t syntax)
730 reg_errcode_t err = REG_NOERROR;
731 re_dfa_t *dfa;
732 re_string_t regexp;
734 /* Initialize the pattern buffer. */
735 preg->fastmap_accurate = 0;
736 preg->syntax = syntax;
737 preg->not_bol = preg->not_eol = 0;
738 preg->used = 0;
739 preg->re_nsub = 0;
740 preg->can_be_null = 0;
741 preg->regs_allocated = REGS_UNALLOCATED;
743 /* Initialize the dfa. */
744 dfa = preg->buffer;
745 if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
747 /* If zero allocated, but buffer is non-null, try to realloc
748 enough space. This loses if buffer's address is bogus, but
749 that is the user's responsibility. If ->buffer is NULL this
750 is a simple allocation. */
751 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
752 if (dfa == NULL)
753 return REG_ESPACE;
754 preg->allocated = sizeof (re_dfa_t);
755 preg->buffer = dfa;
757 preg->used = sizeof (re_dfa_t);
759 err = init_dfa (dfa, length);
760 if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
761 err = REG_ESPACE;
762 if (__glibc_unlikely (err != REG_NOERROR))
764 free_dfa_content (dfa);
765 preg->buffer = NULL;
766 preg->allocated = 0;
767 return err;
769 #ifdef DEBUG
770 /* Note: length+1 will not overflow since it is checked in init_dfa. */
771 dfa->re_str = re_malloc (char, length + 1);
772 strncpy (dfa->re_str, pattern, length + 1);
773 #endif
775 err = re_string_construct (&regexp, pattern, length, preg->translate,
776 (syntax & RE_ICASE) != 0, dfa);
777 if (__glibc_unlikely (err != REG_NOERROR))
779 re_compile_internal_free_return:
780 free_workarea_compile (preg);
781 re_string_destruct (&regexp);
782 lock_fini (dfa->lock);
783 free_dfa_content (dfa);
784 preg->buffer = NULL;
785 preg->allocated = 0;
786 return err;
789 /* Parse the regular expression, and build a structure tree. */
790 preg->re_nsub = 0;
791 dfa->str_tree = parse (&regexp, preg, syntax, &err);
792 if (__glibc_unlikely (dfa->str_tree == NULL))
793 goto re_compile_internal_free_return;
795 /* Analyze the tree and create the nfa. */
796 err = analyze (preg);
797 if (__glibc_unlikely (err != REG_NOERROR))
798 goto re_compile_internal_free_return;
800 #ifdef RE_ENABLE_I18N
801 /* If possible, do searching in single byte encoding to speed things up. */
802 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
803 optimize_utf8 (dfa);
804 #endif
806 /* Then create the initial state of the dfa. */
807 err = create_initial_state (dfa);
809 /* Release work areas. */
810 free_workarea_compile (preg);
811 re_string_destruct (&regexp);
813 if (__glibc_unlikely (err != REG_NOERROR))
815 lock_fini (dfa->lock);
816 free_dfa_content (dfa);
817 preg->buffer = NULL;
818 preg->allocated = 0;
821 return err;
824 /* Initialize DFA. We use the length of the regular expression PAT_LEN
825 as the initial length of some arrays. */
827 static reg_errcode_t
828 init_dfa (re_dfa_t *dfa, size_t pat_len)
830 __re_size_t table_size;
831 #ifndef _LIBC
832 const char *codeset_name;
833 #endif
834 #ifdef RE_ENABLE_I18N
835 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
836 #else
837 size_t max_i18n_object_size = 0;
838 #endif
839 size_t max_object_size =
840 MAX (sizeof (struct re_state_table_entry),
841 MAX (sizeof (re_token_t),
842 MAX (sizeof (re_node_set),
843 MAX (sizeof (regmatch_t),
844 max_i18n_object_size))));
846 memset (dfa, '\0', sizeof (re_dfa_t));
848 /* Force allocation of str_tree_storage the first time. */
849 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
851 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
852 calculation below, and for similar doubling calculations
853 elsewhere. And it's <= rather than <, because some of the
854 doubling calculations add 1 afterwards. */
855 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
856 <= pat_len))
857 return REG_ESPACE;
859 dfa->nodes_alloc = pat_len + 1;
860 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
862 /* table_size = 2 ^ ceil(log pat_len) */
863 for (table_size = 1; ; table_size <<= 1)
864 if (table_size > pat_len)
865 break;
867 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
868 dfa->state_hash_mask = table_size - 1;
870 dfa->mb_cur_max = MB_CUR_MAX;
871 #ifdef _LIBC
872 if (dfa->mb_cur_max == 6
873 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
874 dfa->is_utf8 = 1;
875 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
876 != 0);
877 #else
878 codeset_name = nl_langinfo (CODESET);
879 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
880 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
881 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
882 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
883 dfa->is_utf8 = 1;
885 /* We check exhaustively in the loop below if this charset is a
886 superset of ASCII. */
887 dfa->map_notascii = 0;
888 #endif
890 #ifdef RE_ENABLE_I18N
891 if (dfa->mb_cur_max > 1)
893 if (dfa->is_utf8)
894 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
895 else
897 int i, j, ch;
899 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
900 if (__glibc_unlikely (dfa->sb_char == NULL))
901 return REG_ESPACE;
903 /* Set the bits corresponding to single byte chars. */
904 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
905 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
907 wint_t wch = __btowc (ch);
908 if (wch != WEOF)
909 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
910 # ifndef _LIBC
911 if (isascii (ch) && wch != ch)
912 dfa->map_notascii = 1;
913 # endif
917 #endif
919 if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
920 return REG_ESPACE;
921 return REG_NOERROR;
924 /* Initialize WORD_CHAR table, which indicate which character is
925 "word". In this case "word" means that it is the word construction
926 character used by some operators like "\<", "\>", etc. */
928 static void
929 init_word_char (re_dfa_t *dfa)
931 int i = 0;
932 int j;
933 int ch = 0;
934 dfa->word_ops_used = 1;
935 if (__glibc_likely (dfa->map_notascii == 0))
937 /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
938 them, an issue when this code is used in Gnulib. */
939 bitset_word_t bits0 = 0x00000000;
940 bitset_word_t bits1 = 0x03ff0000;
941 bitset_word_t bits2 = 0x87fffffe;
942 bitset_word_t bits3 = 0x07fffffe;
943 if (BITSET_WORD_BITS == 64)
945 /* Pacify gcc -Woverflow on 32-bit platformns. */
946 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
947 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
948 i = 2;
950 else if (BITSET_WORD_BITS == 32)
952 dfa->word_char[0] = bits0;
953 dfa->word_char[1] = bits1;
954 dfa->word_char[2] = bits2;
955 dfa->word_char[3] = bits3;
956 i = 4;
958 else
959 goto general_case;
960 ch = 128;
962 if (__glibc_likely (dfa->is_utf8))
964 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
965 return;
969 general_case:
970 for (; i < BITSET_WORDS; ++i)
971 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
972 if (isalnum (ch) || ch == '_')
973 dfa->word_char[i] |= (bitset_word_t) 1 << j;
976 /* Free the work area which are only used while compiling. */
978 static void
979 free_workarea_compile (regex_t *preg)
981 re_dfa_t *dfa = preg->buffer;
982 bin_tree_storage_t *storage, *next;
983 for (storage = dfa->str_tree_storage; storage; storage = next)
985 next = storage->next;
986 re_free (storage);
988 dfa->str_tree_storage = NULL;
989 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
990 dfa->str_tree = NULL;
991 re_free (dfa->org_indices);
992 dfa->org_indices = NULL;
995 /* Create initial states for all contexts. */
997 static reg_errcode_t
998 create_initial_state (re_dfa_t *dfa)
1000 Idx first, i;
1001 reg_errcode_t err;
1002 re_node_set init_nodes;
1004 /* Initial states have the epsilon closure of the node which is
1005 the first node of the regular expression. */
1006 first = dfa->str_tree->first->node_idx;
1007 dfa->init_node = first;
1008 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1009 if (__glibc_unlikely (err != REG_NOERROR))
1010 return err;
1012 /* The back-references which are in initial states can epsilon transit,
1013 since in this case all of the subexpressions can be null.
1014 Then we add epsilon closures of the nodes which are the next nodes of
1015 the back-references. */
1016 if (dfa->nbackref > 0)
1017 for (i = 0; i < init_nodes.nelem; ++i)
1019 Idx node_idx = init_nodes.elems[i];
1020 re_token_type_t type = dfa->nodes[node_idx].type;
1022 Idx clexp_idx;
1023 if (type != OP_BACK_REF)
1024 continue;
1025 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1027 re_token_t *clexp_node;
1028 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1029 if (clexp_node->type == OP_CLOSE_SUBEXP
1030 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1031 break;
1033 if (clexp_idx == init_nodes.nelem)
1034 continue;
1036 if (type == OP_BACK_REF)
1038 Idx dest_idx = dfa->edests[node_idx].elems[0];
1039 if (!re_node_set_contains (&init_nodes, dest_idx))
1041 reg_errcode_t merge_err
1042 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1043 if (merge_err != REG_NOERROR)
1044 return merge_err;
1045 i = 0;
1050 /* It must be the first time to invoke acquire_state. */
1051 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1052 /* We don't check ERR here, since the initial state must not be NULL. */
1053 if (__glibc_unlikely (dfa->init_state == NULL))
1054 return err;
1055 if (dfa->init_state->has_constraint)
1057 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1058 CONTEXT_WORD);
1059 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1060 CONTEXT_NEWLINE);
1061 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1062 &init_nodes,
1063 CONTEXT_NEWLINE
1064 | CONTEXT_BEGBUF);
1065 if (__glibc_unlikely (dfa->init_state_word == NULL
1066 || dfa->init_state_nl == NULL
1067 || dfa->init_state_begbuf == NULL))
1068 return err;
1070 else
1071 dfa->init_state_word = dfa->init_state_nl
1072 = dfa->init_state_begbuf = dfa->init_state;
1074 re_node_set_free (&init_nodes);
1075 return REG_NOERROR;
1078 #ifdef RE_ENABLE_I18N
1079 /* If it is possible to do searching in single byte encoding instead of UTF-8
1080 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1081 DFA nodes where needed. */
1083 static void
1084 optimize_utf8 (re_dfa_t *dfa)
1086 Idx node;
1087 int i;
1088 bool mb_chars = false;
1089 bool has_period = false;
1091 for (node = 0; node < dfa->nodes_len; ++node)
1092 switch (dfa->nodes[node].type)
1094 case CHARACTER:
1095 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1096 mb_chars = true;
1097 break;
1098 case ANCHOR:
1099 switch (dfa->nodes[node].opr.ctx_type)
1101 case LINE_FIRST:
1102 case LINE_LAST:
1103 case BUF_FIRST:
1104 case BUF_LAST:
1105 break;
1106 default:
1107 /* Word anchors etc. cannot be handled. It's okay to test
1108 opr.ctx_type since constraints (for all DFA nodes) are
1109 created by ORing one or more opr.ctx_type values. */
1110 return;
1112 break;
1113 case OP_PERIOD:
1114 has_period = true;
1115 break;
1116 case OP_BACK_REF:
1117 case OP_ALT:
1118 case END_OF_RE:
1119 case OP_DUP_ASTERISK:
1120 case OP_OPEN_SUBEXP:
1121 case OP_CLOSE_SUBEXP:
1122 break;
1123 case COMPLEX_BRACKET:
1124 return;
1125 case SIMPLE_BRACKET:
1126 /* Just double check. */
1128 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1130 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1131 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1133 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1134 return;
1135 rshift = 0;
1138 break;
1139 default:
1140 abort ();
1143 if (mb_chars || has_period)
1144 for (node = 0; node < dfa->nodes_len; ++node)
1146 if (dfa->nodes[node].type == CHARACTER
1147 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1148 dfa->nodes[node].mb_partial = 0;
1149 else if (dfa->nodes[node].type == OP_PERIOD)
1150 dfa->nodes[node].type = OP_UTF8_PERIOD;
1153 /* The search can be in single byte locale. */
1154 dfa->mb_cur_max = 1;
1155 dfa->is_utf8 = 0;
1156 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1158 #endif
1160 /* Analyze the structure tree, and calculate "first", "next", "edest",
1161 "eclosure", and "inveclosure". */
1163 static reg_errcode_t
1164 analyze (regex_t *preg)
1166 re_dfa_t *dfa = preg->buffer;
1167 reg_errcode_t ret;
1169 /* Allocate arrays. */
1170 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1171 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1172 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1173 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1174 if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1175 || dfa->edests == NULL || dfa->eclosures == NULL))
1176 return REG_ESPACE;
1178 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1179 if (dfa->subexp_map != NULL)
1181 Idx i;
1182 for (i = 0; i < preg->re_nsub; i++)
1183 dfa->subexp_map[i] = i;
1184 preorder (dfa->str_tree, optimize_subexps, dfa);
1185 for (i = 0; i < preg->re_nsub; i++)
1186 if (dfa->subexp_map[i] != i)
1187 break;
1188 if (i == preg->re_nsub)
1190 re_free (dfa->subexp_map);
1191 dfa->subexp_map = NULL;
1195 ret = postorder (dfa->str_tree, lower_subexps, preg);
1196 if (__glibc_unlikely (ret != REG_NOERROR))
1197 return ret;
1198 ret = postorder (dfa->str_tree, calc_first, dfa);
1199 if (__glibc_unlikely (ret != REG_NOERROR))
1200 return ret;
1201 preorder (dfa->str_tree, calc_next, dfa);
1202 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1203 if (__glibc_unlikely (ret != REG_NOERROR))
1204 return ret;
1205 ret = calc_eclosure (dfa);
1206 if (__glibc_unlikely (ret != REG_NOERROR))
1207 return ret;
1209 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1210 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1211 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1212 || dfa->nbackref)
1214 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1215 if (__glibc_unlikely (dfa->inveclosures == NULL))
1216 return REG_ESPACE;
1217 ret = calc_inveclosure (dfa);
1220 return ret;
1223 /* Our parse trees are very unbalanced, so we cannot use a stack to
1224 implement parse tree visits. Instead, we use parent pointers and
1225 some hairy code in these two functions. */
1226 static reg_errcode_t
1227 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1228 void *extra)
1230 bin_tree_t *node, *prev;
1232 for (node = root; ; )
1234 /* Descend down the tree, preferably to the left (or to the right
1235 if that's the only child). */
1236 while (node->left || node->right)
1237 if (node->left)
1238 node = node->left;
1239 else
1240 node = node->right;
1244 reg_errcode_t err = fn (extra, node);
1245 if (__glibc_unlikely (err != REG_NOERROR))
1246 return err;
1247 if (node->parent == NULL)
1248 return REG_NOERROR;
1249 prev = node;
1250 node = node->parent;
1252 /* Go up while we have a node that is reached from the right. */
1253 while (node->right == prev || node->right == NULL);
1254 node = node->right;
1258 static reg_errcode_t
1259 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1260 void *extra)
1262 bin_tree_t *node;
1264 for (node = root; ; )
1266 reg_errcode_t err = fn (extra, node);
1267 if (__glibc_unlikely (err != REG_NOERROR))
1268 return err;
1270 /* Go to the left node, or up and to the right. */
1271 if (node->left)
1272 node = node->left;
1273 else
1275 bin_tree_t *prev = NULL;
1276 while (node->right == prev || node->right == NULL)
1278 prev = node;
1279 node = node->parent;
1280 if (!node)
1281 return REG_NOERROR;
1283 node = node->right;
1288 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1289 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1290 backreferences as well. Requires a preorder visit. */
1291 static reg_errcode_t
1292 optimize_subexps (void *extra, bin_tree_t *node)
1294 re_dfa_t *dfa = (re_dfa_t *) extra;
1296 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1298 int idx = node->token.opr.idx;
1299 node->token.opr.idx = dfa->subexp_map[idx];
1300 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1303 else if (node->token.type == SUBEXP
1304 && node->left && node->left->token.type == SUBEXP)
1306 Idx other_idx = node->left->token.opr.idx;
1308 node->left = node->left->left;
1309 if (node->left)
1310 node->left->parent = node;
1312 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1313 if (other_idx < BITSET_WORD_BITS)
1314 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1317 return REG_NOERROR;
1320 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1321 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1322 static reg_errcode_t
1323 lower_subexps (void *extra, bin_tree_t *node)
1325 regex_t *preg = (regex_t *) extra;
1326 reg_errcode_t err = REG_NOERROR;
1328 if (node->left && node->left->token.type == SUBEXP)
1330 node->left = lower_subexp (&err, preg, node->left);
1331 if (node->left)
1332 node->left->parent = node;
1334 if (node->right && node->right->token.type == SUBEXP)
1336 node->right = lower_subexp (&err, preg, node->right);
1337 if (node->right)
1338 node->right->parent = node;
1341 return err;
1344 static bin_tree_t *
1345 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1347 re_dfa_t *dfa = preg->buffer;
1348 bin_tree_t *body = node->left;
1349 bin_tree_t *op, *cls, *tree1, *tree;
1351 if (preg->no_sub
1352 /* We do not optimize empty subexpressions, because otherwise we may
1353 have bad CONCAT nodes with NULL children. This is obviously not
1354 very common, so we do not lose much. An example that triggers
1355 this case is the sed "script" /\(\)/x. */
1356 && node->left != NULL
1357 && (node->token.opr.idx >= BITSET_WORD_BITS
1358 || !(dfa->used_bkref_map
1359 & ((bitset_word_t) 1 << node->token.opr.idx))))
1360 return node->left;
1362 /* Convert the SUBEXP node to the concatenation of an
1363 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1364 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1365 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1366 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1367 tree = create_tree (dfa, op, tree1, CONCAT);
1368 if (__glibc_unlikely (tree == NULL || tree1 == NULL
1369 || op == NULL || cls == NULL))
1371 *err = REG_ESPACE;
1372 return NULL;
1375 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1376 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1377 return tree;
1380 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1381 nodes. Requires a postorder visit. */
1382 static reg_errcode_t
1383 calc_first (void *extra, bin_tree_t *node)
1385 re_dfa_t *dfa = (re_dfa_t *) extra;
1386 if (node->token.type == CONCAT)
1388 node->first = node->left->first;
1389 node->node_idx = node->left->node_idx;
1391 else
1393 node->first = node;
1394 node->node_idx = re_dfa_add_node (dfa, node->token);
1395 if (__glibc_unlikely (node->node_idx == -1))
1396 return REG_ESPACE;
1397 if (node->token.type == ANCHOR)
1398 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1400 return REG_NOERROR;
1403 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1404 static reg_errcode_t
1405 calc_next (void *extra, bin_tree_t *node)
1407 switch (node->token.type)
1409 case OP_DUP_ASTERISK:
1410 node->left->next = node;
1411 break;
1412 case CONCAT:
1413 node->left->next = node->right->first;
1414 node->right->next = node->next;
1415 break;
1416 default:
1417 if (node->left)
1418 node->left->next = node->next;
1419 if (node->right)
1420 node->right->next = node->next;
1421 break;
1423 return REG_NOERROR;
1426 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1427 static reg_errcode_t
1428 link_nfa_nodes (void *extra, bin_tree_t *node)
1430 re_dfa_t *dfa = (re_dfa_t *) extra;
1431 Idx idx = node->node_idx;
1432 reg_errcode_t err = REG_NOERROR;
1434 switch (node->token.type)
1436 case CONCAT:
1437 break;
1439 case END_OF_RE:
1440 DEBUG_ASSERT (node->next == NULL);
1441 break;
1443 case OP_DUP_ASTERISK:
1444 case OP_ALT:
1446 Idx left, right;
1447 dfa->has_plural_match = 1;
1448 if (node->left != NULL)
1449 left = node->left->first->node_idx;
1450 else
1451 left = node->next->node_idx;
1452 if (node->right != NULL)
1453 right = node->right->first->node_idx;
1454 else
1455 right = node->next->node_idx;
1456 DEBUG_ASSERT (left > -1);
1457 DEBUG_ASSERT (right > -1);
1458 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1460 break;
1462 case ANCHOR:
1463 case OP_OPEN_SUBEXP:
1464 case OP_CLOSE_SUBEXP:
1465 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1466 break;
1468 case OP_BACK_REF:
1469 dfa->nexts[idx] = node->next->node_idx;
1470 if (node->token.type == OP_BACK_REF)
1471 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1472 break;
1474 default:
1475 DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1476 dfa->nexts[idx] = node->next->node_idx;
1477 break;
1480 return err;
1483 /* Duplicate the epsilon closure of the node ROOT_NODE.
1484 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1485 to their own constraint. */
1487 static reg_errcode_t
1488 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1489 Idx root_node, unsigned int init_constraint)
1491 Idx org_node, clone_node;
1492 bool ok;
1493 unsigned int constraint = init_constraint;
1494 for (org_node = top_org_node, clone_node = top_clone_node;;)
1496 Idx org_dest, clone_dest;
1497 if (dfa->nodes[org_node].type == OP_BACK_REF)
1499 /* If the back reference epsilon-transit, its destination must
1500 also have the constraint. Then duplicate the epsilon closure
1501 of the destination of the back reference, and store it in
1502 edests of the back reference. */
1503 org_dest = dfa->nexts[org_node];
1504 re_node_set_empty (dfa->edests + clone_node);
1505 clone_dest = duplicate_node (dfa, org_dest, constraint);
1506 if (__glibc_unlikely (clone_dest == -1))
1507 return REG_ESPACE;
1508 dfa->nexts[clone_node] = dfa->nexts[org_node];
1509 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1510 if (__glibc_unlikely (! ok))
1511 return REG_ESPACE;
1513 else if (dfa->edests[org_node].nelem == 0)
1515 /* In case of the node can't epsilon-transit, don't duplicate the
1516 destination and store the original destination as the
1517 destination of the node. */
1518 dfa->nexts[clone_node] = dfa->nexts[org_node];
1519 break;
1521 else if (dfa->edests[org_node].nelem == 1)
1523 /* In case of the node can epsilon-transit, and it has only one
1524 destination. */
1525 org_dest = dfa->edests[org_node].elems[0];
1526 re_node_set_empty (dfa->edests + clone_node);
1527 /* If the node is root_node itself, it means the epsilon closure
1528 has a loop. Then tie it to the destination of the root_node. */
1529 if (org_node == root_node && clone_node != org_node)
1531 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1532 if (__glibc_unlikely (! ok))
1533 return REG_ESPACE;
1534 break;
1536 /* In case the node has another constraint, append it. */
1537 constraint |= dfa->nodes[org_node].constraint;
1538 clone_dest = duplicate_node (dfa, org_dest, constraint);
1539 if (__glibc_unlikely (clone_dest == -1))
1540 return REG_ESPACE;
1541 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1542 if (__glibc_unlikely (! ok))
1543 return REG_ESPACE;
1545 else /* dfa->edests[org_node].nelem == 2 */
1547 /* In case of the node can epsilon-transit, and it has two
1548 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1549 org_dest = dfa->edests[org_node].elems[0];
1550 re_node_set_empty (dfa->edests + clone_node);
1551 /* Search for a duplicated node which satisfies the constraint. */
1552 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1553 if (clone_dest == -1)
1555 /* There is no such duplicated node, create a new one. */
1556 reg_errcode_t err;
1557 clone_dest = duplicate_node (dfa, org_dest, constraint);
1558 if (__glibc_unlikely (clone_dest == -1))
1559 return REG_ESPACE;
1560 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1561 if (__glibc_unlikely (! ok))
1562 return REG_ESPACE;
1563 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1564 root_node, constraint);
1565 if (__glibc_unlikely (err != REG_NOERROR))
1566 return err;
1568 else
1570 /* There is a duplicated node which satisfies the constraint,
1571 use it to avoid infinite loop. */
1572 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1573 if (__glibc_unlikely (! ok))
1574 return REG_ESPACE;
1577 org_dest = dfa->edests[org_node].elems[1];
1578 clone_dest = duplicate_node (dfa, org_dest, constraint);
1579 if (__glibc_unlikely (clone_dest == -1))
1580 return REG_ESPACE;
1581 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1582 if (__glibc_unlikely (! ok))
1583 return REG_ESPACE;
1585 org_node = org_dest;
1586 clone_node = clone_dest;
1588 return REG_NOERROR;
1591 /* Search for a node which is duplicated from the node ORG_NODE, and
1592 satisfies the constraint CONSTRAINT. */
1594 static Idx
1595 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1596 unsigned int constraint)
1598 Idx idx;
1599 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1601 if (org_node == dfa->org_indices[idx]
1602 && constraint == dfa->nodes[idx].constraint)
1603 return idx; /* Found. */
1605 return -1; /* Not found. */
1608 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1609 Return the index of the new node, or -1 if insufficient storage is
1610 available. */
1612 static Idx
1613 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1615 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1616 if (__glibc_likely (dup_idx != -1))
1618 dfa->nodes[dup_idx].constraint = constraint;
1619 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1620 dfa->nodes[dup_idx].duplicated = 1;
1622 /* Store the index of the original node. */
1623 dfa->org_indices[dup_idx] = org_idx;
1625 return dup_idx;
1628 static reg_errcode_t
1629 calc_inveclosure (re_dfa_t *dfa)
1631 Idx src, idx;
1632 bool ok;
1633 for (idx = 0; idx < dfa->nodes_len; ++idx)
1634 re_node_set_init_empty (dfa->inveclosures + idx);
1636 for (src = 0; src < dfa->nodes_len; ++src)
1638 Idx *elems = dfa->eclosures[src].elems;
1639 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1641 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1642 if (__glibc_unlikely (! ok))
1643 return REG_ESPACE;
1647 return REG_NOERROR;
1650 /* Calculate "eclosure" for all the node in DFA. */
1652 static reg_errcode_t
1653 calc_eclosure (re_dfa_t *dfa)
1655 Idx node_idx;
1656 bool incomplete;
1657 DEBUG_ASSERT (dfa->nodes_len > 0);
1658 incomplete = false;
1659 /* For each nodes, calculate epsilon closure. */
1660 for (node_idx = 0; ; ++node_idx)
1662 reg_errcode_t err;
1663 re_node_set eclosure_elem;
1664 if (node_idx == dfa->nodes_len)
1666 if (!incomplete)
1667 break;
1668 incomplete = false;
1669 node_idx = 0;
1672 DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
1674 /* If we have already calculated, skip it. */
1675 if (dfa->eclosures[node_idx].nelem != 0)
1676 continue;
1677 /* Calculate epsilon closure of 'node_idx'. */
1678 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1679 if (__glibc_unlikely (err != REG_NOERROR))
1680 return err;
1682 if (dfa->eclosures[node_idx].nelem == 0)
1684 incomplete = true;
1685 re_node_set_free (&eclosure_elem);
1688 return REG_NOERROR;
1691 /* Calculate epsilon closure of NODE. */
1693 static reg_errcode_t
1694 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1696 reg_errcode_t err;
1697 Idx i;
1698 re_node_set eclosure;
1699 bool incomplete = false;
1700 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1701 if (__glibc_unlikely (err != REG_NOERROR))
1702 return err;
1704 /* An epsilon closure includes itself. */
1705 eclosure.elems[eclosure.nelem++] = node;
1707 /* This indicates that we are calculating this node now.
1708 We reference this value to avoid infinite loop. */
1709 dfa->eclosures[node].nelem = -1;
1711 /* If the current node has constraints, duplicate all nodes
1712 since they must inherit the constraints. */
1713 if (dfa->nodes[node].constraint
1714 && dfa->edests[node].nelem
1715 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1717 err = duplicate_node_closure (dfa, node, node, node,
1718 dfa->nodes[node].constraint);
1719 if (__glibc_unlikely (err != REG_NOERROR))
1720 return err;
1723 /* Expand each epsilon destination nodes. */
1724 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1725 for (i = 0; i < dfa->edests[node].nelem; ++i)
1727 re_node_set eclosure_elem;
1728 Idx edest = dfa->edests[node].elems[i];
1729 /* If calculating the epsilon closure of 'edest' is in progress,
1730 return intermediate result. */
1731 if (dfa->eclosures[edest].nelem == -1)
1733 incomplete = true;
1734 continue;
1736 /* If we haven't calculated the epsilon closure of 'edest' yet,
1737 calculate now. Otherwise use calculated epsilon closure. */
1738 if (dfa->eclosures[edest].nelem == 0)
1740 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1741 if (__glibc_unlikely (err != REG_NOERROR))
1742 return err;
1744 else
1745 eclosure_elem = dfa->eclosures[edest];
1746 /* Merge the epsilon closure of 'edest'. */
1747 err = re_node_set_merge (&eclosure, &eclosure_elem);
1748 if (__glibc_unlikely (err != REG_NOERROR))
1749 return err;
1750 /* If the epsilon closure of 'edest' is incomplete,
1751 the epsilon closure of this node is also incomplete. */
1752 if (dfa->eclosures[edest].nelem == 0)
1754 incomplete = true;
1755 re_node_set_free (&eclosure_elem);
1759 if (incomplete && !root)
1760 dfa->eclosures[node].nelem = 0;
1761 else
1762 dfa->eclosures[node] = eclosure;
1763 *new_set = eclosure;
1764 return REG_NOERROR;
1767 /* Functions for token which are used in the parser. */
1769 /* Fetch a token from INPUT.
1770 We must not use this function inside bracket expressions. */
1772 static void
1773 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1775 re_string_skip_bytes (input, peek_token (result, input, syntax));
1778 /* Peek a token from INPUT, and return the length of the token.
1779 We must not use this function inside bracket expressions. */
1781 static int
1782 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1784 unsigned char c;
1786 if (re_string_eoi (input))
1788 token->type = END_OF_RE;
1789 return 0;
1792 c = re_string_peek_byte (input, 0);
1793 token->opr.c = c;
1795 token->word_char = 0;
1796 #ifdef RE_ENABLE_I18N
1797 token->mb_partial = 0;
1798 if (input->mb_cur_max > 1
1799 && !re_string_first_byte (input, re_string_cur_idx (input)))
1801 token->type = CHARACTER;
1802 token->mb_partial = 1;
1803 return 1;
1805 #endif
1806 if (c == '\\')
1808 unsigned char c2;
1809 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1811 token->type = BACK_SLASH;
1812 return 1;
1815 c2 = re_string_peek_byte_case (input, 1);
1816 token->opr.c = c2;
1817 token->type = CHARACTER;
1818 #ifdef RE_ENABLE_I18N
1819 if (input->mb_cur_max > 1)
1821 wint_t wc = re_string_wchar_at (input,
1822 re_string_cur_idx (input) + 1);
1823 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1825 else
1826 #endif
1827 token->word_char = IS_WORD_CHAR (c2) != 0;
1829 switch (c2)
1831 case '|':
1832 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1833 token->type = OP_ALT;
1834 break;
1835 case '1': case '2': case '3': case '4': case '5':
1836 case '6': case '7': case '8': case '9':
1837 if (!(syntax & RE_NO_BK_REFS))
1839 token->type = OP_BACK_REF;
1840 token->opr.idx = c2 - '1';
1842 break;
1843 case '<':
1844 if (!(syntax & RE_NO_GNU_OPS))
1846 token->type = ANCHOR;
1847 token->opr.ctx_type = WORD_FIRST;
1849 break;
1850 case '>':
1851 if (!(syntax & RE_NO_GNU_OPS))
1853 token->type = ANCHOR;
1854 token->opr.ctx_type = WORD_LAST;
1856 break;
1857 case 'b':
1858 if (!(syntax & RE_NO_GNU_OPS))
1860 token->type = ANCHOR;
1861 token->opr.ctx_type = WORD_DELIM;
1863 break;
1864 case 'B':
1865 if (!(syntax & RE_NO_GNU_OPS))
1867 token->type = ANCHOR;
1868 token->opr.ctx_type = NOT_WORD_DELIM;
1870 break;
1871 case 'w':
1872 if (!(syntax & RE_NO_GNU_OPS))
1873 token->type = OP_WORD;
1874 break;
1875 case 'W':
1876 if (!(syntax & RE_NO_GNU_OPS))
1877 token->type = OP_NOTWORD;
1878 break;
1879 case 's':
1880 if (!(syntax & RE_NO_GNU_OPS))
1881 token->type = OP_SPACE;
1882 break;
1883 case 'S':
1884 if (!(syntax & RE_NO_GNU_OPS))
1885 token->type = OP_NOTSPACE;
1886 break;
1887 case '`':
1888 if (!(syntax & RE_NO_GNU_OPS))
1890 token->type = ANCHOR;
1891 token->opr.ctx_type = BUF_FIRST;
1893 break;
1894 case '\'':
1895 if (!(syntax & RE_NO_GNU_OPS))
1897 token->type = ANCHOR;
1898 token->opr.ctx_type = BUF_LAST;
1900 break;
1901 case '(':
1902 if (!(syntax & RE_NO_BK_PARENS))
1903 token->type = OP_OPEN_SUBEXP;
1904 break;
1905 case ')':
1906 if (!(syntax & RE_NO_BK_PARENS))
1907 token->type = OP_CLOSE_SUBEXP;
1908 break;
1909 case '+':
1910 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1911 token->type = OP_DUP_PLUS;
1912 break;
1913 case '?':
1914 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1915 token->type = OP_DUP_QUESTION;
1916 break;
1917 case '{':
1918 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1919 token->type = OP_OPEN_DUP_NUM;
1920 break;
1921 case '}':
1922 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1923 token->type = OP_CLOSE_DUP_NUM;
1924 break;
1925 default:
1926 break;
1928 return 2;
1931 token->type = CHARACTER;
1932 #ifdef RE_ENABLE_I18N
1933 if (input->mb_cur_max > 1)
1935 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1936 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1938 else
1939 #endif
1940 token->word_char = IS_WORD_CHAR (token->opr.c);
1942 switch (c)
1944 case '\n':
1945 if (syntax & RE_NEWLINE_ALT)
1946 token->type = OP_ALT;
1947 break;
1948 case '|':
1949 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1950 token->type = OP_ALT;
1951 break;
1952 case '*':
1953 token->type = OP_DUP_ASTERISK;
1954 break;
1955 case '+':
1956 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1957 token->type = OP_DUP_PLUS;
1958 break;
1959 case '?':
1960 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1961 token->type = OP_DUP_QUESTION;
1962 break;
1963 case '{':
1964 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1965 token->type = OP_OPEN_DUP_NUM;
1966 break;
1967 case '}':
1968 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1969 token->type = OP_CLOSE_DUP_NUM;
1970 break;
1971 case '(':
1972 if (syntax & RE_NO_BK_PARENS)
1973 token->type = OP_OPEN_SUBEXP;
1974 break;
1975 case ')':
1976 if (syntax & RE_NO_BK_PARENS)
1977 token->type = OP_CLOSE_SUBEXP;
1978 break;
1979 case '[':
1980 token->type = OP_OPEN_BRACKET;
1981 break;
1982 case '.':
1983 token->type = OP_PERIOD;
1984 break;
1985 case '^':
1986 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1987 && re_string_cur_idx (input) != 0)
1989 char prev = re_string_peek_byte (input, -1);
1990 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1991 break;
1993 token->type = ANCHOR;
1994 token->opr.ctx_type = LINE_FIRST;
1995 break;
1996 case '$':
1997 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
1998 && re_string_cur_idx (input) + 1 != re_string_length (input))
2000 re_token_t next;
2001 re_string_skip_bytes (input, 1);
2002 peek_token (&next, input, syntax);
2003 re_string_skip_bytes (input, -1);
2004 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2005 break;
2007 token->type = ANCHOR;
2008 token->opr.ctx_type = LINE_LAST;
2009 break;
2010 default:
2011 break;
2013 return 1;
2016 /* Peek a token from INPUT, and return the length of the token.
2017 We must not use this function out of bracket expressions. */
2019 static int
2020 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2022 unsigned char c;
2023 if (re_string_eoi (input))
2025 token->type = END_OF_RE;
2026 return 0;
2028 c = re_string_peek_byte (input, 0);
2029 token->opr.c = c;
2031 #ifdef RE_ENABLE_I18N
2032 if (input->mb_cur_max > 1
2033 && !re_string_first_byte (input, re_string_cur_idx (input)))
2035 token->type = CHARACTER;
2036 return 1;
2038 #endif /* RE_ENABLE_I18N */
2040 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2041 && re_string_cur_idx (input) + 1 < re_string_length (input))
2043 /* In this case, '\' escape a character. */
2044 unsigned char c2;
2045 re_string_skip_bytes (input, 1);
2046 c2 = re_string_peek_byte (input, 0);
2047 token->opr.c = c2;
2048 token->type = CHARACTER;
2049 return 1;
2051 if (c == '[') /* '[' is a special char in a bracket exps. */
2053 unsigned char c2;
2054 int token_len;
2055 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2056 c2 = re_string_peek_byte (input, 1);
2057 else
2058 c2 = 0;
2059 token->opr.c = c2;
2060 token_len = 2;
2061 switch (c2)
2063 case '.':
2064 token->type = OP_OPEN_COLL_ELEM;
2065 break;
2067 case '=':
2068 token->type = OP_OPEN_EQUIV_CLASS;
2069 break;
2071 case ':':
2072 if (syntax & RE_CHAR_CLASSES)
2074 token->type = OP_OPEN_CHAR_CLASS;
2075 break;
2077 FALLTHROUGH;
2078 default:
2079 token->type = CHARACTER;
2080 token->opr.c = c;
2081 token_len = 1;
2082 break;
2084 return token_len;
2086 switch (c)
2088 case '-':
2089 token->type = OP_CHARSET_RANGE;
2090 break;
2091 case ']':
2092 token->type = OP_CLOSE_BRACKET;
2093 break;
2094 case '^':
2095 token->type = OP_NON_MATCH_LIST;
2096 break;
2097 default:
2098 token->type = CHARACTER;
2100 return 1;
2103 /* Functions for parser. */
2105 /* Entry point of the parser.
2106 Parse the regular expression REGEXP and return the structure tree.
2107 If an error occurs, ERR is set by error code, and return NULL.
2108 This function build the following tree, from regular expression <reg_exp>:
2112 <reg_exp> EOR
2114 CAT means concatenation.
2115 EOR means end of regular expression. */
2117 static bin_tree_t *
2118 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2119 reg_errcode_t *err)
2121 re_dfa_t *dfa = preg->buffer;
2122 bin_tree_t *tree, *eor, *root;
2123 re_token_t current_token;
2124 dfa->syntax = syntax;
2125 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2126 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2127 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2128 return NULL;
2129 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2130 if (tree != NULL)
2131 root = create_tree (dfa, tree, eor, CONCAT);
2132 else
2133 root = eor;
2134 if (__glibc_unlikely (eor == NULL || root == NULL))
2136 *err = REG_ESPACE;
2137 return NULL;
2139 return root;
2142 /* This function build the following tree, from regular expression
2143 <branch1>|<branch2>:
2147 <branch1> <branch2>
2149 ALT means alternative, which represents the operator '|'. */
2151 static bin_tree_t *
2152 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2153 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2155 re_dfa_t *dfa = preg->buffer;
2156 bin_tree_t *tree, *branch = NULL;
2157 bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2158 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2159 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2160 return NULL;
2162 while (token->type == OP_ALT)
2164 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2165 if (token->type != OP_ALT && token->type != END_OF_RE
2166 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2168 bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2169 dfa->completed_bkref_map = initial_bkref_map;
2170 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2171 if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2173 if (tree != NULL)
2174 postorder (tree, free_tree, NULL);
2175 return NULL;
2177 dfa->completed_bkref_map |= accumulated_bkref_map;
2179 else
2180 branch = NULL;
2181 tree = create_tree (dfa, tree, branch, OP_ALT);
2182 if (__glibc_unlikely (tree == NULL))
2184 *err = REG_ESPACE;
2185 return NULL;
2188 return tree;
2191 /* This function build the following tree, from regular expression
2192 <exp1><exp2>:
2196 <exp1> <exp2>
2198 CAT means concatenation. */
2200 static bin_tree_t *
2201 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2202 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2204 bin_tree_t *tree, *expr;
2205 re_dfa_t *dfa = preg->buffer;
2206 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2207 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2208 return NULL;
2210 while (token->type != OP_ALT && token->type != END_OF_RE
2211 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2213 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2214 if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2216 if (tree != NULL)
2217 postorder (tree, free_tree, NULL);
2218 return NULL;
2220 if (tree != NULL && expr != NULL)
2222 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2223 if (newtree == NULL)
2225 postorder (expr, free_tree, NULL);
2226 postorder (tree, free_tree, NULL);
2227 *err = REG_ESPACE;
2228 return NULL;
2230 tree = newtree;
2232 else if (tree == NULL)
2233 tree = expr;
2234 /* Otherwise expr == NULL, we don't need to create new tree. */
2236 return tree;
2239 /* This function build the following tree, from regular expression a*:
2245 static bin_tree_t *
2246 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2247 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2249 re_dfa_t *dfa = preg->buffer;
2250 bin_tree_t *tree;
2251 switch (token->type)
2253 case CHARACTER:
2254 tree = create_token_tree (dfa, NULL, NULL, token);
2255 if (__glibc_unlikely (tree == NULL))
2257 *err = REG_ESPACE;
2258 return NULL;
2260 #ifdef RE_ENABLE_I18N
2261 if (dfa->mb_cur_max > 1)
2263 while (!re_string_eoi (regexp)
2264 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2266 bin_tree_t *mbc_remain;
2267 fetch_token (token, regexp, syntax);
2268 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2269 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2270 if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
2272 *err = REG_ESPACE;
2273 return NULL;
2277 #endif
2278 break;
2280 case OP_OPEN_SUBEXP:
2281 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2282 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2283 return NULL;
2284 break;
2286 case OP_OPEN_BRACKET:
2287 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2288 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2289 return NULL;
2290 break;
2292 case OP_BACK_REF:
2293 if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2295 *err = REG_ESUBREG;
2296 return NULL;
2298 dfa->used_bkref_map |= 1 << token->opr.idx;
2299 tree = create_token_tree (dfa, NULL, NULL, token);
2300 if (__glibc_unlikely (tree == NULL))
2302 *err = REG_ESPACE;
2303 return NULL;
2305 ++dfa->nbackref;
2306 dfa->has_mb_node = 1;
2307 break;
2309 case OP_OPEN_DUP_NUM:
2310 if (syntax & RE_CONTEXT_INVALID_DUP)
2312 *err = REG_BADRPT;
2313 return NULL;
2315 FALLTHROUGH;
2316 case OP_DUP_ASTERISK:
2317 case OP_DUP_PLUS:
2318 case OP_DUP_QUESTION:
2319 if (syntax & RE_CONTEXT_INVALID_OPS)
2321 *err = REG_BADRPT;
2322 return NULL;
2324 else if (syntax & RE_CONTEXT_INDEP_OPS)
2326 fetch_token (token, regexp, syntax);
2327 return parse_expression (regexp, preg, token, syntax, nest, err);
2329 FALLTHROUGH;
2330 case OP_CLOSE_SUBEXP:
2331 if ((token->type == OP_CLOSE_SUBEXP)
2332 && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2334 *err = REG_ERPAREN;
2335 return NULL;
2337 FALLTHROUGH;
2338 case OP_CLOSE_DUP_NUM:
2339 /* We treat it as a normal character. */
2341 /* Then we can these characters as normal characters. */
2342 token->type = CHARACTER;
2343 /* mb_partial and word_char bits should be initialized already
2344 by peek_token. */
2345 tree = create_token_tree (dfa, NULL, NULL, token);
2346 if (__glibc_unlikely (tree == NULL))
2348 *err = REG_ESPACE;
2349 return NULL;
2351 break;
2353 case ANCHOR:
2354 if ((token->opr.ctx_type
2355 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2356 && dfa->word_ops_used == 0)
2357 init_word_char (dfa);
2358 if (token->opr.ctx_type == WORD_DELIM
2359 || token->opr.ctx_type == NOT_WORD_DELIM)
2361 bin_tree_t *tree_first, *tree_last;
2362 if (token->opr.ctx_type == WORD_DELIM)
2364 token->opr.ctx_type = WORD_FIRST;
2365 tree_first = create_token_tree (dfa, NULL, NULL, token);
2366 token->opr.ctx_type = WORD_LAST;
2368 else
2370 token->opr.ctx_type = INSIDE_WORD;
2371 tree_first = create_token_tree (dfa, NULL, NULL, token);
2372 token->opr.ctx_type = INSIDE_NOTWORD;
2374 tree_last = create_token_tree (dfa, NULL, NULL, token);
2375 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2376 if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2377 || tree == NULL))
2379 *err = REG_ESPACE;
2380 return NULL;
2383 else
2385 tree = create_token_tree (dfa, NULL, NULL, token);
2386 if (__glibc_unlikely (tree == NULL))
2388 *err = REG_ESPACE;
2389 return NULL;
2392 /* We must return here, since ANCHORs can't be followed
2393 by repetition operators.
2394 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2395 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2396 fetch_token (token, regexp, syntax);
2397 return tree;
2399 case OP_PERIOD:
2400 tree = create_token_tree (dfa, NULL, NULL, token);
2401 if (__glibc_unlikely (tree == NULL))
2403 *err = REG_ESPACE;
2404 return NULL;
2406 if (dfa->mb_cur_max > 1)
2407 dfa->has_mb_node = 1;
2408 break;
2410 case OP_WORD:
2411 case OP_NOTWORD:
2412 tree = build_charclass_op (dfa, regexp->trans,
2413 "alnum",
2414 "_",
2415 token->type == OP_NOTWORD, err);
2416 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2417 return NULL;
2418 break;
2420 case OP_SPACE:
2421 case OP_NOTSPACE:
2422 tree = build_charclass_op (dfa, regexp->trans,
2423 "space",
2425 token->type == OP_NOTSPACE, err);
2426 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2427 return NULL;
2428 break;
2430 case OP_ALT:
2431 case END_OF_RE:
2432 return NULL;
2434 case BACK_SLASH:
2435 *err = REG_EESCAPE;
2436 return NULL;
2438 default:
2439 /* Must not happen? */
2440 DEBUG_ASSERT (false);
2441 return NULL;
2443 fetch_token (token, regexp, syntax);
2445 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2446 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2448 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2449 syntax, err);
2450 if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2452 if (tree != NULL)
2453 postorder (tree, free_tree, NULL);
2454 return NULL;
2456 tree = dup_tree;
2457 /* In BRE consecutive duplications are not allowed. */
2458 if ((syntax & RE_CONTEXT_INVALID_DUP)
2459 && (token->type == OP_DUP_ASTERISK
2460 || token->type == OP_OPEN_DUP_NUM))
2462 if (tree != NULL)
2463 postorder (tree, free_tree, NULL);
2464 *err = REG_BADRPT;
2465 return NULL;
2469 return tree;
2472 /* This function build the following tree, from regular expression
2473 (<reg_exp>):
2474 SUBEXP
2476 <reg_exp>
2479 static bin_tree_t *
2480 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2481 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2483 re_dfa_t *dfa = preg->buffer;
2484 bin_tree_t *tree;
2485 size_t cur_nsub;
2486 cur_nsub = preg->re_nsub++;
2488 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2490 /* The subexpression may be a null string. */
2491 if (token->type == OP_CLOSE_SUBEXP)
2492 tree = NULL;
2493 else
2495 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2496 if (__glibc_unlikely (*err == REG_NOERROR
2497 && token->type != OP_CLOSE_SUBEXP))
2499 if (tree != NULL)
2500 postorder (tree, free_tree, NULL);
2501 *err = REG_EPAREN;
2503 if (__glibc_unlikely (*err != REG_NOERROR))
2504 return NULL;
2507 if (cur_nsub <= '9' - '1')
2508 dfa->completed_bkref_map |= 1 << cur_nsub;
2510 tree = create_tree (dfa, tree, NULL, SUBEXP);
2511 if (__glibc_unlikely (tree == NULL))
2513 *err = REG_ESPACE;
2514 return NULL;
2516 tree->token.opr.idx = cur_nsub;
2517 return tree;
2520 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2522 static bin_tree_t *
2523 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2524 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2526 bin_tree_t *tree = NULL, *old_tree = NULL;
2527 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2528 re_token_t start_token = *token;
2530 if (token->type == OP_OPEN_DUP_NUM)
2532 end = 0;
2533 start = fetch_number (regexp, token, syntax);
2534 if (start == -1)
2536 if (token->type == CHARACTER && token->opr.c == ',')
2537 start = 0; /* We treat "{,m}" as "{0,m}". */
2538 else
2540 *err = REG_BADBR; /* <re>{} is invalid. */
2541 return NULL;
2544 if (__glibc_likely (start != -2))
2546 /* We treat "{n}" as "{n,n}". */
2547 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2548 : ((token->type == CHARACTER && token->opr.c == ',')
2549 ? fetch_number (regexp, token, syntax) : -2));
2551 if (__glibc_unlikely (start == -2 || end == -2))
2553 /* Invalid sequence. */
2554 if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2556 if (token->type == END_OF_RE)
2557 *err = REG_EBRACE;
2558 else
2559 *err = REG_BADBR;
2561 return NULL;
2564 /* If the syntax bit is set, rollback. */
2565 re_string_set_index (regexp, start_idx);
2566 *token = start_token;
2567 token->type = CHARACTER;
2568 /* mb_partial and word_char bits should be already initialized by
2569 peek_token. */
2570 return elem;
2573 if (__glibc_unlikely ((end != -1 && start > end)
2574 || token->type != OP_CLOSE_DUP_NUM))
2576 /* First number greater than second. */
2577 *err = REG_BADBR;
2578 return NULL;
2581 if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2583 *err = REG_ESIZE;
2584 return NULL;
2587 else
2589 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2590 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2593 fetch_token (token, regexp, syntax);
2595 if (__glibc_unlikely (elem == NULL))
2596 return NULL;
2597 if (__glibc_unlikely (start == 0 && end == 0))
2599 postorder (elem, free_tree, NULL);
2600 return NULL;
2603 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2604 if (__glibc_unlikely (start > 0))
2606 tree = elem;
2607 for (i = 2; i <= start; ++i)
2609 elem = duplicate_tree (elem, dfa);
2610 tree = create_tree (dfa, tree, elem, CONCAT);
2611 if (__glibc_unlikely (elem == NULL || tree == NULL))
2612 goto parse_dup_op_espace;
2615 if (start == end)
2616 return tree;
2618 /* Duplicate ELEM before it is marked optional. */
2619 elem = duplicate_tree (elem, dfa);
2620 if (__glibc_unlikely (elem == NULL))
2621 goto parse_dup_op_espace;
2622 old_tree = tree;
2624 else
2625 old_tree = NULL;
2627 if (elem->token.type == SUBEXP)
2629 uintptr_t subidx = elem->token.opr.idx;
2630 postorder (elem, mark_opt_subexp, (void *) subidx);
2633 tree = create_tree (dfa, elem, NULL,
2634 (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2635 if (__glibc_unlikely (tree == NULL))
2636 goto parse_dup_op_espace;
2638 /* This loop is actually executed only when end != -1,
2639 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2640 already created the start+1-th copy. */
2641 if (TYPE_SIGNED (Idx) || end != -1)
2642 for (i = start + 2; i <= end; ++i)
2644 elem = duplicate_tree (elem, dfa);
2645 tree = create_tree (dfa, tree, elem, CONCAT);
2646 if (__glibc_unlikely (elem == NULL || tree == NULL))
2647 goto parse_dup_op_espace;
2649 tree = create_tree (dfa, tree, NULL, OP_ALT);
2650 if (__glibc_unlikely (tree == NULL))
2651 goto parse_dup_op_espace;
2654 if (old_tree)
2655 tree = create_tree (dfa, old_tree, tree, CONCAT);
2657 return tree;
2659 parse_dup_op_espace:
2660 *err = REG_ESPACE;
2661 return NULL;
2664 /* Size of the names for collating symbol/equivalence_class/character_class.
2665 I'm not sure, but maybe enough. */
2666 #define BRACKET_NAME_BUF_SIZE 32
2668 #ifndef _LIBC
2670 # ifdef RE_ENABLE_I18N
2671 /* Convert the byte B to the corresponding wide character. In a
2672 unibyte locale, treat B as itself. In a multibyte locale, return
2673 WEOF if B is an encoding error. */
2674 static wint_t
2675 parse_byte (unsigned char b, re_charset_t *mbcset)
2677 return mbcset == NULL ? b : __btowc (b);
2679 # endif
2681 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2682 Build the range expression which starts from START_ELEM, and ends
2683 at END_ELEM. The result are written to MBCSET and SBCSET.
2684 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2685 mbcset->range_ends, is a pointer argument since we may
2686 update it. */
2688 static reg_errcode_t
2689 # ifdef RE_ENABLE_I18N
2690 build_range_exp (const reg_syntax_t syntax,
2691 bitset_t sbcset,
2692 re_charset_t *mbcset,
2693 Idx *range_alloc,
2694 const bracket_elem_t *start_elem,
2695 const bracket_elem_t *end_elem)
2696 # else /* not RE_ENABLE_I18N */
2697 build_range_exp (const reg_syntax_t syntax,
2698 bitset_t sbcset,
2699 const bracket_elem_t *start_elem,
2700 const bracket_elem_t *end_elem)
2701 # endif /* not RE_ENABLE_I18N */
2703 unsigned int start_ch, end_ch;
2704 /* Equivalence Classes and Character Classes can't be a range start/end. */
2705 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2706 || start_elem->type == CHAR_CLASS
2707 || end_elem->type == EQUIV_CLASS
2708 || end_elem->type == CHAR_CLASS))
2709 return REG_ERANGE;
2711 /* We can handle no multi character collating elements without libc
2712 support. */
2713 if (__glibc_unlikely ((start_elem->type == COLL_SYM
2714 && strlen ((char *) start_elem->opr.name) > 1)
2715 || (end_elem->type == COLL_SYM
2716 && strlen ((char *) end_elem->opr.name) > 1)))
2717 return REG_ECOLLATE;
2719 # ifdef RE_ENABLE_I18N
2721 wchar_t wc;
2722 wint_t start_wc;
2723 wint_t end_wc;
2725 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2726 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2727 : 0));
2728 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2729 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2730 : 0));
2731 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2732 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2733 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2734 ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2735 if (start_wc == WEOF || end_wc == WEOF)
2736 return REG_ECOLLATE;
2737 else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2738 && start_wc > end_wc))
2739 return REG_ERANGE;
2741 /* Got valid collation sequence values, add them as a new entry.
2742 However, for !_LIBC we have no collation elements: if the
2743 character set is single byte, the single byte character set
2744 that we build below suffices. parse_bracket_exp passes
2745 no MBCSET if dfa->mb_cur_max == 1. */
2746 if (mbcset)
2748 /* Check the space of the arrays. */
2749 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2751 /* There is not enough space, need realloc. */
2752 wchar_t *new_array_start, *new_array_end;
2753 Idx new_nranges;
2755 /* +1 in case of mbcset->nranges is 0. */
2756 new_nranges = 2 * mbcset->nranges + 1;
2757 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2758 are NULL if *range_alloc == 0. */
2759 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2760 new_nranges);
2761 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2762 new_nranges);
2764 if (__glibc_unlikely (new_array_start == NULL
2765 || new_array_end == NULL))
2767 re_free (new_array_start);
2768 re_free (new_array_end);
2769 return REG_ESPACE;
2772 mbcset->range_starts = new_array_start;
2773 mbcset->range_ends = new_array_end;
2774 *range_alloc = new_nranges;
2777 mbcset->range_starts[mbcset->nranges] = start_wc;
2778 mbcset->range_ends[mbcset->nranges++] = end_wc;
2781 /* Build the table for single byte characters. */
2782 for (wc = 0; wc < SBC_MAX; ++wc)
2784 if (start_wc <= wc && wc <= end_wc)
2785 bitset_set (sbcset, wc);
2788 # else /* not RE_ENABLE_I18N */
2790 unsigned int ch;
2791 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2792 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2793 : 0));
2794 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2795 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2796 : 0));
2797 if (start_ch > end_ch)
2798 return REG_ERANGE;
2799 /* Build the table for single byte characters. */
2800 for (ch = 0; ch < SBC_MAX; ++ch)
2801 if (start_ch <= ch && ch <= end_ch)
2802 bitset_set (sbcset, ch);
2804 # endif /* not RE_ENABLE_I18N */
2805 return REG_NOERROR;
2807 #endif /* not _LIBC */
2809 #ifndef _LIBC
2810 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2811 Build the collating element which is represented by NAME.
2812 The result are written to MBCSET and SBCSET.
2813 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2814 pointer argument since we may update it. */
2816 static reg_errcode_t
2817 # ifdef RE_ENABLE_I18N
2818 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2819 Idx *coll_sym_alloc, const unsigned char *name)
2820 # else /* not RE_ENABLE_I18N */
2821 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2822 # endif /* not RE_ENABLE_I18N */
2824 size_t name_len = strlen ((const char *) name);
2825 if (__glibc_unlikely (name_len != 1))
2826 return REG_ECOLLATE;
2827 else
2829 bitset_set (sbcset, name[0]);
2830 return REG_NOERROR;
2833 #endif /* not _LIBC */
2835 #ifdef _LIBC
2836 /* Local function for parse_bracket_exp used in _LIBC environment.
2837 Seek the collating symbol entry corresponding to NAME.
2838 Return the index of the symbol in the SYMB_TABLE,
2839 or -1 if not found. */
2841 static inline int32_t
2842 __attribute__ ((always_inline))
2843 seek_collating_symbol_entry (const unsigned char *name, size_t name_len,
2844 const int32_t *symb_table, int32_t table_size,
2845 const unsigned char *extra)
2847 int32_t elem;
2849 for (elem = 0; elem < table_size; elem++)
2850 if (symb_table[2 * elem] != 0)
2852 int32_t idx = symb_table[2 * elem + 1];
2853 /* Skip the name of collating element name. */
2854 idx += 1 + extra[idx];
2855 if (/* Compare the length of the name. */
2856 name_len == extra[idx]
2857 /* Compare the name. */
2858 && memcmp (name, &extra[idx + 1], name_len) == 0)
2859 /* Yep, this is the entry. */
2860 return elem;
2862 return -1;
2865 /* Local function for parse_bracket_exp used in _LIBC environment.
2866 Look up the collation sequence value of BR_ELEM.
2867 Return the value if succeeded, UINT_MAX otherwise. */
2869 static inline unsigned int
2870 __attribute__ ((always_inline))
2871 lookup_collation_sequence_value (bracket_elem_t *br_elem, uint32_t nrules,
2872 const unsigned char *collseqmb,
2873 const char *collseqwc, int32_t table_size,
2874 const int32_t *symb_table,
2875 const unsigned char *extra)
2877 if (br_elem->type == SB_CHAR)
2879 /* if (MB_CUR_MAX == 1) */
2880 if (nrules == 0)
2881 return collseqmb[br_elem->opr.ch];
2882 else
2884 wint_t wc = __btowc (br_elem->opr.ch);
2885 return __collseq_table_lookup (collseqwc, wc);
2888 else if (br_elem->type == MB_CHAR)
2890 if (nrules != 0)
2891 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2893 else if (br_elem->type == COLL_SYM)
2895 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2896 if (nrules != 0)
2898 int32_t elem, idx;
2899 elem = seek_collating_symbol_entry (br_elem->opr.name,
2900 sym_name_len,
2901 symb_table, table_size,
2902 extra);
2903 if (elem != -1)
2905 /* We found the entry. */
2906 idx = symb_table[2 * elem + 1];
2907 /* Skip the name of collating element name. */
2908 idx += 1 + extra[idx];
2909 /* Skip the byte sequence of the collating element. */
2910 idx += 1 + extra[idx];
2911 /* Adjust for the alignment. */
2912 idx = (idx + 3) & ~3;
2913 /* Skip the multibyte collation sequence value. */
2914 idx += sizeof (unsigned int);
2915 /* Skip the wide char sequence of the collating element. */
2916 idx += sizeof (unsigned int) *
2917 (1 + *(unsigned int *) (extra + idx));
2918 /* Return the collation sequence value. */
2919 return *(unsigned int *) (extra + idx);
2921 else if (sym_name_len == 1)
2923 /* No valid character. Match it as a single byte
2924 character. */
2925 return collseqmb[br_elem->opr.name[0]];
2928 else if (sym_name_len == 1)
2929 return collseqmb[br_elem->opr.name[0]];
2931 return UINT_MAX;
2934 /* Local function for parse_bracket_exp used in _LIBC environment.
2935 Build the range expression which starts from START_ELEM, and ends
2936 at END_ELEM. The result are written to MBCSET and SBCSET.
2937 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2938 mbcset->range_ends, is a pointer argument since we may
2939 update it. */
2941 static inline reg_errcode_t
2942 __attribute__ ((always_inline))
2943 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2944 bracket_elem_t *start_elem, bracket_elem_t *end_elem,
2945 re_dfa_t *dfa, reg_syntax_t syntax, uint32_t nrules,
2946 const unsigned char *collseqmb, const char *collseqwc,
2947 int32_t table_size, const int32_t *symb_table,
2948 const unsigned char *extra)
2950 unsigned int ch;
2951 uint32_t start_collseq;
2952 uint32_t end_collseq;
2954 /* Equivalence Classes and Character Classes can't be a range
2955 start/end. */
2956 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2957 || start_elem->type == CHAR_CLASS
2958 || end_elem->type == EQUIV_CLASS
2959 || end_elem->type == CHAR_CLASS))
2960 return REG_ERANGE;
2962 /* FIXME: Implement rational ranges here, too. */
2963 start_collseq = lookup_collation_sequence_value (start_elem, nrules, collseqmb, collseqwc,
2964 table_size, symb_table, extra);
2965 end_collseq = lookup_collation_sequence_value (end_elem, nrules, collseqmb, collseqwc,
2966 table_size, symb_table, extra);
2967 /* Check start/end collation sequence values. */
2968 if (__glibc_unlikely (start_collseq == UINT_MAX
2969 || end_collseq == UINT_MAX))
2970 return REG_ECOLLATE;
2971 if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2972 && start_collseq > end_collseq))
2973 return REG_ERANGE;
2975 /* Got valid collation sequence values, add them as a new entry.
2976 However, if we have no collation elements, and the character set
2977 is single byte, the single byte character set that we
2978 build below suffices. */
2979 if (nrules > 0 || dfa->mb_cur_max > 1)
2981 /* Check the space of the arrays. */
2982 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2984 /* There is not enough space, need realloc. */
2985 uint32_t *new_array_start;
2986 uint32_t *new_array_end;
2987 Idx new_nranges;
2989 /* +1 in case of mbcset->nranges is 0. */
2990 new_nranges = 2 * mbcset->nranges + 1;
2991 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2992 new_nranges);
2993 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2994 new_nranges);
2996 if (__glibc_unlikely (new_array_start == NULL
2997 || new_array_end == NULL))
2998 return REG_ESPACE;
3000 mbcset->range_starts = new_array_start;
3001 mbcset->range_ends = new_array_end;
3002 *range_alloc = new_nranges;
3005 mbcset->range_starts[mbcset->nranges] = start_collseq;
3006 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3009 /* Build the table for single byte characters. */
3010 for (ch = 0; ch < SBC_MAX; ch++)
3012 uint32_t ch_collseq;
3013 /* if (MB_CUR_MAX == 1) */
3014 if (nrules == 0)
3015 ch_collseq = collseqmb[ch];
3016 else
3017 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3018 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3019 bitset_set (sbcset, ch);
3021 return REG_NOERROR;
3024 /* Local function for parse_bracket_exp used in _LIBC environment.
3025 Build the collating element which is represented by NAME.
3026 The result are written to MBCSET and SBCSET.
3027 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3028 pointer argument since we may update it. */
3030 static inline reg_errcode_t
3031 __attribute__ ((always_inline))
3032 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3033 Idx *coll_sym_alloc, const unsigned char *name,
3034 uint32_t nrules, int32_t table_size,
3035 const int32_t *symb_table, const unsigned char *extra)
3037 int32_t elem, idx;
3038 size_t name_len = strlen ((const char *) name);
3039 if (nrules != 0)
3041 elem = seek_collating_symbol_entry (name, name_len, symb_table,
3042 table_size, extra);
3043 if (elem != -1)
3045 /* We found the entry. */
3046 idx = symb_table[2 * elem + 1];
3047 /* Skip the name of collating element name. */
3048 idx += 1 + extra[idx];
3050 else if (name_len == 1)
3052 /* No valid character, treat it as a normal
3053 character. */
3054 bitset_set (sbcset, name[0]);
3055 return REG_NOERROR;
3057 else
3058 return REG_ECOLLATE;
3060 /* Got valid collation sequence, add it as a new entry. */
3061 /* Check the space of the arrays. */
3062 if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
3064 /* Not enough, realloc it. */
3065 /* +1 in case of mbcset->ncoll_syms is 0. */
3066 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3067 /* Use realloc since mbcset->coll_syms is NULL
3068 if *alloc == 0. */
3069 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3070 new_coll_sym_alloc);
3071 if (__glibc_unlikely (new_coll_syms == NULL))
3072 return REG_ESPACE;
3073 mbcset->coll_syms = new_coll_syms;
3074 *coll_sym_alloc = new_coll_sym_alloc;
3076 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3077 return REG_NOERROR;
3079 else
3081 if (__glibc_unlikely (name_len != 1))
3082 return REG_ECOLLATE;
3083 else
3085 bitset_set (sbcset, name[0]);
3086 return REG_NOERROR;
3090 #endif /* _LIBC */
3092 /* This function parse bracket expression like "[abc]", "[a-c]",
3093 "[[.a-a.]]" etc. */
3095 static bin_tree_t *
3096 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
3097 reg_syntax_t syntax, reg_errcode_t *err)
3099 #ifdef _LIBC
3100 const unsigned char *collseqmb;
3101 const char *collseqwc = NULL;
3102 uint32_t nrules;
3103 int32_t table_size = 0;
3104 const int32_t *symb_table = NULL;
3105 const unsigned char *extra = NULL;
3106 #endif
3108 re_token_t br_token;
3109 re_bitset_ptr_t sbcset;
3110 #ifdef RE_ENABLE_I18N
3111 re_charset_t *mbcset;
3112 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3113 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3114 #endif /* not RE_ENABLE_I18N */
3115 bool non_match = false;
3116 bin_tree_t *work_tree;
3117 int token_len;
3118 bool first_round = true;
3119 #ifdef _LIBC
3120 collseqmb = (const unsigned char *)
3121 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3122 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3123 if (nrules)
3126 if (MB_CUR_MAX > 1)
3128 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3129 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3130 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3131 _NL_COLLATE_SYMB_TABLEMB);
3132 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3133 _NL_COLLATE_SYMB_EXTRAMB);
3135 #endif
3136 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3137 #ifdef RE_ENABLE_I18N
3138 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3139 #endif /* RE_ENABLE_I18N */
3140 #ifdef RE_ENABLE_I18N
3141 if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
3142 #else
3143 if (__glibc_unlikely (sbcset == NULL))
3144 #endif /* RE_ENABLE_I18N */
3146 re_free (sbcset);
3147 #ifdef RE_ENABLE_I18N
3148 re_free (mbcset);
3149 #endif
3150 *err = REG_ESPACE;
3151 return NULL;
3154 token_len = peek_token_bracket (token, regexp, syntax);
3155 if (__glibc_unlikely (token->type == END_OF_RE))
3157 *err = REG_BADPAT;
3158 goto parse_bracket_exp_free_return;
3160 if (token->type == OP_NON_MATCH_LIST)
3162 #ifdef RE_ENABLE_I18N
3163 mbcset->non_match = 1;
3164 #endif /* not RE_ENABLE_I18N */
3165 non_match = true;
3166 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3167 bitset_set (sbcset, '\n');
3168 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3169 token_len = peek_token_bracket (token, regexp, syntax);
3170 if (__glibc_unlikely (token->type == END_OF_RE))
3172 *err = REG_BADPAT;
3173 goto parse_bracket_exp_free_return;
3177 /* We treat the first ']' as a normal character. */
3178 if (token->type == OP_CLOSE_BRACKET)
3179 token->type = CHARACTER;
3181 while (1)
3183 bracket_elem_t start_elem, end_elem;
3184 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3185 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3186 reg_errcode_t ret;
3187 int token_len2 = 0;
3188 bool is_range_exp = false;
3189 re_token_t token2;
3191 start_elem.opr.name = start_name_buf;
3192 start_elem.type = COLL_SYM;
3193 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3194 syntax, first_round);
3195 if (__glibc_unlikely (ret != REG_NOERROR))
3197 *err = ret;
3198 goto parse_bracket_exp_free_return;
3200 first_round = false;
3202 /* Get information about the next token. We need it in any case. */
3203 token_len = peek_token_bracket (token, regexp, syntax);
3205 /* Do not check for ranges if we know they are not allowed. */
3206 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3208 if (__glibc_unlikely (token->type == END_OF_RE))
3210 *err = REG_EBRACK;
3211 goto parse_bracket_exp_free_return;
3213 if (token->type == OP_CHARSET_RANGE)
3215 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3216 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3217 if (__glibc_unlikely (token2.type == END_OF_RE))
3219 *err = REG_EBRACK;
3220 goto parse_bracket_exp_free_return;
3222 if (token2.type == OP_CLOSE_BRACKET)
3224 /* We treat the last '-' as a normal character. */
3225 re_string_skip_bytes (regexp, -token_len);
3226 token->type = CHARACTER;
3228 else
3229 is_range_exp = true;
3233 if (is_range_exp == true)
3235 end_elem.opr.name = end_name_buf;
3236 end_elem.type = COLL_SYM;
3237 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3238 dfa, syntax, true);
3239 if (__glibc_unlikely (ret != REG_NOERROR))
3241 *err = ret;
3242 goto parse_bracket_exp_free_return;
3245 token_len = peek_token_bracket (token, regexp, syntax);
3247 #ifdef _LIBC
3248 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3249 &start_elem, &end_elem,
3250 dfa, syntax, nrules, collseqmb, collseqwc,
3251 table_size, symb_table, extra);
3252 #else
3253 # ifdef RE_ENABLE_I18N
3254 *err = build_range_exp (syntax, sbcset,
3255 dfa->mb_cur_max > 1 ? mbcset : NULL,
3256 &range_alloc, &start_elem, &end_elem);
3257 # else
3258 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3259 # endif
3260 #endif /* RE_ENABLE_I18N */
3261 if (__glibc_unlikely (*err != REG_NOERROR))
3262 goto parse_bracket_exp_free_return;
3264 else
3266 switch (start_elem.type)
3268 case SB_CHAR:
3269 bitset_set (sbcset, start_elem.opr.ch);
3270 break;
3271 #ifdef RE_ENABLE_I18N
3272 case MB_CHAR:
3273 /* Check whether the array has enough space. */
3274 if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3276 wchar_t *new_mbchars;
3277 /* Not enough, realloc it. */
3278 /* +1 in case of mbcset->nmbchars is 0. */
3279 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3280 /* Use realloc since array is NULL if *alloc == 0. */
3281 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3282 mbchar_alloc);
3283 if (__glibc_unlikely (new_mbchars == NULL))
3284 goto parse_bracket_exp_espace;
3285 mbcset->mbchars = new_mbchars;
3287 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3288 break;
3289 #endif /* RE_ENABLE_I18N */
3290 case EQUIV_CLASS:
3291 *err = build_equiv_class (sbcset,
3292 #ifdef RE_ENABLE_I18N
3293 mbcset, &equiv_class_alloc,
3294 #endif /* RE_ENABLE_I18N */
3295 start_elem.opr.name);
3296 if (__glibc_unlikely (*err != REG_NOERROR))
3297 goto parse_bracket_exp_free_return;
3298 break;
3299 case COLL_SYM:
3300 *err = build_collating_symbol (sbcset,
3301 #ifdef RE_ENABLE_I18N
3302 mbcset, &coll_sym_alloc,
3303 #endif /* RE_ENABLE_I18N */
3304 start_elem.opr.name,
3305 nrules, table_size, symb_table, extra);
3306 if (__glibc_unlikely (*err != REG_NOERROR))
3307 goto parse_bracket_exp_free_return;
3308 break;
3309 case CHAR_CLASS:
3310 *err = build_charclass (regexp->trans, sbcset,
3311 #ifdef RE_ENABLE_I18N
3312 mbcset, &char_class_alloc,
3313 #endif /* RE_ENABLE_I18N */
3314 (const char *) start_elem.opr.name,
3315 syntax);
3316 if (__glibc_unlikely (*err != REG_NOERROR))
3317 goto parse_bracket_exp_free_return;
3318 break;
3319 default:
3320 DEBUG_ASSERT (false);
3321 break;
3324 if (__glibc_unlikely (token->type == END_OF_RE))
3326 *err = REG_EBRACK;
3327 goto parse_bracket_exp_free_return;
3329 if (token->type == OP_CLOSE_BRACKET)
3330 break;
3333 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3335 /* If it is non-matching list. */
3336 if (non_match)
3337 bitset_not (sbcset);
3339 #ifdef RE_ENABLE_I18N
3340 /* Ensure only single byte characters are set. */
3341 if (dfa->mb_cur_max > 1)
3342 bitset_mask (sbcset, dfa->sb_char);
3344 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3345 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3346 || mbcset->non_match)))
3348 bin_tree_t *mbc_tree;
3349 int sbc_idx;
3350 /* Build a tree for complex bracket. */
3351 dfa->has_mb_node = 1;
3352 br_token.type = COMPLEX_BRACKET;
3353 br_token.opr.mbcset = mbcset;
3354 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3355 if (__glibc_unlikely (mbc_tree == NULL))
3356 goto parse_bracket_exp_espace;
3357 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3358 if (sbcset[sbc_idx])
3359 break;
3360 /* If there are no bits set in sbcset, there is no point
3361 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3362 if (sbc_idx < BITSET_WORDS)
3364 /* Build a tree for simple bracket. */
3365 br_token.type = SIMPLE_BRACKET;
3366 br_token.opr.sbcset = sbcset;
3367 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3368 if (__glibc_unlikely (work_tree == NULL))
3369 goto parse_bracket_exp_espace;
3371 /* Then join them by ALT node. */
3372 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3373 if (__glibc_unlikely (work_tree == NULL))
3374 goto parse_bracket_exp_espace;
3376 else
3378 re_free (sbcset);
3379 work_tree = mbc_tree;
3382 else
3383 #endif /* not RE_ENABLE_I18N */
3385 #ifdef RE_ENABLE_I18N
3386 free_charset (mbcset);
3387 #endif
3388 /* Build a tree for simple bracket. */
3389 br_token.type = SIMPLE_BRACKET;
3390 br_token.opr.sbcset = sbcset;
3391 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3392 if (__glibc_unlikely (work_tree == NULL))
3393 goto parse_bracket_exp_espace;
3395 return work_tree;
3397 parse_bracket_exp_espace:
3398 *err = REG_ESPACE;
3399 parse_bracket_exp_free_return:
3400 re_free (sbcset);
3401 #ifdef RE_ENABLE_I18N
3402 free_charset (mbcset);
3403 #endif /* RE_ENABLE_I18N */
3404 return NULL;
3407 /* Parse an element in the bracket expression. */
3409 static reg_errcode_t
3410 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3411 re_token_t *token, int token_len, re_dfa_t *dfa,
3412 reg_syntax_t syntax, bool accept_hyphen)
3414 #ifdef RE_ENABLE_I18N
3415 int cur_char_size;
3416 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3417 if (cur_char_size > 1)
3419 elem->type = MB_CHAR;
3420 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3421 re_string_skip_bytes (regexp, cur_char_size);
3422 return REG_NOERROR;
3424 #endif /* RE_ENABLE_I18N */
3425 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3426 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3427 || token->type == OP_OPEN_EQUIV_CLASS)
3428 return parse_bracket_symbol (elem, regexp, token);
3429 if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3431 /* A '-' must only appear as anything but a range indicator before
3432 the closing bracket. Everything else is an error. */
3433 re_token_t token2;
3434 (void) peek_token_bracket (&token2, regexp, syntax);
3435 if (token2.type != OP_CLOSE_BRACKET)
3436 /* The actual error value is not standardized since this whole
3437 case is undefined. But ERANGE makes good sense. */
3438 return REG_ERANGE;
3440 elem->type = SB_CHAR;
3441 elem->opr.ch = token->opr.c;
3442 return REG_NOERROR;
3445 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3446 such as [:<character_class>:], [.<collating_element>.], and
3447 [=<equivalent_class>=]. */
3449 static reg_errcode_t
3450 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3451 re_token_t *token)
3453 unsigned char ch, delim = token->opr.c;
3454 int i = 0;
3455 if (re_string_eoi(regexp))
3456 return REG_EBRACK;
3457 for (;; ++i)
3459 if (i >= BRACKET_NAME_BUF_SIZE)
3460 return REG_EBRACK;
3461 if (token->type == OP_OPEN_CHAR_CLASS)
3462 ch = re_string_fetch_byte_case (regexp);
3463 else
3464 ch = re_string_fetch_byte (regexp);
3465 if (re_string_eoi(regexp))
3466 return REG_EBRACK;
3467 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3468 break;
3469 elem->opr.name[i] = ch;
3471 re_string_skip_bytes (regexp, 1);
3472 elem->opr.name[i] = '\0';
3473 switch (token->type)
3475 case OP_OPEN_COLL_ELEM:
3476 elem->type = COLL_SYM;
3477 break;
3478 case OP_OPEN_EQUIV_CLASS:
3479 elem->type = EQUIV_CLASS;
3480 break;
3481 case OP_OPEN_CHAR_CLASS:
3482 elem->type = CHAR_CLASS;
3483 break;
3484 default:
3485 break;
3487 return REG_NOERROR;
3490 /* Helper function for parse_bracket_exp.
3491 Build the equivalence class which is represented by NAME.
3492 The result are written to MBCSET and SBCSET.
3493 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3494 is a pointer argument since we may update it. */
3496 static reg_errcode_t
3497 #ifdef RE_ENABLE_I18N
3498 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3499 Idx *equiv_class_alloc, const unsigned char *name)
3500 #else /* not RE_ENABLE_I18N */
3501 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3502 #endif /* not RE_ENABLE_I18N */
3504 #ifdef _LIBC
3505 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3506 if (nrules != 0)
3508 const int32_t *table, *indirect;
3509 const unsigned char *weights, *extra, *cp;
3510 unsigned char char_buf[2];
3511 int32_t idx1, idx2;
3512 unsigned int ch;
3513 size_t len;
3514 /* Calculate the index for equivalence class. */
3515 cp = name;
3516 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3517 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3518 _NL_COLLATE_WEIGHTMB);
3519 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3520 _NL_COLLATE_EXTRAMB);
3521 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3522 _NL_COLLATE_INDIRECTMB);
3523 idx1 = findidx (table, indirect, extra, &cp, -1);
3524 if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
3525 /* This isn't a valid character. */
3526 return REG_ECOLLATE;
3528 /* Build single byte matching table for this equivalence class. */
3529 len = weights[idx1 & 0xffffff];
3530 for (ch = 0; ch < SBC_MAX; ++ch)
3532 char_buf[0] = ch;
3533 cp = char_buf;
3534 idx2 = findidx (table, indirect, extra, &cp, 1);
3536 idx2 = table[ch];
3538 if (idx2 == 0)
3539 /* This isn't a valid character. */
3540 continue;
3541 /* Compare only if the length matches and the collation rule
3542 index is the same. */
3543 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3544 && memcmp (weights + (idx1 & 0xffffff) + 1,
3545 weights + (idx2 & 0xffffff) + 1, len) == 0)
3546 bitset_set (sbcset, ch);
3548 /* Check whether the array has enough space. */
3549 if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3551 /* Not enough, realloc it. */
3552 /* +1 in case of mbcset->nequiv_classes is 0. */
3553 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3554 /* Use realloc since the array is NULL if *alloc == 0. */
3555 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3556 int32_t,
3557 new_equiv_class_alloc);
3558 if (__glibc_unlikely (new_equiv_classes == NULL))
3559 return REG_ESPACE;
3560 mbcset->equiv_classes = new_equiv_classes;
3561 *equiv_class_alloc = new_equiv_class_alloc;
3563 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3565 else
3566 #endif /* _LIBC */
3568 if (__glibc_unlikely (strlen ((const char *) name) != 1))
3569 return REG_ECOLLATE;
3570 bitset_set (sbcset, *name);
3572 return REG_NOERROR;
3575 /* Helper function for parse_bracket_exp.
3576 Build the character class which is represented by NAME.
3577 The result are written to MBCSET and SBCSET.
3578 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3579 is a pointer argument since we may update it. */
3581 static reg_errcode_t
3582 #ifdef RE_ENABLE_I18N
3583 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3584 re_charset_t *mbcset, Idx *char_class_alloc,
3585 const char *class_name, reg_syntax_t syntax)
3586 #else /* not RE_ENABLE_I18N */
3587 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3588 const char *class_name, reg_syntax_t syntax)
3589 #endif /* not RE_ENABLE_I18N */
3591 int i;
3592 const char *name = class_name;
3594 /* In case of REG_ICASE "upper" and "lower" match the both of
3595 upper and lower cases. */
3596 if ((syntax & RE_ICASE)
3597 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3598 name = "alpha";
3600 #ifdef RE_ENABLE_I18N
3601 /* Check the space of the arrays. */
3602 if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3604 /* Not enough, realloc it. */
3605 /* +1 in case of mbcset->nchar_classes is 0. */
3606 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3607 /* Use realloc since array is NULL if *alloc == 0. */
3608 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3609 new_char_class_alloc);
3610 if (__glibc_unlikely (new_char_classes == NULL))
3611 return REG_ESPACE;
3612 mbcset->char_classes = new_char_classes;
3613 *char_class_alloc = new_char_class_alloc;
3615 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3616 #endif /* RE_ENABLE_I18N */
3618 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3619 do { \
3620 if (__glibc_unlikely (trans != NULL)) \
3622 for (i = 0; i < SBC_MAX; ++i) \
3623 if (ctype_func (i)) \
3624 bitset_set (sbcset, trans[i]); \
3626 else \
3628 for (i = 0; i < SBC_MAX; ++i) \
3629 if (ctype_func (i)) \
3630 bitset_set (sbcset, i); \
3632 } while (0)
3634 if (strcmp (name, "alnum") == 0)
3635 BUILD_CHARCLASS_LOOP (isalnum);
3636 else if (strcmp (name, "cntrl") == 0)
3637 BUILD_CHARCLASS_LOOP (iscntrl);
3638 else if (strcmp (name, "lower") == 0)
3639 BUILD_CHARCLASS_LOOP (islower);
3640 else if (strcmp (name, "space") == 0)
3641 BUILD_CHARCLASS_LOOP (isspace);
3642 else if (strcmp (name, "alpha") == 0)
3643 BUILD_CHARCLASS_LOOP (isalpha);
3644 else if (strcmp (name, "digit") == 0)
3645 BUILD_CHARCLASS_LOOP (isdigit);
3646 else if (strcmp (name, "print") == 0)
3647 BUILD_CHARCLASS_LOOP (isprint);
3648 else if (strcmp (name, "upper") == 0)
3649 BUILD_CHARCLASS_LOOP (isupper);
3650 else if (strcmp (name, "blank") == 0)
3651 BUILD_CHARCLASS_LOOP (isblank);
3652 else if (strcmp (name, "graph") == 0)
3653 BUILD_CHARCLASS_LOOP (isgraph);
3654 else if (strcmp (name, "punct") == 0)
3655 BUILD_CHARCLASS_LOOP (ispunct);
3656 else if (strcmp (name, "xdigit") == 0)
3657 BUILD_CHARCLASS_LOOP (isxdigit);
3658 else
3659 return REG_ECTYPE;
3661 return REG_NOERROR;
3664 static bin_tree_t *
3665 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3666 const char *class_name,
3667 const char *extra, bool non_match,
3668 reg_errcode_t *err)
3670 re_bitset_ptr_t sbcset;
3671 #ifdef RE_ENABLE_I18N
3672 re_charset_t *mbcset;
3673 Idx alloc = 0;
3674 #endif /* not RE_ENABLE_I18N */
3675 reg_errcode_t ret;
3676 bin_tree_t *tree;
3678 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3679 if (__glibc_unlikely (sbcset == NULL))
3681 *err = REG_ESPACE;
3682 return NULL;
3684 #ifdef RE_ENABLE_I18N
3685 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3686 if (__glibc_unlikely (mbcset == NULL))
3688 re_free (sbcset);
3689 *err = REG_ESPACE;
3690 return NULL;
3692 mbcset->non_match = non_match;
3693 #endif /* RE_ENABLE_I18N */
3695 /* We don't care the syntax in this case. */
3696 ret = build_charclass (trans, sbcset,
3697 #ifdef RE_ENABLE_I18N
3698 mbcset, &alloc,
3699 #endif /* RE_ENABLE_I18N */
3700 class_name, 0);
3702 if (__glibc_unlikely (ret != REG_NOERROR))
3704 re_free (sbcset);
3705 #ifdef RE_ENABLE_I18N
3706 free_charset (mbcset);
3707 #endif /* RE_ENABLE_I18N */
3708 *err = ret;
3709 return NULL;
3711 /* \w match '_' also. */
3712 for (; *extra; extra++)
3713 bitset_set (sbcset, *extra);
3715 /* If it is non-matching list. */
3716 if (non_match)
3717 bitset_not (sbcset);
3719 #ifdef RE_ENABLE_I18N
3720 /* Ensure only single byte characters are set. */
3721 if (dfa->mb_cur_max > 1)
3722 bitset_mask (sbcset, dfa->sb_char);
3723 #endif
3725 /* Build a tree for simple bracket. */
3726 re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3727 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3728 if (__glibc_unlikely (tree == NULL))
3729 goto build_word_op_espace;
3731 #ifdef RE_ENABLE_I18N
3732 if (dfa->mb_cur_max > 1)
3734 bin_tree_t *mbc_tree;
3735 /* Build a tree for complex bracket. */
3736 br_token.type = COMPLEX_BRACKET;
3737 br_token.opr.mbcset = mbcset;
3738 dfa->has_mb_node = 1;
3739 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3740 if (__glibc_unlikely (mbc_tree == NULL))
3741 goto build_word_op_espace;
3742 /* Then join them by ALT node. */
3743 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3744 if (__glibc_likely (mbc_tree != NULL))
3745 return tree;
3747 else
3749 free_charset (mbcset);
3750 return tree;
3752 #else /* not RE_ENABLE_I18N */
3753 return tree;
3754 #endif /* not RE_ENABLE_I18N */
3756 build_word_op_espace:
3757 re_free (sbcset);
3758 #ifdef RE_ENABLE_I18N
3759 free_charset (mbcset);
3760 #endif /* RE_ENABLE_I18N */
3761 *err = REG_ESPACE;
3762 return NULL;
3765 /* This is intended for the expressions like "a{1,3}".
3766 Fetch a number from 'input', and return the number.
3767 Return -1 if the number field is empty like "{,1}".
3768 Return RE_DUP_MAX + 1 if the number field is too large.
3769 Return -2 if an error occurred. */
3771 static Idx
3772 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3774 Idx num = -1;
3775 unsigned char c;
3776 while (1)
3778 fetch_token (token, input, syntax);
3779 c = token->opr.c;
3780 if (__glibc_unlikely (token->type == END_OF_RE))
3781 return -2;
3782 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3783 break;
3784 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3785 ? -2
3786 : num == -1
3787 ? c - '0'
3788 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3790 return num;
3793 #ifdef RE_ENABLE_I18N
3794 static void
3795 free_charset (re_charset_t *cset)
3797 re_free (cset->mbchars);
3798 # ifdef _LIBC
3799 re_free (cset->coll_syms);
3800 re_free (cset->equiv_classes);
3801 # endif
3802 re_free (cset->range_starts);
3803 re_free (cset->range_ends);
3804 re_free (cset->char_classes);
3805 re_free (cset);
3807 #endif /* RE_ENABLE_I18N */
3809 /* Functions for binary tree operation. */
3811 /* Create a tree node. */
3813 static bin_tree_t *
3814 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3815 re_token_type_t type)
3817 re_token_t t = { .type = type };
3818 return create_token_tree (dfa, left, right, &t);
3821 static bin_tree_t *
3822 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3823 const re_token_t *token)
3825 bin_tree_t *tree;
3826 if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3828 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3830 if (storage == NULL)
3831 return NULL;
3832 storage->next = dfa->str_tree_storage;
3833 dfa->str_tree_storage = storage;
3834 dfa->str_tree_storage_idx = 0;
3836 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3838 tree->parent = NULL;
3839 tree->left = left;
3840 tree->right = right;
3841 tree->token = *token;
3842 tree->token.duplicated = 0;
3843 tree->token.opt_subexp = 0;
3844 tree->first = NULL;
3845 tree->next = NULL;
3846 tree->node_idx = -1;
3848 if (left != NULL)
3849 left->parent = tree;
3850 if (right != NULL)
3851 right->parent = tree;
3852 return tree;
3855 /* Mark the tree SRC as an optional subexpression.
3856 To be called from preorder or postorder. */
3858 static reg_errcode_t
3859 mark_opt_subexp (void *extra, bin_tree_t *node)
3861 Idx idx = (uintptr_t) extra;
3862 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3863 node->token.opt_subexp = 1;
3865 return REG_NOERROR;
3868 /* Free the allocated memory inside NODE. */
3870 static void
3871 free_token (re_token_t *node)
3873 #ifdef RE_ENABLE_I18N
3874 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3875 free_charset (node->opr.mbcset);
3876 else
3877 #endif /* RE_ENABLE_I18N */
3878 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3879 re_free (node->opr.sbcset);
3882 /* Worker function for tree walking. Free the allocated memory inside NODE
3883 and its children. */
3885 static reg_errcode_t
3886 free_tree (void *extra, bin_tree_t *node)
3888 free_token (&node->token);
3889 return REG_NOERROR;
3893 /* Duplicate the node SRC, and return new node. This is a preorder
3894 visit similar to the one implemented by the generic visitor, but
3895 we need more infrastructure to maintain two parallel trees --- so,
3896 it's easier to duplicate. */
3898 static bin_tree_t *
3899 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3901 const bin_tree_t *node;
3902 bin_tree_t *dup_root;
3903 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3905 for (node = root; ; )
3907 /* Create a new tree and link it back to the current parent. */
3908 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3909 if (*p_new == NULL)
3910 return NULL;
3911 (*p_new)->parent = dup_node;
3912 (*p_new)->token.duplicated = 1;
3913 dup_node = *p_new;
3915 /* Go to the left node, or up and to the right. */
3916 if (node->left)
3918 node = node->left;
3919 p_new = &dup_node->left;
3921 else
3923 const bin_tree_t *prev = NULL;
3924 while (node->right == prev || node->right == NULL)
3926 prev = node;
3927 node = node->parent;
3928 dup_node = dup_node->parent;
3929 if (!node)
3930 return dup_root;
3932 node = node->right;
3933 p_new = &dup_node->right;