microblaze: Avoid clobbering register parameters in syscall
[glibc.git] / posix / regcomp.c
blob93bb0a0538a1f010b263bc8bca108ad3e54afee0
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2020 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
20 #ifdef _LIBC
21 # include <locale/weight.h>
22 #endif
24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 size_t length, reg_syntax_t syntax);
26 static void re_compile_fastmap_iter (regex_t *bufp,
27 const re_dfastate_t *init_state,
28 char *fastmap);
29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30 #ifdef RE_ENABLE_I18N
31 static void free_charset (re_charset_t *cset);
32 #endif /* RE_ENABLE_I18N */
33 static void free_workarea_compile (regex_t *preg);
34 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35 #ifdef RE_ENABLE_I18N
36 static void optimize_utf8 (re_dfa_t *dfa);
37 #endif
38 static reg_errcode_t analyze (regex_t *preg);
39 static reg_errcode_t preorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42 static reg_errcode_t postorder (bin_tree_t *root,
43 reg_errcode_t (fn (void *, bin_tree_t *)),
44 void *extra);
45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 bin_tree_t *node);
49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54 unsigned int constraint);
55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
57 Idx node, bool root);
58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59 static Idx fetch_number (re_string_t *input, re_token_t *token,
60 reg_syntax_t syntax);
61 static int peek_token (re_token_t *token, re_string_t *input,
62 reg_syntax_t syntax);
63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
64 reg_syntax_t syntax, reg_errcode_t *err);
65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 Idx nest, reg_errcode_t *err);
68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 Idx nest, reg_errcode_t *err);
71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 Idx nest, reg_errcode_t *err);
74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
75 re_token_t *token, reg_syntax_t syntax,
76 Idx nest, reg_errcode_t *err);
77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
78 re_dfa_t *dfa, re_token_t *token,
79 reg_syntax_t syntax, reg_errcode_t *err);
80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
81 re_token_t *token, reg_syntax_t syntax,
82 reg_errcode_t *err);
83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84 re_string_t *regexp,
85 re_token_t *token, int token_len,
86 re_dfa_t *dfa,
87 reg_syntax_t syntax,
88 bool accept_hyphen);
89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 re_string_t *regexp,
91 re_token_t *token);
92 #ifdef RE_ENABLE_I18N
93 static reg_errcode_t build_equiv_class (bitset_t sbcset,
94 re_charset_t *mbcset,
95 Idx *equiv_class_alloc,
96 const unsigned char *name);
97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98 bitset_t sbcset,
99 re_charset_t *mbcset,
100 Idx *char_class_alloc,
101 const char *class_name,
102 reg_syntax_t syntax);
103 #else /* not RE_ENABLE_I18N */
104 static reg_errcode_t build_equiv_class (bitset_t sbcset,
105 const unsigned char *name);
106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107 bitset_t sbcset,
108 const char *class_name,
109 reg_syntax_t syntax);
110 #endif /* not RE_ENABLE_I18N */
111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
112 RE_TRANSLATE_TYPE trans,
113 const char *class_name,
114 const char *extra,
115 bool non_match, reg_errcode_t *err);
116 static bin_tree_t *create_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 re_token_type_t type);
119 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
120 bin_tree_t *left, bin_tree_t *right,
121 const re_token_t *token);
122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
123 static void free_token (re_token_t *node);
124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
127 /* This table gives an error message for each of the error codes listed
128 in regex.h. Obviously the order here has to be same as there.
129 POSIX doesn't require that we do anything for REG_NOERROR,
130 but why not be nice? */
132 static const char __re_error_msgid[] =
134 #define REG_NOERROR_IDX 0
135 gettext_noop ("Success") /* REG_NOERROR */
136 "\0"
137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138 gettext_noop ("No match") /* REG_NOMATCH */
139 "\0"
140 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
142 "\0"
143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
145 "\0"
146 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
148 "\0"
149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
151 "\0"
152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
154 "\0"
155 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
156 gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
157 "\0"
158 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
159 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
160 "\0"
161 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
163 "\0"
164 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
166 "\0"
167 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168 gettext_noop ("Invalid range end") /* REG_ERANGE */
169 "\0"
170 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171 gettext_noop ("Memory exhausted") /* REG_ESPACE */
172 "\0"
173 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
175 "\0"
176 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177 gettext_noop ("Premature end of regular expression") /* REG_EEND */
178 "\0"
179 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180 gettext_noop ("Regular expression too big") /* REG_ESIZE */
181 "\0"
182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
186 static const size_t __re_error_msgid_idx[] =
188 REG_NOERROR_IDX,
189 REG_NOMATCH_IDX,
190 REG_BADPAT_IDX,
191 REG_ECOLLATE_IDX,
192 REG_ECTYPE_IDX,
193 REG_EESCAPE_IDX,
194 REG_ESUBREG_IDX,
195 REG_EBRACK_IDX,
196 REG_EPAREN_IDX,
197 REG_EBRACE_IDX,
198 REG_BADBR_IDX,
199 REG_ERANGE_IDX,
200 REG_ESPACE_IDX,
201 REG_BADRPT_IDX,
202 REG_EEND_IDX,
203 REG_ESIZE_IDX,
204 REG_ERPAREN_IDX
207 /* Entry points for GNU code. */
209 /* re_compile_pattern is the GNU regular expression compiler: it
210 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211 Returns 0 if the pattern was valid, otherwise an error string.
213 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214 are set in BUFP on entry. */
216 const char *
217 re_compile_pattern (const char *pattern, size_t length,
218 struct re_pattern_buffer *bufp)
220 reg_errcode_t ret;
222 /* And GNU code determines whether or not to get register information
223 by passing null for the REGS argument to re_match, etc., not by
224 setting no_sub, unless RE_NO_SUB is set. */
225 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
227 /* Match anchors at newline. */
228 bufp->newline_anchor = 1;
230 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
232 if (!ret)
233 return NULL;
234 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
236 weak_alias (__re_compile_pattern, re_compile_pattern)
238 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
239 also be assigned to arbitrarily: each pattern buffer stores its own
240 syntax, so it can be changed between regex compilations. */
241 /* This has no initializer because initialized variables in Emacs
242 become read-only after dumping. */
243 reg_syntax_t re_syntax_options;
246 /* Specify the precise syntax of regexps for compilation. This provides
247 for compatibility for various utilities which historically have
248 different, incompatible syntaxes.
250 The argument SYNTAX is a bit mask comprised of the various bits
251 defined in regex.h. We return the old syntax. */
253 reg_syntax_t
254 re_set_syntax (reg_syntax_t syntax)
256 reg_syntax_t ret = re_syntax_options;
258 re_syntax_options = syntax;
259 return ret;
261 weak_alias (__re_set_syntax, re_set_syntax)
264 re_compile_fastmap (struct re_pattern_buffer *bufp)
266 re_dfa_t *dfa = bufp->buffer;
267 char *fastmap = bufp->fastmap;
269 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
270 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271 if (dfa->init_state != dfa->init_state_word)
272 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273 if (dfa->init_state != dfa->init_state_nl)
274 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275 if (dfa->init_state != dfa->init_state_begbuf)
276 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277 bufp->fastmap_accurate = 1;
278 return 0;
280 weak_alias (__re_compile_fastmap, re_compile_fastmap)
282 static inline void
283 __attribute__ ((always_inline))
284 re_set_fastmap (char *fastmap, bool icase, int ch)
286 fastmap[ch] = 1;
287 if (icase)
288 fastmap[tolower (ch)] = 1;
291 /* Helper function for re_compile_fastmap.
292 Compile fastmap for the initial_state INIT_STATE. */
294 static void
295 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
296 char *fastmap)
298 re_dfa_t *dfa = bufp->buffer;
299 Idx node_cnt;
300 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
301 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
303 Idx node = init_state->nodes.elems[node_cnt];
304 re_token_type_t type = dfa->nodes[node].type;
306 if (type == CHARACTER)
308 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
309 #ifdef RE_ENABLE_I18N
310 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
312 unsigned char buf[MB_LEN_MAX];
313 unsigned char *p;
314 wchar_t wc;
315 mbstate_t state;
317 p = buf;
318 *p++ = dfa->nodes[node].opr.c;
319 while (++node < dfa->nodes_len
320 && dfa->nodes[node].type == CHARACTER
321 && dfa->nodes[node].mb_partial)
322 *p++ = dfa->nodes[node].opr.c;
323 memset (&state, '\0', sizeof (state));
324 if (__mbrtowc (&wc, (const char *) buf, p - buf,
325 &state) == p - buf
326 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
327 != (size_t) -1))
328 re_set_fastmap (fastmap, false, buf[0]);
330 #endif
332 else if (type == SIMPLE_BRACKET)
334 int i, ch;
335 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
337 int j;
338 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
339 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
340 if (w & ((bitset_word_t) 1 << j))
341 re_set_fastmap (fastmap, icase, ch);
344 #ifdef RE_ENABLE_I18N
345 else if (type == COMPLEX_BRACKET)
347 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
348 Idx i;
350 # ifdef _LIBC
351 /* See if we have to try all bytes which start multiple collation
352 elements.
353 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
354 collation element, and don't catch 'b' since 'b' is
355 the only collation element which starts from 'b' (and
356 it is caught by SIMPLE_BRACKET). */
357 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
358 && (cset->ncoll_syms || cset->nranges))
360 const int32_t *table = (const int32_t *)
361 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
362 for (i = 0; i < SBC_MAX; ++i)
363 if (table[i] < 0)
364 re_set_fastmap (fastmap, icase, i);
366 # endif /* _LIBC */
368 /* See if we have to start the match at all multibyte characters,
369 i.e. where we would not find an invalid sequence. This only
370 applies to multibyte character sets; for single byte character
371 sets, the SIMPLE_BRACKET again suffices. */
372 if (dfa->mb_cur_max > 1
373 && (cset->nchar_classes || cset->non_match || cset->nranges
374 # ifdef _LIBC
375 || cset->nequiv_classes
376 # endif /* _LIBC */
379 unsigned char c = 0;
382 mbstate_t mbs;
383 memset (&mbs, 0, sizeof (mbs));
384 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
385 re_set_fastmap (fastmap, false, (int) c);
387 while (++c != 0);
390 else
392 /* ... Else catch all bytes which can start the mbchars. */
393 for (i = 0; i < cset->nmbchars; ++i)
395 char buf[256];
396 mbstate_t state;
397 memset (&state, '\0', sizeof (state));
398 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
399 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
400 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
402 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
403 != (size_t) -1)
404 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
409 #endif /* RE_ENABLE_I18N */
410 else if (type == OP_PERIOD
411 #ifdef RE_ENABLE_I18N
412 || type == OP_UTF8_PERIOD
413 #endif /* RE_ENABLE_I18N */
414 || type == END_OF_RE)
416 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
417 if (type == END_OF_RE)
418 bufp->can_be_null = 1;
419 return;
424 /* Entry point for POSIX code. */
425 /* regcomp takes a regular expression as a string and compiles it.
427 PREG is a regex_t *. We do not expect any fields to be initialized,
428 since POSIX says we shouldn't. Thus, we set
430 'buffer' to the compiled pattern;
431 'used' to the length of the compiled pattern;
432 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
433 REG_EXTENDED bit in CFLAGS is set; otherwise, to
434 RE_SYNTAX_POSIX_BASIC;
435 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
436 'fastmap' to an allocated space for the fastmap;
437 'fastmap_accurate' to zero;
438 're_nsub' to the number of subexpressions in PATTERN.
440 PATTERN is the address of the pattern string.
442 CFLAGS is a series of bits which affect compilation.
444 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
445 use POSIX basic syntax.
447 If REG_NEWLINE is set, then . and [^...] don't match newline.
448 Also, regexec will try a match beginning after every newline.
450 If REG_ICASE is set, then we considers upper- and lowercase
451 versions of letters to be equivalent when matching.
453 If REG_NOSUB is set, then when PREG is passed to regexec, that
454 routine will report only success or failure, and nothing about the
455 registers.
457 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
458 the return codes and their meanings.) */
461 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
463 reg_errcode_t ret;
464 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
465 : RE_SYNTAX_POSIX_BASIC);
467 preg->buffer = NULL;
468 preg->allocated = 0;
469 preg->used = 0;
471 /* Try to allocate space for the fastmap. */
472 preg->fastmap = re_malloc (char, SBC_MAX);
473 if (__glibc_unlikely (preg->fastmap == NULL))
474 return REG_ESPACE;
476 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
478 /* If REG_NEWLINE is set, newlines are treated differently. */
479 if (cflags & REG_NEWLINE)
480 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
481 syntax &= ~RE_DOT_NEWLINE;
482 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
483 /* It also changes the matching behavior. */
484 preg->newline_anchor = 1;
486 else
487 preg->newline_anchor = 0;
488 preg->no_sub = !!(cflags & REG_NOSUB);
489 preg->translate = NULL;
491 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
493 /* POSIX doesn't distinguish between an unmatched open-group and an
494 unmatched close-group: both are REG_EPAREN. */
495 if (ret == REG_ERPAREN)
496 ret = REG_EPAREN;
498 /* We have already checked preg->fastmap != NULL. */
499 if (__glibc_likely (ret == REG_NOERROR))
500 /* Compute the fastmap now, since regexec cannot modify the pattern
501 buffer. This function never fails in this implementation. */
502 (void) re_compile_fastmap (preg);
503 else
505 /* Some error occurred while compiling the expression. */
506 re_free (preg->fastmap);
507 preg->fastmap = NULL;
510 return (int) ret;
512 libc_hidden_def (__regcomp)
513 weak_alias (__regcomp, regcomp)
515 /* Returns a message corresponding to an error code, ERRCODE, returned
516 from either regcomp or regexec. We don't use PREG here. */
518 size_t
519 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
520 size_t errbuf_size)
522 const char *msg;
523 size_t msg_size;
524 int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
526 if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
527 /* Only error codes returned by the rest of the code should be passed
528 to this routine. If we are given anything else, or if other regex
529 code generates an invalid error code, then the program has a bug.
530 Dump core so we can fix it. */
531 abort ();
533 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
535 msg_size = strlen (msg) + 1; /* Includes the null. */
537 if (__glibc_likely (errbuf_size != 0))
539 size_t cpy_size = msg_size;
540 if (__glibc_unlikely (msg_size > errbuf_size))
542 cpy_size = errbuf_size - 1;
543 errbuf[cpy_size] = '\0';
545 memcpy (errbuf, msg, cpy_size);
548 return msg_size;
550 weak_alias (__regerror, regerror)
553 #ifdef RE_ENABLE_I18N
554 /* This static array is used for the map to single-byte characters when
555 UTF-8 is used. Otherwise we would allocate memory just to initialize
556 it the same all the time. UTF-8 is the preferred encoding so this is
557 a worthwhile optimization. */
558 static const bitset_t utf8_sb_map =
560 /* Set the first 128 bits. */
561 # if defined __GNUC__ && !defined __STRICT_ANSI__
562 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
563 # else
564 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
565 # error "bitset_word_t is narrower than 32 bits"
566 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
567 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
568 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
569 BITSET_WORD_MAX, BITSET_WORD_MAX,
570 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
571 BITSET_WORD_MAX,
572 # endif
573 (BITSET_WORD_MAX
574 >> (SBC_MAX % BITSET_WORD_BITS == 0
576 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
577 # endif
579 #endif
582 static void
583 free_dfa_content (re_dfa_t *dfa)
585 Idx i, j;
587 if (dfa->nodes)
588 for (i = 0; i < dfa->nodes_len; ++i)
589 free_token (dfa->nodes + i);
590 re_free (dfa->nexts);
591 for (i = 0; i < dfa->nodes_len; ++i)
593 if (dfa->eclosures != NULL)
594 re_node_set_free (dfa->eclosures + i);
595 if (dfa->inveclosures != NULL)
596 re_node_set_free (dfa->inveclosures + i);
597 if (dfa->edests != NULL)
598 re_node_set_free (dfa->edests + i);
600 re_free (dfa->edests);
601 re_free (dfa->eclosures);
602 re_free (dfa->inveclosures);
603 re_free (dfa->nodes);
605 if (dfa->state_table)
606 for (i = 0; i <= dfa->state_hash_mask; ++i)
608 struct re_state_table_entry *entry = dfa->state_table + i;
609 for (j = 0; j < entry->num; ++j)
611 re_dfastate_t *state = entry->array[j];
612 free_state (state);
614 re_free (entry->array);
616 re_free (dfa->state_table);
617 #ifdef RE_ENABLE_I18N
618 if (dfa->sb_char != utf8_sb_map)
619 re_free (dfa->sb_char);
620 #endif
621 re_free (dfa->subexp_map);
622 #ifdef DEBUG
623 re_free (dfa->re_str);
624 #endif
626 re_free (dfa);
630 /* Free dynamically allocated space used by PREG. */
632 void
633 regfree (regex_t *preg)
635 re_dfa_t *dfa = preg->buffer;
636 if (__glibc_likely (dfa != NULL))
638 lock_fini (dfa->lock);
639 free_dfa_content (dfa);
641 preg->buffer = NULL;
642 preg->allocated = 0;
644 re_free (preg->fastmap);
645 preg->fastmap = NULL;
647 re_free (preg->translate);
648 preg->translate = NULL;
650 libc_hidden_def (__regfree)
651 weak_alias (__regfree, regfree)
653 /* Entry points compatible with 4.2 BSD regex library. We don't define
654 them unless specifically requested. */
656 #if defined _REGEX_RE_COMP || defined _LIBC
658 /* BSD has one and only one pattern buffer. */
659 static struct re_pattern_buffer re_comp_buf;
661 char *
662 # ifdef _LIBC
663 /* Make these definitions weak in libc, so POSIX programs can redefine
664 these names if they don't use our functions, and still use
665 regcomp/regexec above without link errors. */
666 weak_function
667 # endif
668 re_comp (const char *s)
670 reg_errcode_t ret;
671 char *fastmap;
673 if (!s)
675 if (!re_comp_buf.buffer)
676 return gettext ("No previous regular expression");
677 return 0;
680 if (re_comp_buf.buffer)
682 fastmap = re_comp_buf.fastmap;
683 re_comp_buf.fastmap = NULL;
684 __regfree (&re_comp_buf);
685 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
686 re_comp_buf.fastmap = fastmap;
689 if (re_comp_buf.fastmap == NULL)
691 re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
692 if (re_comp_buf.fastmap == NULL)
693 return (char *) gettext (__re_error_msgid
694 + __re_error_msgid_idx[(int) REG_ESPACE]);
697 /* Since 're_exec' always passes NULL for the 'regs' argument, we
698 don't need to initialize the pattern buffer fields which affect it. */
700 /* Match anchors at newlines. */
701 re_comp_buf.newline_anchor = 1;
703 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
705 if (!ret)
706 return NULL;
708 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
709 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
712 #ifdef _LIBC
713 libc_freeres_fn (free_mem)
715 __regfree (&re_comp_buf);
717 #endif
719 #endif /* _REGEX_RE_COMP */
721 /* Internal entry point.
722 Compile the regular expression PATTERN, whose length is LENGTH.
723 SYNTAX indicate regular expression's syntax. */
725 static reg_errcode_t
726 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
727 reg_syntax_t syntax)
729 reg_errcode_t err = REG_NOERROR;
730 re_dfa_t *dfa;
731 re_string_t regexp;
733 /* Initialize the pattern buffer. */
734 preg->fastmap_accurate = 0;
735 preg->syntax = syntax;
736 preg->not_bol = preg->not_eol = 0;
737 preg->used = 0;
738 preg->re_nsub = 0;
739 preg->can_be_null = 0;
740 preg->regs_allocated = REGS_UNALLOCATED;
742 /* Initialize the dfa. */
743 dfa = preg->buffer;
744 if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
746 /* If zero allocated, but buffer is non-null, try to realloc
747 enough space. This loses if buffer's address is bogus, but
748 that is the user's responsibility. If ->buffer is NULL this
749 is a simple allocation. */
750 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
751 if (dfa == NULL)
752 return REG_ESPACE;
753 preg->allocated = sizeof (re_dfa_t);
754 preg->buffer = dfa;
756 preg->used = sizeof (re_dfa_t);
758 err = init_dfa (dfa, length);
759 if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
760 err = REG_ESPACE;
761 if (__glibc_unlikely (err != REG_NOERROR))
763 free_dfa_content (dfa);
764 preg->buffer = NULL;
765 preg->allocated = 0;
766 return err;
768 #ifdef DEBUG
769 /* Note: length+1 will not overflow since it is checked in init_dfa. */
770 dfa->re_str = re_malloc (char, length + 1);
771 strncpy (dfa->re_str, pattern, length + 1);
772 #endif
774 err = re_string_construct (&regexp, pattern, length, preg->translate,
775 (syntax & RE_ICASE) != 0, dfa);
776 if (__glibc_unlikely (err != REG_NOERROR))
778 re_compile_internal_free_return:
779 free_workarea_compile (preg);
780 re_string_destruct (&regexp);
781 lock_fini (dfa->lock);
782 free_dfa_content (dfa);
783 preg->buffer = NULL;
784 preg->allocated = 0;
785 return err;
788 /* Parse the regular expression, and build a structure tree. */
789 preg->re_nsub = 0;
790 dfa->str_tree = parse (&regexp, preg, syntax, &err);
791 if (__glibc_unlikely (dfa->str_tree == NULL))
792 goto re_compile_internal_free_return;
794 /* Analyze the tree and create the nfa. */
795 err = analyze (preg);
796 if (__glibc_unlikely (err != REG_NOERROR))
797 goto re_compile_internal_free_return;
799 #ifdef RE_ENABLE_I18N
800 /* If possible, do searching in single byte encoding to speed things up. */
801 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
802 optimize_utf8 (dfa);
803 #endif
805 /* Then create the initial state of the dfa. */
806 err = create_initial_state (dfa);
808 /* Release work areas. */
809 free_workarea_compile (preg);
810 re_string_destruct (&regexp);
812 if (__glibc_unlikely (err != REG_NOERROR))
814 lock_fini (dfa->lock);
815 free_dfa_content (dfa);
816 preg->buffer = NULL;
817 preg->allocated = 0;
820 return err;
823 /* Initialize DFA. We use the length of the regular expression PAT_LEN
824 as the initial length of some arrays. */
826 static reg_errcode_t
827 init_dfa (re_dfa_t *dfa, size_t pat_len)
829 __re_size_t table_size;
830 #ifndef _LIBC
831 const char *codeset_name;
832 #endif
833 #ifdef RE_ENABLE_I18N
834 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
835 #else
836 size_t max_i18n_object_size = 0;
837 #endif
838 size_t max_object_size =
839 MAX (sizeof (struct re_state_table_entry),
840 MAX (sizeof (re_token_t),
841 MAX (sizeof (re_node_set),
842 MAX (sizeof (regmatch_t),
843 max_i18n_object_size))));
845 memset (dfa, '\0', sizeof (re_dfa_t));
847 /* Force allocation of str_tree_storage the first time. */
848 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
850 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
851 calculation below, and for similar doubling calculations
852 elsewhere. And it's <= rather than <, because some of the
853 doubling calculations add 1 afterwards. */
854 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
855 <= pat_len))
856 return REG_ESPACE;
858 dfa->nodes_alloc = pat_len + 1;
859 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
861 /* table_size = 2 ^ ceil(log pat_len) */
862 for (table_size = 1; ; table_size <<= 1)
863 if (table_size > pat_len)
864 break;
866 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
867 dfa->state_hash_mask = table_size - 1;
869 dfa->mb_cur_max = MB_CUR_MAX;
870 #ifdef _LIBC
871 if (dfa->mb_cur_max == 6
872 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
873 dfa->is_utf8 = 1;
874 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
875 != 0);
876 #else
877 codeset_name = nl_langinfo (CODESET);
878 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
879 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
880 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
881 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
882 dfa->is_utf8 = 1;
884 /* We check exhaustively in the loop below if this charset is a
885 superset of ASCII. */
886 dfa->map_notascii = 0;
887 #endif
889 #ifdef RE_ENABLE_I18N
890 if (dfa->mb_cur_max > 1)
892 if (dfa->is_utf8)
893 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
894 else
896 int i, j, ch;
898 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
899 if (__glibc_unlikely (dfa->sb_char == NULL))
900 return REG_ESPACE;
902 /* Set the bits corresponding to single byte chars. */
903 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
904 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
906 wint_t wch = __btowc (ch);
907 if (wch != WEOF)
908 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
909 # ifndef _LIBC
910 if (isascii (ch) && wch != ch)
911 dfa->map_notascii = 1;
912 # endif
916 #endif
918 if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
919 return REG_ESPACE;
920 return REG_NOERROR;
923 /* Initialize WORD_CHAR table, which indicate which character is
924 "word". In this case "word" means that it is the word construction
925 character used by some operators like "\<", "\>", etc. */
927 static void
928 init_word_char (re_dfa_t *dfa)
930 int i = 0;
931 int j;
932 int ch = 0;
933 dfa->word_ops_used = 1;
934 if (__glibc_likely (dfa->map_notascii == 0))
936 /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
937 them, an issue when this code is used in Gnulib. */
938 bitset_word_t bits0 = 0x00000000;
939 bitset_word_t bits1 = 0x03ff0000;
940 bitset_word_t bits2 = 0x87fffffe;
941 bitset_word_t bits3 = 0x07fffffe;
942 if (BITSET_WORD_BITS == 64)
944 /* Pacify gcc -Woverflow on 32-bit platformns. */
945 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
946 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
947 i = 2;
949 else if (BITSET_WORD_BITS == 32)
951 dfa->word_char[0] = bits0;
952 dfa->word_char[1] = bits1;
953 dfa->word_char[2] = bits2;
954 dfa->word_char[3] = bits3;
955 i = 4;
957 else
958 goto general_case;
959 ch = 128;
961 if (__glibc_likely (dfa->is_utf8))
963 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
964 return;
968 general_case:
969 for (; i < BITSET_WORDS; ++i)
970 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
971 if (isalnum (ch) || ch == '_')
972 dfa->word_char[i] |= (bitset_word_t) 1 << j;
975 /* Free the work area which are only used while compiling. */
977 static void
978 free_workarea_compile (regex_t *preg)
980 re_dfa_t *dfa = preg->buffer;
981 bin_tree_storage_t *storage, *next;
982 for (storage = dfa->str_tree_storage; storage; storage = next)
984 next = storage->next;
985 re_free (storage);
987 dfa->str_tree_storage = NULL;
988 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
989 dfa->str_tree = NULL;
990 re_free (dfa->org_indices);
991 dfa->org_indices = NULL;
994 /* Create initial states for all contexts. */
996 static reg_errcode_t
997 create_initial_state (re_dfa_t *dfa)
999 Idx first, i;
1000 reg_errcode_t err;
1001 re_node_set init_nodes;
1003 /* Initial states have the epsilon closure of the node which is
1004 the first node of the regular expression. */
1005 first = dfa->str_tree->first->node_idx;
1006 dfa->init_node = first;
1007 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1008 if (__glibc_unlikely (err != REG_NOERROR))
1009 return err;
1011 /* The back-references which are in initial states can epsilon transit,
1012 since in this case all of the subexpressions can be null.
1013 Then we add epsilon closures of the nodes which are the next nodes of
1014 the back-references. */
1015 if (dfa->nbackref > 0)
1016 for (i = 0; i < init_nodes.nelem; ++i)
1018 Idx node_idx = init_nodes.elems[i];
1019 re_token_type_t type = dfa->nodes[node_idx].type;
1021 Idx clexp_idx;
1022 if (type != OP_BACK_REF)
1023 continue;
1024 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1026 re_token_t *clexp_node;
1027 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1028 if (clexp_node->type == OP_CLOSE_SUBEXP
1029 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1030 break;
1032 if (clexp_idx == init_nodes.nelem)
1033 continue;
1035 if (type == OP_BACK_REF)
1037 Idx dest_idx = dfa->edests[node_idx].elems[0];
1038 if (!re_node_set_contains (&init_nodes, dest_idx))
1040 reg_errcode_t merge_err
1041 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1042 if (merge_err != REG_NOERROR)
1043 return merge_err;
1044 i = 0;
1049 /* It must be the first time to invoke acquire_state. */
1050 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1051 /* We don't check ERR here, since the initial state must not be NULL. */
1052 if (__glibc_unlikely (dfa->init_state == NULL))
1053 return err;
1054 if (dfa->init_state->has_constraint)
1056 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1057 CONTEXT_WORD);
1058 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1059 CONTEXT_NEWLINE);
1060 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1061 &init_nodes,
1062 CONTEXT_NEWLINE
1063 | CONTEXT_BEGBUF);
1064 if (__glibc_unlikely (dfa->init_state_word == NULL
1065 || dfa->init_state_nl == NULL
1066 || dfa->init_state_begbuf == NULL))
1067 return err;
1069 else
1070 dfa->init_state_word = dfa->init_state_nl
1071 = dfa->init_state_begbuf = dfa->init_state;
1073 re_node_set_free (&init_nodes);
1074 return REG_NOERROR;
1077 #ifdef RE_ENABLE_I18N
1078 /* If it is possible to do searching in single byte encoding instead of UTF-8
1079 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1080 DFA nodes where needed. */
1082 static void
1083 optimize_utf8 (re_dfa_t *dfa)
1085 Idx node;
1086 int i;
1087 bool mb_chars = false;
1088 bool has_period = false;
1090 for (node = 0; node < dfa->nodes_len; ++node)
1091 switch (dfa->nodes[node].type)
1093 case CHARACTER:
1094 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1095 mb_chars = true;
1096 break;
1097 case ANCHOR:
1098 switch (dfa->nodes[node].opr.ctx_type)
1100 case LINE_FIRST:
1101 case LINE_LAST:
1102 case BUF_FIRST:
1103 case BUF_LAST:
1104 break;
1105 default:
1106 /* Word anchors etc. cannot be handled. It's okay to test
1107 opr.ctx_type since constraints (for all DFA nodes) are
1108 created by ORing one or more opr.ctx_type values. */
1109 return;
1111 break;
1112 case OP_PERIOD:
1113 has_period = true;
1114 break;
1115 case OP_BACK_REF:
1116 case OP_ALT:
1117 case END_OF_RE:
1118 case OP_DUP_ASTERISK:
1119 case OP_OPEN_SUBEXP:
1120 case OP_CLOSE_SUBEXP:
1121 break;
1122 case COMPLEX_BRACKET:
1123 return;
1124 case SIMPLE_BRACKET:
1125 /* Just double check. */
1127 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1129 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1130 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1132 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1133 return;
1134 rshift = 0;
1137 break;
1138 default:
1139 abort ();
1142 if (mb_chars || has_period)
1143 for (node = 0; node < dfa->nodes_len; ++node)
1145 if (dfa->nodes[node].type == CHARACTER
1146 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1147 dfa->nodes[node].mb_partial = 0;
1148 else if (dfa->nodes[node].type == OP_PERIOD)
1149 dfa->nodes[node].type = OP_UTF8_PERIOD;
1152 /* The search can be in single byte locale. */
1153 dfa->mb_cur_max = 1;
1154 dfa->is_utf8 = 0;
1155 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1157 #endif
1159 /* Analyze the structure tree, and calculate "first", "next", "edest",
1160 "eclosure", and "inveclosure". */
1162 static reg_errcode_t
1163 analyze (regex_t *preg)
1165 re_dfa_t *dfa = preg->buffer;
1166 reg_errcode_t ret;
1168 /* Allocate arrays. */
1169 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1170 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1171 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1172 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1173 if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1174 || dfa->edests == NULL || dfa->eclosures == NULL))
1175 return REG_ESPACE;
1177 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1178 if (dfa->subexp_map != NULL)
1180 Idx i;
1181 for (i = 0; i < preg->re_nsub; i++)
1182 dfa->subexp_map[i] = i;
1183 preorder (dfa->str_tree, optimize_subexps, dfa);
1184 for (i = 0; i < preg->re_nsub; i++)
1185 if (dfa->subexp_map[i] != i)
1186 break;
1187 if (i == preg->re_nsub)
1189 re_free (dfa->subexp_map);
1190 dfa->subexp_map = NULL;
1194 ret = postorder (dfa->str_tree, lower_subexps, preg);
1195 if (__glibc_unlikely (ret != REG_NOERROR))
1196 return ret;
1197 ret = postorder (dfa->str_tree, calc_first, dfa);
1198 if (__glibc_unlikely (ret != REG_NOERROR))
1199 return ret;
1200 preorder (dfa->str_tree, calc_next, dfa);
1201 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1202 if (__glibc_unlikely (ret != REG_NOERROR))
1203 return ret;
1204 ret = calc_eclosure (dfa);
1205 if (__glibc_unlikely (ret != REG_NOERROR))
1206 return ret;
1208 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1209 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1210 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1211 || dfa->nbackref)
1213 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1214 if (__glibc_unlikely (dfa->inveclosures == NULL))
1215 return REG_ESPACE;
1216 ret = calc_inveclosure (dfa);
1219 return ret;
1222 /* Our parse trees are very unbalanced, so we cannot use a stack to
1223 implement parse tree visits. Instead, we use parent pointers and
1224 some hairy code in these two functions. */
1225 static reg_errcode_t
1226 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1227 void *extra)
1229 bin_tree_t *node, *prev;
1231 for (node = root; ; )
1233 /* Descend down the tree, preferably to the left (or to the right
1234 if that's the only child). */
1235 while (node->left || node->right)
1236 if (node->left)
1237 node = node->left;
1238 else
1239 node = node->right;
1243 reg_errcode_t err = fn (extra, node);
1244 if (__glibc_unlikely (err != REG_NOERROR))
1245 return err;
1246 if (node->parent == NULL)
1247 return REG_NOERROR;
1248 prev = node;
1249 node = node->parent;
1251 /* Go up while we have a node that is reached from the right. */
1252 while (node->right == prev || node->right == NULL);
1253 node = node->right;
1257 static reg_errcode_t
1258 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1259 void *extra)
1261 bin_tree_t *node;
1263 for (node = root; ; )
1265 reg_errcode_t err = fn (extra, node);
1266 if (__glibc_unlikely (err != REG_NOERROR))
1267 return err;
1269 /* Go to the left node, or up and to the right. */
1270 if (node->left)
1271 node = node->left;
1272 else
1274 bin_tree_t *prev = NULL;
1275 while (node->right == prev || node->right == NULL)
1277 prev = node;
1278 node = node->parent;
1279 if (!node)
1280 return REG_NOERROR;
1282 node = node->right;
1287 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1288 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1289 backreferences as well. Requires a preorder visit. */
1290 static reg_errcode_t
1291 optimize_subexps (void *extra, bin_tree_t *node)
1293 re_dfa_t *dfa = (re_dfa_t *) extra;
1295 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1297 int idx = node->token.opr.idx;
1298 node->token.opr.idx = dfa->subexp_map[idx];
1299 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1302 else if (node->token.type == SUBEXP
1303 && node->left && node->left->token.type == SUBEXP)
1305 Idx other_idx = node->left->token.opr.idx;
1307 node->left = node->left->left;
1308 if (node->left)
1309 node->left->parent = node;
1311 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1312 if (other_idx < BITSET_WORD_BITS)
1313 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1316 return REG_NOERROR;
1319 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1320 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1321 static reg_errcode_t
1322 lower_subexps (void *extra, bin_tree_t *node)
1324 regex_t *preg = (regex_t *) extra;
1325 reg_errcode_t err = REG_NOERROR;
1327 if (node->left && node->left->token.type == SUBEXP)
1329 node->left = lower_subexp (&err, preg, node->left);
1330 if (node->left)
1331 node->left->parent = node;
1333 if (node->right && node->right->token.type == SUBEXP)
1335 node->right = lower_subexp (&err, preg, node->right);
1336 if (node->right)
1337 node->right->parent = node;
1340 return err;
1343 static bin_tree_t *
1344 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1346 re_dfa_t *dfa = preg->buffer;
1347 bin_tree_t *body = node->left;
1348 bin_tree_t *op, *cls, *tree1, *tree;
1350 if (preg->no_sub
1351 /* We do not optimize empty subexpressions, because otherwise we may
1352 have bad CONCAT nodes with NULL children. This is obviously not
1353 very common, so we do not lose much. An example that triggers
1354 this case is the sed "script" /\(\)/x. */
1355 && node->left != NULL
1356 && (node->token.opr.idx >= BITSET_WORD_BITS
1357 || !(dfa->used_bkref_map
1358 & ((bitset_word_t) 1 << node->token.opr.idx))))
1359 return node->left;
1361 /* Convert the SUBEXP node to the concatenation of an
1362 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1363 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1364 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1365 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1366 tree = create_tree (dfa, op, tree1, CONCAT);
1367 if (__glibc_unlikely (tree == NULL || tree1 == NULL
1368 || op == NULL || cls == NULL))
1370 *err = REG_ESPACE;
1371 return NULL;
1374 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1375 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1376 return tree;
1379 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1380 nodes. Requires a postorder visit. */
1381 static reg_errcode_t
1382 calc_first (void *extra, bin_tree_t *node)
1384 re_dfa_t *dfa = (re_dfa_t *) extra;
1385 if (node->token.type == CONCAT)
1387 node->first = node->left->first;
1388 node->node_idx = node->left->node_idx;
1390 else
1392 node->first = node;
1393 node->node_idx = re_dfa_add_node (dfa, node->token);
1394 if (__glibc_unlikely (node->node_idx == -1))
1395 return REG_ESPACE;
1396 if (node->token.type == ANCHOR)
1397 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1399 return REG_NOERROR;
1402 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1403 static reg_errcode_t
1404 calc_next (void *extra, bin_tree_t *node)
1406 switch (node->token.type)
1408 case OP_DUP_ASTERISK:
1409 node->left->next = node;
1410 break;
1411 case CONCAT:
1412 node->left->next = node->right->first;
1413 node->right->next = node->next;
1414 break;
1415 default:
1416 if (node->left)
1417 node->left->next = node->next;
1418 if (node->right)
1419 node->right->next = node->next;
1420 break;
1422 return REG_NOERROR;
1425 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1426 static reg_errcode_t
1427 link_nfa_nodes (void *extra, bin_tree_t *node)
1429 re_dfa_t *dfa = (re_dfa_t *) extra;
1430 Idx idx = node->node_idx;
1431 reg_errcode_t err = REG_NOERROR;
1433 switch (node->token.type)
1435 case CONCAT:
1436 break;
1438 case END_OF_RE:
1439 DEBUG_ASSERT (node->next == NULL);
1440 break;
1442 case OP_DUP_ASTERISK:
1443 case OP_ALT:
1445 Idx left, right;
1446 dfa->has_plural_match = 1;
1447 if (node->left != NULL)
1448 left = node->left->first->node_idx;
1449 else
1450 left = node->next->node_idx;
1451 if (node->right != NULL)
1452 right = node->right->first->node_idx;
1453 else
1454 right = node->next->node_idx;
1455 DEBUG_ASSERT (left > -1);
1456 DEBUG_ASSERT (right > -1);
1457 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1459 break;
1461 case ANCHOR:
1462 case OP_OPEN_SUBEXP:
1463 case OP_CLOSE_SUBEXP:
1464 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1465 break;
1467 case OP_BACK_REF:
1468 dfa->nexts[idx] = node->next->node_idx;
1469 if (node->token.type == OP_BACK_REF)
1470 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1471 break;
1473 default:
1474 DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1475 dfa->nexts[idx] = node->next->node_idx;
1476 break;
1479 return err;
1482 /* Duplicate the epsilon closure of the node ROOT_NODE.
1483 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1484 to their own constraint. */
1486 static reg_errcode_t
1487 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1488 Idx root_node, unsigned int init_constraint)
1490 Idx org_node, clone_node;
1491 bool ok;
1492 unsigned int constraint = init_constraint;
1493 for (org_node = top_org_node, clone_node = top_clone_node;;)
1495 Idx org_dest, clone_dest;
1496 if (dfa->nodes[org_node].type == OP_BACK_REF)
1498 /* If the back reference epsilon-transit, its destination must
1499 also have the constraint. Then duplicate the epsilon closure
1500 of the destination of the back reference, and store it in
1501 edests of the back reference. */
1502 org_dest = dfa->nexts[org_node];
1503 re_node_set_empty (dfa->edests + clone_node);
1504 clone_dest = duplicate_node (dfa, org_dest, constraint);
1505 if (__glibc_unlikely (clone_dest == -1))
1506 return REG_ESPACE;
1507 dfa->nexts[clone_node] = dfa->nexts[org_node];
1508 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1509 if (__glibc_unlikely (! ok))
1510 return REG_ESPACE;
1512 else if (dfa->edests[org_node].nelem == 0)
1514 /* In case of the node can't epsilon-transit, don't duplicate the
1515 destination and store the original destination as the
1516 destination of the node. */
1517 dfa->nexts[clone_node] = dfa->nexts[org_node];
1518 break;
1520 else if (dfa->edests[org_node].nelem == 1)
1522 /* In case of the node can epsilon-transit, and it has only one
1523 destination. */
1524 org_dest = dfa->edests[org_node].elems[0];
1525 re_node_set_empty (dfa->edests + clone_node);
1526 /* If the node is root_node itself, it means the epsilon closure
1527 has a loop. Then tie it to the destination of the root_node. */
1528 if (org_node == root_node && clone_node != org_node)
1530 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1531 if (__glibc_unlikely (! ok))
1532 return REG_ESPACE;
1533 break;
1535 /* In case the node has another constraint, append it. */
1536 constraint |= dfa->nodes[org_node].constraint;
1537 clone_dest = duplicate_node (dfa, org_dest, constraint);
1538 if (__glibc_unlikely (clone_dest == -1))
1539 return REG_ESPACE;
1540 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1541 if (__glibc_unlikely (! ok))
1542 return REG_ESPACE;
1544 else /* dfa->edests[org_node].nelem == 2 */
1546 /* In case of the node can epsilon-transit, and it has two
1547 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1548 org_dest = dfa->edests[org_node].elems[0];
1549 re_node_set_empty (dfa->edests + clone_node);
1550 /* Search for a duplicated node which satisfies the constraint. */
1551 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1552 if (clone_dest == -1)
1554 /* There is no such duplicated node, create a new one. */
1555 reg_errcode_t err;
1556 clone_dest = duplicate_node (dfa, org_dest, constraint);
1557 if (__glibc_unlikely (clone_dest == -1))
1558 return REG_ESPACE;
1559 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1560 if (__glibc_unlikely (! ok))
1561 return REG_ESPACE;
1562 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1563 root_node, constraint);
1564 if (__glibc_unlikely (err != REG_NOERROR))
1565 return err;
1567 else
1569 /* There is a duplicated node which satisfies the constraint,
1570 use it to avoid infinite loop. */
1571 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1572 if (__glibc_unlikely (! ok))
1573 return REG_ESPACE;
1576 org_dest = dfa->edests[org_node].elems[1];
1577 clone_dest = duplicate_node (dfa, org_dest, constraint);
1578 if (__glibc_unlikely (clone_dest == -1))
1579 return REG_ESPACE;
1580 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1581 if (__glibc_unlikely (! ok))
1582 return REG_ESPACE;
1584 org_node = org_dest;
1585 clone_node = clone_dest;
1587 return REG_NOERROR;
1590 /* Search for a node which is duplicated from the node ORG_NODE, and
1591 satisfies the constraint CONSTRAINT. */
1593 static Idx
1594 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1595 unsigned int constraint)
1597 Idx idx;
1598 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1600 if (org_node == dfa->org_indices[idx]
1601 && constraint == dfa->nodes[idx].constraint)
1602 return idx; /* Found. */
1604 return -1; /* Not found. */
1607 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1608 Return the index of the new node, or -1 if insufficient storage is
1609 available. */
1611 static Idx
1612 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1614 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1615 if (__glibc_likely (dup_idx != -1))
1617 dfa->nodes[dup_idx].constraint = constraint;
1618 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1619 dfa->nodes[dup_idx].duplicated = 1;
1621 /* Store the index of the original node. */
1622 dfa->org_indices[dup_idx] = org_idx;
1624 return dup_idx;
1627 static reg_errcode_t
1628 calc_inveclosure (re_dfa_t *dfa)
1630 Idx src, idx;
1631 bool ok;
1632 for (idx = 0; idx < dfa->nodes_len; ++idx)
1633 re_node_set_init_empty (dfa->inveclosures + idx);
1635 for (src = 0; src < dfa->nodes_len; ++src)
1637 Idx *elems = dfa->eclosures[src].elems;
1638 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1640 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1641 if (__glibc_unlikely (! ok))
1642 return REG_ESPACE;
1646 return REG_NOERROR;
1649 /* Calculate "eclosure" for all the node in DFA. */
1651 static reg_errcode_t
1652 calc_eclosure (re_dfa_t *dfa)
1654 Idx node_idx;
1655 bool incomplete;
1656 DEBUG_ASSERT (dfa->nodes_len > 0);
1657 incomplete = false;
1658 /* For each nodes, calculate epsilon closure. */
1659 for (node_idx = 0; ; ++node_idx)
1661 reg_errcode_t err;
1662 re_node_set eclosure_elem;
1663 if (node_idx == dfa->nodes_len)
1665 if (!incomplete)
1666 break;
1667 incomplete = false;
1668 node_idx = 0;
1671 DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
1673 /* If we have already calculated, skip it. */
1674 if (dfa->eclosures[node_idx].nelem != 0)
1675 continue;
1676 /* Calculate epsilon closure of 'node_idx'. */
1677 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1678 if (__glibc_unlikely (err != REG_NOERROR))
1679 return err;
1681 if (dfa->eclosures[node_idx].nelem == 0)
1683 incomplete = true;
1684 re_node_set_free (&eclosure_elem);
1687 return REG_NOERROR;
1690 /* Calculate epsilon closure of NODE. */
1692 static reg_errcode_t
1693 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1695 reg_errcode_t err;
1696 Idx i;
1697 re_node_set eclosure;
1698 bool ok;
1699 bool incomplete = false;
1700 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1701 if (__glibc_unlikely (err != REG_NOERROR))
1702 return err;
1704 /* This indicates that we are calculating this node now.
1705 We reference this value to avoid infinite loop. */
1706 dfa->eclosures[node].nelem = -1;
1708 /* If the current node has constraints, duplicate all nodes
1709 since they must inherit the constraints. */
1710 if (dfa->nodes[node].constraint
1711 && dfa->edests[node].nelem
1712 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1714 err = duplicate_node_closure (dfa, node, node, node,
1715 dfa->nodes[node].constraint);
1716 if (__glibc_unlikely (err != REG_NOERROR))
1717 return err;
1720 /* Expand each epsilon destination nodes. */
1721 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1722 for (i = 0; i < dfa->edests[node].nelem; ++i)
1724 re_node_set eclosure_elem;
1725 Idx edest = dfa->edests[node].elems[i];
1726 /* If calculating the epsilon closure of 'edest' is in progress,
1727 return intermediate result. */
1728 if (dfa->eclosures[edest].nelem == -1)
1730 incomplete = true;
1731 continue;
1733 /* If we haven't calculated the epsilon closure of 'edest' yet,
1734 calculate now. Otherwise use calculated epsilon closure. */
1735 if (dfa->eclosures[edest].nelem == 0)
1737 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1738 if (__glibc_unlikely (err != REG_NOERROR))
1739 return err;
1741 else
1742 eclosure_elem = dfa->eclosures[edest];
1743 /* Merge the epsilon closure of 'edest'. */
1744 err = re_node_set_merge (&eclosure, &eclosure_elem);
1745 if (__glibc_unlikely (err != REG_NOERROR))
1746 return err;
1747 /* If the epsilon closure of 'edest' is incomplete,
1748 the epsilon closure of this node is also incomplete. */
1749 if (dfa->eclosures[edest].nelem == 0)
1751 incomplete = true;
1752 re_node_set_free (&eclosure_elem);
1756 /* An epsilon closure includes itself. */
1757 ok = re_node_set_insert (&eclosure, node);
1758 if (__glibc_unlikely (! ok))
1759 return REG_ESPACE;
1760 if (incomplete && !root)
1761 dfa->eclosures[node].nelem = 0;
1762 else
1763 dfa->eclosures[node] = eclosure;
1764 *new_set = eclosure;
1765 return REG_NOERROR;
1768 /* Functions for token which are used in the parser. */
1770 /* Fetch a token from INPUT.
1771 We must not use this function inside bracket expressions. */
1773 static void
1774 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1776 re_string_skip_bytes (input, peek_token (result, input, syntax));
1779 /* Peek a token from INPUT, and return the length of the token.
1780 We must not use this function inside bracket expressions. */
1782 static int
1783 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1785 unsigned char c;
1787 if (re_string_eoi (input))
1789 token->type = END_OF_RE;
1790 return 0;
1793 c = re_string_peek_byte (input, 0);
1794 token->opr.c = c;
1796 token->word_char = 0;
1797 #ifdef RE_ENABLE_I18N
1798 token->mb_partial = 0;
1799 if (input->mb_cur_max > 1
1800 && !re_string_first_byte (input, re_string_cur_idx (input)))
1802 token->type = CHARACTER;
1803 token->mb_partial = 1;
1804 return 1;
1806 #endif
1807 if (c == '\\')
1809 unsigned char c2;
1810 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1812 token->type = BACK_SLASH;
1813 return 1;
1816 c2 = re_string_peek_byte_case (input, 1);
1817 token->opr.c = c2;
1818 token->type = CHARACTER;
1819 #ifdef RE_ENABLE_I18N
1820 if (input->mb_cur_max > 1)
1822 wint_t wc = re_string_wchar_at (input,
1823 re_string_cur_idx (input) + 1);
1824 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1826 else
1827 #endif
1828 token->word_char = IS_WORD_CHAR (c2) != 0;
1830 switch (c2)
1832 case '|':
1833 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1834 token->type = OP_ALT;
1835 break;
1836 case '1': case '2': case '3': case '4': case '5':
1837 case '6': case '7': case '8': case '9':
1838 if (!(syntax & RE_NO_BK_REFS))
1840 token->type = OP_BACK_REF;
1841 token->opr.idx = c2 - '1';
1843 break;
1844 case '<':
1845 if (!(syntax & RE_NO_GNU_OPS))
1847 token->type = ANCHOR;
1848 token->opr.ctx_type = WORD_FIRST;
1850 break;
1851 case '>':
1852 if (!(syntax & RE_NO_GNU_OPS))
1854 token->type = ANCHOR;
1855 token->opr.ctx_type = WORD_LAST;
1857 break;
1858 case 'b':
1859 if (!(syntax & RE_NO_GNU_OPS))
1861 token->type = ANCHOR;
1862 token->opr.ctx_type = WORD_DELIM;
1864 break;
1865 case 'B':
1866 if (!(syntax & RE_NO_GNU_OPS))
1868 token->type = ANCHOR;
1869 token->opr.ctx_type = NOT_WORD_DELIM;
1871 break;
1872 case 'w':
1873 if (!(syntax & RE_NO_GNU_OPS))
1874 token->type = OP_WORD;
1875 break;
1876 case 'W':
1877 if (!(syntax & RE_NO_GNU_OPS))
1878 token->type = OP_NOTWORD;
1879 break;
1880 case 's':
1881 if (!(syntax & RE_NO_GNU_OPS))
1882 token->type = OP_SPACE;
1883 break;
1884 case 'S':
1885 if (!(syntax & RE_NO_GNU_OPS))
1886 token->type = OP_NOTSPACE;
1887 break;
1888 case '`':
1889 if (!(syntax & RE_NO_GNU_OPS))
1891 token->type = ANCHOR;
1892 token->opr.ctx_type = BUF_FIRST;
1894 break;
1895 case '\'':
1896 if (!(syntax & RE_NO_GNU_OPS))
1898 token->type = ANCHOR;
1899 token->opr.ctx_type = BUF_LAST;
1901 break;
1902 case '(':
1903 if (!(syntax & RE_NO_BK_PARENS))
1904 token->type = OP_OPEN_SUBEXP;
1905 break;
1906 case ')':
1907 if (!(syntax & RE_NO_BK_PARENS))
1908 token->type = OP_CLOSE_SUBEXP;
1909 break;
1910 case '+':
1911 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1912 token->type = OP_DUP_PLUS;
1913 break;
1914 case '?':
1915 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1916 token->type = OP_DUP_QUESTION;
1917 break;
1918 case '{':
1919 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1920 token->type = OP_OPEN_DUP_NUM;
1921 break;
1922 case '}':
1923 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1924 token->type = OP_CLOSE_DUP_NUM;
1925 break;
1926 default:
1927 break;
1929 return 2;
1932 token->type = CHARACTER;
1933 #ifdef RE_ENABLE_I18N
1934 if (input->mb_cur_max > 1)
1936 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1937 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1939 else
1940 #endif
1941 token->word_char = IS_WORD_CHAR (token->opr.c);
1943 switch (c)
1945 case '\n':
1946 if (syntax & RE_NEWLINE_ALT)
1947 token->type = OP_ALT;
1948 break;
1949 case '|':
1950 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1951 token->type = OP_ALT;
1952 break;
1953 case '*':
1954 token->type = OP_DUP_ASTERISK;
1955 break;
1956 case '+':
1957 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1958 token->type = OP_DUP_PLUS;
1959 break;
1960 case '?':
1961 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1962 token->type = OP_DUP_QUESTION;
1963 break;
1964 case '{':
1965 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1966 token->type = OP_OPEN_DUP_NUM;
1967 break;
1968 case '}':
1969 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1970 token->type = OP_CLOSE_DUP_NUM;
1971 break;
1972 case '(':
1973 if (syntax & RE_NO_BK_PARENS)
1974 token->type = OP_OPEN_SUBEXP;
1975 break;
1976 case ')':
1977 if (syntax & RE_NO_BK_PARENS)
1978 token->type = OP_CLOSE_SUBEXP;
1979 break;
1980 case '[':
1981 token->type = OP_OPEN_BRACKET;
1982 break;
1983 case '.':
1984 token->type = OP_PERIOD;
1985 break;
1986 case '^':
1987 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1988 && re_string_cur_idx (input) != 0)
1990 char prev = re_string_peek_byte (input, -1);
1991 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1992 break;
1994 token->type = ANCHOR;
1995 token->opr.ctx_type = LINE_FIRST;
1996 break;
1997 case '$':
1998 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
1999 && re_string_cur_idx (input) + 1 != re_string_length (input))
2001 re_token_t next;
2002 re_string_skip_bytes (input, 1);
2003 peek_token (&next, input, syntax);
2004 re_string_skip_bytes (input, -1);
2005 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2006 break;
2008 token->type = ANCHOR;
2009 token->opr.ctx_type = LINE_LAST;
2010 break;
2011 default:
2012 break;
2014 return 1;
2017 /* Peek a token from INPUT, and return the length of the token.
2018 We must not use this function out of bracket expressions. */
2020 static int
2021 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2023 unsigned char c;
2024 if (re_string_eoi (input))
2026 token->type = END_OF_RE;
2027 return 0;
2029 c = re_string_peek_byte (input, 0);
2030 token->opr.c = c;
2032 #ifdef RE_ENABLE_I18N
2033 if (input->mb_cur_max > 1
2034 && !re_string_first_byte (input, re_string_cur_idx (input)))
2036 token->type = CHARACTER;
2037 return 1;
2039 #endif /* RE_ENABLE_I18N */
2041 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2042 && re_string_cur_idx (input) + 1 < re_string_length (input))
2044 /* In this case, '\' escape a character. */
2045 unsigned char c2;
2046 re_string_skip_bytes (input, 1);
2047 c2 = re_string_peek_byte (input, 0);
2048 token->opr.c = c2;
2049 token->type = CHARACTER;
2050 return 1;
2052 if (c == '[') /* '[' is a special char in a bracket exps. */
2054 unsigned char c2;
2055 int token_len;
2056 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2057 c2 = re_string_peek_byte (input, 1);
2058 else
2059 c2 = 0;
2060 token->opr.c = c2;
2061 token_len = 2;
2062 switch (c2)
2064 case '.':
2065 token->type = OP_OPEN_COLL_ELEM;
2066 break;
2068 case '=':
2069 token->type = OP_OPEN_EQUIV_CLASS;
2070 break;
2072 case ':':
2073 if (syntax & RE_CHAR_CLASSES)
2075 token->type = OP_OPEN_CHAR_CLASS;
2076 break;
2078 FALLTHROUGH;
2079 default:
2080 token->type = CHARACTER;
2081 token->opr.c = c;
2082 token_len = 1;
2083 break;
2085 return token_len;
2087 switch (c)
2089 case '-':
2090 token->type = OP_CHARSET_RANGE;
2091 break;
2092 case ']':
2093 token->type = OP_CLOSE_BRACKET;
2094 break;
2095 case '^':
2096 token->type = OP_NON_MATCH_LIST;
2097 break;
2098 default:
2099 token->type = CHARACTER;
2101 return 1;
2104 /* Functions for parser. */
2106 /* Entry point of the parser.
2107 Parse the regular expression REGEXP and return the structure tree.
2108 If an error occurs, ERR is set by error code, and return NULL.
2109 This function build the following tree, from regular expression <reg_exp>:
2113 <reg_exp> EOR
2115 CAT means concatenation.
2116 EOR means end of regular expression. */
2118 static bin_tree_t *
2119 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2120 reg_errcode_t *err)
2122 re_dfa_t *dfa = preg->buffer;
2123 bin_tree_t *tree, *eor, *root;
2124 re_token_t current_token;
2125 dfa->syntax = syntax;
2126 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2127 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2128 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2129 return NULL;
2130 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2131 if (tree != NULL)
2132 root = create_tree (dfa, tree, eor, CONCAT);
2133 else
2134 root = eor;
2135 if (__glibc_unlikely (eor == NULL || root == NULL))
2137 *err = REG_ESPACE;
2138 return NULL;
2140 return root;
2143 /* This function build the following tree, from regular expression
2144 <branch1>|<branch2>:
2148 <branch1> <branch2>
2150 ALT means alternative, which represents the operator '|'. */
2152 static bin_tree_t *
2153 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2154 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2156 re_dfa_t *dfa = preg->buffer;
2157 bin_tree_t *tree, *branch = NULL;
2158 bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2159 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2160 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2161 return NULL;
2163 while (token->type == OP_ALT)
2165 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2166 if (token->type != OP_ALT && token->type != END_OF_RE
2167 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2169 bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2170 dfa->completed_bkref_map = initial_bkref_map;
2171 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2172 if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2174 if (tree != NULL)
2175 postorder (tree, free_tree, NULL);
2176 return NULL;
2178 dfa->completed_bkref_map |= accumulated_bkref_map;
2180 else
2181 branch = NULL;
2182 tree = create_tree (dfa, tree, branch, OP_ALT);
2183 if (__glibc_unlikely (tree == NULL))
2185 *err = REG_ESPACE;
2186 return NULL;
2189 return tree;
2192 /* This function build the following tree, from regular expression
2193 <exp1><exp2>:
2197 <exp1> <exp2>
2199 CAT means concatenation. */
2201 static bin_tree_t *
2202 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2203 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2205 bin_tree_t *tree, *expr;
2206 re_dfa_t *dfa = preg->buffer;
2207 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2208 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2209 return NULL;
2211 while (token->type != OP_ALT && token->type != END_OF_RE
2212 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2214 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2215 if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2217 if (tree != NULL)
2218 postorder (tree, free_tree, NULL);
2219 return NULL;
2221 if (tree != NULL && expr != NULL)
2223 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2224 if (newtree == NULL)
2226 postorder (expr, free_tree, NULL);
2227 postorder (tree, free_tree, NULL);
2228 *err = REG_ESPACE;
2229 return NULL;
2231 tree = newtree;
2233 else if (tree == NULL)
2234 tree = expr;
2235 /* Otherwise expr == NULL, we don't need to create new tree. */
2237 return tree;
2240 /* This function build the following tree, from regular expression a*:
2246 static bin_tree_t *
2247 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2248 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2250 re_dfa_t *dfa = preg->buffer;
2251 bin_tree_t *tree;
2252 switch (token->type)
2254 case CHARACTER:
2255 tree = create_token_tree (dfa, NULL, NULL, token);
2256 if (__glibc_unlikely (tree == NULL))
2258 *err = REG_ESPACE;
2259 return NULL;
2261 #ifdef RE_ENABLE_I18N
2262 if (dfa->mb_cur_max > 1)
2264 while (!re_string_eoi (regexp)
2265 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2267 bin_tree_t *mbc_remain;
2268 fetch_token (token, regexp, syntax);
2269 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2270 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2271 if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
2273 *err = REG_ESPACE;
2274 return NULL;
2278 #endif
2279 break;
2281 case OP_OPEN_SUBEXP:
2282 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2283 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2284 return NULL;
2285 break;
2287 case OP_OPEN_BRACKET:
2288 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2289 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2290 return NULL;
2291 break;
2293 case OP_BACK_REF:
2294 if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2296 *err = REG_ESUBREG;
2297 return NULL;
2299 dfa->used_bkref_map |= 1 << token->opr.idx;
2300 tree = create_token_tree (dfa, NULL, NULL, token);
2301 if (__glibc_unlikely (tree == NULL))
2303 *err = REG_ESPACE;
2304 return NULL;
2306 ++dfa->nbackref;
2307 dfa->has_mb_node = 1;
2308 break;
2310 case OP_OPEN_DUP_NUM:
2311 if (syntax & RE_CONTEXT_INVALID_DUP)
2313 *err = REG_BADRPT;
2314 return NULL;
2316 FALLTHROUGH;
2317 case OP_DUP_ASTERISK:
2318 case OP_DUP_PLUS:
2319 case OP_DUP_QUESTION:
2320 if (syntax & RE_CONTEXT_INVALID_OPS)
2322 *err = REG_BADRPT;
2323 return NULL;
2325 else if (syntax & RE_CONTEXT_INDEP_OPS)
2327 fetch_token (token, regexp, syntax);
2328 return parse_expression (regexp, preg, token, syntax, nest, err);
2330 FALLTHROUGH;
2331 case OP_CLOSE_SUBEXP:
2332 if ((token->type == OP_CLOSE_SUBEXP)
2333 && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2335 *err = REG_ERPAREN;
2336 return NULL;
2338 FALLTHROUGH;
2339 case OP_CLOSE_DUP_NUM:
2340 /* We treat it as a normal character. */
2342 /* Then we can these characters as normal characters. */
2343 token->type = CHARACTER;
2344 /* mb_partial and word_char bits should be initialized already
2345 by peek_token. */
2346 tree = create_token_tree (dfa, NULL, NULL, token);
2347 if (__glibc_unlikely (tree == NULL))
2349 *err = REG_ESPACE;
2350 return NULL;
2352 break;
2354 case ANCHOR:
2355 if ((token->opr.ctx_type
2356 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2357 && dfa->word_ops_used == 0)
2358 init_word_char (dfa);
2359 if (token->opr.ctx_type == WORD_DELIM
2360 || token->opr.ctx_type == NOT_WORD_DELIM)
2362 bin_tree_t *tree_first, *tree_last;
2363 if (token->opr.ctx_type == WORD_DELIM)
2365 token->opr.ctx_type = WORD_FIRST;
2366 tree_first = create_token_tree (dfa, NULL, NULL, token);
2367 token->opr.ctx_type = WORD_LAST;
2369 else
2371 token->opr.ctx_type = INSIDE_WORD;
2372 tree_first = create_token_tree (dfa, NULL, NULL, token);
2373 token->opr.ctx_type = INSIDE_NOTWORD;
2375 tree_last = create_token_tree (dfa, NULL, NULL, token);
2376 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2377 if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2378 || tree == NULL))
2380 *err = REG_ESPACE;
2381 return NULL;
2384 else
2386 tree = create_token_tree (dfa, NULL, NULL, token);
2387 if (__glibc_unlikely (tree == NULL))
2389 *err = REG_ESPACE;
2390 return NULL;
2393 /* We must return here, since ANCHORs can't be followed
2394 by repetition operators.
2395 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2396 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2397 fetch_token (token, regexp, syntax);
2398 return tree;
2400 case OP_PERIOD:
2401 tree = create_token_tree (dfa, NULL, NULL, token);
2402 if (__glibc_unlikely (tree == NULL))
2404 *err = REG_ESPACE;
2405 return NULL;
2407 if (dfa->mb_cur_max > 1)
2408 dfa->has_mb_node = 1;
2409 break;
2411 case OP_WORD:
2412 case OP_NOTWORD:
2413 tree = build_charclass_op (dfa, regexp->trans,
2414 "alnum",
2415 "_",
2416 token->type == OP_NOTWORD, err);
2417 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2418 return NULL;
2419 break;
2421 case OP_SPACE:
2422 case OP_NOTSPACE:
2423 tree = build_charclass_op (dfa, regexp->trans,
2424 "space",
2426 token->type == OP_NOTSPACE, err);
2427 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2428 return NULL;
2429 break;
2431 case OP_ALT:
2432 case END_OF_RE:
2433 return NULL;
2435 case BACK_SLASH:
2436 *err = REG_EESCAPE;
2437 return NULL;
2439 default:
2440 /* Must not happen? */
2441 DEBUG_ASSERT (false);
2442 return NULL;
2444 fetch_token (token, regexp, syntax);
2446 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2447 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2449 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2450 syntax, err);
2451 if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2453 if (tree != NULL)
2454 postorder (tree, free_tree, NULL);
2455 return NULL;
2457 tree = dup_tree;
2458 /* In BRE consecutive duplications are not allowed. */
2459 if ((syntax & RE_CONTEXT_INVALID_DUP)
2460 && (token->type == OP_DUP_ASTERISK
2461 || token->type == OP_OPEN_DUP_NUM))
2463 if (tree != NULL)
2464 postorder (tree, free_tree, NULL);
2465 *err = REG_BADRPT;
2466 return NULL;
2470 return tree;
2473 /* This function build the following tree, from regular expression
2474 (<reg_exp>):
2475 SUBEXP
2477 <reg_exp>
2480 static bin_tree_t *
2481 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2482 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2484 re_dfa_t *dfa = preg->buffer;
2485 bin_tree_t *tree;
2486 size_t cur_nsub;
2487 cur_nsub = preg->re_nsub++;
2489 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2491 /* The subexpression may be a null string. */
2492 if (token->type == OP_CLOSE_SUBEXP)
2493 tree = NULL;
2494 else
2496 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2497 if (__glibc_unlikely (*err == REG_NOERROR
2498 && token->type != OP_CLOSE_SUBEXP))
2500 if (tree != NULL)
2501 postorder (tree, free_tree, NULL);
2502 *err = REG_EPAREN;
2504 if (__glibc_unlikely (*err != REG_NOERROR))
2505 return NULL;
2508 if (cur_nsub <= '9' - '1')
2509 dfa->completed_bkref_map |= 1 << cur_nsub;
2511 tree = create_tree (dfa, tree, NULL, SUBEXP);
2512 if (__glibc_unlikely (tree == NULL))
2514 *err = REG_ESPACE;
2515 return NULL;
2517 tree->token.opr.idx = cur_nsub;
2518 return tree;
2521 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2523 static bin_tree_t *
2524 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2525 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2527 bin_tree_t *tree = NULL, *old_tree = NULL;
2528 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2529 re_token_t start_token = *token;
2531 if (token->type == OP_OPEN_DUP_NUM)
2533 end = 0;
2534 start = fetch_number (regexp, token, syntax);
2535 if (start == -1)
2537 if (token->type == CHARACTER && token->opr.c == ',')
2538 start = 0; /* We treat "{,m}" as "{0,m}". */
2539 else
2541 *err = REG_BADBR; /* <re>{} is invalid. */
2542 return NULL;
2545 if (__glibc_likely (start != -2))
2547 /* We treat "{n}" as "{n,n}". */
2548 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2549 : ((token->type == CHARACTER && token->opr.c == ',')
2550 ? fetch_number (regexp, token, syntax) : -2));
2552 if (__glibc_unlikely (start == -2 || end == -2))
2554 /* Invalid sequence. */
2555 if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2557 if (token->type == END_OF_RE)
2558 *err = REG_EBRACE;
2559 else
2560 *err = REG_BADBR;
2562 return NULL;
2565 /* If the syntax bit is set, rollback. */
2566 re_string_set_index (regexp, start_idx);
2567 *token = start_token;
2568 token->type = CHARACTER;
2569 /* mb_partial and word_char bits should be already initialized by
2570 peek_token. */
2571 return elem;
2574 if (__glibc_unlikely ((end != -1 && start > end)
2575 || token->type != OP_CLOSE_DUP_NUM))
2577 /* First number greater than second. */
2578 *err = REG_BADBR;
2579 return NULL;
2582 if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2584 *err = REG_ESIZE;
2585 return NULL;
2588 else
2590 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2591 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2594 fetch_token (token, regexp, syntax);
2596 if (__glibc_unlikely (elem == NULL))
2597 return NULL;
2598 if (__glibc_unlikely (start == 0 && end == 0))
2600 postorder (elem, free_tree, NULL);
2601 return NULL;
2604 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2605 if (__glibc_unlikely (start > 0))
2607 tree = elem;
2608 for (i = 2; i <= start; ++i)
2610 elem = duplicate_tree (elem, dfa);
2611 tree = create_tree (dfa, tree, elem, CONCAT);
2612 if (__glibc_unlikely (elem == NULL || tree == NULL))
2613 goto parse_dup_op_espace;
2616 if (start == end)
2617 return tree;
2619 /* Duplicate ELEM before it is marked optional. */
2620 elem = duplicate_tree (elem, dfa);
2621 if (__glibc_unlikely (elem == NULL))
2622 goto parse_dup_op_espace;
2623 old_tree = tree;
2625 else
2626 old_tree = NULL;
2628 if (elem->token.type == SUBEXP)
2630 uintptr_t subidx = elem->token.opr.idx;
2631 postorder (elem, mark_opt_subexp, (void *) subidx);
2634 tree = create_tree (dfa, elem, NULL,
2635 (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2636 if (__glibc_unlikely (tree == NULL))
2637 goto parse_dup_op_espace;
2639 /* This loop is actually executed only when end != -1,
2640 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2641 already created the start+1-th copy. */
2642 if (TYPE_SIGNED (Idx) || end != -1)
2643 for (i = start + 2; i <= end; ++i)
2645 elem = duplicate_tree (elem, dfa);
2646 tree = create_tree (dfa, tree, elem, CONCAT);
2647 if (__glibc_unlikely (elem == NULL || tree == NULL))
2648 goto parse_dup_op_espace;
2650 tree = create_tree (dfa, tree, NULL, OP_ALT);
2651 if (__glibc_unlikely (tree == NULL))
2652 goto parse_dup_op_espace;
2655 if (old_tree)
2656 tree = create_tree (dfa, old_tree, tree, CONCAT);
2658 return tree;
2660 parse_dup_op_espace:
2661 *err = REG_ESPACE;
2662 return NULL;
2665 /* Size of the names for collating symbol/equivalence_class/character_class.
2666 I'm not sure, but maybe enough. */
2667 #define BRACKET_NAME_BUF_SIZE 32
2669 #ifndef _LIBC
2671 # ifdef RE_ENABLE_I18N
2672 /* Convert the byte B to the corresponding wide character. In a
2673 unibyte locale, treat B as itself. In a multibyte locale, return
2674 WEOF if B is an encoding error. */
2675 static wint_t
2676 parse_byte (unsigned char b, re_charset_t *mbcset)
2678 return mbcset == NULL ? b : __btowc (b);
2680 # endif
2682 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2683 Build the range expression which starts from START_ELEM, and ends
2684 at END_ELEM. The result are written to MBCSET and SBCSET.
2685 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2686 mbcset->range_ends, is a pointer argument since we may
2687 update it. */
2689 static reg_errcode_t
2690 # ifdef RE_ENABLE_I18N
2691 build_range_exp (const reg_syntax_t syntax,
2692 bitset_t sbcset,
2693 re_charset_t *mbcset,
2694 Idx *range_alloc,
2695 const bracket_elem_t *start_elem,
2696 const bracket_elem_t *end_elem)
2697 # else /* not RE_ENABLE_I18N */
2698 build_range_exp (const reg_syntax_t syntax,
2699 bitset_t sbcset,
2700 const bracket_elem_t *start_elem,
2701 const bracket_elem_t *end_elem)
2702 # endif /* not RE_ENABLE_I18N */
2704 unsigned int start_ch, end_ch;
2705 /* Equivalence Classes and Character Classes can't be a range start/end. */
2706 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2707 || start_elem->type == CHAR_CLASS
2708 || end_elem->type == EQUIV_CLASS
2709 || end_elem->type == CHAR_CLASS))
2710 return REG_ERANGE;
2712 /* We can handle no multi character collating elements without libc
2713 support. */
2714 if (__glibc_unlikely ((start_elem->type == COLL_SYM
2715 && strlen ((char *) start_elem->opr.name) > 1)
2716 || (end_elem->type == COLL_SYM
2717 && strlen ((char *) end_elem->opr.name) > 1)))
2718 return REG_ECOLLATE;
2720 # ifdef RE_ENABLE_I18N
2722 wchar_t wc;
2723 wint_t start_wc;
2724 wint_t end_wc;
2726 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2727 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2728 : 0));
2729 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2730 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2731 : 0));
2732 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2733 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2734 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2735 ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2736 if (start_wc == WEOF || end_wc == WEOF)
2737 return REG_ECOLLATE;
2738 else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2739 && start_wc > end_wc))
2740 return REG_ERANGE;
2742 /* Got valid collation sequence values, add them as a new entry.
2743 However, for !_LIBC we have no collation elements: if the
2744 character set is single byte, the single byte character set
2745 that we build below suffices. parse_bracket_exp passes
2746 no MBCSET if dfa->mb_cur_max == 1. */
2747 if (mbcset)
2749 /* Check the space of the arrays. */
2750 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2752 /* There is not enough space, need realloc. */
2753 wchar_t *new_array_start, *new_array_end;
2754 Idx new_nranges;
2756 /* +1 in case of mbcset->nranges is 0. */
2757 new_nranges = 2 * mbcset->nranges + 1;
2758 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2759 are NULL if *range_alloc == 0. */
2760 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2761 new_nranges);
2762 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2763 new_nranges);
2765 if (__glibc_unlikely (new_array_start == NULL
2766 || new_array_end == NULL))
2768 re_free (new_array_start);
2769 re_free (new_array_end);
2770 return REG_ESPACE;
2773 mbcset->range_starts = new_array_start;
2774 mbcset->range_ends = new_array_end;
2775 *range_alloc = new_nranges;
2778 mbcset->range_starts[mbcset->nranges] = start_wc;
2779 mbcset->range_ends[mbcset->nranges++] = end_wc;
2782 /* Build the table for single byte characters. */
2783 for (wc = 0; wc < SBC_MAX; ++wc)
2785 if (start_wc <= wc && wc <= end_wc)
2786 bitset_set (sbcset, wc);
2789 # else /* not RE_ENABLE_I18N */
2791 unsigned int ch;
2792 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2793 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2794 : 0));
2795 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2796 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2797 : 0));
2798 if (start_ch > end_ch)
2799 return REG_ERANGE;
2800 /* Build the table for single byte characters. */
2801 for (ch = 0; ch < SBC_MAX; ++ch)
2802 if (start_ch <= ch && ch <= end_ch)
2803 bitset_set (sbcset, ch);
2805 # endif /* not RE_ENABLE_I18N */
2806 return REG_NOERROR;
2808 #endif /* not _LIBC */
2810 #ifndef _LIBC
2811 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2812 Build the collating element which is represented by NAME.
2813 The result are written to MBCSET and SBCSET.
2814 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2815 pointer argument since we may update it. */
2817 static reg_errcode_t
2818 # ifdef RE_ENABLE_I18N
2819 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2820 Idx *coll_sym_alloc, const unsigned char *name)
2821 # else /* not RE_ENABLE_I18N */
2822 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2823 # endif /* not RE_ENABLE_I18N */
2825 size_t name_len = strlen ((const char *) name);
2826 if (__glibc_unlikely (name_len != 1))
2827 return REG_ECOLLATE;
2828 else
2830 bitset_set (sbcset, name[0]);
2831 return REG_NOERROR;
2834 #endif /* not _LIBC */
2836 /* This function parse bracket expression like "[abc]", "[a-c]",
2837 "[[.a-a.]]" etc. */
2839 static bin_tree_t *
2840 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2841 reg_syntax_t syntax, reg_errcode_t *err)
2843 #ifdef _LIBC
2844 const unsigned char *collseqmb;
2845 const char *collseqwc;
2846 uint32_t nrules;
2847 int32_t table_size;
2848 const int32_t *symb_table;
2849 const unsigned char *extra;
2851 /* Local function for parse_bracket_exp used in _LIBC environment.
2852 Seek the collating symbol entry corresponding to NAME.
2853 Return the index of the symbol in the SYMB_TABLE,
2854 or -1 if not found. */
2856 auto inline int32_t
2857 __attribute__ ((always_inline))
2858 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2860 int32_t elem;
2862 for (elem = 0; elem < table_size; elem++)
2863 if (symb_table[2 * elem] != 0)
2865 int32_t idx = symb_table[2 * elem + 1];
2866 /* Skip the name of collating element name. */
2867 idx += 1 + extra[idx];
2868 if (/* Compare the length of the name. */
2869 name_len == extra[idx]
2870 /* Compare the name. */
2871 && memcmp (name, &extra[idx + 1], name_len) == 0)
2872 /* Yep, this is the entry. */
2873 return elem;
2875 return -1;
2878 /* Local function for parse_bracket_exp used in _LIBC environment.
2879 Look up the collation sequence value of BR_ELEM.
2880 Return the value if succeeded, UINT_MAX otherwise. */
2882 auto inline unsigned int
2883 __attribute__ ((always_inline))
2884 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2886 if (br_elem->type == SB_CHAR)
2889 if (MB_CUR_MAX == 1)
2891 if (nrules == 0)
2892 return collseqmb[br_elem->opr.ch];
2893 else
2895 wint_t wc = __btowc (br_elem->opr.ch);
2896 return __collseq_table_lookup (collseqwc, wc);
2899 else if (br_elem->type == MB_CHAR)
2901 if (nrules != 0)
2902 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2904 else if (br_elem->type == COLL_SYM)
2906 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2907 if (nrules != 0)
2909 int32_t elem, idx;
2910 elem = seek_collating_symbol_entry (br_elem->opr.name,
2911 sym_name_len);
2912 if (elem != -1)
2914 /* We found the entry. */
2915 idx = symb_table[2 * elem + 1];
2916 /* Skip the name of collating element name. */
2917 idx += 1 + extra[idx];
2918 /* Skip the byte sequence of the collating element. */
2919 idx += 1 + extra[idx];
2920 /* Adjust for the alignment. */
2921 idx = (idx + 3) & ~3;
2922 /* Skip the multibyte collation sequence value. */
2923 idx += sizeof (unsigned int);
2924 /* Skip the wide char sequence of the collating element. */
2925 idx += sizeof (unsigned int) *
2926 (1 + *(unsigned int *) (extra + idx));
2927 /* Return the collation sequence value. */
2928 return *(unsigned int *) (extra + idx);
2930 else if (sym_name_len == 1)
2932 /* No valid character. Match it as a single byte
2933 character. */
2934 return collseqmb[br_elem->opr.name[0]];
2937 else if (sym_name_len == 1)
2938 return collseqmb[br_elem->opr.name[0]];
2940 return UINT_MAX;
2943 /* Local function for parse_bracket_exp used in _LIBC environment.
2944 Build the range expression which starts from START_ELEM, and ends
2945 at END_ELEM. The result are written to MBCSET and SBCSET.
2946 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2947 mbcset->range_ends, is a pointer argument since we may
2948 update it. */
2950 auto inline reg_errcode_t
2951 __attribute__ ((always_inline))
2952 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2953 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2955 unsigned int ch;
2956 uint32_t start_collseq;
2957 uint32_t end_collseq;
2959 /* Equivalence Classes and Character Classes can't be a range
2960 start/end. */
2961 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2962 || start_elem->type == CHAR_CLASS
2963 || end_elem->type == EQUIV_CLASS
2964 || end_elem->type == CHAR_CLASS))
2965 return REG_ERANGE;
2967 /* FIXME: Implement rational ranges here, too. */
2968 start_collseq = lookup_collation_sequence_value (start_elem);
2969 end_collseq = lookup_collation_sequence_value (end_elem);
2970 /* Check start/end collation sequence values. */
2971 if (__glibc_unlikely (start_collseq == UINT_MAX
2972 || end_collseq == UINT_MAX))
2973 return REG_ECOLLATE;
2974 if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2975 && start_collseq > end_collseq))
2976 return REG_ERANGE;
2978 /* Got valid collation sequence values, add them as a new entry.
2979 However, if we have no collation elements, and the character set
2980 is single byte, the single byte character set that we
2981 build below suffices. */
2982 if (nrules > 0 || dfa->mb_cur_max > 1)
2984 /* Check the space of the arrays. */
2985 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2987 /* There is not enough space, need realloc. */
2988 uint32_t *new_array_start;
2989 uint32_t *new_array_end;
2990 Idx new_nranges;
2992 /* +1 in case of mbcset->nranges is 0. */
2993 new_nranges = 2 * mbcset->nranges + 1;
2994 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2995 new_nranges);
2996 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2997 new_nranges);
2999 if (__glibc_unlikely (new_array_start == NULL
3000 || new_array_end == NULL))
3001 return REG_ESPACE;
3003 mbcset->range_starts = new_array_start;
3004 mbcset->range_ends = new_array_end;
3005 *range_alloc = new_nranges;
3008 mbcset->range_starts[mbcset->nranges] = start_collseq;
3009 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3012 /* Build the table for single byte characters. */
3013 for (ch = 0; ch < SBC_MAX; ch++)
3015 uint32_t ch_collseq;
3017 if (MB_CUR_MAX == 1)
3019 if (nrules == 0)
3020 ch_collseq = collseqmb[ch];
3021 else
3022 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3023 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3024 bitset_set (sbcset, ch);
3026 return REG_NOERROR;
3029 /* Local function for parse_bracket_exp used in _LIBC environment.
3030 Build the collating element which is represented by NAME.
3031 The result are written to MBCSET and SBCSET.
3032 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3033 pointer argument since we may update it. */
3035 auto inline reg_errcode_t
3036 __attribute__ ((always_inline))
3037 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3038 Idx *coll_sym_alloc, const unsigned char *name)
3040 int32_t elem, idx;
3041 size_t name_len = strlen ((const char *) name);
3042 if (nrules != 0)
3044 elem = seek_collating_symbol_entry (name, name_len);
3045 if (elem != -1)
3047 /* We found the entry. */
3048 idx = symb_table[2 * elem + 1];
3049 /* Skip the name of collating element name. */
3050 idx += 1 + extra[idx];
3052 else if (name_len == 1)
3054 /* No valid character, treat it as a normal
3055 character. */
3056 bitset_set (sbcset, name[0]);
3057 return REG_NOERROR;
3059 else
3060 return REG_ECOLLATE;
3062 /* Got valid collation sequence, add it as a new entry. */
3063 /* Check the space of the arrays. */
3064 if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
3066 /* Not enough, realloc it. */
3067 /* +1 in case of mbcset->ncoll_syms is 0. */
3068 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3069 /* Use realloc since mbcset->coll_syms is NULL
3070 if *alloc == 0. */
3071 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3072 new_coll_sym_alloc);
3073 if (__glibc_unlikely (new_coll_syms == NULL))
3074 return REG_ESPACE;
3075 mbcset->coll_syms = new_coll_syms;
3076 *coll_sym_alloc = new_coll_sym_alloc;
3078 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3079 return REG_NOERROR;
3081 else
3083 if (__glibc_unlikely (name_len != 1))
3084 return REG_ECOLLATE;
3085 else
3087 bitset_set (sbcset, name[0]);
3088 return REG_NOERROR;
3092 #endif
3094 re_token_t br_token;
3095 re_bitset_ptr_t sbcset;
3096 #ifdef RE_ENABLE_I18N
3097 re_charset_t *mbcset;
3098 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3099 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3100 #endif /* not RE_ENABLE_I18N */
3101 bool non_match = false;
3102 bin_tree_t *work_tree;
3103 int token_len;
3104 bool first_round = true;
3105 #ifdef _LIBC
3106 collseqmb = (const unsigned char *)
3107 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3108 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3109 if (nrules)
3112 if (MB_CUR_MAX > 1)
3114 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3115 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3116 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3117 _NL_COLLATE_SYMB_TABLEMB);
3118 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3119 _NL_COLLATE_SYMB_EXTRAMB);
3121 #endif
3122 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3123 #ifdef RE_ENABLE_I18N
3124 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3125 #endif /* RE_ENABLE_I18N */
3126 #ifdef RE_ENABLE_I18N
3127 if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
3128 #else
3129 if (__glibc_unlikely (sbcset == NULL))
3130 #endif /* RE_ENABLE_I18N */
3132 re_free (sbcset);
3133 #ifdef RE_ENABLE_I18N
3134 re_free (mbcset);
3135 #endif
3136 *err = REG_ESPACE;
3137 return NULL;
3140 token_len = peek_token_bracket (token, regexp, syntax);
3141 if (__glibc_unlikely (token->type == END_OF_RE))
3143 *err = REG_BADPAT;
3144 goto parse_bracket_exp_free_return;
3146 if (token->type == OP_NON_MATCH_LIST)
3148 #ifdef RE_ENABLE_I18N
3149 mbcset->non_match = 1;
3150 #endif /* not RE_ENABLE_I18N */
3151 non_match = true;
3152 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3153 bitset_set (sbcset, '\n');
3154 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3155 token_len = peek_token_bracket (token, regexp, syntax);
3156 if (__glibc_unlikely (token->type == END_OF_RE))
3158 *err = REG_BADPAT;
3159 goto parse_bracket_exp_free_return;
3163 /* We treat the first ']' as a normal character. */
3164 if (token->type == OP_CLOSE_BRACKET)
3165 token->type = CHARACTER;
3167 while (1)
3169 bracket_elem_t start_elem, end_elem;
3170 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3171 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3172 reg_errcode_t ret;
3173 int token_len2 = 0;
3174 bool is_range_exp = false;
3175 re_token_t token2;
3177 start_elem.opr.name = start_name_buf;
3178 start_elem.type = COLL_SYM;
3179 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3180 syntax, first_round);
3181 if (__glibc_unlikely (ret != REG_NOERROR))
3183 *err = ret;
3184 goto parse_bracket_exp_free_return;
3186 first_round = false;
3188 /* Get information about the next token. We need it in any case. */
3189 token_len = peek_token_bracket (token, regexp, syntax);
3191 /* Do not check for ranges if we know they are not allowed. */
3192 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3194 if (__glibc_unlikely (token->type == END_OF_RE))
3196 *err = REG_EBRACK;
3197 goto parse_bracket_exp_free_return;
3199 if (token->type == OP_CHARSET_RANGE)
3201 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3202 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3203 if (__glibc_unlikely (token2.type == END_OF_RE))
3205 *err = REG_EBRACK;
3206 goto parse_bracket_exp_free_return;
3208 if (token2.type == OP_CLOSE_BRACKET)
3210 /* We treat the last '-' as a normal character. */
3211 re_string_skip_bytes (regexp, -token_len);
3212 token->type = CHARACTER;
3214 else
3215 is_range_exp = true;
3219 if (is_range_exp == true)
3221 end_elem.opr.name = end_name_buf;
3222 end_elem.type = COLL_SYM;
3223 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3224 dfa, syntax, true);
3225 if (__glibc_unlikely (ret != REG_NOERROR))
3227 *err = ret;
3228 goto parse_bracket_exp_free_return;
3231 token_len = peek_token_bracket (token, regexp, syntax);
3233 #ifdef _LIBC
3234 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3235 &start_elem, &end_elem);
3236 #else
3237 # ifdef RE_ENABLE_I18N
3238 *err = build_range_exp (syntax, sbcset,
3239 dfa->mb_cur_max > 1 ? mbcset : NULL,
3240 &range_alloc, &start_elem, &end_elem);
3241 # else
3242 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3243 # endif
3244 #endif /* RE_ENABLE_I18N */
3245 if (__glibc_unlikely (*err != REG_NOERROR))
3246 goto parse_bracket_exp_free_return;
3248 else
3250 switch (start_elem.type)
3252 case SB_CHAR:
3253 bitset_set (sbcset, start_elem.opr.ch);
3254 break;
3255 #ifdef RE_ENABLE_I18N
3256 case MB_CHAR:
3257 /* Check whether the array has enough space. */
3258 if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3260 wchar_t *new_mbchars;
3261 /* Not enough, realloc it. */
3262 /* +1 in case of mbcset->nmbchars is 0. */
3263 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3264 /* Use realloc since array is NULL if *alloc == 0. */
3265 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3266 mbchar_alloc);
3267 if (__glibc_unlikely (new_mbchars == NULL))
3268 goto parse_bracket_exp_espace;
3269 mbcset->mbchars = new_mbchars;
3271 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3272 break;
3273 #endif /* RE_ENABLE_I18N */
3274 case EQUIV_CLASS:
3275 *err = build_equiv_class (sbcset,
3276 #ifdef RE_ENABLE_I18N
3277 mbcset, &equiv_class_alloc,
3278 #endif /* RE_ENABLE_I18N */
3279 start_elem.opr.name);
3280 if (__glibc_unlikely (*err != REG_NOERROR))
3281 goto parse_bracket_exp_free_return;
3282 break;
3283 case COLL_SYM:
3284 *err = build_collating_symbol (sbcset,
3285 #ifdef RE_ENABLE_I18N
3286 mbcset, &coll_sym_alloc,
3287 #endif /* RE_ENABLE_I18N */
3288 start_elem.opr.name);
3289 if (__glibc_unlikely (*err != REG_NOERROR))
3290 goto parse_bracket_exp_free_return;
3291 break;
3292 case CHAR_CLASS:
3293 *err = build_charclass (regexp->trans, sbcset,
3294 #ifdef RE_ENABLE_I18N
3295 mbcset, &char_class_alloc,
3296 #endif /* RE_ENABLE_I18N */
3297 (const char *) start_elem.opr.name,
3298 syntax);
3299 if (__glibc_unlikely (*err != REG_NOERROR))
3300 goto parse_bracket_exp_free_return;
3301 break;
3302 default:
3303 DEBUG_ASSERT (false);
3304 break;
3307 if (__glibc_unlikely (token->type == END_OF_RE))
3309 *err = REG_EBRACK;
3310 goto parse_bracket_exp_free_return;
3312 if (token->type == OP_CLOSE_BRACKET)
3313 break;
3316 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3318 /* If it is non-matching list. */
3319 if (non_match)
3320 bitset_not (sbcset);
3322 #ifdef RE_ENABLE_I18N
3323 /* Ensure only single byte characters are set. */
3324 if (dfa->mb_cur_max > 1)
3325 bitset_mask (sbcset, dfa->sb_char);
3327 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3328 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3329 || mbcset->non_match)))
3331 bin_tree_t *mbc_tree;
3332 int sbc_idx;
3333 /* Build a tree for complex bracket. */
3334 dfa->has_mb_node = 1;
3335 br_token.type = COMPLEX_BRACKET;
3336 br_token.opr.mbcset = mbcset;
3337 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3338 if (__glibc_unlikely (mbc_tree == NULL))
3339 goto parse_bracket_exp_espace;
3340 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3341 if (sbcset[sbc_idx])
3342 break;
3343 /* If there are no bits set in sbcset, there is no point
3344 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3345 if (sbc_idx < BITSET_WORDS)
3347 /* Build a tree for simple bracket. */
3348 br_token.type = SIMPLE_BRACKET;
3349 br_token.opr.sbcset = sbcset;
3350 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3351 if (__glibc_unlikely (work_tree == NULL))
3352 goto parse_bracket_exp_espace;
3354 /* Then join them by ALT node. */
3355 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3356 if (__glibc_unlikely (work_tree == NULL))
3357 goto parse_bracket_exp_espace;
3359 else
3361 re_free (sbcset);
3362 work_tree = mbc_tree;
3365 else
3366 #endif /* not RE_ENABLE_I18N */
3368 #ifdef RE_ENABLE_I18N
3369 free_charset (mbcset);
3370 #endif
3371 /* Build a tree for simple bracket. */
3372 br_token.type = SIMPLE_BRACKET;
3373 br_token.opr.sbcset = sbcset;
3374 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3375 if (__glibc_unlikely (work_tree == NULL))
3376 goto parse_bracket_exp_espace;
3378 return work_tree;
3380 parse_bracket_exp_espace:
3381 *err = REG_ESPACE;
3382 parse_bracket_exp_free_return:
3383 re_free (sbcset);
3384 #ifdef RE_ENABLE_I18N
3385 free_charset (mbcset);
3386 #endif /* RE_ENABLE_I18N */
3387 return NULL;
3390 /* Parse an element in the bracket expression. */
3392 static reg_errcode_t
3393 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3394 re_token_t *token, int token_len, re_dfa_t *dfa,
3395 reg_syntax_t syntax, bool accept_hyphen)
3397 #ifdef RE_ENABLE_I18N
3398 int cur_char_size;
3399 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3400 if (cur_char_size > 1)
3402 elem->type = MB_CHAR;
3403 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3404 re_string_skip_bytes (regexp, cur_char_size);
3405 return REG_NOERROR;
3407 #endif /* RE_ENABLE_I18N */
3408 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3409 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3410 || token->type == OP_OPEN_EQUIV_CLASS)
3411 return parse_bracket_symbol (elem, regexp, token);
3412 if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3414 /* A '-' must only appear as anything but a range indicator before
3415 the closing bracket. Everything else is an error. */
3416 re_token_t token2;
3417 (void) peek_token_bracket (&token2, regexp, syntax);
3418 if (token2.type != OP_CLOSE_BRACKET)
3419 /* The actual error value is not standardized since this whole
3420 case is undefined. But ERANGE makes good sense. */
3421 return REG_ERANGE;
3423 elem->type = SB_CHAR;
3424 elem->opr.ch = token->opr.c;
3425 return REG_NOERROR;
3428 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3429 such as [:<character_class>:], [.<collating_element>.], and
3430 [=<equivalent_class>=]. */
3432 static reg_errcode_t
3433 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3434 re_token_t *token)
3436 unsigned char ch, delim = token->opr.c;
3437 int i = 0;
3438 if (re_string_eoi(regexp))
3439 return REG_EBRACK;
3440 for (;; ++i)
3442 if (i >= BRACKET_NAME_BUF_SIZE)
3443 return REG_EBRACK;
3444 if (token->type == OP_OPEN_CHAR_CLASS)
3445 ch = re_string_fetch_byte_case (regexp);
3446 else
3447 ch = re_string_fetch_byte (regexp);
3448 if (re_string_eoi(regexp))
3449 return REG_EBRACK;
3450 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3451 break;
3452 elem->opr.name[i] = ch;
3454 re_string_skip_bytes (regexp, 1);
3455 elem->opr.name[i] = '\0';
3456 switch (token->type)
3458 case OP_OPEN_COLL_ELEM:
3459 elem->type = COLL_SYM;
3460 break;
3461 case OP_OPEN_EQUIV_CLASS:
3462 elem->type = EQUIV_CLASS;
3463 break;
3464 case OP_OPEN_CHAR_CLASS:
3465 elem->type = CHAR_CLASS;
3466 break;
3467 default:
3468 break;
3470 return REG_NOERROR;
3473 /* Helper function for parse_bracket_exp.
3474 Build the equivalence class which is represented by NAME.
3475 The result are written to MBCSET and SBCSET.
3476 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3477 is a pointer argument since we may update it. */
3479 static reg_errcode_t
3480 #ifdef RE_ENABLE_I18N
3481 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3482 Idx *equiv_class_alloc, const unsigned char *name)
3483 #else /* not RE_ENABLE_I18N */
3484 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3485 #endif /* not RE_ENABLE_I18N */
3487 #ifdef _LIBC
3488 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3489 if (nrules != 0)
3491 const int32_t *table, *indirect;
3492 const unsigned char *weights, *extra, *cp;
3493 unsigned char char_buf[2];
3494 int32_t idx1, idx2;
3495 unsigned int ch;
3496 size_t len;
3497 /* Calculate the index for equivalence class. */
3498 cp = name;
3499 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3500 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3501 _NL_COLLATE_WEIGHTMB);
3502 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3503 _NL_COLLATE_EXTRAMB);
3504 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3505 _NL_COLLATE_INDIRECTMB);
3506 idx1 = findidx (table, indirect, extra, &cp, -1);
3507 if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
3508 /* This isn't a valid character. */
3509 return REG_ECOLLATE;
3511 /* Build single byte matching table for this equivalence class. */
3512 len = weights[idx1 & 0xffffff];
3513 for (ch = 0; ch < SBC_MAX; ++ch)
3515 char_buf[0] = ch;
3516 cp = char_buf;
3517 idx2 = findidx (table, indirect, extra, &cp, 1);
3519 idx2 = table[ch];
3521 if (idx2 == 0)
3522 /* This isn't a valid character. */
3523 continue;
3524 /* Compare only if the length matches and the collation rule
3525 index is the same. */
3526 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3527 && memcmp (weights + (idx1 & 0xffffff) + 1,
3528 weights + (idx2 & 0xffffff) + 1, len) == 0)
3529 bitset_set (sbcset, ch);
3531 /* Check whether the array has enough space. */
3532 if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3534 /* Not enough, realloc it. */
3535 /* +1 in case of mbcset->nequiv_classes is 0. */
3536 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3537 /* Use realloc since the array is NULL if *alloc == 0. */
3538 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3539 int32_t,
3540 new_equiv_class_alloc);
3541 if (__glibc_unlikely (new_equiv_classes == NULL))
3542 return REG_ESPACE;
3543 mbcset->equiv_classes = new_equiv_classes;
3544 *equiv_class_alloc = new_equiv_class_alloc;
3546 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3548 else
3549 #endif /* _LIBC */
3551 if (__glibc_unlikely (strlen ((const char *) name) != 1))
3552 return REG_ECOLLATE;
3553 bitset_set (sbcset, *name);
3555 return REG_NOERROR;
3558 /* Helper function for parse_bracket_exp.
3559 Build the character class which is represented by NAME.
3560 The result are written to MBCSET and SBCSET.
3561 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3562 is a pointer argument since we may update it. */
3564 static reg_errcode_t
3565 #ifdef RE_ENABLE_I18N
3566 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3567 re_charset_t *mbcset, Idx *char_class_alloc,
3568 const char *class_name, reg_syntax_t syntax)
3569 #else /* not RE_ENABLE_I18N */
3570 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3571 const char *class_name, reg_syntax_t syntax)
3572 #endif /* not RE_ENABLE_I18N */
3574 int i;
3575 const char *name = class_name;
3577 /* In case of REG_ICASE "upper" and "lower" match the both of
3578 upper and lower cases. */
3579 if ((syntax & RE_ICASE)
3580 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3581 name = "alpha";
3583 #ifdef RE_ENABLE_I18N
3584 /* Check the space of the arrays. */
3585 if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3587 /* Not enough, realloc it. */
3588 /* +1 in case of mbcset->nchar_classes is 0. */
3589 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3590 /* Use realloc since array is NULL if *alloc == 0. */
3591 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3592 new_char_class_alloc);
3593 if (__glibc_unlikely (new_char_classes == NULL))
3594 return REG_ESPACE;
3595 mbcset->char_classes = new_char_classes;
3596 *char_class_alloc = new_char_class_alloc;
3598 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3599 #endif /* RE_ENABLE_I18N */
3601 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3602 do { \
3603 if (__glibc_unlikely (trans != NULL)) \
3605 for (i = 0; i < SBC_MAX; ++i) \
3606 if (ctype_func (i)) \
3607 bitset_set (sbcset, trans[i]); \
3609 else \
3611 for (i = 0; i < SBC_MAX; ++i) \
3612 if (ctype_func (i)) \
3613 bitset_set (sbcset, i); \
3615 } while (0)
3617 if (strcmp (name, "alnum") == 0)
3618 BUILD_CHARCLASS_LOOP (isalnum);
3619 else if (strcmp (name, "cntrl") == 0)
3620 BUILD_CHARCLASS_LOOP (iscntrl);
3621 else if (strcmp (name, "lower") == 0)
3622 BUILD_CHARCLASS_LOOP (islower);
3623 else if (strcmp (name, "space") == 0)
3624 BUILD_CHARCLASS_LOOP (isspace);
3625 else if (strcmp (name, "alpha") == 0)
3626 BUILD_CHARCLASS_LOOP (isalpha);
3627 else if (strcmp (name, "digit") == 0)
3628 BUILD_CHARCLASS_LOOP (isdigit);
3629 else if (strcmp (name, "print") == 0)
3630 BUILD_CHARCLASS_LOOP (isprint);
3631 else if (strcmp (name, "upper") == 0)
3632 BUILD_CHARCLASS_LOOP (isupper);
3633 else if (strcmp (name, "blank") == 0)
3634 BUILD_CHARCLASS_LOOP (isblank);
3635 else if (strcmp (name, "graph") == 0)
3636 BUILD_CHARCLASS_LOOP (isgraph);
3637 else if (strcmp (name, "punct") == 0)
3638 BUILD_CHARCLASS_LOOP (ispunct);
3639 else if (strcmp (name, "xdigit") == 0)
3640 BUILD_CHARCLASS_LOOP (isxdigit);
3641 else
3642 return REG_ECTYPE;
3644 return REG_NOERROR;
3647 static bin_tree_t *
3648 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3649 const char *class_name,
3650 const char *extra, bool non_match,
3651 reg_errcode_t *err)
3653 re_bitset_ptr_t sbcset;
3654 #ifdef RE_ENABLE_I18N
3655 re_charset_t *mbcset;
3656 Idx alloc = 0;
3657 #endif /* not RE_ENABLE_I18N */
3658 reg_errcode_t ret;
3659 bin_tree_t *tree;
3661 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3662 if (__glibc_unlikely (sbcset == NULL))
3664 *err = REG_ESPACE;
3665 return NULL;
3667 #ifdef RE_ENABLE_I18N
3668 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3669 if (__glibc_unlikely (mbcset == NULL))
3671 re_free (sbcset);
3672 *err = REG_ESPACE;
3673 return NULL;
3675 mbcset->non_match = non_match;
3676 #endif /* RE_ENABLE_I18N */
3678 /* We don't care the syntax in this case. */
3679 ret = build_charclass (trans, sbcset,
3680 #ifdef RE_ENABLE_I18N
3681 mbcset, &alloc,
3682 #endif /* RE_ENABLE_I18N */
3683 class_name, 0);
3685 if (__glibc_unlikely (ret != REG_NOERROR))
3687 re_free (sbcset);
3688 #ifdef RE_ENABLE_I18N
3689 free_charset (mbcset);
3690 #endif /* RE_ENABLE_I18N */
3691 *err = ret;
3692 return NULL;
3694 /* \w match '_' also. */
3695 for (; *extra; extra++)
3696 bitset_set (sbcset, *extra);
3698 /* If it is non-matching list. */
3699 if (non_match)
3700 bitset_not (sbcset);
3702 #ifdef RE_ENABLE_I18N
3703 /* Ensure only single byte characters are set. */
3704 if (dfa->mb_cur_max > 1)
3705 bitset_mask (sbcset, dfa->sb_char);
3706 #endif
3708 /* Build a tree for simple bracket. */
3709 re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3710 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3711 if (__glibc_unlikely (tree == NULL))
3712 goto build_word_op_espace;
3714 #ifdef RE_ENABLE_I18N
3715 if (dfa->mb_cur_max > 1)
3717 bin_tree_t *mbc_tree;
3718 /* Build a tree for complex bracket. */
3719 br_token.type = COMPLEX_BRACKET;
3720 br_token.opr.mbcset = mbcset;
3721 dfa->has_mb_node = 1;
3722 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3723 if (__glibc_unlikely (mbc_tree == NULL))
3724 goto build_word_op_espace;
3725 /* Then join them by ALT node. */
3726 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3727 if (__glibc_likely (mbc_tree != NULL))
3728 return tree;
3730 else
3732 free_charset (mbcset);
3733 return tree;
3735 #else /* not RE_ENABLE_I18N */
3736 return tree;
3737 #endif /* not RE_ENABLE_I18N */
3739 build_word_op_espace:
3740 re_free (sbcset);
3741 #ifdef RE_ENABLE_I18N
3742 free_charset (mbcset);
3743 #endif /* RE_ENABLE_I18N */
3744 *err = REG_ESPACE;
3745 return NULL;
3748 /* This is intended for the expressions like "a{1,3}".
3749 Fetch a number from 'input', and return the number.
3750 Return -1 if the number field is empty like "{,1}".
3751 Return RE_DUP_MAX + 1 if the number field is too large.
3752 Return -2 if an error occurred. */
3754 static Idx
3755 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3757 Idx num = -1;
3758 unsigned char c;
3759 while (1)
3761 fetch_token (token, input, syntax);
3762 c = token->opr.c;
3763 if (__glibc_unlikely (token->type == END_OF_RE))
3764 return -2;
3765 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3766 break;
3767 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3768 ? -2
3769 : num == -1
3770 ? c - '0'
3771 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3773 return num;
3776 #ifdef RE_ENABLE_I18N
3777 static void
3778 free_charset (re_charset_t *cset)
3780 re_free (cset->mbchars);
3781 # ifdef _LIBC
3782 re_free (cset->coll_syms);
3783 re_free (cset->equiv_classes);
3784 # endif
3785 re_free (cset->range_starts);
3786 re_free (cset->range_ends);
3787 re_free (cset->char_classes);
3788 re_free (cset);
3790 #endif /* RE_ENABLE_I18N */
3792 /* Functions for binary tree operation. */
3794 /* Create a tree node. */
3796 static bin_tree_t *
3797 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3798 re_token_type_t type)
3800 re_token_t t = { .type = type };
3801 return create_token_tree (dfa, left, right, &t);
3804 static bin_tree_t *
3805 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3806 const re_token_t *token)
3808 bin_tree_t *tree;
3809 if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3811 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3813 if (storage == NULL)
3814 return NULL;
3815 storage->next = dfa->str_tree_storage;
3816 dfa->str_tree_storage = storage;
3817 dfa->str_tree_storage_idx = 0;
3819 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3821 tree->parent = NULL;
3822 tree->left = left;
3823 tree->right = right;
3824 tree->token = *token;
3825 tree->token.duplicated = 0;
3826 tree->token.opt_subexp = 0;
3827 tree->first = NULL;
3828 tree->next = NULL;
3829 tree->node_idx = -1;
3831 if (left != NULL)
3832 left->parent = tree;
3833 if (right != NULL)
3834 right->parent = tree;
3835 return tree;
3838 /* Mark the tree SRC as an optional subexpression.
3839 To be called from preorder or postorder. */
3841 static reg_errcode_t
3842 mark_opt_subexp (void *extra, bin_tree_t *node)
3844 Idx idx = (uintptr_t) extra;
3845 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3846 node->token.opt_subexp = 1;
3848 return REG_NOERROR;
3851 /* Free the allocated memory inside NODE. */
3853 static void
3854 free_token (re_token_t *node)
3856 #ifdef RE_ENABLE_I18N
3857 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3858 free_charset (node->opr.mbcset);
3859 else
3860 #endif /* RE_ENABLE_I18N */
3861 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3862 re_free (node->opr.sbcset);
3865 /* Worker function for tree walking. Free the allocated memory inside NODE
3866 and its children. */
3868 static reg_errcode_t
3869 free_tree (void *extra, bin_tree_t *node)
3871 free_token (&node->token);
3872 return REG_NOERROR;
3876 /* Duplicate the node SRC, and return new node. This is a preorder
3877 visit similar to the one implemented by the generic visitor, but
3878 we need more infrastructure to maintain two parallel trees --- so,
3879 it's easier to duplicate. */
3881 static bin_tree_t *
3882 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3884 const bin_tree_t *node;
3885 bin_tree_t *dup_root;
3886 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3888 for (node = root; ; )
3890 /* Create a new tree and link it back to the current parent. */
3891 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3892 if (*p_new == NULL)
3893 return NULL;
3894 (*p_new)->parent = dup_node;
3895 (*p_new)->token.duplicated = 1;
3896 dup_node = *p_new;
3898 /* Go to the left node, or up and to the right. */
3899 if (node->left)
3901 node = node->left;
3902 p_new = &dup_node->left;
3904 else
3906 const bin_tree_t *prev = NULL;
3907 while (node->right == prev || node->right == NULL)
3909 prev = node;
3910 node = node->parent;
3911 dup_node = dup_node->parent;
3912 if (!node)
3913 return dup_root;
3915 node = node->right;
3916 p_new = &dup_node->right;