grep: Use get_crt_codepage(). Don't default to the UTF-8 manifest for older VCC versi...
[kbuild-mirror.git] / src / grep / lib / regcomp.c
blob02b0e7006aeac8b15c24780738f0aa4d745fdfc5
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
20 #ifdef _LIBC
21 # include <locale/weight.h>
22 #endif
24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 size_t length, reg_syntax_t syntax);
26 static void re_compile_fastmap_iter (regex_t *bufp,
27 const re_dfastate_t *init_state,
28 char *fastmap);
29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30 #ifdef RE_ENABLE_I18N
31 static void free_charset (re_charset_t *cset);
32 #endif /* RE_ENABLE_I18N */
33 static void free_workarea_compile (regex_t *preg);
34 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35 #ifdef RE_ENABLE_I18N
36 static void optimize_utf8 (re_dfa_t *dfa);
37 #endif
38 static reg_errcode_t analyze (regex_t *preg);
39 static reg_errcode_t preorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42 static reg_errcode_t postorder (bin_tree_t *root,
43 reg_errcode_t (fn (void *, bin_tree_t *)),
44 void *extra);
45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 bin_tree_t *node);
49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54 unsigned int constraint);
55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
57 Idx node, bool root);
58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59 static Idx fetch_number (re_string_t *input, re_token_t *token,
60 reg_syntax_t syntax);
61 static int peek_token (re_token_t *token, re_string_t *input,
62 reg_syntax_t syntax);
63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
64 reg_syntax_t syntax, reg_errcode_t *err);
65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 Idx nest, reg_errcode_t *err);
68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 Idx nest, reg_errcode_t *err);
71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 Idx nest, reg_errcode_t *err);
74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
75 re_token_t *token, reg_syntax_t syntax,
76 Idx nest, reg_errcode_t *err);
77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
78 re_dfa_t *dfa, re_token_t *token,
79 reg_syntax_t syntax, reg_errcode_t *err);
80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
81 re_token_t *token, reg_syntax_t syntax,
82 reg_errcode_t *err);
83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84 re_string_t *regexp,
85 re_token_t *token, int token_len,
86 re_dfa_t *dfa,
87 reg_syntax_t syntax,
88 bool accept_hyphen);
89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 re_string_t *regexp,
91 re_token_t *token);
92 #ifdef RE_ENABLE_I18N
93 static reg_errcode_t build_equiv_class (bitset_t sbcset,
94 re_charset_t *mbcset,
95 Idx *equiv_class_alloc,
96 const unsigned char *name);
97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98 bitset_t sbcset,
99 re_charset_t *mbcset,
100 Idx *char_class_alloc,
101 const char *class_name,
102 reg_syntax_t syntax);
103 #else /* not RE_ENABLE_I18N */
104 static reg_errcode_t build_equiv_class (bitset_t sbcset,
105 const unsigned char *name);
106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107 bitset_t sbcset,
108 const char *class_name,
109 reg_syntax_t syntax);
110 #endif /* not RE_ENABLE_I18N */
111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
112 RE_TRANSLATE_TYPE trans,
113 const char *class_name,
114 const char *extra,
115 bool non_match, reg_errcode_t *err);
116 static bin_tree_t *create_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 re_token_type_t type);
119 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
120 bin_tree_t *left, bin_tree_t *right,
121 const re_token_t *token);
122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
123 static void free_token (re_token_t *node);
124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
127 /* This table gives an error message for each of the error codes listed
128 in regex.h. Obviously the order here has to be same as there.
129 POSIX doesn't require that we do anything for REG_NOERROR,
130 but why not be nice? */
132 static const char __re_error_msgid[] =
134 #define REG_NOERROR_IDX 0
135 gettext_noop ("Success") /* REG_NOERROR */
136 "\0"
137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138 gettext_noop ("No match") /* REG_NOMATCH */
139 "\0"
140 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
142 "\0"
143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
145 "\0"
146 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
148 "\0"
149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
151 "\0"
152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
154 "\0"
155 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
156 gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
157 "\0"
158 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
159 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
160 "\0"
161 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
163 "\0"
164 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
166 "\0"
167 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168 gettext_noop ("Invalid range end") /* REG_ERANGE */
169 "\0"
170 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171 gettext_noop ("Memory exhausted") /* REG_ESPACE */
172 "\0"
173 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
175 "\0"
176 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177 gettext_noop ("Premature end of regular expression") /* REG_EEND */
178 "\0"
179 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180 gettext_noop ("Regular expression too big") /* REG_ESIZE */
181 "\0"
182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
186 static const size_t __re_error_msgid_idx[] =
188 REG_NOERROR_IDX,
189 REG_NOMATCH_IDX,
190 REG_BADPAT_IDX,
191 REG_ECOLLATE_IDX,
192 REG_ECTYPE_IDX,
193 REG_EESCAPE_IDX,
194 REG_ESUBREG_IDX,
195 REG_EBRACK_IDX,
196 REG_EPAREN_IDX,
197 REG_EBRACE_IDX,
198 REG_BADBR_IDX,
199 REG_ERANGE_IDX,
200 REG_ESPACE_IDX,
201 REG_BADRPT_IDX,
202 REG_EEND_IDX,
203 REG_ESIZE_IDX,
204 REG_ERPAREN_IDX
207 /* Entry points for GNU code. */
209 /* re_compile_pattern is the GNU regular expression compiler: it
210 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211 Returns 0 if the pattern was valid, otherwise an error string.
213 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214 are set in BUFP on entry. */
216 const char *
217 re_compile_pattern (const char *pattern, size_t length,
218 struct re_pattern_buffer *bufp)
220 reg_errcode_t ret;
222 /* And GNU code determines whether or not to get register information
223 by passing null for the REGS argument to re_match, etc., not by
224 setting no_sub, unless RE_NO_SUB is set. */
225 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
227 /* Match anchors at newline. */
228 bufp->newline_anchor = 1;
230 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
232 if (!ret)
233 return NULL;
234 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
236 weak_alias (__re_compile_pattern, re_compile_pattern)
238 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
239 also be assigned to arbitrarily: each pattern buffer stores its own
240 syntax, so it can be changed between regex compilations. */
241 /* This has no initializer because initialized variables in Emacs
242 become read-only after dumping. */
243 reg_syntax_t re_syntax_options;
246 /* Specify the precise syntax of regexps for compilation. This provides
247 for compatibility for various utilities which historically have
248 different, incompatible syntaxes.
250 The argument SYNTAX is a bit mask comprised of the various bits
251 defined in regex.h. We return the old syntax. */
253 reg_syntax_t
254 re_set_syntax (reg_syntax_t syntax)
256 reg_syntax_t ret = re_syntax_options;
258 re_syntax_options = syntax;
259 return ret;
261 weak_alias (__re_set_syntax, re_set_syntax)
264 re_compile_fastmap (struct re_pattern_buffer *bufp)
266 re_dfa_t *dfa = bufp->buffer;
267 char *fastmap = bufp->fastmap;
269 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
270 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
271 if (dfa->init_state != dfa->init_state_word)
272 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
273 if (dfa->init_state != dfa->init_state_nl)
274 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
275 if (dfa->init_state != dfa->init_state_begbuf)
276 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
277 bufp->fastmap_accurate = 1;
278 return 0;
280 weak_alias (__re_compile_fastmap, re_compile_fastmap)
282 static inline void
283 __attribute__ ((always_inline))
284 re_set_fastmap (char *fastmap, bool icase, int ch)
286 fastmap[ch] = 1;
287 if (icase)
288 fastmap[tolower (ch)] = 1;
291 /* Helper function for re_compile_fastmap.
292 Compile fastmap for the initial_state INIT_STATE. */
294 static void
295 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
296 char *fastmap)
298 re_dfa_t *dfa = bufp->buffer;
299 Idx node_cnt;
300 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
301 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
303 Idx node = init_state->nodes.elems[node_cnt];
304 re_token_type_t type = dfa->nodes[node].type;
306 if (type == CHARACTER)
308 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
309 #ifdef RE_ENABLE_I18N
310 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
312 unsigned char buf[MB_LEN_MAX];
313 unsigned char *p;
314 wchar_t wc;
315 mbstate_t state;
317 p = buf;
318 *p++ = dfa->nodes[node].opr.c;
319 while (++node < dfa->nodes_len
320 && dfa->nodes[node].type == CHARACTER
321 && dfa->nodes[node].mb_partial)
322 *p++ = dfa->nodes[node].opr.c;
323 memset (&state, '\0', sizeof (state));
324 if (__mbrtowc (&wc, (const char *) buf, p - buf,
325 &state) == p - buf
326 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
327 != (size_t) -1))
328 re_set_fastmap (fastmap, false, buf[0]);
330 #endif
332 else if (type == SIMPLE_BRACKET)
334 int i, ch;
335 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
337 int j;
338 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
339 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
340 if (w & ((bitset_word_t) 1 << j))
341 re_set_fastmap (fastmap, icase, ch);
344 #ifdef RE_ENABLE_I18N
345 else if (type == COMPLEX_BRACKET)
347 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
348 Idx i;
350 # ifdef _LIBC
351 /* See if we have to try all bytes which start multiple collation
352 elements.
353 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
354 collation element, and don't catch 'b' since 'b' is
355 the only collation element which starts from 'b' (and
356 it is caught by SIMPLE_BRACKET). */
357 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
358 && (cset->ncoll_syms || cset->nranges))
360 const int32_t *table = (const int32_t *)
361 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
362 for (i = 0; i < SBC_MAX; ++i)
363 if (table[i] < 0)
364 re_set_fastmap (fastmap, icase, i);
366 # endif /* _LIBC */
368 /* See if we have to start the match at all multibyte characters,
369 i.e. where we would not find an invalid sequence. This only
370 applies to multibyte character sets; for single byte character
371 sets, the SIMPLE_BRACKET again suffices. */
372 if (dfa->mb_cur_max > 1
373 && (cset->nchar_classes || cset->non_match || cset->nranges
374 # ifdef _LIBC
375 || cset->nequiv_classes
376 # endif /* _LIBC */
379 unsigned char c = 0;
382 mbstate_t mbs;
383 memset (&mbs, 0, sizeof (mbs));
384 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
385 re_set_fastmap (fastmap, false, (int) c);
387 while (++c != 0);
390 else
392 /* ... Else catch all bytes which can start the mbchars. */
393 for (i = 0; i < cset->nmbchars; ++i)
395 char buf[256];
396 mbstate_t state;
397 memset (&state, '\0', sizeof (state));
398 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
399 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
400 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
402 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
403 != (size_t) -1)
404 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
409 #endif /* RE_ENABLE_I18N */
410 else if (type == OP_PERIOD
411 #ifdef RE_ENABLE_I18N
412 || type == OP_UTF8_PERIOD
413 #endif /* RE_ENABLE_I18N */
414 || type == END_OF_RE)
416 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
417 if (type == END_OF_RE)
418 bufp->can_be_null = 1;
419 return;
424 /* Entry point for POSIX code. */
425 /* regcomp takes a regular expression as a string and compiles it.
427 PREG is a regex_t *. We do not expect any fields to be initialized,
428 since POSIX says we shouldn't. Thus, we set
430 'buffer' to the compiled pattern;
431 'used' to the length of the compiled pattern;
432 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
433 REG_EXTENDED bit in CFLAGS is set; otherwise, to
434 RE_SYNTAX_POSIX_BASIC;
435 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
436 'fastmap' to an allocated space for the fastmap;
437 'fastmap_accurate' to zero;
438 're_nsub' to the number of subexpressions in PATTERN.
440 PATTERN is the address of the pattern string.
442 CFLAGS is a series of bits which affect compilation.
444 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
445 use POSIX basic syntax.
447 If REG_NEWLINE is set, then . and [^...] don't match newline.
448 Also, regexec will try a match beginning after every newline.
450 If REG_ICASE is set, then we considers upper- and lowercase
451 versions of letters to be equivalent when matching.
453 If REG_NOSUB is set, then when PREG is passed to regexec, that
454 routine will report only success or failure, and nothing about the
455 registers.
457 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
458 the return codes and their meanings.) */
461 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
463 reg_errcode_t ret;
464 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
465 : RE_SYNTAX_POSIX_BASIC);
467 preg->buffer = NULL;
468 preg->allocated = 0;
469 preg->used = 0;
471 /* Try to allocate space for the fastmap. */
472 preg->fastmap = re_malloc (char, SBC_MAX);
473 if (__glibc_unlikely (preg->fastmap == NULL))
474 return REG_ESPACE;
476 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
478 /* If REG_NEWLINE is set, newlines are treated differently. */
479 if (cflags & REG_NEWLINE)
480 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
481 syntax &= ~RE_DOT_NEWLINE;
482 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
483 /* It also changes the matching behavior. */
484 preg->newline_anchor = 1;
486 else
487 preg->newline_anchor = 0;
488 preg->no_sub = !!(cflags & REG_NOSUB);
489 preg->translate = NULL;
491 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
493 /* POSIX doesn't distinguish between an unmatched open-group and an
494 unmatched close-group: both are REG_EPAREN. */
495 if (ret == REG_ERPAREN)
496 ret = REG_EPAREN;
498 /* We have already checked preg->fastmap != NULL. */
499 if (__glibc_likely (ret == REG_NOERROR))
500 /* Compute the fastmap now, since regexec cannot modify the pattern
501 buffer. This function never fails in this implementation. */
502 (void) re_compile_fastmap (preg);
503 else
505 /* Some error occurred while compiling the expression. */
506 re_free (preg->fastmap);
507 preg->fastmap = NULL;
510 return (int) ret;
512 libc_hidden_def (__regcomp)
513 weak_alias (__regcomp, regcomp)
515 /* Returns a message corresponding to an error code, ERRCODE, returned
516 from either regcomp or regexec. We don't use PREG here. */
518 size_t
519 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
520 size_t errbuf_size)
522 const char *msg;
523 size_t msg_size;
524 int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
526 if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
527 /* Only error codes returned by the rest of the code should be passed
528 to this routine. If we are given anything else, or if other regex
529 code generates an invalid error code, then the program has a bug.
530 Dump core so we can fix it. */
531 abort ();
533 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
535 msg_size = strlen (msg) + 1; /* Includes the null. */
537 if (__glibc_likely (errbuf_size != 0))
539 size_t cpy_size = msg_size;
540 if (__glibc_unlikely (msg_size > errbuf_size))
542 cpy_size = errbuf_size - 1;
543 errbuf[cpy_size] = '\0';
545 memcpy (errbuf, msg, cpy_size);
548 return msg_size;
550 weak_alias (__regerror, regerror)
553 #ifdef RE_ENABLE_I18N
554 /* This static array is used for the map to single-byte characters when
555 UTF-8 is used. Otherwise we would allocate memory just to initialize
556 it the same all the time. UTF-8 is the preferred encoding so this is
557 a worthwhile optimization. */
558 static const bitset_t utf8_sb_map =
560 /* Set the first 128 bits. */
561 # if (defined __GNUC__ || __clang_major__ >= 4) && !defined __STRICT_ANSI__
562 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
563 # else
564 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
565 # error "bitset_word_t is narrower than 32 bits"
566 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
567 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
568 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
569 BITSET_WORD_MAX, BITSET_WORD_MAX,
570 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
571 BITSET_WORD_MAX,
572 # endif
573 (BITSET_WORD_MAX
574 >> (SBC_MAX % BITSET_WORD_BITS == 0
576 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
577 # endif
579 #endif
582 static void
583 free_dfa_content (re_dfa_t *dfa)
585 Idx i, j;
587 if (dfa->nodes)
588 for (i = 0; i < dfa->nodes_len; ++i)
589 free_token (dfa->nodes + i);
590 re_free (dfa->nexts);
591 for (i = 0; i < dfa->nodes_len; ++i)
593 if (dfa->eclosures != NULL)
594 re_node_set_free (dfa->eclosures + i);
595 if (dfa->inveclosures != NULL)
596 re_node_set_free (dfa->inveclosures + i);
597 if (dfa->edests != NULL)
598 re_node_set_free (dfa->edests + i);
600 re_free (dfa->edests);
601 re_free (dfa->eclosures);
602 re_free (dfa->inveclosures);
603 re_free (dfa->nodes);
605 if (dfa->state_table)
606 for (i = 0; i <= dfa->state_hash_mask; ++i)
608 struct re_state_table_entry *entry = dfa->state_table + i;
609 for (j = 0; j < entry->num; ++j)
611 re_dfastate_t *state = entry->array[j];
612 free_state (state);
614 re_free (entry->array);
616 re_free (dfa->state_table);
617 #ifdef RE_ENABLE_I18N
618 if (dfa->sb_char != utf8_sb_map)
619 re_free (dfa->sb_char);
620 #endif
621 re_free (dfa->subexp_map);
622 #ifdef DEBUG
623 re_free (dfa->re_str);
624 #endif
626 re_free (dfa);
630 /* Free dynamically allocated space used by PREG. */
632 void
633 regfree (regex_t *preg)
635 re_dfa_t *dfa = preg->buffer;
636 if (__glibc_likely (dfa != NULL))
638 lock_fini (dfa->lock);
639 free_dfa_content (dfa);
641 preg->buffer = NULL;
642 preg->allocated = 0;
644 re_free (preg->fastmap);
645 preg->fastmap = NULL;
647 re_free (preg->translate);
648 preg->translate = NULL;
650 libc_hidden_def (__regfree)
651 weak_alias (__regfree, regfree)
653 /* Entry points compatible with 4.2 BSD regex library. We don't define
654 them unless specifically requested. */
656 #if defined _REGEX_RE_COMP || defined _LIBC
658 /* BSD has one and only one pattern buffer. */
659 static struct re_pattern_buffer re_comp_buf;
661 char *
662 # ifdef _LIBC
663 /* Make these definitions weak in libc, so POSIX programs can redefine
664 these names if they don't use our functions, and still use
665 regcomp/regexec above without link errors. */
666 weak_function
667 # endif
668 re_comp (const char *s)
670 reg_errcode_t ret;
671 char *fastmap;
673 if (!s)
675 if (!re_comp_buf.buffer)
676 return gettext ("No previous regular expression");
677 return 0;
680 if (re_comp_buf.buffer)
682 fastmap = re_comp_buf.fastmap;
683 re_comp_buf.fastmap = NULL;
684 __regfree (&re_comp_buf);
685 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
686 re_comp_buf.fastmap = fastmap;
689 if (re_comp_buf.fastmap == NULL)
691 re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
692 if (re_comp_buf.fastmap == NULL)
693 return (char *) gettext (__re_error_msgid
694 + __re_error_msgid_idx[(int) REG_ESPACE]);
697 /* Since 're_exec' always passes NULL for the 'regs' argument, we
698 don't need to initialize the pattern buffer fields which affect it. */
700 /* Match anchors at newlines. */
701 re_comp_buf.newline_anchor = 1;
703 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
705 if (!ret)
706 return NULL;
708 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
709 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
712 #ifdef _LIBC
713 libc_freeres_fn (free_mem)
715 __regfree (&re_comp_buf);
717 #endif
719 #endif /* _REGEX_RE_COMP */
721 /* Internal entry point.
722 Compile the regular expression PATTERN, whose length is LENGTH.
723 SYNTAX indicate regular expression's syntax. */
725 static reg_errcode_t
726 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
727 reg_syntax_t syntax)
729 reg_errcode_t err = REG_NOERROR;
730 re_dfa_t *dfa;
731 re_string_t regexp;
733 /* Initialize the pattern buffer. */
734 preg->fastmap_accurate = 0;
735 preg->syntax = syntax;
736 preg->not_bol = preg->not_eol = 0;
737 preg->used = 0;
738 preg->re_nsub = 0;
739 preg->can_be_null = 0;
740 preg->regs_allocated = REGS_UNALLOCATED;
742 /* Initialize the dfa. */
743 dfa = preg->buffer;
744 if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
746 /* If zero allocated, but buffer is non-null, try to realloc
747 enough space. This loses if buffer's address is bogus, but
748 that is the user's responsibility. If ->buffer is NULL this
749 is a simple allocation. */
750 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
751 if (dfa == NULL)
752 return REG_ESPACE;
753 preg->allocated = sizeof (re_dfa_t);
754 preg->buffer = dfa;
756 preg->used = sizeof (re_dfa_t);
758 err = init_dfa (dfa, length);
759 if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
760 err = REG_ESPACE;
761 if (__glibc_unlikely (err != REG_NOERROR))
763 free_dfa_content (dfa);
764 preg->buffer = NULL;
765 preg->allocated = 0;
766 return err;
768 #ifdef DEBUG
769 /* Note: length+1 will not overflow since it is checked in init_dfa. */
770 dfa->re_str = re_malloc (char, length + 1);
771 strncpy (dfa->re_str, pattern, length + 1);
772 #endif
774 err = re_string_construct (&regexp, pattern, length, preg->translate,
775 (syntax & RE_ICASE) != 0, dfa);
776 if (__glibc_unlikely (err != REG_NOERROR))
778 re_compile_internal_free_return:
779 free_workarea_compile (preg);
780 re_string_destruct (&regexp);
781 lock_fini (dfa->lock);
782 free_dfa_content (dfa);
783 preg->buffer = NULL;
784 preg->allocated = 0;
785 return err;
788 /* Parse the regular expression, and build a structure tree. */
789 preg->re_nsub = 0;
790 dfa->str_tree = parse (&regexp, preg, syntax, &err);
791 if (__glibc_unlikely (dfa->str_tree == NULL))
792 goto re_compile_internal_free_return;
794 /* Analyze the tree and create the nfa. */
795 err = analyze (preg);
796 if (__glibc_unlikely (err != REG_NOERROR))
797 goto re_compile_internal_free_return;
799 #ifdef RE_ENABLE_I18N
800 /* If possible, do searching in single byte encoding to speed things up. */
801 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
802 optimize_utf8 (dfa);
803 #endif
805 /* Then create the initial state of the dfa. */
806 err = create_initial_state (dfa);
808 /* Release work areas. */
809 free_workarea_compile (preg);
810 re_string_destruct (&regexp);
812 if (__glibc_unlikely (err != REG_NOERROR))
814 lock_fini (dfa->lock);
815 free_dfa_content (dfa);
816 preg->buffer = NULL;
817 preg->allocated = 0;
820 return err;
823 /* Initialize DFA. We use the length of the regular expression PAT_LEN
824 as the initial length of some arrays. */
826 static reg_errcode_t
827 init_dfa (re_dfa_t *dfa, size_t pat_len)
829 __re_size_t table_size;
830 #ifndef _LIBC
831 const char *codeset_name;
832 #endif
833 #ifdef RE_ENABLE_I18N
834 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
835 #else
836 size_t max_i18n_object_size = 0;
837 #endif
838 size_t max_object_size =
839 MAX (sizeof (struct re_state_table_entry),
840 MAX (sizeof (re_token_t),
841 MAX (sizeof (re_node_set),
842 MAX (sizeof (regmatch_t),
843 max_i18n_object_size))));
845 memset (dfa, '\0', sizeof (re_dfa_t));
847 /* Force allocation of str_tree_storage the first time. */
848 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
850 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
851 calculation below, and for similar doubling calculations
852 elsewhere. And it's <= rather than <, because some of the
853 doubling calculations add 1 afterwards. */
854 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
855 <= pat_len))
856 return REG_ESPACE;
858 dfa->nodes_alloc = pat_len + 1;
859 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
861 /* table_size = 2 ^ ceil(log pat_len) */
862 for (table_size = 1; ; table_size <<= 1)
863 if (table_size > pat_len)
864 break;
866 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
867 dfa->state_hash_mask = table_size - 1;
869 dfa->mb_cur_max = MB_CUR_MAX;
870 #ifdef _LIBC
871 if (dfa->mb_cur_max == 6
872 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
873 dfa->is_utf8 = 1;
874 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
875 != 0);
876 #else
877 # ifdef _MSC_VER
878 (void)codeset_name;
879 if (get_crt_codepage() == CP_UTF8)
880 # else
881 codeset_name = nl_langinfo (CODESET);
882 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
883 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
884 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
885 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
886 # endif
887 dfa->is_utf8 = 1;
889 /* We check exhaustively in the loop below if this charset is a
890 superset of ASCII. */
891 dfa->map_notascii = 0;
892 #endif
894 #ifdef RE_ENABLE_I18N
895 if (dfa->mb_cur_max > 1)
897 if (dfa->is_utf8)
898 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
899 else
901 int i, j, ch;
903 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
904 if (__glibc_unlikely (dfa->sb_char == NULL))
905 return REG_ESPACE;
907 /* Set the bits corresponding to single byte chars. */
908 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
909 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
911 wint_t wch = __btowc (ch);
912 if (wch != WEOF)
913 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
914 # ifndef _LIBC
915 if (isascii (ch) && wch != ch)
916 dfa->map_notascii = 1;
917 # endif
921 #endif
923 if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
924 return REG_ESPACE;
925 return REG_NOERROR;
928 /* Initialize WORD_CHAR table, which indicate which character is
929 "word". In this case "word" means that it is the word construction
930 character used by some operators like "\<", "\>", etc. */
932 static void
933 init_word_char (re_dfa_t *dfa)
935 int i = 0;
936 int j;
937 int ch = 0;
938 dfa->word_ops_used = 1;
939 if (__glibc_likely (dfa->map_notascii == 0))
941 /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
942 them, an issue when this code is used in Gnulib. */
943 bitset_word_t bits0 = 0x00000000;
944 bitset_word_t bits1 = 0x03ff0000;
945 bitset_word_t bits2 = 0x87fffffe;
946 bitset_word_t bits3 = 0x07fffffe;
947 if (BITSET_WORD_BITS == 64)
949 /* Pacify gcc -Woverflow on 32-bit platformns. */
950 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
951 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
952 i = 2;
954 else if (BITSET_WORD_BITS == 32)
956 dfa->word_char[0] = bits0;
957 dfa->word_char[1] = bits1;
958 dfa->word_char[2] = bits2;
959 dfa->word_char[3] = bits3;
960 i = 4;
962 else
963 goto general_case;
964 ch = 128;
966 if (__glibc_likely (dfa->is_utf8))
968 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
969 return;
973 general_case:
974 for (; i < BITSET_WORDS; ++i)
975 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
976 if (isalnum (ch) || ch == '_')
977 dfa->word_char[i] |= (bitset_word_t) 1 << j;
980 /* Free the work area which are only used while compiling. */
982 static void
983 free_workarea_compile (regex_t *preg)
985 re_dfa_t *dfa = preg->buffer;
986 bin_tree_storage_t *storage, *next;
987 for (storage = dfa->str_tree_storage; storage; storage = next)
989 next = storage->next;
990 re_free (storage);
992 dfa->str_tree_storage = NULL;
993 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
994 dfa->str_tree = NULL;
995 re_free (dfa->org_indices);
996 dfa->org_indices = NULL;
999 /* Create initial states for all contexts. */
1001 static reg_errcode_t
1002 create_initial_state (re_dfa_t *dfa)
1004 Idx first, i;
1005 reg_errcode_t err;
1006 re_node_set init_nodes;
1008 /* Initial states have the epsilon closure of the node which is
1009 the first node of the regular expression. */
1010 first = dfa->str_tree->first->node_idx;
1011 dfa->init_node = first;
1012 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1013 if (__glibc_unlikely (err != REG_NOERROR))
1014 return err;
1016 /* The back-references which are in initial states can epsilon transit,
1017 since in this case all of the subexpressions can be null.
1018 Then we add epsilon closures of the nodes which are the next nodes of
1019 the back-references. */
1020 if (dfa->nbackref > 0)
1021 for (i = 0; i < init_nodes.nelem; ++i)
1023 Idx node_idx = init_nodes.elems[i];
1024 re_token_type_t type = dfa->nodes[node_idx].type;
1026 Idx clexp_idx;
1027 if (type != OP_BACK_REF)
1028 continue;
1029 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1031 re_token_t *clexp_node;
1032 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1033 if (clexp_node->type == OP_CLOSE_SUBEXP
1034 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1035 break;
1037 if (clexp_idx == init_nodes.nelem)
1038 continue;
1040 if (type == OP_BACK_REF)
1042 Idx dest_idx = dfa->edests[node_idx].elems[0];
1043 if (!re_node_set_contains (&init_nodes, dest_idx))
1045 reg_errcode_t merge_err
1046 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1047 if (merge_err != REG_NOERROR)
1048 return merge_err;
1049 i = 0;
1054 /* It must be the first time to invoke acquire_state. */
1055 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1056 /* We don't check ERR here, since the initial state must not be NULL. */
1057 if (__glibc_unlikely (dfa->init_state == NULL))
1058 return err;
1059 if (dfa->init_state->has_constraint)
1061 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1062 CONTEXT_WORD);
1063 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1064 CONTEXT_NEWLINE);
1065 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1066 &init_nodes,
1067 CONTEXT_NEWLINE
1068 | CONTEXT_BEGBUF);
1069 if (__glibc_unlikely (dfa->init_state_word == NULL
1070 || dfa->init_state_nl == NULL
1071 || dfa->init_state_begbuf == NULL))
1072 return err;
1074 else
1075 dfa->init_state_word = dfa->init_state_nl
1076 = dfa->init_state_begbuf = dfa->init_state;
1078 re_node_set_free (&init_nodes);
1079 return REG_NOERROR;
1082 #ifdef RE_ENABLE_I18N
1083 /* If it is possible to do searching in single byte encoding instead of UTF-8
1084 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1085 DFA nodes where needed. */
1087 static void
1088 optimize_utf8 (re_dfa_t *dfa)
1090 Idx node;
1091 int i;
1092 bool mb_chars = false;
1093 bool has_period = false;
1095 for (node = 0; node < dfa->nodes_len; ++node)
1096 switch (dfa->nodes[node].type)
1098 case CHARACTER:
1099 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1100 mb_chars = true;
1101 break;
1102 case ANCHOR:
1103 switch (dfa->nodes[node].opr.ctx_type)
1105 case LINE_FIRST:
1106 case LINE_LAST:
1107 case BUF_FIRST:
1108 case BUF_LAST:
1109 break;
1110 default:
1111 /* Word anchors etc. cannot be handled. It's okay to test
1112 opr.ctx_type since constraints (for all DFA nodes) are
1113 created by ORing one or more opr.ctx_type values. */
1114 return;
1116 break;
1117 case OP_PERIOD:
1118 has_period = true;
1119 break;
1120 case OP_BACK_REF:
1121 case OP_ALT:
1122 case END_OF_RE:
1123 case OP_DUP_ASTERISK:
1124 case OP_OPEN_SUBEXP:
1125 case OP_CLOSE_SUBEXP:
1126 break;
1127 case COMPLEX_BRACKET:
1128 return;
1129 case SIMPLE_BRACKET:
1130 /* Just double check. */
1132 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1134 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1135 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1137 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1138 return;
1139 rshift = 0;
1142 break;
1143 default:
1144 abort ();
1147 if (mb_chars || has_period)
1148 for (node = 0; node < dfa->nodes_len; ++node)
1150 if (dfa->nodes[node].type == CHARACTER
1151 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1152 dfa->nodes[node].mb_partial = 0;
1153 else if (dfa->nodes[node].type == OP_PERIOD)
1154 dfa->nodes[node].type = OP_UTF8_PERIOD;
1157 /* The search can be in single byte locale. */
1158 dfa->mb_cur_max = 1;
1159 dfa->is_utf8 = 0;
1160 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1162 #endif
1164 /* Analyze the structure tree, and calculate "first", "next", "edest",
1165 "eclosure", and "inveclosure". */
1167 static reg_errcode_t
1168 analyze (regex_t *preg)
1170 re_dfa_t *dfa = preg->buffer;
1171 reg_errcode_t ret;
1173 /* Allocate arrays. */
1174 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1175 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1176 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1177 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1178 if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1179 || dfa->edests == NULL || dfa->eclosures == NULL))
1180 return REG_ESPACE;
1182 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1183 if (dfa->subexp_map != NULL)
1185 Idx i;
1186 for (i = 0; i < preg->re_nsub; i++)
1187 dfa->subexp_map[i] = i;
1188 preorder (dfa->str_tree, optimize_subexps, dfa);
1189 for (i = 0; i < preg->re_nsub; i++)
1190 if (dfa->subexp_map[i] != i)
1191 break;
1192 if (i == preg->re_nsub)
1194 re_free (dfa->subexp_map);
1195 dfa->subexp_map = NULL;
1199 ret = postorder (dfa->str_tree, lower_subexps, preg);
1200 if (__glibc_unlikely (ret != REG_NOERROR))
1201 return ret;
1202 ret = postorder (dfa->str_tree, calc_first, dfa);
1203 if (__glibc_unlikely (ret != REG_NOERROR))
1204 return ret;
1205 preorder (dfa->str_tree, calc_next, dfa);
1206 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1207 if (__glibc_unlikely (ret != REG_NOERROR))
1208 return ret;
1209 ret = calc_eclosure (dfa);
1210 if (__glibc_unlikely (ret != REG_NOERROR))
1211 return ret;
1213 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1214 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1215 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1216 || dfa->nbackref)
1218 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1219 if (__glibc_unlikely (dfa->inveclosures == NULL))
1220 return REG_ESPACE;
1221 ret = calc_inveclosure (dfa);
1224 return ret;
1227 /* Our parse trees are very unbalanced, so we cannot use a stack to
1228 implement parse tree visits. Instead, we use parent pointers and
1229 some hairy code in these two functions. */
1230 static reg_errcode_t
1231 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1232 void *extra)
1234 bin_tree_t *node, *prev;
1236 for (node = root; ; )
1238 /* Descend down the tree, preferably to the left (or to the right
1239 if that's the only child). */
1240 while (node->left || node->right)
1241 if (node->left)
1242 node = node->left;
1243 else
1244 node = node->right;
1248 reg_errcode_t err = fn (extra, node);
1249 if (__glibc_unlikely (err != REG_NOERROR))
1250 return err;
1251 if (node->parent == NULL)
1252 return REG_NOERROR;
1253 prev = node;
1254 node = node->parent;
1256 /* Go up while we have a node that is reached from the right. */
1257 while (node->right == prev || node->right == NULL);
1258 node = node->right;
1262 static reg_errcode_t
1263 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1264 void *extra)
1266 bin_tree_t *node;
1268 for (node = root; ; )
1270 reg_errcode_t err = fn (extra, node);
1271 if (__glibc_unlikely (err != REG_NOERROR))
1272 return err;
1274 /* Go to the left node, or up and to the right. */
1275 if (node->left)
1276 node = node->left;
1277 else
1279 bin_tree_t *prev = NULL;
1280 while (node->right == prev || node->right == NULL)
1282 prev = node;
1283 node = node->parent;
1284 if (!node)
1285 return REG_NOERROR;
1287 node = node->right;
1292 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1293 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1294 backreferences as well. Requires a preorder visit. */
1295 static reg_errcode_t
1296 optimize_subexps (void *extra, bin_tree_t *node)
1298 re_dfa_t *dfa = (re_dfa_t *) extra;
1300 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1302 int idx = node->token.opr.idx;
1303 node->token.opr.idx = dfa->subexp_map[idx];
1304 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1307 else if (node->token.type == SUBEXP
1308 && node->left && node->left->token.type == SUBEXP)
1310 Idx other_idx = node->left->token.opr.idx;
1312 node->left = node->left->left;
1313 if (node->left)
1314 node->left->parent = node;
1316 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1317 if (other_idx < BITSET_WORD_BITS)
1318 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1321 return REG_NOERROR;
1324 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1325 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1326 static reg_errcode_t
1327 lower_subexps (void *extra, bin_tree_t *node)
1329 regex_t *preg = (regex_t *) extra;
1330 reg_errcode_t err = REG_NOERROR;
1332 if (node->left && node->left->token.type == SUBEXP)
1334 node->left = lower_subexp (&err, preg, node->left);
1335 if (node->left)
1336 node->left->parent = node;
1338 if (node->right && node->right->token.type == SUBEXP)
1340 node->right = lower_subexp (&err, preg, node->right);
1341 if (node->right)
1342 node->right->parent = node;
1345 return err;
1348 static bin_tree_t *
1349 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1351 re_dfa_t *dfa = preg->buffer;
1352 bin_tree_t *body = node->left;
1353 bin_tree_t *op, *cls, *tree1, *tree;
1355 if (preg->no_sub
1356 /* We do not optimize empty subexpressions, because otherwise we may
1357 have bad CONCAT nodes with NULL children. This is obviously not
1358 very common, so we do not lose much. An example that triggers
1359 this case is the sed "script" /\(\)/x. */
1360 && node->left != NULL
1361 && (node->token.opr.idx >= BITSET_WORD_BITS
1362 || !(dfa->used_bkref_map
1363 & ((bitset_word_t) 1 << node->token.opr.idx))))
1364 return node->left;
1366 /* Convert the SUBEXP node to the concatenation of an
1367 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1368 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1369 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1370 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1371 tree = create_tree (dfa, op, tree1, CONCAT);
1372 if (__glibc_unlikely (tree == NULL || tree1 == NULL
1373 || op == NULL || cls == NULL))
1375 *err = REG_ESPACE;
1376 return NULL;
1379 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1380 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1381 return tree;
1384 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1385 nodes. Requires a postorder visit. */
1386 static reg_errcode_t
1387 calc_first (void *extra, bin_tree_t *node)
1389 re_dfa_t *dfa = (re_dfa_t *) extra;
1390 if (node->token.type == CONCAT)
1392 node->first = node->left->first;
1393 node->node_idx = node->left->node_idx;
1395 else
1397 node->first = node;
1398 node->node_idx = re_dfa_add_node (dfa, node->token);
1399 if (__glibc_unlikely (node->node_idx == -1))
1400 return REG_ESPACE;
1401 if (node->token.type == ANCHOR)
1402 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1404 return REG_NOERROR;
1407 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1408 static reg_errcode_t
1409 calc_next (void *extra, bin_tree_t *node)
1411 switch (node->token.type)
1413 case OP_DUP_ASTERISK:
1414 node->left->next = node;
1415 break;
1416 case CONCAT:
1417 node->left->next = node->right->first;
1418 node->right->next = node->next;
1419 break;
1420 default:
1421 if (node->left)
1422 node->left->next = node->next;
1423 if (node->right)
1424 node->right->next = node->next;
1425 break;
1427 return REG_NOERROR;
1430 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1431 static reg_errcode_t
1432 link_nfa_nodes (void *extra, bin_tree_t *node)
1434 re_dfa_t *dfa = (re_dfa_t *) extra;
1435 Idx idx = node->node_idx;
1436 reg_errcode_t err = REG_NOERROR;
1438 switch (node->token.type)
1440 case CONCAT:
1441 break;
1443 case END_OF_RE:
1444 DEBUG_ASSERT (node->next == NULL);
1445 break;
1447 case OP_DUP_ASTERISK:
1448 case OP_ALT:
1450 Idx left, right;
1451 dfa->has_plural_match = 1;
1452 if (node->left != NULL)
1453 left = node->left->first->node_idx;
1454 else
1455 left = node->next->node_idx;
1456 if (node->right != NULL)
1457 right = node->right->first->node_idx;
1458 else
1459 right = node->next->node_idx;
1460 DEBUG_ASSERT (left > -1);
1461 DEBUG_ASSERT (right > -1);
1462 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1464 break;
1466 case ANCHOR:
1467 case OP_OPEN_SUBEXP:
1468 case OP_CLOSE_SUBEXP:
1469 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1470 break;
1472 case OP_BACK_REF:
1473 dfa->nexts[idx] = node->next->node_idx;
1474 if (node->token.type == OP_BACK_REF)
1475 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1476 break;
1478 default:
1479 DEBUG_ASSERT (!IS_EPSILON_NODE (node->token.type));
1480 dfa->nexts[idx] = node->next->node_idx;
1481 break;
1484 return err;
1487 /* Duplicate the epsilon closure of the node ROOT_NODE.
1488 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1489 to their own constraint. */
1491 static reg_errcode_t
1492 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1493 Idx root_node, unsigned int init_constraint)
1495 Idx org_node, clone_node;
1496 bool ok;
1497 unsigned int constraint = init_constraint;
1498 for (org_node = top_org_node, clone_node = top_clone_node;;)
1500 Idx org_dest, clone_dest;
1501 if (dfa->nodes[org_node].type == OP_BACK_REF)
1503 /* If the back reference epsilon-transit, its destination must
1504 also have the constraint. Then duplicate the epsilon closure
1505 of the destination of the back reference, and store it in
1506 edests of the back reference. */
1507 org_dest = dfa->nexts[org_node];
1508 re_node_set_empty (dfa->edests + clone_node);
1509 clone_dest = duplicate_node (dfa, org_dest, constraint);
1510 if (__glibc_unlikely (clone_dest == -1))
1511 return REG_ESPACE;
1512 dfa->nexts[clone_node] = dfa->nexts[org_node];
1513 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1514 if (__glibc_unlikely (! ok))
1515 return REG_ESPACE;
1517 else if (dfa->edests[org_node].nelem == 0)
1519 /* In case of the node can't epsilon-transit, don't duplicate the
1520 destination and store the original destination as the
1521 destination of the node. */
1522 dfa->nexts[clone_node] = dfa->nexts[org_node];
1523 break;
1525 else if (dfa->edests[org_node].nelem == 1)
1527 /* In case of the node can epsilon-transit, and it has only one
1528 destination. */
1529 org_dest = dfa->edests[org_node].elems[0];
1530 re_node_set_empty (dfa->edests + clone_node);
1531 /* If the node is root_node itself, it means the epsilon closure
1532 has a loop. Then tie it to the destination of the root_node. */
1533 if (org_node == root_node && clone_node != org_node)
1535 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1536 if (__glibc_unlikely (! ok))
1537 return REG_ESPACE;
1538 break;
1540 /* In case the node has another constraint, append it. */
1541 constraint |= dfa->nodes[org_node].constraint;
1542 clone_dest = duplicate_node (dfa, org_dest, constraint);
1543 if (__glibc_unlikely (clone_dest == -1))
1544 return REG_ESPACE;
1545 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1546 if (__glibc_unlikely (! ok))
1547 return REG_ESPACE;
1549 else /* dfa->edests[org_node].nelem == 2 */
1551 /* In case of the node can epsilon-transit, and it has two
1552 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1553 org_dest = dfa->edests[org_node].elems[0];
1554 re_node_set_empty (dfa->edests + clone_node);
1555 /* Search for a duplicated node which satisfies the constraint. */
1556 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1557 if (clone_dest == -1)
1559 /* There is no such duplicated node, create a new one. */
1560 reg_errcode_t err;
1561 clone_dest = duplicate_node (dfa, org_dest, constraint);
1562 if (__glibc_unlikely (clone_dest == -1))
1563 return REG_ESPACE;
1564 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1565 if (__glibc_unlikely (! ok))
1566 return REG_ESPACE;
1567 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1568 root_node, constraint);
1569 if (__glibc_unlikely (err != REG_NOERROR))
1570 return err;
1572 else
1574 /* There is a duplicated node which satisfies the constraint,
1575 use it to avoid infinite loop. */
1576 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1577 if (__glibc_unlikely (! ok))
1578 return REG_ESPACE;
1581 org_dest = dfa->edests[org_node].elems[1];
1582 clone_dest = duplicate_node (dfa, org_dest, constraint);
1583 if (__glibc_unlikely (clone_dest == -1))
1584 return REG_ESPACE;
1585 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1586 if (__glibc_unlikely (! ok))
1587 return REG_ESPACE;
1589 org_node = org_dest;
1590 clone_node = clone_dest;
1592 return REG_NOERROR;
1595 /* Search for a node which is duplicated from the node ORG_NODE, and
1596 satisfies the constraint CONSTRAINT. */
1598 static Idx
1599 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1600 unsigned int constraint)
1602 Idx idx;
1603 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1605 if (org_node == dfa->org_indices[idx]
1606 && constraint == dfa->nodes[idx].constraint)
1607 return idx; /* Found. */
1609 return -1; /* Not found. */
1612 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1613 Return the index of the new node, or -1 if insufficient storage is
1614 available. */
1616 static Idx
1617 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1619 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1620 if (__glibc_likely (dup_idx != -1))
1622 dfa->nodes[dup_idx].constraint = constraint;
1623 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1624 dfa->nodes[dup_idx].duplicated = 1;
1626 /* Store the index of the original node. */
1627 dfa->org_indices[dup_idx] = org_idx;
1629 return dup_idx;
1632 static reg_errcode_t
1633 calc_inveclosure (re_dfa_t *dfa)
1635 Idx src, idx;
1636 bool ok;
1637 for (idx = 0; idx < dfa->nodes_len; ++idx)
1638 re_node_set_init_empty (dfa->inveclosures + idx);
1640 for (src = 0; src < dfa->nodes_len; ++src)
1642 Idx *elems = dfa->eclosures[src].elems;
1643 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1645 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1646 if (__glibc_unlikely (! ok))
1647 return REG_ESPACE;
1651 return REG_NOERROR;
1654 /* Calculate "eclosure" for all the node in DFA. */
1656 static reg_errcode_t
1657 calc_eclosure (re_dfa_t *dfa)
1659 Idx node_idx;
1660 bool incomplete;
1661 DEBUG_ASSERT (dfa->nodes_len > 0);
1662 incomplete = false;
1663 /* For each nodes, calculate epsilon closure. */
1664 for (node_idx = 0; ; ++node_idx)
1666 reg_errcode_t err;
1667 re_node_set eclosure_elem;
1668 if (node_idx == dfa->nodes_len)
1670 if (!incomplete)
1671 break;
1672 incomplete = false;
1673 node_idx = 0;
1676 DEBUG_ASSERT (dfa->eclosures[node_idx].nelem != -1);
1678 /* If we have already calculated, skip it. */
1679 if (dfa->eclosures[node_idx].nelem != 0)
1680 continue;
1681 /* Calculate epsilon closure of 'node_idx'. */
1682 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1683 if (__glibc_unlikely (err != REG_NOERROR))
1684 return err;
1686 if (dfa->eclosures[node_idx].nelem == 0)
1688 incomplete = true;
1689 re_node_set_free (&eclosure_elem);
1692 return REG_NOERROR;
1695 /* Calculate epsilon closure of NODE. */
1697 static reg_errcode_t
1698 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1700 reg_errcode_t err;
1701 Idx i;
1702 re_node_set eclosure;
1703 bool incomplete = false;
1704 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1705 if (__glibc_unlikely (err != REG_NOERROR))
1706 return err;
1708 /* An epsilon closure includes itself. */
1709 eclosure.elems[eclosure.nelem++] = node;
1711 /* This indicates that we are calculating this node now.
1712 We reference this value to avoid infinite loop. */
1713 dfa->eclosures[node].nelem = -1;
1715 /* If the current node has constraints, duplicate all nodes
1716 since they must inherit the constraints. */
1717 if (dfa->nodes[node].constraint
1718 && dfa->edests[node].nelem
1719 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1721 err = duplicate_node_closure (dfa, node, node, node,
1722 dfa->nodes[node].constraint);
1723 if (__glibc_unlikely (err != REG_NOERROR))
1724 return err;
1727 /* Expand each epsilon destination nodes. */
1728 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1729 for (i = 0; i < dfa->edests[node].nelem; ++i)
1731 re_node_set eclosure_elem;
1732 Idx edest = dfa->edests[node].elems[i];
1733 /* If calculating the epsilon closure of 'edest' is in progress,
1734 return intermediate result. */
1735 if (dfa->eclosures[edest].nelem == -1)
1737 incomplete = true;
1738 continue;
1740 /* If we haven't calculated the epsilon closure of 'edest' yet,
1741 calculate now. Otherwise use calculated epsilon closure. */
1742 if (dfa->eclosures[edest].nelem == 0)
1744 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1745 if (__glibc_unlikely (err != REG_NOERROR))
1746 return err;
1748 else
1749 eclosure_elem = dfa->eclosures[edest];
1750 /* Merge the epsilon closure of 'edest'. */
1751 err = re_node_set_merge (&eclosure, &eclosure_elem);
1752 if (__glibc_unlikely (err != REG_NOERROR))
1753 return err;
1754 /* If the epsilon closure of 'edest' is incomplete,
1755 the epsilon closure of this node is also incomplete. */
1756 if (dfa->eclosures[edest].nelem == 0)
1758 incomplete = true;
1759 re_node_set_free (&eclosure_elem);
1763 if (incomplete && !root)
1764 dfa->eclosures[node].nelem = 0;
1765 else
1766 dfa->eclosures[node] = eclosure;
1767 *new_set = eclosure;
1768 return REG_NOERROR;
1771 /* Functions for token which are used in the parser. */
1773 /* Fetch a token from INPUT.
1774 We must not use this function inside bracket expressions. */
1776 static void
1777 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1779 re_string_skip_bytes (input, peek_token (result, input, syntax));
1782 /* Peek a token from INPUT, and return the length of the token.
1783 We must not use this function inside bracket expressions. */
1785 static int
1786 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1788 unsigned char c;
1790 if (re_string_eoi (input))
1792 token->type = END_OF_RE;
1793 return 0;
1796 c = re_string_peek_byte (input, 0);
1797 token->opr.c = c;
1799 token->word_char = 0;
1800 #ifdef RE_ENABLE_I18N
1801 token->mb_partial = 0;
1802 if (input->mb_cur_max > 1
1803 && !re_string_first_byte (input, re_string_cur_idx (input)))
1805 token->type = CHARACTER;
1806 token->mb_partial = 1;
1807 return 1;
1809 #endif
1810 if (c == '\\')
1812 unsigned char c2;
1813 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1815 token->type = BACK_SLASH;
1816 return 1;
1819 c2 = re_string_peek_byte_case (input, 1);
1820 token->opr.c = c2;
1821 token->type = CHARACTER;
1822 #ifdef RE_ENABLE_I18N
1823 if (input->mb_cur_max > 1)
1825 wint_t wc = re_string_wchar_at (input,
1826 re_string_cur_idx (input) + 1);
1827 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1829 else
1830 #endif
1831 token->word_char = IS_WORD_CHAR (c2) != 0;
1833 switch (c2)
1835 case '|':
1836 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1837 token->type = OP_ALT;
1838 break;
1839 case '1': case '2': case '3': case '4': case '5':
1840 case '6': case '7': case '8': case '9':
1841 if (!(syntax & RE_NO_BK_REFS))
1843 token->type = OP_BACK_REF;
1844 token->opr.idx = c2 - '1';
1846 break;
1847 case '<':
1848 if (!(syntax & RE_NO_GNU_OPS))
1850 token->type = ANCHOR;
1851 token->opr.ctx_type = WORD_FIRST;
1853 break;
1854 case '>':
1855 if (!(syntax & RE_NO_GNU_OPS))
1857 token->type = ANCHOR;
1858 token->opr.ctx_type = WORD_LAST;
1860 break;
1861 case 'b':
1862 if (!(syntax & RE_NO_GNU_OPS))
1864 token->type = ANCHOR;
1865 token->opr.ctx_type = WORD_DELIM;
1867 break;
1868 case 'B':
1869 if (!(syntax & RE_NO_GNU_OPS))
1871 token->type = ANCHOR;
1872 token->opr.ctx_type = NOT_WORD_DELIM;
1874 break;
1875 case 'w':
1876 if (!(syntax & RE_NO_GNU_OPS))
1877 token->type = OP_WORD;
1878 break;
1879 case 'W':
1880 if (!(syntax & RE_NO_GNU_OPS))
1881 token->type = OP_NOTWORD;
1882 break;
1883 case 's':
1884 if (!(syntax & RE_NO_GNU_OPS))
1885 token->type = OP_SPACE;
1886 break;
1887 case 'S':
1888 if (!(syntax & RE_NO_GNU_OPS))
1889 token->type = OP_NOTSPACE;
1890 break;
1891 case '`':
1892 if (!(syntax & RE_NO_GNU_OPS))
1894 token->type = ANCHOR;
1895 token->opr.ctx_type = BUF_FIRST;
1897 break;
1898 case '\'':
1899 if (!(syntax & RE_NO_GNU_OPS))
1901 token->type = ANCHOR;
1902 token->opr.ctx_type = BUF_LAST;
1904 break;
1905 case '(':
1906 if (!(syntax & RE_NO_BK_PARENS))
1907 token->type = OP_OPEN_SUBEXP;
1908 break;
1909 case ')':
1910 if (!(syntax & RE_NO_BK_PARENS))
1911 token->type = OP_CLOSE_SUBEXP;
1912 break;
1913 case '+':
1914 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1915 token->type = OP_DUP_PLUS;
1916 break;
1917 case '?':
1918 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1919 token->type = OP_DUP_QUESTION;
1920 break;
1921 case '{':
1922 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1923 token->type = OP_OPEN_DUP_NUM;
1924 break;
1925 case '}':
1926 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1927 token->type = OP_CLOSE_DUP_NUM;
1928 break;
1929 default:
1930 break;
1932 return 2;
1935 token->type = CHARACTER;
1936 #ifdef RE_ENABLE_I18N
1937 if (input->mb_cur_max > 1)
1939 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1940 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1942 else
1943 #endif
1944 token->word_char = IS_WORD_CHAR (token->opr.c);
1946 switch (c)
1948 case '\n':
1949 if (syntax & RE_NEWLINE_ALT)
1950 token->type = OP_ALT;
1951 break;
1952 case '|':
1953 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1954 token->type = OP_ALT;
1955 break;
1956 case '*':
1957 token->type = OP_DUP_ASTERISK;
1958 break;
1959 case '+':
1960 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1961 token->type = OP_DUP_PLUS;
1962 break;
1963 case '?':
1964 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1965 token->type = OP_DUP_QUESTION;
1966 break;
1967 case '{':
1968 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1969 token->type = OP_OPEN_DUP_NUM;
1970 break;
1971 case '}':
1972 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1973 token->type = OP_CLOSE_DUP_NUM;
1974 break;
1975 case '(':
1976 if (syntax & RE_NO_BK_PARENS)
1977 token->type = OP_OPEN_SUBEXP;
1978 break;
1979 case ')':
1980 if (syntax & RE_NO_BK_PARENS)
1981 token->type = OP_CLOSE_SUBEXP;
1982 break;
1983 case '[':
1984 token->type = OP_OPEN_BRACKET;
1985 break;
1986 case '.':
1987 token->type = OP_PERIOD;
1988 break;
1989 case '^':
1990 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE))
1991 && re_string_cur_idx (input) != 0)
1993 char prev = re_string_peek_byte (input, -1);
1994 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1995 break;
1997 token->type = ANCHOR;
1998 token->opr.ctx_type = LINE_FIRST;
1999 break;
2000 case '$':
2001 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS)
2002 && re_string_cur_idx (input) + 1 != re_string_length (input))
2004 re_token_t next;
2005 re_string_skip_bytes (input, 1);
2006 peek_token (&next, input, syntax);
2007 re_string_skip_bytes (input, -1);
2008 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2009 break;
2011 token->type = ANCHOR;
2012 token->opr.ctx_type = LINE_LAST;
2013 break;
2014 default:
2015 break;
2017 return 1;
2020 /* Peek a token from INPUT, and return the length of the token.
2021 We must not use this function out of bracket expressions. */
2023 static int
2024 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2026 unsigned char c;
2027 if (re_string_eoi (input))
2029 token->type = END_OF_RE;
2030 return 0;
2032 c = re_string_peek_byte (input, 0);
2033 token->opr.c = c;
2035 #ifdef RE_ENABLE_I18N
2036 if (input->mb_cur_max > 1
2037 && !re_string_first_byte (input, re_string_cur_idx (input)))
2039 token->type = CHARACTER;
2040 return 1;
2042 #endif /* RE_ENABLE_I18N */
2044 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2045 && re_string_cur_idx (input) + 1 < re_string_length (input))
2047 /* In this case, '\' escape a character. */
2048 unsigned char c2;
2049 re_string_skip_bytes (input, 1);
2050 c2 = re_string_peek_byte (input, 0);
2051 token->opr.c = c2;
2052 token->type = CHARACTER;
2053 return 1;
2055 if (c == '[') /* '[' is a special char in a bracket exps. */
2057 unsigned char c2;
2058 int token_len;
2059 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2060 c2 = re_string_peek_byte (input, 1);
2061 else
2062 c2 = 0;
2063 token->opr.c = c2;
2064 token_len = 2;
2065 switch (c2)
2067 case '.':
2068 token->type = OP_OPEN_COLL_ELEM;
2069 break;
2071 case '=':
2072 token->type = OP_OPEN_EQUIV_CLASS;
2073 break;
2075 case ':':
2076 if (syntax & RE_CHAR_CLASSES)
2078 token->type = OP_OPEN_CHAR_CLASS;
2079 break;
2081 FALLTHROUGH;
2082 default:
2083 token->type = CHARACTER;
2084 token->opr.c = c;
2085 token_len = 1;
2086 break;
2088 return token_len;
2090 switch (c)
2092 case '-':
2093 token->type = OP_CHARSET_RANGE;
2094 break;
2095 case ']':
2096 token->type = OP_CLOSE_BRACKET;
2097 break;
2098 case '^':
2099 token->type = OP_NON_MATCH_LIST;
2100 break;
2101 default:
2102 token->type = CHARACTER;
2104 return 1;
2107 /* Functions for parser. */
2109 /* Entry point of the parser.
2110 Parse the regular expression REGEXP and return the structure tree.
2111 If an error occurs, ERR is set by error code, and return NULL.
2112 This function build the following tree, from regular expression <reg_exp>:
2116 <reg_exp> EOR
2118 CAT means concatenation.
2119 EOR means end of regular expression. */
2121 static bin_tree_t *
2122 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2123 reg_errcode_t *err)
2125 re_dfa_t *dfa = preg->buffer;
2126 bin_tree_t *tree, *eor, *root;
2127 re_token_t current_token;
2128 dfa->syntax = syntax;
2129 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2130 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2131 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2132 return NULL;
2133 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2134 if (tree != NULL)
2135 root = create_tree (dfa, tree, eor, CONCAT);
2136 else
2137 root = eor;
2138 if (__glibc_unlikely (eor == NULL || root == NULL))
2140 *err = REG_ESPACE;
2141 return NULL;
2143 return root;
2146 /* This function build the following tree, from regular expression
2147 <branch1>|<branch2>:
2151 <branch1> <branch2>
2153 ALT means alternative, which represents the operator '|'. */
2155 static bin_tree_t *
2156 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2157 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2159 re_dfa_t *dfa = preg->buffer;
2160 bin_tree_t *tree, *branch = NULL;
2161 bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2162 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2163 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2164 return NULL;
2166 while (token->type == OP_ALT)
2168 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2169 if (token->type != OP_ALT && token->type != END_OF_RE
2170 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2172 bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2173 dfa->completed_bkref_map = initial_bkref_map;
2174 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2175 if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2177 if (tree != NULL)
2178 postorder (tree, free_tree, NULL);
2179 return NULL;
2181 dfa->completed_bkref_map |= accumulated_bkref_map;
2183 else
2184 branch = NULL;
2185 tree = create_tree (dfa, tree, branch, OP_ALT);
2186 if (__glibc_unlikely (tree == NULL))
2188 *err = REG_ESPACE;
2189 return NULL;
2192 return tree;
2195 /* This function build the following tree, from regular expression
2196 <exp1><exp2>:
2200 <exp1> <exp2>
2202 CAT means concatenation. */
2204 static bin_tree_t *
2205 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2206 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2208 bin_tree_t *tree, *expr;
2209 re_dfa_t *dfa = preg->buffer;
2210 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2211 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2212 return NULL;
2214 while (token->type != OP_ALT && token->type != END_OF_RE
2215 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2217 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2218 if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2220 if (tree != NULL)
2221 postorder (tree, free_tree, NULL);
2222 return NULL;
2224 if (tree != NULL && expr != NULL)
2226 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2227 if (newtree == NULL)
2229 postorder (expr, free_tree, NULL);
2230 postorder (tree, free_tree, NULL);
2231 *err = REG_ESPACE;
2232 return NULL;
2234 tree = newtree;
2236 else if (tree == NULL)
2237 tree = expr;
2238 /* Otherwise expr == NULL, we don't need to create new tree. */
2240 return tree;
2243 /* This function build the following tree, from regular expression a*:
2249 static bin_tree_t *
2250 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2251 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2253 re_dfa_t *dfa = preg->buffer;
2254 bin_tree_t *tree;
2255 switch (token->type)
2257 case CHARACTER:
2258 tree = create_token_tree (dfa, NULL, NULL, token);
2259 if (__glibc_unlikely (tree == NULL))
2261 *err = REG_ESPACE;
2262 return NULL;
2264 #ifdef RE_ENABLE_I18N
2265 if (dfa->mb_cur_max > 1)
2267 while (!re_string_eoi (regexp)
2268 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2270 bin_tree_t *mbc_remain;
2271 fetch_token (token, regexp, syntax);
2272 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2273 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2274 if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
2276 *err = REG_ESPACE;
2277 return NULL;
2281 #endif
2282 break;
2284 case OP_OPEN_SUBEXP:
2285 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2286 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2287 return NULL;
2288 break;
2290 case OP_OPEN_BRACKET:
2291 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2292 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2293 return NULL;
2294 break;
2296 case OP_BACK_REF:
2297 if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2299 *err = REG_ESUBREG;
2300 return NULL;
2302 dfa->used_bkref_map |= 1 << token->opr.idx;
2303 tree = create_token_tree (dfa, NULL, NULL, token);
2304 if (__glibc_unlikely (tree == NULL))
2306 *err = REG_ESPACE;
2307 return NULL;
2309 ++dfa->nbackref;
2310 dfa->has_mb_node = 1;
2311 break;
2313 case OP_OPEN_DUP_NUM:
2314 if (syntax & RE_CONTEXT_INVALID_DUP)
2316 *err = REG_BADRPT;
2317 return NULL;
2319 FALLTHROUGH;
2320 case OP_DUP_ASTERISK:
2321 case OP_DUP_PLUS:
2322 case OP_DUP_QUESTION:
2323 if (syntax & RE_CONTEXT_INVALID_OPS)
2325 *err = REG_BADRPT;
2326 return NULL;
2328 else if (syntax & RE_CONTEXT_INDEP_OPS)
2330 fetch_token (token, regexp, syntax);
2331 return parse_expression (regexp, preg, token, syntax, nest, err);
2333 FALLTHROUGH;
2334 case OP_CLOSE_SUBEXP:
2335 if ((token->type == OP_CLOSE_SUBEXP)
2336 && !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2338 *err = REG_ERPAREN;
2339 return NULL;
2341 FALLTHROUGH;
2342 case OP_CLOSE_DUP_NUM:
2343 /* We treat it as a normal character. */
2345 /* Then we can these characters as normal characters. */
2346 token->type = CHARACTER;
2347 /* mb_partial and word_char bits should be initialized already
2348 by peek_token. */
2349 tree = create_token_tree (dfa, NULL, NULL, token);
2350 if (__glibc_unlikely (tree == NULL))
2352 *err = REG_ESPACE;
2353 return NULL;
2355 break;
2357 case ANCHOR:
2358 if ((token->opr.ctx_type
2359 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2360 && dfa->word_ops_used == 0)
2361 init_word_char (dfa);
2362 if (token->opr.ctx_type == WORD_DELIM
2363 || token->opr.ctx_type == NOT_WORD_DELIM)
2365 bin_tree_t *tree_first, *tree_last;
2366 if (token->opr.ctx_type == WORD_DELIM)
2368 token->opr.ctx_type = WORD_FIRST;
2369 tree_first = create_token_tree (dfa, NULL, NULL, token);
2370 token->opr.ctx_type = WORD_LAST;
2372 else
2374 token->opr.ctx_type = INSIDE_WORD;
2375 tree_first = create_token_tree (dfa, NULL, NULL, token);
2376 token->opr.ctx_type = INSIDE_NOTWORD;
2378 tree_last = create_token_tree (dfa, NULL, NULL, token);
2379 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2380 if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2381 || tree == NULL))
2383 *err = REG_ESPACE;
2384 return NULL;
2387 else
2389 tree = create_token_tree (dfa, NULL, NULL, token);
2390 if (__glibc_unlikely (tree == NULL))
2392 *err = REG_ESPACE;
2393 return NULL;
2396 /* We must return here, since ANCHORs can't be followed
2397 by repetition operators.
2398 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2399 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2400 fetch_token (token, regexp, syntax);
2401 return tree;
2403 case OP_PERIOD:
2404 tree = create_token_tree (dfa, NULL, NULL, token);
2405 if (__glibc_unlikely (tree == NULL))
2407 *err = REG_ESPACE;
2408 return NULL;
2410 if (dfa->mb_cur_max > 1)
2411 dfa->has_mb_node = 1;
2412 break;
2414 case OP_WORD:
2415 case OP_NOTWORD:
2416 tree = build_charclass_op (dfa, regexp->trans,
2417 "alnum",
2418 "_",
2419 token->type == OP_NOTWORD, err);
2420 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2421 return NULL;
2422 break;
2424 case OP_SPACE:
2425 case OP_NOTSPACE:
2426 tree = build_charclass_op (dfa, regexp->trans,
2427 "space",
2429 token->type == OP_NOTSPACE, err);
2430 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2431 return NULL;
2432 break;
2434 case OP_ALT:
2435 case END_OF_RE:
2436 return NULL;
2438 case BACK_SLASH:
2439 *err = REG_EESCAPE;
2440 return NULL;
2442 default:
2443 /* Must not happen? */
2444 DEBUG_ASSERT (false);
2445 return NULL;
2447 fetch_token (token, regexp, syntax);
2449 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2450 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2452 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2453 syntax, err);
2454 if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2456 if (tree != NULL)
2457 postorder (tree, free_tree, NULL);
2458 return NULL;
2460 tree = dup_tree;
2461 /* In BRE consecutive duplications are not allowed. */
2462 if ((syntax & RE_CONTEXT_INVALID_DUP)
2463 && (token->type == OP_DUP_ASTERISK
2464 || token->type == OP_OPEN_DUP_NUM))
2466 if (tree != NULL)
2467 postorder (tree, free_tree, NULL);
2468 *err = REG_BADRPT;
2469 return NULL;
2473 return tree;
2476 /* This function build the following tree, from regular expression
2477 (<reg_exp>):
2478 SUBEXP
2480 <reg_exp>
2483 static bin_tree_t *
2484 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2485 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2487 re_dfa_t *dfa = preg->buffer;
2488 bin_tree_t *tree;
2489 size_t cur_nsub;
2490 cur_nsub = preg->re_nsub++;
2492 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2494 /* The subexpression may be a null string. */
2495 if (token->type == OP_CLOSE_SUBEXP)
2496 tree = NULL;
2497 else
2499 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2500 if (__glibc_unlikely (*err == REG_NOERROR
2501 && token->type != OP_CLOSE_SUBEXP))
2503 if (tree != NULL)
2504 postorder (tree, free_tree, NULL);
2505 *err = REG_EPAREN;
2507 if (__glibc_unlikely (*err != REG_NOERROR))
2508 return NULL;
2511 if (cur_nsub <= '9' - '1')
2512 dfa->completed_bkref_map |= 1 << cur_nsub;
2514 tree = create_tree (dfa, tree, NULL, SUBEXP);
2515 if (__glibc_unlikely (tree == NULL))
2517 *err = REG_ESPACE;
2518 return NULL;
2520 tree->token.opr.idx = cur_nsub;
2521 return tree;
2524 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2526 static bin_tree_t *
2527 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2528 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2530 bin_tree_t *tree = NULL, *old_tree = NULL;
2531 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2532 re_token_t start_token = *token;
2534 if (token->type == OP_OPEN_DUP_NUM)
2536 end = 0;
2537 start = fetch_number (regexp, token, syntax);
2538 if (start == -1)
2540 if (token->type == CHARACTER && token->opr.c == ',')
2541 start = 0; /* We treat "{,m}" as "{0,m}". */
2542 else
2544 *err = REG_BADBR; /* <re>{} is invalid. */
2545 return NULL;
2548 if (__glibc_likely (start != -2))
2550 /* We treat "{n}" as "{n,n}". */
2551 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2552 : ((token->type == CHARACTER && token->opr.c == ',')
2553 ? fetch_number (regexp, token, syntax) : -2));
2555 if (__glibc_unlikely (start == -2 || end == -2))
2557 /* Invalid sequence. */
2558 if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2560 if (token->type == END_OF_RE)
2561 *err = REG_EBRACE;
2562 else
2563 *err = REG_BADBR;
2565 return NULL;
2568 /* If the syntax bit is set, rollback. */
2569 re_string_set_index (regexp, start_idx);
2570 *token = start_token;
2571 token->type = CHARACTER;
2572 /* mb_partial and word_char bits should be already initialized by
2573 peek_token. */
2574 return elem;
2577 if (__glibc_unlikely ((end != -1 && start > end)
2578 || token->type != OP_CLOSE_DUP_NUM))
2580 /* First number greater than second. */
2581 *err = REG_BADBR;
2582 return NULL;
2585 if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2587 *err = REG_ESIZE;
2588 return NULL;
2591 else
2593 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2594 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2597 fetch_token (token, regexp, syntax);
2599 if (__glibc_unlikely (elem == NULL))
2600 return NULL;
2601 if (__glibc_unlikely (start == 0 && end == 0))
2603 postorder (elem, free_tree, NULL);
2604 return NULL;
2607 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2608 if (__glibc_unlikely (start > 0))
2610 tree = elem;
2611 for (i = 2; i <= start; ++i)
2613 elem = duplicate_tree (elem, dfa);
2614 tree = create_tree (dfa, tree, elem, CONCAT);
2615 if (__glibc_unlikely (elem == NULL || tree == NULL))
2616 goto parse_dup_op_espace;
2619 if (start == end)
2620 return tree;
2622 /* Duplicate ELEM before it is marked optional. */
2623 elem = duplicate_tree (elem, dfa);
2624 if (__glibc_unlikely (elem == NULL))
2625 goto parse_dup_op_espace;
2626 old_tree = tree;
2628 else
2629 old_tree = NULL;
2631 if (elem->token.type == SUBEXP)
2633 uintptr_t subidx = elem->token.opr.idx;
2634 postorder (elem, mark_opt_subexp, (void *) subidx);
2637 tree = create_tree (dfa, elem, NULL,
2638 (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2639 if (__glibc_unlikely (tree == NULL))
2640 goto parse_dup_op_espace;
2642 /* This loop is actually executed only when end != -1,
2643 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2644 already created the start+1-th copy. */
2645 if (TYPE_SIGNED (Idx) || end != -1)
2646 for (i = start + 2; i <= end; ++i)
2648 elem = duplicate_tree (elem, dfa);
2649 tree = create_tree (dfa, tree, elem, CONCAT);
2650 if (__glibc_unlikely (elem == NULL || tree == NULL))
2651 goto parse_dup_op_espace;
2653 tree = create_tree (dfa, tree, NULL, OP_ALT);
2654 if (__glibc_unlikely (tree == NULL))
2655 goto parse_dup_op_espace;
2658 if (old_tree)
2659 tree = create_tree (dfa, old_tree, tree, CONCAT);
2661 return tree;
2663 parse_dup_op_espace:
2664 *err = REG_ESPACE;
2665 return NULL;
2668 /* Size of the names for collating symbol/equivalence_class/character_class.
2669 I'm not sure, but maybe enough. */
2670 #define BRACKET_NAME_BUF_SIZE 32
2672 #ifndef _LIBC
2674 # ifdef RE_ENABLE_I18N
2675 /* Convert the byte B to the corresponding wide character. In a
2676 unibyte locale, treat B as itself. In a multibyte locale, return
2677 WEOF if B is an encoding error. */
2678 static wint_t
2679 parse_byte (unsigned char b, re_charset_t *mbcset)
2681 return mbcset == NULL ? b : __btowc (b);
2683 # endif
2685 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2686 Build the range expression which starts from START_ELEM, and ends
2687 at END_ELEM. The result are written to MBCSET and SBCSET.
2688 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2689 mbcset->range_ends, is a pointer argument since we may
2690 update it. */
2692 static reg_errcode_t
2693 # ifdef RE_ENABLE_I18N
2694 build_range_exp (const reg_syntax_t syntax,
2695 bitset_t sbcset,
2696 re_charset_t *mbcset,
2697 Idx *range_alloc,
2698 const bracket_elem_t *start_elem,
2699 const bracket_elem_t *end_elem)
2700 # else /* not RE_ENABLE_I18N */
2701 build_range_exp (const reg_syntax_t syntax,
2702 bitset_t sbcset,
2703 const bracket_elem_t *start_elem,
2704 const bracket_elem_t *end_elem)
2705 # endif /* not RE_ENABLE_I18N */
2707 unsigned int start_ch, end_ch;
2708 /* Equivalence Classes and Character Classes can't be a range start/end. */
2709 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2710 || start_elem->type == CHAR_CLASS
2711 || end_elem->type == EQUIV_CLASS
2712 || end_elem->type == CHAR_CLASS))
2713 return REG_ERANGE;
2715 /* We can handle no multi character collating elements without libc
2716 support. */
2717 if (__glibc_unlikely ((start_elem->type == COLL_SYM
2718 && strlen ((char *) start_elem->opr.name) > 1)
2719 || (end_elem->type == COLL_SYM
2720 && strlen ((char *) end_elem->opr.name) > 1)))
2721 return REG_ECOLLATE;
2723 # ifdef RE_ENABLE_I18N
2725 wchar_t wc;
2726 wint_t start_wc;
2727 wint_t end_wc;
2729 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2730 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2731 : 0));
2732 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2733 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2734 : 0));
2735 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2736 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2737 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2738 ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2739 if (start_wc == WEOF || end_wc == WEOF)
2740 return REG_ECOLLATE;
2741 else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2742 && start_wc > end_wc))
2743 return REG_ERANGE;
2745 /* Got valid collation sequence values, add them as a new entry.
2746 However, for !_LIBC we have no collation elements: if the
2747 character set is single byte, the single byte character set
2748 that we build below suffices. parse_bracket_exp passes
2749 no MBCSET if dfa->mb_cur_max == 1. */
2750 if (mbcset)
2752 /* Check the space of the arrays. */
2753 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2755 /* There is not enough space, need realloc. */
2756 wchar_t *new_array_start, *new_array_end;
2757 Idx new_nranges;
2759 /* +1 in case of mbcset->nranges is 0. */
2760 new_nranges = 2 * mbcset->nranges + 1;
2761 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2762 are NULL if *range_alloc == 0. */
2763 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2764 new_nranges);
2765 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2766 new_nranges);
2768 if (__glibc_unlikely (new_array_start == NULL
2769 || new_array_end == NULL))
2771 re_free (new_array_start);
2772 re_free (new_array_end);
2773 return REG_ESPACE;
2776 mbcset->range_starts = new_array_start;
2777 mbcset->range_ends = new_array_end;
2778 *range_alloc = new_nranges;
2781 mbcset->range_starts[mbcset->nranges] = start_wc;
2782 mbcset->range_ends[mbcset->nranges++] = end_wc;
2785 /* Build the table for single byte characters. */
2786 for (wc = 0; wc < SBC_MAX; ++wc)
2788 if (start_wc <= wc && wc <= end_wc)
2789 bitset_set (sbcset, wc);
2792 # else /* not RE_ENABLE_I18N */
2794 unsigned int ch;
2795 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2796 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2797 : 0));
2798 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2799 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2800 : 0));
2801 if (start_ch > end_ch)
2802 return REG_ERANGE;
2803 /* Build the table for single byte characters. */
2804 for (ch = 0; ch < SBC_MAX; ++ch)
2805 if (start_ch <= ch && ch <= end_ch)
2806 bitset_set (sbcset, ch);
2808 # endif /* not RE_ENABLE_I18N */
2809 return REG_NOERROR;
2811 #endif /* not _LIBC */
2813 #ifndef _LIBC
2814 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2815 Build the collating element which is represented by NAME.
2816 The result are written to MBCSET and SBCSET.
2817 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2818 pointer argument since we may update it. */
2820 static reg_errcode_t
2821 # ifdef RE_ENABLE_I18N
2822 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2823 Idx *coll_sym_alloc, const unsigned char *name)
2824 # else /* not RE_ENABLE_I18N */
2825 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2826 # endif /* not RE_ENABLE_I18N */
2828 size_t name_len = strlen ((const char *) name);
2829 if (__glibc_unlikely (name_len != 1))
2830 return REG_ECOLLATE;
2831 else
2833 bitset_set (sbcset, name[0]);
2834 return REG_NOERROR;
2837 #endif /* not _LIBC */
2839 /* This function parse bracket expression like "[abc]", "[a-c]",
2840 "[[.a-a.]]" etc. */
2842 static bin_tree_t *
2843 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2844 reg_syntax_t syntax, reg_errcode_t *err)
2846 #ifdef _LIBC
2847 const unsigned char *collseqmb;
2848 const char *collseqwc;
2849 uint32_t nrules;
2850 int32_t table_size;
2851 const int32_t *symb_table;
2852 const unsigned char *extra;
2854 /* Local function for parse_bracket_exp used in _LIBC environment.
2855 Seek the collating symbol entry corresponding to NAME.
2856 Return the index of the symbol in the SYMB_TABLE,
2857 or -1 if not found. */
2859 auto inline int32_t
2860 __attribute__ ((always_inline))
2861 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2863 int32_t elem;
2865 for (elem = 0; elem < table_size; elem++)
2866 if (symb_table[2 * elem] != 0)
2868 int32_t idx = symb_table[2 * elem + 1];
2869 /* Skip the name of collating element name. */
2870 idx += 1 + extra[idx];
2871 if (/* Compare the length of the name. */
2872 name_len == extra[idx]
2873 /* Compare the name. */
2874 && memcmp (name, &extra[idx + 1], name_len) == 0)
2875 /* Yep, this is the entry. */
2876 return elem;
2878 return -1;
2881 /* Local function for parse_bracket_exp used in _LIBC environment.
2882 Look up the collation sequence value of BR_ELEM.
2883 Return the value if succeeded, UINT_MAX otherwise. */
2885 auto inline unsigned int
2886 __attribute__ ((always_inline))
2887 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2889 if (br_elem->type == SB_CHAR)
2892 if (MB_CUR_MAX == 1)
2894 if (nrules == 0)
2895 return collseqmb[br_elem->opr.ch];
2896 else
2898 wint_t wc = __btowc (br_elem->opr.ch);
2899 return __collseq_table_lookup (collseqwc, wc);
2902 else if (br_elem->type == MB_CHAR)
2904 if (nrules != 0)
2905 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2907 else if (br_elem->type == COLL_SYM)
2909 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2910 if (nrules != 0)
2912 int32_t elem, idx;
2913 elem = seek_collating_symbol_entry (br_elem->opr.name,
2914 sym_name_len);
2915 if (elem != -1)
2917 /* We found the entry. */
2918 idx = symb_table[2 * elem + 1];
2919 /* Skip the name of collating element name. */
2920 idx += 1 + extra[idx];
2921 /* Skip the byte sequence of the collating element. */
2922 idx += 1 + extra[idx];
2923 /* Adjust for the alignment. */
2924 idx = (idx + 3) & ~3;
2925 /* Skip the multibyte collation sequence value. */
2926 idx += sizeof (unsigned int);
2927 /* Skip the wide char sequence of the collating element. */
2928 idx += sizeof (unsigned int) *
2929 (1 + *(unsigned int *) (extra + idx));
2930 /* Return the collation sequence value. */
2931 return *(unsigned int *) (extra + idx);
2933 else if (sym_name_len == 1)
2935 /* No valid character. Match it as a single byte
2936 character. */
2937 return collseqmb[br_elem->opr.name[0]];
2940 else if (sym_name_len == 1)
2941 return collseqmb[br_elem->opr.name[0]];
2943 return UINT_MAX;
2946 /* Local function for parse_bracket_exp used in _LIBC environment.
2947 Build the range expression which starts from START_ELEM, and ends
2948 at END_ELEM. The result are written to MBCSET and SBCSET.
2949 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2950 mbcset->range_ends, is a pointer argument since we may
2951 update it. */
2953 auto inline reg_errcode_t
2954 __attribute__ ((always_inline))
2955 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2956 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2958 unsigned int ch;
2959 uint32_t start_collseq;
2960 uint32_t end_collseq;
2962 /* Equivalence Classes and Character Classes can't be a range
2963 start/end. */
2964 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2965 || start_elem->type == CHAR_CLASS
2966 || end_elem->type == EQUIV_CLASS
2967 || end_elem->type == CHAR_CLASS))
2968 return REG_ERANGE;
2970 /* FIXME: Implement rational ranges here, too. */
2971 start_collseq = lookup_collation_sequence_value (start_elem);
2972 end_collseq = lookup_collation_sequence_value (end_elem);
2973 /* Check start/end collation sequence values. */
2974 if (__glibc_unlikely (start_collseq == UINT_MAX
2975 || end_collseq == UINT_MAX))
2976 return REG_ECOLLATE;
2977 if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2978 && start_collseq > end_collseq))
2979 return REG_ERANGE;
2981 /* Got valid collation sequence values, add them as a new entry.
2982 However, if we have no collation elements, and the character set
2983 is single byte, the single byte character set that we
2984 build below suffices. */
2985 if (nrules > 0 || dfa->mb_cur_max > 1)
2987 /* Check the space of the arrays. */
2988 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2990 /* There is not enough space, need realloc. */
2991 uint32_t *new_array_start;
2992 uint32_t *new_array_end;
2993 Idx new_nranges;
2995 /* +1 in case of mbcset->nranges is 0. */
2996 new_nranges = 2 * mbcset->nranges + 1;
2997 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2998 new_nranges);
2999 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3000 new_nranges);
3002 if (__glibc_unlikely (new_array_start == NULL
3003 || new_array_end == NULL))
3004 return REG_ESPACE;
3006 mbcset->range_starts = new_array_start;
3007 mbcset->range_ends = new_array_end;
3008 *range_alloc = new_nranges;
3011 mbcset->range_starts[mbcset->nranges] = start_collseq;
3012 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3015 /* Build the table for single byte characters. */
3016 for (ch = 0; ch < SBC_MAX; ch++)
3018 uint32_t ch_collseq;
3020 if (MB_CUR_MAX == 1)
3022 if (nrules == 0)
3023 ch_collseq = collseqmb[ch];
3024 else
3025 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3026 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3027 bitset_set (sbcset, ch);
3029 return REG_NOERROR;
3032 /* Local function for parse_bracket_exp used in _LIBC environment.
3033 Build the collating element which is represented by NAME.
3034 The result are written to MBCSET and SBCSET.
3035 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3036 pointer argument since we may update it. */
3038 auto inline reg_errcode_t
3039 __attribute__ ((always_inline))
3040 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3041 Idx *coll_sym_alloc, const unsigned char *name)
3043 int32_t elem, idx;
3044 size_t name_len = strlen ((const char *) name);
3045 if (nrules != 0)
3047 elem = seek_collating_symbol_entry (name, name_len);
3048 if (elem != -1)
3050 /* We found the entry. */
3051 idx = symb_table[2 * elem + 1];
3052 /* Skip the name of collating element name. */
3053 idx += 1 + extra[idx];
3055 else if (name_len == 1)
3057 /* No valid character, treat it as a normal
3058 character. */
3059 bitset_set (sbcset, name[0]);
3060 return REG_NOERROR;
3062 else
3063 return REG_ECOLLATE;
3065 /* Got valid collation sequence, add it as a new entry. */
3066 /* Check the space of the arrays. */
3067 if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
3069 /* Not enough, realloc it. */
3070 /* +1 in case of mbcset->ncoll_syms is 0. */
3071 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3072 /* Use realloc since mbcset->coll_syms is NULL
3073 if *alloc == 0. */
3074 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3075 new_coll_sym_alloc);
3076 if (__glibc_unlikely (new_coll_syms == NULL))
3077 return REG_ESPACE;
3078 mbcset->coll_syms = new_coll_syms;
3079 *coll_sym_alloc = new_coll_sym_alloc;
3081 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3082 return REG_NOERROR;
3084 else
3086 if (__glibc_unlikely (name_len != 1))
3087 return REG_ECOLLATE;
3088 else
3090 bitset_set (sbcset, name[0]);
3091 return REG_NOERROR;
3095 #endif
3097 re_token_t br_token;
3098 re_bitset_ptr_t sbcset;
3099 #ifdef RE_ENABLE_I18N
3100 re_charset_t *mbcset;
3101 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3102 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3103 #endif /* not RE_ENABLE_I18N */
3104 bool non_match = false;
3105 bin_tree_t *work_tree;
3106 int token_len;
3107 bool first_round = true;
3108 #ifdef _LIBC
3109 collseqmb = (const unsigned char *)
3110 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3111 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3112 if (nrules)
3115 if (MB_CUR_MAX > 1)
3117 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3118 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3119 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3120 _NL_COLLATE_SYMB_TABLEMB);
3121 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3122 _NL_COLLATE_SYMB_EXTRAMB);
3124 #endif
3125 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3126 #ifdef RE_ENABLE_I18N
3127 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3128 #endif /* RE_ENABLE_I18N */
3129 #ifdef RE_ENABLE_I18N
3130 if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
3131 #else
3132 if (__glibc_unlikely (sbcset == NULL))
3133 #endif /* RE_ENABLE_I18N */
3135 re_free (sbcset);
3136 #ifdef RE_ENABLE_I18N
3137 re_free (mbcset);
3138 #endif
3139 *err = REG_ESPACE;
3140 return NULL;
3143 token_len = peek_token_bracket (token, regexp, syntax);
3144 if (__glibc_unlikely (token->type == END_OF_RE))
3146 *err = REG_BADPAT;
3147 goto parse_bracket_exp_free_return;
3149 if (token->type == OP_NON_MATCH_LIST)
3151 #ifdef RE_ENABLE_I18N
3152 mbcset->non_match = 1;
3153 #endif /* not RE_ENABLE_I18N */
3154 non_match = true;
3155 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3156 bitset_set (sbcset, '\n');
3157 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3158 token_len = peek_token_bracket (token, regexp, syntax);
3159 if (__glibc_unlikely (token->type == END_OF_RE))
3161 *err = REG_BADPAT;
3162 goto parse_bracket_exp_free_return;
3166 /* We treat the first ']' as a normal character. */
3167 if (token->type == OP_CLOSE_BRACKET)
3168 token->type = CHARACTER;
3170 while (1)
3172 bracket_elem_t start_elem, end_elem;
3173 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3174 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3175 reg_errcode_t ret;
3176 int token_len2 = 0;
3177 bool is_range_exp = false;
3178 re_token_t token2;
3180 start_elem.opr.name = start_name_buf;
3181 start_elem.type = COLL_SYM;
3182 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3183 syntax, first_round);
3184 if (__glibc_unlikely (ret != REG_NOERROR))
3186 *err = ret;
3187 goto parse_bracket_exp_free_return;
3189 first_round = false;
3191 /* Get information about the next token. We need it in any case. */
3192 token_len = peek_token_bracket (token, regexp, syntax);
3194 /* Do not check for ranges if we know they are not allowed. */
3195 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3197 if (__glibc_unlikely (token->type == END_OF_RE))
3199 *err = REG_EBRACK;
3200 goto parse_bracket_exp_free_return;
3202 if (token->type == OP_CHARSET_RANGE)
3204 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3205 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3206 if (__glibc_unlikely (token2.type == END_OF_RE))
3208 *err = REG_EBRACK;
3209 goto parse_bracket_exp_free_return;
3211 if (token2.type == OP_CLOSE_BRACKET)
3213 /* We treat the last '-' as a normal character. */
3214 re_string_skip_bytes (regexp, -token_len);
3215 token->type = CHARACTER;
3217 else
3218 is_range_exp = true;
3222 if (is_range_exp == true)
3224 end_elem.opr.name = end_name_buf;
3225 end_elem.type = COLL_SYM;
3226 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3227 dfa, syntax, true);
3228 if (__glibc_unlikely (ret != REG_NOERROR))
3230 *err = ret;
3231 goto parse_bracket_exp_free_return;
3234 token_len = peek_token_bracket (token, regexp, syntax);
3236 #ifdef _LIBC
3237 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3238 &start_elem, &end_elem);
3239 #else
3240 # ifdef RE_ENABLE_I18N
3241 *err = build_range_exp (syntax, sbcset,
3242 dfa->mb_cur_max > 1 ? mbcset : NULL,
3243 &range_alloc, &start_elem, &end_elem);
3244 # else
3245 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3246 # endif
3247 #endif /* RE_ENABLE_I18N */
3248 if (__glibc_unlikely (*err != REG_NOERROR))
3249 goto parse_bracket_exp_free_return;
3251 else
3253 switch (start_elem.type)
3255 case SB_CHAR:
3256 bitset_set (sbcset, start_elem.opr.ch);
3257 break;
3258 #ifdef RE_ENABLE_I18N
3259 case MB_CHAR:
3260 /* Check whether the array has enough space. */
3261 if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3263 wchar_t *new_mbchars;
3264 /* Not enough, realloc it. */
3265 /* +1 in case of mbcset->nmbchars is 0. */
3266 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3267 /* Use realloc since array is NULL if *alloc == 0. */
3268 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3269 mbchar_alloc);
3270 if (__glibc_unlikely (new_mbchars == NULL))
3271 goto parse_bracket_exp_espace;
3272 mbcset->mbchars = new_mbchars;
3274 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3275 break;
3276 #endif /* RE_ENABLE_I18N */
3277 case EQUIV_CLASS:
3278 *err = build_equiv_class (sbcset,
3279 #ifdef RE_ENABLE_I18N
3280 mbcset, &equiv_class_alloc,
3281 #endif /* RE_ENABLE_I18N */
3282 start_elem.opr.name);
3283 if (__glibc_unlikely (*err != REG_NOERROR))
3284 goto parse_bracket_exp_free_return;
3285 break;
3286 case COLL_SYM:
3287 *err = build_collating_symbol (sbcset,
3288 #ifdef RE_ENABLE_I18N
3289 mbcset, &coll_sym_alloc,
3290 #endif /* RE_ENABLE_I18N */
3291 start_elem.opr.name);
3292 if (__glibc_unlikely (*err != REG_NOERROR))
3293 goto parse_bracket_exp_free_return;
3294 break;
3295 case CHAR_CLASS:
3296 *err = build_charclass (regexp->trans, sbcset,
3297 #ifdef RE_ENABLE_I18N
3298 mbcset, &char_class_alloc,
3299 #endif /* RE_ENABLE_I18N */
3300 (const char *) start_elem.opr.name,
3301 syntax);
3302 if (__glibc_unlikely (*err != REG_NOERROR))
3303 goto parse_bracket_exp_free_return;
3304 break;
3305 default:
3306 DEBUG_ASSERT (false);
3307 break;
3310 if (__glibc_unlikely (token->type == END_OF_RE))
3312 *err = REG_EBRACK;
3313 goto parse_bracket_exp_free_return;
3315 if (token->type == OP_CLOSE_BRACKET)
3316 break;
3319 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3321 /* If it is non-matching list. */
3322 if (non_match)
3323 bitset_not (sbcset);
3325 #ifdef RE_ENABLE_I18N
3326 /* Ensure only single byte characters are set. */
3327 if (dfa->mb_cur_max > 1)
3328 bitset_mask (sbcset, dfa->sb_char);
3330 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3331 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3332 || mbcset->non_match)))
3334 bin_tree_t *mbc_tree;
3335 int sbc_idx;
3336 /* Build a tree for complex bracket. */
3337 dfa->has_mb_node = 1;
3338 br_token.type = COMPLEX_BRACKET;
3339 br_token.opr.mbcset = mbcset;
3340 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3341 if (__glibc_unlikely (mbc_tree == NULL))
3342 goto parse_bracket_exp_espace;
3343 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3344 if (sbcset[sbc_idx])
3345 break;
3346 /* If there are no bits set in sbcset, there is no point
3347 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3348 if (sbc_idx < BITSET_WORDS)
3350 /* Build a tree for simple bracket. */
3351 br_token.type = SIMPLE_BRACKET;
3352 br_token.opr.sbcset = sbcset;
3353 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3354 if (__glibc_unlikely (work_tree == NULL))
3355 goto parse_bracket_exp_espace;
3357 /* Then join them by ALT node. */
3358 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3359 if (__glibc_unlikely (work_tree == NULL))
3360 goto parse_bracket_exp_espace;
3362 else
3364 re_free (sbcset);
3365 work_tree = mbc_tree;
3368 else
3369 #endif /* not RE_ENABLE_I18N */
3371 #ifdef RE_ENABLE_I18N
3372 free_charset (mbcset);
3373 #endif
3374 /* Build a tree for simple bracket. */
3375 br_token.type = SIMPLE_BRACKET;
3376 br_token.opr.sbcset = sbcset;
3377 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3378 if (__glibc_unlikely (work_tree == NULL))
3379 goto parse_bracket_exp_espace;
3381 return work_tree;
3383 parse_bracket_exp_espace:
3384 *err = REG_ESPACE;
3385 parse_bracket_exp_free_return:
3386 re_free (sbcset);
3387 #ifdef RE_ENABLE_I18N
3388 free_charset (mbcset);
3389 #endif /* RE_ENABLE_I18N */
3390 return NULL;
3393 /* Parse an element in the bracket expression. */
3395 static reg_errcode_t
3396 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3397 re_token_t *token, int token_len, re_dfa_t *dfa,
3398 reg_syntax_t syntax, bool accept_hyphen)
3400 #ifdef RE_ENABLE_I18N
3401 int cur_char_size;
3402 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3403 if (cur_char_size > 1)
3405 elem->type = MB_CHAR;
3406 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3407 re_string_skip_bytes (regexp, cur_char_size);
3408 return REG_NOERROR;
3410 #endif /* RE_ENABLE_I18N */
3411 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3412 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3413 || token->type == OP_OPEN_EQUIV_CLASS)
3414 return parse_bracket_symbol (elem, regexp, token);
3415 if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3417 /* A '-' must only appear as anything but a range indicator before
3418 the closing bracket. Everything else is an error. */
3419 re_token_t token2;
3420 (void) peek_token_bracket (&token2, regexp, syntax);
3421 if (token2.type != OP_CLOSE_BRACKET)
3422 /* The actual error value is not standardized since this whole
3423 case is undefined. But ERANGE makes good sense. */
3424 return REG_ERANGE;
3426 elem->type = SB_CHAR;
3427 elem->opr.ch = token->opr.c;
3428 return REG_NOERROR;
3431 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3432 such as [:<character_class>:], [.<collating_element>.], and
3433 [=<equivalent_class>=]. */
3435 static reg_errcode_t
3436 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3437 re_token_t *token)
3439 unsigned char ch, delim = token->opr.c;
3440 int i = 0;
3441 if (re_string_eoi(regexp))
3442 return REG_EBRACK;
3443 for (;; ++i)
3445 if (i >= BRACKET_NAME_BUF_SIZE)
3446 return REG_EBRACK;
3447 if (token->type == OP_OPEN_CHAR_CLASS)
3448 ch = re_string_fetch_byte_case (regexp);
3449 else
3450 ch = re_string_fetch_byte (regexp);
3451 if (re_string_eoi(regexp))
3452 return REG_EBRACK;
3453 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3454 break;
3455 elem->opr.name[i] = ch;
3457 re_string_skip_bytes (regexp, 1);
3458 elem->opr.name[i] = '\0';
3459 switch (token->type)
3461 case OP_OPEN_COLL_ELEM:
3462 elem->type = COLL_SYM;
3463 break;
3464 case OP_OPEN_EQUIV_CLASS:
3465 elem->type = EQUIV_CLASS;
3466 break;
3467 case OP_OPEN_CHAR_CLASS:
3468 elem->type = CHAR_CLASS;
3469 break;
3470 default:
3471 break;
3473 return REG_NOERROR;
3476 /* Helper function for parse_bracket_exp.
3477 Build the equivalence class which is represented by NAME.
3478 The result are written to MBCSET and SBCSET.
3479 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3480 is a pointer argument since we may update it. */
3482 static reg_errcode_t
3483 #ifdef RE_ENABLE_I18N
3484 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3485 Idx *equiv_class_alloc, const unsigned char *name)
3486 #else /* not RE_ENABLE_I18N */
3487 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3488 #endif /* not RE_ENABLE_I18N */
3490 #ifdef _LIBC
3491 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3492 if (nrules != 0)
3494 const int32_t *table, *indirect;
3495 const unsigned char *weights, *extra, *cp;
3496 unsigned char char_buf[2];
3497 int32_t idx1, idx2;
3498 unsigned int ch;
3499 size_t len;
3500 /* Calculate the index for equivalence class. */
3501 cp = name;
3502 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3503 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3504 _NL_COLLATE_WEIGHTMB);
3505 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3506 _NL_COLLATE_EXTRAMB);
3507 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3508 _NL_COLLATE_INDIRECTMB);
3509 idx1 = findidx (table, indirect, extra, &cp, -1);
3510 if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
3511 /* This isn't a valid character. */
3512 return REG_ECOLLATE;
3514 /* Build single byte matching table for this equivalence class. */
3515 len = weights[idx1 & 0xffffff];
3516 for (ch = 0; ch < SBC_MAX; ++ch)
3518 char_buf[0] = ch;
3519 cp = char_buf;
3520 idx2 = findidx (table, indirect, extra, &cp, 1);
3522 idx2 = table[ch];
3524 if (idx2 == 0)
3525 /* This isn't a valid character. */
3526 continue;
3527 /* Compare only if the length matches and the collation rule
3528 index is the same. */
3529 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3530 && memcmp (weights + (idx1 & 0xffffff) + 1,
3531 weights + (idx2 & 0xffffff) + 1, len) == 0)
3532 bitset_set (sbcset, ch);
3534 /* Check whether the array has enough space. */
3535 if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3537 /* Not enough, realloc it. */
3538 /* +1 in case of mbcset->nequiv_classes is 0. */
3539 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3540 /* Use realloc since the array is NULL if *alloc == 0. */
3541 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3542 int32_t,
3543 new_equiv_class_alloc);
3544 if (__glibc_unlikely (new_equiv_classes == NULL))
3545 return REG_ESPACE;
3546 mbcset->equiv_classes = new_equiv_classes;
3547 *equiv_class_alloc = new_equiv_class_alloc;
3549 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3551 else
3552 #endif /* _LIBC */
3554 if (__glibc_unlikely (strlen ((const char *) name) != 1))
3555 return REG_ECOLLATE;
3556 bitset_set (sbcset, *name);
3558 return REG_NOERROR;
3561 /* Helper function for parse_bracket_exp.
3562 Build the character class which is represented by NAME.
3563 The result are written to MBCSET and SBCSET.
3564 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3565 is a pointer argument since we may update it. */
3567 static reg_errcode_t
3568 #ifdef RE_ENABLE_I18N
3569 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3570 re_charset_t *mbcset, Idx *char_class_alloc,
3571 const char *class_name, reg_syntax_t syntax)
3572 #else /* not RE_ENABLE_I18N */
3573 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3574 const char *class_name, reg_syntax_t syntax)
3575 #endif /* not RE_ENABLE_I18N */
3577 int i;
3578 const char *name = class_name;
3580 /* In case of REG_ICASE "upper" and "lower" match the both of
3581 upper and lower cases. */
3582 if ((syntax & RE_ICASE)
3583 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3584 name = "alpha";
3586 #ifdef RE_ENABLE_I18N
3587 /* Check the space of the arrays. */
3588 if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3590 /* Not enough, realloc it. */
3591 /* +1 in case of mbcset->nchar_classes is 0. */
3592 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3593 /* Use realloc since array is NULL if *alloc == 0. */
3594 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3595 new_char_class_alloc);
3596 if (__glibc_unlikely (new_char_classes == NULL))
3597 return REG_ESPACE;
3598 mbcset->char_classes = new_char_classes;
3599 *char_class_alloc = new_char_class_alloc;
3601 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3602 #endif /* RE_ENABLE_I18N */
3604 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3605 do { \
3606 if (__glibc_unlikely (trans != NULL)) \
3608 for (i = 0; i < SBC_MAX; ++i) \
3609 if (ctype_func (i)) \
3610 bitset_set (sbcset, trans[i]); \
3612 else \
3614 for (i = 0; i < SBC_MAX; ++i) \
3615 if (ctype_func (i)) \
3616 bitset_set (sbcset, i); \
3618 } while (0)
3620 if (strcmp (name, "alnum") == 0)
3621 BUILD_CHARCLASS_LOOP (isalnum);
3622 else if (strcmp (name, "cntrl") == 0)
3623 BUILD_CHARCLASS_LOOP (iscntrl);
3624 else if (strcmp (name, "lower") == 0)
3625 BUILD_CHARCLASS_LOOP (islower);
3626 else if (strcmp (name, "space") == 0)
3627 BUILD_CHARCLASS_LOOP (isspace);
3628 else if (strcmp (name, "alpha") == 0)
3629 BUILD_CHARCLASS_LOOP (isalpha);
3630 else if (strcmp (name, "digit") == 0)
3631 BUILD_CHARCLASS_LOOP (isdigit);
3632 else if (strcmp (name, "print") == 0)
3633 BUILD_CHARCLASS_LOOP (isprint);
3634 else if (strcmp (name, "upper") == 0)
3635 BUILD_CHARCLASS_LOOP (isupper);
3636 else if (strcmp (name, "blank") == 0)
3637 BUILD_CHARCLASS_LOOP (isblank);
3638 else if (strcmp (name, "graph") == 0)
3639 BUILD_CHARCLASS_LOOP (isgraph);
3640 else if (strcmp (name, "punct") == 0)
3641 BUILD_CHARCLASS_LOOP (ispunct);
3642 else if (strcmp (name, "xdigit") == 0)
3643 BUILD_CHARCLASS_LOOP (isxdigit);
3644 else
3645 return REG_ECTYPE;
3647 return REG_NOERROR;
3650 static bin_tree_t *
3651 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3652 const char *class_name,
3653 const char *extra, bool non_match,
3654 reg_errcode_t *err)
3656 re_bitset_ptr_t sbcset;
3657 #ifdef RE_ENABLE_I18N
3658 re_charset_t *mbcset;
3659 Idx alloc = 0;
3660 #endif /* not RE_ENABLE_I18N */
3661 reg_errcode_t ret;
3662 bin_tree_t *tree;
3664 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3665 if (__glibc_unlikely (sbcset == NULL))
3667 *err = REG_ESPACE;
3668 return NULL;
3670 #ifdef RE_ENABLE_I18N
3671 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3672 if (__glibc_unlikely (mbcset == NULL))
3674 re_free (sbcset);
3675 *err = REG_ESPACE;
3676 return NULL;
3678 mbcset->non_match = non_match;
3679 #endif /* RE_ENABLE_I18N */
3681 /* We don't care the syntax in this case. */
3682 ret = build_charclass (trans, sbcset,
3683 #ifdef RE_ENABLE_I18N
3684 mbcset, &alloc,
3685 #endif /* RE_ENABLE_I18N */
3686 class_name, 0);
3688 if (__glibc_unlikely (ret != REG_NOERROR))
3690 re_free (sbcset);
3691 #ifdef RE_ENABLE_I18N
3692 free_charset (mbcset);
3693 #endif /* RE_ENABLE_I18N */
3694 *err = ret;
3695 return NULL;
3697 /* \w match '_' also. */
3698 for (; *extra; extra++)
3699 bitset_set (sbcset, *extra);
3701 /* If it is non-matching list. */
3702 if (non_match)
3703 bitset_not (sbcset);
3705 #ifdef RE_ENABLE_I18N
3706 /* Ensure only single byte characters are set. */
3707 if (dfa->mb_cur_max > 1)
3708 bitset_mask (sbcset, dfa->sb_char);
3709 #endif
3711 /* Build a tree for simple bracket. */
3712 re_token_t br_token = { .type = SIMPLE_BRACKET, .opr.sbcset = sbcset };
3713 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3714 if (__glibc_unlikely (tree == NULL))
3715 goto build_word_op_espace;
3717 #ifdef RE_ENABLE_I18N
3718 if (dfa->mb_cur_max > 1)
3720 bin_tree_t *mbc_tree;
3721 /* Build a tree for complex bracket. */
3722 br_token.type = COMPLEX_BRACKET;
3723 br_token.opr.mbcset = mbcset;
3724 dfa->has_mb_node = 1;
3725 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3726 if (__glibc_unlikely (mbc_tree == NULL))
3727 goto build_word_op_espace;
3728 /* Then join them by ALT node. */
3729 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3730 if (__glibc_likely (mbc_tree != NULL))
3731 return tree;
3733 else
3735 free_charset (mbcset);
3736 return tree;
3738 #else /* not RE_ENABLE_I18N */
3739 return tree;
3740 #endif /* not RE_ENABLE_I18N */
3742 build_word_op_espace:
3743 re_free (sbcset);
3744 #ifdef RE_ENABLE_I18N
3745 free_charset (mbcset);
3746 #endif /* RE_ENABLE_I18N */
3747 *err = REG_ESPACE;
3748 return NULL;
3751 /* This is intended for the expressions like "a{1,3}".
3752 Fetch a number from 'input', and return the number.
3753 Return -1 if the number field is empty like "{,1}".
3754 Return RE_DUP_MAX + 1 if the number field is too large.
3755 Return -2 if an error occurred. */
3757 static Idx
3758 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3760 Idx num = -1;
3761 unsigned char c;
3762 while (1)
3764 fetch_token (token, input, syntax);
3765 c = token->opr.c;
3766 if (__glibc_unlikely (token->type == END_OF_RE))
3767 return -2;
3768 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3769 break;
3770 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3771 ? -2
3772 : num == -1
3773 ? c - '0'
3774 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3776 return num;
3779 #ifdef RE_ENABLE_I18N
3780 static void
3781 free_charset (re_charset_t *cset)
3783 re_free (cset->mbchars);
3784 # ifdef _LIBC
3785 re_free (cset->coll_syms);
3786 re_free (cset->equiv_classes);
3787 # endif
3788 re_free (cset->range_starts);
3789 re_free (cset->range_ends);
3790 re_free (cset->char_classes);
3791 re_free (cset);
3793 #endif /* RE_ENABLE_I18N */
3795 /* Functions for binary tree operation. */
3797 /* Create a tree node. */
3799 static bin_tree_t *
3800 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3801 re_token_type_t type)
3803 re_token_t t = { .type = type };
3804 return create_token_tree (dfa, left, right, &t);
3807 static bin_tree_t *
3808 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3809 const re_token_t *token)
3811 bin_tree_t *tree;
3812 if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3814 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3816 if (storage == NULL)
3817 return NULL;
3818 storage->next = dfa->str_tree_storage;
3819 dfa->str_tree_storage = storage;
3820 dfa->str_tree_storage_idx = 0;
3822 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3824 tree->parent = NULL;
3825 tree->left = left;
3826 tree->right = right;
3827 tree->token = *token;
3828 tree->token.duplicated = 0;
3829 tree->token.opt_subexp = 0;
3830 tree->first = NULL;
3831 tree->next = NULL;
3832 tree->node_idx = -1;
3834 if (left != NULL)
3835 left->parent = tree;
3836 if (right != NULL)
3837 right->parent = tree;
3838 return tree;
3841 /* Mark the tree SRC as an optional subexpression.
3842 To be called from preorder or postorder. */
3844 static reg_errcode_t
3845 mark_opt_subexp (void *extra, bin_tree_t *node)
3847 Idx idx = (uintptr_t) extra;
3848 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3849 node->token.opt_subexp = 1;
3851 return REG_NOERROR;
3854 /* Free the allocated memory inside NODE. */
3856 static void
3857 free_token (re_token_t *node)
3859 #ifdef RE_ENABLE_I18N
3860 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3861 free_charset (node->opr.mbcset);
3862 else
3863 #endif /* RE_ENABLE_I18N */
3864 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3865 re_free (node->opr.sbcset);
3868 /* Worker function for tree walking. Free the allocated memory inside NODE
3869 and its children. */
3871 static reg_errcode_t
3872 free_tree (void *extra, bin_tree_t *node)
3874 free_token (&node->token);
3875 return REG_NOERROR;
3879 /* Duplicate the node SRC, and return new node. This is a preorder
3880 visit similar to the one implemented by the generic visitor, but
3881 we need more infrastructure to maintain two parallel trees --- so,
3882 it's easier to duplicate. */
3884 static bin_tree_t *
3885 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3887 const bin_tree_t *node;
3888 bin_tree_t *dup_root;
3889 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3891 for (node = root; ; )
3893 /* Create a new tree and link it back to the current parent. */
3894 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3895 if (*p_new == NULL)
3896 return NULL;
3897 (*p_new)->parent = dup_node;
3898 (*p_new)->token.duplicated = 1;
3899 dup_node = *p_new;
3901 /* Go to the left node, or up and to the right. */
3902 if (node->left)
3904 node = node->left;
3905 p_new = &dup_node->left;
3907 else
3909 const bin_tree_t *prev = NULL;
3910 while (node->right == prev || node->right == NULL)
3912 prev = node;
3913 node = node->parent;
3914 dup_node = dup_node->parent;
3915 if (!node)
3916 return dup_root;
3918 node = node->right;
3919 p_new = &dup_node->right;