autoupdate
[gnulib.git] / lib / regcomp.c
blob3ede12bfc200aee865f78f03900350cbcdb5240c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
20 #ifdef _LIBC
21 # include <locale/weight.h>
22 #endif
24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 size_t length, reg_syntax_t syntax);
26 static void re_compile_fastmap_iter (regex_t *bufp,
27 const re_dfastate_t *init_state,
28 char *fastmap);
29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30 #ifdef RE_ENABLE_I18N
31 static void free_charset (re_charset_t *cset);
32 #endif /* RE_ENABLE_I18N */
33 static void free_workarea_compile (regex_t *preg);
34 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35 #ifdef RE_ENABLE_I18N
36 static void optimize_utf8 (re_dfa_t *dfa);
37 #endif
38 static reg_errcode_t analyze (regex_t *preg);
39 static reg_errcode_t preorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42 static reg_errcode_t postorder (bin_tree_t *root,
43 reg_errcode_t (fn (void *, bin_tree_t *)),
44 void *extra);
45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 bin_tree_t *node);
49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54 unsigned int constraint);
55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
57 Idx node, bool root);
58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59 static Idx fetch_number (re_string_t *input, re_token_t *token,
60 reg_syntax_t syntax);
61 static int peek_token (re_token_t *token, re_string_t *input,
62 reg_syntax_t syntax);
63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
64 reg_syntax_t syntax, reg_errcode_t *err);
65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 Idx nest, reg_errcode_t *err);
68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 Idx nest, reg_errcode_t *err);
71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 Idx nest, reg_errcode_t *err);
74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
75 re_token_t *token, reg_syntax_t syntax,
76 Idx nest, reg_errcode_t *err);
77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
78 re_dfa_t *dfa, re_token_t *token,
79 reg_syntax_t syntax, reg_errcode_t *err);
80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
81 re_token_t *token, reg_syntax_t syntax,
82 reg_errcode_t *err);
83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84 re_string_t *regexp,
85 re_token_t *token, int token_len,
86 re_dfa_t *dfa,
87 reg_syntax_t syntax,
88 bool accept_hyphen);
89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 re_string_t *regexp,
91 re_token_t *token);
92 #ifdef RE_ENABLE_I18N
93 static reg_errcode_t build_equiv_class (bitset_t sbcset,
94 re_charset_t *mbcset,
95 Idx *equiv_class_alloc,
96 const unsigned char *name);
97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98 bitset_t sbcset,
99 re_charset_t *mbcset,
100 Idx *char_class_alloc,
101 const char *class_name,
102 reg_syntax_t syntax);
103 #else /* not RE_ENABLE_I18N */
104 static reg_errcode_t build_equiv_class (bitset_t sbcset,
105 const unsigned char *name);
106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107 bitset_t sbcset,
108 const char *class_name,
109 reg_syntax_t syntax);
110 #endif /* not RE_ENABLE_I18N */
111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
112 RE_TRANSLATE_TYPE trans,
113 const char *class_name,
114 const char *extra,
115 bool non_match, reg_errcode_t *err);
116 static bin_tree_t *create_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 re_token_type_t type);
119 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
120 bin_tree_t *left, bin_tree_t *right,
121 const re_token_t *token);
122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
123 static void free_token (re_token_t *node);
124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
127 /* This table gives an error message for each of the error codes listed
128 in regex.h. Obviously the order here has to be same as there.
129 POSIX doesn't require that we do anything for REG_NOERROR,
130 but why not be nice? */
132 static const char __re_error_msgid[] =
134 #define REG_NOERROR_IDX 0
135 gettext_noop ("Success") /* REG_NOERROR */
136 "\0"
137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138 gettext_noop ("No match") /* REG_NOMATCH */
139 "\0"
140 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
142 "\0"
143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
145 "\0"
146 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
148 "\0"
149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
151 "\0"
152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
154 "\0"
155 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
156 gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
157 "\0"
158 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
159 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
160 "\0"
161 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
163 "\0"
164 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
166 "\0"
167 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168 gettext_noop ("Invalid range end") /* REG_ERANGE */
169 "\0"
170 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171 gettext_noop ("Memory exhausted") /* REG_ESPACE */
172 "\0"
173 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
175 "\0"
176 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177 gettext_noop ("Premature end of regular expression") /* REG_EEND */
178 "\0"
179 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180 gettext_noop ("Regular expression too big") /* REG_ESIZE */
181 "\0"
182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
186 static const size_t __re_error_msgid_idx[] =
188 REG_NOERROR_IDX,
189 REG_NOMATCH_IDX,
190 REG_BADPAT_IDX,
191 REG_ECOLLATE_IDX,
192 REG_ECTYPE_IDX,
193 REG_EESCAPE_IDX,
194 REG_ESUBREG_IDX,
195 REG_EBRACK_IDX,
196 REG_EPAREN_IDX,
197 REG_EBRACE_IDX,
198 REG_BADBR_IDX,
199 REG_ERANGE_IDX,
200 REG_ESPACE_IDX,
201 REG_BADRPT_IDX,
202 REG_EEND_IDX,
203 REG_ESIZE_IDX,
204 REG_ERPAREN_IDX
207 /* Entry points for GNU code. */
209 /* re_compile_pattern is the GNU regular expression compiler: it
210 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211 Returns 0 if the pattern was valid, otherwise an error string.
213 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214 are set in BUFP on entry. */
216 const char *
217 re_compile_pattern (const char *pattern, size_t length,
218 struct re_pattern_buffer *bufp)
220 reg_errcode_t ret;
222 /* And GNU code determines whether or not to get register information
223 by passing null for the REGS argument to re_match, etc., not by
224 setting no_sub, unless RE_NO_SUB is set. */
225 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
227 /* Match anchors at newline. */
228 bufp->newline_anchor = 1;
230 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
232 if (!ret)
233 return NULL;
234 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
236 #ifdef _LIBC
237 weak_alias (__re_compile_pattern, re_compile_pattern)
238 #endif
240 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
241 also be assigned to arbitrarily: each pattern buffer stores its own
242 syntax, so it can be changed between regex compilations. */
243 /* This has no initializer because initialized variables in Emacs
244 become read-only after dumping. */
245 reg_syntax_t re_syntax_options;
248 /* Specify the precise syntax of regexps for compilation. This provides
249 for compatibility for various utilities which historically have
250 different, incompatible syntaxes.
252 The argument SYNTAX is a bit mask comprised of the various bits
253 defined in regex.h. We return the old syntax. */
255 reg_syntax_t
256 re_set_syntax (reg_syntax_t syntax)
258 reg_syntax_t ret = re_syntax_options;
260 re_syntax_options = syntax;
261 return ret;
263 #ifdef _LIBC
264 weak_alias (__re_set_syntax, re_set_syntax)
265 #endif
268 re_compile_fastmap (struct re_pattern_buffer *bufp)
270 re_dfa_t *dfa = bufp->buffer;
271 char *fastmap = bufp->fastmap;
273 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
274 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
275 if (dfa->init_state != dfa->init_state_word)
276 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
277 if (dfa->init_state != dfa->init_state_nl)
278 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
279 if (dfa->init_state != dfa->init_state_begbuf)
280 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
281 bufp->fastmap_accurate = 1;
282 return 0;
284 #ifdef _LIBC
285 weak_alias (__re_compile_fastmap, re_compile_fastmap)
286 #endif
288 static inline void
289 __attribute__ ((always_inline))
290 re_set_fastmap (char *fastmap, bool icase, int ch)
292 fastmap[ch] = 1;
293 if (icase)
294 fastmap[tolower (ch)] = 1;
297 /* Helper function for re_compile_fastmap.
298 Compile fastmap for the initial_state INIT_STATE. */
300 static void
301 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
302 char *fastmap)
304 re_dfa_t *dfa = bufp->buffer;
305 Idx node_cnt;
306 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
307 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
309 Idx node = init_state->nodes.elems[node_cnt];
310 re_token_type_t type = dfa->nodes[node].type;
312 if (type == CHARACTER)
314 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
315 #ifdef RE_ENABLE_I18N
316 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
318 unsigned char buf[MB_LEN_MAX];
319 unsigned char *p;
320 wchar_t wc;
321 mbstate_t state;
323 p = buf;
324 *p++ = dfa->nodes[node].opr.c;
325 while (++node < dfa->nodes_len
326 && dfa->nodes[node].type == CHARACTER
327 && dfa->nodes[node].mb_partial)
328 *p++ = dfa->nodes[node].opr.c;
329 memset (&state, '\0', sizeof (state));
330 if (__mbrtowc (&wc, (const char *) buf, p - buf,
331 &state) == p - buf
332 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
333 != (size_t) -1))
334 re_set_fastmap (fastmap, false, buf[0]);
336 #endif
338 else if (type == SIMPLE_BRACKET)
340 int i, ch;
341 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
343 int j;
344 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
345 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
346 if (w & ((bitset_word_t) 1 << j))
347 re_set_fastmap (fastmap, icase, ch);
350 #ifdef RE_ENABLE_I18N
351 else if (type == COMPLEX_BRACKET)
353 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
354 Idx i;
356 # ifdef _LIBC
357 /* See if we have to try all bytes which start multiple collation
358 elements.
359 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
360 collation element, and don't catch 'b' since 'b' is
361 the only collation element which starts from 'b' (and
362 it is caught by SIMPLE_BRACKET). */
363 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
364 && (cset->ncoll_syms || cset->nranges))
366 const int32_t *table = (const int32_t *)
367 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
368 for (i = 0; i < SBC_MAX; ++i)
369 if (table[i] < 0)
370 re_set_fastmap (fastmap, icase, i);
372 # endif /* _LIBC */
374 /* See if we have to start the match at all multibyte characters,
375 i.e. where we would not find an invalid sequence. This only
376 applies to multibyte character sets; for single byte character
377 sets, the SIMPLE_BRACKET again suffices. */
378 if (dfa->mb_cur_max > 1
379 && (cset->nchar_classes || cset->non_match || cset->nranges
380 # ifdef _LIBC
381 || cset->nequiv_classes
382 # endif /* _LIBC */
385 unsigned char c = 0;
388 mbstate_t mbs;
389 memset (&mbs, 0, sizeof (mbs));
390 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
391 re_set_fastmap (fastmap, false, (int) c);
393 while (++c != 0);
396 else
398 /* ... Else catch all bytes which can start the mbchars. */
399 for (i = 0; i < cset->nmbchars; ++i)
401 char buf[256];
402 mbstate_t state;
403 memset (&state, '\0', sizeof (state));
404 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
405 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
406 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
408 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
409 != (size_t) -1)
410 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
415 #endif /* RE_ENABLE_I18N */
416 else if (type == OP_PERIOD
417 #ifdef RE_ENABLE_I18N
418 || type == OP_UTF8_PERIOD
419 #endif /* RE_ENABLE_I18N */
420 || type == END_OF_RE)
422 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
423 if (type == END_OF_RE)
424 bufp->can_be_null = 1;
425 return;
430 /* Entry point for POSIX code. */
431 /* regcomp takes a regular expression as a string and compiles it.
433 PREG is a regex_t *. We do not expect any fields to be initialized,
434 since POSIX says we shouldn't. Thus, we set
436 'buffer' to the compiled pattern;
437 'used' to the length of the compiled pattern;
438 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
439 REG_EXTENDED bit in CFLAGS is set; otherwise, to
440 RE_SYNTAX_POSIX_BASIC;
441 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
442 'fastmap' to an allocated space for the fastmap;
443 'fastmap_accurate' to zero;
444 're_nsub' to the number of subexpressions in PATTERN.
446 PATTERN is the address of the pattern string.
448 CFLAGS is a series of bits which affect compilation.
450 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
451 use POSIX basic syntax.
453 If REG_NEWLINE is set, then . and [^...] don't match newline.
454 Also, regexec will try a match beginning after every newline.
456 If REG_ICASE is set, then we considers upper- and lowercase
457 versions of letters to be equivalent when matching.
459 If REG_NOSUB is set, then when PREG is passed to regexec, that
460 routine will report only success or failure, and nothing about the
461 registers.
463 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
464 the return codes and their meanings.) */
467 regcomp (regex_t *_Restrict_ preg, const char *_Restrict_ pattern, int cflags)
469 reg_errcode_t ret;
470 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
471 : RE_SYNTAX_POSIX_BASIC);
473 preg->buffer = NULL;
474 preg->allocated = 0;
475 preg->used = 0;
477 /* Try to allocate space for the fastmap. */
478 preg->fastmap = re_malloc (char, SBC_MAX);
479 if (__glibc_unlikely (preg->fastmap == NULL))
480 return REG_ESPACE;
482 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
484 /* If REG_NEWLINE is set, newlines are treated differently. */
485 if (cflags & REG_NEWLINE)
486 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
487 syntax &= ~RE_DOT_NEWLINE;
488 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
489 /* It also changes the matching behavior. */
490 preg->newline_anchor = 1;
492 else
493 preg->newline_anchor = 0;
494 preg->no_sub = !!(cflags & REG_NOSUB);
495 preg->translate = NULL;
497 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
499 /* POSIX doesn't distinguish between an unmatched open-group and an
500 unmatched close-group: both are REG_EPAREN. */
501 if (ret == REG_ERPAREN)
502 ret = REG_EPAREN;
504 /* We have already checked preg->fastmap != NULL. */
505 if (__glibc_likely (ret == REG_NOERROR))
506 /* Compute the fastmap now, since regexec cannot modify the pattern
507 buffer. This function never fails in this implementation. */
508 (void) re_compile_fastmap (preg);
509 else
511 /* Some error occurred while compiling the expression. */
512 re_free (preg->fastmap);
513 preg->fastmap = NULL;
516 return (int) ret;
518 #ifdef _LIBC
519 libc_hidden_def (__regcomp)
520 weak_alias (__regcomp, regcomp)
521 #endif
523 /* Returns a message corresponding to an error code, ERRCODE, returned
524 from either regcomp or regexec. We don't use PREG here. */
526 size_t
527 regerror (int errcode, const regex_t *_Restrict_ preg, char *_Restrict_ errbuf,
528 size_t errbuf_size)
530 const char *msg;
531 size_t msg_size;
532 int nerrcodes = sizeof __re_error_msgid_idx / sizeof __re_error_msgid_idx[0];
534 if (__glibc_unlikely (errcode < 0 || errcode >= nerrcodes))
535 /* Only error codes returned by the rest of the code should be passed
536 to this routine. If we are given anything else, or if other regex
537 code generates an invalid error code, then the program has a bug.
538 Dump core so we can fix it. */
539 abort ();
541 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
543 msg_size = strlen (msg) + 1; /* Includes the null. */
545 if (__glibc_likely (errbuf_size != 0))
547 size_t cpy_size = msg_size;
548 if (__glibc_unlikely (msg_size > errbuf_size))
550 cpy_size = errbuf_size - 1;
551 errbuf[cpy_size] = '\0';
553 memcpy (errbuf, msg, cpy_size);
556 return msg_size;
558 #ifdef _LIBC
559 weak_alias (__regerror, regerror)
560 #endif
563 #ifdef RE_ENABLE_I18N
564 /* This static array is used for the map to single-byte characters when
565 UTF-8 is used. Otherwise we would allocate memory just to initialize
566 it the same all the time. UTF-8 is the preferred encoding so this is
567 a worthwhile optimization. */
568 static const bitset_t utf8_sb_map =
570 /* Set the first 128 bits. */
571 # if defined __GNUC__ && !defined __STRICT_ANSI__
572 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
573 # else
574 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
575 # error "bitset_word_t is narrower than 32 bits"
576 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
577 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
578 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
579 BITSET_WORD_MAX, BITSET_WORD_MAX,
580 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
581 BITSET_WORD_MAX,
582 # endif
583 (BITSET_WORD_MAX
584 >> (SBC_MAX % BITSET_WORD_BITS == 0
586 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
587 # endif
589 #endif
592 static void
593 free_dfa_content (re_dfa_t *dfa)
595 Idx i, j;
597 if (dfa->nodes)
598 for (i = 0; i < dfa->nodes_len; ++i)
599 free_token (dfa->nodes + i);
600 re_free (dfa->nexts);
601 for (i = 0; i < dfa->nodes_len; ++i)
603 if (dfa->eclosures != NULL)
604 re_node_set_free (dfa->eclosures + i);
605 if (dfa->inveclosures != NULL)
606 re_node_set_free (dfa->inveclosures + i);
607 if (dfa->edests != NULL)
608 re_node_set_free (dfa->edests + i);
610 re_free (dfa->edests);
611 re_free (dfa->eclosures);
612 re_free (dfa->inveclosures);
613 re_free (dfa->nodes);
615 if (dfa->state_table)
616 for (i = 0; i <= dfa->state_hash_mask; ++i)
618 struct re_state_table_entry *entry = dfa->state_table + i;
619 for (j = 0; j < entry->num; ++j)
621 re_dfastate_t *state = entry->array[j];
622 free_state (state);
624 re_free (entry->array);
626 re_free (dfa->state_table);
627 #ifdef RE_ENABLE_I18N
628 if (dfa->sb_char != utf8_sb_map)
629 re_free (dfa->sb_char);
630 #endif
631 re_free (dfa->subexp_map);
632 #ifdef DEBUG
633 re_free (dfa->re_str);
634 #endif
636 re_free (dfa);
640 /* Free dynamically allocated space used by PREG. */
642 void
643 regfree (regex_t *preg)
645 re_dfa_t *dfa = preg->buffer;
646 if (__glibc_likely (dfa != NULL))
648 lock_fini (dfa->lock);
649 free_dfa_content (dfa);
651 preg->buffer = NULL;
652 preg->allocated = 0;
654 re_free (preg->fastmap);
655 preg->fastmap = NULL;
657 re_free (preg->translate);
658 preg->translate = NULL;
660 #ifdef _LIBC
661 libc_hidden_def (__regfree)
662 weak_alias (__regfree, regfree)
663 #endif
665 /* Entry points compatible with 4.2 BSD regex library. We don't define
666 them unless specifically requested. */
668 #if defined _REGEX_RE_COMP || defined _LIBC
670 /* BSD has one and only one pattern buffer. */
671 static struct re_pattern_buffer re_comp_buf;
673 char *
674 # ifdef _LIBC
675 /* Make these definitions weak in libc, so POSIX programs can redefine
676 these names if they don't use our functions, and still use
677 regcomp/regexec above without link errors. */
678 weak_function
679 # endif
680 re_comp (const char *s)
682 reg_errcode_t ret;
683 char *fastmap;
685 if (!s)
687 if (!re_comp_buf.buffer)
688 return gettext ("No previous regular expression");
689 return 0;
692 if (re_comp_buf.buffer)
694 fastmap = re_comp_buf.fastmap;
695 re_comp_buf.fastmap = NULL;
696 __regfree (&re_comp_buf);
697 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
698 re_comp_buf.fastmap = fastmap;
701 if (re_comp_buf.fastmap == NULL)
703 re_comp_buf.fastmap = re_malloc (char, SBC_MAX);
704 if (re_comp_buf.fastmap == NULL)
705 return (char *) gettext (__re_error_msgid
706 + __re_error_msgid_idx[(int) REG_ESPACE]);
709 /* Since 're_exec' always passes NULL for the 'regs' argument, we
710 don't need to initialize the pattern buffer fields which affect it. */
712 /* Match anchors at newlines. */
713 re_comp_buf.newline_anchor = 1;
715 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
717 if (!ret)
718 return NULL;
720 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
721 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
724 #ifdef _LIBC
725 libc_freeres_fn (free_mem)
727 __regfree (&re_comp_buf);
729 #endif
731 #endif /* _REGEX_RE_COMP */
733 /* Internal entry point.
734 Compile the regular expression PATTERN, whose length is LENGTH.
735 SYNTAX indicate regular expression's syntax. */
737 static reg_errcode_t
738 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
739 reg_syntax_t syntax)
741 reg_errcode_t err = REG_NOERROR;
742 re_dfa_t *dfa;
743 re_string_t regexp;
745 /* Initialize the pattern buffer. */
746 preg->fastmap_accurate = 0;
747 preg->syntax = syntax;
748 preg->not_bol = preg->not_eol = 0;
749 preg->used = 0;
750 preg->re_nsub = 0;
751 preg->can_be_null = 0;
752 preg->regs_allocated = REGS_UNALLOCATED;
754 /* Initialize the dfa. */
755 dfa = preg->buffer;
756 if (__glibc_unlikely (preg->allocated < sizeof (re_dfa_t)))
758 /* If zero allocated, but buffer is non-null, try to realloc
759 enough space. This loses if buffer's address is bogus, but
760 that is the user's responsibility. If ->buffer is NULL this
761 is a simple allocation. */
762 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
763 if (dfa == NULL)
764 return REG_ESPACE;
765 preg->allocated = sizeof (re_dfa_t);
766 preg->buffer = dfa;
768 preg->used = sizeof (re_dfa_t);
770 err = init_dfa (dfa, length);
771 if (__glibc_unlikely (err == REG_NOERROR && lock_init (dfa->lock) != 0))
772 err = REG_ESPACE;
773 if (__glibc_unlikely (err != REG_NOERROR))
775 free_dfa_content (dfa);
776 preg->buffer = NULL;
777 preg->allocated = 0;
778 return err;
780 #ifdef DEBUG
781 /* Note: length+1 will not overflow since it is checked in init_dfa. */
782 dfa->re_str = re_malloc (char, length + 1);
783 strncpy (dfa->re_str, pattern, length + 1);
784 #endif
786 err = re_string_construct (&regexp, pattern, length, preg->translate,
787 (syntax & RE_ICASE) != 0, dfa);
788 if (__glibc_unlikely (err != REG_NOERROR))
790 re_compile_internal_free_return:
791 free_workarea_compile (preg);
792 re_string_destruct (&regexp);
793 lock_fini (dfa->lock);
794 free_dfa_content (dfa);
795 preg->buffer = NULL;
796 preg->allocated = 0;
797 return err;
800 /* Parse the regular expression, and build a structure tree. */
801 preg->re_nsub = 0;
802 dfa->str_tree = parse (&regexp, preg, syntax, &err);
803 if (__glibc_unlikely (dfa->str_tree == NULL))
804 goto re_compile_internal_free_return;
806 /* Analyze the tree and create the nfa. */
807 err = analyze (preg);
808 if (__glibc_unlikely (err != REG_NOERROR))
809 goto re_compile_internal_free_return;
811 #ifdef RE_ENABLE_I18N
812 /* If possible, do searching in single byte encoding to speed things up. */
813 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
814 optimize_utf8 (dfa);
815 #endif
817 /* Then create the initial state of the dfa. */
818 err = create_initial_state (dfa);
820 /* Release work areas. */
821 free_workarea_compile (preg);
822 re_string_destruct (&regexp);
824 if (__glibc_unlikely (err != REG_NOERROR))
826 lock_fini (dfa->lock);
827 free_dfa_content (dfa);
828 preg->buffer = NULL;
829 preg->allocated = 0;
832 return err;
835 /* Initialize DFA. We use the length of the regular expression PAT_LEN
836 as the initial length of some arrays. */
838 static reg_errcode_t
839 init_dfa (re_dfa_t *dfa, size_t pat_len)
841 __re_size_t table_size;
842 #ifndef _LIBC
843 const char *codeset_name;
844 #endif
845 #ifdef RE_ENABLE_I18N
846 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
847 #else
848 size_t max_i18n_object_size = 0;
849 #endif
850 size_t max_object_size =
851 MAX (sizeof (struct re_state_table_entry),
852 MAX (sizeof (re_token_t),
853 MAX (sizeof (re_node_set),
854 MAX (sizeof (regmatch_t),
855 max_i18n_object_size))));
857 memset (dfa, '\0', sizeof (re_dfa_t));
859 /* Force allocation of str_tree_storage the first time. */
860 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
862 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
863 calculation below, and for similar doubling calculations
864 elsewhere. And it's <= rather than <, because some of the
865 doubling calculations add 1 afterwards. */
866 if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2
867 <= pat_len))
868 return REG_ESPACE;
870 dfa->nodes_alloc = pat_len + 1;
871 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
873 /* table_size = 2 ^ ceil(log pat_len) */
874 for (table_size = 1; ; table_size <<= 1)
875 if (table_size > pat_len)
876 break;
878 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
879 dfa->state_hash_mask = table_size - 1;
881 dfa->mb_cur_max = MB_CUR_MAX;
882 #ifdef _LIBC
883 if (dfa->mb_cur_max == 6
884 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
885 dfa->is_utf8 = 1;
886 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
887 != 0);
888 #else
889 codeset_name = nl_langinfo (CODESET);
890 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
891 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
892 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
893 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
894 dfa->is_utf8 = 1;
896 /* We check exhaustively in the loop below if this charset is a
897 superset of ASCII. */
898 dfa->map_notascii = 0;
899 #endif
901 #ifdef RE_ENABLE_I18N
902 if (dfa->mb_cur_max > 1)
904 if (dfa->is_utf8)
905 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
906 else
908 int i, j, ch;
910 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
911 if (__glibc_unlikely (dfa->sb_char == NULL))
912 return REG_ESPACE;
914 /* Set the bits corresponding to single byte chars. */
915 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
916 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
918 wint_t wch = __btowc (ch);
919 if (wch != WEOF)
920 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
921 # ifndef _LIBC
922 if (isascii (ch) && wch != ch)
923 dfa->map_notascii = 1;
924 # endif
928 #endif
930 if (__glibc_unlikely (dfa->nodes == NULL || dfa->state_table == NULL))
931 return REG_ESPACE;
932 return REG_NOERROR;
935 /* Initialize WORD_CHAR table, which indicate which character is
936 "word". In this case "word" means that it is the word construction
937 character used by some operators like "\<", "\>", etc. */
939 static void
940 init_word_char (re_dfa_t *dfa)
942 int i = 0;
943 int j;
944 int ch = 0;
945 dfa->word_ops_used = 1;
946 if (__glibc_likely (dfa->map_notascii == 0))
948 /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
949 them, an issue when this code is used in Gnulib. */
950 bitset_word_t bits0 = 0x00000000;
951 bitset_word_t bits1 = 0x03ff0000;
952 bitset_word_t bits2 = 0x87fffffe;
953 bitset_word_t bits3 = 0x07fffffe;
954 if (BITSET_WORD_BITS == 64)
956 /* Pacify gcc -Woverflow on 32-bit platformns. */
957 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
958 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
959 i = 2;
961 else if (BITSET_WORD_BITS == 32)
963 dfa->word_char[0] = bits0;
964 dfa->word_char[1] = bits1;
965 dfa->word_char[2] = bits2;
966 dfa->word_char[3] = bits3;
967 i = 4;
969 else
970 goto general_case;
971 ch = 128;
973 if (__glibc_likely (dfa->is_utf8))
975 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
976 return;
980 general_case:
981 for (; i < BITSET_WORDS; ++i)
982 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
983 if (isalnum (ch) || ch == '_')
984 dfa->word_char[i] |= (bitset_word_t) 1 << j;
987 /* Free the work area which are only used while compiling. */
989 static void
990 free_workarea_compile (regex_t *preg)
992 re_dfa_t *dfa = preg->buffer;
993 bin_tree_storage_t *storage, *next;
994 for (storage = dfa->str_tree_storage; storage; storage = next)
996 next = storage->next;
997 re_free (storage);
999 dfa->str_tree_storage = NULL;
1000 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1001 dfa->str_tree = NULL;
1002 re_free (dfa->org_indices);
1003 dfa->org_indices = NULL;
1006 /* Create initial states for all contexts. */
1008 static reg_errcode_t
1009 create_initial_state (re_dfa_t *dfa)
1011 Idx first, i;
1012 reg_errcode_t err;
1013 re_node_set init_nodes;
1015 /* Initial states have the epsilon closure of the node which is
1016 the first node of the regular expression. */
1017 first = dfa->str_tree->first->node_idx;
1018 dfa->init_node = first;
1019 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1020 if (__glibc_unlikely (err != REG_NOERROR))
1021 return err;
1023 /* The back-references which are in initial states can epsilon transit,
1024 since in this case all of the subexpressions can be null.
1025 Then we add epsilon closures of the nodes which are the next nodes of
1026 the back-references. */
1027 if (dfa->nbackref > 0)
1028 for (i = 0; i < init_nodes.nelem; ++i)
1030 Idx node_idx = init_nodes.elems[i];
1031 re_token_type_t type = dfa->nodes[node_idx].type;
1033 Idx clexp_idx;
1034 if (type != OP_BACK_REF)
1035 continue;
1036 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1038 re_token_t *clexp_node;
1039 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1040 if (clexp_node->type == OP_CLOSE_SUBEXP
1041 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1042 break;
1044 if (clexp_idx == init_nodes.nelem)
1045 continue;
1047 if (type == OP_BACK_REF)
1049 Idx dest_idx = dfa->edests[node_idx].elems[0];
1050 if (!re_node_set_contains (&init_nodes, dest_idx))
1052 reg_errcode_t merge_err
1053 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1054 if (merge_err != REG_NOERROR)
1055 return merge_err;
1056 i = 0;
1061 /* It must be the first time to invoke acquire_state. */
1062 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1063 /* We don't check ERR here, since the initial state must not be NULL. */
1064 if (__glibc_unlikely (dfa->init_state == NULL))
1065 return err;
1066 if (dfa->init_state->has_constraint)
1068 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1069 CONTEXT_WORD);
1070 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1071 CONTEXT_NEWLINE);
1072 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1073 &init_nodes,
1074 CONTEXT_NEWLINE
1075 | CONTEXT_BEGBUF);
1076 if (__glibc_unlikely (dfa->init_state_word == NULL
1077 || dfa->init_state_nl == NULL
1078 || dfa->init_state_begbuf == NULL))
1079 return err;
1081 else
1082 dfa->init_state_word = dfa->init_state_nl
1083 = dfa->init_state_begbuf = dfa->init_state;
1085 re_node_set_free (&init_nodes);
1086 return REG_NOERROR;
1089 #ifdef RE_ENABLE_I18N
1090 /* If it is possible to do searching in single byte encoding instead of UTF-8
1091 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1092 DFA nodes where needed. */
1094 static void
1095 optimize_utf8 (re_dfa_t *dfa)
1097 Idx node;
1098 int i;
1099 bool mb_chars = false;
1100 bool has_period = false;
1102 for (node = 0; node < dfa->nodes_len; ++node)
1103 switch (dfa->nodes[node].type)
1105 case CHARACTER:
1106 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1107 mb_chars = true;
1108 break;
1109 case ANCHOR:
1110 switch (dfa->nodes[node].opr.ctx_type)
1112 case LINE_FIRST:
1113 case LINE_LAST:
1114 case BUF_FIRST:
1115 case BUF_LAST:
1116 break;
1117 default:
1118 /* Word anchors etc. cannot be handled. It's okay to test
1119 opr.ctx_type since constraints (for all DFA nodes) are
1120 created by ORing one or more opr.ctx_type values. */
1121 return;
1123 break;
1124 case OP_PERIOD:
1125 has_period = true;
1126 break;
1127 case OP_BACK_REF:
1128 case OP_ALT:
1129 case END_OF_RE:
1130 case OP_DUP_ASTERISK:
1131 case OP_OPEN_SUBEXP:
1132 case OP_CLOSE_SUBEXP:
1133 break;
1134 case COMPLEX_BRACKET:
1135 return;
1136 case SIMPLE_BRACKET:
1137 /* Just double check. */
1139 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1141 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1142 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1144 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1145 return;
1146 rshift = 0;
1149 break;
1150 default:
1151 abort ();
1154 if (mb_chars || has_period)
1155 for (node = 0; node < dfa->nodes_len; ++node)
1157 if (dfa->nodes[node].type == CHARACTER
1158 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1159 dfa->nodes[node].mb_partial = 0;
1160 else if (dfa->nodes[node].type == OP_PERIOD)
1161 dfa->nodes[node].type = OP_UTF8_PERIOD;
1164 /* The search can be in single byte locale. */
1165 dfa->mb_cur_max = 1;
1166 dfa->is_utf8 = 0;
1167 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1169 #endif
1171 /* Analyze the structure tree, and calculate "first", "next", "edest",
1172 "eclosure", and "inveclosure". */
1174 static reg_errcode_t
1175 analyze (regex_t *preg)
1177 re_dfa_t *dfa = preg->buffer;
1178 reg_errcode_t ret;
1180 /* Allocate arrays. */
1181 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1182 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1183 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1184 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1185 if (__glibc_unlikely (dfa->nexts == NULL || dfa->org_indices == NULL
1186 || dfa->edests == NULL || dfa->eclosures == NULL))
1187 return REG_ESPACE;
1189 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1190 if (dfa->subexp_map != NULL)
1192 Idx i;
1193 for (i = 0; i < preg->re_nsub; i++)
1194 dfa->subexp_map[i] = i;
1195 preorder (dfa->str_tree, optimize_subexps, dfa);
1196 for (i = 0; i < preg->re_nsub; i++)
1197 if (dfa->subexp_map[i] != i)
1198 break;
1199 if (i == preg->re_nsub)
1201 re_free (dfa->subexp_map);
1202 dfa->subexp_map = NULL;
1206 ret = postorder (dfa->str_tree, lower_subexps, preg);
1207 if (__glibc_unlikely (ret != REG_NOERROR))
1208 return ret;
1209 ret = postorder (dfa->str_tree, calc_first, dfa);
1210 if (__glibc_unlikely (ret != REG_NOERROR))
1211 return ret;
1212 preorder (dfa->str_tree, calc_next, dfa);
1213 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1214 if (__glibc_unlikely (ret != REG_NOERROR))
1215 return ret;
1216 ret = calc_eclosure (dfa);
1217 if (__glibc_unlikely (ret != REG_NOERROR))
1218 return ret;
1220 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1221 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1222 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1223 || dfa->nbackref)
1225 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1226 if (__glibc_unlikely (dfa->inveclosures == NULL))
1227 return REG_ESPACE;
1228 ret = calc_inveclosure (dfa);
1231 return ret;
1234 /* Our parse trees are very unbalanced, so we cannot use a stack to
1235 implement parse tree visits. Instead, we use parent pointers and
1236 some hairy code in these two functions. */
1237 static reg_errcode_t
1238 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1239 void *extra)
1241 bin_tree_t *node, *prev;
1243 for (node = root; ; )
1245 /* Descend down the tree, preferably to the left (or to the right
1246 if that's the only child). */
1247 while (node->left || node->right)
1248 if (node->left)
1249 node = node->left;
1250 else
1251 node = node->right;
1255 reg_errcode_t err = fn (extra, node);
1256 if (__glibc_unlikely (err != REG_NOERROR))
1257 return err;
1258 if (node->parent == NULL)
1259 return REG_NOERROR;
1260 prev = node;
1261 node = node->parent;
1263 /* Go up while we have a node that is reached from the right. */
1264 while (node->right == prev || node->right == NULL);
1265 node = node->right;
1269 static reg_errcode_t
1270 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1271 void *extra)
1273 bin_tree_t *node;
1275 for (node = root; ; )
1277 reg_errcode_t err = fn (extra, node);
1278 if (__glibc_unlikely (err != REG_NOERROR))
1279 return err;
1281 /* Go to the left node, or up and to the right. */
1282 if (node->left)
1283 node = node->left;
1284 else
1286 bin_tree_t *prev = NULL;
1287 while (node->right == prev || node->right == NULL)
1289 prev = node;
1290 node = node->parent;
1291 if (!node)
1292 return REG_NOERROR;
1294 node = node->right;
1299 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1300 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1301 backreferences as well. Requires a preorder visit. */
1302 static reg_errcode_t
1303 optimize_subexps (void *extra, bin_tree_t *node)
1305 re_dfa_t *dfa = (re_dfa_t *) extra;
1307 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1309 int idx = node->token.opr.idx;
1310 node->token.opr.idx = dfa->subexp_map[idx];
1311 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1314 else if (node->token.type == SUBEXP
1315 && node->left && node->left->token.type == SUBEXP)
1317 Idx other_idx = node->left->token.opr.idx;
1319 node->left = node->left->left;
1320 if (node->left)
1321 node->left->parent = node;
1323 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1324 if (other_idx < BITSET_WORD_BITS)
1325 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1328 return REG_NOERROR;
1331 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1332 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1333 static reg_errcode_t
1334 lower_subexps (void *extra, bin_tree_t *node)
1336 regex_t *preg = (regex_t *) extra;
1337 reg_errcode_t err = REG_NOERROR;
1339 if (node->left && node->left->token.type == SUBEXP)
1341 node->left = lower_subexp (&err, preg, node->left);
1342 if (node->left)
1343 node->left->parent = node;
1345 if (node->right && node->right->token.type == SUBEXP)
1347 node->right = lower_subexp (&err, preg, node->right);
1348 if (node->right)
1349 node->right->parent = node;
1352 return err;
1355 static bin_tree_t *
1356 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1358 re_dfa_t *dfa = preg->buffer;
1359 bin_tree_t *body = node->left;
1360 bin_tree_t *op, *cls, *tree1, *tree;
1362 if (preg->no_sub
1363 /* We do not optimize empty subexpressions, because otherwise we may
1364 have bad CONCAT nodes with NULL children. This is obviously not
1365 very common, so we do not lose much. An example that triggers
1366 this case is the sed "script" /\(\)/x. */
1367 && node->left != NULL
1368 && (node->token.opr.idx >= BITSET_WORD_BITS
1369 || !(dfa->used_bkref_map
1370 & ((bitset_word_t) 1 << node->token.opr.idx))))
1371 return node->left;
1373 /* Convert the SUBEXP node to the concatenation of an
1374 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1375 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1376 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1377 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1378 tree = create_tree (dfa, op, tree1, CONCAT);
1379 if (__glibc_unlikely (tree == NULL || tree1 == NULL
1380 || op == NULL || cls == NULL))
1382 *err = REG_ESPACE;
1383 return NULL;
1386 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1387 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1388 return tree;
1391 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1392 nodes. Requires a postorder visit. */
1393 static reg_errcode_t
1394 calc_first (void *extra, bin_tree_t *node)
1396 re_dfa_t *dfa = (re_dfa_t *) extra;
1397 if (node->token.type == CONCAT)
1399 node->first = node->left->first;
1400 node->node_idx = node->left->node_idx;
1402 else
1404 node->first = node;
1405 node->node_idx = re_dfa_add_node (dfa, node->token);
1406 if (__glibc_unlikely (node->node_idx == -1))
1407 return REG_ESPACE;
1408 if (node->token.type == ANCHOR)
1409 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1411 return REG_NOERROR;
1414 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1415 static reg_errcode_t
1416 calc_next (void *extra, bin_tree_t *node)
1418 switch (node->token.type)
1420 case OP_DUP_ASTERISK:
1421 node->left->next = node;
1422 break;
1423 case CONCAT:
1424 node->left->next = node->right->first;
1425 node->right->next = node->next;
1426 break;
1427 default:
1428 if (node->left)
1429 node->left->next = node->next;
1430 if (node->right)
1431 node->right->next = node->next;
1432 break;
1434 return REG_NOERROR;
1437 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1438 static reg_errcode_t
1439 link_nfa_nodes (void *extra, bin_tree_t *node)
1441 re_dfa_t *dfa = (re_dfa_t *) extra;
1442 Idx idx = node->node_idx;
1443 reg_errcode_t err = REG_NOERROR;
1445 switch (node->token.type)
1447 case CONCAT:
1448 break;
1450 case END_OF_RE:
1451 assert (node->next == NULL);
1452 break;
1454 case OP_DUP_ASTERISK:
1455 case OP_ALT:
1457 Idx left, right;
1458 dfa->has_plural_match = 1;
1459 if (node->left != NULL)
1460 left = node->left->first->node_idx;
1461 else
1462 left = node->next->node_idx;
1463 if (node->right != NULL)
1464 right = node->right->first->node_idx;
1465 else
1466 right = node->next->node_idx;
1467 assert (left > -1);
1468 assert (right > -1);
1469 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1471 break;
1473 case ANCHOR:
1474 case OP_OPEN_SUBEXP:
1475 case OP_CLOSE_SUBEXP:
1476 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1477 break;
1479 case OP_BACK_REF:
1480 dfa->nexts[idx] = node->next->node_idx;
1481 if (node->token.type == OP_BACK_REF)
1482 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1483 break;
1485 default:
1486 assert (!IS_EPSILON_NODE (node->token.type));
1487 dfa->nexts[idx] = node->next->node_idx;
1488 break;
1491 return err;
1494 /* Duplicate the epsilon closure of the node ROOT_NODE.
1495 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1496 to their own constraint. */
1498 static reg_errcode_t
1499 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1500 Idx root_node, unsigned int init_constraint)
1502 Idx org_node, clone_node;
1503 bool ok;
1504 unsigned int constraint = init_constraint;
1505 for (org_node = top_org_node, clone_node = top_clone_node;;)
1507 Idx org_dest, clone_dest;
1508 if (dfa->nodes[org_node].type == OP_BACK_REF)
1510 /* If the back reference epsilon-transit, its destination must
1511 also have the constraint. Then duplicate the epsilon closure
1512 of the destination of the back reference, and store it in
1513 edests of the back reference. */
1514 org_dest = dfa->nexts[org_node];
1515 re_node_set_empty (dfa->edests + clone_node);
1516 clone_dest = duplicate_node (dfa, org_dest, constraint);
1517 if (__glibc_unlikely (clone_dest == -1))
1518 return REG_ESPACE;
1519 dfa->nexts[clone_node] = dfa->nexts[org_node];
1520 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1521 if (__glibc_unlikely (! ok))
1522 return REG_ESPACE;
1524 else if (dfa->edests[org_node].nelem == 0)
1526 /* In case of the node can't epsilon-transit, don't duplicate the
1527 destination and store the original destination as the
1528 destination of the node. */
1529 dfa->nexts[clone_node] = dfa->nexts[org_node];
1530 break;
1532 else if (dfa->edests[org_node].nelem == 1)
1534 /* In case of the node can epsilon-transit, and it has only one
1535 destination. */
1536 org_dest = dfa->edests[org_node].elems[0];
1537 re_node_set_empty (dfa->edests + clone_node);
1538 /* If the node is root_node itself, it means the epsilon closure
1539 has a loop. Then tie it to the destination of the root_node. */
1540 if (org_node == root_node && clone_node != org_node)
1542 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1543 if (__glibc_unlikely (! ok))
1544 return REG_ESPACE;
1545 break;
1547 /* In case the node has another constraint, append it. */
1548 constraint |= dfa->nodes[org_node].constraint;
1549 clone_dest = duplicate_node (dfa, org_dest, constraint);
1550 if (__glibc_unlikely (clone_dest == -1))
1551 return REG_ESPACE;
1552 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1553 if (__glibc_unlikely (! ok))
1554 return REG_ESPACE;
1556 else /* dfa->edests[org_node].nelem == 2 */
1558 /* In case of the node can epsilon-transit, and it has two
1559 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1560 org_dest = dfa->edests[org_node].elems[0];
1561 re_node_set_empty (dfa->edests + clone_node);
1562 /* Search for a duplicated node which satisfies the constraint. */
1563 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1564 if (clone_dest == -1)
1566 /* There is no such duplicated node, create a new one. */
1567 reg_errcode_t err;
1568 clone_dest = duplicate_node (dfa, org_dest, constraint);
1569 if (__glibc_unlikely (clone_dest == -1))
1570 return REG_ESPACE;
1571 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1572 if (__glibc_unlikely (! ok))
1573 return REG_ESPACE;
1574 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1575 root_node, constraint);
1576 if (__glibc_unlikely (err != REG_NOERROR))
1577 return err;
1579 else
1581 /* There is a duplicated node which satisfies the constraint,
1582 use it to avoid infinite loop. */
1583 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1584 if (__glibc_unlikely (! ok))
1585 return REG_ESPACE;
1588 org_dest = dfa->edests[org_node].elems[1];
1589 clone_dest = duplicate_node (dfa, org_dest, constraint);
1590 if (__glibc_unlikely (clone_dest == -1))
1591 return REG_ESPACE;
1592 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1593 if (__glibc_unlikely (! ok))
1594 return REG_ESPACE;
1596 org_node = org_dest;
1597 clone_node = clone_dest;
1599 return REG_NOERROR;
1602 /* Search for a node which is duplicated from the node ORG_NODE, and
1603 satisfies the constraint CONSTRAINT. */
1605 static Idx
1606 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1607 unsigned int constraint)
1609 Idx idx;
1610 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1612 if (org_node == dfa->org_indices[idx]
1613 && constraint == dfa->nodes[idx].constraint)
1614 return idx; /* Found. */
1616 return -1; /* Not found. */
1619 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1620 Return the index of the new node, or -1 if insufficient storage is
1621 available. */
1623 static Idx
1624 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1626 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1627 if (__glibc_likely (dup_idx != -1))
1629 dfa->nodes[dup_idx].constraint = constraint;
1630 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1631 dfa->nodes[dup_idx].duplicated = 1;
1633 /* Store the index of the original node. */
1634 dfa->org_indices[dup_idx] = org_idx;
1636 return dup_idx;
1639 static reg_errcode_t
1640 calc_inveclosure (re_dfa_t *dfa)
1642 Idx src, idx;
1643 bool ok;
1644 for (idx = 0; idx < dfa->nodes_len; ++idx)
1645 re_node_set_init_empty (dfa->inveclosures + idx);
1647 for (src = 0; src < dfa->nodes_len; ++src)
1649 Idx *elems = dfa->eclosures[src].elems;
1650 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1652 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1653 if (__glibc_unlikely (! ok))
1654 return REG_ESPACE;
1658 return REG_NOERROR;
1661 /* Calculate "eclosure" for all the node in DFA. */
1663 static reg_errcode_t
1664 calc_eclosure (re_dfa_t *dfa)
1666 Idx node_idx;
1667 bool incomplete;
1668 #ifdef DEBUG
1669 assert (dfa->nodes_len > 0);
1670 #endif
1671 incomplete = false;
1672 /* For each nodes, calculate epsilon closure. */
1673 for (node_idx = 0; ; ++node_idx)
1675 reg_errcode_t err;
1676 re_node_set eclosure_elem;
1677 if (node_idx == dfa->nodes_len)
1679 if (!incomplete)
1680 break;
1681 incomplete = false;
1682 node_idx = 0;
1685 #ifdef DEBUG
1686 assert (dfa->eclosures[node_idx].nelem != -1);
1687 #endif
1689 /* If we have already calculated, skip it. */
1690 if (dfa->eclosures[node_idx].nelem != 0)
1691 continue;
1692 /* Calculate epsilon closure of 'node_idx'. */
1693 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1694 if (__glibc_unlikely (err != REG_NOERROR))
1695 return err;
1697 if (dfa->eclosures[node_idx].nelem == 0)
1699 incomplete = true;
1700 re_node_set_free (&eclosure_elem);
1703 return REG_NOERROR;
1706 /* Calculate epsilon closure of NODE. */
1708 static reg_errcode_t
1709 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1711 reg_errcode_t err;
1712 Idx i;
1713 re_node_set eclosure;
1714 bool ok;
1715 bool incomplete = false;
1716 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1717 if (__glibc_unlikely (err != REG_NOERROR))
1718 return err;
1720 /* This indicates that we are calculating this node now.
1721 We reference this value to avoid infinite loop. */
1722 dfa->eclosures[node].nelem = -1;
1724 /* If the current node has constraints, duplicate all nodes
1725 since they must inherit the constraints. */
1726 if (dfa->nodes[node].constraint
1727 && dfa->edests[node].nelem
1728 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1730 err = duplicate_node_closure (dfa, node, node, node,
1731 dfa->nodes[node].constraint);
1732 if (__glibc_unlikely (err != REG_NOERROR))
1733 return err;
1736 /* Expand each epsilon destination nodes. */
1737 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1738 for (i = 0; i < dfa->edests[node].nelem; ++i)
1740 re_node_set eclosure_elem;
1741 Idx edest = dfa->edests[node].elems[i];
1742 /* If calculating the epsilon closure of 'edest' is in progress,
1743 return intermediate result. */
1744 if (dfa->eclosures[edest].nelem == -1)
1746 incomplete = true;
1747 continue;
1749 /* If we haven't calculated the epsilon closure of 'edest' yet,
1750 calculate now. Otherwise use calculated epsilon closure. */
1751 if (dfa->eclosures[edest].nelem == 0)
1753 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1754 if (__glibc_unlikely (err != REG_NOERROR))
1755 return err;
1757 else
1758 eclosure_elem = dfa->eclosures[edest];
1759 /* Merge the epsilon closure of 'edest'. */
1760 err = re_node_set_merge (&eclosure, &eclosure_elem);
1761 if (__glibc_unlikely (err != REG_NOERROR))
1762 return err;
1763 /* If the epsilon closure of 'edest' is incomplete,
1764 the epsilon closure of this node is also incomplete. */
1765 if (dfa->eclosures[edest].nelem == 0)
1767 incomplete = true;
1768 re_node_set_free (&eclosure_elem);
1772 /* An epsilon closure includes itself. */
1773 ok = re_node_set_insert (&eclosure, node);
1774 if (__glibc_unlikely (! ok))
1775 return REG_ESPACE;
1776 if (incomplete && !root)
1777 dfa->eclosures[node].nelem = 0;
1778 else
1779 dfa->eclosures[node] = eclosure;
1780 *new_set = eclosure;
1781 return REG_NOERROR;
1784 /* Functions for token which are used in the parser. */
1786 /* Fetch a token from INPUT.
1787 We must not use this function inside bracket expressions. */
1789 static void
1790 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1792 re_string_skip_bytes (input, peek_token (result, input, syntax));
1795 /* Peek a token from INPUT, and return the length of the token.
1796 We must not use this function inside bracket expressions. */
1798 static int
1799 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1801 unsigned char c;
1803 if (re_string_eoi (input))
1805 token->type = END_OF_RE;
1806 return 0;
1809 c = re_string_peek_byte (input, 0);
1810 token->opr.c = c;
1812 token->word_char = 0;
1813 #ifdef RE_ENABLE_I18N
1814 token->mb_partial = 0;
1815 if (input->mb_cur_max > 1 &&
1816 !re_string_first_byte (input, re_string_cur_idx (input)))
1818 token->type = CHARACTER;
1819 token->mb_partial = 1;
1820 return 1;
1822 #endif
1823 if (c == '\\')
1825 unsigned char c2;
1826 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1828 token->type = BACK_SLASH;
1829 return 1;
1832 c2 = re_string_peek_byte_case (input, 1);
1833 token->opr.c = c2;
1834 token->type = CHARACTER;
1835 #ifdef RE_ENABLE_I18N
1836 if (input->mb_cur_max > 1)
1838 wint_t wc = re_string_wchar_at (input,
1839 re_string_cur_idx (input) + 1);
1840 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1842 else
1843 #endif
1844 token->word_char = IS_WORD_CHAR (c2) != 0;
1846 switch (c2)
1848 case '|':
1849 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1850 token->type = OP_ALT;
1851 break;
1852 case '1': case '2': case '3': case '4': case '5':
1853 case '6': case '7': case '8': case '9':
1854 if (!(syntax & RE_NO_BK_REFS))
1856 token->type = OP_BACK_REF;
1857 token->opr.idx = c2 - '1';
1859 break;
1860 case '<':
1861 if (!(syntax & RE_NO_GNU_OPS))
1863 token->type = ANCHOR;
1864 token->opr.ctx_type = WORD_FIRST;
1866 break;
1867 case '>':
1868 if (!(syntax & RE_NO_GNU_OPS))
1870 token->type = ANCHOR;
1871 token->opr.ctx_type = WORD_LAST;
1873 break;
1874 case 'b':
1875 if (!(syntax & RE_NO_GNU_OPS))
1877 token->type = ANCHOR;
1878 token->opr.ctx_type = WORD_DELIM;
1880 break;
1881 case 'B':
1882 if (!(syntax & RE_NO_GNU_OPS))
1884 token->type = ANCHOR;
1885 token->opr.ctx_type = NOT_WORD_DELIM;
1887 break;
1888 case 'w':
1889 if (!(syntax & RE_NO_GNU_OPS))
1890 token->type = OP_WORD;
1891 break;
1892 case 'W':
1893 if (!(syntax & RE_NO_GNU_OPS))
1894 token->type = OP_NOTWORD;
1895 break;
1896 case 's':
1897 if (!(syntax & RE_NO_GNU_OPS))
1898 token->type = OP_SPACE;
1899 break;
1900 case 'S':
1901 if (!(syntax & RE_NO_GNU_OPS))
1902 token->type = OP_NOTSPACE;
1903 break;
1904 case '`':
1905 if (!(syntax & RE_NO_GNU_OPS))
1907 token->type = ANCHOR;
1908 token->opr.ctx_type = BUF_FIRST;
1910 break;
1911 case '\'':
1912 if (!(syntax & RE_NO_GNU_OPS))
1914 token->type = ANCHOR;
1915 token->opr.ctx_type = BUF_LAST;
1917 break;
1918 case '(':
1919 if (!(syntax & RE_NO_BK_PARENS))
1920 token->type = OP_OPEN_SUBEXP;
1921 break;
1922 case ')':
1923 if (!(syntax & RE_NO_BK_PARENS))
1924 token->type = OP_CLOSE_SUBEXP;
1925 break;
1926 case '+':
1927 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1928 token->type = OP_DUP_PLUS;
1929 break;
1930 case '?':
1931 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1932 token->type = OP_DUP_QUESTION;
1933 break;
1934 case '{':
1935 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1936 token->type = OP_OPEN_DUP_NUM;
1937 break;
1938 case '}':
1939 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1940 token->type = OP_CLOSE_DUP_NUM;
1941 break;
1942 default:
1943 break;
1945 return 2;
1948 token->type = CHARACTER;
1949 #ifdef RE_ENABLE_I18N
1950 if (input->mb_cur_max > 1)
1952 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1953 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1955 else
1956 #endif
1957 token->word_char = IS_WORD_CHAR (token->opr.c);
1959 switch (c)
1961 case '\n':
1962 if (syntax & RE_NEWLINE_ALT)
1963 token->type = OP_ALT;
1964 break;
1965 case '|':
1966 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1967 token->type = OP_ALT;
1968 break;
1969 case '*':
1970 token->type = OP_DUP_ASTERISK;
1971 break;
1972 case '+':
1973 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1974 token->type = OP_DUP_PLUS;
1975 break;
1976 case '?':
1977 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1978 token->type = OP_DUP_QUESTION;
1979 break;
1980 case '{':
1981 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1982 token->type = OP_OPEN_DUP_NUM;
1983 break;
1984 case '}':
1985 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1986 token->type = OP_CLOSE_DUP_NUM;
1987 break;
1988 case '(':
1989 if (syntax & RE_NO_BK_PARENS)
1990 token->type = OP_OPEN_SUBEXP;
1991 break;
1992 case ')':
1993 if (syntax & RE_NO_BK_PARENS)
1994 token->type = OP_CLOSE_SUBEXP;
1995 break;
1996 case '[':
1997 token->type = OP_OPEN_BRACKET;
1998 break;
1999 case '.':
2000 token->type = OP_PERIOD;
2001 break;
2002 case '^':
2003 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2004 re_string_cur_idx (input) != 0)
2006 char prev = re_string_peek_byte (input, -1);
2007 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2008 break;
2010 token->type = ANCHOR;
2011 token->opr.ctx_type = LINE_FIRST;
2012 break;
2013 case '$':
2014 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2015 re_string_cur_idx (input) + 1 != re_string_length (input))
2017 re_token_t next;
2018 re_string_skip_bytes (input, 1);
2019 peek_token (&next, input, syntax);
2020 re_string_skip_bytes (input, -1);
2021 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2022 break;
2024 token->type = ANCHOR;
2025 token->opr.ctx_type = LINE_LAST;
2026 break;
2027 default:
2028 break;
2030 return 1;
2033 /* Peek a token from INPUT, and return the length of the token.
2034 We must not use this function out of bracket expressions. */
2036 static int
2037 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2039 unsigned char c;
2040 if (re_string_eoi (input))
2042 token->type = END_OF_RE;
2043 return 0;
2045 c = re_string_peek_byte (input, 0);
2046 token->opr.c = c;
2048 #ifdef RE_ENABLE_I18N
2049 if (input->mb_cur_max > 1 &&
2050 !re_string_first_byte (input, re_string_cur_idx (input)))
2052 token->type = CHARACTER;
2053 return 1;
2055 #endif /* RE_ENABLE_I18N */
2057 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2058 && re_string_cur_idx (input) + 1 < re_string_length (input))
2060 /* In this case, '\' escape a character. */
2061 unsigned char c2;
2062 re_string_skip_bytes (input, 1);
2063 c2 = re_string_peek_byte (input, 0);
2064 token->opr.c = c2;
2065 token->type = CHARACTER;
2066 return 1;
2068 if (c == '[') /* '[' is a special char in a bracket exps. */
2070 unsigned char c2;
2071 int token_len;
2072 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2073 c2 = re_string_peek_byte (input, 1);
2074 else
2075 c2 = 0;
2076 token->opr.c = c2;
2077 token_len = 2;
2078 switch (c2)
2080 case '.':
2081 token->type = OP_OPEN_COLL_ELEM;
2082 break;
2084 case '=':
2085 token->type = OP_OPEN_EQUIV_CLASS;
2086 break;
2088 case ':':
2089 if (syntax & RE_CHAR_CLASSES)
2091 token->type = OP_OPEN_CHAR_CLASS;
2092 break;
2094 FALLTHROUGH;
2095 default:
2096 token->type = CHARACTER;
2097 token->opr.c = c;
2098 token_len = 1;
2099 break;
2101 return token_len;
2103 switch (c)
2105 case '-':
2106 token->type = OP_CHARSET_RANGE;
2107 break;
2108 case ']':
2109 token->type = OP_CLOSE_BRACKET;
2110 break;
2111 case '^':
2112 token->type = OP_NON_MATCH_LIST;
2113 break;
2114 default:
2115 token->type = CHARACTER;
2117 return 1;
2120 /* Functions for parser. */
2122 /* Entry point of the parser.
2123 Parse the regular expression REGEXP and return the structure tree.
2124 If an error occurs, ERR is set by error code, and return NULL.
2125 This function build the following tree, from regular expression <reg_exp>:
2129 <reg_exp> EOR
2131 CAT means concatenation.
2132 EOR means end of regular expression. */
2134 static bin_tree_t *
2135 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2136 reg_errcode_t *err)
2138 re_dfa_t *dfa = preg->buffer;
2139 bin_tree_t *tree, *eor, *root;
2140 re_token_t current_token;
2141 dfa->syntax = syntax;
2142 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2143 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2144 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2145 return NULL;
2146 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2147 if (tree != NULL)
2148 root = create_tree (dfa, tree, eor, CONCAT);
2149 else
2150 root = eor;
2151 if (__glibc_unlikely (eor == NULL || root == NULL))
2153 *err = REG_ESPACE;
2154 return NULL;
2156 return root;
2159 /* This function build the following tree, from regular expression
2160 <branch1>|<branch2>:
2164 <branch1> <branch2>
2166 ALT means alternative, which represents the operator '|'. */
2168 static bin_tree_t *
2169 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2170 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2172 re_dfa_t *dfa = preg->buffer;
2173 bin_tree_t *tree, *branch = NULL;
2174 bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2175 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2176 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2177 return NULL;
2179 while (token->type == OP_ALT)
2181 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2182 if (token->type != OP_ALT && token->type != END_OF_RE
2183 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2185 bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2186 dfa->completed_bkref_map = initial_bkref_map;
2187 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2188 if (__glibc_unlikely (*err != REG_NOERROR && branch == NULL))
2190 if (tree != NULL)
2191 postorder (tree, free_tree, NULL);
2192 return NULL;
2194 dfa->completed_bkref_map |= accumulated_bkref_map;
2196 else
2197 branch = NULL;
2198 tree = create_tree (dfa, tree, branch, OP_ALT);
2199 if (__glibc_unlikely (tree == NULL))
2201 *err = REG_ESPACE;
2202 return NULL;
2205 return tree;
2208 /* This function build the following tree, from regular expression
2209 <exp1><exp2>:
2213 <exp1> <exp2>
2215 CAT means concatenation. */
2217 static bin_tree_t *
2218 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2219 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2221 bin_tree_t *tree, *expr;
2222 re_dfa_t *dfa = preg->buffer;
2223 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2224 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2225 return NULL;
2227 while (token->type != OP_ALT && token->type != END_OF_RE
2228 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2230 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2231 if (__glibc_unlikely (*err != REG_NOERROR && expr == NULL))
2233 if (tree != NULL)
2234 postorder (tree, free_tree, NULL);
2235 return NULL;
2237 if (tree != NULL && expr != NULL)
2239 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2240 if (newtree == NULL)
2242 postorder (expr, free_tree, NULL);
2243 postorder (tree, free_tree, NULL);
2244 *err = REG_ESPACE;
2245 return NULL;
2247 tree = newtree;
2249 else if (tree == NULL)
2250 tree = expr;
2251 /* Otherwise expr == NULL, we don't need to create new tree. */
2253 return tree;
2256 /* This function build the following tree, from regular expression a*:
2262 static bin_tree_t *
2263 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2264 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2266 re_dfa_t *dfa = preg->buffer;
2267 bin_tree_t *tree;
2268 switch (token->type)
2270 case CHARACTER:
2271 tree = create_token_tree (dfa, NULL, NULL, token);
2272 if (__glibc_unlikely (tree == NULL))
2274 *err = REG_ESPACE;
2275 return NULL;
2277 #ifdef RE_ENABLE_I18N
2278 if (dfa->mb_cur_max > 1)
2280 while (!re_string_eoi (regexp)
2281 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2283 bin_tree_t *mbc_remain;
2284 fetch_token (token, regexp, syntax);
2285 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2286 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2287 if (__glibc_unlikely (mbc_remain == NULL || tree == NULL))
2289 *err = REG_ESPACE;
2290 return NULL;
2294 #endif
2295 break;
2297 case OP_OPEN_SUBEXP:
2298 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2299 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2300 return NULL;
2301 break;
2303 case OP_OPEN_BRACKET:
2304 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2305 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2306 return NULL;
2307 break;
2309 case OP_BACK_REF:
2310 if (!__glibc_likely (dfa->completed_bkref_map & (1 << token->opr.idx)))
2312 *err = REG_ESUBREG;
2313 return NULL;
2315 dfa->used_bkref_map |= 1 << token->opr.idx;
2316 tree = create_token_tree (dfa, NULL, NULL, token);
2317 if (__glibc_unlikely (tree == NULL))
2319 *err = REG_ESPACE;
2320 return NULL;
2322 ++dfa->nbackref;
2323 dfa->has_mb_node = 1;
2324 break;
2326 case OP_OPEN_DUP_NUM:
2327 if (syntax & RE_CONTEXT_INVALID_DUP)
2329 *err = REG_BADRPT;
2330 return NULL;
2332 FALLTHROUGH;
2333 case OP_DUP_ASTERISK:
2334 case OP_DUP_PLUS:
2335 case OP_DUP_QUESTION:
2336 if (syntax & RE_CONTEXT_INVALID_OPS)
2338 *err = REG_BADRPT;
2339 return NULL;
2341 else if (syntax & RE_CONTEXT_INDEP_OPS)
2343 fetch_token (token, regexp, syntax);
2344 return parse_expression (regexp, preg, token, syntax, nest, err);
2346 FALLTHROUGH;
2347 case OP_CLOSE_SUBEXP:
2348 if ((token->type == OP_CLOSE_SUBEXP) &&
2349 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2351 *err = REG_ERPAREN;
2352 return NULL;
2354 FALLTHROUGH;
2355 case OP_CLOSE_DUP_NUM:
2356 /* We treat it as a normal character. */
2358 /* Then we can these characters as normal characters. */
2359 token->type = CHARACTER;
2360 /* mb_partial and word_char bits should be initialized already
2361 by peek_token. */
2362 tree = create_token_tree (dfa, NULL, NULL, token);
2363 if (__glibc_unlikely (tree == NULL))
2365 *err = REG_ESPACE;
2366 return NULL;
2368 break;
2370 case ANCHOR:
2371 if ((token->opr.ctx_type
2372 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2373 && dfa->word_ops_used == 0)
2374 init_word_char (dfa);
2375 if (token->opr.ctx_type == WORD_DELIM
2376 || token->opr.ctx_type == NOT_WORD_DELIM)
2378 bin_tree_t *tree_first, *tree_last;
2379 if (token->opr.ctx_type == WORD_DELIM)
2381 token->opr.ctx_type = WORD_FIRST;
2382 tree_first = create_token_tree (dfa, NULL, NULL, token);
2383 token->opr.ctx_type = WORD_LAST;
2385 else
2387 token->opr.ctx_type = INSIDE_WORD;
2388 tree_first = create_token_tree (dfa, NULL, NULL, token);
2389 token->opr.ctx_type = INSIDE_NOTWORD;
2391 tree_last = create_token_tree (dfa, NULL, NULL, token);
2392 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2393 if (__glibc_unlikely (tree_first == NULL || tree_last == NULL
2394 || tree == NULL))
2396 *err = REG_ESPACE;
2397 return NULL;
2400 else
2402 tree = create_token_tree (dfa, NULL, NULL, token);
2403 if (__glibc_unlikely (tree == NULL))
2405 *err = REG_ESPACE;
2406 return NULL;
2409 /* We must return here, since ANCHORs can't be followed
2410 by repetition operators.
2411 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2412 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2413 fetch_token (token, regexp, syntax);
2414 return tree;
2416 case OP_PERIOD:
2417 tree = create_token_tree (dfa, NULL, NULL, token);
2418 if (__glibc_unlikely (tree == NULL))
2420 *err = REG_ESPACE;
2421 return NULL;
2423 if (dfa->mb_cur_max > 1)
2424 dfa->has_mb_node = 1;
2425 break;
2427 case OP_WORD:
2428 case OP_NOTWORD:
2429 tree = build_charclass_op (dfa, regexp->trans,
2430 "alnum",
2431 "_",
2432 token->type == OP_NOTWORD, err);
2433 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2434 return NULL;
2435 break;
2437 case OP_SPACE:
2438 case OP_NOTSPACE:
2439 tree = build_charclass_op (dfa, regexp->trans,
2440 "space",
2442 token->type == OP_NOTSPACE, err);
2443 if (__glibc_unlikely (*err != REG_NOERROR && tree == NULL))
2444 return NULL;
2445 break;
2447 case OP_ALT:
2448 case END_OF_RE:
2449 return NULL;
2451 case BACK_SLASH:
2452 *err = REG_EESCAPE;
2453 return NULL;
2455 default:
2456 /* Must not happen? */
2457 #ifdef DEBUG
2458 assert (0);
2459 #endif
2460 return NULL;
2462 fetch_token (token, regexp, syntax);
2464 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2465 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2467 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2468 syntax, err);
2469 if (__glibc_unlikely (*err != REG_NOERROR && dup_tree == NULL))
2471 if (tree != NULL)
2472 postorder (tree, free_tree, NULL);
2473 return NULL;
2475 tree = dup_tree;
2476 /* In BRE consecutive duplications are not allowed. */
2477 if ((syntax & RE_CONTEXT_INVALID_DUP)
2478 && (token->type == OP_DUP_ASTERISK
2479 || token->type == OP_OPEN_DUP_NUM))
2481 if (tree != NULL)
2482 postorder (tree, free_tree, NULL);
2483 *err = REG_BADRPT;
2484 return NULL;
2488 return tree;
2491 /* This function build the following tree, from regular expression
2492 (<reg_exp>):
2493 SUBEXP
2495 <reg_exp>
2498 static bin_tree_t *
2499 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2500 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2502 re_dfa_t *dfa = preg->buffer;
2503 bin_tree_t *tree;
2504 size_t cur_nsub;
2505 cur_nsub = preg->re_nsub++;
2507 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2509 /* The subexpression may be a null string. */
2510 if (token->type == OP_CLOSE_SUBEXP)
2511 tree = NULL;
2512 else
2514 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2515 if (__glibc_unlikely (*err == REG_NOERROR
2516 && token->type != OP_CLOSE_SUBEXP))
2518 if (tree != NULL)
2519 postorder (tree, free_tree, NULL);
2520 *err = REG_EPAREN;
2522 if (__glibc_unlikely (*err != REG_NOERROR))
2523 return NULL;
2526 if (cur_nsub <= '9' - '1')
2527 dfa->completed_bkref_map |= 1 << cur_nsub;
2529 tree = create_tree (dfa, tree, NULL, SUBEXP);
2530 if (__glibc_unlikely (tree == NULL))
2532 *err = REG_ESPACE;
2533 return NULL;
2535 tree->token.opr.idx = cur_nsub;
2536 return tree;
2539 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2541 static bin_tree_t *
2542 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2543 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2545 bin_tree_t *tree = NULL, *old_tree = NULL;
2546 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2547 re_token_t start_token = *token;
2549 if (token->type == OP_OPEN_DUP_NUM)
2551 end = 0;
2552 start = fetch_number (regexp, token, syntax);
2553 if (start == -1)
2555 if (token->type == CHARACTER && token->opr.c == ',')
2556 start = 0; /* We treat "{,m}" as "{0,m}". */
2557 else
2559 *err = REG_BADBR; /* <re>{} is invalid. */
2560 return NULL;
2563 if (__glibc_likely (start != -2))
2565 /* We treat "{n}" as "{n,n}". */
2566 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2567 : ((token->type == CHARACTER && token->opr.c == ',')
2568 ? fetch_number (regexp, token, syntax) : -2));
2570 if (__glibc_unlikely (start == -2 || end == -2))
2572 /* Invalid sequence. */
2573 if (__glibc_unlikely (!(syntax & RE_INVALID_INTERVAL_ORD)))
2575 if (token->type == END_OF_RE)
2576 *err = REG_EBRACE;
2577 else
2578 *err = REG_BADBR;
2580 return NULL;
2583 /* If the syntax bit is set, rollback. */
2584 re_string_set_index (regexp, start_idx);
2585 *token = start_token;
2586 token->type = CHARACTER;
2587 /* mb_partial and word_char bits should be already initialized by
2588 peek_token. */
2589 return elem;
2592 if (__glibc_unlikely ((end != -1 && start > end)
2593 || token->type != OP_CLOSE_DUP_NUM))
2595 /* First number greater than second. */
2596 *err = REG_BADBR;
2597 return NULL;
2600 if (__glibc_unlikely (RE_DUP_MAX < (end == -1 ? start : end)))
2602 *err = REG_ESIZE;
2603 return NULL;
2606 else
2608 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2609 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2612 fetch_token (token, regexp, syntax);
2614 if (__glibc_unlikely (elem == NULL))
2615 return NULL;
2616 if (__glibc_unlikely (start == 0 && end == 0))
2618 postorder (elem, free_tree, NULL);
2619 return NULL;
2622 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2623 if (__glibc_unlikely (start > 0))
2625 tree = elem;
2626 for (i = 2; i <= start; ++i)
2628 elem = duplicate_tree (elem, dfa);
2629 tree = create_tree (dfa, tree, elem, CONCAT);
2630 if (__glibc_unlikely (elem == NULL || tree == NULL))
2631 goto parse_dup_op_espace;
2634 if (start == end)
2635 return tree;
2637 /* Duplicate ELEM before it is marked optional. */
2638 elem = duplicate_tree (elem, dfa);
2639 if (__glibc_unlikely (elem == NULL))
2640 goto parse_dup_op_espace;
2641 old_tree = tree;
2643 else
2644 old_tree = NULL;
2646 if (elem->token.type == SUBEXP)
2648 uintptr_t subidx = elem->token.opr.idx;
2649 postorder (elem, mark_opt_subexp, (void *) subidx);
2652 tree = create_tree (dfa, elem, NULL,
2653 (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2654 if (__glibc_unlikely (tree == NULL))
2655 goto parse_dup_op_espace;
2657 /* This loop is actually executed only when end != -1,
2658 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2659 already created the start+1-th copy. */
2660 if (TYPE_SIGNED (Idx) || end != -1)
2661 for (i = start + 2; i <= end; ++i)
2663 elem = duplicate_tree (elem, dfa);
2664 tree = create_tree (dfa, tree, elem, CONCAT);
2665 if (__glibc_unlikely (elem == NULL || tree == NULL))
2666 goto parse_dup_op_espace;
2668 tree = create_tree (dfa, tree, NULL, OP_ALT);
2669 if (__glibc_unlikely (tree == NULL))
2670 goto parse_dup_op_espace;
2673 if (old_tree)
2674 tree = create_tree (dfa, old_tree, tree, CONCAT);
2676 return tree;
2678 parse_dup_op_espace:
2679 *err = REG_ESPACE;
2680 return NULL;
2683 /* Size of the names for collating symbol/equivalence_class/character_class.
2684 I'm not sure, but maybe enough. */
2685 #define BRACKET_NAME_BUF_SIZE 32
2687 #ifndef _LIBC
2689 # ifdef RE_ENABLE_I18N
2690 /* Convert the byte B to the corresponding wide character. In a
2691 unibyte locale, treat B as itself. In a multibyte locale, return
2692 WEOF if B is an encoding error. */
2693 static wint_t
2694 parse_byte (unsigned char b, re_charset_t *mbcset)
2696 return mbcset == NULL ? b : __btowc (b);
2698 # endif
2700 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2701 Build the range expression which starts from START_ELEM, and ends
2702 at END_ELEM. The result are written to MBCSET and SBCSET.
2703 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2704 mbcset->range_ends, is a pointer argument since we may
2705 update it. */
2707 static reg_errcode_t
2708 # ifdef RE_ENABLE_I18N
2709 build_range_exp (const reg_syntax_t syntax,
2710 bitset_t sbcset,
2711 re_charset_t *mbcset,
2712 Idx *range_alloc,
2713 const bracket_elem_t *start_elem,
2714 const bracket_elem_t *end_elem)
2715 # else /* not RE_ENABLE_I18N */
2716 build_range_exp (const reg_syntax_t syntax,
2717 bitset_t sbcset,
2718 const bracket_elem_t *start_elem,
2719 const bracket_elem_t *end_elem)
2720 # endif /* not RE_ENABLE_I18N */
2722 unsigned int start_ch, end_ch;
2723 /* Equivalence Classes and Character Classes can't be a range start/end. */
2724 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2725 || start_elem->type == CHAR_CLASS
2726 || end_elem->type == EQUIV_CLASS
2727 || end_elem->type == CHAR_CLASS))
2728 return REG_ERANGE;
2730 /* We can handle no multi character collating elements without libc
2731 support. */
2732 if (__glibc_unlikely ((start_elem->type == COLL_SYM
2733 && strlen ((char *) start_elem->opr.name) > 1)
2734 || (end_elem->type == COLL_SYM
2735 && strlen ((char *) end_elem->opr.name) > 1)))
2736 return REG_ECOLLATE;
2738 # ifdef RE_ENABLE_I18N
2740 wchar_t wc;
2741 wint_t start_wc;
2742 wint_t end_wc;
2744 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2745 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2746 : 0));
2747 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2748 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2749 : 0));
2750 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2751 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2752 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2753 ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2754 if (start_wc == WEOF || end_wc == WEOF)
2755 return REG_ECOLLATE;
2756 else if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2757 && start_wc > end_wc))
2758 return REG_ERANGE;
2760 /* Got valid collation sequence values, add them as a new entry.
2761 However, for !_LIBC we have no collation elements: if the
2762 character set is single byte, the single byte character set
2763 that we build below suffices. parse_bracket_exp passes
2764 no MBCSET if dfa->mb_cur_max == 1. */
2765 if (mbcset)
2767 /* Check the space of the arrays. */
2768 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
2770 /* There is not enough space, need realloc. */
2771 wchar_t *new_array_start, *new_array_end;
2772 Idx new_nranges;
2774 /* +1 in case of mbcset->nranges is 0. */
2775 new_nranges = 2 * mbcset->nranges + 1;
2776 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2777 are NULL if *range_alloc == 0. */
2778 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2779 new_nranges);
2780 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2781 new_nranges);
2783 if (__glibc_unlikely (new_array_start == NULL
2784 || new_array_end == NULL))
2786 re_free (new_array_start);
2787 re_free (new_array_end);
2788 return REG_ESPACE;
2791 mbcset->range_starts = new_array_start;
2792 mbcset->range_ends = new_array_end;
2793 *range_alloc = new_nranges;
2796 mbcset->range_starts[mbcset->nranges] = start_wc;
2797 mbcset->range_ends[mbcset->nranges++] = end_wc;
2800 /* Build the table for single byte characters. */
2801 for (wc = 0; wc < SBC_MAX; ++wc)
2803 if (start_wc <= wc && wc <= end_wc)
2804 bitset_set (sbcset, wc);
2807 # else /* not RE_ENABLE_I18N */
2809 unsigned int ch;
2810 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2811 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2812 : 0));
2813 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2814 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2815 : 0));
2816 if (start_ch > end_ch)
2817 return REG_ERANGE;
2818 /* Build the table for single byte characters. */
2819 for (ch = 0; ch < SBC_MAX; ++ch)
2820 if (start_ch <= ch && ch <= end_ch)
2821 bitset_set (sbcset, ch);
2823 # endif /* not RE_ENABLE_I18N */
2824 return REG_NOERROR;
2826 #endif /* not _LIBC */
2828 #ifndef _LIBC
2829 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2830 Build the collating element which is represented by NAME.
2831 The result are written to MBCSET and SBCSET.
2832 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2833 pointer argument since we may update it. */
2835 static reg_errcode_t
2836 # ifdef RE_ENABLE_I18N
2837 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2838 Idx *coll_sym_alloc, const unsigned char *name)
2839 # else /* not RE_ENABLE_I18N */
2840 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2841 # endif /* not RE_ENABLE_I18N */
2843 size_t name_len = strlen ((const char *) name);
2844 if (__glibc_unlikely (name_len != 1))
2845 return REG_ECOLLATE;
2846 else
2848 bitset_set (sbcset, name[0]);
2849 return REG_NOERROR;
2852 #endif /* not _LIBC */
2854 /* This function parse bracket expression like "[abc]", "[a-c]",
2855 "[[.a-a.]]" etc. */
2857 static bin_tree_t *
2858 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2859 reg_syntax_t syntax, reg_errcode_t *err)
2861 #ifdef _LIBC
2862 const unsigned char *collseqmb;
2863 const char *collseqwc;
2864 uint32_t nrules;
2865 int32_t table_size;
2866 const int32_t *symb_table;
2867 const unsigned char *extra;
2869 /* Local function for parse_bracket_exp used in _LIBC environment.
2870 Seek the collating symbol entry corresponding to NAME.
2871 Return the index of the symbol in the SYMB_TABLE,
2872 or -1 if not found. */
2874 auto inline int32_t
2875 __attribute__ ((always_inline))
2876 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2878 int32_t elem;
2880 for (elem = 0; elem < table_size; elem++)
2881 if (symb_table[2 * elem] != 0)
2883 int32_t idx = symb_table[2 * elem + 1];
2884 /* Skip the name of collating element name. */
2885 idx += 1 + extra[idx];
2886 if (/* Compare the length of the name. */
2887 name_len == extra[idx]
2888 /* Compare the name. */
2889 && memcmp (name, &extra[idx + 1], name_len) == 0)
2890 /* Yep, this is the entry. */
2891 return elem;
2893 return -1;
2896 /* Local function for parse_bracket_exp used in _LIBC environment.
2897 Look up the collation sequence value of BR_ELEM.
2898 Return the value if succeeded, UINT_MAX otherwise. */
2900 auto inline unsigned int
2901 __attribute__ ((always_inline))
2902 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2904 if (br_elem->type == SB_CHAR)
2907 if (MB_CUR_MAX == 1)
2909 if (nrules == 0)
2910 return collseqmb[br_elem->opr.ch];
2911 else
2913 wint_t wc = __btowc (br_elem->opr.ch);
2914 return __collseq_table_lookup (collseqwc, wc);
2917 else if (br_elem->type == MB_CHAR)
2919 if (nrules != 0)
2920 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2922 else if (br_elem->type == COLL_SYM)
2924 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2925 if (nrules != 0)
2927 int32_t elem, idx;
2928 elem = seek_collating_symbol_entry (br_elem->opr.name,
2929 sym_name_len);
2930 if (elem != -1)
2932 /* We found the entry. */
2933 idx = symb_table[2 * elem + 1];
2934 /* Skip the name of collating element name. */
2935 idx += 1 + extra[idx];
2936 /* Skip the byte sequence of the collating element. */
2937 idx += 1 + extra[idx];
2938 /* Adjust for the alignment. */
2939 idx = (idx + 3) & ~3;
2940 /* Skip the multibyte collation sequence value. */
2941 idx += sizeof (unsigned int);
2942 /* Skip the wide char sequence of the collating element. */
2943 idx += sizeof (unsigned int) *
2944 (1 + *(unsigned int *) (extra + idx));
2945 /* Return the collation sequence value. */
2946 return *(unsigned int *) (extra + idx);
2948 else if (sym_name_len == 1)
2950 /* No valid character. Match it as a single byte
2951 character. */
2952 return collseqmb[br_elem->opr.name[0]];
2955 else if (sym_name_len == 1)
2956 return collseqmb[br_elem->opr.name[0]];
2958 return UINT_MAX;
2961 /* Local function for parse_bracket_exp used in _LIBC environment.
2962 Build the range expression which starts from START_ELEM, and ends
2963 at END_ELEM. The result are written to MBCSET and SBCSET.
2964 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2965 mbcset->range_ends, is a pointer argument since we may
2966 update it. */
2968 auto inline reg_errcode_t
2969 __attribute__ ((always_inline))
2970 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2971 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2973 unsigned int ch;
2974 uint32_t start_collseq;
2975 uint32_t end_collseq;
2977 /* Equivalence Classes and Character Classes can't be a range
2978 start/end. */
2979 if (__glibc_unlikely (start_elem->type == EQUIV_CLASS
2980 || start_elem->type == CHAR_CLASS
2981 || end_elem->type == EQUIV_CLASS
2982 || end_elem->type == CHAR_CLASS))
2983 return REG_ERANGE;
2985 /* FIXME: Implement rational ranges here, too. */
2986 start_collseq = lookup_collation_sequence_value (start_elem);
2987 end_collseq = lookup_collation_sequence_value (end_elem);
2988 /* Check start/end collation sequence values. */
2989 if (__glibc_unlikely (start_collseq == UINT_MAX
2990 || end_collseq == UINT_MAX))
2991 return REG_ECOLLATE;
2992 if (__glibc_unlikely ((syntax & RE_NO_EMPTY_RANGES)
2993 && start_collseq > end_collseq))
2994 return REG_ERANGE;
2996 /* Got valid collation sequence values, add them as a new entry.
2997 However, if we have no collation elements, and the character set
2998 is single byte, the single byte character set that we
2999 build below suffices. */
3000 if (nrules > 0 || dfa->mb_cur_max > 1)
3002 /* Check the space of the arrays. */
3003 if (__glibc_unlikely (*range_alloc == mbcset->nranges))
3005 /* There is not enough space, need realloc. */
3006 uint32_t *new_array_start;
3007 uint32_t *new_array_end;
3008 Idx new_nranges;
3010 /* +1 in case of mbcset->nranges is 0. */
3011 new_nranges = 2 * mbcset->nranges + 1;
3012 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
3013 new_nranges);
3014 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3015 new_nranges);
3017 if (__glibc_unlikely (new_array_start == NULL
3018 || new_array_end == NULL))
3019 return REG_ESPACE;
3021 mbcset->range_starts = new_array_start;
3022 mbcset->range_ends = new_array_end;
3023 *range_alloc = new_nranges;
3026 mbcset->range_starts[mbcset->nranges] = start_collseq;
3027 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3030 /* Build the table for single byte characters. */
3031 for (ch = 0; ch < SBC_MAX; ch++)
3033 uint32_t ch_collseq;
3035 if (MB_CUR_MAX == 1)
3037 if (nrules == 0)
3038 ch_collseq = collseqmb[ch];
3039 else
3040 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3041 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3042 bitset_set (sbcset, ch);
3044 return REG_NOERROR;
3047 /* Local function for parse_bracket_exp used in _LIBC environment.
3048 Build the collating element which is represented by NAME.
3049 The result are written to MBCSET and SBCSET.
3050 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3051 pointer argument since we may update it. */
3053 auto inline reg_errcode_t
3054 __attribute__ ((always_inline))
3055 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3056 Idx *coll_sym_alloc, const unsigned char *name)
3058 int32_t elem, idx;
3059 size_t name_len = strlen ((const char *) name);
3060 if (nrules != 0)
3062 elem = seek_collating_symbol_entry (name, name_len);
3063 if (elem != -1)
3065 /* We found the entry. */
3066 idx = symb_table[2 * elem + 1];
3067 /* Skip the name of collating element name. */
3068 idx += 1 + extra[idx];
3070 else if (name_len == 1)
3072 /* No valid character, treat it as a normal
3073 character. */
3074 bitset_set (sbcset, name[0]);
3075 return REG_NOERROR;
3077 else
3078 return REG_ECOLLATE;
3080 /* Got valid collation sequence, add it as a new entry. */
3081 /* Check the space of the arrays. */
3082 if (__glibc_unlikely (*coll_sym_alloc == mbcset->ncoll_syms))
3084 /* Not enough, realloc it. */
3085 /* +1 in case of mbcset->ncoll_syms is 0. */
3086 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3087 /* Use realloc since mbcset->coll_syms is NULL
3088 if *alloc == 0. */
3089 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3090 new_coll_sym_alloc);
3091 if (__glibc_unlikely (new_coll_syms == NULL))
3092 return REG_ESPACE;
3093 mbcset->coll_syms = new_coll_syms;
3094 *coll_sym_alloc = new_coll_sym_alloc;
3096 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3097 return REG_NOERROR;
3099 else
3101 if (__glibc_unlikely (name_len != 1))
3102 return REG_ECOLLATE;
3103 else
3105 bitset_set (sbcset, name[0]);
3106 return REG_NOERROR;
3110 #endif
3112 re_token_t br_token;
3113 re_bitset_ptr_t sbcset;
3114 #ifdef RE_ENABLE_I18N
3115 re_charset_t *mbcset;
3116 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3117 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3118 #endif /* not RE_ENABLE_I18N */
3119 bool non_match = false;
3120 bin_tree_t *work_tree;
3121 int token_len;
3122 bool first_round = true;
3123 #ifdef _LIBC
3124 collseqmb = (const unsigned char *)
3125 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3126 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3127 if (nrules)
3130 if (MB_CUR_MAX > 1)
3132 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3133 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3134 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3135 _NL_COLLATE_SYMB_TABLEMB);
3136 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3137 _NL_COLLATE_SYMB_EXTRAMB);
3139 #endif
3140 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3141 #ifdef RE_ENABLE_I18N
3142 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3143 #endif /* RE_ENABLE_I18N */
3144 #ifdef RE_ENABLE_I18N
3145 if (__glibc_unlikely (sbcset == NULL || mbcset == NULL))
3146 #else
3147 if (__glibc_unlikely (sbcset == NULL))
3148 #endif /* RE_ENABLE_I18N */
3150 re_free (sbcset);
3151 #ifdef RE_ENABLE_I18N
3152 re_free (mbcset);
3153 #endif
3154 *err = REG_ESPACE;
3155 return NULL;
3158 token_len = peek_token_bracket (token, regexp, syntax);
3159 if (__glibc_unlikely (token->type == END_OF_RE))
3161 *err = REG_BADPAT;
3162 goto parse_bracket_exp_free_return;
3164 if (token->type == OP_NON_MATCH_LIST)
3166 #ifdef RE_ENABLE_I18N
3167 mbcset->non_match = 1;
3168 #endif /* not RE_ENABLE_I18N */
3169 non_match = true;
3170 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3171 bitset_set (sbcset, '\n');
3172 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3173 token_len = peek_token_bracket (token, regexp, syntax);
3174 if (__glibc_unlikely (token->type == END_OF_RE))
3176 *err = REG_BADPAT;
3177 goto parse_bracket_exp_free_return;
3181 /* We treat the first ']' as a normal character. */
3182 if (token->type == OP_CLOSE_BRACKET)
3183 token->type = CHARACTER;
3185 while (1)
3187 bracket_elem_t start_elem, end_elem;
3188 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3189 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3190 reg_errcode_t ret;
3191 int token_len2 = 0;
3192 bool is_range_exp = false;
3193 re_token_t token2;
3195 start_elem.opr.name = start_name_buf;
3196 start_elem.type = COLL_SYM;
3197 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3198 syntax, first_round);
3199 if (__glibc_unlikely (ret != REG_NOERROR))
3201 *err = ret;
3202 goto parse_bracket_exp_free_return;
3204 first_round = false;
3206 /* Get information about the next token. We need it in any case. */
3207 token_len = peek_token_bracket (token, regexp, syntax);
3209 /* Do not check for ranges if we know they are not allowed. */
3210 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3212 if (__glibc_unlikely (token->type == END_OF_RE))
3214 *err = REG_EBRACK;
3215 goto parse_bracket_exp_free_return;
3217 if (token->type == OP_CHARSET_RANGE)
3219 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3220 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3221 if (__glibc_unlikely (token2.type == END_OF_RE))
3223 *err = REG_EBRACK;
3224 goto parse_bracket_exp_free_return;
3226 if (token2.type == OP_CLOSE_BRACKET)
3228 /* We treat the last '-' as a normal character. */
3229 re_string_skip_bytes (regexp, -token_len);
3230 token->type = CHARACTER;
3232 else
3233 is_range_exp = true;
3237 if (is_range_exp == true)
3239 end_elem.opr.name = end_name_buf;
3240 end_elem.type = COLL_SYM;
3241 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3242 dfa, syntax, true);
3243 if (__glibc_unlikely (ret != REG_NOERROR))
3245 *err = ret;
3246 goto parse_bracket_exp_free_return;
3249 token_len = peek_token_bracket (token, regexp, syntax);
3251 #ifdef _LIBC
3252 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3253 &start_elem, &end_elem);
3254 #else
3255 # ifdef RE_ENABLE_I18N
3256 *err = build_range_exp (syntax, sbcset,
3257 dfa->mb_cur_max > 1 ? mbcset : NULL,
3258 &range_alloc, &start_elem, &end_elem);
3259 # else
3260 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3261 # endif
3262 #endif /* RE_ENABLE_I18N */
3263 if (__glibc_unlikely (*err != REG_NOERROR))
3264 goto parse_bracket_exp_free_return;
3266 else
3268 switch (start_elem.type)
3270 case SB_CHAR:
3271 bitset_set (sbcset, start_elem.opr.ch);
3272 break;
3273 #ifdef RE_ENABLE_I18N
3274 case MB_CHAR:
3275 /* Check whether the array has enough space. */
3276 if (__glibc_unlikely (mbchar_alloc == mbcset->nmbchars))
3278 wchar_t *new_mbchars;
3279 /* Not enough, realloc it. */
3280 /* +1 in case of mbcset->nmbchars is 0. */
3281 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3282 /* Use realloc since array is NULL if *alloc == 0. */
3283 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3284 mbchar_alloc);
3285 if (__glibc_unlikely (new_mbchars == NULL))
3286 goto parse_bracket_exp_espace;
3287 mbcset->mbchars = new_mbchars;
3289 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3290 break;
3291 #endif /* RE_ENABLE_I18N */
3292 case EQUIV_CLASS:
3293 *err = build_equiv_class (sbcset,
3294 #ifdef RE_ENABLE_I18N
3295 mbcset, &equiv_class_alloc,
3296 #endif /* RE_ENABLE_I18N */
3297 start_elem.opr.name);
3298 if (__glibc_unlikely (*err != REG_NOERROR))
3299 goto parse_bracket_exp_free_return;
3300 break;
3301 case COLL_SYM:
3302 *err = build_collating_symbol (sbcset,
3303 #ifdef RE_ENABLE_I18N
3304 mbcset, &coll_sym_alloc,
3305 #endif /* RE_ENABLE_I18N */
3306 start_elem.opr.name);
3307 if (__glibc_unlikely (*err != REG_NOERROR))
3308 goto parse_bracket_exp_free_return;
3309 break;
3310 case CHAR_CLASS:
3311 *err = build_charclass (regexp->trans, sbcset,
3312 #ifdef RE_ENABLE_I18N
3313 mbcset, &char_class_alloc,
3314 #endif /* RE_ENABLE_I18N */
3315 (const char *) start_elem.opr.name,
3316 syntax);
3317 if (__glibc_unlikely (*err != REG_NOERROR))
3318 goto parse_bracket_exp_free_return;
3319 break;
3320 default:
3321 assert (0);
3322 break;
3325 if (__glibc_unlikely (token->type == END_OF_RE))
3327 *err = REG_EBRACK;
3328 goto parse_bracket_exp_free_return;
3330 if (token->type == OP_CLOSE_BRACKET)
3331 break;
3334 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3336 /* If it is non-matching list. */
3337 if (non_match)
3338 bitset_not (sbcset);
3340 #ifdef RE_ENABLE_I18N
3341 /* Ensure only single byte characters are set. */
3342 if (dfa->mb_cur_max > 1)
3343 bitset_mask (sbcset, dfa->sb_char);
3345 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3346 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3347 || mbcset->non_match)))
3349 bin_tree_t *mbc_tree;
3350 int sbc_idx;
3351 /* Build a tree for complex bracket. */
3352 dfa->has_mb_node = 1;
3353 br_token.type = COMPLEX_BRACKET;
3354 br_token.opr.mbcset = mbcset;
3355 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3356 if (__glibc_unlikely (mbc_tree == NULL))
3357 goto parse_bracket_exp_espace;
3358 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3359 if (sbcset[sbc_idx])
3360 break;
3361 /* If there are no bits set in sbcset, there is no point
3362 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3363 if (sbc_idx < BITSET_WORDS)
3365 /* Build a tree for simple bracket. */
3366 br_token.type = SIMPLE_BRACKET;
3367 br_token.opr.sbcset = sbcset;
3368 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3369 if (__glibc_unlikely (work_tree == NULL))
3370 goto parse_bracket_exp_espace;
3372 /* Then join them by ALT node. */
3373 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3374 if (__glibc_unlikely (work_tree == NULL))
3375 goto parse_bracket_exp_espace;
3377 else
3379 re_free (sbcset);
3380 work_tree = mbc_tree;
3383 else
3384 #endif /* not RE_ENABLE_I18N */
3386 #ifdef RE_ENABLE_I18N
3387 free_charset (mbcset);
3388 #endif
3389 /* Build a tree for simple bracket. */
3390 br_token.type = SIMPLE_BRACKET;
3391 br_token.opr.sbcset = sbcset;
3392 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3393 if (__glibc_unlikely (work_tree == NULL))
3394 goto parse_bracket_exp_espace;
3396 return work_tree;
3398 parse_bracket_exp_espace:
3399 *err = REG_ESPACE;
3400 parse_bracket_exp_free_return:
3401 re_free (sbcset);
3402 #ifdef RE_ENABLE_I18N
3403 free_charset (mbcset);
3404 #endif /* RE_ENABLE_I18N */
3405 return NULL;
3408 /* Parse an element in the bracket expression. */
3410 static reg_errcode_t
3411 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3412 re_token_t *token, int token_len, re_dfa_t *dfa,
3413 reg_syntax_t syntax, bool accept_hyphen)
3415 #ifdef RE_ENABLE_I18N
3416 int cur_char_size;
3417 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3418 if (cur_char_size > 1)
3420 elem->type = MB_CHAR;
3421 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3422 re_string_skip_bytes (regexp, cur_char_size);
3423 return REG_NOERROR;
3425 #endif /* RE_ENABLE_I18N */
3426 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3427 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3428 || token->type == OP_OPEN_EQUIV_CLASS)
3429 return parse_bracket_symbol (elem, regexp, token);
3430 if (__glibc_unlikely (token->type == OP_CHARSET_RANGE) && !accept_hyphen)
3432 /* A '-' must only appear as anything but a range indicator before
3433 the closing bracket. Everything else is an error. */
3434 re_token_t token2;
3435 (void) peek_token_bracket (&token2, regexp, syntax);
3436 if (token2.type != OP_CLOSE_BRACKET)
3437 /* The actual error value is not standardized since this whole
3438 case is undefined. But ERANGE makes good sense. */
3439 return REG_ERANGE;
3441 elem->type = SB_CHAR;
3442 elem->opr.ch = token->opr.c;
3443 return REG_NOERROR;
3446 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3447 such as [:<character_class>:], [.<collating_element>.], and
3448 [=<equivalent_class>=]. */
3450 static reg_errcode_t
3451 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3452 re_token_t *token)
3454 unsigned char ch, delim = token->opr.c;
3455 int i = 0;
3456 if (re_string_eoi(regexp))
3457 return REG_EBRACK;
3458 for (;; ++i)
3460 if (i >= BRACKET_NAME_BUF_SIZE)
3461 return REG_EBRACK;
3462 if (token->type == OP_OPEN_CHAR_CLASS)
3463 ch = re_string_fetch_byte_case (regexp);
3464 else
3465 ch = re_string_fetch_byte (regexp);
3466 if (re_string_eoi(regexp))
3467 return REG_EBRACK;
3468 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3469 break;
3470 elem->opr.name[i] = ch;
3472 re_string_skip_bytes (regexp, 1);
3473 elem->opr.name[i] = '\0';
3474 switch (token->type)
3476 case OP_OPEN_COLL_ELEM:
3477 elem->type = COLL_SYM;
3478 break;
3479 case OP_OPEN_EQUIV_CLASS:
3480 elem->type = EQUIV_CLASS;
3481 break;
3482 case OP_OPEN_CHAR_CLASS:
3483 elem->type = CHAR_CLASS;
3484 break;
3485 default:
3486 break;
3488 return REG_NOERROR;
3491 /* Helper function for parse_bracket_exp.
3492 Build the equivalence class which is represented by NAME.
3493 The result are written to MBCSET and SBCSET.
3494 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3495 is a pointer argument since we may update it. */
3497 static reg_errcode_t
3498 #ifdef RE_ENABLE_I18N
3499 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3500 Idx *equiv_class_alloc, const unsigned char *name)
3501 #else /* not RE_ENABLE_I18N */
3502 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3503 #endif /* not RE_ENABLE_I18N */
3505 #ifdef _LIBC
3506 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3507 if (nrules != 0)
3509 const int32_t *table, *indirect;
3510 const unsigned char *weights, *extra, *cp;
3511 unsigned char char_buf[2];
3512 int32_t idx1, idx2;
3513 unsigned int ch;
3514 size_t len;
3515 /* Calculate the index for equivalence class. */
3516 cp = name;
3517 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3518 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3519 _NL_COLLATE_WEIGHTMB);
3520 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3521 _NL_COLLATE_EXTRAMB);
3522 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3523 _NL_COLLATE_INDIRECTMB);
3524 idx1 = findidx (table, indirect, extra, &cp, -1);
3525 if (__glibc_unlikely (idx1 == 0 || *cp != '\0'))
3526 /* This isn't a valid character. */
3527 return REG_ECOLLATE;
3529 /* Build single byte matching table for this equivalence class. */
3530 len = weights[idx1 & 0xffffff];
3531 for (ch = 0; ch < SBC_MAX; ++ch)
3533 char_buf[0] = ch;
3534 cp = char_buf;
3535 idx2 = findidx (table, indirect, extra, &cp, 1);
3537 idx2 = table[ch];
3539 if (idx2 == 0)
3540 /* This isn't a valid character. */
3541 continue;
3542 /* Compare only if the length matches and the collation rule
3543 index is the same. */
3544 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24)
3545 && memcmp (weights + (idx1 & 0xffffff) + 1,
3546 weights + (idx2 & 0xffffff) + 1, len) == 0)
3547 bitset_set (sbcset, ch);
3549 /* Check whether the array has enough space. */
3550 if (__glibc_unlikely (*equiv_class_alloc == mbcset->nequiv_classes))
3552 /* Not enough, realloc it. */
3553 /* +1 in case of mbcset->nequiv_classes is 0. */
3554 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3555 /* Use realloc since the array is NULL if *alloc == 0. */
3556 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3557 int32_t,
3558 new_equiv_class_alloc);
3559 if (__glibc_unlikely (new_equiv_classes == NULL))
3560 return REG_ESPACE;
3561 mbcset->equiv_classes = new_equiv_classes;
3562 *equiv_class_alloc = new_equiv_class_alloc;
3564 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3566 else
3567 #endif /* _LIBC */
3569 if (__glibc_unlikely (strlen ((const char *) name) != 1))
3570 return REG_ECOLLATE;
3571 bitset_set (sbcset, *name);
3573 return REG_NOERROR;
3576 /* Helper function for parse_bracket_exp.
3577 Build the character class which is represented by NAME.
3578 The result are written to MBCSET and SBCSET.
3579 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3580 is a pointer argument since we may update it. */
3582 static reg_errcode_t
3583 #ifdef RE_ENABLE_I18N
3584 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3585 re_charset_t *mbcset, Idx *char_class_alloc,
3586 const char *class_name, reg_syntax_t syntax)
3587 #else /* not RE_ENABLE_I18N */
3588 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3589 const char *class_name, reg_syntax_t syntax)
3590 #endif /* not RE_ENABLE_I18N */
3592 int i;
3593 const char *name = class_name;
3595 /* In case of REG_ICASE "upper" and "lower" match the both of
3596 upper and lower cases. */
3597 if ((syntax & RE_ICASE)
3598 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3599 name = "alpha";
3601 #ifdef RE_ENABLE_I18N
3602 /* Check the space of the arrays. */
3603 if (__glibc_unlikely (*char_class_alloc == mbcset->nchar_classes))
3605 /* Not enough, realloc it. */
3606 /* +1 in case of mbcset->nchar_classes is 0. */
3607 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3608 /* Use realloc since array is NULL if *alloc == 0. */
3609 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3610 new_char_class_alloc);
3611 if (__glibc_unlikely (new_char_classes == NULL))
3612 return REG_ESPACE;
3613 mbcset->char_classes = new_char_classes;
3614 *char_class_alloc = new_char_class_alloc;
3616 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3617 #endif /* RE_ENABLE_I18N */
3619 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3620 do { \
3621 if (__glibc_unlikely (trans != NULL)) \
3623 for (i = 0; i < SBC_MAX; ++i) \
3624 if (ctype_func (i)) \
3625 bitset_set (sbcset, trans[i]); \
3627 else \
3629 for (i = 0; i < SBC_MAX; ++i) \
3630 if (ctype_func (i)) \
3631 bitset_set (sbcset, i); \
3633 } while (0)
3635 if (strcmp (name, "alnum") == 0)
3636 BUILD_CHARCLASS_LOOP (isalnum);
3637 else if (strcmp (name, "cntrl") == 0)
3638 BUILD_CHARCLASS_LOOP (iscntrl);
3639 else if (strcmp (name, "lower") == 0)
3640 BUILD_CHARCLASS_LOOP (islower);
3641 else if (strcmp (name, "space") == 0)
3642 BUILD_CHARCLASS_LOOP (isspace);
3643 else if (strcmp (name, "alpha") == 0)
3644 BUILD_CHARCLASS_LOOP (isalpha);
3645 else if (strcmp (name, "digit") == 0)
3646 BUILD_CHARCLASS_LOOP (isdigit);
3647 else if (strcmp (name, "print") == 0)
3648 BUILD_CHARCLASS_LOOP (isprint);
3649 else if (strcmp (name, "upper") == 0)
3650 BUILD_CHARCLASS_LOOP (isupper);
3651 else if (strcmp (name, "blank") == 0)
3652 BUILD_CHARCLASS_LOOP (isblank);
3653 else if (strcmp (name, "graph") == 0)
3654 BUILD_CHARCLASS_LOOP (isgraph);
3655 else if (strcmp (name, "punct") == 0)
3656 BUILD_CHARCLASS_LOOP (ispunct);
3657 else if (strcmp (name, "xdigit") == 0)
3658 BUILD_CHARCLASS_LOOP (isxdigit);
3659 else
3660 return REG_ECTYPE;
3662 return REG_NOERROR;
3665 static bin_tree_t *
3666 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3667 const char *class_name,
3668 const char *extra, bool non_match,
3669 reg_errcode_t *err)
3671 re_bitset_ptr_t sbcset;
3672 #ifdef RE_ENABLE_I18N
3673 re_charset_t *mbcset;
3674 Idx alloc = 0;
3675 #endif /* not RE_ENABLE_I18N */
3676 reg_errcode_t ret;
3677 re_token_t br_token;
3678 bin_tree_t *tree;
3680 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3681 if (__glibc_unlikely (sbcset == NULL))
3683 *err = REG_ESPACE;
3684 return NULL;
3686 #ifdef RE_ENABLE_I18N
3687 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3688 if (__glibc_unlikely (mbcset == NULL))
3690 re_free (sbcset);
3691 *err = REG_ESPACE;
3692 return NULL;
3694 mbcset->non_match = non_match;
3695 #endif /* RE_ENABLE_I18N */
3697 /* We don't care the syntax in this case. */
3698 ret = build_charclass (trans, sbcset,
3699 #ifdef RE_ENABLE_I18N
3700 mbcset, &alloc,
3701 #endif /* RE_ENABLE_I18N */
3702 class_name, 0);
3704 if (__glibc_unlikely (ret != REG_NOERROR))
3706 re_free (sbcset);
3707 #ifdef RE_ENABLE_I18N
3708 free_charset (mbcset);
3709 #endif /* RE_ENABLE_I18N */
3710 *err = ret;
3711 return NULL;
3713 /* \w match '_' also. */
3714 for (; *extra; extra++)
3715 bitset_set (sbcset, *extra);
3717 /* If it is non-matching list. */
3718 if (non_match)
3719 bitset_not (sbcset);
3721 #ifdef RE_ENABLE_I18N
3722 /* Ensure only single byte characters are set. */
3723 if (dfa->mb_cur_max > 1)
3724 bitset_mask (sbcset, dfa->sb_char);
3725 #endif
3727 /* Build a tree for simple bracket. */
3728 #if defined GCC_LINT || defined lint
3729 memset (&br_token, 0, sizeof br_token);
3730 #endif
3731 br_token.type = SIMPLE_BRACKET;
3732 br_token.opr.sbcset = sbcset;
3733 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3734 if (__glibc_unlikely (tree == NULL))
3735 goto build_word_op_espace;
3737 #ifdef RE_ENABLE_I18N
3738 if (dfa->mb_cur_max > 1)
3740 bin_tree_t *mbc_tree;
3741 /* Build a tree for complex bracket. */
3742 br_token.type = COMPLEX_BRACKET;
3743 br_token.opr.mbcset = mbcset;
3744 dfa->has_mb_node = 1;
3745 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3746 if (__glibc_unlikely (mbc_tree == NULL))
3747 goto build_word_op_espace;
3748 /* Then join them by ALT node. */
3749 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3750 if (__glibc_likely (mbc_tree != NULL))
3751 return tree;
3753 else
3755 free_charset (mbcset);
3756 return tree;
3758 #else /* not RE_ENABLE_I18N */
3759 return tree;
3760 #endif /* not RE_ENABLE_I18N */
3762 build_word_op_espace:
3763 re_free (sbcset);
3764 #ifdef RE_ENABLE_I18N
3765 free_charset (mbcset);
3766 #endif /* RE_ENABLE_I18N */
3767 *err = REG_ESPACE;
3768 return NULL;
3771 /* This is intended for the expressions like "a{1,3}".
3772 Fetch a number from 'input', and return the number.
3773 Return -1 if the number field is empty like "{,1}".
3774 Return RE_DUP_MAX + 1 if the number field is too large.
3775 Return -2 if an error occurred. */
3777 static Idx
3778 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3780 Idx num = -1;
3781 unsigned char c;
3782 while (1)
3784 fetch_token (token, input, syntax);
3785 c = token->opr.c;
3786 if (__glibc_unlikely (token->type == END_OF_RE))
3787 return -2;
3788 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3789 break;
3790 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3791 ? -2
3792 : num == -1
3793 ? c - '0'
3794 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3796 return num;
3799 #ifdef RE_ENABLE_I18N
3800 static void
3801 free_charset (re_charset_t *cset)
3803 re_free (cset->mbchars);
3804 # ifdef _LIBC
3805 re_free (cset->coll_syms);
3806 re_free (cset->equiv_classes);
3807 # endif
3808 re_free (cset->range_starts);
3809 re_free (cset->range_ends);
3810 re_free (cset->char_classes);
3811 re_free (cset);
3813 #endif /* RE_ENABLE_I18N */
3815 /* Functions for binary tree operation. */
3817 /* Create a tree node. */
3819 static bin_tree_t *
3820 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3821 re_token_type_t type)
3823 re_token_t t;
3824 #if defined GCC_LINT || defined lint
3825 memset (&t, 0, sizeof t);
3826 #endif
3827 t.type = type;
3828 return create_token_tree (dfa, left, right, &t);
3831 static bin_tree_t *
3832 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3833 const re_token_t *token)
3835 bin_tree_t *tree;
3836 if (__glibc_unlikely (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE))
3838 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3840 if (storage == NULL)
3841 return NULL;
3842 storage->next = dfa->str_tree_storage;
3843 dfa->str_tree_storage = storage;
3844 dfa->str_tree_storage_idx = 0;
3846 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3848 tree->parent = NULL;
3849 tree->left = left;
3850 tree->right = right;
3851 tree->token = *token;
3852 tree->token.duplicated = 0;
3853 tree->token.opt_subexp = 0;
3854 tree->first = NULL;
3855 tree->next = NULL;
3856 tree->node_idx = -1;
3858 if (left != NULL)
3859 left->parent = tree;
3860 if (right != NULL)
3861 right->parent = tree;
3862 return tree;
3865 /* Mark the tree SRC as an optional subexpression.
3866 To be called from preorder or postorder. */
3868 static reg_errcode_t
3869 mark_opt_subexp (void *extra, bin_tree_t *node)
3871 Idx idx = (uintptr_t) extra;
3872 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3873 node->token.opt_subexp = 1;
3875 return REG_NOERROR;
3878 /* Free the allocated memory inside NODE. */
3880 static void
3881 free_token (re_token_t *node)
3883 #ifdef RE_ENABLE_I18N
3884 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3885 free_charset (node->opr.mbcset);
3886 else
3887 #endif /* RE_ENABLE_I18N */
3888 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3889 re_free (node->opr.sbcset);
3892 /* Worker function for tree walking. Free the allocated memory inside NODE
3893 and its children. */
3895 static reg_errcode_t
3896 free_tree (void *extra, bin_tree_t *node)
3898 free_token (&node->token);
3899 return REG_NOERROR;
3903 /* Duplicate the node SRC, and return new node. This is a preorder
3904 visit similar to the one implemented by the generic visitor, but
3905 we need more infrastructure to maintain two parallel trees --- so,
3906 it's easier to duplicate. */
3908 static bin_tree_t *
3909 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3911 const bin_tree_t *node;
3912 bin_tree_t *dup_root;
3913 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3915 for (node = root; ; )
3917 /* Create a new tree and link it back to the current parent. */
3918 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3919 if (*p_new == NULL)
3920 return NULL;
3921 (*p_new)->parent = dup_node;
3922 (*p_new)->token.duplicated = 1;
3923 dup_node = *p_new;
3925 /* Go to the left node, or up and to the right. */
3926 if (node->left)
3928 node = node->left;
3929 p_new = &dup_node->left;
3931 else
3933 const bin_tree_t *prev = NULL;
3934 while (node->right == prev || node->right == NULL)
3936 prev = node;
3937 node = node->parent;
3938 dup_node = dup_node->parent;
3939 if (!node)
3940 return dup_root;
3942 node = node->right;
3943 p_new = &dup_node->right;