kernel - Handle spinlock indefinite wait edge case
[dragonfly.git] / contrib / grep / lib / regcomp.c
blob01b668b193265e9a6bc7081bf02f4e16a38d8ac5
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2015 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public
8 License as published by the Free Software Foundation; either
9 version 3 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 #ifdef _LIBC
21 # include <locale/weight.h>
22 #endif
24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 size_t length, reg_syntax_t syntax);
26 static void re_compile_fastmap_iter (regex_t *bufp,
27 const re_dfastate_t *init_state,
28 char *fastmap);
29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30 #ifdef RE_ENABLE_I18N
31 static void free_charset (re_charset_t *cset);
32 #endif /* RE_ENABLE_I18N */
33 static void free_workarea_compile (regex_t *preg);
34 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35 #ifdef RE_ENABLE_I18N
36 static void optimize_utf8 (re_dfa_t *dfa);
37 #endif
38 static reg_errcode_t analyze (regex_t *preg);
39 static reg_errcode_t preorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42 static reg_errcode_t postorder (bin_tree_t *root,
43 reg_errcode_t (fn (void *, bin_tree_t *)),
44 void *extra);
45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 bin_tree_t *node);
49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54 unsigned int constraint);
55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
57 Idx node, bool root);
58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59 static Idx fetch_number (re_string_t *input, re_token_t *token,
60 reg_syntax_t syntax);
61 static int peek_token (re_token_t *token, re_string_t *input,
62 reg_syntax_t syntax) internal_function;
63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
64 reg_syntax_t syntax, reg_errcode_t *err);
65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 Idx nest, reg_errcode_t *err);
68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 Idx nest, reg_errcode_t *err);
71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 Idx nest, reg_errcode_t *err);
74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
75 re_token_t *token, reg_syntax_t syntax,
76 Idx nest, reg_errcode_t *err);
77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
78 re_dfa_t *dfa, re_token_t *token,
79 reg_syntax_t syntax, reg_errcode_t *err);
80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
81 re_token_t *token, reg_syntax_t syntax,
82 reg_errcode_t *err);
83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84 re_string_t *regexp,
85 re_token_t *token, int token_len,
86 re_dfa_t *dfa,
87 reg_syntax_t syntax,
88 bool accept_hyphen);
89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 re_string_t *regexp,
91 re_token_t *token);
92 #ifdef RE_ENABLE_I18N
93 static reg_errcode_t build_equiv_class (bitset_t sbcset,
94 re_charset_t *mbcset,
95 Idx *equiv_class_alloc,
96 const unsigned char *name);
97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98 bitset_t sbcset,
99 re_charset_t *mbcset,
100 Idx *char_class_alloc,
101 const char *class_name,
102 reg_syntax_t syntax);
103 #else /* not RE_ENABLE_I18N */
104 static reg_errcode_t build_equiv_class (bitset_t sbcset,
105 const unsigned char *name);
106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107 bitset_t sbcset,
108 const char *class_name,
109 reg_syntax_t syntax);
110 #endif /* not RE_ENABLE_I18N */
111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
112 RE_TRANSLATE_TYPE trans,
113 const char *class_name,
114 const char *extra,
115 bool non_match, reg_errcode_t *err);
116 static bin_tree_t *create_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 re_token_type_t type);
119 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
120 bin_tree_t *left, bin_tree_t *right,
121 const re_token_t *token);
122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
123 static void free_token (re_token_t *node);
124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
127 /* This table gives an error message for each of the error codes listed
128 in regex.h. Obviously the order here has to be same as there.
129 POSIX doesn't require that we do anything for REG_NOERROR,
130 but why not be nice? */
132 static const char __re_error_msgid[] =
134 #define REG_NOERROR_IDX 0
135 gettext_noop ("Success") /* REG_NOERROR */
136 "\0"
137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138 gettext_noop ("No match") /* REG_NOMATCH */
139 "\0"
140 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
142 "\0"
143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
145 "\0"
146 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
148 "\0"
149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
151 "\0"
152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
154 "\0"
155 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
156 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
157 "\0"
158 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
159 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
160 "\0"
161 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
163 "\0"
164 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
166 "\0"
167 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168 gettext_noop ("Invalid range end") /* REG_ERANGE */
169 "\0"
170 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171 gettext_noop ("Memory exhausted") /* REG_ESPACE */
172 "\0"
173 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
175 "\0"
176 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177 gettext_noop ("Premature end of regular expression") /* REG_EEND */
178 "\0"
179 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180 gettext_noop ("Regular expression too big") /* REG_ESIZE */
181 "\0"
182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
186 static const size_t __re_error_msgid_idx[] =
188 REG_NOERROR_IDX,
189 REG_NOMATCH_IDX,
190 REG_BADPAT_IDX,
191 REG_ECOLLATE_IDX,
192 REG_ECTYPE_IDX,
193 REG_EESCAPE_IDX,
194 REG_ESUBREG_IDX,
195 REG_EBRACK_IDX,
196 REG_EPAREN_IDX,
197 REG_EBRACE_IDX,
198 REG_BADBR_IDX,
199 REG_ERANGE_IDX,
200 REG_ESPACE_IDX,
201 REG_BADRPT_IDX,
202 REG_EEND_IDX,
203 REG_ESIZE_IDX,
204 REG_ERPAREN_IDX
207 /* Entry points for GNU code. */
209 /* re_compile_pattern is the GNU regular expression compiler: it
210 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211 Returns 0 if the pattern was valid, otherwise an error string.
213 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214 are set in BUFP on entry. */
216 #ifdef _LIBC
217 const char *
218 re_compile_pattern (pattern, length, bufp)
219 const char *pattern;
220 size_t length;
221 struct re_pattern_buffer *bufp;
222 #else /* size_t might promote */
223 const char *
224 re_compile_pattern (const char *pattern, size_t length,
225 struct re_pattern_buffer *bufp)
226 #endif
228 reg_errcode_t ret;
230 /* And GNU code determines whether or not to get register information
231 by passing null for the REGS argument to re_match, etc., not by
232 setting no_sub, unless RE_NO_SUB is set. */
233 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
235 /* Match anchors at newline. */
236 bufp->newline_anchor = 1;
238 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
240 if (!ret)
241 return NULL;
242 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
244 #ifdef _LIBC
245 weak_alias (__re_compile_pattern, re_compile_pattern)
246 #endif
248 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
249 also be assigned to arbitrarily: each pattern buffer stores its own
250 syntax, so it can be changed between regex compilations. */
251 /* This has no initializer because initialized variables in Emacs
252 become read-only after dumping. */
253 reg_syntax_t re_syntax_options;
256 /* Specify the precise syntax of regexps for compilation. This provides
257 for compatibility for various utilities which historically have
258 different, incompatible syntaxes.
260 The argument SYNTAX is a bit mask comprised of the various bits
261 defined in regex.h. We return the old syntax. */
263 reg_syntax_t
264 re_set_syntax (syntax)
265 reg_syntax_t syntax;
267 reg_syntax_t ret = re_syntax_options;
269 re_syntax_options = syntax;
270 return ret;
272 #ifdef _LIBC
273 weak_alias (__re_set_syntax, re_set_syntax)
274 #endif
277 re_compile_fastmap (bufp)
278 struct re_pattern_buffer *bufp;
280 re_dfa_t *dfa = bufp->buffer;
281 char *fastmap = bufp->fastmap;
283 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
284 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
285 if (dfa->init_state != dfa->init_state_word)
286 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
287 if (dfa->init_state != dfa->init_state_nl)
288 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
289 if (dfa->init_state != dfa->init_state_begbuf)
290 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
291 bufp->fastmap_accurate = 1;
292 return 0;
294 #ifdef _LIBC
295 weak_alias (__re_compile_fastmap, re_compile_fastmap)
296 #endif
298 static inline void
299 __attribute__ ((always_inline))
300 re_set_fastmap (char *fastmap, bool icase, int ch)
302 fastmap[ch] = 1;
303 if (icase)
304 fastmap[tolower (ch)] = 1;
307 /* Helper function for re_compile_fastmap.
308 Compile fastmap for the initial_state INIT_STATE. */
310 static void
311 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
312 char *fastmap)
314 re_dfa_t *dfa = bufp->buffer;
315 Idx node_cnt;
316 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
317 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
319 Idx node = init_state->nodes.elems[node_cnt];
320 re_token_type_t type = dfa->nodes[node].type;
322 if (type == CHARACTER)
324 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
325 #ifdef RE_ENABLE_I18N
326 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
328 unsigned char buf[MB_LEN_MAX];
329 unsigned char *p;
330 wchar_t wc;
331 mbstate_t state;
333 p = buf;
334 *p++ = dfa->nodes[node].opr.c;
335 while (++node < dfa->nodes_len
336 && dfa->nodes[node].type == CHARACTER
337 && dfa->nodes[node].mb_partial)
338 *p++ = dfa->nodes[node].opr.c;
339 memset (&state, '\0', sizeof (state));
340 if (__mbrtowc (&wc, (const char *) buf, p - buf,
341 &state) == p - buf
342 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
343 != (size_t) -1))
344 re_set_fastmap (fastmap, false, buf[0]);
346 #endif
348 else if (type == SIMPLE_BRACKET)
350 int i, ch;
351 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
353 int j;
354 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
355 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
356 if (w & ((bitset_word_t) 1 << j))
357 re_set_fastmap (fastmap, icase, ch);
360 #ifdef RE_ENABLE_I18N
361 else if (type == COMPLEX_BRACKET)
363 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
364 Idx i;
366 # ifdef _LIBC
367 /* See if we have to try all bytes which start multiple collation
368 elements.
369 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
370 collation element, and don't catch 'b' since 'b' is
371 the only collation element which starts from 'b' (and
372 it is caught by SIMPLE_BRACKET). */
373 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
374 && (cset->ncoll_syms || cset->nranges))
376 const int32_t *table = (const int32_t *)
377 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
378 for (i = 0; i < SBC_MAX; ++i)
379 if (table[i] < 0)
380 re_set_fastmap (fastmap, icase, i);
382 # endif /* _LIBC */
384 /* See if we have to start the match at all multibyte characters,
385 i.e. where we would not find an invalid sequence. This only
386 applies to multibyte character sets; for single byte character
387 sets, the SIMPLE_BRACKET again suffices. */
388 if (dfa->mb_cur_max > 1
389 && (cset->nchar_classes || cset->non_match || cset->nranges
390 # ifdef _LIBC
391 || cset->nequiv_classes
392 # endif /* _LIBC */
395 unsigned char c = 0;
398 mbstate_t mbs;
399 memset (&mbs, 0, sizeof (mbs));
400 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
401 re_set_fastmap (fastmap, false, (int) c);
403 while (++c != 0);
406 else
408 /* ... Else catch all bytes which can start the mbchars. */
409 for (i = 0; i < cset->nmbchars; ++i)
411 char buf[256];
412 mbstate_t state;
413 memset (&state, '\0', sizeof (state));
414 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
415 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
416 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
418 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
419 != (size_t) -1)
420 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
425 #endif /* RE_ENABLE_I18N */
426 else if (type == OP_PERIOD
427 #ifdef RE_ENABLE_I18N
428 || type == OP_UTF8_PERIOD
429 #endif /* RE_ENABLE_I18N */
430 || type == END_OF_RE)
432 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
433 if (type == END_OF_RE)
434 bufp->can_be_null = 1;
435 return;
440 /* Entry point for POSIX code. */
441 /* regcomp takes a regular expression as a string and compiles it.
443 PREG is a regex_t *. We do not expect any fields to be initialized,
444 since POSIX says we shouldn't. Thus, we set
446 'buffer' to the compiled pattern;
447 'used' to the length of the compiled pattern;
448 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
449 REG_EXTENDED bit in CFLAGS is set; otherwise, to
450 RE_SYNTAX_POSIX_BASIC;
451 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
452 'fastmap' to an allocated space for the fastmap;
453 'fastmap_accurate' to zero;
454 're_nsub' to the number of subexpressions in PATTERN.
456 PATTERN is the address of the pattern string.
458 CFLAGS is a series of bits which affect compilation.
460 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
461 use POSIX basic syntax.
463 If REG_NEWLINE is set, then . and [^...] don't match newline.
464 Also, regexec will try a match beginning after every newline.
466 If REG_ICASE is set, then we considers upper- and lowercase
467 versions of letters to be equivalent when matching.
469 If REG_NOSUB is set, then when PREG is passed to regexec, that
470 routine will report only success or failure, and nothing about the
471 registers.
473 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
474 the return codes and their meanings.) */
477 regcomp (preg, pattern, cflags)
478 regex_t *_Restrict_ preg;
479 const char *_Restrict_ pattern;
480 int cflags;
482 reg_errcode_t ret;
483 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
484 : RE_SYNTAX_POSIX_BASIC);
486 preg->buffer = NULL;
487 preg->allocated = 0;
488 preg->used = 0;
490 /* Try to allocate space for the fastmap. */
491 preg->fastmap = re_malloc (char, SBC_MAX);
492 if (BE (preg->fastmap == NULL, 0))
493 return REG_ESPACE;
495 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
497 /* If REG_NEWLINE is set, newlines are treated differently. */
498 if (cflags & REG_NEWLINE)
499 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
500 syntax &= ~RE_DOT_NEWLINE;
501 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
502 /* It also changes the matching behavior. */
503 preg->newline_anchor = 1;
505 else
506 preg->newline_anchor = 0;
507 preg->no_sub = !!(cflags & REG_NOSUB);
508 preg->translate = NULL;
510 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
512 /* POSIX doesn't distinguish between an unmatched open-group and an
513 unmatched close-group: both are REG_EPAREN. */
514 if (ret == REG_ERPAREN)
515 ret = REG_EPAREN;
517 /* We have already checked preg->fastmap != NULL. */
518 if (BE (ret == REG_NOERROR, 1))
519 /* Compute the fastmap now, since regexec cannot modify the pattern
520 buffer. This function never fails in this implementation. */
521 (void) re_compile_fastmap (preg);
522 else
524 /* Some error occurred while compiling the expression. */
525 re_free (preg->fastmap);
526 preg->fastmap = NULL;
529 return (int) ret;
531 #ifdef _LIBC
532 weak_alias (__regcomp, regcomp)
533 #endif
535 /* Returns a message corresponding to an error code, ERRCODE, returned
536 from either regcomp or regexec. We don't use PREG here. */
538 #ifdef _LIBC
539 size_t
540 regerror (errcode, preg, errbuf, errbuf_size)
541 int errcode;
542 const regex_t *_Restrict_ preg;
543 char *_Restrict_ errbuf;
544 size_t errbuf_size;
545 #else /* size_t might promote */
546 size_t
547 regerror (int errcode, const regex_t *_Restrict_ preg,
548 char *_Restrict_ errbuf, size_t errbuf_size)
549 #endif
551 const char *msg;
552 size_t msg_size;
554 if (BE (errcode < 0
555 || errcode >= (int) (sizeof (__re_error_msgid_idx)
556 / sizeof (__re_error_msgid_idx[0])), 0))
557 /* Only error codes returned by the rest of the code should be passed
558 to this routine. If we are given anything else, or if other regex
559 code generates an invalid error code, then the program has a bug.
560 Dump core so we can fix it. */
561 abort ();
563 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
565 msg_size = strlen (msg) + 1; /* Includes the null. */
567 if (BE (errbuf_size != 0, 1))
569 size_t cpy_size = msg_size;
570 if (BE (msg_size > errbuf_size, 0))
572 cpy_size = errbuf_size - 1;
573 errbuf[cpy_size] = '\0';
575 memcpy (errbuf, msg, cpy_size);
578 return msg_size;
580 #ifdef _LIBC
581 weak_alias (__regerror, regerror)
582 #endif
585 #ifdef RE_ENABLE_I18N
586 /* This static array is used for the map to single-byte characters when
587 UTF-8 is used. Otherwise we would allocate memory just to initialize
588 it the same all the time. UTF-8 is the preferred encoding so this is
589 a worthwhile optimization. */
590 static const bitset_t utf8_sb_map =
592 /* Set the first 128 bits. */
593 # if defined __GNUC__ && !defined __STRICT_ANSI__
594 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
595 # else
596 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
597 # error "bitset_word_t is narrower than 32 bits"
598 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
599 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
600 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
601 BITSET_WORD_MAX, BITSET_WORD_MAX,
602 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
603 BITSET_WORD_MAX,
604 # endif
605 (BITSET_WORD_MAX
606 >> (SBC_MAX % BITSET_WORD_BITS == 0
608 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
609 # endif
611 #endif
614 static void
615 free_dfa_content (re_dfa_t *dfa)
617 Idx i, j;
619 if (dfa->nodes)
620 for (i = 0; i < dfa->nodes_len; ++i)
621 free_token (dfa->nodes + i);
622 re_free (dfa->nexts);
623 for (i = 0; i < dfa->nodes_len; ++i)
625 if (dfa->eclosures != NULL)
626 re_node_set_free (dfa->eclosures + i);
627 if (dfa->inveclosures != NULL)
628 re_node_set_free (dfa->inveclosures + i);
629 if (dfa->edests != NULL)
630 re_node_set_free (dfa->edests + i);
632 re_free (dfa->edests);
633 re_free (dfa->eclosures);
634 re_free (dfa->inveclosures);
635 re_free (dfa->nodes);
637 if (dfa->state_table)
638 for (i = 0; i <= dfa->state_hash_mask; ++i)
640 struct re_state_table_entry *entry = dfa->state_table + i;
641 for (j = 0; j < entry->num; ++j)
643 re_dfastate_t *state = entry->array[j];
644 free_state (state);
646 re_free (entry->array);
648 re_free (dfa->state_table);
649 #ifdef RE_ENABLE_I18N
650 if (dfa->sb_char != utf8_sb_map)
651 re_free (dfa->sb_char);
652 #endif
653 re_free (dfa->subexp_map);
654 #ifdef DEBUG
655 re_free (dfa->re_str);
656 #endif
658 re_free (dfa);
662 /* Free dynamically allocated space used by PREG. */
664 void
665 regfree (preg)
666 regex_t *preg;
668 re_dfa_t *dfa = preg->buffer;
669 if (BE (dfa != NULL, 1))
671 lock_fini (dfa->lock);
672 free_dfa_content (dfa);
674 preg->buffer = NULL;
675 preg->allocated = 0;
677 re_free (preg->fastmap);
678 preg->fastmap = NULL;
680 re_free (preg->translate);
681 preg->translate = NULL;
683 #ifdef _LIBC
684 weak_alias (__regfree, regfree)
685 #endif
687 /* Entry points compatible with 4.2 BSD regex library. We don't define
688 them unless specifically requested. */
690 #if defined _REGEX_RE_COMP || defined _LIBC
692 /* BSD has one and only one pattern buffer. */
693 static struct re_pattern_buffer re_comp_buf;
695 char *
696 # ifdef _LIBC
697 /* Make these definitions weak in libc, so POSIX programs can redefine
698 these names if they don't use our functions, and still use
699 regcomp/regexec above without link errors. */
700 weak_function
701 # endif
702 re_comp (s)
703 const char *s;
705 reg_errcode_t ret;
706 char *fastmap;
708 if (!s)
710 if (!re_comp_buf.buffer)
711 return gettext ("No previous regular expression");
712 return 0;
715 if (re_comp_buf.buffer)
717 fastmap = re_comp_buf.fastmap;
718 re_comp_buf.fastmap = NULL;
719 __regfree (&re_comp_buf);
720 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
721 re_comp_buf.fastmap = fastmap;
724 if (re_comp_buf.fastmap == NULL)
726 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
727 if (re_comp_buf.fastmap == NULL)
728 return (char *) gettext (__re_error_msgid
729 + __re_error_msgid_idx[(int) REG_ESPACE]);
732 /* Since 're_exec' always passes NULL for the 'regs' argument, we
733 don't need to initialize the pattern buffer fields which affect it. */
735 /* Match anchors at newlines. */
736 re_comp_buf.newline_anchor = 1;
738 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
740 if (!ret)
741 return NULL;
743 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
744 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
747 #ifdef _LIBC
748 libc_freeres_fn (free_mem)
750 __regfree (&re_comp_buf);
752 #endif
754 #endif /* _REGEX_RE_COMP */
756 /* Internal entry point.
757 Compile the regular expression PATTERN, whose length is LENGTH.
758 SYNTAX indicate regular expression's syntax. */
760 static reg_errcode_t
761 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
762 reg_syntax_t syntax)
764 reg_errcode_t err = REG_NOERROR;
765 re_dfa_t *dfa;
766 re_string_t regexp;
768 /* Initialize the pattern buffer. */
769 preg->fastmap_accurate = 0;
770 preg->syntax = syntax;
771 preg->not_bol = preg->not_eol = 0;
772 preg->used = 0;
773 preg->re_nsub = 0;
774 preg->can_be_null = 0;
775 preg->regs_allocated = REGS_UNALLOCATED;
777 /* Initialize the dfa. */
778 dfa = preg->buffer;
779 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
781 /* If zero allocated, but buffer is non-null, try to realloc
782 enough space. This loses if buffer's address is bogus, but
783 that is the user's responsibility. If ->buffer is NULL this
784 is a simple allocation. */
785 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
786 if (dfa == NULL)
787 return REG_ESPACE;
788 preg->allocated = sizeof (re_dfa_t);
789 preg->buffer = dfa;
791 preg->used = sizeof (re_dfa_t);
793 err = init_dfa (dfa, length);
794 if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
795 err = REG_ESPACE;
796 if (BE (err != REG_NOERROR, 0))
798 free_dfa_content (dfa);
799 preg->buffer = NULL;
800 preg->allocated = 0;
801 return err;
803 #ifdef DEBUG
804 /* Note: length+1 will not overflow since it is checked in init_dfa. */
805 dfa->re_str = re_malloc (char, length + 1);
806 strncpy (dfa->re_str, pattern, length + 1);
807 #endif
809 err = re_string_construct (&regexp, pattern, length, preg->translate,
810 (syntax & RE_ICASE) != 0, dfa);
811 if (BE (err != REG_NOERROR, 0))
813 re_compile_internal_free_return:
814 free_workarea_compile (preg);
815 re_string_destruct (&regexp);
816 lock_fini (dfa->lock);
817 free_dfa_content (dfa);
818 preg->buffer = NULL;
819 preg->allocated = 0;
820 return err;
823 /* Parse the regular expression, and build a structure tree. */
824 preg->re_nsub = 0;
825 dfa->str_tree = parse (&regexp, preg, syntax, &err);
826 if (BE (dfa->str_tree == NULL, 0))
827 goto re_compile_internal_free_return;
829 /* Analyze the tree and create the nfa. */
830 err = analyze (preg);
831 if (BE (err != REG_NOERROR, 0))
832 goto re_compile_internal_free_return;
834 #ifdef RE_ENABLE_I18N
835 /* If possible, do searching in single byte encoding to speed things up. */
836 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
837 optimize_utf8 (dfa);
838 #endif
840 /* Then create the initial state of the dfa. */
841 err = create_initial_state (dfa);
843 /* Release work areas. */
844 free_workarea_compile (preg);
845 re_string_destruct (&regexp);
847 if (BE (err != REG_NOERROR, 0))
849 lock_fini (dfa->lock);
850 free_dfa_content (dfa);
851 preg->buffer = NULL;
852 preg->allocated = 0;
855 return err;
858 /* Initialize DFA. We use the length of the regular expression PAT_LEN
859 as the initial length of some arrays. */
861 static reg_errcode_t
862 init_dfa (re_dfa_t *dfa, size_t pat_len)
864 __re_size_t table_size;
865 #ifndef _LIBC
866 const char *codeset_name;
867 #endif
868 #ifdef RE_ENABLE_I18N
869 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
870 #else
871 size_t max_i18n_object_size = 0;
872 #endif
873 size_t max_object_size =
874 MAX (sizeof (struct re_state_table_entry),
875 MAX (sizeof (re_token_t),
876 MAX (sizeof (re_node_set),
877 MAX (sizeof (regmatch_t),
878 max_i18n_object_size))));
880 memset (dfa, '\0', sizeof (re_dfa_t));
882 /* Force allocation of str_tree_storage the first time. */
883 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
885 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
886 calculation below, and for similar doubling calculations
887 elsewhere. And it's <= rather than <, because some of the
888 doubling calculations add 1 afterwards. */
889 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
890 return REG_ESPACE;
892 dfa->nodes_alloc = pat_len + 1;
893 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
895 /* table_size = 2 ^ ceil(log pat_len) */
896 for (table_size = 1; ; table_size <<= 1)
897 if (table_size > pat_len)
898 break;
900 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
901 dfa->state_hash_mask = table_size - 1;
903 dfa->mb_cur_max = MB_CUR_MAX;
904 #ifdef _LIBC
905 if (dfa->mb_cur_max == 6
906 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
907 dfa->is_utf8 = 1;
908 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
909 != 0);
910 #else
911 codeset_name = nl_langinfo (CODESET);
912 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
913 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
914 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
915 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
916 dfa->is_utf8 = 1;
918 /* We check exhaustively in the loop below if this charset is a
919 superset of ASCII. */
920 dfa->map_notascii = 0;
921 #endif
923 #ifdef RE_ENABLE_I18N
924 if (dfa->mb_cur_max > 1)
926 if (dfa->is_utf8)
927 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
928 else
930 int i, j, ch;
932 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
933 if (BE (dfa->sb_char == NULL, 0))
934 return REG_ESPACE;
936 /* Set the bits corresponding to single byte chars. */
937 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
938 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
940 wint_t wch = __btowc (ch);
941 if (wch != WEOF)
942 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
943 # ifndef _LIBC
944 if (isascii (ch) && wch != ch)
945 dfa->map_notascii = 1;
946 # endif
950 #endif
952 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
953 return REG_ESPACE;
954 return REG_NOERROR;
957 /* Initialize WORD_CHAR table, which indicate which character is
958 "word". In this case "word" means that it is the word construction
959 character used by some operators like "\<", "\>", etc. */
961 static void
962 internal_function
963 init_word_char (re_dfa_t *dfa)
965 int i = 0;
966 int j;
967 int ch = 0;
968 dfa->word_ops_used = 1;
969 if (BE (dfa->map_notascii == 0, 1))
971 bitset_word_t bits0 = 0x00000000;
972 bitset_word_t bits1 = 0x03ff0000;
973 bitset_word_t bits2 = 0x87fffffe;
974 bitset_word_t bits3 = 0x07fffffe;
975 if (BITSET_WORD_BITS == 64)
977 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
978 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
979 i = 2;
981 else if (BITSET_WORD_BITS == 32)
983 dfa->word_char[0] = bits0;
984 dfa->word_char[1] = bits1;
985 dfa->word_char[2] = bits2;
986 dfa->word_char[3] = bits3;
987 i = 4;
989 else
990 goto general_case;
991 ch = 128;
993 if (BE (dfa->is_utf8, 1))
995 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
996 return;
1000 general_case:
1001 for (; i < BITSET_WORDS; ++i)
1002 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
1003 if (isalnum (ch) || ch == '_')
1004 dfa->word_char[i] |= (bitset_word_t) 1 << j;
1007 /* Free the work area which are only used while compiling. */
1009 static void
1010 free_workarea_compile (regex_t *preg)
1012 re_dfa_t *dfa = preg->buffer;
1013 bin_tree_storage_t *storage, *next;
1014 for (storage = dfa->str_tree_storage; storage; storage = next)
1016 next = storage->next;
1017 re_free (storage);
1019 dfa->str_tree_storage = NULL;
1020 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1021 dfa->str_tree = NULL;
1022 re_free (dfa->org_indices);
1023 dfa->org_indices = NULL;
1026 /* Create initial states for all contexts. */
1028 static reg_errcode_t
1029 create_initial_state (re_dfa_t *dfa)
1031 Idx first, i;
1032 reg_errcode_t err;
1033 re_node_set init_nodes;
1035 /* Initial states have the epsilon closure of the node which is
1036 the first node of the regular expression. */
1037 first = dfa->str_tree->first->node_idx;
1038 dfa->init_node = first;
1039 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1040 if (BE (err != REG_NOERROR, 0))
1041 return err;
1043 /* The back-references which are in initial states can epsilon transit,
1044 since in this case all of the subexpressions can be null.
1045 Then we add epsilon closures of the nodes which are the next nodes of
1046 the back-references. */
1047 if (dfa->nbackref > 0)
1048 for (i = 0; i < init_nodes.nelem; ++i)
1050 Idx node_idx = init_nodes.elems[i];
1051 re_token_type_t type = dfa->nodes[node_idx].type;
1053 Idx clexp_idx;
1054 if (type != OP_BACK_REF)
1055 continue;
1056 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1058 re_token_t *clexp_node;
1059 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1060 if (clexp_node->type == OP_CLOSE_SUBEXP
1061 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1062 break;
1064 if (clexp_idx == init_nodes.nelem)
1065 continue;
1067 if (type == OP_BACK_REF)
1069 Idx dest_idx = dfa->edests[node_idx].elems[0];
1070 if (!re_node_set_contains (&init_nodes, dest_idx))
1072 reg_errcode_t merge_err
1073 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1074 if (merge_err != REG_NOERROR)
1075 return merge_err;
1076 i = 0;
1081 /* It must be the first time to invoke acquire_state. */
1082 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1083 /* We don't check ERR here, since the initial state must not be NULL. */
1084 if (BE (dfa->init_state == NULL, 0))
1085 return err;
1086 if (dfa->init_state->has_constraint)
1088 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1089 CONTEXT_WORD);
1090 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1091 CONTEXT_NEWLINE);
1092 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1093 &init_nodes,
1094 CONTEXT_NEWLINE
1095 | CONTEXT_BEGBUF);
1096 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1097 || dfa->init_state_begbuf == NULL, 0))
1098 return err;
1100 else
1101 dfa->init_state_word = dfa->init_state_nl
1102 = dfa->init_state_begbuf = dfa->init_state;
1104 re_node_set_free (&init_nodes);
1105 return REG_NOERROR;
1108 #ifdef RE_ENABLE_I18N
1109 /* If it is possible to do searching in single byte encoding instead of UTF-8
1110 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1111 DFA nodes where needed. */
1113 static void
1114 optimize_utf8 (re_dfa_t *dfa)
1116 Idx node;
1117 int i;
1118 bool mb_chars = false;
1119 bool has_period = false;
1121 for (node = 0; node < dfa->nodes_len; ++node)
1122 switch (dfa->nodes[node].type)
1124 case CHARACTER:
1125 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1126 mb_chars = true;
1127 break;
1128 case ANCHOR:
1129 switch (dfa->nodes[node].opr.ctx_type)
1131 case LINE_FIRST:
1132 case LINE_LAST:
1133 case BUF_FIRST:
1134 case BUF_LAST:
1135 break;
1136 default:
1137 /* Word anchors etc. cannot be handled. It's okay to test
1138 opr.ctx_type since constraints (for all DFA nodes) are
1139 created by ORing one or more opr.ctx_type values. */
1140 return;
1142 break;
1143 case OP_PERIOD:
1144 has_period = true;
1145 break;
1146 case OP_BACK_REF:
1147 case OP_ALT:
1148 case END_OF_RE:
1149 case OP_DUP_ASTERISK:
1150 case OP_OPEN_SUBEXP:
1151 case OP_CLOSE_SUBEXP:
1152 break;
1153 case COMPLEX_BRACKET:
1154 return;
1155 case SIMPLE_BRACKET:
1156 /* Just double check. */
1158 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1160 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1161 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1163 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1164 return;
1165 rshift = 0;
1168 break;
1169 default:
1170 abort ();
1173 if (mb_chars || has_period)
1174 for (node = 0; node < dfa->nodes_len; ++node)
1176 if (dfa->nodes[node].type == CHARACTER
1177 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1178 dfa->nodes[node].mb_partial = 0;
1179 else if (dfa->nodes[node].type == OP_PERIOD)
1180 dfa->nodes[node].type = OP_UTF8_PERIOD;
1183 /* The search can be in single byte locale. */
1184 dfa->mb_cur_max = 1;
1185 dfa->is_utf8 = 0;
1186 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1188 #endif
1190 /* Analyze the structure tree, and calculate "first", "next", "edest",
1191 "eclosure", and "inveclosure". */
1193 static reg_errcode_t
1194 analyze (regex_t *preg)
1196 re_dfa_t *dfa = preg->buffer;
1197 reg_errcode_t ret;
1199 /* Allocate arrays. */
1200 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1201 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1202 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1203 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1204 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1205 || dfa->eclosures == NULL, 0))
1206 return REG_ESPACE;
1208 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1209 if (dfa->subexp_map != NULL)
1211 Idx i;
1212 for (i = 0; i < preg->re_nsub; i++)
1213 dfa->subexp_map[i] = i;
1214 preorder (dfa->str_tree, optimize_subexps, dfa);
1215 for (i = 0; i < preg->re_nsub; i++)
1216 if (dfa->subexp_map[i] != i)
1217 break;
1218 if (i == preg->re_nsub)
1220 free (dfa->subexp_map);
1221 dfa->subexp_map = NULL;
1225 ret = postorder (dfa->str_tree, lower_subexps, preg);
1226 if (BE (ret != REG_NOERROR, 0))
1227 return ret;
1228 ret = postorder (dfa->str_tree, calc_first, dfa);
1229 if (BE (ret != REG_NOERROR, 0))
1230 return ret;
1231 preorder (dfa->str_tree, calc_next, dfa);
1232 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1233 if (BE (ret != REG_NOERROR, 0))
1234 return ret;
1235 ret = calc_eclosure (dfa);
1236 if (BE (ret != REG_NOERROR, 0))
1237 return ret;
1239 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1240 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1241 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1242 || dfa->nbackref)
1244 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1245 if (BE (dfa->inveclosures == NULL, 0))
1246 return REG_ESPACE;
1247 ret = calc_inveclosure (dfa);
1250 return ret;
1253 /* Our parse trees are very unbalanced, so we cannot use a stack to
1254 implement parse tree visits. Instead, we use parent pointers and
1255 some hairy code in these two functions. */
1256 static reg_errcode_t
1257 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1258 void *extra)
1260 bin_tree_t *node, *prev;
1262 for (node = root; ; )
1264 /* Descend down the tree, preferably to the left (or to the right
1265 if that's the only child). */
1266 while (node->left || node->right)
1267 if (node->left)
1268 node = node->left;
1269 else
1270 node = node->right;
1274 reg_errcode_t err = fn (extra, node);
1275 if (BE (err != REG_NOERROR, 0))
1276 return err;
1277 if (node->parent == NULL)
1278 return REG_NOERROR;
1279 prev = node;
1280 node = node->parent;
1282 /* Go up while we have a node that is reached from the right. */
1283 while (node->right == prev || node->right == NULL);
1284 node = node->right;
1288 static reg_errcode_t
1289 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1290 void *extra)
1292 bin_tree_t *node;
1294 for (node = root; ; )
1296 reg_errcode_t err = fn (extra, node);
1297 if (BE (err != REG_NOERROR, 0))
1298 return err;
1300 /* Go to the left node, or up and to the right. */
1301 if (node->left)
1302 node = node->left;
1303 else
1305 bin_tree_t *prev = NULL;
1306 while (node->right == prev || node->right == NULL)
1308 prev = node;
1309 node = node->parent;
1310 if (!node)
1311 return REG_NOERROR;
1313 node = node->right;
1318 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1319 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1320 backreferences as well. Requires a preorder visit. */
1321 static reg_errcode_t
1322 optimize_subexps (void *extra, bin_tree_t *node)
1324 re_dfa_t *dfa = (re_dfa_t *) extra;
1326 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1328 int idx = node->token.opr.idx;
1329 node->token.opr.idx = dfa->subexp_map[idx];
1330 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1333 else if (node->token.type == SUBEXP
1334 && node->left && node->left->token.type == SUBEXP)
1336 Idx other_idx = node->left->token.opr.idx;
1338 node->left = node->left->left;
1339 if (node->left)
1340 node->left->parent = node;
1342 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1343 if (other_idx < BITSET_WORD_BITS)
1344 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1347 return REG_NOERROR;
1350 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1351 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1352 static reg_errcode_t
1353 lower_subexps (void *extra, bin_tree_t *node)
1355 regex_t *preg = (regex_t *) extra;
1356 reg_errcode_t err = REG_NOERROR;
1358 if (node->left && node->left->token.type == SUBEXP)
1360 node->left = lower_subexp (&err, preg, node->left);
1361 if (node->left)
1362 node->left->parent = node;
1364 if (node->right && node->right->token.type == SUBEXP)
1366 node->right = lower_subexp (&err, preg, node->right);
1367 if (node->right)
1368 node->right->parent = node;
1371 return err;
1374 static bin_tree_t *
1375 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1377 re_dfa_t *dfa = preg->buffer;
1378 bin_tree_t *body = node->left;
1379 bin_tree_t *op, *cls, *tree1, *tree;
1381 if (preg->no_sub
1382 /* We do not optimize empty subexpressions, because otherwise we may
1383 have bad CONCAT nodes with NULL children. This is obviously not
1384 very common, so we do not lose much. An example that triggers
1385 this case is the sed "script" /\(\)/x. */
1386 && node->left != NULL
1387 && (node->token.opr.idx >= BITSET_WORD_BITS
1388 || !(dfa->used_bkref_map
1389 & ((bitset_word_t) 1 << node->token.opr.idx))))
1390 return node->left;
1392 /* Convert the SUBEXP node to the concatenation of an
1393 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1394 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1395 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1396 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1397 tree = create_tree (dfa, op, tree1, CONCAT);
1398 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1400 *err = REG_ESPACE;
1401 return NULL;
1404 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1405 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1406 return tree;
1409 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1410 nodes. Requires a postorder visit. */
1411 static reg_errcode_t
1412 calc_first (void *extra, bin_tree_t *node)
1414 re_dfa_t *dfa = (re_dfa_t *) extra;
1415 if (node->token.type == CONCAT)
1417 node->first = node->left->first;
1418 node->node_idx = node->left->node_idx;
1420 else
1422 node->first = node;
1423 node->node_idx = re_dfa_add_node (dfa, node->token);
1424 if (BE (node->node_idx == REG_MISSING, 0))
1425 return REG_ESPACE;
1426 if (node->token.type == ANCHOR)
1427 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1429 return REG_NOERROR;
1432 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1433 static reg_errcode_t
1434 calc_next (void *extra, bin_tree_t *node)
1436 switch (node->token.type)
1438 case OP_DUP_ASTERISK:
1439 node->left->next = node;
1440 break;
1441 case CONCAT:
1442 node->left->next = node->right->first;
1443 node->right->next = node->next;
1444 break;
1445 default:
1446 if (node->left)
1447 node->left->next = node->next;
1448 if (node->right)
1449 node->right->next = node->next;
1450 break;
1452 return REG_NOERROR;
1455 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1456 static reg_errcode_t
1457 link_nfa_nodes (void *extra, bin_tree_t *node)
1459 re_dfa_t *dfa = (re_dfa_t *) extra;
1460 Idx idx = node->node_idx;
1461 reg_errcode_t err = REG_NOERROR;
1463 switch (node->token.type)
1465 case CONCAT:
1466 break;
1468 case END_OF_RE:
1469 assert (node->next == NULL);
1470 break;
1472 case OP_DUP_ASTERISK:
1473 case OP_ALT:
1475 Idx left, right;
1476 dfa->has_plural_match = 1;
1477 if (node->left != NULL)
1478 left = node->left->first->node_idx;
1479 else
1480 left = node->next->node_idx;
1481 if (node->right != NULL)
1482 right = node->right->first->node_idx;
1483 else
1484 right = node->next->node_idx;
1485 assert (REG_VALID_INDEX (left));
1486 assert (REG_VALID_INDEX (right));
1487 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1489 break;
1491 case ANCHOR:
1492 case OP_OPEN_SUBEXP:
1493 case OP_CLOSE_SUBEXP:
1494 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1495 break;
1497 case OP_BACK_REF:
1498 dfa->nexts[idx] = node->next->node_idx;
1499 if (node->token.type == OP_BACK_REF)
1500 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1501 break;
1503 default:
1504 assert (!IS_EPSILON_NODE (node->token.type));
1505 dfa->nexts[idx] = node->next->node_idx;
1506 break;
1509 return err;
1512 /* Duplicate the epsilon closure of the node ROOT_NODE.
1513 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1514 to their own constraint. */
1516 static reg_errcode_t
1517 internal_function
1518 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1519 Idx root_node, unsigned int init_constraint)
1521 Idx org_node, clone_node;
1522 bool ok;
1523 unsigned int constraint = init_constraint;
1524 for (org_node = top_org_node, clone_node = top_clone_node;;)
1526 Idx org_dest, clone_dest;
1527 if (dfa->nodes[org_node].type == OP_BACK_REF)
1529 /* If the back reference epsilon-transit, its destination must
1530 also have the constraint. Then duplicate the epsilon closure
1531 of the destination of the back reference, and store it in
1532 edests of the back reference. */
1533 org_dest = dfa->nexts[org_node];
1534 re_node_set_empty (dfa->edests + clone_node);
1535 clone_dest = duplicate_node (dfa, org_dest, constraint);
1536 if (BE (clone_dest == REG_MISSING, 0))
1537 return REG_ESPACE;
1538 dfa->nexts[clone_node] = dfa->nexts[org_node];
1539 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1540 if (BE (! ok, 0))
1541 return REG_ESPACE;
1543 else if (dfa->edests[org_node].nelem == 0)
1545 /* In case of the node can't epsilon-transit, don't duplicate the
1546 destination and store the original destination as the
1547 destination of the node. */
1548 dfa->nexts[clone_node] = dfa->nexts[org_node];
1549 break;
1551 else if (dfa->edests[org_node].nelem == 1)
1553 /* In case of the node can epsilon-transit, and it has only one
1554 destination. */
1555 org_dest = dfa->edests[org_node].elems[0];
1556 re_node_set_empty (dfa->edests + clone_node);
1557 /* If the node is root_node itself, it means the epsilon closure
1558 has a loop. Then tie it to the destination of the root_node. */
1559 if (org_node == root_node && clone_node != org_node)
1561 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1562 if (BE (! ok, 0))
1563 return REG_ESPACE;
1564 break;
1566 /* In case the node has another constraint, append it. */
1567 constraint |= dfa->nodes[org_node].constraint;
1568 clone_dest = duplicate_node (dfa, org_dest, constraint);
1569 if (BE (clone_dest == REG_MISSING, 0))
1570 return REG_ESPACE;
1571 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1572 if (BE (! ok, 0))
1573 return REG_ESPACE;
1575 else /* dfa->edests[org_node].nelem == 2 */
1577 /* In case of the node can epsilon-transit, and it has two
1578 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1579 org_dest = dfa->edests[org_node].elems[0];
1580 re_node_set_empty (dfa->edests + clone_node);
1581 /* Search for a duplicated node which satisfies the constraint. */
1582 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1583 if (clone_dest == REG_MISSING)
1585 /* There is no such duplicated node, create a new one. */
1586 reg_errcode_t err;
1587 clone_dest = duplicate_node (dfa, org_dest, constraint);
1588 if (BE (clone_dest == REG_MISSING, 0))
1589 return REG_ESPACE;
1590 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1591 if (BE (! ok, 0))
1592 return REG_ESPACE;
1593 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1594 root_node, constraint);
1595 if (BE (err != REG_NOERROR, 0))
1596 return err;
1598 else
1600 /* There is a duplicated node which satisfies the constraint,
1601 use it to avoid infinite loop. */
1602 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1603 if (BE (! ok, 0))
1604 return REG_ESPACE;
1607 org_dest = dfa->edests[org_node].elems[1];
1608 clone_dest = duplicate_node (dfa, org_dest, constraint);
1609 if (BE (clone_dest == REG_MISSING, 0))
1610 return REG_ESPACE;
1611 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1612 if (BE (! ok, 0))
1613 return REG_ESPACE;
1615 org_node = org_dest;
1616 clone_node = clone_dest;
1618 return REG_NOERROR;
1621 /* Search for a node which is duplicated from the node ORG_NODE, and
1622 satisfies the constraint CONSTRAINT. */
1624 static Idx
1625 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1626 unsigned int constraint)
1628 Idx idx;
1629 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1631 if (org_node == dfa->org_indices[idx]
1632 && constraint == dfa->nodes[idx].constraint)
1633 return idx; /* Found. */
1635 return REG_MISSING; /* Not found. */
1638 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1639 Return the index of the new node, or REG_MISSING if insufficient storage is
1640 available. */
1642 static Idx
1643 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1645 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1646 if (BE (dup_idx != REG_MISSING, 1))
1648 dfa->nodes[dup_idx].constraint = constraint;
1649 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1650 dfa->nodes[dup_idx].duplicated = 1;
1652 /* Store the index of the original node. */
1653 dfa->org_indices[dup_idx] = org_idx;
1655 return dup_idx;
1658 static reg_errcode_t
1659 calc_inveclosure (re_dfa_t *dfa)
1661 Idx src, idx;
1662 bool ok;
1663 for (idx = 0; idx < dfa->nodes_len; ++idx)
1664 re_node_set_init_empty (dfa->inveclosures + idx);
1666 for (src = 0; src < dfa->nodes_len; ++src)
1668 Idx *elems = dfa->eclosures[src].elems;
1669 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1671 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1672 if (BE (! ok, 0))
1673 return REG_ESPACE;
1677 return REG_NOERROR;
1680 /* Calculate "eclosure" for all the node in DFA. */
1682 static reg_errcode_t
1683 calc_eclosure (re_dfa_t *dfa)
1685 Idx node_idx;
1686 bool incomplete;
1687 #ifdef DEBUG
1688 assert (dfa->nodes_len > 0);
1689 #endif
1690 incomplete = false;
1691 /* For each nodes, calculate epsilon closure. */
1692 for (node_idx = 0; ; ++node_idx)
1694 reg_errcode_t err;
1695 re_node_set eclosure_elem;
1696 if (node_idx == dfa->nodes_len)
1698 if (!incomplete)
1699 break;
1700 incomplete = false;
1701 node_idx = 0;
1704 #ifdef DEBUG
1705 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1706 #endif
1708 /* If we have already calculated, skip it. */
1709 if (dfa->eclosures[node_idx].nelem != 0)
1710 continue;
1711 /* Calculate epsilon closure of 'node_idx'. */
1712 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1713 if (BE (err != REG_NOERROR, 0))
1714 return err;
1716 if (dfa->eclosures[node_idx].nelem == 0)
1718 incomplete = true;
1719 re_node_set_free (&eclosure_elem);
1722 return REG_NOERROR;
1725 /* Calculate epsilon closure of NODE. */
1727 static reg_errcode_t
1728 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1730 reg_errcode_t err;
1731 Idx i;
1732 re_node_set eclosure;
1733 bool ok;
1734 bool incomplete = false;
1735 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1736 if (BE (err != REG_NOERROR, 0))
1737 return err;
1739 /* This indicates that we are calculating this node now.
1740 We reference this value to avoid infinite loop. */
1741 dfa->eclosures[node].nelem = REG_MISSING;
1743 /* If the current node has constraints, duplicate all nodes
1744 since they must inherit the constraints. */
1745 if (dfa->nodes[node].constraint
1746 && dfa->edests[node].nelem
1747 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1749 err = duplicate_node_closure (dfa, node, node, node,
1750 dfa->nodes[node].constraint);
1751 if (BE (err != REG_NOERROR, 0))
1752 return err;
1755 /* Expand each epsilon destination nodes. */
1756 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1757 for (i = 0; i < dfa->edests[node].nelem; ++i)
1759 re_node_set eclosure_elem;
1760 Idx edest = dfa->edests[node].elems[i];
1761 /* If calculating the epsilon closure of 'edest' is in progress,
1762 return intermediate result. */
1763 if (dfa->eclosures[edest].nelem == REG_MISSING)
1765 incomplete = true;
1766 continue;
1768 /* If we haven't calculated the epsilon closure of 'edest' yet,
1769 calculate now. Otherwise use calculated epsilon closure. */
1770 if (dfa->eclosures[edest].nelem == 0)
1772 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1773 if (BE (err != REG_NOERROR, 0))
1774 return err;
1776 else
1777 eclosure_elem = dfa->eclosures[edest];
1778 /* Merge the epsilon closure of 'edest'. */
1779 err = re_node_set_merge (&eclosure, &eclosure_elem);
1780 if (BE (err != REG_NOERROR, 0))
1781 return err;
1782 /* If the epsilon closure of 'edest' is incomplete,
1783 the epsilon closure of this node is also incomplete. */
1784 if (dfa->eclosures[edest].nelem == 0)
1786 incomplete = true;
1787 re_node_set_free (&eclosure_elem);
1791 /* An epsilon closure includes itself. */
1792 ok = re_node_set_insert (&eclosure, node);
1793 if (BE (! ok, 0))
1794 return REG_ESPACE;
1795 if (incomplete && !root)
1796 dfa->eclosures[node].nelem = 0;
1797 else
1798 dfa->eclosures[node] = eclosure;
1799 *new_set = eclosure;
1800 return REG_NOERROR;
1803 /* Functions for token which are used in the parser. */
1805 /* Fetch a token from INPUT.
1806 We must not use this function inside bracket expressions. */
1808 static void
1809 internal_function
1810 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1812 re_string_skip_bytes (input, peek_token (result, input, syntax));
1815 /* Peek a token from INPUT, and return the length of the token.
1816 We must not use this function inside bracket expressions. */
1818 static int
1819 internal_function
1820 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1822 unsigned char c;
1824 if (re_string_eoi (input))
1826 token->type = END_OF_RE;
1827 return 0;
1830 c = re_string_peek_byte (input, 0);
1831 token->opr.c = c;
1833 token->word_char = 0;
1834 #ifdef RE_ENABLE_I18N
1835 token->mb_partial = 0;
1836 if (input->mb_cur_max > 1 &&
1837 !re_string_first_byte (input, re_string_cur_idx (input)))
1839 token->type = CHARACTER;
1840 token->mb_partial = 1;
1841 return 1;
1843 #endif
1844 if (c == '\\')
1846 unsigned char c2;
1847 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1849 token->type = BACK_SLASH;
1850 return 1;
1853 c2 = re_string_peek_byte_case (input, 1);
1854 token->opr.c = c2;
1855 token->type = CHARACTER;
1856 #ifdef RE_ENABLE_I18N
1857 if (input->mb_cur_max > 1)
1859 wint_t wc = re_string_wchar_at (input,
1860 re_string_cur_idx (input) + 1);
1861 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1863 else
1864 #endif
1865 token->word_char = IS_WORD_CHAR (c2) != 0;
1867 switch (c2)
1869 case '|':
1870 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1871 token->type = OP_ALT;
1872 break;
1873 case '1': case '2': case '3': case '4': case '5':
1874 case '6': case '7': case '8': case '9':
1875 if (!(syntax & RE_NO_BK_REFS))
1877 token->type = OP_BACK_REF;
1878 token->opr.idx = c2 - '1';
1880 break;
1881 case '<':
1882 if (!(syntax & RE_NO_GNU_OPS))
1884 token->type = ANCHOR;
1885 token->opr.ctx_type = WORD_FIRST;
1887 break;
1888 case '>':
1889 if (!(syntax & RE_NO_GNU_OPS))
1891 token->type = ANCHOR;
1892 token->opr.ctx_type = WORD_LAST;
1894 break;
1895 case 'b':
1896 if (!(syntax & RE_NO_GNU_OPS))
1898 token->type = ANCHOR;
1899 token->opr.ctx_type = WORD_DELIM;
1901 break;
1902 case 'B':
1903 if (!(syntax & RE_NO_GNU_OPS))
1905 token->type = ANCHOR;
1906 token->opr.ctx_type = NOT_WORD_DELIM;
1908 break;
1909 case 'w':
1910 if (!(syntax & RE_NO_GNU_OPS))
1911 token->type = OP_WORD;
1912 break;
1913 case 'W':
1914 if (!(syntax & RE_NO_GNU_OPS))
1915 token->type = OP_NOTWORD;
1916 break;
1917 case 's':
1918 if (!(syntax & RE_NO_GNU_OPS))
1919 token->type = OP_SPACE;
1920 break;
1921 case 'S':
1922 if (!(syntax & RE_NO_GNU_OPS))
1923 token->type = OP_NOTSPACE;
1924 break;
1925 case '`':
1926 if (!(syntax & RE_NO_GNU_OPS))
1928 token->type = ANCHOR;
1929 token->opr.ctx_type = BUF_FIRST;
1931 break;
1932 case '\'':
1933 if (!(syntax & RE_NO_GNU_OPS))
1935 token->type = ANCHOR;
1936 token->opr.ctx_type = BUF_LAST;
1938 break;
1939 case '(':
1940 if (!(syntax & RE_NO_BK_PARENS))
1941 token->type = OP_OPEN_SUBEXP;
1942 break;
1943 case ')':
1944 if (!(syntax & RE_NO_BK_PARENS))
1945 token->type = OP_CLOSE_SUBEXP;
1946 break;
1947 case '+':
1948 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1949 token->type = OP_DUP_PLUS;
1950 break;
1951 case '?':
1952 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1953 token->type = OP_DUP_QUESTION;
1954 break;
1955 case '{':
1956 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1957 token->type = OP_OPEN_DUP_NUM;
1958 break;
1959 case '}':
1960 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1961 token->type = OP_CLOSE_DUP_NUM;
1962 break;
1963 default:
1964 break;
1966 return 2;
1969 token->type = CHARACTER;
1970 #ifdef RE_ENABLE_I18N
1971 if (input->mb_cur_max > 1)
1973 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1974 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1976 else
1977 #endif
1978 token->word_char = IS_WORD_CHAR (token->opr.c);
1980 switch (c)
1982 case '\n':
1983 if (syntax & RE_NEWLINE_ALT)
1984 token->type = OP_ALT;
1985 break;
1986 case '|':
1987 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1988 token->type = OP_ALT;
1989 break;
1990 case '*':
1991 token->type = OP_DUP_ASTERISK;
1992 break;
1993 case '+':
1994 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1995 token->type = OP_DUP_PLUS;
1996 break;
1997 case '?':
1998 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1999 token->type = OP_DUP_QUESTION;
2000 break;
2001 case '{':
2002 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2003 token->type = OP_OPEN_DUP_NUM;
2004 break;
2005 case '}':
2006 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2007 token->type = OP_CLOSE_DUP_NUM;
2008 break;
2009 case '(':
2010 if (syntax & RE_NO_BK_PARENS)
2011 token->type = OP_OPEN_SUBEXP;
2012 break;
2013 case ')':
2014 if (syntax & RE_NO_BK_PARENS)
2015 token->type = OP_CLOSE_SUBEXP;
2016 break;
2017 case '[':
2018 token->type = OP_OPEN_BRACKET;
2019 break;
2020 case '.':
2021 token->type = OP_PERIOD;
2022 break;
2023 case '^':
2024 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2025 re_string_cur_idx (input) != 0)
2027 char prev = re_string_peek_byte (input, -1);
2028 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2029 break;
2031 token->type = ANCHOR;
2032 token->opr.ctx_type = LINE_FIRST;
2033 break;
2034 case '$':
2035 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2036 re_string_cur_idx (input) + 1 != re_string_length (input))
2038 re_token_t next;
2039 re_string_skip_bytes (input, 1);
2040 peek_token (&next, input, syntax);
2041 re_string_skip_bytes (input, -1);
2042 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2043 break;
2045 token->type = ANCHOR;
2046 token->opr.ctx_type = LINE_LAST;
2047 break;
2048 default:
2049 break;
2051 return 1;
2054 /* Peek a token from INPUT, and return the length of the token.
2055 We must not use this function out of bracket expressions. */
2057 static int
2058 internal_function
2059 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2061 unsigned char c;
2062 if (re_string_eoi (input))
2064 token->type = END_OF_RE;
2065 return 0;
2067 c = re_string_peek_byte (input, 0);
2068 token->opr.c = c;
2070 #ifdef RE_ENABLE_I18N
2071 if (input->mb_cur_max > 1 &&
2072 !re_string_first_byte (input, re_string_cur_idx (input)))
2074 token->type = CHARACTER;
2075 return 1;
2077 #endif /* RE_ENABLE_I18N */
2079 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2080 && re_string_cur_idx (input) + 1 < re_string_length (input))
2082 /* In this case, '\' escape a character. */
2083 unsigned char c2;
2084 re_string_skip_bytes (input, 1);
2085 c2 = re_string_peek_byte (input, 0);
2086 token->opr.c = c2;
2087 token->type = CHARACTER;
2088 return 1;
2090 if (c == '[') /* '[' is a special char in a bracket exps. */
2092 unsigned char c2;
2093 int token_len;
2094 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2095 c2 = re_string_peek_byte (input, 1);
2096 else
2097 c2 = 0;
2098 token->opr.c = c2;
2099 token_len = 2;
2100 switch (c2)
2102 case '.':
2103 token->type = OP_OPEN_COLL_ELEM;
2104 break;
2105 case '=':
2106 token->type = OP_OPEN_EQUIV_CLASS;
2107 break;
2108 case ':':
2109 if (syntax & RE_CHAR_CLASSES)
2111 token->type = OP_OPEN_CHAR_CLASS;
2112 break;
2114 /* else fall through. */
2115 default:
2116 token->type = CHARACTER;
2117 token->opr.c = c;
2118 token_len = 1;
2119 break;
2121 return token_len;
2123 switch (c)
2125 case '-':
2126 token->type = OP_CHARSET_RANGE;
2127 break;
2128 case ']':
2129 token->type = OP_CLOSE_BRACKET;
2130 break;
2131 case '^':
2132 token->type = OP_NON_MATCH_LIST;
2133 break;
2134 default:
2135 token->type = CHARACTER;
2137 return 1;
2140 /* Functions for parser. */
2142 /* Entry point of the parser.
2143 Parse the regular expression REGEXP and return the structure tree.
2144 If an error occurs, ERR is set by error code, and return NULL.
2145 This function build the following tree, from regular expression <reg_exp>:
2149 <reg_exp> EOR
2151 CAT means concatenation.
2152 EOR means end of regular expression. */
2154 static bin_tree_t *
2155 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2156 reg_errcode_t *err)
2158 re_dfa_t *dfa = preg->buffer;
2159 bin_tree_t *tree, *eor, *root;
2160 re_token_t current_token;
2161 dfa->syntax = syntax;
2162 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2163 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2164 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2165 return NULL;
2166 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2167 if (tree != NULL)
2168 root = create_tree (dfa, tree, eor, CONCAT);
2169 else
2170 root = eor;
2171 if (BE (eor == NULL || root == NULL, 0))
2173 *err = REG_ESPACE;
2174 return NULL;
2176 return root;
2179 /* This function build the following tree, from regular expression
2180 <branch1>|<branch2>:
2184 <branch1> <branch2>
2186 ALT means alternative, which represents the operator '|'. */
2188 static bin_tree_t *
2189 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2190 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2192 re_dfa_t *dfa = preg->buffer;
2193 bin_tree_t *tree, *branch = NULL;
2194 bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2195 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2196 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2197 return NULL;
2199 while (token->type == OP_ALT)
2201 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2202 if (token->type != OP_ALT && token->type != END_OF_RE
2203 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2205 bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2206 dfa->completed_bkref_map = initial_bkref_map;
2207 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2208 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2210 if (tree != NULL)
2211 postorder (tree, free_tree, NULL);
2212 return NULL;
2214 dfa->completed_bkref_map |= accumulated_bkref_map;
2216 else
2217 branch = NULL;
2218 tree = create_tree (dfa, tree, branch, OP_ALT);
2219 if (BE (tree == NULL, 0))
2221 *err = REG_ESPACE;
2222 return NULL;
2225 return tree;
2228 /* This function build the following tree, from regular expression
2229 <exp1><exp2>:
2233 <exp1> <exp2>
2235 CAT means concatenation. */
2237 static bin_tree_t *
2238 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2239 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2241 bin_tree_t *tree, *expr;
2242 re_dfa_t *dfa = preg->buffer;
2243 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2244 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2245 return NULL;
2247 while (token->type != OP_ALT && token->type != END_OF_RE
2248 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2250 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2251 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2253 if (tree != NULL)
2254 postorder (tree, free_tree, NULL);
2255 return NULL;
2257 if (tree != NULL && expr != NULL)
2259 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2260 if (newtree == NULL)
2262 postorder (expr, free_tree, NULL);
2263 postorder (tree, free_tree, NULL);
2264 *err = REG_ESPACE;
2265 return NULL;
2267 tree = newtree;
2269 else if (tree == NULL)
2270 tree = expr;
2271 /* Otherwise expr == NULL, we don't need to create new tree. */
2273 return tree;
2276 /* This function build the following tree, from regular expression a*:
2282 static bin_tree_t *
2283 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2284 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2286 re_dfa_t *dfa = preg->buffer;
2287 bin_tree_t *tree;
2288 switch (token->type)
2290 case CHARACTER:
2291 tree = create_token_tree (dfa, NULL, NULL, token);
2292 if (BE (tree == NULL, 0))
2294 *err = REG_ESPACE;
2295 return NULL;
2297 #ifdef RE_ENABLE_I18N
2298 if (dfa->mb_cur_max > 1)
2300 while (!re_string_eoi (regexp)
2301 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2303 bin_tree_t *mbc_remain;
2304 fetch_token (token, regexp, syntax);
2305 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2306 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2307 if (BE (mbc_remain == NULL || tree == NULL, 0))
2309 *err = REG_ESPACE;
2310 return NULL;
2314 #endif
2315 break;
2316 case OP_OPEN_SUBEXP:
2317 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2318 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2319 return NULL;
2320 break;
2321 case OP_OPEN_BRACKET:
2322 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2323 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2324 return NULL;
2325 break;
2326 case OP_BACK_REF:
2327 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2329 *err = REG_ESUBREG;
2330 return NULL;
2332 dfa->used_bkref_map |= 1 << token->opr.idx;
2333 tree = create_token_tree (dfa, NULL, NULL, token);
2334 if (BE (tree == NULL, 0))
2336 *err = REG_ESPACE;
2337 return NULL;
2339 ++dfa->nbackref;
2340 dfa->has_mb_node = 1;
2341 break;
2342 case OP_OPEN_DUP_NUM:
2343 if (syntax & RE_CONTEXT_INVALID_DUP)
2345 *err = REG_BADRPT;
2346 return NULL;
2348 /* FALLTHROUGH */
2349 case OP_DUP_ASTERISK:
2350 case OP_DUP_PLUS:
2351 case OP_DUP_QUESTION:
2352 if (syntax & RE_CONTEXT_INVALID_OPS)
2354 *err = REG_BADRPT;
2355 return NULL;
2357 else if (syntax & RE_CONTEXT_INDEP_OPS)
2359 fetch_token (token, regexp, syntax);
2360 return parse_expression (regexp, preg, token, syntax, nest, err);
2362 /* else fall through */
2363 case OP_CLOSE_SUBEXP:
2364 if ((token->type == OP_CLOSE_SUBEXP) &&
2365 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2367 *err = REG_ERPAREN;
2368 return NULL;
2370 /* else fall through */
2371 case OP_CLOSE_DUP_NUM:
2372 /* We treat it as a normal character. */
2374 /* Then we can these characters as normal characters. */
2375 token->type = CHARACTER;
2376 /* mb_partial and word_char bits should be initialized already
2377 by peek_token. */
2378 tree = create_token_tree (dfa, NULL, NULL, token);
2379 if (BE (tree == NULL, 0))
2381 *err = REG_ESPACE;
2382 return NULL;
2384 break;
2385 case ANCHOR:
2386 if ((token->opr.ctx_type
2387 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2388 && dfa->word_ops_used == 0)
2389 init_word_char (dfa);
2390 if (token->opr.ctx_type == WORD_DELIM
2391 || token->opr.ctx_type == NOT_WORD_DELIM)
2393 bin_tree_t *tree_first, *tree_last;
2394 if (token->opr.ctx_type == WORD_DELIM)
2396 token->opr.ctx_type = WORD_FIRST;
2397 tree_first = create_token_tree (dfa, NULL, NULL, token);
2398 token->opr.ctx_type = WORD_LAST;
2400 else
2402 token->opr.ctx_type = INSIDE_WORD;
2403 tree_first = create_token_tree (dfa, NULL, NULL, token);
2404 token->opr.ctx_type = INSIDE_NOTWORD;
2406 tree_last = create_token_tree (dfa, NULL, NULL, token);
2407 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2408 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2410 *err = REG_ESPACE;
2411 return NULL;
2414 else
2416 tree = create_token_tree (dfa, NULL, NULL, token);
2417 if (BE (tree == NULL, 0))
2419 *err = REG_ESPACE;
2420 return NULL;
2423 /* We must return here, since ANCHORs can't be followed
2424 by repetition operators.
2425 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2426 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2427 fetch_token (token, regexp, syntax);
2428 return tree;
2429 case OP_PERIOD:
2430 tree = create_token_tree (dfa, NULL, NULL, token);
2431 if (BE (tree == NULL, 0))
2433 *err = REG_ESPACE;
2434 return NULL;
2436 if (dfa->mb_cur_max > 1)
2437 dfa->has_mb_node = 1;
2438 break;
2439 case OP_WORD:
2440 case OP_NOTWORD:
2441 tree = build_charclass_op (dfa, regexp->trans,
2442 "alnum",
2443 "_",
2444 token->type == OP_NOTWORD, err);
2445 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2446 return NULL;
2447 break;
2448 case OP_SPACE:
2449 case OP_NOTSPACE:
2450 tree = build_charclass_op (dfa, regexp->trans,
2451 "space",
2453 token->type == OP_NOTSPACE, err);
2454 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2455 return NULL;
2456 break;
2457 case OP_ALT:
2458 case END_OF_RE:
2459 return NULL;
2460 case BACK_SLASH:
2461 *err = REG_EESCAPE;
2462 return NULL;
2463 default:
2464 /* Must not happen? */
2465 #ifdef DEBUG
2466 assert (0);
2467 #endif
2468 return NULL;
2470 fetch_token (token, regexp, syntax);
2472 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2473 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2475 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2476 syntax, err);
2477 if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2479 if (tree != NULL)
2480 postorder (tree, free_tree, NULL);
2481 return NULL;
2483 tree = dup_tree;
2484 /* In BRE consecutive duplications are not allowed. */
2485 if ((syntax & RE_CONTEXT_INVALID_DUP)
2486 && (token->type == OP_DUP_ASTERISK
2487 || token->type == OP_OPEN_DUP_NUM))
2489 if (tree != NULL)
2490 postorder (tree, free_tree, NULL);
2491 *err = REG_BADRPT;
2492 return NULL;
2496 return tree;
2499 /* This function build the following tree, from regular expression
2500 (<reg_exp>):
2501 SUBEXP
2503 <reg_exp>
2506 static bin_tree_t *
2507 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2508 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2510 re_dfa_t *dfa = preg->buffer;
2511 bin_tree_t *tree;
2512 size_t cur_nsub;
2513 cur_nsub = preg->re_nsub++;
2515 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2517 /* The subexpression may be a null string. */
2518 if (token->type == OP_CLOSE_SUBEXP)
2519 tree = NULL;
2520 else
2522 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2523 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2525 if (tree != NULL)
2526 postorder (tree, free_tree, NULL);
2527 *err = REG_EPAREN;
2529 if (BE (*err != REG_NOERROR, 0))
2530 return NULL;
2533 if (cur_nsub <= '9' - '1')
2534 dfa->completed_bkref_map |= 1 << cur_nsub;
2536 tree = create_tree (dfa, tree, NULL, SUBEXP);
2537 if (BE (tree == NULL, 0))
2539 *err = REG_ESPACE;
2540 return NULL;
2542 tree->token.opr.idx = cur_nsub;
2543 return tree;
2546 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2548 static bin_tree_t *
2549 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2550 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2552 bin_tree_t *tree = NULL, *old_tree = NULL;
2553 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2554 re_token_t start_token = *token;
2556 if (token->type == OP_OPEN_DUP_NUM)
2558 end = 0;
2559 start = fetch_number (regexp, token, syntax);
2560 if (start == REG_MISSING)
2562 if (token->type == CHARACTER && token->opr.c == ',')
2563 start = 0; /* We treat "{,m}" as "{0,m}". */
2564 else
2566 *err = REG_BADBR; /* <re>{} is invalid. */
2567 return NULL;
2570 if (BE (start != REG_ERROR, 1))
2572 /* We treat "{n}" as "{n,n}". */
2573 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2574 : ((token->type == CHARACTER && token->opr.c == ',')
2575 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2577 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2579 /* Invalid sequence. */
2580 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2582 if (token->type == END_OF_RE)
2583 *err = REG_EBRACE;
2584 else
2585 *err = REG_BADBR;
2587 return NULL;
2590 /* If the syntax bit is set, rollback. */
2591 re_string_set_index (regexp, start_idx);
2592 *token = start_token;
2593 token->type = CHARACTER;
2594 /* mb_partial and word_char bits should be already initialized by
2595 peek_token. */
2596 return elem;
2599 if (BE ((end != REG_MISSING && start > end)
2600 || token->type != OP_CLOSE_DUP_NUM, 0))
2602 /* First number greater than second. */
2603 *err = REG_BADBR;
2604 return NULL;
2607 if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2609 *err = REG_ESIZE;
2610 return NULL;
2613 else
2615 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2616 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2619 fetch_token (token, regexp, syntax);
2621 if (BE (elem == NULL, 0))
2622 return NULL;
2623 if (BE (start == 0 && end == 0, 0))
2625 postorder (elem, free_tree, NULL);
2626 return NULL;
2629 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2630 if (BE (start > 0, 0))
2632 tree = elem;
2633 for (i = 2; i <= start; ++i)
2635 elem = duplicate_tree (elem, dfa);
2636 tree = create_tree (dfa, tree, elem, CONCAT);
2637 if (BE (elem == NULL || tree == NULL, 0))
2638 goto parse_dup_op_espace;
2641 if (start == end)
2642 return tree;
2644 /* Duplicate ELEM before it is marked optional. */
2645 elem = duplicate_tree (elem, dfa);
2646 if (BE (elem == NULL, 0))
2647 goto parse_dup_op_espace;
2648 old_tree = tree;
2650 else
2651 old_tree = NULL;
2653 if (elem->token.type == SUBEXP)
2655 uintptr_t subidx = elem->token.opr.idx;
2656 postorder (elem, mark_opt_subexp, (void *) subidx);
2659 tree = create_tree (dfa, elem, NULL,
2660 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2661 if (BE (tree == NULL, 0))
2662 goto parse_dup_op_espace;
2664 /* From gnulib's "intprops.h":
2665 True if the arithmetic type T is signed. */
2666 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2668 /* This loop is actually executed only when end != REG_MISSING,
2669 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2670 already created the start+1-th copy. */
2671 if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2672 for (i = start + 2; i <= end; ++i)
2674 elem = duplicate_tree (elem, dfa);
2675 tree = create_tree (dfa, tree, elem, CONCAT);
2676 if (BE (elem == NULL || tree == NULL, 0))
2677 goto parse_dup_op_espace;
2679 tree = create_tree (dfa, tree, NULL, OP_ALT);
2680 if (BE (tree == NULL, 0))
2681 goto parse_dup_op_espace;
2684 if (old_tree)
2685 tree = create_tree (dfa, old_tree, tree, CONCAT);
2687 return tree;
2689 parse_dup_op_espace:
2690 *err = REG_ESPACE;
2691 return NULL;
2694 /* Size of the names for collating symbol/equivalence_class/character_class.
2695 I'm not sure, but maybe enough. */
2696 #define BRACKET_NAME_BUF_SIZE 32
2698 #ifndef _LIBC
2699 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2700 Build the range expression which starts from START_ELEM, and ends
2701 at END_ELEM. The result are written to MBCSET and SBCSET.
2702 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2703 mbcset->range_ends, is a pointer argument since we may
2704 update it. */
2706 static reg_errcode_t
2707 internal_function
2708 # ifdef RE_ENABLE_I18N
2709 build_range_exp (const reg_syntax_t syntax,
2710 bitset_t sbcset,
2711 re_charset_t *mbcset,
2712 Idx *range_alloc,
2713 const bracket_elem_t *start_elem,
2714 const bracket_elem_t *end_elem)
2715 # else /* not RE_ENABLE_I18N */
2716 build_range_exp (const reg_syntax_t syntax,
2717 bitset_t sbcset,
2718 const bracket_elem_t *start_elem,
2719 const bracket_elem_t *end_elem)
2720 # endif /* not RE_ENABLE_I18N */
2722 unsigned int start_ch, end_ch;
2723 /* Equivalence Classes and Character Classes can't be a range start/end. */
2724 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2725 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2727 return REG_ERANGE;
2729 /* We can handle no multi character collating elements without libc
2730 support. */
2731 if (BE ((start_elem->type == COLL_SYM
2732 && strlen ((char *) start_elem->opr.name) > 1)
2733 || (end_elem->type == COLL_SYM
2734 && strlen ((char *) end_elem->opr.name) > 1), 0))
2735 return REG_ECOLLATE;
2737 # ifdef RE_ENABLE_I18N
2739 wchar_t wc;
2740 wint_t start_wc;
2741 wint_t end_wc;
2743 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2744 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2745 : 0));
2746 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2747 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2748 : 0));
2749 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2750 ? __btowc (start_ch) : start_elem->opr.wch);
2751 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2752 ? __btowc (end_ch) : end_elem->opr.wch);
2753 if (start_wc == WEOF || end_wc == WEOF)
2754 return REG_ECOLLATE;
2755 else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2756 return REG_ERANGE;
2758 /* Got valid collation sequence values, add them as a new entry.
2759 However, for !_LIBC we have no collation elements: if the
2760 character set is single byte, the single byte character set
2761 that we build below suffices. parse_bracket_exp passes
2762 no MBCSET if dfa->mb_cur_max == 1. */
2763 if (mbcset)
2765 /* Check the space of the arrays. */
2766 if (BE (*range_alloc == mbcset->nranges, 0))
2768 /* There is not enough space, need realloc. */
2769 wchar_t *new_array_start, *new_array_end;
2770 Idx new_nranges;
2772 /* +1 in case of mbcset->nranges is 0. */
2773 new_nranges = 2 * mbcset->nranges + 1;
2774 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2775 are NULL if *range_alloc == 0. */
2776 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2777 new_nranges);
2778 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2779 new_nranges);
2781 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2782 return REG_ESPACE;
2784 mbcset->range_starts = new_array_start;
2785 mbcset->range_ends = new_array_end;
2786 *range_alloc = new_nranges;
2789 mbcset->range_starts[mbcset->nranges] = start_wc;
2790 mbcset->range_ends[mbcset->nranges++] = end_wc;
2793 /* Build the table for single byte characters. */
2794 for (wc = 0; wc < SBC_MAX; ++wc)
2796 if (start_wc <= wc && wc <= end_wc)
2797 bitset_set (sbcset, wc);
2800 # else /* not RE_ENABLE_I18N */
2802 unsigned int ch;
2803 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2804 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2805 : 0));
2806 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2807 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2808 : 0));
2809 if (start_ch > end_ch)
2810 return REG_ERANGE;
2811 /* Build the table for single byte characters. */
2812 for (ch = 0; ch < SBC_MAX; ++ch)
2813 if (start_ch <= ch && ch <= end_ch)
2814 bitset_set (sbcset, ch);
2816 # endif /* not RE_ENABLE_I18N */
2817 return REG_NOERROR;
2819 #endif /* not _LIBC */
2821 #ifndef _LIBC
2822 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2823 Build the collating element which is represented by NAME.
2824 The result are written to MBCSET and SBCSET.
2825 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2826 pointer argument since we may update it. */
2828 static reg_errcode_t
2829 internal_function
2830 # ifdef RE_ENABLE_I18N
2831 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2832 Idx *coll_sym_alloc, const unsigned char *name)
2833 # else /* not RE_ENABLE_I18N */
2834 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2835 # endif /* not RE_ENABLE_I18N */
2837 size_t name_len = strlen ((const char *) name);
2838 if (BE (name_len != 1, 0))
2839 return REG_ECOLLATE;
2840 else
2842 bitset_set (sbcset, name[0]);
2843 return REG_NOERROR;
2846 #endif /* not _LIBC */
2848 /* This function parse bracket expression like "[abc]", "[a-c]",
2849 "[[.a-a.]]" etc. */
2851 static bin_tree_t *
2852 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2853 reg_syntax_t syntax, reg_errcode_t *err)
2855 #ifdef _LIBC
2856 const unsigned char *collseqmb;
2857 const char *collseqwc;
2858 uint32_t nrules;
2859 int32_t table_size;
2860 const int32_t *symb_table;
2861 const unsigned char *extra;
2863 /* Local function for parse_bracket_exp used in _LIBC environment.
2864 Seek the collating symbol entry corresponding to NAME.
2865 Return the index of the symbol in the SYMB_TABLE,
2866 or -1 if not found. */
2868 auto inline int32_t
2869 __attribute__ ((always_inline))
2870 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2872 int32_t elem;
2874 for (elem = 0; elem < table_size; elem++)
2875 if (symb_table[2 * elem] != 0)
2877 int32_t idx = symb_table[2 * elem + 1];
2878 /* Skip the name of collating element name. */
2879 idx += 1 + extra[idx];
2880 if (/* Compare the length of the name. */
2881 name_len == extra[idx]
2882 /* Compare the name. */
2883 && memcmp (name, &extra[idx + 1], name_len) == 0)
2884 /* Yep, this is the entry. */
2885 return elem;
2887 return -1;
2890 /* Local function for parse_bracket_exp used in _LIBC environment.
2891 Look up the collation sequence value of BR_ELEM.
2892 Return the value if succeeded, UINT_MAX otherwise. */
2894 auto inline unsigned int
2895 __attribute__ ((always_inline))
2896 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2898 if (br_elem->type == SB_CHAR)
2901 if (MB_CUR_MAX == 1)
2903 if (nrules == 0)
2904 return collseqmb[br_elem->opr.ch];
2905 else
2907 wint_t wc = __btowc (br_elem->opr.ch);
2908 return __collseq_table_lookup (collseqwc, wc);
2911 else if (br_elem->type == MB_CHAR)
2913 if (nrules != 0)
2914 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2916 else if (br_elem->type == COLL_SYM)
2918 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2919 if (nrules != 0)
2921 int32_t elem, idx;
2922 elem = seek_collating_symbol_entry (br_elem->opr.name,
2923 sym_name_len);
2924 if (elem != -1)
2926 /* We found the entry. */
2927 idx = symb_table[2 * elem + 1];
2928 /* Skip the name of collating element name. */
2929 idx += 1 + extra[idx];
2930 /* Skip the byte sequence of the collating element. */
2931 idx += 1 + extra[idx];
2932 /* Adjust for the alignment. */
2933 idx = (idx + 3) & ~3;
2934 /* Skip the multibyte collation sequence value. */
2935 idx += sizeof (unsigned int);
2936 /* Skip the wide char sequence of the collating element. */
2937 idx += sizeof (unsigned int) *
2938 (1 + *(unsigned int *) (extra + idx));
2939 /* Return the collation sequence value. */
2940 return *(unsigned int *) (extra + idx);
2942 else if (sym_name_len == 1)
2944 /* No valid character. Match it as a single byte
2945 character. */
2946 return collseqmb[br_elem->opr.name[0]];
2949 else if (sym_name_len == 1)
2950 return collseqmb[br_elem->opr.name[0]];
2952 return UINT_MAX;
2955 /* Local function for parse_bracket_exp used in _LIBC environment.
2956 Build the range expression which starts from START_ELEM, and ends
2957 at END_ELEM. The result are written to MBCSET and SBCSET.
2958 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2959 mbcset->range_ends, is a pointer argument since we may
2960 update it. */
2962 auto inline reg_errcode_t
2963 __attribute__ ((always_inline))
2964 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2965 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2967 unsigned int ch;
2968 uint32_t start_collseq;
2969 uint32_t end_collseq;
2971 /* Equivalence Classes and Character Classes can't be a range
2972 start/end. */
2973 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2974 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2976 return REG_ERANGE;
2978 /* FIXME: Implement rational ranges here, too. */
2979 start_collseq = lookup_collation_sequence_value (start_elem);
2980 end_collseq = lookup_collation_sequence_value (end_elem);
2981 /* Check start/end collation sequence values. */
2982 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2983 return REG_ECOLLATE;
2984 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2985 return REG_ERANGE;
2987 /* Got valid collation sequence values, add them as a new entry.
2988 However, if we have no collation elements, and the character set
2989 is single byte, the single byte character set that we
2990 build below suffices. */
2991 if (nrules > 0 || dfa->mb_cur_max > 1)
2993 /* Check the space of the arrays. */
2994 if (BE (*range_alloc == mbcset->nranges, 0))
2996 /* There is not enough space, need realloc. */
2997 uint32_t *new_array_start;
2998 uint32_t *new_array_end;
2999 Idx new_nranges;
3001 /* +1 in case of mbcset->nranges is 0. */
3002 new_nranges = 2 * mbcset->nranges + 1;
3003 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
3004 new_nranges);
3005 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
3006 new_nranges);
3008 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3009 return REG_ESPACE;
3011 mbcset->range_starts = new_array_start;
3012 mbcset->range_ends = new_array_end;
3013 *range_alloc = new_nranges;
3016 mbcset->range_starts[mbcset->nranges] = start_collseq;
3017 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3020 /* Build the table for single byte characters. */
3021 for (ch = 0; ch < SBC_MAX; ch++)
3023 uint32_t ch_collseq;
3025 if (MB_CUR_MAX == 1)
3027 if (nrules == 0)
3028 ch_collseq = collseqmb[ch];
3029 else
3030 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3031 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3032 bitset_set (sbcset, ch);
3034 return REG_NOERROR;
3037 /* Local function for parse_bracket_exp used in _LIBC environment.
3038 Build the collating element which is represented by NAME.
3039 The result are written to MBCSET and SBCSET.
3040 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3041 pointer argument since we may update it. */
3043 auto inline reg_errcode_t
3044 __attribute__ ((always_inline))
3045 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3046 Idx *coll_sym_alloc, const unsigned char *name)
3048 int32_t elem, idx;
3049 size_t name_len = strlen ((const char *) name);
3050 if (nrules != 0)
3052 elem = seek_collating_symbol_entry (name, name_len);
3053 if (elem != -1)
3055 /* We found the entry. */
3056 idx = symb_table[2 * elem + 1];
3057 /* Skip the name of collating element name. */
3058 idx += 1 + extra[idx];
3060 else if (name_len == 1)
3062 /* No valid character, treat it as a normal
3063 character. */
3064 bitset_set (sbcset, name[0]);
3065 return REG_NOERROR;
3067 else
3068 return REG_ECOLLATE;
3070 /* Got valid collation sequence, add it as a new entry. */
3071 /* Check the space of the arrays. */
3072 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3074 /* Not enough, realloc it. */
3075 /* +1 in case of mbcset->ncoll_syms is 0. */
3076 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3077 /* Use realloc since mbcset->coll_syms is NULL
3078 if *alloc == 0. */
3079 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3080 new_coll_sym_alloc);
3081 if (BE (new_coll_syms == NULL, 0))
3082 return REG_ESPACE;
3083 mbcset->coll_syms = new_coll_syms;
3084 *coll_sym_alloc = new_coll_sym_alloc;
3086 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3087 return REG_NOERROR;
3089 else
3091 if (BE (name_len != 1, 0))
3092 return REG_ECOLLATE;
3093 else
3095 bitset_set (sbcset, name[0]);
3096 return REG_NOERROR;
3100 #endif
3102 re_token_t br_token;
3103 re_bitset_ptr_t sbcset;
3104 #ifdef RE_ENABLE_I18N
3105 re_charset_t *mbcset;
3106 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3107 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3108 #endif /* not RE_ENABLE_I18N */
3109 bool non_match = false;
3110 bin_tree_t *work_tree;
3111 int token_len;
3112 bool first_round = true;
3113 #ifdef _LIBC
3114 collseqmb = (const unsigned char *)
3115 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3116 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3117 if (nrules)
3120 if (MB_CUR_MAX > 1)
3122 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3123 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3124 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3125 _NL_COLLATE_SYMB_TABLEMB);
3126 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3127 _NL_COLLATE_SYMB_EXTRAMB);
3129 #endif
3130 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3131 #ifdef RE_ENABLE_I18N
3132 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3133 #endif /* RE_ENABLE_I18N */
3134 #ifdef RE_ENABLE_I18N
3135 if (BE (sbcset == NULL || mbcset == NULL, 0))
3136 #else
3137 if (BE (sbcset == NULL, 0))
3138 #endif /* RE_ENABLE_I18N */
3140 re_free (sbcset);
3141 #ifdef RE_ENABLE_I18N
3142 re_free (mbcset);
3143 #endif
3144 *err = REG_ESPACE;
3145 return NULL;
3148 token_len = peek_token_bracket (token, regexp, syntax);
3149 if (BE (token->type == END_OF_RE, 0))
3151 *err = REG_BADPAT;
3152 goto parse_bracket_exp_free_return;
3154 if (token->type == OP_NON_MATCH_LIST)
3156 #ifdef RE_ENABLE_I18N
3157 mbcset->non_match = 1;
3158 #endif /* not RE_ENABLE_I18N */
3159 non_match = true;
3160 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3161 bitset_set (sbcset, '\n');
3162 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3163 token_len = peek_token_bracket (token, regexp, syntax);
3164 if (BE (token->type == END_OF_RE, 0))
3166 *err = REG_BADPAT;
3167 goto parse_bracket_exp_free_return;
3171 /* We treat the first ']' as a normal character. */
3172 if (token->type == OP_CLOSE_BRACKET)
3173 token->type = CHARACTER;
3175 while (1)
3177 bracket_elem_t start_elem, end_elem;
3178 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3179 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3180 reg_errcode_t ret;
3181 int token_len2 = 0;
3182 bool is_range_exp = false;
3183 re_token_t token2;
3185 start_elem.opr.name = start_name_buf;
3186 start_elem.type = COLL_SYM;
3187 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3188 syntax, first_round);
3189 if (BE (ret != REG_NOERROR, 0))
3191 *err = ret;
3192 goto parse_bracket_exp_free_return;
3194 first_round = false;
3196 /* Get information about the next token. We need it in any case. */
3197 token_len = peek_token_bracket (token, regexp, syntax);
3199 /* Do not check for ranges if we know they are not allowed. */
3200 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3202 if (BE (token->type == END_OF_RE, 0))
3204 *err = REG_EBRACK;
3205 goto parse_bracket_exp_free_return;
3207 if (token->type == OP_CHARSET_RANGE)
3209 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3210 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3211 if (BE (token2.type == END_OF_RE, 0))
3213 *err = REG_EBRACK;
3214 goto parse_bracket_exp_free_return;
3216 if (token2.type == OP_CLOSE_BRACKET)
3218 /* We treat the last '-' as a normal character. */
3219 re_string_skip_bytes (regexp, -token_len);
3220 token->type = CHARACTER;
3222 else
3223 is_range_exp = true;
3227 if (is_range_exp == true)
3229 end_elem.opr.name = end_name_buf;
3230 end_elem.type = COLL_SYM;
3231 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3232 dfa, syntax, true);
3233 if (BE (ret != REG_NOERROR, 0))
3235 *err = ret;
3236 goto parse_bracket_exp_free_return;
3239 token_len = peek_token_bracket (token, regexp, syntax);
3241 #ifdef _LIBC
3242 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3243 &start_elem, &end_elem);
3244 #else
3245 # ifdef RE_ENABLE_I18N
3246 *err = build_range_exp (syntax, sbcset,
3247 dfa->mb_cur_max > 1 ? mbcset : NULL,
3248 &range_alloc, &start_elem, &end_elem);
3249 # else
3250 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3251 # endif
3252 #endif /* RE_ENABLE_I18N */
3253 if (BE (*err != REG_NOERROR, 0))
3254 goto parse_bracket_exp_free_return;
3256 else
3258 switch (start_elem.type)
3260 case SB_CHAR:
3261 bitset_set (sbcset, start_elem.opr.ch);
3262 break;
3263 #ifdef RE_ENABLE_I18N
3264 case MB_CHAR:
3265 /* Check whether the array has enough space. */
3266 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3268 wchar_t *new_mbchars;
3269 /* Not enough, realloc it. */
3270 /* +1 in case of mbcset->nmbchars is 0. */
3271 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3272 /* Use realloc since array is NULL if *alloc == 0. */
3273 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3274 mbchar_alloc);
3275 if (BE (new_mbchars == NULL, 0))
3276 goto parse_bracket_exp_espace;
3277 mbcset->mbchars = new_mbchars;
3279 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3280 break;
3281 #endif /* RE_ENABLE_I18N */
3282 case EQUIV_CLASS:
3283 *err = build_equiv_class (sbcset,
3284 #ifdef RE_ENABLE_I18N
3285 mbcset, &equiv_class_alloc,
3286 #endif /* RE_ENABLE_I18N */
3287 start_elem.opr.name);
3288 if (BE (*err != REG_NOERROR, 0))
3289 goto parse_bracket_exp_free_return;
3290 break;
3291 case COLL_SYM:
3292 *err = build_collating_symbol (sbcset,
3293 #ifdef RE_ENABLE_I18N
3294 mbcset, &coll_sym_alloc,
3295 #endif /* RE_ENABLE_I18N */
3296 start_elem.opr.name);
3297 if (BE (*err != REG_NOERROR, 0))
3298 goto parse_bracket_exp_free_return;
3299 break;
3300 case CHAR_CLASS:
3301 *err = build_charclass (regexp->trans, sbcset,
3302 #ifdef RE_ENABLE_I18N
3303 mbcset, &char_class_alloc,
3304 #endif /* RE_ENABLE_I18N */
3305 (const char *) start_elem.opr.name,
3306 syntax);
3307 if (BE (*err != REG_NOERROR, 0))
3308 goto parse_bracket_exp_free_return;
3309 break;
3310 default:
3311 assert (0);
3312 break;
3315 if (BE (token->type == END_OF_RE, 0))
3317 *err = REG_EBRACK;
3318 goto parse_bracket_exp_free_return;
3320 if (token->type == OP_CLOSE_BRACKET)
3321 break;
3324 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3326 /* If it is non-matching list. */
3327 if (non_match)
3328 bitset_not (sbcset);
3330 #ifdef RE_ENABLE_I18N
3331 /* Ensure only single byte characters are set. */
3332 if (dfa->mb_cur_max > 1)
3333 bitset_mask (sbcset, dfa->sb_char);
3335 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3336 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3337 || mbcset->non_match)))
3339 bin_tree_t *mbc_tree;
3340 int sbc_idx;
3341 /* Build a tree for complex bracket. */
3342 dfa->has_mb_node = 1;
3343 br_token.type = COMPLEX_BRACKET;
3344 br_token.opr.mbcset = mbcset;
3345 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3346 if (BE (mbc_tree == NULL, 0))
3347 goto parse_bracket_exp_espace;
3348 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3349 if (sbcset[sbc_idx])
3350 break;
3351 /* If there are no bits set in sbcset, there is no point
3352 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3353 if (sbc_idx < BITSET_WORDS)
3355 /* Build a tree for simple bracket. */
3356 br_token.type = SIMPLE_BRACKET;
3357 br_token.opr.sbcset = sbcset;
3358 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3359 if (BE (work_tree == NULL, 0))
3360 goto parse_bracket_exp_espace;
3362 /* Then join them by ALT node. */
3363 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3364 if (BE (work_tree == NULL, 0))
3365 goto parse_bracket_exp_espace;
3367 else
3369 re_free (sbcset);
3370 work_tree = mbc_tree;
3373 else
3374 #endif /* not RE_ENABLE_I18N */
3376 #ifdef RE_ENABLE_I18N
3377 free_charset (mbcset);
3378 #endif
3379 /* Build a tree for simple bracket. */
3380 br_token.type = SIMPLE_BRACKET;
3381 br_token.opr.sbcset = sbcset;
3382 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3383 if (BE (work_tree == NULL, 0))
3384 goto parse_bracket_exp_espace;
3386 return work_tree;
3388 parse_bracket_exp_espace:
3389 *err = REG_ESPACE;
3390 parse_bracket_exp_free_return:
3391 re_free (sbcset);
3392 #ifdef RE_ENABLE_I18N
3393 free_charset (mbcset);
3394 #endif /* RE_ENABLE_I18N */
3395 return NULL;
3398 /* Parse an element in the bracket expression. */
3400 static reg_errcode_t
3401 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3402 re_token_t *token, int token_len, re_dfa_t *dfa,
3403 reg_syntax_t syntax, bool accept_hyphen)
3405 #ifdef RE_ENABLE_I18N
3406 int cur_char_size;
3407 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3408 if (cur_char_size > 1)
3410 elem->type = MB_CHAR;
3411 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3412 re_string_skip_bytes (regexp, cur_char_size);
3413 return REG_NOERROR;
3415 #endif /* RE_ENABLE_I18N */
3416 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3417 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3418 || token->type == OP_OPEN_EQUIV_CLASS)
3419 return parse_bracket_symbol (elem, regexp, token);
3420 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3422 /* A '-' must only appear as anything but a range indicator before
3423 the closing bracket. Everything else is an error. */
3424 re_token_t token2;
3425 (void) peek_token_bracket (&token2, regexp, syntax);
3426 if (token2.type != OP_CLOSE_BRACKET)
3427 /* The actual error value is not standardized since this whole
3428 case is undefined. But ERANGE makes good sense. */
3429 return REG_ERANGE;
3431 elem->type = SB_CHAR;
3432 elem->opr.ch = token->opr.c;
3433 return REG_NOERROR;
3436 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3437 such as [:<character_class>:], [.<collating_element>.], and
3438 [=<equivalent_class>=]. */
3440 static reg_errcode_t
3441 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3442 re_token_t *token)
3444 unsigned char ch, delim = token->opr.c;
3445 int i = 0;
3446 if (re_string_eoi(regexp))
3447 return REG_EBRACK;
3448 for (;; ++i)
3450 if (i >= BRACKET_NAME_BUF_SIZE)
3451 return REG_EBRACK;
3452 if (token->type == OP_OPEN_CHAR_CLASS)
3453 ch = re_string_fetch_byte_case (regexp);
3454 else
3455 ch = re_string_fetch_byte (regexp);
3456 if (re_string_eoi(regexp))
3457 return REG_EBRACK;
3458 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3459 break;
3460 elem->opr.name[i] = ch;
3462 re_string_skip_bytes (regexp, 1);
3463 elem->opr.name[i] = '\0';
3464 switch (token->type)
3466 case OP_OPEN_COLL_ELEM:
3467 elem->type = COLL_SYM;
3468 break;
3469 case OP_OPEN_EQUIV_CLASS:
3470 elem->type = EQUIV_CLASS;
3471 break;
3472 case OP_OPEN_CHAR_CLASS:
3473 elem->type = CHAR_CLASS;
3474 break;
3475 default:
3476 break;
3478 return REG_NOERROR;
3481 /* Helper function for parse_bracket_exp.
3482 Build the equivalence class which is represented by NAME.
3483 The result are written to MBCSET and SBCSET.
3484 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3485 is a pointer argument since we may update it. */
3487 static reg_errcode_t
3488 #ifdef RE_ENABLE_I18N
3489 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3490 Idx *equiv_class_alloc, const unsigned char *name)
3491 #else /* not RE_ENABLE_I18N */
3492 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3493 #endif /* not RE_ENABLE_I18N */
3495 #ifdef _LIBC
3496 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3497 if (nrules != 0)
3499 const int32_t *table, *indirect;
3500 const unsigned char *weights, *extra, *cp;
3501 unsigned char char_buf[2];
3502 int32_t idx1, idx2;
3503 unsigned int ch;
3504 size_t len;
3505 /* Calculate the index for equivalence class. */
3506 cp = name;
3507 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3508 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3509 _NL_COLLATE_WEIGHTMB);
3510 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3511 _NL_COLLATE_EXTRAMB);
3512 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3513 _NL_COLLATE_INDIRECTMB);
3514 idx1 = findidx (table, indirect, extra, &cp, -1);
3515 if (BE (idx1 == 0 || *cp != '\0', 0))
3516 /* This isn't a valid character. */
3517 return REG_ECOLLATE;
3519 /* Build single byte matching table for this equivalence class. */
3520 len = weights[idx1 & 0xffffff];
3521 for (ch = 0; ch < SBC_MAX; ++ch)
3523 char_buf[0] = ch;
3524 cp = char_buf;
3525 idx2 = findidx (table, indirect, extra, &cp, 1);
3527 idx2 = table[ch];
3529 if (idx2 == 0)
3530 /* This isn't a valid character. */
3531 continue;
3532 /* Compare only if the length matches and the collation rule
3533 index is the same. */
3534 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3536 int cnt = 0;
3538 while (cnt <= len &&
3539 weights[(idx1 & 0xffffff) + 1 + cnt]
3540 == weights[(idx2 & 0xffffff) + 1 + cnt])
3541 ++cnt;
3543 if (cnt > len)
3544 bitset_set (sbcset, ch);
3547 /* Check whether the array has enough space. */
3548 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3550 /* Not enough, realloc it. */
3551 /* +1 in case of mbcset->nequiv_classes is 0. */
3552 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3553 /* Use realloc since the array is NULL if *alloc == 0. */
3554 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3555 int32_t,
3556 new_equiv_class_alloc);
3557 if (BE (new_equiv_classes == NULL, 0))
3558 return REG_ESPACE;
3559 mbcset->equiv_classes = new_equiv_classes;
3560 *equiv_class_alloc = new_equiv_class_alloc;
3562 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3564 else
3565 #endif /* _LIBC */
3567 if (BE (strlen ((const char *) name) != 1, 0))
3568 return REG_ECOLLATE;
3569 bitset_set (sbcset, *name);
3571 return REG_NOERROR;
3574 /* Helper function for parse_bracket_exp.
3575 Build the character class which is represented by NAME.
3576 The result are written to MBCSET and SBCSET.
3577 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3578 is a pointer argument since we may update it. */
3580 static reg_errcode_t
3581 #ifdef RE_ENABLE_I18N
3582 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3583 re_charset_t *mbcset, Idx *char_class_alloc,
3584 const char *class_name, reg_syntax_t syntax)
3585 #else /* not RE_ENABLE_I18N */
3586 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3587 const char *class_name, reg_syntax_t syntax)
3588 #endif /* not RE_ENABLE_I18N */
3590 int i;
3591 const char *name = class_name;
3593 /* In case of REG_ICASE "upper" and "lower" match the both of
3594 upper and lower cases. */
3595 if ((syntax & RE_ICASE)
3596 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3597 name = "alpha";
3599 #ifdef RE_ENABLE_I18N
3600 /* Check the space of the arrays. */
3601 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3603 /* Not enough, realloc it. */
3604 /* +1 in case of mbcset->nchar_classes is 0. */
3605 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3606 /* Use realloc since array is NULL if *alloc == 0. */
3607 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3608 new_char_class_alloc);
3609 if (BE (new_char_classes == NULL, 0))
3610 return REG_ESPACE;
3611 mbcset->char_classes = new_char_classes;
3612 *char_class_alloc = new_char_class_alloc;
3614 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3615 #endif /* RE_ENABLE_I18N */
3617 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3618 do { \
3619 if (BE (trans != NULL, 0)) \
3621 for (i = 0; i < SBC_MAX; ++i) \
3622 if (ctype_func (i)) \
3623 bitset_set (sbcset, trans[i]); \
3625 else \
3627 for (i = 0; i < SBC_MAX; ++i) \
3628 if (ctype_func (i)) \
3629 bitset_set (sbcset, i); \
3631 } while (0)
3633 if (strcmp (name, "alnum") == 0)
3634 BUILD_CHARCLASS_LOOP (isalnum);
3635 else if (strcmp (name, "cntrl") == 0)
3636 BUILD_CHARCLASS_LOOP (iscntrl);
3637 else if (strcmp (name, "lower") == 0)
3638 BUILD_CHARCLASS_LOOP (islower);
3639 else if (strcmp (name, "space") == 0)
3640 BUILD_CHARCLASS_LOOP (isspace);
3641 else if (strcmp (name, "alpha") == 0)
3642 BUILD_CHARCLASS_LOOP (isalpha);
3643 else if (strcmp (name, "digit") == 0)
3644 BUILD_CHARCLASS_LOOP (isdigit);
3645 else if (strcmp (name, "print") == 0)
3646 BUILD_CHARCLASS_LOOP (isprint);
3647 else if (strcmp (name, "upper") == 0)
3648 BUILD_CHARCLASS_LOOP (isupper);
3649 else if (strcmp (name, "blank") == 0)
3650 BUILD_CHARCLASS_LOOP (isblank);
3651 else if (strcmp (name, "graph") == 0)
3652 BUILD_CHARCLASS_LOOP (isgraph);
3653 else if (strcmp (name, "punct") == 0)
3654 BUILD_CHARCLASS_LOOP (ispunct);
3655 else if (strcmp (name, "xdigit") == 0)
3656 BUILD_CHARCLASS_LOOP (isxdigit);
3657 else
3658 return REG_ECTYPE;
3660 return REG_NOERROR;
3663 static bin_tree_t *
3664 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3665 const char *class_name,
3666 const char *extra, bool non_match,
3667 reg_errcode_t *err)
3669 re_bitset_ptr_t sbcset;
3670 #ifdef RE_ENABLE_I18N
3671 re_charset_t *mbcset;
3672 Idx alloc = 0;
3673 #endif /* not RE_ENABLE_I18N */
3674 reg_errcode_t ret;
3675 re_token_t br_token;
3676 bin_tree_t *tree;
3678 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3679 #ifdef RE_ENABLE_I18N
3680 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3681 #endif /* RE_ENABLE_I18N */
3683 #ifdef RE_ENABLE_I18N
3684 if (BE (sbcset == NULL || mbcset == NULL, 0))
3685 #else /* not RE_ENABLE_I18N */
3686 if (BE (sbcset == NULL, 0))
3687 #endif /* not RE_ENABLE_I18N */
3689 *err = REG_ESPACE;
3690 return NULL;
3693 if (non_match)
3695 #ifdef RE_ENABLE_I18N
3696 mbcset->non_match = 1;
3697 #endif /* not RE_ENABLE_I18N */
3700 /* We don't care the syntax in this case. */
3701 ret = build_charclass (trans, sbcset,
3702 #ifdef RE_ENABLE_I18N
3703 mbcset, &alloc,
3704 #endif /* RE_ENABLE_I18N */
3705 class_name, 0);
3707 if (BE (ret != REG_NOERROR, 0))
3709 re_free (sbcset);
3710 #ifdef RE_ENABLE_I18N
3711 free_charset (mbcset);
3712 #endif /* RE_ENABLE_I18N */
3713 *err = ret;
3714 return NULL;
3716 /* \w match '_' also. */
3717 for (; *extra; extra++)
3718 bitset_set (sbcset, *extra);
3720 /* If it is non-matching list. */
3721 if (non_match)
3722 bitset_not (sbcset);
3724 #ifdef RE_ENABLE_I18N
3725 /* Ensure only single byte characters are set. */
3726 if (dfa->mb_cur_max > 1)
3727 bitset_mask (sbcset, dfa->sb_char);
3728 #endif
3730 /* Build a tree for simple bracket. */
3731 br_token.type = SIMPLE_BRACKET;
3732 br_token.opr.sbcset = sbcset;
3733 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3734 if (BE (tree == NULL, 0))
3735 goto build_word_op_espace;
3737 #ifdef RE_ENABLE_I18N
3738 if (dfa->mb_cur_max > 1)
3740 bin_tree_t *mbc_tree;
3741 /* Build a tree for complex bracket. */
3742 br_token.type = COMPLEX_BRACKET;
3743 br_token.opr.mbcset = mbcset;
3744 dfa->has_mb_node = 1;
3745 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3746 if (BE (mbc_tree == NULL, 0))
3747 goto build_word_op_espace;
3748 /* Then join them by ALT node. */
3749 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3750 if (BE (mbc_tree != NULL, 1))
3751 return tree;
3753 else
3755 free_charset (mbcset);
3756 return tree;
3758 #else /* not RE_ENABLE_I18N */
3759 return tree;
3760 #endif /* not RE_ENABLE_I18N */
3762 build_word_op_espace:
3763 re_free (sbcset);
3764 #ifdef RE_ENABLE_I18N
3765 free_charset (mbcset);
3766 #endif /* RE_ENABLE_I18N */
3767 *err = REG_ESPACE;
3768 return NULL;
3771 /* This is intended for the expressions like "a{1,3}".
3772 Fetch a number from 'input', and return the number.
3773 Return REG_MISSING if the number field is empty like "{,1}".
3774 Return RE_DUP_MAX + 1 if the number field is too large.
3775 Return REG_ERROR if an error occurred. */
3777 static Idx
3778 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3780 Idx num = REG_MISSING;
3781 unsigned char c;
3782 while (1)
3784 fetch_token (token, input, syntax);
3785 c = token->opr.c;
3786 if (BE (token->type == END_OF_RE, 0))
3787 return REG_ERROR;
3788 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3789 break;
3790 num = ((token->type != CHARACTER || c < '0' || '9' < c
3791 || num == REG_ERROR)
3792 ? REG_ERROR
3793 : num == REG_MISSING
3794 ? c - '0'
3795 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3797 return num;
3800 #ifdef RE_ENABLE_I18N
3801 static void
3802 free_charset (re_charset_t *cset)
3804 re_free (cset->mbchars);
3805 # ifdef _LIBC
3806 re_free (cset->coll_syms);
3807 re_free (cset->equiv_classes);
3808 re_free (cset->range_starts);
3809 re_free (cset->range_ends);
3810 # endif
3811 re_free (cset->char_classes);
3812 re_free (cset);
3814 #endif /* RE_ENABLE_I18N */
3816 /* Functions for binary tree operation. */
3818 /* Create a tree node. */
3820 static bin_tree_t *
3821 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3822 re_token_type_t type)
3824 re_token_t t;
3825 t.type = type;
3826 return create_token_tree (dfa, left, right, &t);
3829 static bin_tree_t *
3830 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3831 const re_token_t *token)
3833 bin_tree_t *tree;
3834 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3836 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3838 if (storage == NULL)
3839 return NULL;
3840 storage->next = dfa->str_tree_storage;
3841 dfa->str_tree_storage = storage;
3842 dfa->str_tree_storage_idx = 0;
3844 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3846 tree->parent = NULL;
3847 tree->left = left;
3848 tree->right = right;
3849 tree->token = *token;
3850 tree->token.duplicated = 0;
3851 tree->token.opt_subexp = 0;
3852 tree->first = NULL;
3853 tree->next = NULL;
3854 tree->node_idx = REG_MISSING;
3856 if (left != NULL)
3857 left->parent = tree;
3858 if (right != NULL)
3859 right->parent = tree;
3860 return tree;
3863 /* Mark the tree SRC as an optional subexpression.
3864 To be called from preorder or postorder. */
3866 static reg_errcode_t
3867 mark_opt_subexp (void *extra, bin_tree_t *node)
3869 Idx idx = (uintptr_t) extra;
3870 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3871 node->token.opt_subexp = 1;
3873 return REG_NOERROR;
3876 /* Free the allocated memory inside NODE. */
3878 static void
3879 free_token (re_token_t *node)
3881 #ifdef RE_ENABLE_I18N
3882 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3883 free_charset (node->opr.mbcset);
3884 else
3885 #endif /* RE_ENABLE_I18N */
3886 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3887 re_free (node->opr.sbcset);
3890 /* Worker function for tree walking. Free the allocated memory inside NODE
3891 and its children. */
3893 static reg_errcode_t
3894 free_tree (void *extra, bin_tree_t *node)
3896 free_token (&node->token);
3897 return REG_NOERROR;
3901 /* Duplicate the node SRC, and return new node. This is a preorder
3902 visit similar to the one implemented by the generic visitor, but
3903 we need more infrastructure to maintain two parallel trees --- so,
3904 it's easier to duplicate. */
3906 static bin_tree_t *
3907 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3909 const bin_tree_t *node;
3910 bin_tree_t *dup_root;
3911 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3913 for (node = root; ; )
3915 /* Create a new tree and link it back to the current parent. */
3916 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3917 if (*p_new == NULL)
3918 return NULL;
3919 (*p_new)->parent = dup_node;
3920 (*p_new)->token.duplicated = 1;
3921 dup_node = *p_new;
3923 /* Go to the left node, or up and to the right. */
3924 if (node->left)
3926 node = node->left;
3927 p_new = &dup_node->left;
3929 else
3931 const bin_tree_t *prev = NULL;
3932 while (node->right == prev || node->right == NULL)
3934 prev = node;
3935 node = node->parent;
3936 dup_node = dup_node->parent;
3937 if (!node)
3938 return dup_root;
3940 node = node->right;
3941 p_new = &dup_node->right;