benchtests: Allow selecting baseline for compare_string.py
[glibc.git] / posix / regcomp.c
blobb724ee3389279db15124925e6ee1f932b2a05b80
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 #include <stdint.h>
22 #ifdef _LIBC
23 # include <locale/weight.h>
24 #endif
26 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
27 size_t length, reg_syntax_t syntax);
28 static void re_compile_fastmap_iter (regex_t *bufp,
29 const re_dfastate_t *init_state,
30 char *fastmap);
31 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
32 #ifdef RE_ENABLE_I18N
33 static void free_charset (re_charset_t *cset);
34 #endif /* RE_ENABLE_I18N */
35 static void free_workarea_compile (regex_t *preg);
36 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
37 #ifdef RE_ENABLE_I18N
38 static void optimize_utf8 (re_dfa_t *dfa);
39 #endif
40 static reg_errcode_t analyze (regex_t *preg);
41 static reg_errcode_t preorder (bin_tree_t *root,
42 reg_errcode_t (fn (void *, bin_tree_t *)),
43 void *extra);
44 static reg_errcode_t postorder (bin_tree_t *root,
45 reg_errcode_t (fn (void *, bin_tree_t *)),
46 void *extra);
47 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
48 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
49 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
50 bin_tree_t *node);
51 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
52 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
53 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
54 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
55 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
56 unsigned int constraint);
57 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
58 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
59 int node, int root);
60 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
61 static int fetch_number (re_string_t *input, re_token_t *token,
62 reg_syntax_t syntax);
63 static int peek_token (re_token_t *token, re_string_t *input,
64 reg_syntax_t syntax) internal_function;
65 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
66 reg_syntax_t syntax, reg_errcode_t *err);
67 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
74 re_token_t *token, reg_syntax_t syntax,
75 int nest, reg_errcode_t *err);
76 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
77 re_token_t *token, reg_syntax_t syntax,
78 int nest, reg_errcode_t *err);
79 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
80 re_dfa_t *dfa, re_token_t *token,
81 reg_syntax_t syntax, reg_errcode_t *err);
82 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
83 re_token_t *token, reg_syntax_t syntax,
84 reg_errcode_t *err);
85 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
86 re_string_t *regexp,
87 re_token_t *token, int token_len,
88 re_dfa_t *dfa,
89 reg_syntax_t syntax,
90 int accept_hyphen);
91 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
92 re_string_t *regexp,
93 re_token_t *token);
94 #ifdef RE_ENABLE_I18N
95 static reg_errcode_t build_equiv_class (bitset_t sbcset,
96 re_charset_t *mbcset,
97 int *equiv_class_alloc,
98 const unsigned char *name);
99 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
100 bitset_t sbcset,
101 re_charset_t *mbcset,
102 int *char_class_alloc,
103 const unsigned char *class_name,
104 reg_syntax_t syntax);
105 #else /* not RE_ENABLE_I18N */
106 static reg_errcode_t build_equiv_class (bitset_t sbcset,
107 const unsigned char *name);
108 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
109 bitset_t sbcset,
110 const unsigned char *class_name,
111 reg_syntax_t syntax);
112 #endif /* not RE_ENABLE_I18N */
113 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
114 RE_TRANSLATE_TYPE trans,
115 const unsigned char *class_name,
116 const unsigned char *extra,
117 int non_match, reg_errcode_t *err);
118 static bin_tree_t *create_tree (re_dfa_t *dfa,
119 bin_tree_t *left, bin_tree_t *right,
120 re_token_type_t type);
121 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
122 bin_tree_t *left, bin_tree_t *right,
123 const re_token_t *token);
124 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
125 static void free_token (re_token_t *node);
126 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
127 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
129 /* This table gives an error message for each of the error codes listed
130 in regex.h. Obviously the order here has to be same as there.
131 POSIX doesn't require that we do anything for REG_NOERROR,
132 but why not be nice? */
134 const char __re_error_msgid[] attribute_hidden =
136 #define REG_NOERROR_IDX 0
137 gettext_noop ("Success") /* REG_NOERROR */
138 "\0"
139 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
140 gettext_noop ("No match") /* REG_NOMATCH */
141 "\0"
142 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
143 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
144 "\0"
145 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
146 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
147 "\0"
148 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
149 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
150 "\0"
151 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
152 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
153 "\0"
154 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
155 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
156 "\0"
157 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
158 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
159 "\0"
160 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
161 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
162 "\0"
163 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
164 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
165 "\0"
166 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
167 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
168 "\0"
169 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
170 gettext_noop ("Invalid range end") /* REG_ERANGE */
171 "\0"
172 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
173 gettext_noop ("Memory exhausted") /* REG_ESPACE */
174 "\0"
175 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
176 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
177 "\0"
178 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
179 gettext_noop ("Premature end of regular expression") /* REG_EEND */
180 "\0"
181 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
182 gettext_noop ("Regular expression too big") /* REG_ESIZE */
183 "\0"
184 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
185 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
188 const size_t __re_error_msgid_idx[] attribute_hidden =
190 REG_NOERROR_IDX,
191 REG_NOMATCH_IDX,
192 REG_BADPAT_IDX,
193 REG_ECOLLATE_IDX,
194 REG_ECTYPE_IDX,
195 REG_EESCAPE_IDX,
196 REG_ESUBREG_IDX,
197 REG_EBRACK_IDX,
198 REG_EPAREN_IDX,
199 REG_EBRACE_IDX,
200 REG_BADBR_IDX,
201 REG_ERANGE_IDX,
202 REG_ESPACE_IDX,
203 REG_BADRPT_IDX,
204 REG_EEND_IDX,
205 REG_ESIZE_IDX,
206 REG_ERPAREN_IDX
209 /* Entry points for GNU code. */
211 /* re_compile_pattern is the GNU regular expression compiler: it
212 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
213 Returns 0 if the pattern was valid, otherwise an error string.
215 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
216 are set in BUFP on entry. */
218 const char *
219 re_compile_pattern (const char *pattern, size_t length,
220 struct re_pattern_buffer *bufp)
222 reg_errcode_t ret;
224 /* And GNU code determines whether or not to get register information
225 by passing null for the REGS argument to re_match, etc., not by
226 setting no_sub, unless RE_NO_SUB is set. */
227 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
229 /* Match anchors at newline. */
230 bufp->newline_anchor = 1;
232 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
234 if (!ret)
235 return NULL;
236 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
238 #ifdef _LIBC
239 weak_alias (__re_compile_pattern, re_compile_pattern)
240 #endif
242 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
243 also be assigned to arbitrarily: each pattern buffer stores its own
244 syntax, so it can be changed between regex compilations. */
245 /* This has no initializer because initialized variables in Emacs
246 become read-only after dumping. */
247 reg_syntax_t re_syntax_options;
250 /* Specify the precise syntax of regexps for compilation. This provides
251 for compatibility for various utilities which historically have
252 different, incompatible syntaxes.
254 The argument SYNTAX is a bit mask comprised of the various bits
255 defined in regex.h. We return the old syntax. */
257 reg_syntax_t
258 re_set_syntax (reg_syntax_t syntax)
260 reg_syntax_t ret = re_syntax_options;
262 re_syntax_options = syntax;
263 return ret;
265 #ifdef _LIBC
266 weak_alias (__re_set_syntax, re_set_syntax)
267 #endif
270 re_compile_fastmap (struct re_pattern_buffer *bufp)
272 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
273 char *fastmap = bufp->fastmap;
275 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
276 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
277 if (dfa->init_state != dfa->init_state_word)
278 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
279 if (dfa->init_state != dfa->init_state_nl)
280 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
281 if (dfa->init_state != dfa->init_state_begbuf)
282 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
283 bufp->fastmap_accurate = 1;
284 return 0;
286 #ifdef _LIBC
287 weak_alias (__re_compile_fastmap, re_compile_fastmap)
288 #endif
290 static inline void
291 __attribute__ ((always_inline))
292 re_set_fastmap (char *fastmap, bool icase, int ch)
294 fastmap[ch] = 1;
295 if (icase)
296 fastmap[tolower (ch)] = 1;
299 /* Helper function for re_compile_fastmap.
300 Compile fastmap for the initial_state INIT_STATE. */
302 static void
303 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
304 char *fastmap)
306 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
307 int node_cnt;
308 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
309 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
311 int node = init_state->nodes.elems[node_cnt];
312 re_token_type_t type = dfa->nodes[node].type;
314 if (type == CHARACTER)
316 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
317 #ifdef RE_ENABLE_I18N
318 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
320 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
321 wchar_t wc;
322 mbstate_t state;
324 p = buf;
325 *p++ = dfa->nodes[node].opr.c;
326 while (++node < dfa->nodes_len
327 && dfa->nodes[node].type == CHARACTER
328 && dfa->nodes[node].mb_partial)
329 *p++ = dfa->nodes[node].opr.c;
330 memset (&state, '\0', sizeof (state));
331 if (__mbrtowc (&wc, (const char *) buf, p - buf,
332 &state) == p - buf
333 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
334 != (size_t) -1))
335 re_set_fastmap (fastmap, 0, buf[0]);
337 #endif
339 else if (type == SIMPLE_BRACKET)
341 int i, ch;
342 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
344 int j;
345 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
346 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
347 if (w & ((bitset_word_t) 1 << j))
348 re_set_fastmap (fastmap, icase, ch);
351 #ifdef RE_ENABLE_I18N
352 else if (type == COMPLEX_BRACKET)
354 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
355 int i;
357 # ifdef _LIBC
358 /* See if we have to try all bytes which start multiple collation
359 elements.
360 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
361 collation element, and don't catch 'b' since 'b' is
362 the only collation element which starts from 'b' (and
363 it is caught by SIMPLE_BRACKET). */
364 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
365 && (cset->ncoll_syms || cset->nranges))
367 const int32_t *table = (const int32_t *)
368 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
369 for (i = 0; i < SBC_MAX; ++i)
370 if (table[i] < 0)
371 re_set_fastmap (fastmap, icase, i);
373 # endif /* _LIBC */
375 /* See if we have to start the match at all multibyte characters,
376 i.e. where we would not find an invalid sequence. This only
377 applies to multibyte character sets; for single byte character
378 sets, the SIMPLE_BRACKET again suffices. */
379 if (dfa->mb_cur_max > 1
380 && (cset->nchar_classes || cset->non_match || cset->nranges
381 # ifdef _LIBC
382 || cset->nequiv_classes
383 # endif /* _LIBC */
386 unsigned char c = 0;
389 mbstate_t mbs;
390 memset (&mbs, 0, sizeof (mbs));
391 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
392 re_set_fastmap (fastmap, false, (int) c);
394 while (++c != 0);
397 else
399 /* ... Else catch all bytes which can start the mbchars. */
400 for (i = 0; i < cset->nmbchars; ++i)
402 char buf[256];
403 mbstate_t state;
404 memset (&state, '\0', sizeof (state));
405 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
406 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
407 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
409 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
410 != (size_t) -1)
411 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
416 #endif /* RE_ENABLE_I18N */
417 else if (type == OP_PERIOD
418 #ifdef RE_ENABLE_I18N
419 || type == OP_UTF8_PERIOD
420 #endif /* RE_ENABLE_I18N */
421 || type == END_OF_RE)
423 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
424 if (type == END_OF_RE)
425 bufp->can_be_null = 1;
426 return;
431 /* Entry point for POSIX code. */
432 /* regcomp takes a regular expression as a string and compiles it.
434 PREG is a regex_t *. We do not expect any fields to be initialized,
435 since POSIX says we shouldn't. Thus, we set
437 'buffer' to the compiled pattern;
438 'used' to the length of the compiled pattern;
439 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
440 REG_EXTENDED bit in CFLAGS is set; otherwise, to
441 RE_SYNTAX_POSIX_BASIC;
442 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
443 'fastmap' to an allocated space for the fastmap;
444 'fastmap_accurate' to zero;
445 're_nsub' to the number of subexpressions in PATTERN.
447 PATTERN is the address of the pattern string.
449 CFLAGS is a series of bits which affect compilation.
451 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
452 use POSIX basic syntax.
454 If REG_NEWLINE is set, then . and [^...] don't match newline.
455 Also, regexec will try a match beginning after every newline.
457 If REG_ICASE is set, then we considers upper- and lowercase
458 versions of letters to be equivalent when matching.
460 If REG_NOSUB is set, then when PREG is passed to regexec, that
461 routine will report only success or failure, and nothing about the
462 registers.
464 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
465 the return codes and their meanings.) */
468 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
470 reg_errcode_t ret;
471 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
472 : RE_SYNTAX_POSIX_BASIC);
474 preg->buffer = NULL;
475 preg->allocated = 0;
476 preg->used = 0;
478 /* Try to allocate space for the fastmap. */
479 preg->fastmap = re_malloc (char, SBC_MAX);
480 if (BE (preg->fastmap == NULL, 0))
481 return REG_ESPACE;
483 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
485 /* If REG_NEWLINE is set, newlines are treated differently. */
486 if (cflags & REG_NEWLINE)
487 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
488 syntax &= ~RE_DOT_NEWLINE;
489 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
490 /* It also changes the matching behavior. */
491 preg->newline_anchor = 1;
493 else
494 preg->newline_anchor = 0;
495 preg->no_sub = !!(cflags & REG_NOSUB);
496 preg->translate = NULL;
498 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
500 /* POSIX doesn't distinguish between an unmatched open-group and an
501 unmatched close-group: both are REG_EPAREN. */
502 if (ret == REG_ERPAREN)
503 ret = REG_EPAREN;
505 /* We have already checked preg->fastmap != NULL. */
506 if (BE (ret == REG_NOERROR, 1))
507 /* Compute the fastmap now, since regexec cannot modify the pattern
508 buffer. This function never fails in this implementation. */
509 (void) re_compile_fastmap (preg);
510 else
512 /* Some error occurred while compiling the expression. */
513 re_free (preg->fastmap);
514 preg->fastmap = NULL;
517 return (int) ret;
519 #ifdef _LIBC
520 weak_alias (__regcomp, regcomp)
521 #endif
523 /* Returns a message corresponding to an error code, ERRCODE, returned
524 from either regcomp or regexec. We don't use PREG here. */
526 size_t
527 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
528 size_t errbuf_size)
530 const char *msg;
531 size_t msg_size;
533 if (BE (errcode < 0
534 || errcode >= (int) (sizeof (__re_error_msgid_idx)
535 / sizeof (__re_error_msgid_idx[0])), 0))
536 /* Only error codes returned by the rest of the code should be passed
537 to this routine. If we are given anything else, or if other regex
538 code generates an invalid error code, then the program has a bug.
539 Dump core so we can fix it. */
540 abort ();
542 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
544 msg_size = strlen (msg) + 1; /* Includes the null. */
546 if (BE (errbuf_size != 0, 1))
548 if (BE (msg_size > errbuf_size, 0))
550 #if defined HAVE_MEMPCPY || defined _LIBC
551 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
552 #else
553 memcpy (errbuf, msg, errbuf_size - 1);
554 errbuf[errbuf_size - 1] = 0;
555 #endif
557 else
558 memcpy (errbuf, msg, msg_size);
561 return msg_size;
563 #ifdef _LIBC
564 weak_alias (__regerror, regerror)
565 #endif
568 #ifdef RE_ENABLE_I18N
569 /* This static array is used for the map to single-byte characters when
570 UTF-8 is used. Otherwise we would allocate memory just to initialize
571 it the same all the time. UTF-8 is the preferred encoding so this is
572 a worthwhile optimization. */
573 static const bitset_t utf8_sb_map =
575 /* Set the first 128 bits. */
576 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
578 #endif
581 static void
582 free_dfa_content (re_dfa_t *dfa)
584 int i, j;
586 if (dfa->nodes)
587 for (i = 0; i < dfa->nodes_len; ++i)
588 free_token (dfa->nodes + i);
589 re_free (dfa->nexts);
590 for (i = 0; i < dfa->nodes_len; ++i)
592 if (dfa->eclosures != NULL)
593 re_node_set_free (dfa->eclosures + i);
594 if (dfa->inveclosures != NULL)
595 re_node_set_free (dfa->inveclosures + i);
596 if (dfa->edests != NULL)
597 re_node_set_free (dfa->edests + i);
599 re_free (dfa->edests);
600 re_free (dfa->eclosures);
601 re_free (dfa->inveclosures);
602 re_free (dfa->nodes);
604 if (dfa->state_table)
605 for (i = 0; i <= dfa->state_hash_mask; ++i)
607 struct re_state_table_entry *entry = dfa->state_table + i;
608 for (j = 0; j < entry->num; ++j)
610 re_dfastate_t *state = entry->array[j];
611 free_state (state);
613 re_free (entry->array);
615 re_free (dfa->state_table);
616 #ifdef RE_ENABLE_I18N
617 if (dfa->sb_char != utf8_sb_map)
618 re_free (dfa->sb_char);
619 #endif
620 re_free (dfa->subexp_map);
621 #ifdef DEBUG
622 re_free (dfa->re_str);
623 #endif
625 re_free (dfa);
629 /* Free dynamically allocated space used by PREG. */
631 void
632 regfree (regex_t *preg)
634 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
635 if (BE (dfa != NULL, 1))
636 free_dfa_content (dfa);
637 preg->buffer = NULL;
638 preg->allocated = 0;
640 re_free (preg->fastmap);
641 preg->fastmap = NULL;
643 re_free (preg->translate);
644 preg->translate = NULL;
646 #ifdef _LIBC
647 weak_alias (__regfree, regfree)
648 #endif
650 /* Entry points compatible with 4.2 BSD regex library. We don't define
651 them unless specifically requested. */
653 #if defined _REGEX_RE_COMP || defined _LIBC
655 /* BSD has one and only one pattern buffer. */
656 static struct re_pattern_buffer re_comp_buf;
658 char *
659 # ifdef _LIBC
660 /* Make these definitions weak in libc, so POSIX programs can redefine
661 these names if they don't use our functions, and still use
662 regcomp/regexec above without link errors. */
663 weak_function
664 # endif
665 re_comp (const char *s)
667 reg_errcode_t ret;
668 char *fastmap;
670 if (!s)
672 if (!re_comp_buf.buffer)
673 return gettext ("No previous regular expression");
674 return 0;
677 if (re_comp_buf.buffer)
679 fastmap = re_comp_buf.fastmap;
680 re_comp_buf.fastmap = NULL;
681 __regfree (&re_comp_buf);
682 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
683 re_comp_buf.fastmap = fastmap;
686 if (re_comp_buf.fastmap == NULL)
688 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
689 if (re_comp_buf.fastmap == NULL)
690 return (char *) gettext (__re_error_msgid
691 + __re_error_msgid_idx[(int) REG_ESPACE]);
694 /* Since 're_exec' always passes NULL for the 'regs' argument, we
695 don't need to initialize the pattern buffer fields which affect it. */
697 /* Match anchors at newlines. */
698 re_comp_buf.newline_anchor = 1;
700 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
702 if (!ret)
703 return NULL;
705 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
706 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
709 #ifdef _LIBC
710 libc_freeres_fn (free_mem)
712 __regfree (&re_comp_buf);
714 #endif
716 #endif /* _REGEX_RE_COMP */
718 /* Internal entry point.
719 Compile the regular expression PATTERN, whose length is LENGTH.
720 SYNTAX indicate regular expression's syntax. */
722 static reg_errcode_t
723 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
724 reg_syntax_t syntax)
726 reg_errcode_t err = REG_NOERROR;
727 re_dfa_t *dfa;
728 re_string_t regexp;
730 /* Initialize the pattern buffer. */
731 preg->fastmap_accurate = 0;
732 preg->syntax = syntax;
733 preg->not_bol = preg->not_eol = 0;
734 preg->used = 0;
735 preg->re_nsub = 0;
736 preg->can_be_null = 0;
737 preg->regs_allocated = REGS_UNALLOCATED;
739 /* Initialize the dfa. */
740 dfa = (re_dfa_t *) preg->buffer;
741 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
743 /* If zero allocated, but buffer is non-null, try to realloc
744 enough space. This loses if buffer's address is bogus, but
745 that is the user's responsibility. If ->buffer is NULL this
746 is a simple allocation. */
747 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
748 if (dfa == NULL)
749 return REG_ESPACE;
750 preg->allocated = sizeof (re_dfa_t);
751 preg->buffer = (unsigned char *) dfa;
753 preg->used = sizeof (re_dfa_t);
755 err = init_dfa (dfa, length);
756 if (BE (err != REG_NOERROR, 0))
758 free_dfa_content (dfa);
759 preg->buffer = NULL;
760 preg->allocated = 0;
761 return err;
763 #ifdef DEBUG
764 /* Note: length+1 will not overflow since it is checked in init_dfa. */
765 dfa->re_str = re_malloc (char, length + 1);
766 strncpy (dfa->re_str, pattern, length + 1);
767 #endif
769 __libc_lock_init (dfa->lock);
771 err = re_string_construct (&regexp, pattern, length, preg->translate,
772 syntax & RE_ICASE, dfa);
773 if (BE (err != REG_NOERROR, 0))
775 re_compile_internal_free_return:
776 free_workarea_compile (preg);
777 re_string_destruct (&regexp);
778 free_dfa_content (dfa);
779 preg->buffer = NULL;
780 preg->allocated = 0;
781 return err;
784 /* Parse the regular expression, and build a structure tree. */
785 preg->re_nsub = 0;
786 dfa->str_tree = parse (&regexp, preg, syntax, &err);
787 if (BE (dfa->str_tree == NULL, 0))
788 goto re_compile_internal_free_return;
790 /* Analyze the tree and create the nfa. */
791 err = analyze (preg);
792 if (BE (err != REG_NOERROR, 0))
793 goto re_compile_internal_free_return;
795 #ifdef RE_ENABLE_I18N
796 /* If possible, do searching in single byte encoding to speed things up. */
797 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
798 optimize_utf8 (dfa);
799 #endif
801 /* Then create the initial state of the dfa. */
802 err = create_initial_state (dfa);
804 /* Release work areas. */
805 free_workarea_compile (preg);
806 re_string_destruct (&regexp);
808 if (BE (err != REG_NOERROR, 0))
810 free_dfa_content (dfa);
811 preg->buffer = NULL;
812 preg->allocated = 0;
815 return err;
818 /* Initialize DFA. We use the length of the regular expression PAT_LEN
819 as the initial length of some arrays. */
821 static reg_errcode_t
822 init_dfa (re_dfa_t *dfa, size_t pat_len)
824 unsigned int table_size;
825 #ifndef _LIBC
826 char *codeset_name;
827 #endif
829 memset (dfa, '\0', sizeof (re_dfa_t));
831 /* Force allocation of str_tree_storage the first time. */
832 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
834 /* Avoid overflows. */
835 if (pat_len == SIZE_MAX)
836 return REG_ESPACE;
838 dfa->nodes_alloc = pat_len + 1;
839 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
841 /* table_size = 2 ^ ceil(log pat_len) */
842 for (table_size = 1; ; table_size <<= 1)
843 if (table_size > pat_len)
844 break;
846 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
847 dfa->state_hash_mask = table_size - 1;
849 dfa->mb_cur_max = MB_CUR_MAX;
850 #ifdef _LIBC
851 if (dfa->mb_cur_max == 6
852 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
853 dfa->is_utf8 = 1;
854 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
855 != 0);
856 #else
857 # ifdef HAVE_LANGINFO_CODESET
858 codeset_name = nl_langinfo (CODESET);
859 # else
860 codeset_name = getenv ("LC_ALL");
861 if (codeset_name == NULL || codeset_name[0] == '\0')
862 codeset_name = getenv ("LC_CTYPE");
863 if (codeset_name == NULL || codeset_name[0] == '\0')
864 codeset_name = getenv ("LANG");
865 if (codeset_name == NULL)
866 codeset_name = "";
867 else if (strchr (codeset_name, '.') != NULL)
868 codeset_name = strchr (codeset_name, '.') + 1;
869 # endif
871 if (strcasecmp (codeset_name, "UTF-8") == 0
872 || strcasecmp (codeset_name, "UTF8") == 0)
873 dfa->is_utf8 = 1;
875 /* We check exhaustively in the loop below if this charset is a
876 superset of ASCII. */
877 dfa->map_notascii = 0;
878 #endif
880 #ifdef RE_ENABLE_I18N
881 if (dfa->mb_cur_max > 1)
883 if (dfa->is_utf8)
884 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
885 else
887 int i, j, ch;
889 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
890 if (BE (dfa->sb_char == NULL, 0))
891 return REG_ESPACE;
893 /* Set the bits corresponding to single byte chars. */
894 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
895 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
897 wint_t wch = __btowc (ch);
898 if (wch != WEOF)
899 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
900 # ifndef _LIBC
901 if (isascii (ch) && wch != ch)
902 dfa->map_notascii = 1;
903 # endif
907 #endif
909 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
910 return REG_ESPACE;
911 return REG_NOERROR;
914 /* Initialize WORD_CHAR table, which indicate which character is
915 "word". In this case "word" means that it is the word construction
916 character used by some operators like "\<", "\>", etc. */
918 static void
919 internal_function
920 init_word_char (re_dfa_t *dfa)
922 dfa->word_ops_used = 1;
923 int i = 0;
924 int ch = 0;
925 if (BE (dfa->map_notascii == 0, 1))
927 if (sizeof (dfa->word_char[0]) == 8)
929 /* The extra temporaries here avoid "implicitly truncated"
930 warnings in the case when this is dead code, i.e. 32-bit. */
931 const uint64_t wc0 = UINT64_C (0x03ff000000000000);
932 const uint64_t wc1 = UINT64_C (0x07fffffe87fffffe);
933 dfa->word_char[0] = wc0;
934 dfa->word_char[1] = wc1;
935 i = 2;
937 else if (sizeof (dfa->word_char[0]) == 4)
939 dfa->word_char[0] = UINT32_C (0x00000000);
940 dfa->word_char[1] = UINT32_C (0x03ff0000);
941 dfa->word_char[2] = UINT32_C (0x87fffffe);
942 dfa->word_char[3] = UINT32_C (0x07fffffe);
943 i = 4;
945 else
946 abort ();
947 ch = 128;
949 if (BE (dfa->is_utf8, 1))
951 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
952 return;
956 for (; i < BITSET_WORDS; ++i)
957 for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
958 if (isalnum (ch) || ch == '_')
959 dfa->word_char[i] |= (bitset_word_t) 1 << j;
962 /* Free the work area which are only used while compiling. */
964 static void
965 free_workarea_compile (regex_t *preg)
967 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
968 bin_tree_storage_t *storage, *next;
969 for (storage = dfa->str_tree_storage; storage; storage = next)
971 next = storage->next;
972 re_free (storage);
974 dfa->str_tree_storage = NULL;
975 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
976 dfa->str_tree = NULL;
977 re_free (dfa->org_indices);
978 dfa->org_indices = NULL;
981 /* Create initial states for all contexts. */
983 static reg_errcode_t
984 create_initial_state (re_dfa_t *dfa)
986 int first, i;
987 reg_errcode_t err;
988 re_node_set init_nodes;
990 /* Initial states have the epsilon closure of the node which is
991 the first node of the regular expression. */
992 first = dfa->str_tree->first->node_idx;
993 dfa->init_node = first;
994 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
995 if (BE (err != REG_NOERROR, 0))
996 return err;
998 /* The back-references which are in initial states can epsilon transit,
999 since in this case all of the subexpressions can be null.
1000 Then we add epsilon closures of the nodes which are the next nodes of
1001 the back-references. */
1002 if (dfa->nbackref > 0)
1003 for (i = 0; i < init_nodes.nelem; ++i)
1005 int node_idx = init_nodes.elems[i];
1006 re_token_type_t type = dfa->nodes[node_idx].type;
1008 int clexp_idx;
1009 if (type != OP_BACK_REF)
1010 continue;
1011 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1013 re_token_t *clexp_node;
1014 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1015 if (clexp_node->type == OP_CLOSE_SUBEXP
1016 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1017 break;
1019 if (clexp_idx == init_nodes.nelem)
1020 continue;
1022 if (type == OP_BACK_REF)
1024 int dest_idx = dfa->edests[node_idx].elems[0];
1025 if (!re_node_set_contains (&init_nodes, dest_idx))
1027 reg_errcode_t err = re_node_set_merge (&init_nodes,
1028 dfa->eclosures
1029 + dest_idx);
1030 if (err != REG_NOERROR)
1031 return err;
1032 i = 0;
1037 /* It must be the first time to invoke acquire_state. */
1038 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1039 /* We don't check ERR here, since the initial state must not be NULL. */
1040 if (BE (dfa->init_state == NULL, 0))
1041 return err;
1042 if (dfa->init_state->has_constraint)
1044 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1045 CONTEXT_WORD);
1046 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1047 CONTEXT_NEWLINE);
1048 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1049 &init_nodes,
1050 CONTEXT_NEWLINE
1051 | CONTEXT_BEGBUF);
1052 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1053 || dfa->init_state_begbuf == NULL, 0))
1054 return err;
1056 else
1057 dfa->init_state_word = dfa->init_state_nl
1058 = dfa->init_state_begbuf = dfa->init_state;
1060 re_node_set_free (&init_nodes);
1061 return REG_NOERROR;
1064 #ifdef RE_ENABLE_I18N
1065 /* If it is possible to do searching in single byte encoding instead of UTF-8
1066 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1067 DFA nodes where needed. */
1069 static void
1070 optimize_utf8 (re_dfa_t *dfa)
1072 int node, i, mb_chars = 0, has_period = 0;
1074 for (node = 0; node < dfa->nodes_len; ++node)
1075 switch (dfa->nodes[node].type)
1077 case CHARACTER:
1078 if (dfa->nodes[node].opr.c >= 0x80)
1079 mb_chars = 1;
1080 break;
1081 case ANCHOR:
1082 switch (dfa->nodes[node].opr.ctx_type)
1084 case LINE_FIRST:
1085 case LINE_LAST:
1086 case BUF_FIRST:
1087 case BUF_LAST:
1088 break;
1089 default:
1090 /* Word anchors etc. cannot be handled. It's okay to test
1091 opr.ctx_type since constraints (for all DFA nodes) are
1092 created by ORing one or more opr.ctx_type values. */
1093 return;
1095 break;
1096 case OP_PERIOD:
1097 has_period = 1;
1098 break;
1099 case OP_BACK_REF:
1100 case OP_ALT:
1101 case END_OF_RE:
1102 case OP_DUP_ASTERISK:
1103 case OP_OPEN_SUBEXP:
1104 case OP_CLOSE_SUBEXP:
1105 break;
1106 case COMPLEX_BRACKET:
1107 return;
1108 case SIMPLE_BRACKET:
1109 /* Just double check. The non-ASCII range starts at 0x80. */
1110 assert (0x80 % BITSET_WORD_BITS == 0);
1111 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1112 if (dfa->nodes[node].opr.sbcset[i])
1113 return;
1114 break;
1115 default:
1116 abort ();
1119 if (mb_chars || has_period)
1120 for (node = 0; node < dfa->nodes_len; ++node)
1122 if (dfa->nodes[node].type == CHARACTER
1123 && dfa->nodes[node].opr.c >= 0x80)
1124 dfa->nodes[node].mb_partial = 0;
1125 else if (dfa->nodes[node].type == OP_PERIOD)
1126 dfa->nodes[node].type = OP_UTF8_PERIOD;
1129 /* The search can be in single byte locale. */
1130 dfa->mb_cur_max = 1;
1131 dfa->is_utf8 = 0;
1132 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1134 #endif
1136 /* Analyze the structure tree, and calculate "first", "next", "edest",
1137 "eclosure", and "inveclosure". */
1139 static reg_errcode_t
1140 analyze (regex_t *preg)
1142 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1143 reg_errcode_t ret;
1145 /* Allocate arrays. */
1146 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1147 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1148 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1149 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1150 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1151 || dfa->eclosures == NULL, 0))
1152 return REG_ESPACE;
1154 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1155 if (dfa->subexp_map != NULL)
1157 int i;
1158 for (i = 0; i < preg->re_nsub; i++)
1159 dfa->subexp_map[i] = i;
1160 preorder (dfa->str_tree, optimize_subexps, dfa);
1161 for (i = 0; i < preg->re_nsub; i++)
1162 if (dfa->subexp_map[i] != i)
1163 break;
1164 if (i == preg->re_nsub)
1166 free (dfa->subexp_map);
1167 dfa->subexp_map = NULL;
1171 ret = postorder (dfa->str_tree, lower_subexps, preg);
1172 if (BE (ret != REG_NOERROR, 0))
1173 return ret;
1174 ret = postorder (dfa->str_tree, calc_first, dfa);
1175 if (BE (ret != REG_NOERROR, 0))
1176 return ret;
1177 preorder (dfa->str_tree, calc_next, dfa);
1178 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1179 if (BE (ret != REG_NOERROR, 0))
1180 return ret;
1181 ret = calc_eclosure (dfa);
1182 if (BE (ret != REG_NOERROR, 0))
1183 return ret;
1185 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1186 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1187 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1188 || dfa->nbackref)
1190 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1191 if (BE (dfa->inveclosures == NULL, 0))
1192 return REG_ESPACE;
1193 ret = calc_inveclosure (dfa);
1196 return ret;
1199 /* Our parse trees are very unbalanced, so we cannot use a stack to
1200 implement parse tree visits. Instead, we use parent pointers and
1201 some hairy code in these two functions. */
1202 static reg_errcode_t
1203 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1204 void *extra)
1206 bin_tree_t *node, *prev;
1208 for (node = root; ; )
1210 /* Descend down the tree, preferably to the left (or to the right
1211 if that's the only child). */
1212 while (node->left || node->right)
1213 if (node->left)
1214 node = node->left;
1215 else
1216 node = node->right;
1220 reg_errcode_t err = fn (extra, node);
1221 if (BE (err != REG_NOERROR, 0))
1222 return err;
1223 if (node->parent == NULL)
1224 return REG_NOERROR;
1225 prev = node;
1226 node = node->parent;
1228 /* Go up while we have a node that is reached from the right. */
1229 while (node->right == prev || node->right == NULL);
1230 node = node->right;
1234 static reg_errcode_t
1235 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1236 void *extra)
1238 bin_tree_t *node;
1240 for (node = root; ; )
1242 reg_errcode_t err = fn (extra, node);
1243 if (BE (err != REG_NOERROR, 0))
1244 return err;
1246 /* Go to the left node, or up and to the right. */
1247 if (node->left)
1248 node = node->left;
1249 else
1251 bin_tree_t *prev = NULL;
1252 while (node->right == prev || node->right == NULL)
1254 prev = node;
1255 node = node->parent;
1256 if (!node)
1257 return REG_NOERROR;
1259 node = node->right;
1264 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1265 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1266 backreferences as well. Requires a preorder visit. */
1267 static reg_errcode_t
1268 optimize_subexps (void *extra, bin_tree_t *node)
1270 re_dfa_t *dfa = (re_dfa_t *) extra;
1272 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1274 int idx = node->token.opr.idx;
1275 node->token.opr.idx = dfa->subexp_map[idx];
1276 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1279 else if (node->token.type == SUBEXP
1280 && node->left && node->left->token.type == SUBEXP)
1282 int other_idx = node->left->token.opr.idx;
1284 node->left = node->left->left;
1285 if (node->left)
1286 node->left->parent = node;
1288 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1289 if (other_idx < BITSET_WORD_BITS)
1290 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1293 return REG_NOERROR;
1296 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1297 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1298 static reg_errcode_t
1299 lower_subexps (void *extra, bin_tree_t *node)
1301 regex_t *preg = (regex_t *) extra;
1302 reg_errcode_t err = REG_NOERROR;
1304 if (node->left && node->left->token.type == SUBEXP)
1306 node->left = lower_subexp (&err, preg, node->left);
1307 if (node->left)
1308 node->left->parent = node;
1310 if (node->right && node->right->token.type == SUBEXP)
1312 node->right = lower_subexp (&err, preg, node->right);
1313 if (node->right)
1314 node->right->parent = node;
1317 return err;
1320 static bin_tree_t *
1321 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1323 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1324 bin_tree_t *body = node->left;
1325 bin_tree_t *op, *cls, *tree1, *tree;
1327 if (preg->no_sub
1328 /* We do not optimize empty subexpressions, because otherwise we may
1329 have bad CONCAT nodes with NULL children. This is obviously not
1330 very common, so we do not lose much. An example that triggers
1331 this case is the sed "script" /\(\)/x. */
1332 && node->left != NULL
1333 && (node->token.opr.idx >= BITSET_WORD_BITS
1334 || !(dfa->used_bkref_map
1335 & ((bitset_word_t) 1 << node->token.opr.idx))))
1336 return node->left;
1338 /* Convert the SUBEXP node to the concatenation of an
1339 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1340 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1341 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1342 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1343 tree = create_tree (dfa, op, tree1, CONCAT);
1344 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1346 *err = REG_ESPACE;
1347 return NULL;
1350 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1351 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1352 return tree;
1355 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1356 nodes. Requires a postorder visit. */
1357 static reg_errcode_t
1358 calc_first (void *extra, bin_tree_t *node)
1360 re_dfa_t *dfa = (re_dfa_t *) extra;
1361 if (node->token.type == CONCAT)
1363 node->first = node->left->first;
1364 node->node_idx = node->left->node_idx;
1366 else
1368 node->first = node;
1369 node->node_idx = re_dfa_add_node (dfa, node->token);
1370 if (BE (node->node_idx == -1, 0))
1371 return REG_ESPACE;
1372 if (node->token.type == ANCHOR)
1373 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1375 return REG_NOERROR;
1378 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1379 static reg_errcode_t
1380 calc_next (void *extra, bin_tree_t *node)
1382 switch (node->token.type)
1384 case OP_DUP_ASTERISK:
1385 node->left->next = node;
1386 break;
1387 case CONCAT:
1388 node->left->next = node->right->first;
1389 node->right->next = node->next;
1390 break;
1391 default:
1392 if (node->left)
1393 node->left->next = node->next;
1394 if (node->right)
1395 node->right->next = node->next;
1396 break;
1398 return REG_NOERROR;
1401 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1402 static reg_errcode_t
1403 link_nfa_nodes (void *extra, bin_tree_t *node)
1405 re_dfa_t *dfa = (re_dfa_t *) extra;
1406 int idx = node->node_idx;
1407 reg_errcode_t err = REG_NOERROR;
1409 switch (node->token.type)
1411 case CONCAT:
1412 break;
1414 case END_OF_RE:
1415 assert (node->next == NULL);
1416 break;
1418 case OP_DUP_ASTERISK:
1419 case OP_ALT:
1421 int left, right;
1422 dfa->has_plural_match = 1;
1423 if (node->left != NULL)
1424 left = node->left->first->node_idx;
1425 else
1426 left = node->next->node_idx;
1427 if (node->right != NULL)
1428 right = node->right->first->node_idx;
1429 else
1430 right = node->next->node_idx;
1431 assert (left > -1);
1432 assert (right > -1);
1433 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1435 break;
1437 case ANCHOR:
1438 case OP_OPEN_SUBEXP:
1439 case OP_CLOSE_SUBEXP:
1440 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1441 break;
1443 case OP_BACK_REF:
1444 dfa->nexts[idx] = node->next->node_idx;
1445 if (node->token.type == OP_BACK_REF)
1446 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1447 break;
1449 default:
1450 assert (!IS_EPSILON_NODE (node->token.type));
1451 dfa->nexts[idx] = node->next->node_idx;
1452 break;
1455 return err;
1458 /* Duplicate the epsilon closure of the node ROOT_NODE.
1459 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1460 to their own constraint. */
1462 static reg_errcode_t
1463 internal_function
1464 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1465 int root_node, unsigned int init_constraint)
1467 int org_node, clone_node, ret;
1468 unsigned int constraint = init_constraint;
1469 for (org_node = top_org_node, clone_node = top_clone_node;;)
1471 int org_dest, clone_dest;
1472 if (dfa->nodes[org_node].type == OP_BACK_REF)
1474 /* If the back reference epsilon-transit, its destination must
1475 also have the constraint. Then duplicate the epsilon closure
1476 of the destination of the back reference, and store it in
1477 edests of the back reference. */
1478 org_dest = dfa->nexts[org_node];
1479 re_node_set_empty (dfa->edests + clone_node);
1480 clone_dest = duplicate_node (dfa, org_dest, constraint);
1481 if (BE (clone_dest == -1, 0))
1482 return REG_ESPACE;
1483 dfa->nexts[clone_node] = dfa->nexts[org_node];
1484 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1485 if (BE (ret < 0, 0))
1486 return REG_ESPACE;
1488 else if (dfa->edests[org_node].nelem == 0)
1490 /* In case of the node can't epsilon-transit, don't duplicate the
1491 destination and store the original destination as the
1492 destination of the node. */
1493 dfa->nexts[clone_node] = dfa->nexts[org_node];
1494 break;
1496 else if (dfa->edests[org_node].nelem == 1)
1498 /* In case of the node can epsilon-transit, and it has only one
1499 destination. */
1500 org_dest = dfa->edests[org_node].elems[0];
1501 re_node_set_empty (dfa->edests + clone_node);
1502 /* If the node is root_node itself, it means the epsilon closure
1503 has a loop. Then tie it to the destination of the root_node. */
1504 if (org_node == root_node && clone_node != org_node)
1506 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1507 if (BE (ret < 0, 0))
1508 return REG_ESPACE;
1509 break;
1511 /* In case the node has another constraint, append it. */
1512 constraint |= dfa->nodes[org_node].constraint;
1513 clone_dest = duplicate_node (dfa, org_dest, constraint);
1514 if (BE (clone_dest == -1, 0))
1515 return REG_ESPACE;
1516 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1517 if (BE (ret < 0, 0))
1518 return REG_ESPACE;
1520 else /* dfa->edests[org_node].nelem == 2 */
1522 /* In case of the node can epsilon-transit, and it has two
1523 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1524 org_dest = dfa->edests[org_node].elems[0];
1525 re_node_set_empty (dfa->edests + clone_node);
1526 /* Search for a duplicated node which satisfies the constraint. */
1527 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1528 if (clone_dest == -1)
1530 /* There is no such duplicated node, create a new one. */
1531 reg_errcode_t err;
1532 clone_dest = duplicate_node (dfa, org_dest, constraint);
1533 if (BE (clone_dest == -1, 0))
1534 return REG_ESPACE;
1535 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1536 if (BE (ret < 0, 0))
1537 return REG_ESPACE;
1538 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1539 root_node, constraint);
1540 if (BE (err != REG_NOERROR, 0))
1541 return err;
1543 else
1545 /* There is a duplicated node which satisfies the constraint,
1546 use it to avoid infinite loop. */
1547 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1548 if (BE (ret < 0, 0))
1549 return REG_ESPACE;
1552 org_dest = dfa->edests[org_node].elems[1];
1553 clone_dest = duplicate_node (dfa, org_dest, constraint);
1554 if (BE (clone_dest == -1, 0))
1555 return REG_ESPACE;
1556 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1557 if (BE (ret < 0, 0))
1558 return REG_ESPACE;
1560 org_node = org_dest;
1561 clone_node = clone_dest;
1563 return REG_NOERROR;
1566 /* Search for a node which is duplicated from the node ORG_NODE, and
1567 satisfies the constraint CONSTRAINT. */
1569 static int
1570 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1571 unsigned int constraint)
1573 int idx;
1574 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1576 if (org_node == dfa->org_indices[idx]
1577 && constraint == dfa->nodes[idx].constraint)
1578 return idx; /* Found. */
1580 return -1; /* Not found. */
1583 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1584 Return the index of the new node, or -1 if insufficient storage is
1585 available. */
1587 static int
1588 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1590 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1591 if (BE (dup_idx != -1, 1))
1593 dfa->nodes[dup_idx].constraint = constraint;
1594 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1595 dfa->nodes[dup_idx].duplicated = 1;
1597 /* Store the index of the original node. */
1598 dfa->org_indices[dup_idx] = org_idx;
1600 return dup_idx;
1603 static reg_errcode_t
1604 calc_inveclosure (re_dfa_t *dfa)
1606 int src, idx, ret;
1607 for (idx = 0; idx < dfa->nodes_len; ++idx)
1608 re_node_set_init_empty (dfa->inveclosures + idx);
1610 for (src = 0; src < dfa->nodes_len; ++src)
1612 int *elems = dfa->eclosures[src].elems;
1613 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1615 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1616 if (BE (ret == -1, 0))
1617 return REG_ESPACE;
1621 return REG_NOERROR;
1624 /* Calculate "eclosure" for all the node in DFA. */
1626 static reg_errcode_t
1627 calc_eclosure (re_dfa_t *dfa)
1629 int node_idx, incomplete;
1630 #ifdef DEBUG
1631 assert (dfa->nodes_len > 0);
1632 #endif
1633 incomplete = 0;
1634 /* For each nodes, calculate epsilon closure. */
1635 for (node_idx = 0; ; ++node_idx)
1637 reg_errcode_t err;
1638 re_node_set eclosure_elem;
1639 if (node_idx == dfa->nodes_len)
1641 if (!incomplete)
1642 break;
1643 incomplete = 0;
1644 node_idx = 0;
1647 #ifdef DEBUG
1648 assert (dfa->eclosures[node_idx].nelem != -1);
1649 #endif
1651 /* If we have already calculated, skip it. */
1652 if (dfa->eclosures[node_idx].nelem != 0)
1653 continue;
1654 /* Calculate epsilon closure of 'node_idx'. */
1655 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1656 if (BE (err != REG_NOERROR, 0))
1657 return err;
1659 if (dfa->eclosures[node_idx].nelem == 0)
1661 incomplete = 1;
1662 re_node_set_free (&eclosure_elem);
1665 return REG_NOERROR;
1668 /* Calculate epsilon closure of NODE. */
1670 static reg_errcode_t
1671 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1673 reg_errcode_t err;
1674 int i;
1675 re_node_set eclosure;
1676 int ret;
1677 int incomplete = 0;
1678 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1679 if (BE (err != REG_NOERROR, 0))
1680 return err;
1682 /* This indicates that we are calculating this node now.
1683 We reference this value to avoid infinite loop. */
1684 dfa->eclosures[node].nelem = -1;
1686 /* If the current node has constraints, duplicate all nodes
1687 since they must inherit the constraints. */
1688 if (dfa->nodes[node].constraint
1689 && dfa->edests[node].nelem
1690 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1692 err = duplicate_node_closure (dfa, node, node, node,
1693 dfa->nodes[node].constraint);
1694 if (BE (err != REG_NOERROR, 0))
1695 return err;
1698 /* Expand each epsilon destination nodes. */
1699 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1700 for (i = 0; i < dfa->edests[node].nelem; ++i)
1702 re_node_set eclosure_elem;
1703 int edest = dfa->edests[node].elems[i];
1704 /* If calculating the epsilon closure of `edest' is in progress,
1705 return intermediate result. */
1706 if (dfa->eclosures[edest].nelem == -1)
1708 incomplete = 1;
1709 continue;
1711 /* If we haven't calculated the epsilon closure of `edest' yet,
1712 calculate now. Otherwise use calculated epsilon closure. */
1713 if (dfa->eclosures[edest].nelem == 0)
1715 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1716 if (BE (err != REG_NOERROR, 0))
1717 return err;
1719 else
1720 eclosure_elem = dfa->eclosures[edest];
1721 /* Merge the epsilon closure of 'edest'. */
1722 err = re_node_set_merge (&eclosure, &eclosure_elem);
1723 if (BE (err != REG_NOERROR, 0))
1724 return err;
1725 /* If the epsilon closure of 'edest' is incomplete,
1726 the epsilon closure of this node is also incomplete. */
1727 if (dfa->eclosures[edest].nelem == 0)
1729 incomplete = 1;
1730 re_node_set_free (&eclosure_elem);
1734 /* An epsilon closure includes itself. */
1735 ret = re_node_set_insert (&eclosure, node);
1736 if (BE (ret < 0, 0))
1737 return REG_ESPACE;
1738 if (incomplete && !root)
1739 dfa->eclosures[node].nelem = 0;
1740 else
1741 dfa->eclosures[node] = eclosure;
1742 *new_set = eclosure;
1743 return REG_NOERROR;
1746 /* Functions for token which are used in the parser. */
1748 /* Fetch a token from INPUT.
1749 We must not use this function inside bracket expressions. */
1751 static void
1752 internal_function
1753 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1755 re_string_skip_bytes (input, peek_token (result, input, syntax));
1758 /* Peek a token from INPUT, and return the length of the token.
1759 We must not use this function inside bracket expressions. */
1761 static int
1762 internal_function
1763 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1765 unsigned char c;
1767 if (re_string_eoi (input))
1769 token->type = END_OF_RE;
1770 return 0;
1773 c = re_string_peek_byte (input, 0);
1774 token->opr.c = c;
1776 token->word_char = 0;
1777 #ifdef RE_ENABLE_I18N
1778 token->mb_partial = 0;
1779 if (input->mb_cur_max > 1 &&
1780 !re_string_first_byte (input, re_string_cur_idx (input)))
1782 token->type = CHARACTER;
1783 token->mb_partial = 1;
1784 return 1;
1786 #endif
1787 if (c == '\\')
1789 unsigned char c2;
1790 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1792 token->type = BACK_SLASH;
1793 return 1;
1796 c2 = re_string_peek_byte_case (input, 1);
1797 token->opr.c = c2;
1798 token->type = CHARACTER;
1799 #ifdef RE_ENABLE_I18N
1800 if (input->mb_cur_max > 1)
1802 wint_t wc = re_string_wchar_at (input,
1803 re_string_cur_idx (input) + 1);
1804 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1806 else
1807 #endif
1808 token->word_char = IS_WORD_CHAR (c2) != 0;
1810 switch (c2)
1812 case '|':
1813 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1814 token->type = OP_ALT;
1815 break;
1816 case '1': case '2': case '3': case '4': case '5':
1817 case '6': case '7': case '8': case '9':
1818 if (!(syntax & RE_NO_BK_REFS))
1820 token->type = OP_BACK_REF;
1821 token->opr.idx = c2 - '1';
1823 break;
1824 case '<':
1825 if (!(syntax & RE_NO_GNU_OPS))
1827 token->type = ANCHOR;
1828 token->opr.ctx_type = WORD_FIRST;
1830 break;
1831 case '>':
1832 if (!(syntax & RE_NO_GNU_OPS))
1834 token->type = ANCHOR;
1835 token->opr.ctx_type = WORD_LAST;
1837 break;
1838 case 'b':
1839 if (!(syntax & RE_NO_GNU_OPS))
1841 token->type = ANCHOR;
1842 token->opr.ctx_type = WORD_DELIM;
1844 break;
1845 case 'B':
1846 if (!(syntax & RE_NO_GNU_OPS))
1848 token->type = ANCHOR;
1849 token->opr.ctx_type = NOT_WORD_DELIM;
1851 break;
1852 case 'w':
1853 if (!(syntax & RE_NO_GNU_OPS))
1854 token->type = OP_WORD;
1855 break;
1856 case 'W':
1857 if (!(syntax & RE_NO_GNU_OPS))
1858 token->type = OP_NOTWORD;
1859 break;
1860 case 's':
1861 if (!(syntax & RE_NO_GNU_OPS))
1862 token->type = OP_SPACE;
1863 break;
1864 case 'S':
1865 if (!(syntax & RE_NO_GNU_OPS))
1866 token->type = OP_NOTSPACE;
1867 break;
1868 case '`':
1869 if (!(syntax & RE_NO_GNU_OPS))
1871 token->type = ANCHOR;
1872 token->opr.ctx_type = BUF_FIRST;
1874 break;
1875 case '\'':
1876 if (!(syntax & RE_NO_GNU_OPS))
1878 token->type = ANCHOR;
1879 token->opr.ctx_type = BUF_LAST;
1881 break;
1882 case '(':
1883 if (!(syntax & RE_NO_BK_PARENS))
1884 token->type = OP_OPEN_SUBEXP;
1885 break;
1886 case ')':
1887 if (!(syntax & RE_NO_BK_PARENS))
1888 token->type = OP_CLOSE_SUBEXP;
1889 break;
1890 case '+':
1891 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1892 token->type = OP_DUP_PLUS;
1893 break;
1894 case '?':
1895 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1896 token->type = OP_DUP_QUESTION;
1897 break;
1898 case '{':
1899 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1900 token->type = OP_OPEN_DUP_NUM;
1901 break;
1902 case '}':
1903 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1904 token->type = OP_CLOSE_DUP_NUM;
1905 break;
1906 default:
1907 break;
1909 return 2;
1912 token->type = CHARACTER;
1913 #ifdef RE_ENABLE_I18N
1914 if (input->mb_cur_max > 1)
1916 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1917 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1919 else
1920 #endif
1921 token->word_char = IS_WORD_CHAR (token->opr.c);
1923 switch (c)
1925 case '\n':
1926 if (syntax & RE_NEWLINE_ALT)
1927 token->type = OP_ALT;
1928 break;
1929 case '|':
1930 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1931 token->type = OP_ALT;
1932 break;
1933 case '*':
1934 token->type = OP_DUP_ASTERISK;
1935 break;
1936 case '+':
1937 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1938 token->type = OP_DUP_PLUS;
1939 break;
1940 case '?':
1941 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1942 token->type = OP_DUP_QUESTION;
1943 break;
1944 case '{':
1945 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1946 token->type = OP_OPEN_DUP_NUM;
1947 break;
1948 case '}':
1949 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1950 token->type = OP_CLOSE_DUP_NUM;
1951 break;
1952 case '(':
1953 if (syntax & RE_NO_BK_PARENS)
1954 token->type = OP_OPEN_SUBEXP;
1955 break;
1956 case ')':
1957 if (syntax & RE_NO_BK_PARENS)
1958 token->type = OP_CLOSE_SUBEXP;
1959 break;
1960 case '[':
1961 token->type = OP_OPEN_BRACKET;
1962 break;
1963 case '.':
1964 token->type = OP_PERIOD;
1965 break;
1966 case '^':
1967 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1968 re_string_cur_idx (input) != 0)
1970 char prev = re_string_peek_byte (input, -1);
1971 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1972 break;
1974 token->type = ANCHOR;
1975 token->opr.ctx_type = LINE_FIRST;
1976 break;
1977 case '$':
1978 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1979 re_string_cur_idx (input) + 1 != re_string_length (input))
1981 re_token_t next;
1982 re_string_skip_bytes (input, 1);
1983 peek_token (&next, input, syntax);
1984 re_string_skip_bytes (input, -1);
1985 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1986 break;
1988 token->type = ANCHOR;
1989 token->opr.ctx_type = LINE_LAST;
1990 break;
1991 default:
1992 break;
1994 return 1;
1997 /* Peek a token from INPUT, and return the length of the token.
1998 We must not use this function out of bracket expressions. */
2000 static int
2001 internal_function
2002 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2004 unsigned char c;
2005 if (re_string_eoi (input))
2007 token->type = END_OF_RE;
2008 return 0;
2010 c = re_string_peek_byte (input, 0);
2011 token->opr.c = c;
2013 #ifdef RE_ENABLE_I18N
2014 if (input->mb_cur_max > 1 &&
2015 !re_string_first_byte (input, re_string_cur_idx (input)))
2017 token->type = CHARACTER;
2018 return 1;
2020 #endif /* RE_ENABLE_I18N */
2022 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2023 && re_string_cur_idx (input) + 1 < re_string_length (input))
2025 /* In this case, '\' escape a character. */
2026 unsigned char c2;
2027 re_string_skip_bytes (input, 1);
2028 c2 = re_string_peek_byte (input, 0);
2029 token->opr.c = c2;
2030 token->type = CHARACTER;
2031 return 1;
2033 if (c == '[') /* '[' is a special char in a bracket exps. */
2035 unsigned char c2;
2036 int token_len;
2037 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2038 c2 = re_string_peek_byte (input, 1);
2039 else
2040 c2 = 0;
2041 token->opr.c = c2;
2042 token_len = 2;
2043 switch (c2)
2045 case '.':
2046 token->type = OP_OPEN_COLL_ELEM;
2047 break;
2048 case '=':
2049 token->type = OP_OPEN_EQUIV_CLASS;
2050 break;
2051 case ':':
2052 if (syntax & RE_CHAR_CLASSES)
2054 token->type = OP_OPEN_CHAR_CLASS;
2055 break;
2057 /* else fall through. */
2058 default:
2059 token->type = CHARACTER;
2060 token->opr.c = c;
2061 token_len = 1;
2062 break;
2064 return token_len;
2066 switch (c)
2068 case '-':
2069 token->type = OP_CHARSET_RANGE;
2070 break;
2071 case ']':
2072 token->type = OP_CLOSE_BRACKET;
2073 break;
2074 case '^':
2075 token->type = OP_NON_MATCH_LIST;
2076 break;
2077 default:
2078 token->type = CHARACTER;
2080 return 1;
2083 /* Functions for parser. */
2085 /* Entry point of the parser.
2086 Parse the regular expression REGEXP and return the structure tree.
2087 If an error occurs, ERR is set by error code, and return NULL.
2088 This function build the following tree, from regular expression <reg_exp>:
2092 <reg_exp> EOR
2094 CAT means concatenation.
2095 EOR means end of regular expression. */
2097 static bin_tree_t *
2098 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2099 reg_errcode_t *err)
2101 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2102 bin_tree_t *tree, *eor, *root;
2103 re_token_t current_token;
2104 dfa->syntax = syntax;
2105 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2106 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2107 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2108 return NULL;
2109 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2110 if (tree != NULL)
2111 root = create_tree (dfa, tree, eor, CONCAT);
2112 else
2113 root = eor;
2114 if (BE (eor == NULL || root == NULL, 0))
2116 *err = REG_ESPACE;
2117 return NULL;
2119 return root;
2122 /* This function build the following tree, from regular expression
2123 <branch1>|<branch2>:
2127 <branch1> <branch2>
2129 ALT means alternative, which represents the operator '|'. */
2131 static bin_tree_t *
2132 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2133 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2135 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2136 bin_tree_t *tree, *branch = NULL;
2137 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2138 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2139 return NULL;
2141 while (token->type == OP_ALT)
2143 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2144 if (token->type != OP_ALT && token->type != END_OF_RE
2145 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2147 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2148 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2150 if (tree != NULL)
2151 postorder (tree, free_tree, NULL);
2152 return NULL;
2155 else
2156 branch = NULL;
2157 tree = create_tree (dfa, tree, branch, OP_ALT);
2158 if (BE (tree == NULL, 0))
2160 *err = REG_ESPACE;
2161 return NULL;
2164 return tree;
2167 /* This function build the following tree, from regular expression
2168 <exp1><exp2>:
2172 <exp1> <exp2>
2174 CAT means concatenation. */
2176 static bin_tree_t *
2177 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2178 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2180 bin_tree_t *tree, *exp;
2181 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2182 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2183 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2184 return NULL;
2186 while (token->type != OP_ALT && token->type != END_OF_RE
2187 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2189 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2190 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2192 if (tree != NULL)
2193 postorder (tree, free_tree, NULL);
2194 return NULL;
2196 if (tree != NULL && exp != NULL)
2198 bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2199 if (newtree == NULL)
2201 postorder (exp, free_tree, NULL);
2202 postorder (tree, free_tree, NULL);
2203 *err = REG_ESPACE;
2204 return NULL;
2206 tree = newtree;
2208 else if (tree == NULL)
2209 tree = exp;
2210 /* Otherwise exp == NULL, we don't need to create new tree. */
2212 return tree;
2215 /* This function build the following tree, from regular expression a*:
2221 static bin_tree_t *
2222 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2223 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2225 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2226 bin_tree_t *tree;
2227 switch (token->type)
2229 case CHARACTER:
2230 tree = create_token_tree (dfa, NULL, NULL, token);
2231 if (BE (tree == NULL, 0))
2233 *err = REG_ESPACE;
2234 return NULL;
2236 #ifdef RE_ENABLE_I18N
2237 if (dfa->mb_cur_max > 1)
2239 while (!re_string_eoi (regexp)
2240 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2242 bin_tree_t *mbc_remain;
2243 fetch_token (token, regexp, syntax);
2244 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2245 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2246 if (BE (mbc_remain == NULL || tree == NULL, 0))
2248 *err = REG_ESPACE;
2249 return NULL;
2253 #endif
2254 break;
2255 case OP_OPEN_SUBEXP:
2256 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2257 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2258 return NULL;
2259 break;
2260 case OP_OPEN_BRACKET:
2261 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2262 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2263 return NULL;
2264 break;
2265 case OP_BACK_REF:
2266 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2268 *err = REG_ESUBREG;
2269 return NULL;
2271 dfa->used_bkref_map |= 1 << token->opr.idx;
2272 tree = create_token_tree (dfa, NULL, NULL, token);
2273 if (BE (tree == NULL, 0))
2275 *err = REG_ESPACE;
2276 return NULL;
2278 ++dfa->nbackref;
2279 dfa->has_mb_node = 1;
2280 break;
2281 case OP_OPEN_DUP_NUM:
2282 if (syntax & RE_CONTEXT_INVALID_DUP)
2284 *err = REG_BADRPT;
2285 return NULL;
2287 /* FALLTHROUGH */
2288 case OP_DUP_ASTERISK:
2289 case OP_DUP_PLUS:
2290 case OP_DUP_QUESTION:
2291 if (syntax & RE_CONTEXT_INVALID_OPS)
2293 *err = REG_BADRPT;
2294 return NULL;
2296 else if (syntax & RE_CONTEXT_INDEP_OPS)
2298 fetch_token (token, regexp, syntax);
2299 return parse_expression (regexp, preg, token, syntax, nest, err);
2301 /* else fall through */
2302 case OP_CLOSE_SUBEXP:
2303 if ((token->type == OP_CLOSE_SUBEXP) &&
2304 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2306 *err = REG_ERPAREN;
2307 return NULL;
2309 /* else fall through */
2310 case OP_CLOSE_DUP_NUM:
2311 /* We treat it as a normal character. */
2313 /* Then we can these characters as normal characters. */
2314 token->type = CHARACTER;
2315 /* mb_partial and word_char bits should be initialized already
2316 by peek_token. */
2317 tree = create_token_tree (dfa, NULL, NULL, token);
2318 if (BE (tree == NULL, 0))
2320 *err = REG_ESPACE;
2321 return NULL;
2323 break;
2324 case ANCHOR:
2325 if ((token->opr.ctx_type
2326 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2327 && dfa->word_ops_used == 0)
2328 init_word_char (dfa);
2329 if (token->opr.ctx_type == WORD_DELIM
2330 || token->opr.ctx_type == NOT_WORD_DELIM)
2332 bin_tree_t *tree_first, *tree_last;
2333 if (token->opr.ctx_type == WORD_DELIM)
2335 token->opr.ctx_type = WORD_FIRST;
2336 tree_first = create_token_tree (dfa, NULL, NULL, token);
2337 token->opr.ctx_type = WORD_LAST;
2339 else
2341 token->opr.ctx_type = INSIDE_WORD;
2342 tree_first = create_token_tree (dfa, NULL, NULL, token);
2343 token->opr.ctx_type = INSIDE_NOTWORD;
2345 tree_last = create_token_tree (dfa, NULL, NULL, token);
2346 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2347 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2349 *err = REG_ESPACE;
2350 return NULL;
2353 else
2355 tree = create_token_tree (dfa, NULL, NULL, token);
2356 if (BE (tree == NULL, 0))
2358 *err = REG_ESPACE;
2359 return NULL;
2362 /* We must return here, since ANCHORs can't be followed
2363 by repetition operators.
2364 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2365 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2366 fetch_token (token, regexp, syntax);
2367 return tree;
2368 case OP_PERIOD:
2369 tree = create_token_tree (dfa, NULL, NULL, token);
2370 if (BE (tree == NULL, 0))
2372 *err = REG_ESPACE;
2373 return NULL;
2375 if (dfa->mb_cur_max > 1)
2376 dfa->has_mb_node = 1;
2377 break;
2378 case OP_WORD:
2379 case OP_NOTWORD:
2380 tree = build_charclass_op (dfa, regexp->trans,
2381 (const unsigned char *) "alnum",
2382 (const unsigned char *) "_",
2383 token->type == OP_NOTWORD, err);
2384 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2385 return NULL;
2386 break;
2387 case OP_SPACE:
2388 case OP_NOTSPACE:
2389 tree = build_charclass_op (dfa, regexp->trans,
2390 (const unsigned char *) "space",
2391 (const unsigned char *) "",
2392 token->type == OP_NOTSPACE, err);
2393 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2394 return NULL;
2395 break;
2396 case OP_ALT:
2397 case END_OF_RE:
2398 return NULL;
2399 case BACK_SLASH:
2400 *err = REG_EESCAPE;
2401 return NULL;
2402 default:
2403 /* Must not happen? */
2404 #ifdef DEBUG
2405 assert (0);
2406 #endif
2407 return NULL;
2409 fetch_token (token, regexp, syntax);
2411 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2412 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2414 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2415 if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2417 if (tree != NULL)
2418 postorder (tree, free_tree, NULL);
2419 return NULL;
2421 tree = dup_tree;
2422 /* In BRE consecutive duplications are not allowed. */
2423 if ((syntax & RE_CONTEXT_INVALID_DUP)
2424 && (token->type == OP_DUP_ASTERISK
2425 || token->type == OP_OPEN_DUP_NUM))
2427 if (tree != NULL)
2428 postorder (tree, free_tree, NULL);
2429 *err = REG_BADRPT;
2430 return NULL;
2434 return tree;
2437 /* This function build the following tree, from regular expression
2438 (<reg_exp>):
2439 SUBEXP
2441 <reg_exp>
2444 static bin_tree_t *
2445 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2446 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2448 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2449 bin_tree_t *tree;
2450 size_t cur_nsub;
2451 cur_nsub = preg->re_nsub++;
2453 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2455 /* The subexpression may be a null string. */
2456 if (token->type == OP_CLOSE_SUBEXP)
2457 tree = NULL;
2458 else
2460 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2461 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2463 if (tree != NULL)
2464 postorder (tree, free_tree, NULL);
2465 *err = REG_EPAREN;
2467 if (BE (*err != REG_NOERROR, 0))
2468 return NULL;
2471 if (cur_nsub <= '9' - '1')
2472 dfa->completed_bkref_map |= 1 << cur_nsub;
2474 tree = create_tree (dfa, tree, NULL, SUBEXP);
2475 if (BE (tree == NULL, 0))
2477 *err = REG_ESPACE;
2478 return NULL;
2480 tree->token.opr.idx = cur_nsub;
2481 return tree;
2484 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2486 static bin_tree_t *
2487 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2488 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2490 bin_tree_t *tree = NULL, *old_tree = NULL;
2491 int i, start, end, start_idx = re_string_cur_idx (regexp);
2492 re_token_t start_token = *token;
2494 if (token->type == OP_OPEN_DUP_NUM)
2496 end = 0;
2497 start = fetch_number (regexp, token, syntax);
2498 if (start == -1)
2500 if (token->type == CHARACTER && token->opr.c == ',')
2501 start = 0; /* We treat "{,m}" as "{0,m}". */
2502 else
2504 *err = REG_BADBR; /* <re>{} is invalid. */
2505 return NULL;
2508 if (BE (start != -2, 1))
2510 /* We treat "{n}" as "{n,n}". */
2511 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2512 : ((token->type == CHARACTER && token->opr.c == ',')
2513 ? fetch_number (regexp, token, syntax) : -2));
2515 if (BE (start == -2 || end == -2, 0))
2517 /* Invalid sequence. */
2518 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2520 if (token->type == END_OF_RE)
2521 *err = REG_EBRACE;
2522 else
2523 *err = REG_BADBR;
2525 return NULL;
2528 /* If the syntax bit is set, rollback. */
2529 re_string_set_index (regexp, start_idx);
2530 *token = start_token;
2531 token->type = CHARACTER;
2532 /* mb_partial and word_char bits should be already initialized by
2533 peek_token. */
2534 return elem;
2537 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2539 /* First number greater than second. */
2540 *err = REG_BADBR;
2541 return NULL;
2544 else
2546 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2547 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2550 fetch_token (token, regexp, syntax);
2552 if (BE (elem == NULL, 0))
2553 return NULL;
2554 if (BE (start == 0 && end == 0, 0))
2556 postorder (elem, free_tree, NULL);
2557 return NULL;
2560 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2561 if (BE (start > 0, 0))
2563 tree = elem;
2564 for (i = 2; i <= start; ++i)
2566 elem = duplicate_tree (elem, dfa);
2567 tree = create_tree (dfa, tree, elem, CONCAT);
2568 if (BE (elem == NULL || tree == NULL, 0))
2569 goto parse_dup_op_espace;
2572 if (start == end)
2573 return tree;
2575 /* Duplicate ELEM before it is marked optional. */
2576 elem = duplicate_tree (elem, dfa);
2577 if (BE (elem == NULL, 0))
2578 goto parse_dup_op_espace;
2579 old_tree = tree;
2581 else
2582 old_tree = NULL;
2584 if (elem->token.type == SUBEXP)
2585 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2587 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2588 if (BE (tree == NULL, 0))
2589 goto parse_dup_op_espace;
2591 /* This loop is actually executed only when end != -1,
2592 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2593 already created the start+1-th copy. */
2594 for (i = start + 2; i <= end; ++i)
2596 elem = duplicate_tree (elem, dfa);
2597 tree = create_tree (dfa, tree, elem, CONCAT);
2598 if (BE (elem == NULL || tree == NULL, 0))
2599 goto parse_dup_op_espace;
2601 tree = create_tree (dfa, tree, NULL, OP_ALT);
2602 if (BE (tree == NULL, 0))
2603 goto parse_dup_op_espace;
2606 if (old_tree)
2607 tree = create_tree (dfa, old_tree, tree, CONCAT);
2609 return tree;
2611 parse_dup_op_espace:
2612 *err = REG_ESPACE;
2613 return NULL;
2616 /* Size of the names for collating symbol/equivalence_class/character_class.
2617 I'm not sure, but maybe enough. */
2618 #define BRACKET_NAME_BUF_SIZE 32
2620 #ifndef _LIBC
2621 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2622 Build the range expression which starts from START_ELEM, and ends
2623 at END_ELEM. The result are written to MBCSET and SBCSET.
2624 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2625 mbcset->range_ends, is a pointer argument since we may
2626 update it. */
2628 static reg_errcode_t
2629 internal_function
2630 # ifdef RE_ENABLE_I18N
2631 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2632 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2633 # else /* not RE_ENABLE_I18N */
2634 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2635 bracket_elem_t *end_elem)
2636 # endif /* not RE_ENABLE_I18N */
2638 unsigned int start_ch, end_ch;
2639 /* Equivalence Classes and Character Classes can't be a range start/end. */
2640 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2641 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2643 return REG_ERANGE;
2645 /* We can handle no multi character collating elements without libc
2646 support. */
2647 if (BE ((start_elem->type == COLL_SYM
2648 && strlen ((char *) start_elem->opr.name) > 1)
2649 || (end_elem->type == COLL_SYM
2650 && strlen ((char *) end_elem->opr.name) > 1), 0))
2651 return REG_ECOLLATE;
2653 # ifdef RE_ENABLE_I18N
2655 wchar_t wc;
2656 wint_t start_wc;
2657 wint_t end_wc;
2658 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2660 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2661 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2662 : 0));
2663 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2664 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2665 : 0));
2666 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2667 ? __btowc (start_ch) : start_elem->opr.wch);
2668 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2669 ? __btowc (end_ch) : end_elem->opr.wch);
2670 if (start_wc == WEOF || end_wc == WEOF)
2671 return REG_ECOLLATE;
2672 cmp_buf[0] = start_wc;
2673 cmp_buf[4] = end_wc;
2674 if (__wcscoll (cmp_buf, cmp_buf + 4) > 0)
2675 return REG_ERANGE;
2677 /* Got valid collation sequence values, add them as a new entry.
2678 However, for !_LIBC we have no collation elements: if the
2679 character set is single byte, the single byte character set
2680 that we build below suffices. parse_bracket_exp passes
2681 no MBCSET if dfa->mb_cur_max == 1. */
2682 if (mbcset)
2684 /* Check the space of the arrays. */
2685 if (BE (*range_alloc == mbcset->nranges, 0))
2687 /* There is not enough space, need realloc. */
2688 wchar_t *new_array_start, *new_array_end;
2689 int new_nranges;
2691 /* +1 in case of mbcset->nranges is 0. */
2692 new_nranges = 2 * mbcset->nranges + 1;
2693 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2694 are NULL if *range_alloc == 0. */
2695 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2696 new_nranges);
2697 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2698 new_nranges);
2700 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2701 return REG_ESPACE;
2703 mbcset->range_starts = new_array_start;
2704 mbcset->range_ends = new_array_end;
2705 *range_alloc = new_nranges;
2708 mbcset->range_starts[mbcset->nranges] = start_wc;
2709 mbcset->range_ends[mbcset->nranges++] = end_wc;
2712 /* Build the table for single byte characters. */
2713 for (wc = 0; wc < SBC_MAX; ++wc)
2715 cmp_buf[2] = wc;
2716 if (__wcscoll (cmp_buf, cmp_buf + 2) <= 0
2717 && __wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2718 bitset_set (sbcset, wc);
2721 # else /* not RE_ENABLE_I18N */
2723 unsigned int ch;
2724 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2725 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2726 : 0));
2727 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2728 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2729 : 0));
2730 if (start_ch > end_ch)
2731 return REG_ERANGE;
2732 /* Build the table for single byte characters. */
2733 for (ch = 0; ch < SBC_MAX; ++ch)
2734 if (start_ch <= ch && ch <= end_ch)
2735 bitset_set (sbcset, ch);
2737 # endif /* not RE_ENABLE_I18N */
2738 return REG_NOERROR;
2740 #endif /* not _LIBC */
2742 #ifndef _LIBC
2743 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2744 Build the collating element which is represented by NAME.
2745 The result are written to MBCSET and SBCSET.
2746 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2747 pointer argument since we may update it. */
2749 static reg_errcode_t
2750 internal_function
2751 # ifdef RE_ENABLE_I18N
2752 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2753 int *coll_sym_alloc, const unsigned char *name)
2754 # else /* not RE_ENABLE_I18N */
2755 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2756 # endif /* not RE_ENABLE_I18N */
2758 size_t name_len = strlen ((const char *) name);
2759 if (BE (name_len != 1, 0))
2760 return REG_ECOLLATE;
2761 else
2763 bitset_set (sbcset, name[0]);
2764 return REG_NOERROR;
2767 #endif /* not _LIBC */
2769 /* This function parse bracket expression like "[abc]", "[a-c]",
2770 "[[.a-a.]]" etc. */
2772 static bin_tree_t *
2773 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2774 reg_syntax_t syntax, reg_errcode_t *err)
2776 #ifdef _LIBC
2777 const unsigned char *collseqmb;
2778 const char *collseqwc;
2779 uint32_t nrules;
2780 int32_t table_size;
2781 const int32_t *symb_table;
2782 const unsigned char *extra;
2784 /* Local function for parse_bracket_exp used in _LIBC environment.
2785 Seek the collating symbol entry corresponding to NAME.
2786 Return the index of the symbol in the SYMB_TABLE,
2787 or -1 if not found. */
2789 auto inline int32_t
2790 __attribute__ ((always_inline))
2791 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2793 int32_t elem;
2795 for (elem = 0; elem < table_size; elem++)
2796 if (symb_table[2 * elem] != 0)
2798 int32_t idx = symb_table[2 * elem + 1];
2799 /* Skip the name of collating element name. */
2800 idx += 1 + extra[idx];
2801 if (/* Compare the length of the name. */
2802 name_len == extra[idx]
2803 /* Compare the name. */
2804 && memcmp (name, &extra[idx + 1], name_len) == 0)
2805 /* Yep, this is the entry. */
2806 return elem;
2808 return -1;
2811 /* Local function for parse_bracket_exp used in _LIBC environment.
2812 Look up the collation sequence value of BR_ELEM.
2813 Return the value if succeeded, UINT_MAX otherwise. */
2815 auto inline unsigned int
2816 __attribute__ ((always_inline))
2817 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2819 if (br_elem->type == SB_CHAR)
2822 if (MB_CUR_MAX == 1)
2824 if (nrules == 0)
2825 return collseqmb[br_elem->opr.ch];
2826 else
2828 wint_t wc = __btowc (br_elem->opr.ch);
2829 return __collseq_table_lookup (collseqwc, wc);
2832 else if (br_elem->type == MB_CHAR)
2834 if (nrules != 0)
2835 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2837 else if (br_elem->type == COLL_SYM)
2839 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2840 if (nrules != 0)
2842 int32_t elem, idx;
2843 elem = seek_collating_symbol_entry (br_elem->opr.name,
2844 sym_name_len);
2845 if (elem != -1)
2847 /* We found the entry. */
2848 idx = symb_table[2 * elem + 1];
2849 /* Skip the name of collating element name. */
2850 idx += 1 + extra[idx];
2851 /* Skip the byte sequence of the collating element. */
2852 idx += 1 + extra[idx];
2853 /* Adjust for the alignment. */
2854 idx = (idx + 3) & ~3;
2855 /* Skip the multibyte collation sequence value. */
2856 idx += sizeof (unsigned int);
2857 /* Skip the wide char sequence of the collating element. */
2858 idx += sizeof (unsigned int) *
2859 (1 + *(unsigned int *) (extra + idx));
2860 /* Return the collation sequence value. */
2861 return *(unsigned int *) (extra + idx);
2863 else if (sym_name_len == 1)
2865 /* No valid character. Match it as a single byte
2866 character. */
2867 return collseqmb[br_elem->opr.name[0]];
2870 else if (sym_name_len == 1)
2871 return collseqmb[br_elem->opr.name[0]];
2873 return UINT_MAX;
2876 /* Local function for parse_bracket_exp used in _LIBC environment.
2877 Build the range expression which starts from START_ELEM, and ends
2878 at END_ELEM. The result are written to MBCSET and SBCSET.
2879 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2880 mbcset->range_ends, is a pointer argument since we may
2881 update it. */
2883 auto inline reg_errcode_t
2884 __attribute__ ((always_inline))
2885 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2886 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2888 unsigned int ch;
2889 uint32_t start_collseq;
2890 uint32_t end_collseq;
2892 /* Equivalence Classes and Character Classes can't be a range
2893 start/end. */
2894 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2895 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2897 return REG_ERANGE;
2899 start_collseq = lookup_collation_sequence_value (start_elem);
2900 end_collseq = lookup_collation_sequence_value (end_elem);
2901 /* Check start/end collation sequence values. */
2902 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2903 return REG_ECOLLATE;
2904 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2905 return REG_ERANGE;
2907 /* Got valid collation sequence values, add them as a new entry.
2908 However, if we have no collation elements, and the character set
2909 is single byte, the single byte character set that we
2910 build below suffices. */
2911 if (nrules > 0 || dfa->mb_cur_max > 1)
2913 /* Check the space of the arrays. */
2914 if (BE (*range_alloc == mbcset->nranges, 0))
2916 /* There is not enough space, need realloc. */
2917 uint32_t *new_array_start;
2918 uint32_t *new_array_end;
2919 int new_nranges;
2921 /* +1 in case of mbcset->nranges is 0. */
2922 new_nranges = 2 * mbcset->nranges + 1;
2923 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2924 new_nranges);
2925 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2926 new_nranges);
2928 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2929 return REG_ESPACE;
2931 mbcset->range_starts = new_array_start;
2932 mbcset->range_ends = new_array_end;
2933 *range_alloc = new_nranges;
2936 mbcset->range_starts[mbcset->nranges] = start_collseq;
2937 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2940 /* Build the table for single byte characters. */
2941 for (ch = 0; ch < SBC_MAX; ch++)
2943 uint32_t ch_collseq;
2945 if (MB_CUR_MAX == 1)
2947 if (nrules == 0)
2948 ch_collseq = collseqmb[ch];
2949 else
2950 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2951 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2952 bitset_set (sbcset, ch);
2954 return REG_NOERROR;
2957 /* Local function for parse_bracket_exp used in _LIBC environment.
2958 Build the collating element which is represented by NAME.
2959 The result are written to MBCSET and SBCSET.
2960 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2961 pointer argument since we may update it. */
2963 auto inline reg_errcode_t
2964 __attribute__ ((always_inline))
2965 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2966 int *coll_sym_alloc, const unsigned char *name)
2968 int32_t elem, idx;
2969 size_t name_len = strlen ((const char *) name);
2970 if (nrules != 0)
2972 elem = seek_collating_symbol_entry (name, name_len);
2973 if (elem != -1)
2975 /* We found the entry. */
2976 idx = symb_table[2 * elem + 1];
2977 /* Skip the name of collating element name. */
2978 idx += 1 + extra[idx];
2980 else if (name_len == 1)
2982 /* No valid character, treat it as a normal
2983 character. */
2984 bitset_set (sbcset, name[0]);
2985 return REG_NOERROR;
2987 else
2988 return REG_ECOLLATE;
2990 /* Got valid collation sequence, add it as a new entry. */
2991 /* Check the space of the arrays. */
2992 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2994 /* Not enough, realloc it. */
2995 /* +1 in case of mbcset->ncoll_syms is 0. */
2996 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2997 /* Use realloc since mbcset->coll_syms is NULL
2998 if *alloc == 0. */
2999 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3000 new_coll_sym_alloc);
3001 if (BE (new_coll_syms == NULL, 0))
3002 return REG_ESPACE;
3003 mbcset->coll_syms = new_coll_syms;
3004 *coll_sym_alloc = new_coll_sym_alloc;
3006 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3007 return REG_NOERROR;
3009 else
3011 if (BE (name_len != 1, 0))
3012 return REG_ECOLLATE;
3013 else
3015 bitset_set (sbcset, name[0]);
3016 return REG_NOERROR;
3020 #endif
3022 re_token_t br_token;
3023 re_bitset_ptr_t sbcset;
3024 #ifdef RE_ENABLE_I18N
3025 re_charset_t *mbcset;
3026 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3027 int equiv_class_alloc = 0, char_class_alloc = 0;
3028 #endif /* not RE_ENABLE_I18N */
3029 int non_match = 0;
3030 bin_tree_t *work_tree;
3031 int token_len;
3032 int first_round = 1;
3033 #ifdef _LIBC
3034 collseqmb = (const unsigned char *)
3035 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3036 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3037 if (nrules)
3040 if (MB_CUR_MAX > 1)
3042 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3043 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3044 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3045 _NL_COLLATE_SYMB_TABLEMB);
3046 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3047 _NL_COLLATE_SYMB_EXTRAMB);
3049 #endif
3050 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3051 #ifdef RE_ENABLE_I18N
3052 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3053 #endif /* RE_ENABLE_I18N */
3054 #ifdef RE_ENABLE_I18N
3055 if (BE (sbcset == NULL || mbcset == NULL, 0))
3056 #else
3057 if (BE (sbcset == NULL, 0))
3058 #endif /* RE_ENABLE_I18N */
3060 re_free (sbcset);
3061 #ifdef RE_ENABLE_I18N
3062 re_free (mbcset);
3063 #endif
3064 *err = REG_ESPACE;
3065 return NULL;
3068 token_len = peek_token_bracket (token, regexp, syntax);
3069 if (BE (token->type == END_OF_RE, 0))
3071 *err = REG_BADPAT;
3072 goto parse_bracket_exp_free_return;
3074 if (token->type == OP_NON_MATCH_LIST)
3076 #ifdef RE_ENABLE_I18N
3077 mbcset->non_match = 1;
3078 #endif /* not RE_ENABLE_I18N */
3079 non_match = 1;
3080 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3081 bitset_set (sbcset, '\n');
3082 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3083 token_len = peek_token_bracket (token, regexp, syntax);
3084 if (BE (token->type == END_OF_RE, 0))
3086 *err = REG_BADPAT;
3087 goto parse_bracket_exp_free_return;
3091 /* We treat the first ']' as a normal character. */
3092 if (token->type == OP_CLOSE_BRACKET)
3093 token->type = CHARACTER;
3095 while (1)
3097 bracket_elem_t start_elem, end_elem;
3098 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3099 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3100 reg_errcode_t ret;
3101 int token_len2 = 0, is_range_exp = 0;
3102 re_token_t token2;
3104 start_elem.opr.name = start_name_buf;
3105 start_elem.type = COLL_SYM;
3106 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3107 syntax, first_round);
3108 if (BE (ret != REG_NOERROR, 0))
3110 *err = ret;
3111 goto parse_bracket_exp_free_return;
3113 first_round = 0;
3115 /* Get information about the next token. We need it in any case. */
3116 token_len = peek_token_bracket (token, regexp, syntax);
3118 /* Do not check for ranges if we know they are not allowed. */
3119 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3121 if (BE (token->type == END_OF_RE, 0))
3123 *err = REG_EBRACK;
3124 goto parse_bracket_exp_free_return;
3126 if (token->type == OP_CHARSET_RANGE)
3128 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3129 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3130 if (BE (token2.type == END_OF_RE, 0))
3132 *err = REG_EBRACK;
3133 goto parse_bracket_exp_free_return;
3135 if (token2.type == OP_CLOSE_BRACKET)
3137 /* We treat the last '-' as a normal character. */
3138 re_string_skip_bytes (regexp, -token_len);
3139 token->type = CHARACTER;
3141 else
3142 is_range_exp = 1;
3146 if (is_range_exp == 1)
3148 end_elem.opr.name = end_name_buf;
3149 end_elem.type = COLL_SYM;
3150 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3151 dfa, syntax, 1);
3152 if (BE (ret != REG_NOERROR, 0))
3154 *err = ret;
3155 goto parse_bracket_exp_free_return;
3158 token_len = peek_token_bracket (token, regexp, syntax);
3160 #ifdef _LIBC
3161 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3162 &start_elem, &end_elem);
3163 #else
3164 # ifdef RE_ENABLE_I18N
3165 *err = build_range_exp (sbcset,
3166 dfa->mb_cur_max > 1 ? mbcset : NULL,
3167 &range_alloc, &start_elem, &end_elem);
3168 # else
3169 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3170 # endif
3171 #endif /* RE_ENABLE_I18N */
3172 if (BE (*err != REG_NOERROR, 0))
3173 goto parse_bracket_exp_free_return;
3175 else
3177 switch (start_elem.type)
3179 case SB_CHAR:
3180 bitset_set (sbcset, start_elem.opr.ch);
3181 break;
3182 #ifdef RE_ENABLE_I18N
3183 case MB_CHAR:
3184 /* Check whether the array has enough space. */
3185 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3187 wchar_t *new_mbchars;
3188 /* Not enough, realloc it. */
3189 /* +1 in case of mbcset->nmbchars is 0. */
3190 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3191 /* Use realloc since array is NULL if *alloc == 0. */
3192 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3193 mbchar_alloc);
3194 if (BE (new_mbchars == NULL, 0))
3195 goto parse_bracket_exp_espace;
3196 mbcset->mbchars = new_mbchars;
3198 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3199 break;
3200 #endif /* RE_ENABLE_I18N */
3201 case EQUIV_CLASS:
3202 *err = build_equiv_class (sbcset,
3203 #ifdef RE_ENABLE_I18N
3204 mbcset, &equiv_class_alloc,
3205 #endif /* RE_ENABLE_I18N */
3206 start_elem.opr.name);
3207 if (BE (*err != REG_NOERROR, 0))
3208 goto parse_bracket_exp_free_return;
3209 break;
3210 case COLL_SYM:
3211 *err = build_collating_symbol (sbcset,
3212 #ifdef RE_ENABLE_I18N
3213 mbcset, &coll_sym_alloc,
3214 #endif /* RE_ENABLE_I18N */
3215 start_elem.opr.name);
3216 if (BE (*err != REG_NOERROR, 0))
3217 goto parse_bracket_exp_free_return;
3218 break;
3219 case CHAR_CLASS:
3220 *err = build_charclass (regexp->trans, sbcset,
3221 #ifdef RE_ENABLE_I18N
3222 mbcset, &char_class_alloc,
3223 #endif /* RE_ENABLE_I18N */
3224 start_elem.opr.name, syntax);
3225 if (BE (*err != REG_NOERROR, 0))
3226 goto parse_bracket_exp_free_return;
3227 break;
3228 default:
3229 assert (0);
3230 break;
3233 if (BE (token->type == END_OF_RE, 0))
3235 *err = REG_EBRACK;
3236 goto parse_bracket_exp_free_return;
3238 if (token->type == OP_CLOSE_BRACKET)
3239 break;
3242 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3244 /* If it is non-matching list. */
3245 if (non_match)
3246 bitset_not (sbcset);
3248 #ifdef RE_ENABLE_I18N
3249 /* Ensure only single byte characters are set. */
3250 if (dfa->mb_cur_max > 1)
3251 bitset_mask (sbcset, dfa->sb_char);
3253 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3254 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3255 || mbcset->non_match)))
3257 bin_tree_t *mbc_tree;
3258 int sbc_idx;
3259 /* Build a tree for complex bracket. */
3260 dfa->has_mb_node = 1;
3261 br_token.type = COMPLEX_BRACKET;
3262 br_token.opr.mbcset = mbcset;
3263 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3264 if (BE (mbc_tree == NULL, 0))
3265 goto parse_bracket_exp_espace;
3266 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3267 if (sbcset[sbc_idx])
3268 break;
3269 /* If there are no bits set in sbcset, there is no point
3270 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3271 if (sbc_idx < BITSET_WORDS)
3273 /* Build a tree for simple bracket. */
3274 br_token.type = SIMPLE_BRACKET;
3275 br_token.opr.sbcset = sbcset;
3276 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3277 if (BE (work_tree == NULL, 0))
3278 goto parse_bracket_exp_espace;
3280 /* Then join them by ALT node. */
3281 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3282 if (BE (work_tree == NULL, 0))
3283 goto parse_bracket_exp_espace;
3285 else
3287 re_free (sbcset);
3288 work_tree = mbc_tree;
3291 else
3292 #endif /* not RE_ENABLE_I18N */
3294 #ifdef RE_ENABLE_I18N
3295 free_charset (mbcset);
3296 #endif
3297 /* Build a tree for simple bracket. */
3298 br_token.type = SIMPLE_BRACKET;
3299 br_token.opr.sbcset = sbcset;
3300 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3301 if (BE (work_tree == NULL, 0))
3302 goto parse_bracket_exp_espace;
3304 return work_tree;
3306 parse_bracket_exp_espace:
3307 *err = REG_ESPACE;
3308 parse_bracket_exp_free_return:
3309 re_free (sbcset);
3310 #ifdef RE_ENABLE_I18N
3311 free_charset (mbcset);
3312 #endif /* RE_ENABLE_I18N */
3313 return NULL;
3316 /* Parse an element in the bracket expression. */
3318 static reg_errcode_t
3319 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3320 re_token_t *token, int token_len, re_dfa_t *dfa,
3321 reg_syntax_t syntax, int accept_hyphen)
3323 #ifdef RE_ENABLE_I18N
3324 int cur_char_size;
3325 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3326 if (cur_char_size > 1)
3328 elem->type = MB_CHAR;
3329 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3330 re_string_skip_bytes (regexp, cur_char_size);
3331 return REG_NOERROR;
3333 #endif /* RE_ENABLE_I18N */
3334 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3335 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3336 || token->type == OP_OPEN_EQUIV_CLASS)
3337 return parse_bracket_symbol (elem, regexp, token);
3338 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3340 /* A '-' must only appear as anything but a range indicator before
3341 the closing bracket. Everything else is an error. */
3342 re_token_t token2;
3343 (void) peek_token_bracket (&token2, regexp, syntax);
3344 if (token2.type != OP_CLOSE_BRACKET)
3345 /* The actual error value is not standardized since this whole
3346 case is undefined. But ERANGE makes good sense. */
3347 return REG_ERANGE;
3349 elem->type = SB_CHAR;
3350 elem->opr.ch = token->opr.c;
3351 return REG_NOERROR;
3354 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3355 such as [:<character_class>:], [.<collating_element>.], and
3356 [=<equivalent_class>=]. */
3358 static reg_errcode_t
3359 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3360 re_token_t *token)
3362 unsigned char ch, delim = token->opr.c;
3363 int i = 0;
3364 if (re_string_eoi(regexp))
3365 return REG_EBRACK;
3366 for (;; ++i)
3368 if (i >= BRACKET_NAME_BUF_SIZE)
3369 return REG_EBRACK;
3370 if (token->type == OP_OPEN_CHAR_CLASS)
3371 ch = re_string_fetch_byte_case (regexp);
3372 else
3373 ch = re_string_fetch_byte (regexp);
3374 if (re_string_eoi(regexp))
3375 return REG_EBRACK;
3376 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3377 break;
3378 elem->opr.name[i] = ch;
3380 re_string_skip_bytes (regexp, 1);
3381 elem->opr.name[i] = '\0';
3382 switch (token->type)
3384 case OP_OPEN_COLL_ELEM:
3385 elem->type = COLL_SYM;
3386 break;
3387 case OP_OPEN_EQUIV_CLASS:
3388 elem->type = EQUIV_CLASS;
3389 break;
3390 case OP_OPEN_CHAR_CLASS:
3391 elem->type = CHAR_CLASS;
3392 break;
3393 default:
3394 break;
3396 return REG_NOERROR;
3399 /* Helper function for parse_bracket_exp.
3400 Build the equivalence class which is represented by NAME.
3401 The result are written to MBCSET and SBCSET.
3402 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3403 is a pointer argument since we may update it. */
3405 static reg_errcode_t
3406 #ifdef RE_ENABLE_I18N
3407 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3408 int *equiv_class_alloc, const unsigned char *name)
3409 #else /* not RE_ENABLE_I18N */
3410 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3411 #endif /* not RE_ENABLE_I18N */
3413 #ifdef _LIBC
3414 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3415 if (nrules != 0)
3417 const int32_t *table, *indirect;
3418 const unsigned char *weights, *extra, *cp;
3419 unsigned char char_buf[2];
3420 int32_t idx1, idx2;
3421 unsigned int ch;
3422 size_t len;
3423 /* Calculate the index for equivalence class. */
3424 cp = name;
3425 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3426 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3427 _NL_COLLATE_WEIGHTMB);
3428 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3429 _NL_COLLATE_EXTRAMB);
3430 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3431 _NL_COLLATE_INDIRECTMB);
3432 idx1 = findidx (table, indirect, extra, &cp, -1);
3433 if (BE (idx1 == 0 || *cp != '\0', 0))
3434 /* This isn't a valid character. */
3435 return REG_ECOLLATE;
3437 /* Build single byte matcing table for this equivalence class. */
3438 len = weights[idx1 & 0xffffff];
3439 for (ch = 0; ch < SBC_MAX; ++ch)
3441 char_buf[0] = ch;
3442 cp = char_buf;
3443 idx2 = findidx (table, indirect, extra, &cp, 1);
3445 idx2 = table[ch];
3447 if (idx2 == 0)
3448 /* This isn't a valid character. */
3449 continue;
3450 /* Compare only if the length matches and the collation rule
3451 index is the same. */
3452 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3454 int cnt = 0;
3456 while (cnt <= len &&
3457 weights[(idx1 & 0xffffff) + 1 + cnt]
3458 == weights[(idx2 & 0xffffff) + 1 + cnt])
3459 ++cnt;
3461 if (cnt > len)
3462 bitset_set (sbcset, ch);
3465 /* Check whether the array has enough space. */
3466 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3468 /* Not enough, realloc it. */
3469 /* +1 in case of mbcset->nequiv_classes is 0. */
3470 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3471 /* Use realloc since the array is NULL if *alloc == 0. */
3472 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3473 int32_t,
3474 new_equiv_class_alloc);
3475 if (BE (new_equiv_classes == NULL, 0))
3476 return REG_ESPACE;
3477 mbcset->equiv_classes = new_equiv_classes;
3478 *equiv_class_alloc = new_equiv_class_alloc;
3480 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3482 else
3483 #endif /* _LIBC */
3485 if (BE (strlen ((const char *) name) != 1, 0))
3486 return REG_ECOLLATE;
3487 bitset_set (sbcset, *name);
3489 return REG_NOERROR;
3492 /* Helper function for parse_bracket_exp.
3493 Build the character class which is represented by NAME.
3494 The result are written to MBCSET and SBCSET.
3495 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3496 is a pointer argument since we may update it. */
3498 static reg_errcode_t
3499 #ifdef RE_ENABLE_I18N
3500 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3501 re_charset_t *mbcset, int *char_class_alloc,
3502 const unsigned char *class_name, reg_syntax_t syntax)
3503 #else /* not RE_ENABLE_I18N */
3504 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3505 const unsigned char *class_name, reg_syntax_t syntax)
3506 #endif /* not RE_ENABLE_I18N */
3508 int i;
3509 const char *name = (const char *) class_name;
3511 /* In case of REG_ICASE "upper" and "lower" match the both of
3512 upper and lower cases. */
3513 if ((syntax & RE_ICASE)
3514 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3515 name = "alpha";
3517 #ifdef RE_ENABLE_I18N
3518 /* Check the space of the arrays. */
3519 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3521 /* Not enough, realloc it. */
3522 /* +1 in case of mbcset->nchar_classes is 0. */
3523 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3524 /* Use realloc since array is NULL if *alloc == 0. */
3525 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3526 new_char_class_alloc);
3527 if (BE (new_char_classes == NULL, 0))
3528 return REG_ESPACE;
3529 mbcset->char_classes = new_char_classes;
3530 *char_class_alloc = new_char_class_alloc;
3532 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3533 #endif /* RE_ENABLE_I18N */
3535 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3536 do { \
3537 if (BE (trans != NULL, 0)) \
3539 for (i = 0; i < SBC_MAX; ++i) \
3540 if (ctype_func (i)) \
3541 bitset_set (sbcset, trans[i]); \
3543 else \
3545 for (i = 0; i < SBC_MAX; ++i) \
3546 if (ctype_func (i)) \
3547 bitset_set (sbcset, i); \
3549 } while (0)
3551 if (strcmp (name, "alnum") == 0)
3552 BUILD_CHARCLASS_LOOP (isalnum);
3553 else if (strcmp (name, "cntrl") == 0)
3554 BUILD_CHARCLASS_LOOP (iscntrl);
3555 else if (strcmp (name, "lower") == 0)
3556 BUILD_CHARCLASS_LOOP (islower);
3557 else if (strcmp (name, "space") == 0)
3558 BUILD_CHARCLASS_LOOP (isspace);
3559 else if (strcmp (name, "alpha") == 0)
3560 BUILD_CHARCLASS_LOOP (isalpha);
3561 else if (strcmp (name, "digit") == 0)
3562 BUILD_CHARCLASS_LOOP (isdigit);
3563 else if (strcmp (name, "print") == 0)
3564 BUILD_CHARCLASS_LOOP (isprint);
3565 else if (strcmp (name, "upper") == 0)
3566 BUILD_CHARCLASS_LOOP (isupper);
3567 else if (strcmp (name, "blank") == 0)
3568 BUILD_CHARCLASS_LOOP (isblank);
3569 else if (strcmp (name, "graph") == 0)
3570 BUILD_CHARCLASS_LOOP (isgraph);
3571 else if (strcmp (name, "punct") == 0)
3572 BUILD_CHARCLASS_LOOP (ispunct);
3573 else if (strcmp (name, "xdigit") == 0)
3574 BUILD_CHARCLASS_LOOP (isxdigit);
3575 else
3576 return REG_ECTYPE;
3578 return REG_NOERROR;
3581 static bin_tree_t *
3582 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3583 const unsigned char *class_name,
3584 const unsigned char *extra, int non_match,
3585 reg_errcode_t *err)
3587 re_bitset_ptr_t sbcset;
3588 #ifdef RE_ENABLE_I18N
3589 re_charset_t *mbcset;
3590 int alloc = 0;
3591 #endif /* not RE_ENABLE_I18N */
3592 reg_errcode_t ret;
3593 re_token_t br_token;
3594 bin_tree_t *tree;
3596 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3597 #ifdef RE_ENABLE_I18N
3598 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3599 #endif /* RE_ENABLE_I18N */
3601 #ifdef RE_ENABLE_I18N
3602 if (BE (sbcset == NULL || mbcset == NULL, 0))
3603 #else /* not RE_ENABLE_I18N */
3604 if (BE (sbcset == NULL, 0))
3605 #endif /* not RE_ENABLE_I18N */
3607 *err = REG_ESPACE;
3608 return NULL;
3611 if (non_match)
3613 #ifdef RE_ENABLE_I18N
3614 mbcset->non_match = 1;
3615 #endif /* not RE_ENABLE_I18N */
3618 /* We don't care the syntax in this case. */
3619 ret = build_charclass (trans, sbcset,
3620 #ifdef RE_ENABLE_I18N
3621 mbcset, &alloc,
3622 #endif /* RE_ENABLE_I18N */
3623 class_name, 0);
3625 if (BE (ret != REG_NOERROR, 0))
3627 re_free (sbcset);
3628 #ifdef RE_ENABLE_I18N
3629 free_charset (mbcset);
3630 #endif /* RE_ENABLE_I18N */
3631 *err = ret;
3632 return NULL;
3634 /* \w match '_' also. */
3635 for (; *extra; extra++)
3636 bitset_set (sbcset, *extra);
3638 /* If it is non-matching list. */
3639 if (non_match)
3640 bitset_not (sbcset);
3642 #ifdef RE_ENABLE_I18N
3643 /* Ensure only single byte characters are set. */
3644 if (dfa->mb_cur_max > 1)
3645 bitset_mask (sbcset, dfa->sb_char);
3646 #endif
3648 /* Build a tree for simple bracket. */
3649 br_token.type = SIMPLE_BRACKET;
3650 br_token.opr.sbcset = sbcset;
3651 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3652 if (BE (tree == NULL, 0))
3653 goto build_word_op_espace;
3655 #ifdef RE_ENABLE_I18N
3656 if (dfa->mb_cur_max > 1)
3658 bin_tree_t *mbc_tree;
3659 /* Build a tree for complex bracket. */
3660 br_token.type = COMPLEX_BRACKET;
3661 br_token.opr.mbcset = mbcset;
3662 dfa->has_mb_node = 1;
3663 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3664 if (BE (mbc_tree == NULL, 0))
3665 goto build_word_op_espace;
3666 /* Then join them by ALT node. */
3667 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3668 if (BE (mbc_tree != NULL, 1))
3669 return tree;
3671 else
3673 free_charset (mbcset);
3674 return tree;
3676 #else /* not RE_ENABLE_I18N */
3677 return tree;
3678 #endif /* not RE_ENABLE_I18N */
3680 build_word_op_espace:
3681 re_free (sbcset);
3682 #ifdef RE_ENABLE_I18N
3683 free_charset (mbcset);
3684 #endif /* RE_ENABLE_I18N */
3685 *err = REG_ESPACE;
3686 return NULL;
3689 /* This is intended for the expressions like "a{1,3}".
3690 Fetch a number from `input', and return the number.
3691 Return -1, if the number field is empty like "{,1}".
3692 Return -2, If an error is occured. */
3694 static int
3695 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3697 int num = -1;
3698 unsigned char c;
3699 while (1)
3701 fetch_token (token, input, syntax);
3702 c = token->opr.c;
3703 if (BE (token->type == END_OF_RE, 0))
3704 return -2;
3705 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3706 break;
3707 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3708 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3709 num = (num > RE_DUP_MAX) ? -2 : num;
3711 return num;
3714 #ifdef RE_ENABLE_I18N
3715 static void
3716 free_charset (re_charset_t *cset)
3718 re_free (cset->mbchars);
3719 # ifdef _LIBC
3720 re_free (cset->coll_syms);
3721 re_free (cset->equiv_classes);
3722 re_free (cset->range_starts);
3723 re_free (cset->range_ends);
3724 # endif
3725 re_free (cset->char_classes);
3726 re_free (cset);
3728 #endif /* RE_ENABLE_I18N */
3730 /* Functions for binary tree operation. */
3732 /* Create a tree node. */
3734 static bin_tree_t *
3735 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3736 re_token_type_t type)
3738 re_token_t t;
3739 t.type = type;
3740 return create_token_tree (dfa, left, right, &t);
3743 static bin_tree_t *
3744 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3745 const re_token_t *token)
3747 bin_tree_t *tree;
3748 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3750 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3752 if (storage == NULL)
3753 return NULL;
3754 storage->next = dfa->str_tree_storage;
3755 dfa->str_tree_storage = storage;
3756 dfa->str_tree_storage_idx = 0;
3758 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3760 tree->parent = NULL;
3761 tree->left = left;
3762 tree->right = right;
3763 tree->token = *token;
3764 tree->token.duplicated = 0;
3765 tree->token.opt_subexp = 0;
3766 tree->first = NULL;
3767 tree->next = NULL;
3768 tree->node_idx = -1;
3770 if (left != NULL)
3771 left->parent = tree;
3772 if (right != NULL)
3773 right->parent = tree;
3774 return tree;
3777 /* Mark the tree SRC as an optional subexpression.
3778 To be called from preorder or postorder. */
3780 static reg_errcode_t
3781 mark_opt_subexp (void *extra, bin_tree_t *node)
3783 int idx = (int) (long) extra;
3784 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3785 node->token.opt_subexp = 1;
3787 return REG_NOERROR;
3790 /* Free the allocated memory inside NODE. */
3792 static void
3793 free_token (re_token_t *node)
3795 #ifdef RE_ENABLE_I18N
3796 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3797 free_charset (node->opr.mbcset);
3798 else
3799 #endif /* RE_ENABLE_I18N */
3800 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3801 re_free (node->opr.sbcset);
3804 /* Worker function for tree walking. Free the allocated memory inside NODE
3805 and its children. */
3807 static reg_errcode_t
3808 free_tree (void *extra, bin_tree_t *node)
3810 free_token (&node->token);
3811 return REG_NOERROR;
3815 /* Duplicate the node SRC, and return new node. This is a preorder
3816 visit similar to the one implemented by the generic visitor, but
3817 we need more infrastructure to maintain two parallel trees --- so,
3818 it's easier to duplicate. */
3820 static bin_tree_t *
3821 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3823 const bin_tree_t *node;
3824 bin_tree_t *dup_root;
3825 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3827 for (node = root; ; )
3829 /* Create a new tree and link it back to the current parent. */
3830 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3831 if (*p_new == NULL)
3832 return NULL;
3833 (*p_new)->parent = dup_node;
3834 (*p_new)->token.duplicated = 1;
3835 dup_node = *p_new;
3837 /* Go to the left node, or up and to the right. */
3838 if (node->left)
3840 node = node->left;
3841 p_new = &dup_node->left;
3843 else
3845 const bin_tree_t *prev = NULL;
3846 while (node->right == prev || node->right == NULL)
3848 prev = node;
3849 node = node->parent;
3850 dup_node = dup_node->parent;
3851 if (!node)
3852 return dup_root;
3854 node = node->right;
3855 p_new = &dup_node->right;