Update copyright dates with scripts/update-copyrights.
[glibc.git] / posix / regcomp.c
blob8e6d2f8cb5b3a28d3cb5ad72a7ea9815842342c4
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2015 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 #include <stdint.h>
22 #ifdef _LIBC
23 # include <locale/weight.h>
24 #endif
26 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
27 size_t length, reg_syntax_t syntax);
28 static void re_compile_fastmap_iter (regex_t *bufp,
29 const re_dfastate_t *init_state,
30 char *fastmap);
31 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
32 #ifdef RE_ENABLE_I18N
33 static void free_charset (re_charset_t *cset);
34 #endif /* RE_ENABLE_I18N */
35 static void free_workarea_compile (regex_t *preg);
36 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
37 #ifdef RE_ENABLE_I18N
38 static void optimize_utf8 (re_dfa_t *dfa);
39 #endif
40 static reg_errcode_t analyze (regex_t *preg);
41 static reg_errcode_t preorder (bin_tree_t *root,
42 reg_errcode_t (fn (void *, bin_tree_t *)),
43 void *extra);
44 static reg_errcode_t postorder (bin_tree_t *root,
45 reg_errcode_t (fn (void *, bin_tree_t *)),
46 void *extra);
47 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
48 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
49 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
50 bin_tree_t *node);
51 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
52 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
53 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
54 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
55 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
56 unsigned int constraint);
57 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
58 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
59 int node, int root);
60 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
61 static int fetch_number (re_string_t *input, re_token_t *token,
62 reg_syntax_t syntax);
63 static int peek_token (re_token_t *token, re_string_t *input,
64 reg_syntax_t syntax) internal_function;
65 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
66 reg_syntax_t syntax, reg_errcode_t *err);
67 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
74 re_token_t *token, reg_syntax_t syntax,
75 int nest, reg_errcode_t *err);
76 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
77 re_token_t *token, reg_syntax_t syntax,
78 int nest, reg_errcode_t *err);
79 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
80 re_dfa_t *dfa, re_token_t *token,
81 reg_syntax_t syntax, reg_errcode_t *err);
82 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
83 re_token_t *token, reg_syntax_t syntax,
84 reg_errcode_t *err);
85 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
86 re_string_t *regexp,
87 re_token_t *token, int token_len,
88 re_dfa_t *dfa,
89 reg_syntax_t syntax,
90 int accept_hyphen);
91 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
92 re_string_t *regexp,
93 re_token_t *token);
94 #ifdef RE_ENABLE_I18N
95 static reg_errcode_t build_equiv_class (bitset_t sbcset,
96 re_charset_t *mbcset,
97 int *equiv_class_alloc,
98 const unsigned char *name);
99 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
100 bitset_t sbcset,
101 re_charset_t *mbcset,
102 int *char_class_alloc,
103 const unsigned char *class_name,
104 reg_syntax_t syntax);
105 #else /* not RE_ENABLE_I18N */
106 static reg_errcode_t build_equiv_class (bitset_t sbcset,
107 const unsigned char *name);
108 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
109 bitset_t sbcset,
110 const unsigned char *class_name,
111 reg_syntax_t syntax);
112 #endif /* not RE_ENABLE_I18N */
113 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
114 RE_TRANSLATE_TYPE trans,
115 const unsigned char *class_name,
116 const unsigned char *extra,
117 int non_match, reg_errcode_t *err);
118 static bin_tree_t *create_tree (re_dfa_t *dfa,
119 bin_tree_t *left, bin_tree_t *right,
120 re_token_type_t type);
121 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
122 bin_tree_t *left, bin_tree_t *right,
123 const re_token_t *token);
124 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
125 static void free_token (re_token_t *node);
126 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
127 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
129 /* This table gives an error message for each of the error codes listed
130 in regex.h. Obviously the order here has to be same as there.
131 POSIX doesn't require that we do anything for REG_NOERROR,
132 but why not be nice? */
134 const char __re_error_msgid[] attribute_hidden =
136 #define REG_NOERROR_IDX 0
137 gettext_noop ("Success") /* REG_NOERROR */
138 "\0"
139 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
140 gettext_noop ("No match") /* REG_NOMATCH */
141 "\0"
142 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
143 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
144 "\0"
145 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
146 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
147 "\0"
148 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
149 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
150 "\0"
151 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
152 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
153 "\0"
154 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
155 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
156 "\0"
157 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
158 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
159 "\0"
160 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
161 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
162 "\0"
163 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
164 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
165 "\0"
166 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
167 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
168 "\0"
169 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
170 gettext_noop ("Invalid range end") /* REG_ERANGE */
171 "\0"
172 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
173 gettext_noop ("Memory exhausted") /* REG_ESPACE */
174 "\0"
175 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
176 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
177 "\0"
178 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
179 gettext_noop ("Premature end of regular expression") /* REG_EEND */
180 "\0"
181 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
182 gettext_noop ("Regular expression too big") /* REG_ESIZE */
183 "\0"
184 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
185 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
188 const size_t __re_error_msgid_idx[] attribute_hidden =
190 REG_NOERROR_IDX,
191 REG_NOMATCH_IDX,
192 REG_BADPAT_IDX,
193 REG_ECOLLATE_IDX,
194 REG_ECTYPE_IDX,
195 REG_EESCAPE_IDX,
196 REG_ESUBREG_IDX,
197 REG_EBRACK_IDX,
198 REG_EPAREN_IDX,
199 REG_EBRACE_IDX,
200 REG_BADBR_IDX,
201 REG_ERANGE_IDX,
202 REG_ESPACE_IDX,
203 REG_BADRPT_IDX,
204 REG_EEND_IDX,
205 REG_ESIZE_IDX,
206 REG_ERPAREN_IDX
209 /* Entry points for GNU code. */
211 /* re_compile_pattern is the GNU regular expression compiler: it
212 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
213 Returns 0 if the pattern was valid, otherwise an error string.
215 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
216 are set in BUFP on entry. */
218 const char *
219 re_compile_pattern (pattern, length, bufp)
220 const char *pattern;
221 size_t length;
222 struct re_pattern_buffer *bufp;
224 reg_errcode_t ret;
226 /* And GNU code determines whether or not to get register information
227 by passing null for the REGS argument to re_match, etc., not by
228 setting no_sub, unless RE_NO_SUB is set. */
229 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
231 /* Match anchors at newline. */
232 bufp->newline_anchor = 1;
234 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
236 if (!ret)
237 return NULL;
238 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
240 #ifdef _LIBC
241 weak_alias (__re_compile_pattern, re_compile_pattern)
242 #endif
244 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
245 also be assigned to arbitrarily: each pattern buffer stores its own
246 syntax, so it can be changed between regex compilations. */
247 /* This has no initializer because initialized variables in Emacs
248 become read-only after dumping. */
249 reg_syntax_t re_syntax_options;
252 /* Specify the precise syntax of regexps for compilation. This provides
253 for compatibility for various utilities which historically have
254 different, incompatible syntaxes.
256 The argument SYNTAX is a bit mask comprised of the various bits
257 defined in regex.h. We return the old syntax. */
259 reg_syntax_t
260 re_set_syntax (syntax)
261 reg_syntax_t syntax;
263 reg_syntax_t ret = re_syntax_options;
265 re_syntax_options = syntax;
266 return ret;
268 #ifdef _LIBC
269 weak_alias (__re_set_syntax, re_set_syntax)
270 #endif
273 re_compile_fastmap (bufp)
274 struct re_pattern_buffer *bufp;
276 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
277 char *fastmap = bufp->fastmap;
279 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
280 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
281 if (dfa->init_state != dfa->init_state_word)
282 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
283 if (dfa->init_state != dfa->init_state_nl)
284 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
285 if (dfa->init_state != dfa->init_state_begbuf)
286 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
287 bufp->fastmap_accurate = 1;
288 return 0;
290 #ifdef _LIBC
291 weak_alias (__re_compile_fastmap, re_compile_fastmap)
292 #endif
294 static inline void
295 __attribute ((always_inline))
296 re_set_fastmap (char *fastmap, int icase, int ch)
298 fastmap[ch] = 1;
299 if (icase)
300 fastmap[tolower (ch)] = 1;
303 /* Helper function for re_compile_fastmap.
304 Compile fastmap for the initial_state INIT_STATE. */
306 static void
307 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
308 char *fastmap)
310 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
311 int node_cnt;
312 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
313 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
315 int node = init_state->nodes.elems[node_cnt];
316 re_token_type_t type = dfa->nodes[node].type;
318 if (type == CHARACTER)
320 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
321 #ifdef RE_ENABLE_I18N
322 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
324 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
325 wchar_t wc;
326 mbstate_t state;
328 p = buf;
329 *p++ = dfa->nodes[node].opr.c;
330 while (++node < dfa->nodes_len
331 && dfa->nodes[node].type == CHARACTER
332 && dfa->nodes[node].mb_partial)
333 *p++ = dfa->nodes[node].opr.c;
334 memset (&state, '\0', sizeof (state));
335 if (__mbrtowc (&wc, (const char *) buf, p - buf,
336 &state) == p - buf
337 && (__wcrtomb ((char *) buf, towlower (wc), &state)
338 != (size_t) -1))
339 re_set_fastmap (fastmap, 0, buf[0]);
341 #endif
343 else if (type == SIMPLE_BRACKET)
345 int i, ch;
346 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
348 int j;
349 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
350 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
351 if (w & ((bitset_word_t) 1 << j))
352 re_set_fastmap (fastmap, icase, ch);
355 #ifdef RE_ENABLE_I18N
356 else if (type == COMPLEX_BRACKET)
358 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
359 int i;
361 # ifdef _LIBC
362 /* See if we have to try all bytes which start multiple collation
363 elements.
364 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
365 collation element, and don't catch 'b' since 'b' is
366 the only collation element which starts from 'b' (and
367 it is caught by SIMPLE_BRACKET). */
368 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
369 && (cset->ncoll_syms || cset->nranges))
371 const int32_t *table = (const int32_t *)
372 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
373 for (i = 0; i < SBC_MAX; ++i)
374 if (table[i] < 0)
375 re_set_fastmap (fastmap, icase, i);
377 # endif /* _LIBC */
379 /* See if we have to start the match at all multibyte characters,
380 i.e. where we would not find an invalid sequence. This only
381 applies to multibyte character sets; for single byte character
382 sets, the SIMPLE_BRACKET again suffices. */
383 if (dfa->mb_cur_max > 1
384 && (cset->nchar_classes || cset->non_match || cset->nranges
385 # ifdef _LIBC
386 || cset->nequiv_classes
387 # endif /* _LIBC */
390 unsigned char c = 0;
393 mbstate_t mbs;
394 memset (&mbs, 0, sizeof (mbs));
395 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
396 re_set_fastmap (fastmap, false, (int) c);
398 while (++c != 0);
401 else
403 /* ... Else catch all bytes which can start the mbchars. */
404 for (i = 0; i < cset->nmbchars; ++i)
406 char buf[256];
407 mbstate_t state;
408 memset (&state, '\0', sizeof (state));
409 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
410 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
411 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
413 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
414 != (size_t) -1)
415 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
420 #endif /* RE_ENABLE_I18N */
421 else if (type == OP_PERIOD
422 #ifdef RE_ENABLE_I18N
423 || type == OP_UTF8_PERIOD
424 #endif /* RE_ENABLE_I18N */
425 || type == END_OF_RE)
427 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
428 if (type == END_OF_RE)
429 bufp->can_be_null = 1;
430 return;
435 /* Entry point for POSIX code. */
436 /* regcomp takes a regular expression as a string and compiles it.
438 PREG is a regex_t *. We do not expect any fields to be initialized,
439 since POSIX says we shouldn't. Thus, we set
441 `buffer' to the compiled pattern;
442 `used' to the length of the compiled pattern;
443 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
444 REG_EXTENDED bit in CFLAGS is set; otherwise, to
445 RE_SYNTAX_POSIX_BASIC;
446 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
447 `fastmap' to an allocated space for the fastmap;
448 `fastmap_accurate' to zero;
449 `re_nsub' to the number of subexpressions in PATTERN.
451 PATTERN is the address of the pattern string.
453 CFLAGS is a series of bits which affect compilation.
455 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
456 use POSIX basic syntax.
458 If REG_NEWLINE is set, then . and [^...] don't match newline.
459 Also, regexec will try a match beginning after every newline.
461 If REG_ICASE is set, then we considers upper- and lowercase
462 versions of letters to be equivalent when matching.
464 If REG_NOSUB is set, then when PREG is passed to regexec, that
465 routine will report only success or failure, and nothing about the
466 registers.
468 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
469 the return codes and their meanings.) */
472 regcomp (preg, pattern, cflags)
473 regex_t *__restrict preg;
474 const char *__restrict pattern;
475 int cflags;
477 reg_errcode_t ret;
478 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
479 : RE_SYNTAX_POSIX_BASIC);
481 preg->buffer = NULL;
482 preg->allocated = 0;
483 preg->used = 0;
485 /* Try to allocate space for the fastmap. */
486 preg->fastmap = re_malloc (char, SBC_MAX);
487 if (BE (preg->fastmap == NULL, 0))
488 return REG_ESPACE;
490 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
492 /* If REG_NEWLINE is set, newlines are treated differently. */
493 if (cflags & REG_NEWLINE)
494 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
495 syntax &= ~RE_DOT_NEWLINE;
496 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
497 /* It also changes the matching behavior. */
498 preg->newline_anchor = 1;
500 else
501 preg->newline_anchor = 0;
502 preg->no_sub = !!(cflags & REG_NOSUB);
503 preg->translate = NULL;
505 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
507 /* POSIX doesn't distinguish between an unmatched open-group and an
508 unmatched close-group: both are REG_EPAREN. */
509 if (ret == REG_ERPAREN)
510 ret = REG_EPAREN;
512 /* We have already checked preg->fastmap != NULL. */
513 if (BE (ret == REG_NOERROR, 1))
514 /* Compute the fastmap now, since regexec cannot modify the pattern
515 buffer. This function never fails in this implementation. */
516 (void) re_compile_fastmap (preg);
517 else
519 /* Some error occurred while compiling the expression. */
520 re_free (preg->fastmap);
521 preg->fastmap = NULL;
524 return (int) ret;
526 #ifdef _LIBC
527 weak_alias (__regcomp, regcomp)
528 #endif
530 /* Returns a message corresponding to an error code, ERRCODE, returned
531 from either regcomp or regexec. We don't use PREG here. */
533 size_t
534 regerror (errcode, preg, errbuf, errbuf_size)
535 int errcode;
536 const regex_t *__restrict preg;
537 char *__restrict errbuf;
538 size_t errbuf_size;
540 const char *msg;
541 size_t msg_size;
543 if (BE (errcode < 0
544 || errcode >= (int) (sizeof (__re_error_msgid_idx)
545 / sizeof (__re_error_msgid_idx[0])), 0))
546 /* Only error codes returned by the rest of the code should be passed
547 to this routine. If we are given anything else, or if other regex
548 code generates an invalid error code, then the program has a bug.
549 Dump core so we can fix it. */
550 abort ();
552 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
554 msg_size = strlen (msg) + 1; /* Includes the null. */
556 if (BE (errbuf_size != 0, 1))
558 if (BE (msg_size > errbuf_size, 0))
560 #if defined HAVE_MEMPCPY || defined _LIBC
561 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
562 #else
563 memcpy (errbuf, msg, errbuf_size - 1);
564 errbuf[errbuf_size - 1] = 0;
565 #endif
567 else
568 memcpy (errbuf, msg, msg_size);
571 return msg_size;
573 #ifdef _LIBC
574 weak_alias (__regerror, regerror)
575 #endif
578 #ifdef RE_ENABLE_I18N
579 /* This static array is used for the map to single-byte characters when
580 UTF-8 is used. Otherwise we would allocate memory just to initialize
581 it the same all the time. UTF-8 is the preferred encoding so this is
582 a worthwhile optimization. */
583 static const bitset_t utf8_sb_map =
585 /* Set the first 128 bits. */
586 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
588 #endif
591 static void
592 free_dfa_content (re_dfa_t *dfa)
594 int i, j;
596 if (dfa->nodes)
597 for (i = 0; i < dfa->nodes_len; ++i)
598 free_token (dfa->nodes + i);
599 re_free (dfa->nexts);
600 for (i = 0; i < dfa->nodes_len; ++i)
602 if (dfa->eclosures != NULL)
603 re_node_set_free (dfa->eclosures + i);
604 if (dfa->inveclosures != NULL)
605 re_node_set_free (dfa->inveclosures + i);
606 if (dfa->edests != NULL)
607 re_node_set_free (dfa->edests + i);
609 re_free (dfa->edests);
610 re_free (dfa->eclosures);
611 re_free (dfa->inveclosures);
612 re_free (dfa->nodes);
614 if (dfa->state_table)
615 for (i = 0; i <= dfa->state_hash_mask; ++i)
617 struct re_state_table_entry *entry = dfa->state_table + i;
618 for (j = 0; j < entry->num; ++j)
620 re_dfastate_t *state = entry->array[j];
621 free_state (state);
623 re_free (entry->array);
625 re_free (dfa->state_table);
626 #ifdef RE_ENABLE_I18N
627 if (dfa->sb_char != utf8_sb_map)
628 re_free (dfa->sb_char);
629 #endif
630 re_free (dfa->subexp_map);
631 #ifdef DEBUG
632 re_free (dfa->re_str);
633 #endif
635 re_free (dfa);
639 /* Free dynamically allocated space used by PREG. */
641 void
642 regfree (preg)
643 regex_t *preg;
645 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
646 if (BE (dfa != NULL, 1))
647 free_dfa_content (dfa);
648 preg->buffer = NULL;
649 preg->allocated = 0;
651 re_free (preg->fastmap);
652 preg->fastmap = NULL;
654 re_free (preg->translate);
655 preg->translate = NULL;
657 #ifdef _LIBC
658 weak_alias (__regfree, regfree)
659 #endif
661 /* Entry points compatible with 4.2 BSD regex library. We don't define
662 them unless specifically requested. */
664 #if defined _REGEX_RE_COMP || defined _LIBC
666 /* BSD has one and only one pattern buffer. */
667 static struct re_pattern_buffer re_comp_buf;
669 char *
670 # ifdef _LIBC
671 /* Make these definitions weak in libc, so POSIX programs can redefine
672 these names if they don't use our functions, and still use
673 regcomp/regexec above without link errors. */
674 weak_function
675 # endif
676 re_comp (s)
677 const char *s;
679 reg_errcode_t ret;
680 char *fastmap;
682 if (!s)
684 if (!re_comp_buf.buffer)
685 return gettext ("No previous regular expression");
686 return 0;
689 if (re_comp_buf.buffer)
691 fastmap = re_comp_buf.fastmap;
692 re_comp_buf.fastmap = NULL;
693 __regfree (&re_comp_buf);
694 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
695 re_comp_buf.fastmap = fastmap;
698 if (re_comp_buf.fastmap == NULL)
700 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
701 if (re_comp_buf.fastmap == NULL)
702 return (char *) gettext (__re_error_msgid
703 + __re_error_msgid_idx[(int) REG_ESPACE]);
706 /* Since `re_exec' always passes NULL for the `regs' argument, we
707 don't need to initialize the pattern buffer fields which affect it. */
709 /* Match anchors at newlines. */
710 re_comp_buf.newline_anchor = 1;
712 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
714 if (!ret)
715 return NULL;
717 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
718 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
721 #ifdef _LIBC
722 libc_freeres_fn (free_mem)
724 __regfree (&re_comp_buf);
726 #endif
728 #endif /* _REGEX_RE_COMP */
730 /* Internal entry point.
731 Compile the regular expression PATTERN, whose length is LENGTH.
732 SYNTAX indicate regular expression's syntax. */
734 static reg_errcode_t
735 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
736 reg_syntax_t syntax)
738 reg_errcode_t err = REG_NOERROR;
739 re_dfa_t *dfa;
740 re_string_t regexp;
742 /* Initialize the pattern buffer. */
743 preg->fastmap_accurate = 0;
744 preg->syntax = syntax;
745 preg->not_bol = preg->not_eol = 0;
746 preg->used = 0;
747 preg->re_nsub = 0;
748 preg->can_be_null = 0;
749 preg->regs_allocated = REGS_UNALLOCATED;
751 /* Initialize the dfa. */
752 dfa = (re_dfa_t *) preg->buffer;
753 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
755 /* If zero allocated, but buffer is non-null, try to realloc
756 enough space. This loses if buffer's address is bogus, but
757 that is the user's responsibility. If ->buffer is NULL this
758 is a simple allocation. */
759 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
760 if (dfa == NULL)
761 return REG_ESPACE;
762 preg->allocated = sizeof (re_dfa_t);
763 preg->buffer = (unsigned char *) dfa;
765 preg->used = sizeof (re_dfa_t);
767 err = init_dfa (dfa, length);
768 if (BE (err != REG_NOERROR, 0))
770 free_dfa_content (dfa);
771 preg->buffer = NULL;
772 preg->allocated = 0;
773 return err;
775 #ifdef DEBUG
776 /* Note: length+1 will not overflow since it is checked in init_dfa. */
777 dfa->re_str = re_malloc (char, length + 1);
778 strncpy (dfa->re_str, pattern, length + 1);
779 #endif
781 __libc_lock_init (dfa->lock);
783 err = re_string_construct (&regexp, pattern, length, preg->translate,
784 syntax & RE_ICASE, dfa);
785 if (BE (err != REG_NOERROR, 0))
787 re_compile_internal_free_return:
788 free_workarea_compile (preg);
789 re_string_destruct (&regexp);
790 free_dfa_content (dfa);
791 preg->buffer = NULL;
792 preg->allocated = 0;
793 return err;
796 /* Parse the regular expression, and build a structure tree. */
797 preg->re_nsub = 0;
798 dfa->str_tree = parse (&regexp, preg, syntax, &err);
799 if (BE (dfa->str_tree == NULL, 0))
800 goto re_compile_internal_free_return;
802 /* Analyze the tree and create the nfa. */
803 err = analyze (preg);
804 if (BE (err != REG_NOERROR, 0))
805 goto re_compile_internal_free_return;
807 #ifdef RE_ENABLE_I18N
808 /* If possible, do searching in single byte encoding to speed things up. */
809 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
810 optimize_utf8 (dfa);
811 #endif
813 /* Then create the initial state of the dfa. */
814 err = create_initial_state (dfa);
816 /* Release work areas. */
817 free_workarea_compile (preg);
818 re_string_destruct (&regexp);
820 if (BE (err != REG_NOERROR, 0))
822 free_dfa_content (dfa);
823 preg->buffer = NULL;
824 preg->allocated = 0;
827 return err;
830 /* Initialize DFA. We use the length of the regular expression PAT_LEN
831 as the initial length of some arrays. */
833 static reg_errcode_t
834 init_dfa (re_dfa_t *dfa, size_t pat_len)
836 unsigned int table_size;
837 #ifndef _LIBC
838 char *codeset_name;
839 #endif
841 memset (dfa, '\0', sizeof (re_dfa_t));
843 /* Force allocation of str_tree_storage the first time. */
844 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
846 /* Avoid overflows. */
847 if (pat_len == SIZE_MAX)
848 return REG_ESPACE;
850 dfa->nodes_alloc = pat_len + 1;
851 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
853 /* table_size = 2 ^ ceil(log pat_len) */
854 for (table_size = 1; ; table_size <<= 1)
855 if (table_size > pat_len)
856 break;
858 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
859 dfa->state_hash_mask = table_size - 1;
861 dfa->mb_cur_max = MB_CUR_MAX;
862 #ifdef _LIBC
863 if (dfa->mb_cur_max == 6
864 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
865 dfa->is_utf8 = 1;
866 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
867 != 0);
868 #else
869 # ifdef HAVE_LANGINFO_CODESET
870 codeset_name = nl_langinfo (CODESET);
871 # else
872 codeset_name = getenv ("LC_ALL");
873 if (codeset_name == NULL || codeset_name[0] == '\0')
874 codeset_name = getenv ("LC_CTYPE");
875 if (codeset_name == NULL || codeset_name[0] == '\0')
876 codeset_name = getenv ("LANG");
877 if (codeset_name == NULL)
878 codeset_name = "";
879 else if (strchr (codeset_name, '.') != NULL)
880 codeset_name = strchr (codeset_name, '.') + 1;
881 # endif
883 if (strcasecmp (codeset_name, "UTF-8") == 0
884 || strcasecmp (codeset_name, "UTF8") == 0)
885 dfa->is_utf8 = 1;
887 /* We check exhaustively in the loop below if this charset is a
888 superset of ASCII. */
889 dfa->map_notascii = 0;
890 #endif
892 #ifdef RE_ENABLE_I18N
893 if (dfa->mb_cur_max > 1)
895 if (dfa->is_utf8)
896 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
897 else
899 int i, j, ch;
901 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
902 if (BE (dfa->sb_char == NULL, 0))
903 return REG_ESPACE;
905 /* Set the bits corresponding to single byte chars. */
906 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
907 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
909 wint_t wch = __btowc (ch);
910 if (wch != WEOF)
911 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
912 # ifndef _LIBC
913 if (isascii (ch) && wch != ch)
914 dfa->map_notascii = 1;
915 # endif
919 #endif
921 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
922 return REG_ESPACE;
923 return REG_NOERROR;
926 /* Initialize WORD_CHAR table, which indicate which character is
927 "word". In this case "word" means that it is the word construction
928 character used by some operators like "\<", "\>", etc. */
930 static void
931 internal_function
932 init_word_char (re_dfa_t *dfa)
934 dfa->word_ops_used = 1;
935 int i = 0;
936 int ch = 0;
937 if (BE (dfa->map_notascii == 0, 1))
939 if (sizeof (dfa->word_char[0]) == 8)
941 /* The extra temporaries here avoid "implicitly truncated"
942 warnings in the case when this is dead code, i.e. 32-bit. */
943 const uint64_t wc0 = UINT64_C (0x03ff000000000000);
944 const uint64_t wc1 = UINT64_C (0x07fffffe87fffffe);
945 dfa->word_char[0] = wc0;
946 dfa->word_char[1] = wc1;
947 i = 2;
949 else if (sizeof (dfa->word_char[0]) == 4)
951 dfa->word_char[0] = UINT32_C (0x00000000);
952 dfa->word_char[1] = UINT32_C (0x03ff0000);
953 dfa->word_char[2] = UINT32_C (0x87fffffe);
954 dfa->word_char[3] = UINT32_C (0x07fffffe);
955 i = 4;
957 else
958 abort ();
959 ch = 128;
961 if (BE (dfa->is_utf8, 1))
963 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
964 return;
968 for (; i < BITSET_WORDS; ++i)
969 for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
970 if (isalnum (ch) || ch == '_')
971 dfa->word_char[i] |= (bitset_word_t) 1 << j;
974 /* Free the work area which are only used while compiling. */
976 static void
977 free_workarea_compile (regex_t *preg)
979 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
980 bin_tree_storage_t *storage, *next;
981 for (storage = dfa->str_tree_storage; storage; storage = next)
983 next = storage->next;
984 re_free (storage);
986 dfa->str_tree_storage = NULL;
987 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
988 dfa->str_tree = NULL;
989 re_free (dfa->org_indices);
990 dfa->org_indices = NULL;
993 /* Create initial states for all contexts. */
995 static reg_errcode_t
996 create_initial_state (re_dfa_t *dfa)
998 int first, i;
999 reg_errcode_t err;
1000 re_node_set init_nodes;
1002 /* Initial states have the epsilon closure of the node which is
1003 the first node of the regular expression. */
1004 first = dfa->str_tree->first->node_idx;
1005 dfa->init_node = first;
1006 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1007 if (BE (err != REG_NOERROR, 0))
1008 return err;
1010 /* The back-references which are in initial states can epsilon transit,
1011 since in this case all of the subexpressions can be null.
1012 Then we add epsilon closures of the nodes which are the next nodes of
1013 the back-references. */
1014 if (dfa->nbackref > 0)
1015 for (i = 0; i < init_nodes.nelem; ++i)
1017 int node_idx = init_nodes.elems[i];
1018 re_token_type_t type = dfa->nodes[node_idx].type;
1020 int clexp_idx;
1021 if (type != OP_BACK_REF)
1022 continue;
1023 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1025 re_token_t *clexp_node;
1026 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1027 if (clexp_node->type == OP_CLOSE_SUBEXP
1028 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1029 break;
1031 if (clexp_idx == init_nodes.nelem)
1032 continue;
1034 if (type == OP_BACK_REF)
1036 int dest_idx = dfa->edests[node_idx].elems[0];
1037 if (!re_node_set_contains (&init_nodes, dest_idx))
1039 reg_errcode_t err = re_node_set_merge (&init_nodes,
1040 dfa->eclosures
1041 + dest_idx);
1042 if (err != REG_NOERROR)
1043 return err;
1044 i = 0;
1049 /* It must be the first time to invoke acquire_state. */
1050 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1051 /* We don't check ERR here, since the initial state must not be NULL. */
1052 if (BE (dfa->init_state == NULL, 0))
1053 return err;
1054 if (dfa->init_state->has_constraint)
1056 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1057 CONTEXT_WORD);
1058 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1059 CONTEXT_NEWLINE);
1060 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1061 &init_nodes,
1062 CONTEXT_NEWLINE
1063 | CONTEXT_BEGBUF);
1064 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1065 || dfa->init_state_begbuf == NULL, 0))
1066 return err;
1068 else
1069 dfa->init_state_word = dfa->init_state_nl
1070 = dfa->init_state_begbuf = dfa->init_state;
1072 re_node_set_free (&init_nodes);
1073 return REG_NOERROR;
1076 #ifdef RE_ENABLE_I18N
1077 /* If it is possible to do searching in single byte encoding instead of UTF-8
1078 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1079 DFA nodes where needed. */
1081 static void
1082 optimize_utf8 (re_dfa_t *dfa)
1084 int node, i, mb_chars = 0, has_period = 0;
1086 for (node = 0; node < dfa->nodes_len; ++node)
1087 switch (dfa->nodes[node].type)
1089 case CHARACTER:
1090 if (dfa->nodes[node].opr.c >= 0x80)
1091 mb_chars = 1;
1092 break;
1093 case ANCHOR:
1094 switch (dfa->nodes[node].opr.ctx_type)
1096 case LINE_FIRST:
1097 case LINE_LAST:
1098 case BUF_FIRST:
1099 case BUF_LAST:
1100 break;
1101 default:
1102 /* Word anchors etc. cannot be handled. It's okay to test
1103 opr.ctx_type since constraints (for all DFA nodes) are
1104 created by ORing one or more opr.ctx_type values. */
1105 return;
1107 break;
1108 case OP_PERIOD:
1109 has_period = 1;
1110 break;
1111 case OP_BACK_REF:
1112 case OP_ALT:
1113 case END_OF_RE:
1114 case OP_DUP_ASTERISK:
1115 case OP_OPEN_SUBEXP:
1116 case OP_CLOSE_SUBEXP:
1117 break;
1118 case COMPLEX_BRACKET:
1119 return;
1120 case SIMPLE_BRACKET:
1121 /* Just double check. The non-ASCII range starts at 0x80. */
1122 assert (0x80 % BITSET_WORD_BITS == 0);
1123 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1124 if (dfa->nodes[node].opr.sbcset[i])
1125 return;
1126 break;
1127 default:
1128 abort ();
1131 if (mb_chars || has_period)
1132 for (node = 0; node < dfa->nodes_len; ++node)
1134 if (dfa->nodes[node].type == CHARACTER
1135 && dfa->nodes[node].opr.c >= 0x80)
1136 dfa->nodes[node].mb_partial = 0;
1137 else if (dfa->nodes[node].type == OP_PERIOD)
1138 dfa->nodes[node].type = OP_UTF8_PERIOD;
1141 /* The search can be in single byte locale. */
1142 dfa->mb_cur_max = 1;
1143 dfa->is_utf8 = 0;
1144 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1146 #endif
1148 /* Analyze the structure tree, and calculate "first", "next", "edest",
1149 "eclosure", and "inveclosure". */
1151 static reg_errcode_t
1152 analyze (regex_t *preg)
1154 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1155 reg_errcode_t ret;
1157 /* Allocate arrays. */
1158 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1159 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1160 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1161 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1162 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1163 || dfa->eclosures == NULL, 0))
1164 return REG_ESPACE;
1166 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1167 if (dfa->subexp_map != NULL)
1169 int i;
1170 for (i = 0; i < preg->re_nsub; i++)
1171 dfa->subexp_map[i] = i;
1172 preorder (dfa->str_tree, optimize_subexps, dfa);
1173 for (i = 0; i < preg->re_nsub; i++)
1174 if (dfa->subexp_map[i] != i)
1175 break;
1176 if (i == preg->re_nsub)
1178 free (dfa->subexp_map);
1179 dfa->subexp_map = NULL;
1183 ret = postorder (dfa->str_tree, lower_subexps, preg);
1184 if (BE (ret != REG_NOERROR, 0))
1185 return ret;
1186 ret = postorder (dfa->str_tree, calc_first, dfa);
1187 if (BE (ret != REG_NOERROR, 0))
1188 return ret;
1189 preorder (dfa->str_tree, calc_next, dfa);
1190 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1191 if (BE (ret != REG_NOERROR, 0))
1192 return ret;
1193 ret = calc_eclosure (dfa);
1194 if (BE (ret != REG_NOERROR, 0))
1195 return ret;
1197 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1198 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1199 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1200 || dfa->nbackref)
1202 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1203 if (BE (dfa->inveclosures == NULL, 0))
1204 return REG_ESPACE;
1205 ret = calc_inveclosure (dfa);
1208 return ret;
1211 /* Our parse trees are very unbalanced, so we cannot use a stack to
1212 implement parse tree visits. Instead, we use parent pointers and
1213 some hairy code in these two functions. */
1214 static reg_errcode_t
1215 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1216 void *extra)
1218 bin_tree_t *node, *prev;
1220 for (node = root; ; )
1222 /* Descend down the tree, preferably to the left (or to the right
1223 if that's the only child). */
1224 while (node->left || node->right)
1225 if (node->left)
1226 node = node->left;
1227 else
1228 node = node->right;
1232 reg_errcode_t err = fn (extra, node);
1233 if (BE (err != REG_NOERROR, 0))
1234 return err;
1235 if (node->parent == NULL)
1236 return REG_NOERROR;
1237 prev = node;
1238 node = node->parent;
1240 /* Go up while we have a node that is reached from the right. */
1241 while (node->right == prev || node->right == NULL);
1242 node = node->right;
1246 static reg_errcode_t
1247 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1248 void *extra)
1250 bin_tree_t *node;
1252 for (node = root; ; )
1254 reg_errcode_t err = fn (extra, node);
1255 if (BE (err != REG_NOERROR, 0))
1256 return err;
1258 /* Go to the left node, or up and to the right. */
1259 if (node->left)
1260 node = node->left;
1261 else
1263 bin_tree_t *prev = NULL;
1264 while (node->right == prev || node->right == NULL)
1266 prev = node;
1267 node = node->parent;
1268 if (!node)
1269 return REG_NOERROR;
1271 node = node->right;
1276 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1277 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1278 backreferences as well. Requires a preorder visit. */
1279 static reg_errcode_t
1280 optimize_subexps (void *extra, bin_tree_t *node)
1282 re_dfa_t *dfa = (re_dfa_t *) extra;
1284 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1286 int idx = node->token.opr.idx;
1287 node->token.opr.idx = dfa->subexp_map[idx];
1288 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1291 else if (node->token.type == SUBEXP
1292 && node->left && node->left->token.type == SUBEXP)
1294 int other_idx = node->left->token.opr.idx;
1296 node->left = node->left->left;
1297 if (node->left)
1298 node->left->parent = node;
1300 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1301 if (other_idx < BITSET_WORD_BITS)
1302 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1305 return REG_NOERROR;
1308 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1309 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1310 static reg_errcode_t
1311 lower_subexps (void *extra, bin_tree_t *node)
1313 regex_t *preg = (regex_t *) extra;
1314 reg_errcode_t err = REG_NOERROR;
1316 if (node->left && node->left->token.type == SUBEXP)
1318 node->left = lower_subexp (&err, preg, node->left);
1319 if (node->left)
1320 node->left->parent = node;
1322 if (node->right && node->right->token.type == SUBEXP)
1324 node->right = lower_subexp (&err, preg, node->right);
1325 if (node->right)
1326 node->right->parent = node;
1329 return err;
1332 static bin_tree_t *
1333 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1335 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1336 bin_tree_t *body = node->left;
1337 bin_tree_t *op, *cls, *tree1, *tree;
1339 if (preg->no_sub
1340 /* We do not optimize empty subexpressions, because otherwise we may
1341 have bad CONCAT nodes with NULL children. This is obviously not
1342 very common, so we do not lose much. An example that triggers
1343 this case is the sed "script" /\(\)/x. */
1344 && node->left != NULL
1345 && (node->token.opr.idx >= BITSET_WORD_BITS
1346 || !(dfa->used_bkref_map
1347 & ((bitset_word_t) 1 << node->token.opr.idx))))
1348 return node->left;
1350 /* Convert the SUBEXP node to the concatenation of an
1351 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1352 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1353 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1354 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1355 tree = create_tree (dfa, op, tree1, CONCAT);
1356 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1358 *err = REG_ESPACE;
1359 return NULL;
1362 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1363 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1364 return tree;
1367 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1368 nodes. Requires a postorder visit. */
1369 static reg_errcode_t
1370 calc_first (void *extra, bin_tree_t *node)
1372 re_dfa_t *dfa = (re_dfa_t *) extra;
1373 if (node->token.type == CONCAT)
1375 node->first = node->left->first;
1376 node->node_idx = node->left->node_idx;
1378 else
1380 node->first = node;
1381 node->node_idx = re_dfa_add_node (dfa, node->token);
1382 if (BE (node->node_idx == -1, 0))
1383 return REG_ESPACE;
1384 if (node->token.type == ANCHOR)
1385 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1387 return REG_NOERROR;
1390 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1391 static reg_errcode_t
1392 calc_next (void *extra, bin_tree_t *node)
1394 switch (node->token.type)
1396 case OP_DUP_ASTERISK:
1397 node->left->next = node;
1398 break;
1399 case CONCAT:
1400 node->left->next = node->right->first;
1401 node->right->next = node->next;
1402 break;
1403 default:
1404 if (node->left)
1405 node->left->next = node->next;
1406 if (node->right)
1407 node->right->next = node->next;
1408 break;
1410 return REG_NOERROR;
1413 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1414 static reg_errcode_t
1415 link_nfa_nodes (void *extra, bin_tree_t *node)
1417 re_dfa_t *dfa = (re_dfa_t *) extra;
1418 int idx = node->node_idx;
1419 reg_errcode_t err = REG_NOERROR;
1421 switch (node->token.type)
1423 case CONCAT:
1424 break;
1426 case END_OF_RE:
1427 assert (node->next == NULL);
1428 break;
1430 case OP_DUP_ASTERISK:
1431 case OP_ALT:
1433 int left, right;
1434 dfa->has_plural_match = 1;
1435 if (node->left != NULL)
1436 left = node->left->first->node_idx;
1437 else
1438 left = node->next->node_idx;
1439 if (node->right != NULL)
1440 right = node->right->first->node_idx;
1441 else
1442 right = node->next->node_idx;
1443 assert (left > -1);
1444 assert (right > -1);
1445 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1447 break;
1449 case ANCHOR:
1450 case OP_OPEN_SUBEXP:
1451 case OP_CLOSE_SUBEXP:
1452 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1453 break;
1455 case OP_BACK_REF:
1456 dfa->nexts[idx] = node->next->node_idx;
1457 if (node->token.type == OP_BACK_REF)
1458 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1459 break;
1461 default:
1462 assert (!IS_EPSILON_NODE (node->token.type));
1463 dfa->nexts[idx] = node->next->node_idx;
1464 break;
1467 return err;
1470 /* Duplicate the epsilon closure of the node ROOT_NODE.
1471 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1472 to their own constraint. */
1474 static reg_errcode_t
1475 internal_function
1476 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1477 int root_node, unsigned int init_constraint)
1479 int org_node, clone_node, ret;
1480 unsigned int constraint = init_constraint;
1481 for (org_node = top_org_node, clone_node = top_clone_node;;)
1483 int org_dest, clone_dest;
1484 if (dfa->nodes[org_node].type == OP_BACK_REF)
1486 /* If the back reference epsilon-transit, its destination must
1487 also have the constraint. Then duplicate the epsilon closure
1488 of the destination of the back reference, and store it in
1489 edests of the back reference. */
1490 org_dest = dfa->nexts[org_node];
1491 re_node_set_empty (dfa->edests + clone_node);
1492 clone_dest = duplicate_node (dfa, org_dest, constraint);
1493 if (BE (clone_dest == -1, 0))
1494 return REG_ESPACE;
1495 dfa->nexts[clone_node] = dfa->nexts[org_node];
1496 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1497 if (BE (ret < 0, 0))
1498 return REG_ESPACE;
1500 else if (dfa->edests[org_node].nelem == 0)
1502 /* In case of the node can't epsilon-transit, don't duplicate the
1503 destination and store the original destination as the
1504 destination of the node. */
1505 dfa->nexts[clone_node] = dfa->nexts[org_node];
1506 break;
1508 else if (dfa->edests[org_node].nelem == 1)
1510 /* In case of the node can epsilon-transit, and it has only one
1511 destination. */
1512 org_dest = dfa->edests[org_node].elems[0];
1513 re_node_set_empty (dfa->edests + clone_node);
1514 /* If the node is root_node itself, it means the epsilon clsoure
1515 has a loop. Then tie it to the destination of the root_node. */
1516 if (org_node == root_node && clone_node != org_node)
1518 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1519 if (BE (ret < 0, 0))
1520 return REG_ESPACE;
1521 break;
1523 /* In case of the node has another constraint, add it. */
1524 constraint |= dfa->nodes[org_node].constraint;
1525 clone_dest = duplicate_node (dfa, org_dest, constraint);
1526 if (BE (clone_dest == -1, 0))
1527 return REG_ESPACE;
1528 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1529 if (BE (ret < 0, 0))
1530 return REG_ESPACE;
1532 else /* dfa->edests[org_node].nelem == 2 */
1534 /* In case of the node can epsilon-transit, and it has two
1535 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1536 org_dest = dfa->edests[org_node].elems[0];
1537 re_node_set_empty (dfa->edests + clone_node);
1538 /* Search for a duplicated node which satisfies the constraint. */
1539 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1540 if (clone_dest == -1)
1542 /* There is no such duplicated node, create a new one. */
1543 reg_errcode_t err;
1544 clone_dest = duplicate_node (dfa, org_dest, constraint);
1545 if (BE (clone_dest == -1, 0))
1546 return REG_ESPACE;
1547 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1548 if (BE (ret < 0, 0))
1549 return REG_ESPACE;
1550 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1551 root_node, constraint);
1552 if (BE (err != REG_NOERROR, 0))
1553 return err;
1555 else
1557 /* There is a duplicated node which satisfies the constraint,
1558 use it to avoid infinite loop. */
1559 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1560 if (BE (ret < 0, 0))
1561 return REG_ESPACE;
1564 org_dest = dfa->edests[org_node].elems[1];
1565 clone_dest = duplicate_node (dfa, org_dest, constraint);
1566 if (BE (clone_dest == -1, 0))
1567 return REG_ESPACE;
1568 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1569 if (BE (ret < 0, 0))
1570 return REG_ESPACE;
1572 org_node = org_dest;
1573 clone_node = clone_dest;
1575 return REG_NOERROR;
1578 /* Search for a node which is duplicated from the node ORG_NODE, and
1579 satisfies the constraint CONSTRAINT. */
1581 static int
1582 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1583 unsigned int constraint)
1585 int idx;
1586 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1588 if (org_node == dfa->org_indices[idx]
1589 && constraint == dfa->nodes[idx].constraint)
1590 return idx; /* Found. */
1592 return -1; /* Not found. */
1595 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1596 Return the index of the new node, or -1 if insufficient storage is
1597 available. */
1599 static int
1600 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1602 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1603 if (BE (dup_idx != -1, 1))
1605 dfa->nodes[dup_idx].constraint = constraint;
1606 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1607 dfa->nodes[dup_idx].duplicated = 1;
1609 /* Store the index of the original node. */
1610 dfa->org_indices[dup_idx] = org_idx;
1612 return dup_idx;
1615 static reg_errcode_t
1616 calc_inveclosure (re_dfa_t *dfa)
1618 int src, idx, ret;
1619 for (idx = 0; idx < dfa->nodes_len; ++idx)
1620 re_node_set_init_empty (dfa->inveclosures + idx);
1622 for (src = 0; src < dfa->nodes_len; ++src)
1624 int *elems = dfa->eclosures[src].elems;
1625 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1627 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1628 if (BE (ret == -1, 0))
1629 return REG_ESPACE;
1633 return REG_NOERROR;
1636 /* Calculate "eclosure" for all the node in DFA. */
1638 static reg_errcode_t
1639 calc_eclosure (re_dfa_t *dfa)
1641 int node_idx, incomplete;
1642 #ifdef DEBUG
1643 assert (dfa->nodes_len > 0);
1644 #endif
1645 incomplete = 0;
1646 /* For each nodes, calculate epsilon closure. */
1647 for (node_idx = 0; ; ++node_idx)
1649 reg_errcode_t err;
1650 re_node_set eclosure_elem;
1651 if (node_idx == dfa->nodes_len)
1653 if (!incomplete)
1654 break;
1655 incomplete = 0;
1656 node_idx = 0;
1659 #ifdef DEBUG
1660 assert (dfa->eclosures[node_idx].nelem != -1);
1661 #endif
1663 /* If we have already calculated, skip it. */
1664 if (dfa->eclosures[node_idx].nelem != 0)
1665 continue;
1666 /* Calculate epsilon closure of `node_idx'. */
1667 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1668 if (BE (err != REG_NOERROR, 0))
1669 return err;
1671 if (dfa->eclosures[node_idx].nelem == 0)
1673 incomplete = 1;
1674 re_node_set_free (&eclosure_elem);
1677 return REG_NOERROR;
1680 /* Calculate epsilon closure of NODE. */
1682 static reg_errcode_t
1683 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1685 reg_errcode_t err;
1686 int i;
1687 re_node_set eclosure;
1688 int ret;
1689 int incomplete = 0;
1690 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1691 if (BE (err != REG_NOERROR, 0))
1692 return err;
1694 /* This indicates that we are calculating this node now.
1695 We reference this value to avoid infinite loop. */
1696 dfa->eclosures[node].nelem = -1;
1698 /* If the current node has constraints, duplicate all nodes
1699 since they must inherit the constraints. */
1700 if (dfa->nodes[node].constraint
1701 && dfa->edests[node].nelem
1702 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1704 err = duplicate_node_closure (dfa, node, node, node,
1705 dfa->nodes[node].constraint);
1706 if (BE (err != REG_NOERROR, 0))
1707 return err;
1710 /* Expand each epsilon destination nodes. */
1711 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1712 for (i = 0; i < dfa->edests[node].nelem; ++i)
1714 re_node_set eclosure_elem;
1715 int edest = dfa->edests[node].elems[i];
1716 /* If calculating the epsilon closure of `edest' is in progress,
1717 return intermediate result. */
1718 if (dfa->eclosures[edest].nelem == -1)
1720 incomplete = 1;
1721 continue;
1723 /* If we haven't calculated the epsilon closure of `edest' yet,
1724 calculate now. Otherwise use calculated epsilon closure. */
1725 if (dfa->eclosures[edest].nelem == 0)
1727 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1728 if (BE (err != REG_NOERROR, 0))
1729 return err;
1731 else
1732 eclosure_elem = dfa->eclosures[edest];
1733 /* Merge the epsilon closure of `edest'. */
1734 err = re_node_set_merge (&eclosure, &eclosure_elem);
1735 if (BE (err != REG_NOERROR, 0))
1736 return err;
1737 /* If the epsilon closure of `edest' is incomplete,
1738 the epsilon closure of this node is also incomplete. */
1739 if (dfa->eclosures[edest].nelem == 0)
1741 incomplete = 1;
1742 re_node_set_free (&eclosure_elem);
1746 /* An epsilon closure includes itself. */
1747 ret = re_node_set_insert (&eclosure, node);
1748 if (BE (ret < 0, 0))
1749 return REG_ESPACE;
1750 if (incomplete && !root)
1751 dfa->eclosures[node].nelem = 0;
1752 else
1753 dfa->eclosures[node] = eclosure;
1754 *new_set = eclosure;
1755 return REG_NOERROR;
1758 /* Functions for token which are used in the parser. */
1760 /* Fetch a token from INPUT.
1761 We must not use this function inside bracket expressions. */
1763 static void
1764 internal_function
1765 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1767 re_string_skip_bytes (input, peek_token (result, input, syntax));
1770 /* Peek a token from INPUT, and return the length of the token.
1771 We must not use this function inside bracket expressions. */
1773 static int
1774 internal_function
1775 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1777 unsigned char c;
1779 if (re_string_eoi (input))
1781 token->type = END_OF_RE;
1782 return 0;
1785 c = re_string_peek_byte (input, 0);
1786 token->opr.c = c;
1788 token->word_char = 0;
1789 #ifdef RE_ENABLE_I18N
1790 token->mb_partial = 0;
1791 if (input->mb_cur_max > 1 &&
1792 !re_string_first_byte (input, re_string_cur_idx (input)))
1794 token->type = CHARACTER;
1795 token->mb_partial = 1;
1796 return 1;
1798 #endif
1799 if (c == '\\')
1801 unsigned char c2;
1802 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1804 token->type = BACK_SLASH;
1805 return 1;
1808 c2 = re_string_peek_byte_case (input, 1);
1809 token->opr.c = c2;
1810 token->type = CHARACTER;
1811 #ifdef RE_ENABLE_I18N
1812 if (input->mb_cur_max > 1)
1814 wint_t wc = re_string_wchar_at (input,
1815 re_string_cur_idx (input) + 1);
1816 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1818 else
1819 #endif
1820 token->word_char = IS_WORD_CHAR (c2) != 0;
1822 switch (c2)
1824 case '|':
1825 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1826 token->type = OP_ALT;
1827 break;
1828 case '1': case '2': case '3': case '4': case '5':
1829 case '6': case '7': case '8': case '9':
1830 if (!(syntax & RE_NO_BK_REFS))
1832 token->type = OP_BACK_REF;
1833 token->opr.idx = c2 - '1';
1835 break;
1836 case '<':
1837 if (!(syntax & RE_NO_GNU_OPS))
1839 token->type = ANCHOR;
1840 token->opr.ctx_type = WORD_FIRST;
1842 break;
1843 case '>':
1844 if (!(syntax & RE_NO_GNU_OPS))
1846 token->type = ANCHOR;
1847 token->opr.ctx_type = WORD_LAST;
1849 break;
1850 case 'b':
1851 if (!(syntax & RE_NO_GNU_OPS))
1853 token->type = ANCHOR;
1854 token->opr.ctx_type = WORD_DELIM;
1856 break;
1857 case 'B':
1858 if (!(syntax & RE_NO_GNU_OPS))
1860 token->type = ANCHOR;
1861 token->opr.ctx_type = NOT_WORD_DELIM;
1863 break;
1864 case 'w':
1865 if (!(syntax & RE_NO_GNU_OPS))
1866 token->type = OP_WORD;
1867 break;
1868 case 'W':
1869 if (!(syntax & RE_NO_GNU_OPS))
1870 token->type = OP_NOTWORD;
1871 break;
1872 case 's':
1873 if (!(syntax & RE_NO_GNU_OPS))
1874 token->type = OP_SPACE;
1875 break;
1876 case 'S':
1877 if (!(syntax & RE_NO_GNU_OPS))
1878 token->type = OP_NOTSPACE;
1879 break;
1880 case '`':
1881 if (!(syntax & RE_NO_GNU_OPS))
1883 token->type = ANCHOR;
1884 token->opr.ctx_type = BUF_FIRST;
1886 break;
1887 case '\'':
1888 if (!(syntax & RE_NO_GNU_OPS))
1890 token->type = ANCHOR;
1891 token->opr.ctx_type = BUF_LAST;
1893 break;
1894 case '(':
1895 if (!(syntax & RE_NO_BK_PARENS))
1896 token->type = OP_OPEN_SUBEXP;
1897 break;
1898 case ')':
1899 if (!(syntax & RE_NO_BK_PARENS))
1900 token->type = OP_CLOSE_SUBEXP;
1901 break;
1902 case '+':
1903 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1904 token->type = OP_DUP_PLUS;
1905 break;
1906 case '?':
1907 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1908 token->type = OP_DUP_QUESTION;
1909 break;
1910 case '{':
1911 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1912 token->type = OP_OPEN_DUP_NUM;
1913 break;
1914 case '}':
1915 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1916 token->type = OP_CLOSE_DUP_NUM;
1917 break;
1918 default:
1919 break;
1921 return 2;
1924 token->type = CHARACTER;
1925 #ifdef RE_ENABLE_I18N
1926 if (input->mb_cur_max > 1)
1928 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1929 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1931 else
1932 #endif
1933 token->word_char = IS_WORD_CHAR (token->opr.c);
1935 switch (c)
1937 case '\n':
1938 if (syntax & RE_NEWLINE_ALT)
1939 token->type = OP_ALT;
1940 break;
1941 case '|':
1942 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1943 token->type = OP_ALT;
1944 break;
1945 case '*':
1946 token->type = OP_DUP_ASTERISK;
1947 break;
1948 case '+':
1949 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1950 token->type = OP_DUP_PLUS;
1951 break;
1952 case '?':
1953 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1954 token->type = OP_DUP_QUESTION;
1955 break;
1956 case '{':
1957 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1958 token->type = OP_OPEN_DUP_NUM;
1959 break;
1960 case '}':
1961 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1962 token->type = OP_CLOSE_DUP_NUM;
1963 break;
1964 case '(':
1965 if (syntax & RE_NO_BK_PARENS)
1966 token->type = OP_OPEN_SUBEXP;
1967 break;
1968 case ')':
1969 if (syntax & RE_NO_BK_PARENS)
1970 token->type = OP_CLOSE_SUBEXP;
1971 break;
1972 case '[':
1973 token->type = OP_OPEN_BRACKET;
1974 break;
1975 case '.':
1976 token->type = OP_PERIOD;
1977 break;
1978 case '^':
1979 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1980 re_string_cur_idx (input) != 0)
1982 char prev = re_string_peek_byte (input, -1);
1983 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1984 break;
1986 token->type = ANCHOR;
1987 token->opr.ctx_type = LINE_FIRST;
1988 break;
1989 case '$':
1990 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1991 re_string_cur_idx (input) + 1 != re_string_length (input))
1993 re_token_t next;
1994 re_string_skip_bytes (input, 1);
1995 peek_token (&next, input, syntax);
1996 re_string_skip_bytes (input, -1);
1997 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1998 break;
2000 token->type = ANCHOR;
2001 token->opr.ctx_type = LINE_LAST;
2002 break;
2003 default:
2004 break;
2006 return 1;
2009 /* Peek a token from INPUT, and return the length of the token.
2010 We must not use this function out of bracket expressions. */
2012 static int
2013 internal_function
2014 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2016 unsigned char c;
2017 if (re_string_eoi (input))
2019 token->type = END_OF_RE;
2020 return 0;
2022 c = re_string_peek_byte (input, 0);
2023 token->opr.c = c;
2025 #ifdef RE_ENABLE_I18N
2026 if (input->mb_cur_max > 1 &&
2027 !re_string_first_byte (input, re_string_cur_idx (input)))
2029 token->type = CHARACTER;
2030 return 1;
2032 #endif /* RE_ENABLE_I18N */
2034 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2035 && re_string_cur_idx (input) + 1 < re_string_length (input))
2037 /* In this case, '\' escape a character. */
2038 unsigned char c2;
2039 re_string_skip_bytes (input, 1);
2040 c2 = re_string_peek_byte (input, 0);
2041 token->opr.c = c2;
2042 token->type = CHARACTER;
2043 return 1;
2045 if (c == '[') /* '[' is a special char in a bracket exps. */
2047 unsigned char c2;
2048 int token_len;
2049 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2050 c2 = re_string_peek_byte (input, 1);
2051 else
2052 c2 = 0;
2053 token->opr.c = c2;
2054 token_len = 2;
2055 switch (c2)
2057 case '.':
2058 token->type = OP_OPEN_COLL_ELEM;
2059 break;
2060 case '=':
2061 token->type = OP_OPEN_EQUIV_CLASS;
2062 break;
2063 case ':':
2064 if (syntax & RE_CHAR_CLASSES)
2066 token->type = OP_OPEN_CHAR_CLASS;
2067 break;
2069 /* else fall through. */
2070 default:
2071 token->type = CHARACTER;
2072 token->opr.c = c;
2073 token_len = 1;
2074 break;
2076 return token_len;
2078 switch (c)
2080 case '-':
2081 token->type = OP_CHARSET_RANGE;
2082 break;
2083 case ']':
2084 token->type = OP_CLOSE_BRACKET;
2085 break;
2086 case '^':
2087 token->type = OP_NON_MATCH_LIST;
2088 break;
2089 default:
2090 token->type = CHARACTER;
2092 return 1;
2095 /* Functions for parser. */
2097 /* Entry point of the parser.
2098 Parse the regular expression REGEXP and return the structure tree.
2099 If an error is occured, ERR is set by error code, and return NULL.
2100 This function build the following tree, from regular expression <reg_exp>:
2104 <reg_exp> EOR
2106 CAT means concatenation.
2107 EOR means end of regular expression. */
2109 static bin_tree_t *
2110 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2111 reg_errcode_t *err)
2113 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2114 bin_tree_t *tree, *eor, *root;
2115 re_token_t current_token;
2116 dfa->syntax = syntax;
2117 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2118 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2119 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2120 return NULL;
2121 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2122 if (tree != NULL)
2123 root = create_tree (dfa, tree, eor, CONCAT);
2124 else
2125 root = eor;
2126 if (BE (eor == NULL || root == NULL, 0))
2128 *err = REG_ESPACE;
2129 return NULL;
2131 return root;
2134 /* This function build the following tree, from regular expression
2135 <branch1>|<branch2>:
2139 <branch1> <branch2>
2141 ALT means alternative, which represents the operator `|'. */
2143 static bin_tree_t *
2144 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2145 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2147 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2148 bin_tree_t *tree, *branch = NULL;
2149 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2150 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2151 return NULL;
2153 while (token->type == OP_ALT)
2155 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2156 if (token->type != OP_ALT && token->type != END_OF_RE
2157 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2159 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2160 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2162 if (tree != NULL)
2163 postorder (tree, free_tree, NULL);
2164 return NULL;
2167 else
2168 branch = NULL;
2169 tree = create_tree (dfa, tree, branch, OP_ALT);
2170 if (BE (tree == NULL, 0))
2172 *err = REG_ESPACE;
2173 return NULL;
2176 return tree;
2179 /* This function build the following tree, from regular expression
2180 <exp1><exp2>:
2184 <exp1> <exp2>
2186 CAT means concatenation. */
2188 static bin_tree_t *
2189 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2190 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2192 bin_tree_t *tree, *exp;
2193 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2194 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2195 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2196 return NULL;
2198 while (token->type != OP_ALT && token->type != END_OF_RE
2199 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2201 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2202 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2204 if (tree != NULL)
2205 postorder (tree, free_tree, NULL);
2206 return NULL;
2208 if (tree != NULL && exp != NULL)
2210 bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2211 if (newtree == NULL)
2213 postorder (exp, free_tree, NULL);
2214 postorder (tree, free_tree, NULL);
2215 *err = REG_ESPACE;
2216 return NULL;
2218 tree = newtree;
2220 else if (tree == NULL)
2221 tree = exp;
2222 /* Otherwise exp == NULL, we don't need to create new tree. */
2224 return tree;
2227 /* This function build the following tree, from regular expression a*:
2233 static bin_tree_t *
2234 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2235 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2237 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2238 bin_tree_t *tree;
2239 switch (token->type)
2241 case CHARACTER:
2242 tree = create_token_tree (dfa, NULL, NULL, token);
2243 if (BE (tree == NULL, 0))
2245 *err = REG_ESPACE;
2246 return NULL;
2248 #ifdef RE_ENABLE_I18N
2249 if (dfa->mb_cur_max > 1)
2251 while (!re_string_eoi (regexp)
2252 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2254 bin_tree_t *mbc_remain;
2255 fetch_token (token, regexp, syntax);
2256 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2257 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2258 if (BE (mbc_remain == NULL || tree == NULL, 0))
2260 *err = REG_ESPACE;
2261 return NULL;
2265 #endif
2266 break;
2267 case OP_OPEN_SUBEXP:
2268 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2269 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2270 return NULL;
2271 break;
2272 case OP_OPEN_BRACKET:
2273 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2274 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2275 return NULL;
2276 break;
2277 case OP_BACK_REF:
2278 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2280 *err = REG_ESUBREG;
2281 return NULL;
2283 dfa->used_bkref_map |= 1 << token->opr.idx;
2284 tree = create_token_tree (dfa, NULL, NULL, token);
2285 if (BE (tree == NULL, 0))
2287 *err = REG_ESPACE;
2288 return NULL;
2290 ++dfa->nbackref;
2291 dfa->has_mb_node = 1;
2292 break;
2293 case OP_OPEN_DUP_NUM:
2294 if (syntax & RE_CONTEXT_INVALID_DUP)
2296 *err = REG_BADRPT;
2297 return NULL;
2299 /* FALLTHROUGH */
2300 case OP_DUP_ASTERISK:
2301 case OP_DUP_PLUS:
2302 case OP_DUP_QUESTION:
2303 if (syntax & RE_CONTEXT_INVALID_OPS)
2305 *err = REG_BADRPT;
2306 return NULL;
2308 else if (syntax & RE_CONTEXT_INDEP_OPS)
2310 fetch_token (token, regexp, syntax);
2311 return parse_expression (regexp, preg, token, syntax, nest, err);
2313 /* else fall through */
2314 case OP_CLOSE_SUBEXP:
2315 if ((token->type == OP_CLOSE_SUBEXP) &&
2316 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2318 *err = REG_ERPAREN;
2319 return NULL;
2321 /* else fall through */
2322 case OP_CLOSE_DUP_NUM:
2323 /* We treat it as a normal character. */
2325 /* Then we can these characters as normal characters. */
2326 token->type = CHARACTER;
2327 /* mb_partial and word_char bits should be initialized already
2328 by peek_token. */
2329 tree = create_token_tree (dfa, NULL, NULL, token);
2330 if (BE (tree == NULL, 0))
2332 *err = REG_ESPACE;
2333 return NULL;
2335 break;
2336 case ANCHOR:
2337 if ((token->opr.ctx_type
2338 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2339 && dfa->word_ops_used == 0)
2340 init_word_char (dfa);
2341 if (token->opr.ctx_type == WORD_DELIM
2342 || token->opr.ctx_type == NOT_WORD_DELIM)
2344 bin_tree_t *tree_first, *tree_last;
2345 if (token->opr.ctx_type == WORD_DELIM)
2347 token->opr.ctx_type = WORD_FIRST;
2348 tree_first = create_token_tree (dfa, NULL, NULL, token);
2349 token->opr.ctx_type = WORD_LAST;
2351 else
2353 token->opr.ctx_type = INSIDE_WORD;
2354 tree_first = create_token_tree (dfa, NULL, NULL, token);
2355 token->opr.ctx_type = INSIDE_NOTWORD;
2357 tree_last = create_token_tree (dfa, NULL, NULL, token);
2358 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2359 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2361 *err = REG_ESPACE;
2362 return NULL;
2365 else
2367 tree = create_token_tree (dfa, NULL, NULL, token);
2368 if (BE (tree == NULL, 0))
2370 *err = REG_ESPACE;
2371 return NULL;
2374 /* We must return here, since ANCHORs can't be followed
2375 by repetition operators.
2376 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2377 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2378 fetch_token (token, regexp, syntax);
2379 return tree;
2380 case OP_PERIOD:
2381 tree = create_token_tree (dfa, NULL, NULL, token);
2382 if (BE (tree == NULL, 0))
2384 *err = REG_ESPACE;
2385 return NULL;
2387 if (dfa->mb_cur_max > 1)
2388 dfa->has_mb_node = 1;
2389 break;
2390 case OP_WORD:
2391 case OP_NOTWORD:
2392 tree = build_charclass_op (dfa, regexp->trans,
2393 (const unsigned char *) "alnum",
2394 (const unsigned char *) "_",
2395 token->type == OP_NOTWORD, err);
2396 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2397 return NULL;
2398 break;
2399 case OP_SPACE:
2400 case OP_NOTSPACE:
2401 tree = build_charclass_op (dfa, regexp->trans,
2402 (const unsigned char *) "space",
2403 (const unsigned char *) "",
2404 token->type == OP_NOTSPACE, err);
2405 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2406 return NULL;
2407 break;
2408 case OP_ALT:
2409 case END_OF_RE:
2410 return NULL;
2411 case BACK_SLASH:
2412 *err = REG_EESCAPE;
2413 return NULL;
2414 default:
2415 /* Must not happen? */
2416 #ifdef DEBUG
2417 assert (0);
2418 #endif
2419 return NULL;
2421 fetch_token (token, regexp, syntax);
2423 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2424 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2426 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2427 if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2429 if (tree != NULL)
2430 postorder (tree, free_tree, NULL);
2431 return NULL;
2433 tree = dup_tree;
2434 /* In BRE consecutive duplications are not allowed. */
2435 if ((syntax & RE_CONTEXT_INVALID_DUP)
2436 && (token->type == OP_DUP_ASTERISK
2437 || token->type == OP_OPEN_DUP_NUM))
2439 if (tree != NULL)
2440 postorder (tree, free_tree, NULL);
2441 *err = REG_BADRPT;
2442 return NULL;
2446 return tree;
2449 /* This function build the following tree, from regular expression
2450 (<reg_exp>):
2451 SUBEXP
2453 <reg_exp>
2456 static bin_tree_t *
2457 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2458 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2460 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2461 bin_tree_t *tree;
2462 size_t cur_nsub;
2463 cur_nsub = preg->re_nsub++;
2465 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2467 /* The subexpression may be a null string. */
2468 if (token->type == OP_CLOSE_SUBEXP)
2469 tree = NULL;
2470 else
2472 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2473 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2475 if (tree != NULL)
2476 postorder (tree, free_tree, NULL);
2477 *err = REG_EPAREN;
2479 if (BE (*err != REG_NOERROR, 0))
2480 return NULL;
2483 if (cur_nsub <= '9' - '1')
2484 dfa->completed_bkref_map |= 1 << cur_nsub;
2486 tree = create_tree (dfa, tree, NULL, SUBEXP);
2487 if (BE (tree == NULL, 0))
2489 *err = REG_ESPACE;
2490 return NULL;
2492 tree->token.opr.idx = cur_nsub;
2493 return tree;
2496 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2498 static bin_tree_t *
2499 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2500 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2502 bin_tree_t *tree = NULL, *old_tree = NULL;
2503 int i, start, end, start_idx = re_string_cur_idx (regexp);
2504 re_token_t start_token = *token;
2506 if (token->type == OP_OPEN_DUP_NUM)
2508 end = 0;
2509 start = fetch_number (regexp, token, syntax);
2510 if (start == -1)
2512 if (token->type == CHARACTER && token->opr.c == ',')
2513 start = 0; /* We treat "{,m}" as "{0,m}". */
2514 else
2516 *err = REG_BADBR; /* <re>{} is invalid. */
2517 return NULL;
2520 if (BE (start != -2, 1))
2522 /* We treat "{n}" as "{n,n}". */
2523 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2524 : ((token->type == CHARACTER && token->opr.c == ',')
2525 ? fetch_number (regexp, token, syntax) : -2));
2527 if (BE (start == -2 || end == -2, 0))
2529 /* Invalid sequence. */
2530 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2532 if (token->type == END_OF_RE)
2533 *err = REG_EBRACE;
2534 else
2535 *err = REG_BADBR;
2537 return NULL;
2540 /* If the syntax bit is set, rollback. */
2541 re_string_set_index (regexp, start_idx);
2542 *token = start_token;
2543 token->type = CHARACTER;
2544 /* mb_partial and word_char bits should be already initialized by
2545 peek_token. */
2546 return elem;
2549 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2551 /* First number greater than second. */
2552 *err = REG_BADBR;
2553 return NULL;
2556 else
2558 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2559 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2562 fetch_token (token, regexp, syntax);
2564 if (BE (elem == NULL, 0))
2565 return NULL;
2566 if (BE (start == 0 && end == 0, 0))
2568 postorder (elem, free_tree, NULL);
2569 return NULL;
2572 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2573 if (BE (start > 0, 0))
2575 tree = elem;
2576 for (i = 2; i <= start; ++i)
2578 elem = duplicate_tree (elem, dfa);
2579 tree = create_tree (dfa, tree, elem, CONCAT);
2580 if (BE (elem == NULL || tree == NULL, 0))
2581 goto parse_dup_op_espace;
2584 if (start == end)
2585 return tree;
2587 /* Duplicate ELEM before it is marked optional. */
2588 elem = duplicate_tree (elem, dfa);
2589 if (BE (elem == NULL, 0))
2590 goto parse_dup_op_espace;
2591 old_tree = tree;
2593 else
2594 old_tree = NULL;
2596 if (elem->token.type == SUBEXP)
2597 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2599 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2600 if (BE (tree == NULL, 0))
2601 goto parse_dup_op_espace;
2603 /* This loop is actually executed only when end != -1,
2604 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2605 already created the start+1-th copy. */
2606 for (i = start + 2; i <= end; ++i)
2608 elem = duplicate_tree (elem, dfa);
2609 tree = create_tree (dfa, tree, elem, CONCAT);
2610 if (BE (elem == NULL || tree == NULL, 0))
2611 goto parse_dup_op_espace;
2613 tree = create_tree (dfa, tree, NULL, OP_ALT);
2614 if (BE (tree == NULL, 0))
2615 goto parse_dup_op_espace;
2618 if (old_tree)
2619 tree = create_tree (dfa, old_tree, tree, CONCAT);
2621 return tree;
2623 parse_dup_op_espace:
2624 *err = REG_ESPACE;
2625 return NULL;
2628 /* Size of the names for collating symbol/equivalence_class/character_class.
2629 I'm not sure, but maybe enough. */
2630 #define BRACKET_NAME_BUF_SIZE 32
2632 #ifndef _LIBC
2633 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2634 Build the range expression which starts from START_ELEM, and ends
2635 at END_ELEM. The result are written to MBCSET and SBCSET.
2636 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2637 mbcset->range_ends, is a pointer argument sinse we may
2638 update it. */
2640 static reg_errcode_t
2641 internal_function
2642 # ifdef RE_ENABLE_I18N
2643 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2644 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2645 # else /* not RE_ENABLE_I18N */
2646 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2647 bracket_elem_t *end_elem)
2648 # endif /* not RE_ENABLE_I18N */
2650 unsigned int start_ch, end_ch;
2651 /* Equivalence Classes and Character Classes can't be a range start/end. */
2652 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2653 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2655 return REG_ERANGE;
2657 /* We can handle no multi character collating elements without libc
2658 support. */
2659 if (BE ((start_elem->type == COLL_SYM
2660 && strlen ((char *) start_elem->opr.name) > 1)
2661 || (end_elem->type == COLL_SYM
2662 && strlen ((char *) end_elem->opr.name) > 1), 0))
2663 return REG_ECOLLATE;
2665 # ifdef RE_ENABLE_I18N
2667 wchar_t wc;
2668 wint_t start_wc;
2669 wint_t end_wc;
2670 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2672 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2673 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2674 : 0));
2675 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2676 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2677 : 0));
2678 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2679 ? __btowc (start_ch) : start_elem->opr.wch);
2680 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2681 ? __btowc (end_ch) : end_elem->opr.wch);
2682 if (start_wc == WEOF || end_wc == WEOF)
2683 return REG_ECOLLATE;
2684 cmp_buf[0] = start_wc;
2685 cmp_buf[4] = end_wc;
2686 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2687 return REG_ERANGE;
2689 /* Got valid collation sequence values, add them as a new entry.
2690 However, for !_LIBC we have no collation elements: if the
2691 character set is single byte, the single byte character set
2692 that we build below suffices. parse_bracket_exp passes
2693 no MBCSET if dfa->mb_cur_max == 1. */
2694 if (mbcset)
2696 /* Check the space of the arrays. */
2697 if (BE (*range_alloc == mbcset->nranges, 0))
2699 /* There is not enough space, need realloc. */
2700 wchar_t *new_array_start, *new_array_end;
2701 int new_nranges;
2703 /* +1 in case of mbcset->nranges is 0. */
2704 new_nranges = 2 * mbcset->nranges + 1;
2705 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2706 are NULL if *range_alloc == 0. */
2707 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2708 new_nranges);
2709 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2710 new_nranges);
2712 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2713 return REG_ESPACE;
2715 mbcset->range_starts = new_array_start;
2716 mbcset->range_ends = new_array_end;
2717 *range_alloc = new_nranges;
2720 mbcset->range_starts[mbcset->nranges] = start_wc;
2721 mbcset->range_ends[mbcset->nranges++] = end_wc;
2724 /* Build the table for single byte characters. */
2725 for (wc = 0; wc < SBC_MAX; ++wc)
2727 cmp_buf[2] = wc;
2728 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2729 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2730 bitset_set (sbcset, wc);
2733 # else /* not RE_ENABLE_I18N */
2735 unsigned int ch;
2736 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2737 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2738 : 0));
2739 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2740 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2741 : 0));
2742 if (start_ch > end_ch)
2743 return REG_ERANGE;
2744 /* Build the table for single byte characters. */
2745 for (ch = 0; ch < SBC_MAX; ++ch)
2746 if (start_ch <= ch && ch <= end_ch)
2747 bitset_set (sbcset, ch);
2749 # endif /* not RE_ENABLE_I18N */
2750 return REG_NOERROR;
2752 #endif /* not _LIBC */
2754 #ifndef _LIBC
2755 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2756 Build the collating element which is represented by NAME.
2757 The result are written to MBCSET and SBCSET.
2758 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2759 pointer argument since we may update it. */
2761 static reg_errcode_t
2762 internal_function
2763 # ifdef RE_ENABLE_I18N
2764 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2765 int *coll_sym_alloc, const unsigned char *name)
2766 # else /* not RE_ENABLE_I18N */
2767 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2768 # endif /* not RE_ENABLE_I18N */
2770 size_t name_len = strlen ((const char *) name);
2771 if (BE (name_len != 1, 0))
2772 return REG_ECOLLATE;
2773 else
2775 bitset_set (sbcset, name[0]);
2776 return REG_NOERROR;
2779 #endif /* not _LIBC */
2781 /* This function parse bracket expression like "[abc]", "[a-c]",
2782 "[[.a-a.]]" etc. */
2784 static bin_tree_t *
2785 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2786 reg_syntax_t syntax, reg_errcode_t *err)
2788 #ifdef _LIBC
2789 const unsigned char *collseqmb;
2790 const char *collseqwc;
2791 uint32_t nrules;
2792 int32_t table_size;
2793 const int32_t *symb_table;
2794 const unsigned char *extra;
2796 /* Local function for parse_bracket_exp used in _LIBC environement.
2797 Seek the collating symbol entry correspondings to NAME.
2798 Return the index of the symbol in the SYMB_TABLE,
2799 or -1 if not found. */
2801 auto inline int32_t
2802 __attribute ((always_inline))
2803 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2805 int32_t elem;
2807 for (elem = 0; elem < table_size; elem++)
2808 if (symb_table[2 * elem] != 0)
2810 int32_t idx = symb_table[2 * elem + 1];
2811 /* Skip the name of collating element name. */
2812 idx += 1 + extra[idx];
2813 if (/* Compare the length of the name. */
2814 name_len == extra[idx]
2815 /* Compare the name. */
2816 && memcmp (name, &extra[idx + 1], name_len) == 0)
2817 /* Yep, this is the entry. */
2818 return elem;
2820 return -1;
2823 /* Local function for parse_bracket_exp used in _LIBC environment.
2824 Look up the collation sequence value of BR_ELEM.
2825 Return the value if succeeded, UINT_MAX otherwise. */
2827 auto inline unsigned int
2828 __attribute ((always_inline))
2829 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2831 if (br_elem->type == SB_CHAR)
2834 if (MB_CUR_MAX == 1)
2836 if (nrules == 0)
2837 return collseqmb[br_elem->opr.ch];
2838 else
2840 wint_t wc = __btowc (br_elem->opr.ch);
2841 return __collseq_table_lookup (collseqwc, wc);
2844 else if (br_elem->type == MB_CHAR)
2846 if (nrules != 0)
2847 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2849 else if (br_elem->type == COLL_SYM)
2851 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2852 if (nrules != 0)
2854 int32_t elem, idx;
2855 elem = seek_collating_symbol_entry (br_elem->opr.name,
2856 sym_name_len);
2857 if (elem != -1)
2859 /* We found the entry. */
2860 idx = symb_table[2 * elem + 1];
2861 /* Skip the name of collating element name. */
2862 idx += 1 + extra[idx];
2863 /* Skip the byte sequence of the collating element. */
2864 idx += 1 + extra[idx];
2865 /* Adjust for the alignment. */
2866 idx = (idx + 3) & ~3;
2867 /* Skip the multibyte collation sequence value. */
2868 idx += sizeof (unsigned int);
2869 /* Skip the wide char sequence of the collating element. */
2870 idx += sizeof (unsigned int) *
2871 (1 + *(unsigned int *) (extra + idx));
2872 /* Return the collation sequence value. */
2873 return *(unsigned int *) (extra + idx);
2875 else if (sym_name_len == 1)
2877 /* No valid character. Match it as a single byte
2878 character. */
2879 return collseqmb[br_elem->opr.name[0]];
2882 else if (sym_name_len == 1)
2883 return collseqmb[br_elem->opr.name[0]];
2885 return UINT_MAX;
2888 /* Local function for parse_bracket_exp used in _LIBC environement.
2889 Build the range expression which starts from START_ELEM, and ends
2890 at END_ELEM. The result are written to MBCSET and SBCSET.
2891 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2892 mbcset->range_ends, is a pointer argument sinse we may
2893 update it. */
2895 auto inline reg_errcode_t
2896 __attribute ((always_inline))
2897 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2898 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2900 unsigned int ch;
2901 uint32_t start_collseq;
2902 uint32_t end_collseq;
2904 /* Equivalence Classes and Character Classes can't be a range
2905 start/end. */
2906 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2907 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2909 return REG_ERANGE;
2911 start_collseq = lookup_collation_sequence_value (start_elem);
2912 end_collseq = lookup_collation_sequence_value (end_elem);
2913 /* Check start/end collation sequence values. */
2914 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2915 return REG_ECOLLATE;
2916 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2917 return REG_ERANGE;
2919 /* Got valid collation sequence values, add them as a new entry.
2920 However, if we have no collation elements, and the character set
2921 is single byte, the single byte character set that we
2922 build below suffices. */
2923 if (nrules > 0 || dfa->mb_cur_max > 1)
2925 /* Check the space of the arrays. */
2926 if (BE (*range_alloc == mbcset->nranges, 0))
2928 /* There is not enough space, need realloc. */
2929 uint32_t *new_array_start;
2930 uint32_t *new_array_end;
2931 int new_nranges;
2933 /* +1 in case of mbcset->nranges is 0. */
2934 new_nranges = 2 * mbcset->nranges + 1;
2935 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2936 new_nranges);
2937 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2938 new_nranges);
2940 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2941 return REG_ESPACE;
2943 mbcset->range_starts = new_array_start;
2944 mbcset->range_ends = new_array_end;
2945 *range_alloc = new_nranges;
2948 mbcset->range_starts[mbcset->nranges] = start_collseq;
2949 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2952 /* Build the table for single byte characters. */
2953 for (ch = 0; ch < SBC_MAX; ch++)
2955 uint32_t ch_collseq;
2957 if (MB_CUR_MAX == 1)
2959 if (nrules == 0)
2960 ch_collseq = collseqmb[ch];
2961 else
2962 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2963 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2964 bitset_set (sbcset, ch);
2966 return REG_NOERROR;
2969 /* Local function for parse_bracket_exp used in _LIBC environement.
2970 Build the collating element which is represented by NAME.
2971 The result are written to MBCSET and SBCSET.
2972 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2973 pointer argument sinse we may update it. */
2975 auto inline reg_errcode_t
2976 __attribute ((always_inline))
2977 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2978 int *coll_sym_alloc, const unsigned char *name)
2980 int32_t elem, idx;
2981 size_t name_len = strlen ((const char *) name);
2982 if (nrules != 0)
2984 elem = seek_collating_symbol_entry (name, name_len);
2985 if (elem != -1)
2987 /* We found the entry. */
2988 idx = symb_table[2 * elem + 1];
2989 /* Skip the name of collating element name. */
2990 idx += 1 + extra[idx];
2992 else if (name_len == 1)
2994 /* No valid character, treat it as a normal
2995 character. */
2996 bitset_set (sbcset, name[0]);
2997 return REG_NOERROR;
2999 else
3000 return REG_ECOLLATE;
3002 /* Got valid collation sequence, add it as a new entry. */
3003 /* Check the space of the arrays. */
3004 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3006 /* Not enough, realloc it. */
3007 /* +1 in case of mbcset->ncoll_syms is 0. */
3008 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3009 /* Use realloc since mbcset->coll_syms is NULL
3010 if *alloc == 0. */
3011 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3012 new_coll_sym_alloc);
3013 if (BE (new_coll_syms == NULL, 0))
3014 return REG_ESPACE;
3015 mbcset->coll_syms = new_coll_syms;
3016 *coll_sym_alloc = new_coll_sym_alloc;
3018 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3019 return REG_NOERROR;
3021 else
3023 if (BE (name_len != 1, 0))
3024 return REG_ECOLLATE;
3025 else
3027 bitset_set (sbcset, name[0]);
3028 return REG_NOERROR;
3032 #endif
3034 re_token_t br_token;
3035 re_bitset_ptr_t sbcset;
3036 #ifdef RE_ENABLE_I18N
3037 re_charset_t *mbcset;
3038 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3039 int equiv_class_alloc = 0, char_class_alloc = 0;
3040 #endif /* not RE_ENABLE_I18N */
3041 int non_match = 0;
3042 bin_tree_t *work_tree;
3043 int token_len;
3044 int first_round = 1;
3045 #ifdef _LIBC
3046 collseqmb = (const unsigned char *)
3047 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3048 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3049 if (nrules)
3052 if (MB_CUR_MAX > 1)
3054 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3055 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3056 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3057 _NL_COLLATE_SYMB_TABLEMB);
3058 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3059 _NL_COLLATE_SYMB_EXTRAMB);
3061 #endif
3062 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3063 #ifdef RE_ENABLE_I18N
3064 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3065 #endif /* RE_ENABLE_I18N */
3066 #ifdef RE_ENABLE_I18N
3067 if (BE (sbcset == NULL || mbcset == NULL, 0))
3068 #else
3069 if (BE (sbcset == NULL, 0))
3070 #endif /* RE_ENABLE_I18N */
3072 re_free (sbcset);
3073 #ifdef RE_ENABLE_I18N
3074 re_free (mbcset);
3075 #endif
3076 *err = REG_ESPACE;
3077 return NULL;
3080 token_len = peek_token_bracket (token, regexp, syntax);
3081 if (BE (token->type == END_OF_RE, 0))
3083 *err = REG_BADPAT;
3084 goto parse_bracket_exp_free_return;
3086 if (token->type == OP_NON_MATCH_LIST)
3088 #ifdef RE_ENABLE_I18N
3089 mbcset->non_match = 1;
3090 #endif /* not RE_ENABLE_I18N */
3091 non_match = 1;
3092 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3093 bitset_set (sbcset, '\n');
3094 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3095 token_len = peek_token_bracket (token, regexp, syntax);
3096 if (BE (token->type == END_OF_RE, 0))
3098 *err = REG_BADPAT;
3099 goto parse_bracket_exp_free_return;
3103 /* We treat the first ']' as a normal character. */
3104 if (token->type == OP_CLOSE_BRACKET)
3105 token->type = CHARACTER;
3107 while (1)
3109 bracket_elem_t start_elem, end_elem;
3110 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3111 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3112 reg_errcode_t ret;
3113 int token_len2 = 0, is_range_exp = 0;
3114 re_token_t token2;
3116 start_elem.opr.name = start_name_buf;
3117 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3118 syntax, first_round);
3119 if (BE (ret != REG_NOERROR, 0))
3121 *err = ret;
3122 goto parse_bracket_exp_free_return;
3124 first_round = 0;
3126 /* Get information about the next token. We need it in any case. */
3127 token_len = peek_token_bracket (token, regexp, syntax);
3129 /* Do not check for ranges if we know they are not allowed. */
3130 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3132 if (BE (token->type == END_OF_RE, 0))
3134 *err = REG_EBRACK;
3135 goto parse_bracket_exp_free_return;
3137 if (token->type == OP_CHARSET_RANGE)
3139 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3140 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3141 if (BE (token2.type == END_OF_RE, 0))
3143 *err = REG_EBRACK;
3144 goto parse_bracket_exp_free_return;
3146 if (token2.type == OP_CLOSE_BRACKET)
3148 /* We treat the last '-' as a normal character. */
3149 re_string_skip_bytes (regexp, -token_len);
3150 token->type = CHARACTER;
3152 else
3153 is_range_exp = 1;
3157 if (is_range_exp == 1)
3159 end_elem.opr.name = end_name_buf;
3160 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3161 dfa, syntax, 1);
3162 if (BE (ret != REG_NOERROR, 0))
3164 *err = ret;
3165 goto parse_bracket_exp_free_return;
3168 token_len = peek_token_bracket (token, regexp, syntax);
3170 #ifdef _LIBC
3171 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3172 &start_elem, &end_elem);
3173 #else
3174 # ifdef RE_ENABLE_I18N
3175 *err = build_range_exp (sbcset,
3176 dfa->mb_cur_max > 1 ? mbcset : NULL,
3177 &range_alloc, &start_elem, &end_elem);
3178 # else
3179 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3180 # endif
3181 #endif /* RE_ENABLE_I18N */
3182 if (BE (*err != REG_NOERROR, 0))
3183 goto parse_bracket_exp_free_return;
3185 else
3187 switch (start_elem.type)
3189 case SB_CHAR:
3190 bitset_set (sbcset, start_elem.opr.ch);
3191 break;
3192 #ifdef RE_ENABLE_I18N
3193 case MB_CHAR:
3194 /* Check whether the array has enough space. */
3195 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3197 wchar_t *new_mbchars;
3198 /* Not enough, realloc it. */
3199 /* +1 in case of mbcset->nmbchars is 0. */
3200 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3201 /* Use realloc since array is NULL if *alloc == 0. */
3202 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3203 mbchar_alloc);
3204 if (BE (new_mbchars == NULL, 0))
3205 goto parse_bracket_exp_espace;
3206 mbcset->mbchars = new_mbchars;
3208 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3209 break;
3210 #endif /* RE_ENABLE_I18N */
3211 case EQUIV_CLASS:
3212 *err = build_equiv_class (sbcset,
3213 #ifdef RE_ENABLE_I18N
3214 mbcset, &equiv_class_alloc,
3215 #endif /* RE_ENABLE_I18N */
3216 start_elem.opr.name);
3217 if (BE (*err != REG_NOERROR, 0))
3218 goto parse_bracket_exp_free_return;
3219 break;
3220 case COLL_SYM:
3221 *err = build_collating_symbol (sbcset,
3222 #ifdef RE_ENABLE_I18N
3223 mbcset, &coll_sym_alloc,
3224 #endif /* RE_ENABLE_I18N */
3225 start_elem.opr.name);
3226 if (BE (*err != REG_NOERROR, 0))
3227 goto parse_bracket_exp_free_return;
3228 break;
3229 case CHAR_CLASS:
3230 *err = build_charclass (regexp->trans, sbcset,
3231 #ifdef RE_ENABLE_I18N
3232 mbcset, &char_class_alloc,
3233 #endif /* RE_ENABLE_I18N */
3234 start_elem.opr.name, syntax);
3235 if (BE (*err != REG_NOERROR, 0))
3236 goto parse_bracket_exp_free_return;
3237 break;
3238 default:
3239 assert (0);
3240 break;
3243 if (BE (token->type == END_OF_RE, 0))
3245 *err = REG_EBRACK;
3246 goto parse_bracket_exp_free_return;
3248 if (token->type == OP_CLOSE_BRACKET)
3249 break;
3252 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3254 /* If it is non-matching list. */
3255 if (non_match)
3256 bitset_not (sbcset);
3258 #ifdef RE_ENABLE_I18N
3259 /* Ensure only single byte characters are set. */
3260 if (dfa->mb_cur_max > 1)
3261 bitset_mask (sbcset, dfa->sb_char);
3263 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3264 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3265 || mbcset->non_match)))
3267 bin_tree_t *mbc_tree;
3268 int sbc_idx;
3269 /* Build a tree for complex bracket. */
3270 dfa->has_mb_node = 1;
3271 br_token.type = COMPLEX_BRACKET;
3272 br_token.opr.mbcset = mbcset;
3273 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3274 if (BE (mbc_tree == NULL, 0))
3275 goto parse_bracket_exp_espace;
3276 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3277 if (sbcset[sbc_idx])
3278 break;
3279 /* If there are no bits set in sbcset, there is no point
3280 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3281 if (sbc_idx < BITSET_WORDS)
3283 /* Build a tree for simple bracket. */
3284 br_token.type = SIMPLE_BRACKET;
3285 br_token.opr.sbcset = sbcset;
3286 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3287 if (BE (work_tree == NULL, 0))
3288 goto parse_bracket_exp_espace;
3290 /* Then join them by ALT node. */
3291 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3292 if (BE (work_tree == NULL, 0))
3293 goto parse_bracket_exp_espace;
3295 else
3297 re_free (sbcset);
3298 work_tree = mbc_tree;
3301 else
3302 #endif /* not RE_ENABLE_I18N */
3304 #ifdef RE_ENABLE_I18N
3305 free_charset (mbcset);
3306 #endif
3307 /* Build a tree for simple bracket. */
3308 br_token.type = SIMPLE_BRACKET;
3309 br_token.opr.sbcset = sbcset;
3310 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3311 if (BE (work_tree == NULL, 0))
3312 goto parse_bracket_exp_espace;
3314 return work_tree;
3316 parse_bracket_exp_espace:
3317 *err = REG_ESPACE;
3318 parse_bracket_exp_free_return:
3319 re_free (sbcset);
3320 #ifdef RE_ENABLE_I18N
3321 free_charset (mbcset);
3322 #endif /* RE_ENABLE_I18N */
3323 return NULL;
3326 /* Parse an element in the bracket expression. */
3328 static reg_errcode_t
3329 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3330 re_token_t *token, int token_len, re_dfa_t *dfa,
3331 reg_syntax_t syntax, int accept_hyphen)
3333 #ifdef RE_ENABLE_I18N
3334 int cur_char_size;
3335 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3336 if (cur_char_size > 1)
3338 elem->type = MB_CHAR;
3339 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3340 re_string_skip_bytes (regexp, cur_char_size);
3341 return REG_NOERROR;
3343 #endif /* RE_ENABLE_I18N */
3344 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3345 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3346 || token->type == OP_OPEN_EQUIV_CLASS)
3347 return parse_bracket_symbol (elem, regexp, token);
3348 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3350 /* A '-' must only appear as anything but a range indicator before
3351 the closing bracket. Everything else is an error. */
3352 re_token_t token2;
3353 (void) peek_token_bracket (&token2, regexp, syntax);
3354 if (token2.type != OP_CLOSE_BRACKET)
3355 /* The actual error value is not standardized since this whole
3356 case is undefined. But ERANGE makes good sense. */
3357 return REG_ERANGE;
3359 elem->type = SB_CHAR;
3360 elem->opr.ch = token->opr.c;
3361 return REG_NOERROR;
3364 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3365 such as [:<character_class>:], [.<collating_element>.], and
3366 [=<equivalent_class>=]. */
3368 static reg_errcode_t
3369 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3370 re_token_t *token)
3372 unsigned char ch, delim = token->opr.c;
3373 int i = 0;
3374 if (re_string_eoi(regexp))
3375 return REG_EBRACK;
3376 for (;; ++i)
3378 if (i >= BRACKET_NAME_BUF_SIZE)
3379 return REG_EBRACK;
3380 if (token->type == OP_OPEN_CHAR_CLASS)
3381 ch = re_string_fetch_byte_case (regexp);
3382 else
3383 ch = re_string_fetch_byte (regexp);
3384 if (re_string_eoi(regexp))
3385 return REG_EBRACK;
3386 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3387 break;
3388 elem->opr.name[i] = ch;
3390 re_string_skip_bytes (regexp, 1);
3391 elem->opr.name[i] = '\0';
3392 switch (token->type)
3394 case OP_OPEN_COLL_ELEM:
3395 elem->type = COLL_SYM;
3396 break;
3397 case OP_OPEN_EQUIV_CLASS:
3398 elem->type = EQUIV_CLASS;
3399 break;
3400 case OP_OPEN_CHAR_CLASS:
3401 elem->type = CHAR_CLASS;
3402 break;
3403 default:
3404 break;
3406 return REG_NOERROR;
3409 /* Helper function for parse_bracket_exp.
3410 Build the equivalence class which is represented by NAME.
3411 The result are written to MBCSET and SBCSET.
3412 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3413 is a pointer argument sinse we may update it. */
3415 static reg_errcode_t
3416 #ifdef RE_ENABLE_I18N
3417 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3418 int *equiv_class_alloc, const unsigned char *name)
3419 #else /* not RE_ENABLE_I18N */
3420 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3421 #endif /* not RE_ENABLE_I18N */
3423 #ifdef _LIBC
3424 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3425 if (nrules != 0)
3427 const int32_t *table, *indirect;
3428 const unsigned char *weights, *extra, *cp;
3429 unsigned char char_buf[2];
3430 int32_t idx1, idx2;
3431 unsigned int ch;
3432 size_t len;
3433 /* Calculate the index for equivalence class. */
3434 cp = name;
3435 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3436 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3437 _NL_COLLATE_WEIGHTMB);
3438 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3439 _NL_COLLATE_EXTRAMB);
3440 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3441 _NL_COLLATE_INDIRECTMB);
3442 idx1 = findidx (table, indirect, extra, &cp, -1);
3443 if (BE (idx1 == 0 || *cp != '\0', 0))
3444 /* This isn't a valid character. */
3445 return REG_ECOLLATE;
3447 /* Build single byte matcing table for this equivalence class. */
3448 len = weights[idx1 & 0xffffff];
3449 for (ch = 0; ch < SBC_MAX; ++ch)
3451 char_buf[0] = ch;
3452 cp = char_buf;
3453 idx2 = findidx (table, indirect, extra, &cp, 1);
3455 idx2 = table[ch];
3457 if (idx2 == 0)
3458 /* This isn't a valid character. */
3459 continue;
3460 /* Compare only if the length matches and the collation rule
3461 index is the same. */
3462 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3464 int cnt = 0;
3466 while (cnt <= len &&
3467 weights[(idx1 & 0xffffff) + 1 + cnt]
3468 == weights[(idx2 & 0xffffff) + 1 + cnt])
3469 ++cnt;
3471 if (cnt > len)
3472 bitset_set (sbcset, ch);
3475 /* Check whether the array has enough space. */
3476 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3478 /* Not enough, realloc it. */
3479 /* +1 in case of mbcset->nequiv_classes is 0. */
3480 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3481 /* Use realloc since the array is NULL if *alloc == 0. */
3482 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3483 int32_t,
3484 new_equiv_class_alloc);
3485 if (BE (new_equiv_classes == NULL, 0))
3486 return REG_ESPACE;
3487 mbcset->equiv_classes = new_equiv_classes;
3488 *equiv_class_alloc = new_equiv_class_alloc;
3490 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3492 else
3493 #endif /* _LIBC */
3495 if (BE (strlen ((const char *) name) != 1, 0))
3496 return REG_ECOLLATE;
3497 bitset_set (sbcset, *name);
3499 return REG_NOERROR;
3502 /* Helper function for parse_bracket_exp.
3503 Build the character class which is represented by NAME.
3504 The result are written to MBCSET and SBCSET.
3505 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3506 is a pointer argument sinse we may update it. */
3508 static reg_errcode_t
3509 #ifdef RE_ENABLE_I18N
3510 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3511 re_charset_t *mbcset, int *char_class_alloc,
3512 const unsigned char *class_name, reg_syntax_t syntax)
3513 #else /* not RE_ENABLE_I18N */
3514 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3515 const unsigned char *class_name, reg_syntax_t syntax)
3516 #endif /* not RE_ENABLE_I18N */
3518 int i;
3519 const char *name = (const char *) class_name;
3521 /* In case of REG_ICASE "upper" and "lower" match the both of
3522 upper and lower cases. */
3523 if ((syntax & RE_ICASE)
3524 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3525 name = "alpha";
3527 #ifdef RE_ENABLE_I18N
3528 /* Check the space of the arrays. */
3529 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3531 /* Not enough, realloc it. */
3532 /* +1 in case of mbcset->nchar_classes is 0. */
3533 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3534 /* Use realloc since array is NULL if *alloc == 0. */
3535 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3536 new_char_class_alloc);
3537 if (BE (new_char_classes == NULL, 0))
3538 return REG_ESPACE;
3539 mbcset->char_classes = new_char_classes;
3540 *char_class_alloc = new_char_class_alloc;
3542 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3543 #endif /* RE_ENABLE_I18N */
3545 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3546 do { \
3547 if (BE (trans != NULL, 0)) \
3549 for (i = 0; i < SBC_MAX; ++i) \
3550 if (ctype_func (i)) \
3551 bitset_set (sbcset, trans[i]); \
3553 else \
3555 for (i = 0; i < SBC_MAX; ++i) \
3556 if (ctype_func (i)) \
3557 bitset_set (sbcset, i); \
3559 } while (0)
3561 if (strcmp (name, "alnum") == 0)
3562 BUILD_CHARCLASS_LOOP (isalnum);
3563 else if (strcmp (name, "cntrl") == 0)
3564 BUILD_CHARCLASS_LOOP (iscntrl);
3565 else if (strcmp (name, "lower") == 0)
3566 BUILD_CHARCLASS_LOOP (islower);
3567 else if (strcmp (name, "space") == 0)
3568 BUILD_CHARCLASS_LOOP (isspace);
3569 else if (strcmp (name, "alpha") == 0)
3570 BUILD_CHARCLASS_LOOP (isalpha);
3571 else if (strcmp (name, "digit") == 0)
3572 BUILD_CHARCLASS_LOOP (isdigit);
3573 else if (strcmp (name, "print") == 0)
3574 BUILD_CHARCLASS_LOOP (isprint);
3575 else if (strcmp (name, "upper") == 0)
3576 BUILD_CHARCLASS_LOOP (isupper);
3577 else if (strcmp (name, "blank") == 0)
3578 BUILD_CHARCLASS_LOOP (isblank);
3579 else if (strcmp (name, "graph") == 0)
3580 BUILD_CHARCLASS_LOOP (isgraph);
3581 else if (strcmp (name, "punct") == 0)
3582 BUILD_CHARCLASS_LOOP (ispunct);
3583 else if (strcmp (name, "xdigit") == 0)
3584 BUILD_CHARCLASS_LOOP (isxdigit);
3585 else
3586 return REG_ECTYPE;
3588 return REG_NOERROR;
3591 static bin_tree_t *
3592 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3593 const unsigned char *class_name,
3594 const unsigned char *extra, int non_match,
3595 reg_errcode_t *err)
3597 re_bitset_ptr_t sbcset;
3598 #ifdef RE_ENABLE_I18N
3599 re_charset_t *mbcset;
3600 int alloc = 0;
3601 #endif /* not RE_ENABLE_I18N */
3602 reg_errcode_t ret;
3603 re_token_t br_token;
3604 bin_tree_t *tree;
3606 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3607 #ifdef RE_ENABLE_I18N
3608 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3609 #endif /* RE_ENABLE_I18N */
3611 #ifdef RE_ENABLE_I18N
3612 if (BE (sbcset == NULL || mbcset == NULL, 0))
3613 #else /* not RE_ENABLE_I18N */
3614 if (BE (sbcset == NULL, 0))
3615 #endif /* not RE_ENABLE_I18N */
3617 *err = REG_ESPACE;
3618 return NULL;
3621 if (non_match)
3623 #ifdef RE_ENABLE_I18N
3624 mbcset->non_match = 1;
3625 #endif /* not RE_ENABLE_I18N */
3628 /* We don't care the syntax in this case. */
3629 ret = build_charclass (trans, sbcset,
3630 #ifdef RE_ENABLE_I18N
3631 mbcset, &alloc,
3632 #endif /* RE_ENABLE_I18N */
3633 class_name, 0);
3635 if (BE (ret != REG_NOERROR, 0))
3637 re_free (sbcset);
3638 #ifdef RE_ENABLE_I18N
3639 free_charset (mbcset);
3640 #endif /* RE_ENABLE_I18N */
3641 *err = ret;
3642 return NULL;
3644 /* \w match '_' also. */
3645 for (; *extra; extra++)
3646 bitset_set (sbcset, *extra);
3648 /* If it is non-matching list. */
3649 if (non_match)
3650 bitset_not (sbcset);
3652 #ifdef RE_ENABLE_I18N
3653 /* Ensure only single byte characters are set. */
3654 if (dfa->mb_cur_max > 1)
3655 bitset_mask (sbcset, dfa->sb_char);
3656 #endif
3658 /* Build a tree for simple bracket. */
3659 br_token.type = SIMPLE_BRACKET;
3660 br_token.opr.sbcset = sbcset;
3661 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3662 if (BE (tree == NULL, 0))
3663 goto build_word_op_espace;
3665 #ifdef RE_ENABLE_I18N
3666 if (dfa->mb_cur_max > 1)
3668 bin_tree_t *mbc_tree;
3669 /* Build a tree for complex bracket. */
3670 br_token.type = COMPLEX_BRACKET;
3671 br_token.opr.mbcset = mbcset;
3672 dfa->has_mb_node = 1;
3673 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3674 if (BE (mbc_tree == NULL, 0))
3675 goto build_word_op_espace;
3676 /* Then join them by ALT node. */
3677 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3678 if (BE (mbc_tree != NULL, 1))
3679 return tree;
3681 else
3683 free_charset (mbcset);
3684 return tree;
3686 #else /* not RE_ENABLE_I18N */
3687 return tree;
3688 #endif /* not RE_ENABLE_I18N */
3690 build_word_op_espace:
3691 re_free (sbcset);
3692 #ifdef RE_ENABLE_I18N
3693 free_charset (mbcset);
3694 #endif /* RE_ENABLE_I18N */
3695 *err = REG_ESPACE;
3696 return NULL;
3699 /* This is intended for the expressions like "a{1,3}".
3700 Fetch a number from `input', and return the number.
3701 Return -1, if the number field is empty like "{,1}".
3702 Return -2, If an error is occured. */
3704 static int
3705 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3707 int num = -1;
3708 unsigned char c;
3709 while (1)
3711 fetch_token (token, input, syntax);
3712 c = token->opr.c;
3713 if (BE (token->type == END_OF_RE, 0))
3714 return -2;
3715 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3716 break;
3717 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3718 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3719 num = (num > RE_DUP_MAX) ? -2 : num;
3721 return num;
3724 #ifdef RE_ENABLE_I18N
3725 static void
3726 free_charset (re_charset_t *cset)
3728 re_free (cset->mbchars);
3729 # ifdef _LIBC
3730 re_free (cset->coll_syms);
3731 re_free (cset->equiv_classes);
3732 re_free (cset->range_starts);
3733 re_free (cset->range_ends);
3734 # endif
3735 re_free (cset->char_classes);
3736 re_free (cset);
3738 #endif /* RE_ENABLE_I18N */
3740 /* Functions for binary tree operation. */
3742 /* Create a tree node. */
3744 static bin_tree_t *
3745 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3746 re_token_type_t type)
3748 re_token_t t;
3749 t.type = type;
3750 return create_token_tree (dfa, left, right, &t);
3753 static bin_tree_t *
3754 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3755 const re_token_t *token)
3757 bin_tree_t *tree;
3758 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3760 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3762 if (storage == NULL)
3763 return NULL;
3764 storage->next = dfa->str_tree_storage;
3765 dfa->str_tree_storage = storage;
3766 dfa->str_tree_storage_idx = 0;
3768 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3770 tree->parent = NULL;
3771 tree->left = left;
3772 tree->right = right;
3773 tree->token = *token;
3774 tree->token.duplicated = 0;
3775 tree->token.opt_subexp = 0;
3776 tree->first = NULL;
3777 tree->next = NULL;
3778 tree->node_idx = -1;
3780 if (left != NULL)
3781 left->parent = tree;
3782 if (right != NULL)
3783 right->parent = tree;
3784 return tree;
3787 /* Mark the tree SRC as an optional subexpression.
3788 To be called from preorder or postorder. */
3790 static reg_errcode_t
3791 mark_opt_subexp (void *extra, bin_tree_t *node)
3793 int idx = (int) (long) extra;
3794 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3795 node->token.opt_subexp = 1;
3797 return REG_NOERROR;
3800 /* Free the allocated memory inside NODE. */
3802 static void
3803 free_token (re_token_t *node)
3805 #ifdef RE_ENABLE_I18N
3806 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3807 free_charset (node->opr.mbcset);
3808 else
3809 #endif /* RE_ENABLE_I18N */
3810 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3811 re_free (node->opr.sbcset);
3814 /* Worker function for tree walking. Free the allocated memory inside NODE
3815 and its children. */
3817 static reg_errcode_t
3818 free_tree (void *extra, bin_tree_t *node)
3820 free_token (&node->token);
3821 return REG_NOERROR;
3825 /* Duplicate the node SRC, and return new node. This is a preorder
3826 visit similar to the one implemented by the generic visitor, but
3827 we need more infrastructure to maintain two parallel trees --- so,
3828 it's easier to duplicate. */
3830 static bin_tree_t *
3831 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3833 const bin_tree_t *node;
3834 bin_tree_t *dup_root;
3835 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3837 for (node = root; ; )
3839 /* Create a new tree and link it back to the current parent. */
3840 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3841 if (*p_new == NULL)
3842 return NULL;
3843 (*p_new)->parent = dup_node;
3844 (*p_new)->token.duplicated = 1;
3845 dup_node = *p_new;
3847 /* Go to the left node, or up and to the right. */
3848 if (node->left)
3850 node = node->left;
3851 p_new = &dup_node->left;
3853 else
3855 const bin_tree_t *prev = NULL;
3856 while (node->right == prev || node->right == NULL)
3858 prev = node;
3859 node = node->parent;
3860 dup_node = dup_node->parent;
3861 if (!node)
3862 return dup_root;
3864 node = node->right;
3865 p_new = &dup_node->right;