Add a testcase for BZ #14716
[glibc.git] / posix / regcomp.c
blobe85b2351459c438118755eef5c2916e3c1105742
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010,2011,2012 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 size_t length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
24 char *fastmap);
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26 #ifdef RE_ENABLE_I18N
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31 #ifdef RE_ENABLE_I18N
32 static void optimize_utf8 (re_dfa_t *dfa);
33 #endif
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
37 void *extra);
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 bin_tree_t *node);
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
49 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 int node, int root);
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static int fetch_number (re_string_t *input, re_token_t *token,
56 reg_syntax_t syntax);
57 static int peek_token (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax) internal_function;
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 int nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 int nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
78 reg_errcode_t *err);
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 re_string_t *regexp,
81 re_token_t *token, int token_len,
82 re_dfa_t *dfa,
83 reg_syntax_t syntax,
84 int accept_hyphen);
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86 re_string_t *regexp,
87 re_token_t *token);
88 #ifdef RE_ENABLE_I18N
89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
90 re_charset_t *mbcset,
91 int *equiv_class_alloc,
92 const unsigned char *name);
93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
94 bitset_t sbcset,
95 re_charset_t *mbcset,
96 int *char_class_alloc,
97 const unsigned char *class_name,
98 reg_syntax_t syntax);
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
101 const unsigned char *name);
102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103 bitset_t sbcset,
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 RE_TRANSLATE_TYPE trans,
109 const unsigned char *class_name,
110 const unsigned char *extra,
111 int non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 const char __re_error_msgid[] attribute_hidden =
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
132 "\0"
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
135 "\0"
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138 "\0"
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141 "\0"
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144 "\0"
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147 "\0"
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150 "\0"
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
153 "\0"
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156 "\0"
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159 "\0"
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162 "\0"
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
165 "\0"
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
168 "\0"
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171 "\0"
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
174 "\0"
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
177 "\0"
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 const size_t __re_error_msgid_idx[] attribute_hidden =
184 REG_NOERROR_IDX,
185 REG_NOMATCH_IDX,
186 REG_BADPAT_IDX,
187 REG_ECOLLATE_IDX,
188 REG_ECTYPE_IDX,
189 REG_EESCAPE_IDX,
190 REG_ESUBREG_IDX,
191 REG_EBRACK_IDX,
192 REG_EPAREN_IDX,
193 REG_EBRACE_IDX,
194 REG_BADBR_IDX,
195 REG_ERANGE_IDX,
196 REG_ESPACE_IDX,
197 REG_BADRPT_IDX,
198 REG_EEND_IDX,
199 REG_ESIZE_IDX,
200 REG_ERPAREN_IDX
203 /* Entry points for GNU code. */
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
209 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
210 are set in BUFP on entry. */
212 const char *
213 re_compile_pattern (pattern, length, bufp)
214 const char *pattern;
215 size_t length;
216 struct re_pattern_buffer *bufp;
218 reg_errcode_t ret;
220 /* And GNU code determines whether or not to get register information
221 by passing null for the REGS argument to re_match, etc., not by
222 setting no_sub, unless RE_NO_SUB is set. */
223 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
225 /* Match anchors at newline. */
226 bufp->newline_anchor = 1;
228 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
230 if (!ret)
231 return NULL;
232 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
234 #ifdef _LIBC
235 weak_alias (__re_compile_pattern, re_compile_pattern)
236 #endif
238 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
239 also be assigned to arbitrarily: each pattern buffer stores its own
240 syntax, so it can be changed between regex compilations. */
241 /* This has no initializer because initialized variables in Emacs
242 become read-only after dumping. */
243 reg_syntax_t re_syntax_options;
246 /* Specify the precise syntax of regexps for compilation. This provides
247 for compatibility for various utilities which historically have
248 different, incompatible syntaxes.
250 The argument SYNTAX is a bit mask comprised of the various bits
251 defined in regex.h. We return the old syntax. */
253 reg_syntax_t
254 re_set_syntax (syntax)
255 reg_syntax_t syntax;
257 reg_syntax_t ret = re_syntax_options;
259 re_syntax_options = syntax;
260 return ret;
262 #ifdef _LIBC
263 weak_alias (__re_set_syntax, re_set_syntax)
264 #endif
267 re_compile_fastmap (bufp)
268 struct re_pattern_buffer *bufp;
270 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
271 char *fastmap = bufp->fastmap;
273 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
274 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
275 if (dfa->init_state != dfa->init_state_word)
276 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
277 if (dfa->init_state != dfa->init_state_nl)
278 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
279 if (dfa->init_state != dfa->init_state_begbuf)
280 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
281 bufp->fastmap_accurate = 1;
282 return 0;
284 #ifdef _LIBC
285 weak_alias (__re_compile_fastmap, re_compile_fastmap)
286 #endif
288 static inline void
289 __attribute ((always_inline))
290 re_set_fastmap (char *fastmap, int icase, int ch)
292 fastmap[ch] = 1;
293 if (icase)
294 fastmap[tolower (ch)] = 1;
297 /* Helper function for re_compile_fastmap.
298 Compile fastmap for the initial_state INIT_STATE. */
300 static void
301 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
302 char *fastmap)
304 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
305 int node_cnt;
306 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
307 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
309 int node = init_state->nodes.elems[node_cnt];
310 re_token_type_t type = dfa->nodes[node].type;
312 if (type == CHARACTER)
314 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
315 #ifdef RE_ENABLE_I18N
316 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
318 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
319 wchar_t wc;
320 mbstate_t state;
322 p = buf;
323 *p++ = dfa->nodes[node].opr.c;
324 while (++node < dfa->nodes_len
325 && dfa->nodes[node].type == CHARACTER
326 && dfa->nodes[node].mb_partial)
327 *p++ = dfa->nodes[node].opr.c;
328 memset (&state, '\0', sizeof (state));
329 if (__mbrtowc (&wc, (const char *) buf, p - buf,
330 &state) == p - buf
331 && (__wcrtomb ((char *) buf, towlower (wc), &state)
332 != (size_t) -1))
333 re_set_fastmap (fastmap, 0, buf[0]);
335 #endif
337 else if (type == SIMPLE_BRACKET)
339 int i, ch;
340 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
342 int j;
343 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
344 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
345 if (w & ((bitset_word_t) 1 << j))
346 re_set_fastmap (fastmap, icase, ch);
349 #ifdef RE_ENABLE_I18N
350 else if (type == COMPLEX_BRACKET)
352 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
353 int i;
355 # ifdef _LIBC
356 /* See if we have to try all bytes which start multiple collation
357 elements.
358 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
359 collation element, and don't catch 'b' since 'b' is
360 the only collation element which starts from 'b' (and
361 it is caught by SIMPLE_BRACKET). */
362 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
363 && (cset->ncoll_syms || cset->nranges))
365 const int32_t *table = (const int32_t *)
366 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
367 for (i = 0; i < SBC_MAX; ++i)
368 if (table[i] < 0)
369 re_set_fastmap (fastmap, icase, i);
371 # endif /* _LIBC */
373 /* See if we have to start the match at all multibyte characters,
374 i.e. where we would not find an invalid sequence. This only
375 applies to multibyte character sets; for single byte character
376 sets, the SIMPLE_BRACKET again suffices. */
377 if (dfa->mb_cur_max > 1
378 && (cset->nchar_classes || cset->non_match || cset->nranges
379 # ifdef _LIBC
380 || cset->nequiv_classes
381 # endif /* _LIBC */
384 unsigned char c = 0;
387 mbstate_t mbs;
388 memset (&mbs, 0, sizeof (mbs));
389 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
390 re_set_fastmap (fastmap, false, (int) c);
392 while (++c != 0);
395 else
397 /* ... Else catch all bytes which can start the mbchars. */
398 for (i = 0; i < cset->nmbchars; ++i)
400 char buf[256];
401 mbstate_t state;
402 memset (&state, '\0', sizeof (state));
403 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
404 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
405 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
407 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
408 != (size_t) -1)
409 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
414 #endif /* RE_ENABLE_I18N */
415 else if (type == OP_PERIOD
416 #ifdef RE_ENABLE_I18N
417 || type == OP_UTF8_PERIOD
418 #endif /* RE_ENABLE_I18N */
419 || type == END_OF_RE)
421 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
422 if (type == END_OF_RE)
423 bufp->can_be_null = 1;
424 return;
429 /* Entry point for POSIX code. */
430 /* regcomp takes a regular expression as a string and compiles it.
432 PREG is a regex_t *. We do not expect any fields to be initialized,
433 since POSIX says we shouldn't. Thus, we set
435 `buffer' to the compiled pattern;
436 `used' to the length of the compiled pattern;
437 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
438 REG_EXTENDED bit in CFLAGS is set; otherwise, to
439 RE_SYNTAX_POSIX_BASIC;
440 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
441 `fastmap' to an allocated space for the fastmap;
442 `fastmap_accurate' to zero;
443 `re_nsub' to the number of subexpressions in PATTERN.
445 PATTERN is the address of the pattern string.
447 CFLAGS is a series of bits which affect compilation.
449 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
450 use POSIX basic syntax.
452 If REG_NEWLINE is set, then . and [^...] don't match newline.
453 Also, regexec will try a match beginning after every newline.
455 If REG_ICASE is set, then we considers upper- and lowercase
456 versions of letters to be equivalent when matching.
458 If REG_NOSUB is set, then when PREG is passed to regexec, that
459 routine will report only success or failure, and nothing about the
460 registers.
462 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
463 the return codes and their meanings.) */
466 regcomp (preg, pattern, cflags)
467 regex_t *__restrict preg;
468 const char *__restrict pattern;
469 int cflags;
471 reg_errcode_t ret;
472 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
473 : RE_SYNTAX_POSIX_BASIC);
475 preg->buffer = NULL;
476 preg->allocated = 0;
477 preg->used = 0;
479 /* Try to allocate space for the fastmap. */
480 preg->fastmap = re_malloc (char, SBC_MAX);
481 if (BE (preg->fastmap == NULL, 0))
482 return REG_ESPACE;
484 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
486 /* If REG_NEWLINE is set, newlines are treated differently. */
487 if (cflags & REG_NEWLINE)
488 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
489 syntax &= ~RE_DOT_NEWLINE;
490 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
491 /* It also changes the matching behavior. */
492 preg->newline_anchor = 1;
494 else
495 preg->newline_anchor = 0;
496 preg->no_sub = !!(cflags & REG_NOSUB);
497 preg->translate = NULL;
499 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
501 /* POSIX doesn't distinguish between an unmatched open-group and an
502 unmatched close-group: both are REG_EPAREN. */
503 if (ret == REG_ERPAREN)
504 ret = REG_EPAREN;
506 /* We have already checked preg->fastmap != NULL. */
507 if (BE (ret == REG_NOERROR, 1))
508 /* Compute the fastmap now, since regexec cannot modify the pattern
509 buffer. This function never fails in this implementation. */
510 (void) re_compile_fastmap (preg);
511 else
513 /* Some error occurred while compiling the expression. */
514 re_free (preg->fastmap);
515 preg->fastmap = NULL;
518 return (int) ret;
520 #ifdef _LIBC
521 weak_alias (__regcomp, regcomp)
522 #endif
524 /* Returns a message corresponding to an error code, ERRCODE, returned
525 from either regcomp or regexec. We don't use PREG here. */
527 size_t
528 regerror (errcode, preg, errbuf, errbuf_size)
529 int errcode;
530 const regex_t *__restrict preg;
531 char *__restrict errbuf;
532 size_t errbuf_size;
534 const char *msg;
535 size_t msg_size;
537 if (BE (errcode < 0
538 || errcode >= (int) (sizeof (__re_error_msgid_idx)
539 / sizeof (__re_error_msgid_idx[0])), 0))
540 /* Only error codes returned by the rest of the code should be passed
541 to this routine. If we are given anything else, or if other regex
542 code generates an invalid error code, then the program has a bug.
543 Dump core so we can fix it. */
544 abort ();
546 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
548 msg_size = strlen (msg) + 1; /* Includes the null. */
550 if (BE (errbuf_size != 0, 1))
552 if (BE (msg_size > errbuf_size, 0))
554 #if defined HAVE_MEMPCPY || defined _LIBC
555 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
556 #else
557 memcpy (errbuf, msg, errbuf_size - 1);
558 errbuf[errbuf_size - 1] = 0;
559 #endif
561 else
562 memcpy (errbuf, msg, msg_size);
565 return msg_size;
567 #ifdef _LIBC
568 weak_alias (__regerror, regerror)
569 #endif
572 #ifdef RE_ENABLE_I18N
573 /* This static array is used for the map to single-byte characters when
574 UTF-8 is used. Otherwise we would allocate memory just to initialize
575 it the same all the time. UTF-8 is the preferred encoding so this is
576 a worthwhile optimization. */
577 static const bitset_t utf8_sb_map =
579 /* Set the first 128 bits. */
580 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
582 #endif
585 static void
586 free_dfa_content (re_dfa_t *dfa)
588 int i, j;
590 if (dfa->nodes)
591 for (i = 0; i < dfa->nodes_len; ++i)
592 free_token (dfa->nodes + i);
593 re_free (dfa->nexts);
594 for (i = 0; i < dfa->nodes_len; ++i)
596 if (dfa->eclosures != NULL)
597 re_node_set_free (dfa->eclosures + i);
598 if (dfa->inveclosures != NULL)
599 re_node_set_free (dfa->inveclosures + i);
600 if (dfa->edests != NULL)
601 re_node_set_free (dfa->edests + i);
603 re_free (dfa->edests);
604 re_free (dfa->eclosures);
605 re_free (dfa->inveclosures);
606 re_free (dfa->nodes);
608 if (dfa->state_table)
609 for (i = 0; i <= dfa->state_hash_mask; ++i)
611 struct re_state_table_entry *entry = dfa->state_table + i;
612 for (j = 0; j < entry->num; ++j)
614 re_dfastate_t *state = entry->array[j];
615 free_state (state);
617 re_free (entry->array);
619 re_free (dfa->state_table);
620 #ifdef RE_ENABLE_I18N
621 if (dfa->sb_char != utf8_sb_map)
622 re_free (dfa->sb_char);
623 #endif
624 re_free (dfa->subexp_map);
625 #ifdef DEBUG
626 re_free (dfa->re_str);
627 #endif
629 re_free (dfa);
633 /* Free dynamically allocated space used by PREG. */
635 void
636 regfree (preg)
637 regex_t *preg;
639 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
640 if (BE (dfa != NULL, 1))
641 free_dfa_content (dfa);
642 preg->buffer = NULL;
643 preg->allocated = 0;
645 re_free (preg->fastmap);
646 preg->fastmap = NULL;
648 re_free (preg->translate);
649 preg->translate = NULL;
651 #ifdef _LIBC
652 weak_alias (__regfree, regfree)
653 #endif
655 /* Entry points compatible with 4.2 BSD regex library. We don't define
656 them unless specifically requested. */
658 #if defined _REGEX_RE_COMP || defined _LIBC
660 /* BSD has one and only one pattern buffer. */
661 static struct re_pattern_buffer re_comp_buf;
663 char *
664 # ifdef _LIBC
665 /* Make these definitions weak in libc, so POSIX programs can redefine
666 these names if they don't use our functions, and still use
667 regcomp/regexec above without link errors. */
668 weak_function
669 # endif
670 re_comp (s)
671 const char *s;
673 reg_errcode_t ret;
674 char *fastmap;
676 if (!s)
678 if (!re_comp_buf.buffer)
679 return gettext ("No previous regular expression");
680 return 0;
683 if (re_comp_buf.buffer)
685 fastmap = re_comp_buf.fastmap;
686 re_comp_buf.fastmap = NULL;
687 __regfree (&re_comp_buf);
688 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
689 re_comp_buf.fastmap = fastmap;
692 if (re_comp_buf.fastmap == NULL)
694 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
695 if (re_comp_buf.fastmap == NULL)
696 return (char *) gettext (__re_error_msgid
697 + __re_error_msgid_idx[(int) REG_ESPACE]);
700 /* Since `re_exec' always passes NULL for the `regs' argument, we
701 don't need to initialize the pattern buffer fields which affect it. */
703 /* Match anchors at newlines. */
704 re_comp_buf.newline_anchor = 1;
706 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
708 if (!ret)
709 return NULL;
711 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
712 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
715 #ifdef _LIBC
716 libc_freeres_fn (free_mem)
718 __regfree (&re_comp_buf);
720 #endif
722 #endif /* _REGEX_RE_COMP */
724 /* Internal entry point.
725 Compile the regular expression PATTERN, whose length is LENGTH.
726 SYNTAX indicate regular expression's syntax. */
728 static reg_errcode_t
729 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
730 reg_syntax_t syntax)
732 reg_errcode_t err = REG_NOERROR;
733 re_dfa_t *dfa;
734 re_string_t regexp;
736 /* Initialize the pattern buffer. */
737 preg->fastmap_accurate = 0;
738 preg->syntax = syntax;
739 preg->not_bol = preg->not_eol = 0;
740 preg->used = 0;
741 preg->re_nsub = 0;
742 preg->can_be_null = 0;
743 preg->regs_allocated = REGS_UNALLOCATED;
745 /* Initialize the dfa. */
746 dfa = (re_dfa_t *) preg->buffer;
747 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
749 /* If zero allocated, but buffer is non-null, try to realloc
750 enough space. This loses if buffer's address is bogus, but
751 that is the user's responsibility. If ->buffer is NULL this
752 is a simple allocation. */
753 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
754 if (dfa == NULL)
755 return REG_ESPACE;
756 preg->allocated = sizeof (re_dfa_t);
757 preg->buffer = (unsigned char *) dfa;
759 preg->used = sizeof (re_dfa_t);
761 err = init_dfa (dfa, length);
762 if (BE (err != REG_NOERROR, 0))
764 free_dfa_content (dfa);
765 preg->buffer = NULL;
766 preg->allocated = 0;
767 return err;
769 #ifdef DEBUG
770 /* Note: length+1 will not overflow since it is checked in init_dfa. */
771 dfa->re_str = re_malloc (char, length + 1);
772 strncpy (dfa->re_str, pattern, length + 1);
773 #endif
775 __libc_lock_init (dfa->lock);
777 err = re_string_construct (&regexp, pattern, length, preg->translate,
778 syntax & RE_ICASE, dfa);
779 if (BE (err != REG_NOERROR, 0))
781 re_compile_internal_free_return:
782 free_workarea_compile (preg);
783 re_string_destruct (&regexp);
784 free_dfa_content (dfa);
785 preg->buffer = NULL;
786 preg->allocated = 0;
787 return err;
790 /* Parse the regular expression, and build a structure tree. */
791 preg->re_nsub = 0;
792 dfa->str_tree = parse (&regexp, preg, syntax, &err);
793 if (BE (dfa->str_tree == NULL, 0))
794 goto re_compile_internal_free_return;
796 /* Analyze the tree and create the nfa. */
797 err = analyze (preg);
798 if (BE (err != REG_NOERROR, 0))
799 goto re_compile_internal_free_return;
801 #ifdef RE_ENABLE_I18N
802 /* If possible, do searching in single byte encoding to speed things up. */
803 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
804 optimize_utf8 (dfa);
805 #endif
807 /* Then create the initial state of the dfa. */
808 err = create_initial_state (dfa);
810 /* Release work areas. */
811 free_workarea_compile (preg);
812 re_string_destruct (&regexp);
814 if (BE (err != REG_NOERROR, 0))
816 free_dfa_content (dfa);
817 preg->buffer = NULL;
818 preg->allocated = 0;
821 return err;
824 /* Initialize DFA. We use the length of the regular expression PAT_LEN
825 as the initial length of some arrays. */
827 static reg_errcode_t
828 init_dfa (re_dfa_t *dfa, size_t pat_len)
830 unsigned int table_size;
831 #ifndef _LIBC
832 char *codeset_name;
833 #endif
835 memset (dfa, '\0', sizeof (re_dfa_t));
837 /* Force allocation of str_tree_storage the first time. */
838 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
840 /* Avoid overflows. */
841 if (pat_len == SIZE_MAX)
842 return REG_ESPACE;
844 dfa->nodes_alloc = pat_len + 1;
845 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
847 /* table_size = 2 ^ ceil(log pat_len) */
848 for (table_size = 1; ; table_size <<= 1)
849 if (table_size > pat_len)
850 break;
852 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
853 dfa->state_hash_mask = table_size - 1;
855 dfa->mb_cur_max = MB_CUR_MAX;
856 #ifdef _LIBC
857 if (dfa->mb_cur_max == 6
858 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
859 dfa->is_utf8 = 1;
860 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
861 != 0);
862 #else
863 # ifdef HAVE_LANGINFO_CODESET
864 codeset_name = nl_langinfo (CODESET);
865 # else
866 codeset_name = getenv ("LC_ALL");
867 if (codeset_name == NULL || codeset_name[0] == '\0')
868 codeset_name = getenv ("LC_CTYPE");
869 if (codeset_name == NULL || codeset_name[0] == '\0')
870 codeset_name = getenv ("LANG");
871 if (codeset_name == NULL)
872 codeset_name = "";
873 else if (strchr (codeset_name, '.') != NULL)
874 codeset_name = strchr (codeset_name, '.') + 1;
875 # endif
877 if (strcasecmp (codeset_name, "UTF-8") == 0
878 || strcasecmp (codeset_name, "UTF8") == 0)
879 dfa->is_utf8 = 1;
881 /* We check exhaustively in the loop below if this charset is a
882 superset of ASCII. */
883 dfa->map_notascii = 0;
884 #endif
886 #ifdef RE_ENABLE_I18N
887 if (dfa->mb_cur_max > 1)
889 if (dfa->is_utf8)
890 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
891 else
893 int i, j, ch;
895 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
896 if (BE (dfa->sb_char == NULL, 0))
897 return REG_ESPACE;
899 /* Set the bits corresponding to single byte chars. */
900 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
901 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
903 wint_t wch = __btowc (ch);
904 if (wch != WEOF)
905 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
906 # ifndef _LIBC
907 if (isascii (ch) && wch != ch)
908 dfa->map_notascii = 1;
909 # endif
913 #endif
915 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
916 return REG_ESPACE;
917 return REG_NOERROR;
920 /* Initialize WORD_CHAR table, which indicate which character is
921 "word". In this case "word" means that it is the word construction
922 character used by some operators like "\<", "\>", etc. */
924 static void
925 internal_function
926 init_word_char (re_dfa_t *dfa)
928 dfa->word_ops_used = 1;
929 int i = 0;
930 int ch = 0;
931 if (BE (dfa->map_notascii == 0, 1))
933 if (sizeof (dfa->word_char[0]) == 8)
935 /* The extra temporaries here avoid "implicitly truncated"
936 warnings in the case when this is dead code, i.e. 32-bit. */
937 const uint64_t wc0 = UINT64_C (0x03ff000000000000);
938 const uint64_t wc1 = UINT64_C (0x07fffffe87fffffe);
939 dfa->word_char[0] = wc0;
940 dfa->word_char[1] = wc1;
941 i = 2;
943 else if (sizeof (dfa->word_char[0]) == 4)
945 dfa->word_char[0] = UINT32_C (0x00000000);
946 dfa->word_char[1] = UINT32_C (0x03ff0000);
947 dfa->word_char[2] = UINT32_C (0x87fffffe);
948 dfa->word_char[3] = UINT32_C (0x07fffffe);
949 i = 4;
951 else
952 abort ();
953 ch = 128;
955 if (BE (dfa->is_utf8, 1))
957 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
958 return;
962 for (; i < BITSET_WORDS; ++i)
963 for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
964 if (isalnum (ch) || ch == '_')
965 dfa->word_char[i] |= (bitset_word_t) 1 << j;
968 /* Free the work area which are only used while compiling. */
970 static void
971 free_workarea_compile (regex_t *preg)
973 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
974 bin_tree_storage_t *storage, *next;
975 for (storage = dfa->str_tree_storage; storage; storage = next)
977 next = storage->next;
978 re_free (storage);
980 dfa->str_tree_storage = NULL;
981 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
982 dfa->str_tree = NULL;
983 re_free (dfa->org_indices);
984 dfa->org_indices = NULL;
987 /* Create initial states for all contexts. */
989 static reg_errcode_t
990 create_initial_state (re_dfa_t *dfa)
992 int first, i;
993 reg_errcode_t err;
994 re_node_set init_nodes;
996 /* Initial states have the epsilon closure of the node which is
997 the first node of the regular expression. */
998 first = dfa->str_tree->first->node_idx;
999 dfa->init_node = first;
1000 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1001 if (BE (err != REG_NOERROR, 0))
1002 return err;
1004 /* The back-references which are in initial states can epsilon transit,
1005 since in this case all of the subexpressions can be null.
1006 Then we add epsilon closures of the nodes which are the next nodes of
1007 the back-references. */
1008 if (dfa->nbackref > 0)
1009 for (i = 0; i < init_nodes.nelem; ++i)
1011 int node_idx = init_nodes.elems[i];
1012 re_token_type_t type = dfa->nodes[node_idx].type;
1014 int clexp_idx;
1015 if (type != OP_BACK_REF)
1016 continue;
1017 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1019 re_token_t *clexp_node;
1020 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1021 if (clexp_node->type == OP_CLOSE_SUBEXP
1022 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1023 break;
1025 if (clexp_idx == init_nodes.nelem)
1026 continue;
1028 if (type == OP_BACK_REF)
1030 int dest_idx = dfa->edests[node_idx].elems[0];
1031 if (!re_node_set_contains (&init_nodes, dest_idx))
1033 reg_errcode_t err = re_node_set_merge (&init_nodes,
1034 dfa->eclosures
1035 + dest_idx);
1036 if (err != REG_NOERROR)
1037 return err;
1038 i = 0;
1043 /* It must be the first time to invoke acquire_state. */
1044 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1045 /* We don't check ERR here, since the initial state must not be NULL. */
1046 if (BE (dfa->init_state == NULL, 0))
1047 return err;
1048 if (dfa->init_state->has_constraint)
1050 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1051 CONTEXT_WORD);
1052 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1053 CONTEXT_NEWLINE);
1054 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1055 &init_nodes,
1056 CONTEXT_NEWLINE
1057 | CONTEXT_BEGBUF);
1058 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1059 || dfa->init_state_begbuf == NULL, 0))
1060 return err;
1062 else
1063 dfa->init_state_word = dfa->init_state_nl
1064 = dfa->init_state_begbuf = dfa->init_state;
1066 re_node_set_free (&init_nodes);
1067 return REG_NOERROR;
1070 #ifdef RE_ENABLE_I18N
1071 /* If it is possible to do searching in single byte encoding instead of UTF-8
1072 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1073 DFA nodes where needed. */
1075 static void
1076 optimize_utf8 (re_dfa_t *dfa)
1078 int node, i, mb_chars = 0, has_period = 0;
1080 for (node = 0; node < dfa->nodes_len; ++node)
1081 switch (dfa->nodes[node].type)
1083 case CHARACTER:
1084 if (dfa->nodes[node].opr.c >= 0x80)
1085 mb_chars = 1;
1086 break;
1087 case ANCHOR:
1088 switch (dfa->nodes[node].opr.ctx_type)
1090 case LINE_FIRST:
1091 case LINE_LAST:
1092 case BUF_FIRST:
1093 case BUF_LAST:
1094 break;
1095 default:
1096 /* Word anchors etc. cannot be handled. It's okay to test
1097 opr.ctx_type since constraints (for all DFA nodes) are
1098 created by ORing one or more opr.ctx_type values. */
1099 return;
1101 break;
1102 case OP_PERIOD:
1103 has_period = 1;
1104 break;
1105 case OP_BACK_REF:
1106 case OP_ALT:
1107 case END_OF_RE:
1108 case OP_DUP_ASTERISK:
1109 case OP_OPEN_SUBEXP:
1110 case OP_CLOSE_SUBEXP:
1111 break;
1112 case COMPLEX_BRACKET:
1113 return;
1114 case SIMPLE_BRACKET:
1115 /* Just double check. The non-ASCII range starts at 0x80. */
1116 assert (0x80 % BITSET_WORD_BITS == 0);
1117 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1118 if (dfa->nodes[node].opr.sbcset[i])
1119 return;
1120 break;
1121 default:
1122 abort ();
1125 if (mb_chars || has_period)
1126 for (node = 0; node < dfa->nodes_len; ++node)
1128 if (dfa->nodes[node].type == CHARACTER
1129 && dfa->nodes[node].opr.c >= 0x80)
1130 dfa->nodes[node].mb_partial = 0;
1131 else if (dfa->nodes[node].type == OP_PERIOD)
1132 dfa->nodes[node].type = OP_UTF8_PERIOD;
1135 /* The search can be in single byte locale. */
1136 dfa->mb_cur_max = 1;
1137 dfa->is_utf8 = 0;
1138 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1140 #endif
1142 /* Analyze the structure tree, and calculate "first", "next", "edest",
1143 "eclosure", and "inveclosure". */
1145 static reg_errcode_t
1146 analyze (regex_t *preg)
1148 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1149 reg_errcode_t ret;
1151 /* Allocate arrays. */
1152 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1153 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1154 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1155 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1156 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1157 || dfa->eclosures == NULL, 0))
1158 return REG_ESPACE;
1160 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1161 if (dfa->subexp_map != NULL)
1163 int i;
1164 for (i = 0; i < preg->re_nsub; i++)
1165 dfa->subexp_map[i] = i;
1166 preorder (dfa->str_tree, optimize_subexps, dfa);
1167 for (i = 0; i < preg->re_nsub; i++)
1168 if (dfa->subexp_map[i] != i)
1169 break;
1170 if (i == preg->re_nsub)
1172 free (dfa->subexp_map);
1173 dfa->subexp_map = NULL;
1177 ret = postorder (dfa->str_tree, lower_subexps, preg);
1178 if (BE (ret != REG_NOERROR, 0))
1179 return ret;
1180 ret = postorder (dfa->str_tree, calc_first, dfa);
1181 if (BE (ret != REG_NOERROR, 0))
1182 return ret;
1183 preorder (dfa->str_tree, calc_next, dfa);
1184 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1185 if (BE (ret != REG_NOERROR, 0))
1186 return ret;
1187 ret = calc_eclosure (dfa);
1188 if (BE (ret != REG_NOERROR, 0))
1189 return ret;
1191 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1192 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1193 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1194 || dfa->nbackref)
1196 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1197 if (BE (dfa->inveclosures == NULL, 0))
1198 return REG_ESPACE;
1199 ret = calc_inveclosure (dfa);
1202 return ret;
1205 /* Our parse trees are very unbalanced, so we cannot use a stack to
1206 implement parse tree visits. Instead, we use parent pointers and
1207 some hairy code in these two functions. */
1208 static reg_errcode_t
1209 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1210 void *extra)
1212 bin_tree_t *node, *prev;
1214 for (node = root; ; )
1216 /* Descend down the tree, preferably to the left (or to the right
1217 if that's the only child). */
1218 while (node->left || node->right)
1219 if (node->left)
1220 node = node->left;
1221 else
1222 node = node->right;
1226 reg_errcode_t err = fn (extra, node);
1227 if (BE (err != REG_NOERROR, 0))
1228 return err;
1229 if (node->parent == NULL)
1230 return REG_NOERROR;
1231 prev = node;
1232 node = node->parent;
1234 /* Go up while we have a node that is reached from the right. */
1235 while (node->right == prev || node->right == NULL);
1236 node = node->right;
1240 static reg_errcode_t
1241 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1242 void *extra)
1244 bin_tree_t *node;
1246 for (node = root; ; )
1248 reg_errcode_t err = fn (extra, node);
1249 if (BE (err != REG_NOERROR, 0))
1250 return err;
1252 /* Go to the left node, or up and to the right. */
1253 if (node->left)
1254 node = node->left;
1255 else
1257 bin_tree_t *prev = NULL;
1258 while (node->right == prev || node->right == NULL)
1260 prev = node;
1261 node = node->parent;
1262 if (!node)
1263 return REG_NOERROR;
1265 node = node->right;
1270 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1271 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1272 backreferences as well. Requires a preorder visit. */
1273 static reg_errcode_t
1274 optimize_subexps (void *extra, bin_tree_t *node)
1276 re_dfa_t *dfa = (re_dfa_t *) extra;
1278 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1280 int idx = node->token.opr.idx;
1281 node->token.opr.idx = dfa->subexp_map[idx];
1282 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1285 else if (node->token.type == SUBEXP
1286 && node->left && node->left->token.type == SUBEXP)
1288 int other_idx = node->left->token.opr.idx;
1290 node->left = node->left->left;
1291 if (node->left)
1292 node->left->parent = node;
1294 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1295 if (other_idx < BITSET_WORD_BITS)
1296 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1299 return REG_NOERROR;
1302 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1303 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1304 static reg_errcode_t
1305 lower_subexps (void *extra, bin_tree_t *node)
1307 regex_t *preg = (regex_t *) extra;
1308 reg_errcode_t err = REG_NOERROR;
1310 if (node->left && node->left->token.type == SUBEXP)
1312 node->left = lower_subexp (&err, preg, node->left);
1313 if (node->left)
1314 node->left->parent = node;
1316 if (node->right && node->right->token.type == SUBEXP)
1318 node->right = lower_subexp (&err, preg, node->right);
1319 if (node->right)
1320 node->right->parent = node;
1323 return err;
1326 static bin_tree_t *
1327 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1329 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1330 bin_tree_t *body = node->left;
1331 bin_tree_t *op, *cls, *tree1, *tree;
1333 if (preg->no_sub
1334 /* We do not optimize empty subexpressions, because otherwise we may
1335 have bad CONCAT nodes with NULL children. This is obviously not
1336 very common, so we do not lose much. An example that triggers
1337 this case is the sed "script" /\(\)/x. */
1338 && node->left != NULL
1339 && (node->token.opr.idx >= BITSET_WORD_BITS
1340 || !(dfa->used_bkref_map
1341 & ((bitset_word_t) 1 << node->token.opr.idx))))
1342 return node->left;
1344 /* Convert the SUBEXP node to the concatenation of an
1345 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1346 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1347 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1348 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1349 tree = create_tree (dfa, op, tree1, CONCAT);
1350 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1352 *err = REG_ESPACE;
1353 return NULL;
1356 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1357 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1358 return tree;
1361 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1362 nodes. Requires a postorder visit. */
1363 static reg_errcode_t
1364 calc_first (void *extra, bin_tree_t *node)
1366 re_dfa_t *dfa = (re_dfa_t *) extra;
1367 if (node->token.type == CONCAT)
1369 node->first = node->left->first;
1370 node->node_idx = node->left->node_idx;
1372 else
1374 node->first = node;
1375 node->node_idx = re_dfa_add_node (dfa, node->token);
1376 if (BE (node->node_idx == -1, 0))
1377 return REG_ESPACE;
1378 if (node->token.type == ANCHOR)
1379 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1381 return REG_NOERROR;
1384 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1385 static reg_errcode_t
1386 calc_next (void *extra, bin_tree_t *node)
1388 switch (node->token.type)
1390 case OP_DUP_ASTERISK:
1391 node->left->next = node;
1392 break;
1393 case CONCAT:
1394 node->left->next = node->right->first;
1395 node->right->next = node->next;
1396 break;
1397 default:
1398 if (node->left)
1399 node->left->next = node->next;
1400 if (node->right)
1401 node->right->next = node->next;
1402 break;
1404 return REG_NOERROR;
1407 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1408 static reg_errcode_t
1409 link_nfa_nodes (void *extra, bin_tree_t *node)
1411 re_dfa_t *dfa = (re_dfa_t *) extra;
1412 int idx = node->node_idx;
1413 reg_errcode_t err = REG_NOERROR;
1415 switch (node->token.type)
1417 case CONCAT:
1418 break;
1420 case END_OF_RE:
1421 assert (node->next == NULL);
1422 break;
1424 case OP_DUP_ASTERISK:
1425 case OP_ALT:
1427 int left, right;
1428 dfa->has_plural_match = 1;
1429 if (node->left != NULL)
1430 left = node->left->first->node_idx;
1431 else
1432 left = node->next->node_idx;
1433 if (node->right != NULL)
1434 right = node->right->first->node_idx;
1435 else
1436 right = node->next->node_idx;
1437 assert (left > -1);
1438 assert (right > -1);
1439 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1441 break;
1443 case ANCHOR:
1444 case OP_OPEN_SUBEXP:
1445 case OP_CLOSE_SUBEXP:
1446 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1447 break;
1449 case OP_BACK_REF:
1450 dfa->nexts[idx] = node->next->node_idx;
1451 if (node->token.type == OP_BACK_REF)
1452 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1453 break;
1455 default:
1456 assert (!IS_EPSILON_NODE (node->token.type));
1457 dfa->nexts[idx] = node->next->node_idx;
1458 break;
1461 return err;
1464 /* Duplicate the epsilon closure of the node ROOT_NODE.
1465 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1466 to their own constraint. */
1468 static reg_errcode_t
1469 internal_function
1470 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1471 int root_node, unsigned int init_constraint)
1473 int org_node, clone_node, ret;
1474 unsigned int constraint = init_constraint;
1475 for (org_node = top_org_node, clone_node = top_clone_node;;)
1477 int org_dest, clone_dest;
1478 if (dfa->nodes[org_node].type == OP_BACK_REF)
1480 /* If the back reference epsilon-transit, its destination must
1481 also have the constraint. Then duplicate the epsilon closure
1482 of the destination of the back reference, and store it in
1483 edests of the back reference. */
1484 org_dest = dfa->nexts[org_node];
1485 re_node_set_empty (dfa->edests + clone_node);
1486 clone_dest = duplicate_node (dfa, org_dest, constraint);
1487 if (BE (clone_dest == -1, 0))
1488 return REG_ESPACE;
1489 dfa->nexts[clone_node] = dfa->nexts[org_node];
1490 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1491 if (BE (ret < 0, 0))
1492 return REG_ESPACE;
1494 else if (dfa->edests[org_node].nelem == 0)
1496 /* In case of the node can't epsilon-transit, don't duplicate the
1497 destination and store the original destination as the
1498 destination of the node. */
1499 dfa->nexts[clone_node] = dfa->nexts[org_node];
1500 break;
1502 else if (dfa->edests[org_node].nelem == 1)
1504 /* In case of the node can epsilon-transit, and it has only one
1505 destination. */
1506 org_dest = dfa->edests[org_node].elems[0];
1507 re_node_set_empty (dfa->edests + clone_node);
1508 /* If the node is root_node itself, it means the epsilon clsoure
1509 has a loop. Then tie it to the destination of the root_node. */
1510 if (org_node == root_node && clone_node != org_node)
1512 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1513 if (BE (ret < 0, 0))
1514 return REG_ESPACE;
1515 break;
1517 /* In case of the node has another constraint, add it. */
1518 constraint |= dfa->nodes[org_node].constraint;
1519 clone_dest = duplicate_node (dfa, org_dest, constraint);
1520 if (BE (clone_dest == -1, 0))
1521 return REG_ESPACE;
1522 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1523 if (BE (ret < 0, 0))
1524 return REG_ESPACE;
1526 else /* dfa->edests[org_node].nelem == 2 */
1528 /* In case of the node can epsilon-transit, and it has two
1529 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1530 org_dest = dfa->edests[org_node].elems[0];
1531 re_node_set_empty (dfa->edests + clone_node);
1532 /* Search for a duplicated node which satisfies the constraint. */
1533 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1534 if (clone_dest == -1)
1536 /* There is no such duplicated node, create a new one. */
1537 reg_errcode_t err;
1538 clone_dest = duplicate_node (dfa, org_dest, constraint);
1539 if (BE (clone_dest == -1, 0))
1540 return REG_ESPACE;
1541 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1542 if (BE (ret < 0, 0))
1543 return REG_ESPACE;
1544 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1545 root_node, constraint);
1546 if (BE (err != REG_NOERROR, 0))
1547 return err;
1549 else
1551 /* There is a duplicated node which satisfies the constraint,
1552 use it to avoid infinite loop. */
1553 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1554 if (BE (ret < 0, 0))
1555 return REG_ESPACE;
1558 org_dest = dfa->edests[org_node].elems[1];
1559 clone_dest = duplicate_node (dfa, org_dest, constraint);
1560 if (BE (clone_dest == -1, 0))
1561 return REG_ESPACE;
1562 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1563 if (BE (ret < 0, 0))
1564 return REG_ESPACE;
1566 org_node = org_dest;
1567 clone_node = clone_dest;
1569 return REG_NOERROR;
1572 /* Search for a node which is duplicated from the node ORG_NODE, and
1573 satisfies the constraint CONSTRAINT. */
1575 static int
1576 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1577 unsigned int constraint)
1579 int idx;
1580 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1582 if (org_node == dfa->org_indices[idx]
1583 && constraint == dfa->nodes[idx].constraint)
1584 return idx; /* Found. */
1586 return -1; /* Not found. */
1589 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1590 Return the index of the new node, or -1 if insufficient storage is
1591 available. */
1593 static int
1594 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1596 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1597 if (BE (dup_idx != -1, 1))
1599 dfa->nodes[dup_idx].constraint = constraint;
1600 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1601 dfa->nodes[dup_idx].duplicated = 1;
1603 /* Store the index of the original node. */
1604 dfa->org_indices[dup_idx] = org_idx;
1606 return dup_idx;
1609 static reg_errcode_t
1610 calc_inveclosure (re_dfa_t *dfa)
1612 int src, idx, ret;
1613 for (idx = 0; idx < dfa->nodes_len; ++idx)
1614 re_node_set_init_empty (dfa->inveclosures + idx);
1616 for (src = 0; src < dfa->nodes_len; ++src)
1618 int *elems = dfa->eclosures[src].elems;
1619 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1621 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1622 if (BE (ret == -1, 0))
1623 return REG_ESPACE;
1627 return REG_NOERROR;
1630 /* Calculate "eclosure" for all the node in DFA. */
1632 static reg_errcode_t
1633 calc_eclosure (re_dfa_t *dfa)
1635 int node_idx, incomplete;
1636 #ifdef DEBUG
1637 assert (dfa->nodes_len > 0);
1638 #endif
1639 incomplete = 0;
1640 /* For each nodes, calculate epsilon closure. */
1641 for (node_idx = 0; ; ++node_idx)
1643 reg_errcode_t err;
1644 re_node_set eclosure_elem;
1645 if (node_idx == dfa->nodes_len)
1647 if (!incomplete)
1648 break;
1649 incomplete = 0;
1650 node_idx = 0;
1653 #ifdef DEBUG
1654 assert (dfa->eclosures[node_idx].nelem != -1);
1655 #endif
1657 /* If we have already calculated, skip it. */
1658 if (dfa->eclosures[node_idx].nelem != 0)
1659 continue;
1660 /* Calculate epsilon closure of `node_idx'. */
1661 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1662 if (BE (err != REG_NOERROR, 0))
1663 return err;
1665 if (dfa->eclosures[node_idx].nelem == 0)
1667 incomplete = 1;
1668 re_node_set_free (&eclosure_elem);
1671 return REG_NOERROR;
1674 /* Calculate epsilon closure of NODE. */
1676 static reg_errcode_t
1677 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1679 reg_errcode_t err;
1680 int i;
1681 re_node_set eclosure;
1682 int ret;
1683 int incomplete = 0;
1684 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1685 if (BE (err != REG_NOERROR, 0))
1686 return err;
1688 /* This indicates that we are calculating this node now.
1689 We reference this value to avoid infinite loop. */
1690 dfa->eclosures[node].nelem = -1;
1692 /* If the current node has constraints, duplicate all nodes
1693 since they must inherit the constraints. */
1694 if (dfa->nodes[node].constraint
1695 && dfa->edests[node].nelem
1696 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1698 err = duplicate_node_closure (dfa, node, node, node,
1699 dfa->nodes[node].constraint);
1700 if (BE (err != REG_NOERROR, 0))
1701 return err;
1704 /* Expand each epsilon destination nodes. */
1705 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1706 for (i = 0; i < dfa->edests[node].nelem; ++i)
1708 re_node_set eclosure_elem;
1709 int edest = dfa->edests[node].elems[i];
1710 /* If calculating the epsilon closure of `edest' is in progress,
1711 return intermediate result. */
1712 if (dfa->eclosures[edest].nelem == -1)
1714 incomplete = 1;
1715 continue;
1717 /* If we haven't calculated the epsilon closure of `edest' yet,
1718 calculate now. Otherwise use calculated epsilon closure. */
1719 if (dfa->eclosures[edest].nelem == 0)
1721 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1722 if (BE (err != REG_NOERROR, 0))
1723 return err;
1725 else
1726 eclosure_elem = dfa->eclosures[edest];
1727 /* Merge the epsilon closure of `edest'. */
1728 err = re_node_set_merge (&eclosure, &eclosure_elem);
1729 if (BE (err != REG_NOERROR, 0))
1730 return err;
1731 /* If the epsilon closure of `edest' is incomplete,
1732 the epsilon closure of this node is also incomplete. */
1733 if (dfa->eclosures[edest].nelem == 0)
1735 incomplete = 1;
1736 re_node_set_free (&eclosure_elem);
1740 /* An epsilon closure includes itself. */
1741 ret = re_node_set_insert (&eclosure, node);
1742 if (BE (ret < 0, 0))
1743 return REG_ESPACE;
1744 if (incomplete && !root)
1745 dfa->eclosures[node].nelem = 0;
1746 else
1747 dfa->eclosures[node] = eclosure;
1748 *new_set = eclosure;
1749 return REG_NOERROR;
1752 /* Functions for token which are used in the parser. */
1754 /* Fetch a token from INPUT.
1755 We must not use this function inside bracket expressions. */
1757 static void
1758 internal_function
1759 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1761 re_string_skip_bytes (input, peek_token (result, input, syntax));
1764 /* Peek a token from INPUT, and return the length of the token.
1765 We must not use this function inside bracket expressions. */
1767 static int
1768 internal_function
1769 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1771 unsigned char c;
1773 if (re_string_eoi (input))
1775 token->type = END_OF_RE;
1776 return 0;
1779 c = re_string_peek_byte (input, 0);
1780 token->opr.c = c;
1782 token->word_char = 0;
1783 #ifdef RE_ENABLE_I18N
1784 token->mb_partial = 0;
1785 if (input->mb_cur_max > 1 &&
1786 !re_string_first_byte (input, re_string_cur_idx (input)))
1788 token->type = CHARACTER;
1789 token->mb_partial = 1;
1790 return 1;
1792 #endif
1793 if (c == '\\')
1795 unsigned char c2;
1796 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1798 token->type = BACK_SLASH;
1799 return 1;
1802 c2 = re_string_peek_byte_case (input, 1);
1803 token->opr.c = c2;
1804 token->type = CHARACTER;
1805 #ifdef RE_ENABLE_I18N
1806 if (input->mb_cur_max > 1)
1808 wint_t wc = re_string_wchar_at (input,
1809 re_string_cur_idx (input) + 1);
1810 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1812 else
1813 #endif
1814 token->word_char = IS_WORD_CHAR (c2) != 0;
1816 switch (c2)
1818 case '|':
1819 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1820 token->type = OP_ALT;
1821 break;
1822 case '1': case '2': case '3': case '4': case '5':
1823 case '6': case '7': case '8': case '9':
1824 if (!(syntax & RE_NO_BK_REFS))
1826 token->type = OP_BACK_REF;
1827 token->opr.idx = c2 - '1';
1829 break;
1830 case '<':
1831 if (!(syntax & RE_NO_GNU_OPS))
1833 token->type = ANCHOR;
1834 token->opr.ctx_type = WORD_FIRST;
1836 break;
1837 case '>':
1838 if (!(syntax & RE_NO_GNU_OPS))
1840 token->type = ANCHOR;
1841 token->opr.ctx_type = WORD_LAST;
1843 break;
1844 case 'b':
1845 if (!(syntax & RE_NO_GNU_OPS))
1847 token->type = ANCHOR;
1848 token->opr.ctx_type = WORD_DELIM;
1850 break;
1851 case 'B':
1852 if (!(syntax & RE_NO_GNU_OPS))
1854 token->type = ANCHOR;
1855 token->opr.ctx_type = NOT_WORD_DELIM;
1857 break;
1858 case 'w':
1859 if (!(syntax & RE_NO_GNU_OPS))
1860 token->type = OP_WORD;
1861 break;
1862 case 'W':
1863 if (!(syntax & RE_NO_GNU_OPS))
1864 token->type = OP_NOTWORD;
1865 break;
1866 case 's':
1867 if (!(syntax & RE_NO_GNU_OPS))
1868 token->type = OP_SPACE;
1869 break;
1870 case 'S':
1871 if (!(syntax & RE_NO_GNU_OPS))
1872 token->type = OP_NOTSPACE;
1873 break;
1874 case '`':
1875 if (!(syntax & RE_NO_GNU_OPS))
1877 token->type = ANCHOR;
1878 token->opr.ctx_type = BUF_FIRST;
1880 break;
1881 case '\'':
1882 if (!(syntax & RE_NO_GNU_OPS))
1884 token->type = ANCHOR;
1885 token->opr.ctx_type = BUF_LAST;
1887 break;
1888 case '(':
1889 if (!(syntax & RE_NO_BK_PARENS))
1890 token->type = OP_OPEN_SUBEXP;
1891 break;
1892 case ')':
1893 if (!(syntax & RE_NO_BK_PARENS))
1894 token->type = OP_CLOSE_SUBEXP;
1895 break;
1896 case '+':
1897 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1898 token->type = OP_DUP_PLUS;
1899 break;
1900 case '?':
1901 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1902 token->type = OP_DUP_QUESTION;
1903 break;
1904 case '{':
1905 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1906 token->type = OP_OPEN_DUP_NUM;
1907 break;
1908 case '}':
1909 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1910 token->type = OP_CLOSE_DUP_NUM;
1911 break;
1912 default:
1913 break;
1915 return 2;
1918 token->type = CHARACTER;
1919 #ifdef RE_ENABLE_I18N
1920 if (input->mb_cur_max > 1)
1922 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1923 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1925 else
1926 #endif
1927 token->word_char = IS_WORD_CHAR (token->opr.c);
1929 switch (c)
1931 case '\n':
1932 if (syntax & RE_NEWLINE_ALT)
1933 token->type = OP_ALT;
1934 break;
1935 case '|':
1936 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1937 token->type = OP_ALT;
1938 break;
1939 case '*':
1940 token->type = OP_DUP_ASTERISK;
1941 break;
1942 case '+':
1943 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1944 token->type = OP_DUP_PLUS;
1945 break;
1946 case '?':
1947 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1948 token->type = OP_DUP_QUESTION;
1949 break;
1950 case '{':
1951 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1952 token->type = OP_OPEN_DUP_NUM;
1953 break;
1954 case '}':
1955 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1956 token->type = OP_CLOSE_DUP_NUM;
1957 break;
1958 case '(':
1959 if (syntax & RE_NO_BK_PARENS)
1960 token->type = OP_OPEN_SUBEXP;
1961 break;
1962 case ')':
1963 if (syntax & RE_NO_BK_PARENS)
1964 token->type = OP_CLOSE_SUBEXP;
1965 break;
1966 case '[':
1967 token->type = OP_OPEN_BRACKET;
1968 break;
1969 case '.':
1970 token->type = OP_PERIOD;
1971 break;
1972 case '^':
1973 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1974 re_string_cur_idx (input) != 0)
1976 char prev = re_string_peek_byte (input, -1);
1977 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1978 break;
1980 token->type = ANCHOR;
1981 token->opr.ctx_type = LINE_FIRST;
1982 break;
1983 case '$':
1984 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1985 re_string_cur_idx (input) + 1 != re_string_length (input))
1987 re_token_t next;
1988 re_string_skip_bytes (input, 1);
1989 peek_token (&next, input, syntax);
1990 re_string_skip_bytes (input, -1);
1991 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1992 break;
1994 token->type = ANCHOR;
1995 token->opr.ctx_type = LINE_LAST;
1996 break;
1997 default:
1998 break;
2000 return 1;
2003 /* Peek a token from INPUT, and return the length of the token.
2004 We must not use this function out of bracket expressions. */
2006 static int
2007 internal_function
2008 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2010 unsigned char c;
2011 if (re_string_eoi (input))
2013 token->type = END_OF_RE;
2014 return 0;
2016 c = re_string_peek_byte (input, 0);
2017 token->opr.c = c;
2019 #ifdef RE_ENABLE_I18N
2020 if (input->mb_cur_max > 1 &&
2021 !re_string_first_byte (input, re_string_cur_idx (input)))
2023 token->type = CHARACTER;
2024 return 1;
2026 #endif /* RE_ENABLE_I18N */
2028 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2029 && re_string_cur_idx (input) + 1 < re_string_length (input))
2031 /* In this case, '\' escape a character. */
2032 unsigned char c2;
2033 re_string_skip_bytes (input, 1);
2034 c2 = re_string_peek_byte (input, 0);
2035 token->opr.c = c2;
2036 token->type = CHARACTER;
2037 return 1;
2039 if (c == '[') /* '[' is a special char in a bracket exps. */
2041 unsigned char c2;
2042 int token_len;
2043 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2044 c2 = re_string_peek_byte (input, 1);
2045 else
2046 c2 = 0;
2047 token->opr.c = c2;
2048 token_len = 2;
2049 switch (c2)
2051 case '.':
2052 token->type = OP_OPEN_COLL_ELEM;
2053 break;
2054 case '=':
2055 token->type = OP_OPEN_EQUIV_CLASS;
2056 break;
2057 case ':':
2058 if (syntax & RE_CHAR_CLASSES)
2060 token->type = OP_OPEN_CHAR_CLASS;
2061 break;
2063 /* else fall through. */
2064 default:
2065 token->type = CHARACTER;
2066 token->opr.c = c;
2067 token_len = 1;
2068 break;
2070 return token_len;
2072 switch (c)
2074 case '-':
2075 token->type = OP_CHARSET_RANGE;
2076 break;
2077 case ']':
2078 token->type = OP_CLOSE_BRACKET;
2079 break;
2080 case '^':
2081 token->type = OP_NON_MATCH_LIST;
2082 break;
2083 default:
2084 token->type = CHARACTER;
2086 return 1;
2089 /* Functions for parser. */
2091 /* Entry point of the parser.
2092 Parse the regular expression REGEXP and return the structure tree.
2093 If an error is occured, ERR is set by error code, and return NULL.
2094 This function build the following tree, from regular expression <reg_exp>:
2098 <reg_exp> EOR
2100 CAT means concatenation.
2101 EOR means end of regular expression. */
2103 static bin_tree_t *
2104 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2105 reg_errcode_t *err)
2107 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2108 bin_tree_t *tree, *eor, *root;
2109 re_token_t current_token;
2110 dfa->syntax = syntax;
2111 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2112 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2113 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2114 return NULL;
2115 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2116 if (tree != NULL)
2117 root = create_tree (dfa, tree, eor, CONCAT);
2118 else
2119 root = eor;
2120 if (BE (eor == NULL || root == NULL, 0))
2122 *err = REG_ESPACE;
2123 return NULL;
2125 return root;
2128 /* This function build the following tree, from regular expression
2129 <branch1>|<branch2>:
2133 <branch1> <branch2>
2135 ALT means alternative, which represents the operator `|'. */
2137 static bin_tree_t *
2138 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2139 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2141 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2142 bin_tree_t *tree, *branch = NULL;
2143 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2144 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2145 return NULL;
2147 while (token->type == OP_ALT)
2149 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2150 if (token->type != OP_ALT && token->type != END_OF_RE
2151 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2153 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2154 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2155 return NULL;
2157 else
2158 branch = NULL;
2159 tree = create_tree (dfa, tree, branch, OP_ALT);
2160 if (BE (tree == NULL, 0))
2162 *err = REG_ESPACE;
2163 return NULL;
2166 return tree;
2169 /* This function build the following tree, from regular expression
2170 <exp1><exp2>:
2174 <exp1> <exp2>
2176 CAT means concatenation. */
2178 static bin_tree_t *
2179 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2180 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2182 bin_tree_t *tree, *exp;
2183 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2184 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2185 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2186 return NULL;
2188 while (token->type != OP_ALT && token->type != END_OF_RE
2189 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2191 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2192 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2194 if (tree != NULL)
2195 postorder (tree, free_tree, NULL);
2196 return NULL;
2198 if (tree != NULL && exp != NULL)
2200 bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2201 if (newtree == NULL)
2203 postorder (exp, free_tree, NULL);
2204 postorder (tree, free_tree, NULL);
2205 *err = REG_ESPACE;
2206 return NULL;
2208 tree = newtree;
2210 else if (tree == NULL)
2211 tree = exp;
2212 /* Otherwise exp == NULL, we don't need to create new tree. */
2214 return tree;
2217 /* This function build the following tree, from regular expression a*:
2223 static bin_tree_t *
2224 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2225 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2227 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2228 bin_tree_t *tree;
2229 switch (token->type)
2231 case CHARACTER:
2232 tree = create_token_tree (dfa, NULL, NULL, token);
2233 if (BE (tree == NULL, 0))
2235 *err = REG_ESPACE;
2236 return NULL;
2238 #ifdef RE_ENABLE_I18N
2239 if (dfa->mb_cur_max > 1)
2241 while (!re_string_eoi (regexp)
2242 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2244 bin_tree_t *mbc_remain;
2245 fetch_token (token, regexp, syntax);
2246 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2247 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2248 if (BE (mbc_remain == NULL || tree == NULL, 0))
2250 *err = REG_ESPACE;
2251 return NULL;
2255 #endif
2256 break;
2257 case OP_OPEN_SUBEXP:
2258 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2259 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2260 return NULL;
2261 break;
2262 case OP_OPEN_BRACKET:
2263 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2264 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2265 return NULL;
2266 break;
2267 case OP_BACK_REF:
2268 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2270 *err = REG_ESUBREG;
2271 return NULL;
2273 dfa->used_bkref_map |= 1 << token->opr.idx;
2274 tree = create_token_tree (dfa, NULL, NULL, token);
2275 if (BE (tree == NULL, 0))
2277 *err = REG_ESPACE;
2278 return NULL;
2280 ++dfa->nbackref;
2281 dfa->has_mb_node = 1;
2282 break;
2283 case OP_OPEN_DUP_NUM:
2284 if (syntax & RE_CONTEXT_INVALID_DUP)
2286 *err = REG_BADRPT;
2287 return NULL;
2289 /* FALLTHROUGH */
2290 case OP_DUP_ASTERISK:
2291 case OP_DUP_PLUS:
2292 case OP_DUP_QUESTION:
2293 if (syntax & RE_CONTEXT_INVALID_OPS)
2295 *err = REG_BADRPT;
2296 return NULL;
2298 else if (syntax & RE_CONTEXT_INDEP_OPS)
2300 fetch_token (token, regexp, syntax);
2301 return parse_expression (regexp, preg, token, syntax, nest, err);
2303 /* else fall through */
2304 case OP_CLOSE_SUBEXP:
2305 if ((token->type == OP_CLOSE_SUBEXP) &&
2306 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2308 *err = REG_ERPAREN;
2309 return NULL;
2311 /* else fall through */
2312 case OP_CLOSE_DUP_NUM:
2313 /* We treat it as a normal character. */
2315 /* Then we can these characters as normal characters. */
2316 token->type = CHARACTER;
2317 /* mb_partial and word_char bits should be initialized already
2318 by peek_token. */
2319 tree = create_token_tree (dfa, NULL, NULL, token);
2320 if (BE (tree == NULL, 0))
2322 *err = REG_ESPACE;
2323 return NULL;
2325 break;
2326 case ANCHOR:
2327 if ((token->opr.ctx_type
2328 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2329 && dfa->word_ops_used == 0)
2330 init_word_char (dfa);
2331 if (token->opr.ctx_type == WORD_DELIM
2332 || token->opr.ctx_type == NOT_WORD_DELIM)
2334 bin_tree_t *tree_first, *tree_last;
2335 if (token->opr.ctx_type == WORD_DELIM)
2337 token->opr.ctx_type = WORD_FIRST;
2338 tree_first = create_token_tree (dfa, NULL, NULL, token);
2339 token->opr.ctx_type = WORD_LAST;
2341 else
2343 token->opr.ctx_type = INSIDE_WORD;
2344 tree_first = create_token_tree (dfa, NULL, NULL, token);
2345 token->opr.ctx_type = INSIDE_NOTWORD;
2347 tree_last = create_token_tree (dfa, NULL, NULL, token);
2348 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2349 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2351 *err = REG_ESPACE;
2352 return NULL;
2355 else
2357 tree = create_token_tree (dfa, NULL, NULL, token);
2358 if (BE (tree == NULL, 0))
2360 *err = REG_ESPACE;
2361 return NULL;
2364 /* We must return here, since ANCHORs can't be followed
2365 by repetition operators.
2366 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2367 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2368 fetch_token (token, regexp, syntax);
2369 return tree;
2370 case OP_PERIOD:
2371 tree = create_token_tree (dfa, NULL, NULL, token);
2372 if (BE (tree == NULL, 0))
2374 *err = REG_ESPACE;
2375 return NULL;
2377 if (dfa->mb_cur_max > 1)
2378 dfa->has_mb_node = 1;
2379 break;
2380 case OP_WORD:
2381 case OP_NOTWORD:
2382 tree = build_charclass_op (dfa, regexp->trans,
2383 (const unsigned char *) "alnum",
2384 (const unsigned char *) "_",
2385 token->type == OP_NOTWORD, err);
2386 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2387 return NULL;
2388 break;
2389 case OP_SPACE:
2390 case OP_NOTSPACE:
2391 tree = build_charclass_op (dfa, regexp->trans,
2392 (const unsigned char *) "space",
2393 (const unsigned char *) "",
2394 token->type == OP_NOTSPACE, err);
2395 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2396 return NULL;
2397 break;
2398 case OP_ALT:
2399 case END_OF_RE:
2400 return NULL;
2401 case BACK_SLASH:
2402 *err = REG_EESCAPE;
2403 return NULL;
2404 default:
2405 /* Must not happen? */
2406 #ifdef DEBUG
2407 assert (0);
2408 #endif
2409 return NULL;
2411 fetch_token (token, regexp, syntax);
2413 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2414 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2416 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2417 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2418 return NULL;
2419 /* In BRE consecutive duplications are not allowed. */
2420 if ((syntax & RE_CONTEXT_INVALID_DUP)
2421 && (token->type == OP_DUP_ASTERISK
2422 || token->type == OP_OPEN_DUP_NUM))
2424 *err = REG_BADRPT;
2425 return NULL;
2429 return tree;
2432 /* This function build the following tree, from regular expression
2433 (<reg_exp>):
2434 SUBEXP
2436 <reg_exp>
2439 static bin_tree_t *
2440 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2441 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2443 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2444 bin_tree_t *tree;
2445 size_t cur_nsub;
2446 cur_nsub = preg->re_nsub++;
2448 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2450 /* The subexpression may be a null string. */
2451 if (token->type == OP_CLOSE_SUBEXP)
2452 tree = NULL;
2453 else
2455 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2456 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2458 if (tree != NULL)
2459 postorder (tree, free_tree, NULL);
2460 *err = REG_EPAREN;
2462 if (BE (*err != REG_NOERROR, 0))
2463 return NULL;
2466 if (cur_nsub <= '9' - '1')
2467 dfa->completed_bkref_map |= 1 << cur_nsub;
2469 tree = create_tree (dfa, tree, NULL, SUBEXP);
2470 if (BE (tree == NULL, 0))
2472 *err = REG_ESPACE;
2473 return NULL;
2475 tree->token.opr.idx = cur_nsub;
2476 return tree;
2479 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2481 static bin_tree_t *
2482 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2483 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2485 bin_tree_t *tree = NULL, *old_tree = NULL;
2486 int i, start, end, start_idx = re_string_cur_idx (regexp);
2487 re_token_t start_token = *token;
2489 if (token->type == OP_OPEN_DUP_NUM)
2491 end = 0;
2492 start = fetch_number (regexp, token, syntax);
2493 if (start == -1)
2495 if (token->type == CHARACTER && token->opr.c == ',')
2496 start = 0; /* We treat "{,m}" as "{0,m}". */
2497 else
2499 *err = REG_BADBR; /* <re>{} is invalid. */
2500 return NULL;
2503 if (BE (start != -2, 1))
2505 /* We treat "{n}" as "{n,n}". */
2506 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2507 : ((token->type == CHARACTER && token->opr.c == ',')
2508 ? fetch_number (regexp, token, syntax) : -2));
2510 if (BE (start == -2 || end == -2, 0))
2512 /* Invalid sequence. */
2513 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2515 if (token->type == END_OF_RE)
2516 *err = REG_EBRACE;
2517 else
2518 *err = REG_BADBR;
2520 return NULL;
2523 /* If the syntax bit is set, rollback. */
2524 re_string_set_index (regexp, start_idx);
2525 *token = start_token;
2526 token->type = CHARACTER;
2527 /* mb_partial and word_char bits should be already initialized by
2528 peek_token. */
2529 return elem;
2532 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2534 /* First number greater than second. */
2535 *err = REG_BADBR;
2536 return NULL;
2539 else
2541 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2542 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2545 fetch_token (token, regexp, syntax);
2547 if (BE (elem == NULL, 0))
2548 return NULL;
2549 if (BE (start == 0 && end == 0, 0))
2551 postorder (elem, free_tree, NULL);
2552 return NULL;
2555 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2556 if (BE (start > 0, 0))
2558 tree = elem;
2559 for (i = 2; i <= start; ++i)
2561 elem = duplicate_tree (elem, dfa);
2562 tree = create_tree (dfa, tree, elem, CONCAT);
2563 if (BE (elem == NULL || tree == NULL, 0))
2564 goto parse_dup_op_espace;
2567 if (start == end)
2568 return tree;
2570 /* Duplicate ELEM before it is marked optional. */
2571 elem = duplicate_tree (elem, dfa);
2572 old_tree = tree;
2574 else
2575 old_tree = NULL;
2577 if (elem->token.type == SUBEXP)
2578 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2580 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2581 if (BE (tree == NULL, 0))
2582 goto parse_dup_op_espace;
2584 /* This loop is actually executed only when end != -1,
2585 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2586 already created the start+1-th copy. */
2587 for (i = start + 2; i <= end; ++i)
2589 elem = duplicate_tree (elem, dfa);
2590 tree = create_tree (dfa, tree, elem, CONCAT);
2591 if (BE (elem == NULL || tree == NULL, 0))
2592 goto parse_dup_op_espace;
2594 tree = create_tree (dfa, tree, NULL, OP_ALT);
2595 if (BE (tree == NULL, 0))
2596 goto parse_dup_op_espace;
2599 if (old_tree)
2600 tree = create_tree (dfa, old_tree, tree, CONCAT);
2602 return tree;
2604 parse_dup_op_espace:
2605 *err = REG_ESPACE;
2606 return NULL;
2609 /* Size of the names for collating symbol/equivalence_class/character_class.
2610 I'm not sure, but maybe enough. */
2611 #define BRACKET_NAME_BUF_SIZE 32
2613 #ifndef _LIBC
2614 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2615 Build the range expression which starts from START_ELEM, and ends
2616 at END_ELEM. The result are written to MBCSET and SBCSET.
2617 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2618 mbcset->range_ends, is a pointer argument sinse we may
2619 update it. */
2621 static reg_errcode_t
2622 internal_function
2623 # ifdef RE_ENABLE_I18N
2624 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2625 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2626 # else /* not RE_ENABLE_I18N */
2627 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2628 bracket_elem_t *end_elem)
2629 # endif /* not RE_ENABLE_I18N */
2631 unsigned int start_ch, end_ch;
2632 /* Equivalence Classes and Character Classes can't be a range start/end. */
2633 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2634 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2636 return REG_ERANGE;
2638 /* We can handle no multi character collating elements without libc
2639 support. */
2640 if (BE ((start_elem->type == COLL_SYM
2641 && strlen ((char *) start_elem->opr.name) > 1)
2642 || (end_elem->type == COLL_SYM
2643 && strlen ((char *) end_elem->opr.name) > 1), 0))
2644 return REG_ECOLLATE;
2646 # ifdef RE_ENABLE_I18N
2648 wchar_t wc;
2649 wint_t start_wc;
2650 wint_t end_wc;
2651 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2653 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2654 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2655 : 0));
2656 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2657 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2658 : 0));
2659 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2660 ? __btowc (start_ch) : start_elem->opr.wch);
2661 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2662 ? __btowc (end_ch) : end_elem->opr.wch);
2663 if (start_wc == WEOF || end_wc == WEOF)
2664 return REG_ECOLLATE;
2665 cmp_buf[0] = start_wc;
2666 cmp_buf[4] = end_wc;
2667 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2668 return REG_ERANGE;
2670 /* Got valid collation sequence values, add them as a new entry.
2671 However, for !_LIBC we have no collation elements: if the
2672 character set is single byte, the single byte character set
2673 that we build below suffices. parse_bracket_exp passes
2674 no MBCSET if dfa->mb_cur_max == 1. */
2675 if (mbcset)
2677 /* Check the space of the arrays. */
2678 if (BE (*range_alloc == mbcset->nranges, 0))
2680 /* There is not enough space, need realloc. */
2681 wchar_t *new_array_start, *new_array_end;
2682 int new_nranges;
2684 /* +1 in case of mbcset->nranges is 0. */
2685 new_nranges = 2 * mbcset->nranges + 1;
2686 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2687 are NULL if *range_alloc == 0. */
2688 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2689 new_nranges);
2690 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2691 new_nranges);
2693 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2694 return REG_ESPACE;
2696 mbcset->range_starts = new_array_start;
2697 mbcset->range_ends = new_array_end;
2698 *range_alloc = new_nranges;
2701 mbcset->range_starts[mbcset->nranges] = start_wc;
2702 mbcset->range_ends[mbcset->nranges++] = end_wc;
2705 /* Build the table for single byte characters. */
2706 for (wc = 0; wc < SBC_MAX; ++wc)
2708 cmp_buf[2] = wc;
2709 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2710 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2711 bitset_set (sbcset, wc);
2714 # else /* not RE_ENABLE_I18N */
2716 unsigned int ch;
2717 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2718 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2719 : 0));
2720 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2721 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2722 : 0));
2723 if (start_ch > end_ch)
2724 return REG_ERANGE;
2725 /* Build the table for single byte characters. */
2726 for (ch = 0; ch < SBC_MAX; ++ch)
2727 if (start_ch <= ch && ch <= end_ch)
2728 bitset_set (sbcset, ch);
2730 # endif /* not RE_ENABLE_I18N */
2731 return REG_NOERROR;
2733 #endif /* not _LIBC */
2735 #ifndef _LIBC
2736 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2737 Build the collating element which is represented by NAME.
2738 The result are written to MBCSET and SBCSET.
2739 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2740 pointer argument since we may update it. */
2742 static reg_errcode_t
2743 internal_function
2744 # ifdef RE_ENABLE_I18N
2745 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2746 int *coll_sym_alloc, const unsigned char *name)
2747 # else /* not RE_ENABLE_I18N */
2748 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2749 # endif /* not RE_ENABLE_I18N */
2751 size_t name_len = strlen ((const char *) name);
2752 if (BE (name_len != 1, 0))
2753 return REG_ECOLLATE;
2754 else
2756 bitset_set (sbcset, name[0]);
2757 return REG_NOERROR;
2760 #endif /* not _LIBC */
2762 /* This function parse bracket expression like "[abc]", "[a-c]",
2763 "[[.a-a.]]" etc. */
2765 static bin_tree_t *
2766 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2767 reg_syntax_t syntax, reg_errcode_t *err)
2769 #ifdef _LIBC
2770 const unsigned char *collseqmb;
2771 const char *collseqwc;
2772 uint32_t nrules;
2773 int32_t table_size;
2774 const int32_t *symb_table;
2775 const unsigned char *extra;
2777 /* Local function for parse_bracket_exp used in _LIBC environement.
2778 Seek the collating symbol entry correspondings to NAME.
2779 Return the index of the symbol in the SYMB_TABLE. */
2781 auto inline int32_t
2782 __attribute ((always_inline))
2783 seek_collating_symbol_entry (name, name_len)
2784 const unsigned char *name;
2785 size_t name_len;
2787 int32_t hash = elem_hash ((const char *) name, name_len);
2788 int32_t elem = hash % table_size;
2789 if (symb_table[2 * elem] != 0)
2791 int32_t second = hash % (table_size - 2) + 1;
2795 /* First compare the hashing value. */
2796 if (symb_table[2 * elem] == hash
2797 /* Compare the length of the name. */
2798 && name_len == extra[symb_table[2 * elem + 1]]
2799 /* Compare the name. */
2800 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2801 name_len) == 0)
2803 /* Yep, this is the entry. */
2804 break;
2807 /* Next entry. */
2808 elem += second;
2810 while (symb_table[2 * elem] != 0);
2812 return elem;
2815 /* Local function for parse_bracket_exp used in _LIBC environment.
2816 Look up the collation sequence value of BR_ELEM.
2817 Return the value if succeeded, UINT_MAX otherwise. */
2819 auto inline unsigned int
2820 __attribute ((always_inline))
2821 lookup_collation_sequence_value (br_elem)
2822 bracket_elem_t *br_elem;
2824 if (br_elem->type == SB_CHAR)
2827 if (MB_CUR_MAX == 1)
2829 if (nrules == 0)
2830 return collseqmb[br_elem->opr.ch];
2831 else
2833 wint_t wc = __btowc (br_elem->opr.ch);
2834 return __collseq_table_lookup (collseqwc, wc);
2837 else if (br_elem->type == MB_CHAR)
2839 if (nrules != 0)
2840 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2842 else if (br_elem->type == COLL_SYM)
2844 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2845 if (nrules != 0)
2847 int32_t elem, idx;
2848 elem = seek_collating_symbol_entry (br_elem->opr.name,
2849 sym_name_len);
2850 if (symb_table[2 * elem] != 0)
2852 /* We found the entry. */
2853 idx = symb_table[2 * elem + 1];
2854 /* Skip the name of collating element name. */
2855 idx += 1 + extra[idx];
2856 /* Skip the byte sequence of the collating element. */
2857 idx += 1 + extra[idx];
2858 /* Adjust for the alignment. */
2859 idx = (idx + 3) & ~3;
2860 /* Skip the multibyte collation sequence value. */
2861 idx += sizeof (unsigned int);
2862 /* Skip the wide char sequence of the collating element. */
2863 idx += sizeof (unsigned int) *
2864 (1 + *(unsigned int *) (extra + idx));
2865 /* Return the collation sequence value. */
2866 return *(unsigned int *) (extra + idx);
2868 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2870 /* No valid character. Match it as a single byte
2871 character. */
2872 return collseqmb[br_elem->opr.name[0]];
2875 else if (sym_name_len == 1)
2876 return collseqmb[br_elem->opr.name[0]];
2878 return UINT_MAX;
2881 /* Local function for parse_bracket_exp used in _LIBC environement.
2882 Build the range expression which starts from START_ELEM, and ends
2883 at END_ELEM. The result are written to MBCSET and SBCSET.
2884 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2885 mbcset->range_ends, is a pointer argument sinse we may
2886 update it. */
2888 auto inline reg_errcode_t
2889 __attribute ((always_inline))
2890 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2891 re_charset_t *mbcset;
2892 int *range_alloc;
2893 bitset_t sbcset;
2894 bracket_elem_t *start_elem, *end_elem;
2896 unsigned int ch;
2897 uint32_t start_collseq;
2898 uint32_t end_collseq;
2900 /* Equivalence Classes and Character Classes can't be a range
2901 start/end. */
2902 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2903 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2905 return REG_ERANGE;
2907 start_collseq = lookup_collation_sequence_value (start_elem);
2908 end_collseq = lookup_collation_sequence_value (end_elem);
2909 /* Check start/end collation sequence values. */
2910 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2911 return REG_ECOLLATE;
2912 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2913 return REG_ERANGE;
2915 /* Got valid collation sequence values, add them as a new entry.
2916 However, if we have no collation elements, and the character set
2917 is single byte, the single byte character set that we
2918 build below suffices. */
2919 if (nrules > 0 || dfa->mb_cur_max > 1)
2921 /* Check the space of the arrays. */
2922 if (BE (*range_alloc == mbcset->nranges, 0))
2924 /* There is not enough space, need realloc. */
2925 uint32_t *new_array_start;
2926 uint32_t *new_array_end;
2927 int new_nranges;
2929 /* +1 in case of mbcset->nranges is 0. */
2930 new_nranges = 2 * mbcset->nranges + 1;
2931 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2932 new_nranges);
2933 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2934 new_nranges);
2936 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2937 return REG_ESPACE;
2939 mbcset->range_starts = new_array_start;
2940 mbcset->range_ends = new_array_end;
2941 *range_alloc = new_nranges;
2944 mbcset->range_starts[mbcset->nranges] = start_collseq;
2945 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2948 /* Build the table for single byte characters. */
2949 for (ch = 0; ch < SBC_MAX; ch++)
2951 uint32_t ch_collseq;
2953 if (MB_CUR_MAX == 1)
2955 if (nrules == 0)
2956 ch_collseq = collseqmb[ch];
2957 else
2958 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2959 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2960 bitset_set (sbcset, ch);
2962 return REG_NOERROR;
2965 /* Local function for parse_bracket_exp used in _LIBC environement.
2966 Build the collating element which is represented by NAME.
2967 The result are written to MBCSET and SBCSET.
2968 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2969 pointer argument sinse we may update it. */
2971 auto inline reg_errcode_t
2972 __attribute ((always_inline))
2973 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2974 re_charset_t *mbcset;
2975 int *coll_sym_alloc;
2976 bitset_t sbcset;
2977 const unsigned char *name;
2979 int32_t elem, idx;
2980 size_t name_len = strlen ((const char *) name);
2981 if (nrules != 0)
2983 elem = seek_collating_symbol_entry (name, name_len);
2984 if (symb_table[2 * elem] != 0)
2986 /* We found the entry. */
2987 idx = symb_table[2 * elem + 1];
2988 /* Skip the name of collating element name. */
2989 idx += 1 + extra[idx];
2991 else if (symb_table[2 * elem] == 0 && name_len == 1)
2993 /* No valid character, treat it as a normal
2994 character. */
2995 bitset_set (sbcset, name[0]);
2996 return REG_NOERROR;
2998 else
2999 return REG_ECOLLATE;
3001 /* Got valid collation sequence, add it as a new entry. */
3002 /* Check the space of the arrays. */
3003 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3005 /* Not enough, realloc it. */
3006 /* +1 in case of mbcset->ncoll_syms is 0. */
3007 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3008 /* Use realloc since mbcset->coll_syms is NULL
3009 if *alloc == 0. */
3010 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3011 new_coll_sym_alloc);
3012 if (BE (new_coll_syms == NULL, 0))
3013 return REG_ESPACE;
3014 mbcset->coll_syms = new_coll_syms;
3015 *coll_sym_alloc = new_coll_sym_alloc;
3017 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3018 return REG_NOERROR;
3020 else
3022 if (BE (name_len != 1, 0))
3023 return REG_ECOLLATE;
3024 else
3026 bitset_set (sbcset, name[0]);
3027 return REG_NOERROR;
3031 #endif
3033 re_token_t br_token;
3034 re_bitset_ptr_t sbcset;
3035 #ifdef RE_ENABLE_I18N
3036 re_charset_t *mbcset;
3037 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3038 int equiv_class_alloc = 0, char_class_alloc = 0;
3039 #endif /* not RE_ENABLE_I18N */
3040 int non_match = 0;
3041 bin_tree_t *work_tree;
3042 int token_len;
3043 int first_round = 1;
3044 #ifdef _LIBC
3045 collseqmb = (const unsigned char *)
3046 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3047 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3048 if (nrules)
3051 if (MB_CUR_MAX > 1)
3053 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3054 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3055 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3056 _NL_COLLATE_SYMB_TABLEMB);
3057 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3058 _NL_COLLATE_SYMB_EXTRAMB);
3060 #endif
3061 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3062 #ifdef RE_ENABLE_I18N
3063 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3064 #endif /* RE_ENABLE_I18N */
3065 #ifdef RE_ENABLE_I18N
3066 if (BE (sbcset == NULL || mbcset == NULL, 0))
3067 #else
3068 if (BE (sbcset == NULL, 0))
3069 #endif /* RE_ENABLE_I18N */
3071 re_free (sbcset);
3072 #ifdef RE_ENABLE_I18N
3073 re_free (mbcset);
3074 #endif
3075 *err = REG_ESPACE;
3076 return NULL;
3079 token_len = peek_token_bracket (token, regexp, syntax);
3080 if (BE (token->type == END_OF_RE, 0))
3082 *err = REG_BADPAT;
3083 goto parse_bracket_exp_free_return;
3085 if (token->type == OP_NON_MATCH_LIST)
3087 #ifdef RE_ENABLE_I18N
3088 mbcset->non_match = 1;
3089 #endif /* not RE_ENABLE_I18N */
3090 non_match = 1;
3091 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3092 bitset_set (sbcset, '\n');
3093 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3094 token_len = peek_token_bracket (token, regexp, syntax);
3095 if (BE (token->type == END_OF_RE, 0))
3097 *err = REG_BADPAT;
3098 goto parse_bracket_exp_free_return;
3102 /* We treat the first ']' as a normal character. */
3103 if (token->type == OP_CLOSE_BRACKET)
3104 token->type = CHARACTER;
3106 while (1)
3108 bracket_elem_t start_elem, end_elem;
3109 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3110 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3111 reg_errcode_t ret;
3112 int token_len2 = 0, is_range_exp = 0;
3113 re_token_t token2;
3115 start_elem.opr.name = start_name_buf;
3116 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3117 syntax, first_round);
3118 if (BE (ret != REG_NOERROR, 0))
3120 *err = ret;
3121 goto parse_bracket_exp_free_return;
3123 first_round = 0;
3125 /* Get information about the next token. We need it in any case. */
3126 token_len = peek_token_bracket (token, regexp, syntax);
3128 /* Do not check for ranges if we know they are not allowed. */
3129 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3131 if (BE (token->type == END_OF_RE, 0))
3133 *err = REG_EBRACK;
3134 goto parse_bracket_exp_free_return;
3136 if (token->type == OP_CHARSET_RANGE)
3138 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3139 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3140 if (BE (token2.type == END_OF_RE, 0))
3142 *err = REG_EBRACK;
3143 goto parse_bracket_exp_free_return;
3145 if (token2.type == OP_CLOSE_BRACKET)
3147 /* We treat the last '-' as a normal character. */
3148 re_string_skip_bytes (regexp, -token_len);
3149 token->type = CHARACTER;
3151 else
3152 is_range_exp = 1;
3156 if (is_range_exp == 1)
3158 end_elem.opr.name = end_name_buf;
3159 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3160 dfa, syntax, 1);
3161 if (BE (ret != REG_NOERROR, 0))
3163 *err = ret;
3164 goto parse_bracket_exp_free_return;
3167 token_len = peek_token_bracket (token, regexp, syntax);
3169 #ifdef _LIBC
3170 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3171 &start_elem, &end_elem);
3172 #else
3173 # ifdef RE_ENABLE_I18N
3174 *err = build_range_exp (sbcset,
3175 dfa->mb_cur_max > 1 ? mbcset : NULL,
3176 &range_alloc, &start_elem, &end_elem);
3177 # else
3178 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3179 # endif
3180 #endif /* RE_ENABLE_I18N */
3181 if (BE (*err != REG_NOERROR, 0))
3182 goto parse_bracket_exp_free_return;
3184 else
3186 switch (start_elem.type)
3188 case SB_CHAR:
3189 bitset_set (sbcset, start_elem.opr.ch);
3190 break;
3191 #ifdef RE_ENABLE_I18N
3192 case MB_CHAR:
3193 /* Check whether the array has enough space. */
3194 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3196 wchar_t *new_mbchars;
3197 /* Not enough, realloc it. */
3198 /* +1 in case of mbcset->nmbchars is 0. */
3199 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3200 /* Use realloc since array is NULL if *alloc == 0. */
3201 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3202 mbchar_alloc);
3203 if (BE (new_mbchars == NULL, 0))
3204 goto parse_bracket_exp_espace;
3205 mbcset->mbchars = new_mbchars;
3207 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3208 break;
3209 #endif /* RE_ENABLE_I18N */
3210 case EQUIV_CLASS:
3211 *err = build_equiv_class (sbcset,
3212 #ifdef RE_ENABLE_I18N
3213 mbcset, &equiv_class_alloc,
3214 #endif /* RE_ENABLE_I18N */
3215 start_elem.opr.name);
3216 if (BE (*err != REG_NOERROR, 0))
3217 goto parse_bracket_exp_free_return;
3218 break;
3219 case COLL_SYM:
3220 *err = build_collating_symbol (sbcset,
3221 #ifdef RE_ENABLE_I18N
3222 mbcset, &coll_sym_alloc,
3223 #endif /* RE_ENABLE_I18N */
3224 start_elem.opr.name);
3225 if (BE (*err != REG_NOERROR, 0))
3226 goto parse_bracket_exp_free_return;
3227 break;
3228 case CHAR_CLASS:
3229 *err = build_charclass (regexp->trans, sbcset,
3230 #ifdef RE_ENABLE_I18N
3231 mbcset, &char_class_alloc,
3232 #endif /* RE_ENABLE_I18N */
3233 start_elem.opr.name, syntax);
3234 if (BE (*err != REG_NOERROR, 0))
3235 goto parse_bracket_exp_free_return;
3236 break;
3237 default:
3238 assert (0);
3239 break;
3242 if (BE (token->type == END_OF_RE, 0))
3244 *err = REG_EBRACK;
3245 goto parse_bracket_exp_free_return;
3247 if (token->type == OP_CLOSE_BRACKET)
3248 break;
3251 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3253 /* If it is non-matching list. */
3254 if (non_match)
3255 bitset_not (sbcset);
3257 #ifdef RE_ENABLE_I18N
3258 /* Ensure only single byte characters are set. */
3259 if (dfa->mb_cur_max > 1)
3260 bitset_mask (sbcset, dfa->sb_char);
3262 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3263 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3264 || mbcset->non_match)))
3266 bin_tree_t *mbc_tree;
3267 int sbc_idx;
3268 /* Build a tree for complex bracket. */
3269 dfa->has_mb_node = 1;
3270 br_token.type = COMPLEX_BRACKET;
3271 br_token.opr.mbcset = mbcset;
3272 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3273 if (BE (mbc_tree == NULL, 0))
3274 goto parse_bracket_exp_espace;
3275 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3276 if (sbcset[sbc_idx])
3277 break;
3278 /* If there are no bits set in sbcset, there is no point
3279 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3280 if (sbc_idx < BITSET_WORDS)
3282 /* Build a tree for simple bracket. */
3283 br_token.type = SIMPLE_BRACKET;
3284 br_token.opr.sbcset = sbcset;
3285 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3286 if (BE (work_tree == NULL, 0))
3287 goto parse_bracket_exp_espace;
3289 /* Then join them by ALT node. */
3290 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3291 if (BE (work_tree == NULL, 0))
3292 goto parse_bracket_exp_espace;
3294 else
3296 re_free (sbcset);
3297 work_tree = mbc_tree;
3300 else
3301 #endif /* not RE_ENABLE_I18N */
3303 #ifdef RE_ENABLE_I18N
3304 free_charset (mbcset);
3305 #endif
3306 /* Build a tree for simple bracket. */
3307 br_token.type = SIMPLE_BRACKET;
3308 br_token.opr.sbcset = sbcset;
3309 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3310 if (BE (work_tree == NULL, 0))
3311 goto parse_bracket_exp_espace;
3313 return work_tree;
3315 parse_bracket_exp_espace:
3316 *err = REG_ESPACE;
3317 parse_bracket_exp_free_return:
3318 re_free (sbcset);
3319 #ifdef RE_ENABLE_I18N
3320 free_charset (mbcset);
3321 #endif /* RE_ENABLE_I18N */
3322 return NULL;
3325 /* Parse an element in the bracket expression. */
3327 static reg_errcode_t
3328 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3329 re_token_t *token, int token_len, re_dfa_t *dfa,
3330 reg_syntax_t syntax, int accept_hyphen)
3332 #ifdef RE_ENABLE_I18N
3333 int cur_char_size;
3334 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3335 if (cur_char_size > 1)
3337 elem->type = MB_CHAR;
3338 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3339 re_string_skip_bytes (regexp, cur_char_size);
3340 return REG_NOERROR;
3342 #endif /* RE_ENABLE_I18N */
3343 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3344 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3345 || token->type == OP_OPEN_EQUIV_CLASS)
3346 return parse_bracket_symbol (elem, regexp, token);
3347 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3349 /* A '-' must only appear as anything but a range indicator before
3350 the closing bracket. Everything else is an error. */
3351 re_token_t token2;
3352 (void) peek_token_bracket (&token2, regexp, syntax);
3353 if (token2.type != OP_CLOSE_BRACKET)
3354 /* The actual error value is not standardized since this whole
3355 case is undefined. But ERANGE makes good sense. */
3356 return REG_ERANGE;
3358 elem->type = SB_CHAR;
3359 elem->opr.ch = token->opr.c;
3360 return REG_NOERROR;
3363 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3364 such as [:<character_class>:], [.<collating_element>.], and
3365 [=<equivalent_class>=]. */
3367 static reg_errcode_t
3368 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3369 re_token_t *token)
3371 unsigned char ch, delim = token->opr.c;
3372 int i = 0;
3373 if (re_string_eoi(regexp))
3374 return REG_EBRACK;
3375 for (;; ++i)
3377 if (i >= BRACKET_NAME_BUF_SIZE)
3378 return REG_EBRACK;
3379 if (token->type == OP_OPEN_CHAR_CLASS)
3380 ch = re_string_fetch_byte_case (regexp);
3381 else
3382 ch = re_string_fetch_byte (regexp);
3383 if (re_string_eoi(regexp))
3384 return REG_EBRACK;
3385 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3386 break;
3387 elem->opr.name[i] = ch;
3389 re_string_skip_bytes (regexp, 1);
3390 elem->opr.name[i] = '\0';
3391 switch (token->type)
3393 case OP_OPEN_COLL_ELEM:
3394 elem->type = COLL_SYM;
3395 break;
3396 case OP_OPEN_EQUIV_CLASS:
3397 elem->type = EQUIV_CLASS;
3398 break;
3399 case OP_OPEN_CHAR_CLASS:
3400 elem->type = CHAR_CLASS;
3401 break;
3402 default:
3403 break;
3405 return REG_NOERROR;
3408 /* Helper function for parse_bracket_exp.
3409 Build the equivalence class which is represented by NAME.
3410 The result are written to MBCSET and SBCSET.
3411 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3412 is a pointer argument sinse we may update it. */
3414 static reg_errcode_t
3415 #ifdef RE_ENABLE_I18N
3416 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3417 int *equiv_class_alloc, const unsigned char *name)
3418 #else /* not RE_ENABLE_I18N */
3419 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3420 #endif /* not RE_ENABLE_I18N */
3422 #ifdef _LIBC
3423 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3424 if (nrules != 0)
3426 const int32_t *table, *indirect;
3427 const unsigned char *weights, *extra, *cp;
3428 unsigned char char_buf[2];
3429 int32_t idx1, idx2;
3430 unsigned int ch;
3431 size_t len;
3432 /* This #include defines a local function! */
3433 # include <locale/weight.h>
3434 /* Calculate the index for equivalence class. */
3435 cp = name;
3436 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3437 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3438 _NL_COLLATE_WEIGHTMB);
3439 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3440 _NL_COLLATE_EXTRAMB);
3441 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3442 _NL_COLLATE_INDIRECTMB);
3443 idx1 = findidx (&cp, -1);
3444 if (BE (idx1 == 0 || *cp != '\0', 0))
3445 /* This isn't a valid character. */
3446 return REG_ECOLLATE;
3448 /* Build single byte matcing table for this equivalence class. */
3449 len = weights[idx1 & 0xffffff];
3450 for (ch = 0; ch < SBC_MAX; ++ch)
3452 char_buf[0] = ch;
3453 cp = char_buf;
3454 idx2 = findidx (&cp, 1);
3456 idx2 = table[ch];
3458 if (idx2 == 0)
3459 /* This isn't a valid character. */
3460 continue;
3461 /* Compare only if the length matches and the collation rule
3462 index is the same. */
3463 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3465 int cnt = 0;
3467 while (cnt <= len &&
3468 weights[(idx1 & 0xffffff) + 1 + cnt]
3469 == weights[(idx2 & 0xffffff) + 1 + cnt])
3470 ++cnt;
3472 if (cnt > len)
3473 bitset_set (sbcset, ch);
3476 /* Check whether the array has enough space. */
3477 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3479 /* Not enough, realloc it. */
3480 /* +1 in case of mbcset->nequiv_classes is 0. */
3481 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3482 /* Use realloc since the array is NULL if *alloc == 0. */
3483 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3484 int32_t,
3485 new_equiv_class_alloc);
3486 if (BE (new_equiv_classes == NULL, 0))
3487 return REG_ESPACE;
3488 mbcset->equiv_classes = new_equiv_classes;
3489 *equiv_class_alloc = new_equiv_class_alloc;
3491 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3493 else
3494 #endif /* _LIBC */
3496 if (BE (strlen ((const char *) name) != 1, 0))
3497 return REG_ECOLLATE;
3498 bitset_set (sbcset, *name);
3500 return REG_NOERROR;
3503 /* Helper function for parse_bracket_exp.
3504 Build the character class which is represented by NAME.
3505 The result are written to MBCSET and SBCSET.
3506 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3507 is a pointer argument sinse we may update it. */
3509 static reg_errcode_t
3510 #ifdef RE_ENABLE_I18N
3511 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3512 re_charset_t *mbcset, int *char_class_alloc,
3513 const unsigned char *class_name, reg_syntax_t syntax)
3514 #else /* not RE_ENABLE_I18N */
3515 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3516 const unsigned char *class_name, reg_syntax_t syntax)
3517 #endif /* not RE_ENABLE_I18N */
3519 int i;
3520 const char *name = (const char *) class_name;
3522 /* In case of REG_ICASE "upper" and "lower" match the both of
3523 upper and lower cases. */
3524 if ((syntax & RE_ICASE)
3525 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3526 name = "alpha";
3528 #ifdef RE_ENABLE_I18N
3529 /* Check the space of the arrays. */
3530 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3532 /* Not enough, realloc it. */
3533 /* +1 in case of mbcset->nchar_classes is 0. */
3534 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3535 /* Use realloc since array is NULL if *alloc == 0. */
3536 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3537 new_char_class_alloc);
3538 if (BE (new_char_classes == NULL, 0))
3539 return REG_ESPACE;
3540 mbcset->char_classes = new_char_classes;
3541 *char_class_alloc = new_char_class_alloc;
3543 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3544 #endif /* RE_ENABLE_I18N */
3546 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3547 do { \
3548 if (BE (trans != NULL, 0)) \
3550 for (i = 0; i < SBC_MAX; ++i) \
3551 if (ctype_func (i)) \
3552 bitset_set (sbcset, trans[i]); \
3554 else \
3556 for (i = 0; i < SBC_MAX; ++i) \
3557 if (ctype_func (i)) \
3558 bitset_set (sbcset, i); \
3560 } while (0)
3562 if (strcmp (name, "alnum") == 0)
3563 BUILD_CHARCLASS_LOOP (isalnum);
3564 else if (strcmp (name, "cntrl") == 0)
3565 BUILD_CHARCLASS_LOOP (iscntrl);
3566 else if (strcmp (name, "lower") == 0)
3567 BUILD_CHARCLASS_LOOP (islower);
3568 else if (strcmp (name, "space") == 0)
3569 BUILD_CHARCLASS_LOOP (isspace);
3570 else if (strcmp (name, "alpha") == 0)
3571 BUILD_CHARCLASS_LOOP (isalpha);
3572 else if (strcmp (name, "digit") == 0)
3573 BUILD_CHARCLASS_LOOP (isdigit);
3574 else if (strcmp (name, "print") == 0)
3575 BUILD_CHARCLASS_LOOP (isprint);
3576 else if (strcmp (name, "upper") == 0)
3577 BUILD_CHARCLASS_LOOP (isupper);
3578 else if (strcmp (name, "blank") == 0)
3579 BUILD_CHARCLASS_LOOP (isblank);
3580 else if (strcmp (name, "graph") == 0)
3581 BUILD_CHARCLASS_LOOP (isgraph);
3582 else if (strcmp (name, "punct") == 0)
3583 BUILD_CHARCLASS_LOOP (ispunct);
3584 else if (strcmp (name, "xdigit") == 0)
3585 BUILD_CHARCLASS_LOOP (isxdigit);
3586 else
3587 return REG_ECTYPE;
3589 return REG_NOERROR;
3592 static bin_tree_t *
3593 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3594 const unsigned char *class_name,
3595 const unsigned char *extra, int non_match,
3596 reg_errcode_t *err)
3598 re_bitset_ptr_t sbcset;
3599 #ifdef RE_ENABLE_I18N
3600 re_charset_t *mbcset;
3601 int alloc = 0;
3602 #endif /* not RE_ENABLE_I18N */
3603 reg_errcode_t ret;
3604 re_token_t br_token;
3605 bin_tree_t *tree;
3607 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3608 #ifdef RE_ENABLE_I18N
3609 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3610 #endif /* RE_ENABLE_I18N */
3612 #ifdef RE_ENABLE_I18N
3613 if (BE (sbcset == NULL || mbcset == NULL, 0))
3614 #else /* not RE_ENABLE_I18N */
3615 if (BE (sbcset == NULL, 0))
3616 #endif /* not RE_ENABLE_I18N */
3618 *err = REG_ESPACE;
3619 return NULL;
3622 if (non_match)
3624 #ifdef RE_ENABLE_I18N
3625 mbcset->non_match = 1;
3626 #endif /* not RE_ENABLE_I18N */
3629 /* We don't care the syntax in this case. */
3630 ret = build_charclass (trans, sbcset,
3631 #ifdef RE_ENABLE_I18N
3632 mbcset, &alloc,
3633 #endif /* RE_ENABLE_I18N */
3634 class_name, 0);
3636 if (BE (ret != REG_NOERROR, 0))
3638 re_free (sbcset);
3639 #ifdef RE_ENABLE_I18N
3640 free_charset (mbcset);
3641 #endif /* RE_ENABLE_I18N */
3642 *err = ret;
3643 return NULL;
3645 /* \w match '_' also. */
3646 for (; *extra; extra++)
3647 bitset_set (sbcset, *extra);
3649 /* If it is non-matching list. */
3650 if (non_match)
3651 bitset_not (sbcset);
3653 #ifdef RE_ENABLE_I18N
3654 /* Ensure only single byte characters are set. */
3655 if (dfa->mb_cur_max > 1)
3656 bitset_mask (sbcset, dfa->sb_char);
3657 #endif
3659 /* Build a tree for simple bracket. */
3660 br_token.type = SIMPLE_BRACKET;
3661 br_token.opr.sbcset = sbcset;
3662 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3663 if (BE (tree == NULL, 0))
3664 goto build_word_op_espace;
3666 #ifdef RE_ENABLE_I18N
3667 if (dfa->mb_cur_max > 1)
3669 bin_tree_t *mbc_tree;
3670 /* Build a tree for complex bracket. */
3671 br_token.type = COMPLEX_BRACKET;
3672 br_token.opr.mbcset = mbcset;
3673 dfa->has_mb_node = 1;
3674 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3675 if (BE (mbc_tree == NULL, 0))
3676 goto build_word_op_espace;
3677 /* Then join them by ALT node. */
3678 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3679 if (BE (mbc_tree != NULL, 1))
3680 return tree;
3682 else
3684 free_charset (mbcset);
3685 return tree;
3687 #else /* not RE_ENABLE_I18N */
3688 return tree;
3689 #endif /* not RE_ENABLE_I18N */
3691 build_word_op_espace:
3692 re_free (sbcset);
3693 #ifdef RE_ENABLE_I18N
3694 free_charset (mbcset);
3695 #endif /* RE_ENABLE_I18N */
3696 *err = REG_ESPACE;
3697 return NULL;
3700 /* This is intended for the expressions like "a{1,3}".
3701 Fetch a number from `input', and return the number.
3702 Return -1, if the number field is empty like "{,1}".
3703 Return -2, If an error is occured. */
3705 static int
3706 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3708 int num = -1;
3709 unsigned char c;
3710 while (1)
3712 fetch_token (token, input, syntax);
3713 c = token->opr.c;
3714 if (BE (token->type == END_OF_RE, 0))
3715 return -2;
3716 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3717 break;
3718 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3719 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3720 num = (num > RE_DUP_MAX) ? -2 : num;
3722 return num;
3725 #ifdef RE_ENABLE_I18N
3726 static void
3727 free_charset (re_charset_t *cset)
3729 re_free (cset->mbchars);
3730 # ifdef _LIBC
3731 re_free (cset->coll_syms);
3732 re_free (cset->equiv_classes);
3733 re_free (cset->range_starts);
3734 re_free (cset->range_ends);
3735 # endif
3736 re_free (cset->char_classes);
3737 re_free (cset);
3739 #endif /* RE_ENABLE_I18N */
3741 /* Functions for binary tree operation. */
3743 /* Create a tree node. */
3745 static bin_tree_t *
3746 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3747 re_token_type_t type)
3749 re_token_t t;
3750 t.type = type;
3751 return create_token_tree (dfa, left, right, &t);
3754 static bin_tree_t *
3755 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3756 const re_token_t *token)
3758 bin_tree_t *tree;
3759 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3761 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3763 if (storage == NULL)
3764 return NULL;
3765 storage->next = dfa->str_tree_storage;
3766 dfa->str_tree_storage = storage;
3767 dfa->str_tree_storage_idx = 0;
3769 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3771 tree->parent = NULL;
3772 tree->left = left;
3773 tree->right = right;
3774 tree->token = *token;
3775 tree->token.duplicated = 0;
3776 tree->token.opt_subexp = 0;
3777 tree->first = NULL;
3778 tree->next = NULL;
3779 tree->node_idx = -1;
3781 if (left != NULL)
3782 left->parent = tree;
3783 if (right != NULL)
3784 right->parent = tree;
3785 return tree;
3788 /* Mark the tree SRC as an optional subexpression.
3789 To be called from preorder or postorder. */
3791 static reg_errcode_t
3792 mark_opt_subexp (void *extra, bin_tree_t *node)
3794 int idx = (int) (long) extra;
3795 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3796 node->token.opt_subexp = 1;
3798 return REG_NOERROR;
3801 /* Free the allocated memory inside NODE. */
3803 static void
3804 free_token (re_token_t *node)
3806 #ifdef RE_ENABLE_I18N
3807 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3808 free_charset (node->opr.mbcset);
3809 else
3810 #endif /* RE_ENABLE_I18N */
3811 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3812 re_free (node->opr.sbcset);
3815 /* Worker function for tree walking. Free the allocated memory inside NODE
3816 and its children. */
3818 static reg_errcode_t
3819 free_tree (void *extra, bin_tree_t *node)
3821 free_token (&node->token);
3822 return REG_NOERROR;
3826 /* Duplicate the node SRC, and return new node. This is a preorder
3827 visit similar to the one implemented by the generic visitor, but
3828 we need more infrastructure to maintain two parallel trees --- so,
3829 it's easier to duplicate. */
3831 static bin_tree_t *
3832 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3834 const bin_tree_t *node;
3835 bin_tree_t *dup_root;
3836 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3838 for (node = root; ; )
3840 /* Create a new tree and link it back to the current parent. */
3841 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3842 if (*p_new == NULL)
3843 return NULL;
3844 (*p_new)->parent = dup_node;
3845 (*p_new)->token.duplicated = 1;
3846 dup_node = *p_new;
3848 /* Go to the left node, or up and to the right. */
3849 if (node->left)
3851 node = node->left;
3852 p_new = &dup_node->left;
3854 else
3856 const bin_tree_t *prev = NULL;
3857 while (node->right == prev || node->right == NULL)
3859 prev = node;
3860 node = node->parent;
3861 dup_node = dup_node->parent;
3862 if (!node)
3863 return dup_root;
3865 node = node->right;
3866 p_new = &dup_node->right;