Check for existence of pre-instaleld pst3 for Solaris tests in t/check_procs.t
[monitoring-plugins.git] / gl / regcomp.c
blobb114b4d1400f480466abbc8f2532f8c2d63da8e6
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009
3 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
5 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along
18 with this program; if not, write to the Free Software Foundation,
19 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 #ifdef RE_ENABLE_I18N
28 static void free_charset (re_charset_t *cset);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t *preg);
31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 #ifdef RE_ENABLE_I18N
33 static void optimize_utf8 (re_dfa_t *dfa);
34 #endif
35 static reg_errcode_t analyze (regex_t *preg);
36 static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
38 void *extra);
39 static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 bin_tree_t *node);
46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
50 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
51 unsigned int constraint);
52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 Idx node, bool root);
55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56 static Idx fetch_number (re_string_t *input, re_token_t *token,
57 reg_syntax_t syntax);
58 static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 Idx nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 Idx nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 Idx nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 Idx nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
84 reg_syntax_t syntax,
85 bool accept_hyphen);
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 re_string_t *regexp,
88 re_token_t *token);
89 #ifdef RE_ENABLE_I18N
90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 re_charset_t *mbcset,
92 Idx *equiv_class_alloc,
93 const unsigned char *name);
94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 bitset_t sbcset,
96 re_charset_t *mbcset,
97 Idx *char_class_alloc,
98 const unsigned char *class_name,
99 reg_syntax_t syntax);
100 #else /* not RE_ENABLE_I18N */
101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 bitset_t sbcset,
105 const unsigned char *class_name,
106 reg_syntax_t syntax);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const unsigned char *class_name,
111 const unsigned char *extra,
112 bool non_match, reg_errcode_t *err);
113 static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120 static void free_token (re_token_t *node);
121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
124 /* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
129 static const char __re_error_msgid[] =
131 #define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
133 "\0"
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
136 "\0"
137 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 "\0"
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 "\0"
143 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 "\0"
146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 "\0"
149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 "\0"
152 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 "\0"
155 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 "\0"
158 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 "\0"
161 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 "\0"
164 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 "\0"
167 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 "\0"
170 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 "\0"
173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 "\0"
176 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 "\0"
179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183 static const size_t __re_error_msgid_idx[] =
185 REG_NOERROR_IDX,
186 REG_NOMATCH_IDX,
187 REG_BADPAT_IDX,
188 REG_ECOLLATE_IDX,
189 REG_ECTYPE_IDX,
190 REG_EESCAPE_IDX,
191 REG_ESUBREG_IDX,
192 REG_EBRACK_IDX,
193 REG_EPAREN_IDX,
194 REG_EBRACE_IDX,
195 REG_BADBR_IDX,
196 REG_ERANGE_IDX,
197 REG_ESPACE_IDX,
198 REG_BADRPT_IDX,
199 REG_EEND_IDX,
200 REG_ESIZE_IDX,
201 REG_ERPAREN_IDX
204 /* Entry points for GNU code. */
206 /* re_compile_pattern is the GNU regular expression compiler: it
207 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208 Returns 0 if the pattern was valid, otherwise an error string.
210 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
211 are set in BUFP on entry. */
213 #ifdef _LIBC
214 const char *
215 re_compile_pattern (pattern, length, bufp)
216 const char *pattern;
217 size_t length;
218 struct re_pattern_buffer *bufp;
219 #else /* size_t might promote */
220 const char *
221 re_compile_pattern (const char *pattern, size_t length,
222 struct re_pattern_buffer *bufp)
223 #endif
225 reg_errcode_t ret;
227 /* And GNU code determines whether or not to get register information
228 by passing null for the REGS argument to re_match, etc., not by
229 setting no_sub, unless RE_NO_SUB is set. */
230 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
232 /* Match anchors at newline. */
233 bufp->newline_anchor = 1;
235 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
237 if (!ret)
238 return NULL;
239 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
241 #ifdef _LIBC
242 weak_alias (__re_compile_pattern, re_compile_pattern)
243 #endif
245 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
246 also be assigned to arbitrarily: each pattern buffer stores its own
247 syntax, so it can be changed between regex compilations. */
248 /* This has no initializer because initialized variables in Emacs
249 become read-only after dumping. */
250 reg_syntax_t re_syntax_options;
253 /* Specify the precise syntax of regexps for compilation. This provides
254 for compatibility for various utilities which historically have
255 different, incompatible syntaxes.
257 The argument SYNTAX is a bit mask comprised of the various bits
258 defined in regex.h. We return the old syntax. */
260 reg_syntax_t
261 re_set_syntax (syntax)
262 reg_syntax_t syntax;
264 reg_syntax_t ret = re_syntax_options;
266 re_syntax_options = syntax;
267 return ret;
269 #ifdef _LIBC
270 weak_alias (__re_set_syntax, re_set_syntax)
271 #endif
274 re_compile_fastmap (bufp)
275 struct re_pattern_buffer *bufp;
277 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
278 char *fastmap = bufp->fastmap;
280 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
281 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
282 if (dfa->init_state != dfa->init_state_word)
283 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
284 if (dfa->init_state != dfa->init_state_nl)
285 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
286 if (dfa->init_state != dfa->init_state_begbuf)
287 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
288 bufp->fastmap_accurate = 1;
289 return 0;
291 #ifdef _LIBC
292 weak_alias (__re_compile_fastmap, re_compile_fastmap)
293 #endif
295 static inline void
296 __attribute ((always_inline))
297 re_set_fastmap (char *fastmap, bool icase, int ch)
299 fastmap[ch] = 1;
300 if (icase)
301 fastmap[tolower (ch)] = 1;
304 /* Helper function for re_compile_fastmap.
305 Compile fastmap for the initial_state INIT_STATE. */
307 static void
308 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
309 char *fastmap)
311 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
312 Idx node_cnt;
313 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
314 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
316 Idx node = init_state->nodes.elems[node_cnt];
317 re_token_type_t type = dfa->nodes[node].type;
319 if (type == CHARACTER)
321 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
322 #ifdef RE_ENABLE_I18N
323 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
325 unsigned char buf[MB_LEN_MAX];
326 unsigned char *p;
327 wchar_t wc;
328 mbstate_t state;
330 p = buf;
331 *p++ = dfa->nodes[node].opr.c;
332 while (++node < dfa->nodes_len
333 && dfa->nodes[node].type == CHARACTER
334 && dfa->nodes[node].mb_partial)
335 *p++ = dfa->nodes[node].opr.c;
336 memset (&state, '\0', sizeof (state));
337 if (__mbrtowc (&wc, (const char *) buf, p - buf,
338 &state) == p - buf
339 && (__wcrtomb ((char *) buf, towlower (wc), &state)
340 != (size_t) -1))
341 re_set_fastmap (fastmap, false, buf[0]);
343 #endif
345 else if (type == SIMPLE_BRACKET)
347 int i, ch;
348 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
350 int j;
351 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
352 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
353 if (w & ((bitset_word_t) 1 << j))
354 re_set_fastmap (fastmap, icase, ch);
357 #ifdef RE_ENABLE_I18N
358 else if (type == COMPLEX_BRACKET)
360 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
361 Idx i;
363 # ifdef _LIBC
364 /* See if we have to try all bytes which start multiple collation
365 elements.
366 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
367 collation element, and don't catch 'b' since 'b' is
368 the only collation element which starts from 'b' (and
369 it is caught by SIMPLE_BRACKET). */
370 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
371 && (cset->ncoll_syms || cset->nranges))
373 const int32_t *table = (const int32_t *)
374 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
375 for (i = 0; i < SBC_MAX; ++i)
376 if (table[i] < 0)
377 re_set_fastmap (fastmap, icase, i);
379 # endif /* _LIBC */
381 /* See if we have to start the match at all multibyte characters,
382 i.e. where we would not find an invalid sequence. This only
383 applies to multibyte character sets; for single byte character
384 sets, the SIMPLE_BRACKET again suffices. */
385 if (dfa->mb_cur_max > 1
386 && (cset->nchar_classes || cset->non_match
387 # ifdef _LIBC
388 || cset->nequiv_classes
389 # endif /* _LIBC */
392 unsigned char c = 0;
395 mbstate_t mbs;
396 memset (&mbs, 0, sizeof (mbs));
397 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
398 re_set_fastmap (fastmap, false, (int) c);
400 while (++c != 0);
403 else
405 /* ... Else catch all bytes which can start the mbchars. */
406 for (i = 0; i < cset->nmbchars; ++i)
408 char buf[256];
409 mbstate_t state;
410 memset (&state, '\0', sizeof (state));
411 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
412 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
413 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
415 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
416 != (size_t) -1)
417 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
422 #endif /* RE_ENABLE_I18N */
423 else if (type == OP_PERIOD
424 #ifdef RE_ENABLE_I18N
425 || type == OP_UTF8_PERIOD
426 #endif /* RE_ENABLE_I18N */
427 || type == END_OF_RE)
429 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
430 if (type == END_OF_RE)
431 bufp->can_be_null = 1;
432 return;
437 /* Entry point for POSIX code. */
438 /* regcomp takes a regular expression as a string and compiles it.
440 PREG is a regex_t *. We do not expect any fields to be initialized,
441 since POSIX says we shouldn't. Thus, we set
443 `buffer' to the compiled pattern;
444 `used' to the length of the compiled pattern;
445 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
446 REG_EXTENDED bit in CFLAGS is set; otherwise, to
447 RE_SYNTAX_POSIX_BASIC;
448 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
449 `fastmap' to an allocated space for the fastmap;
450 `fastmap_accurate' to zero;
451 `re_nsub' to the number of subexpressions in PATTERN.
453 PATTERN is the address of the pattern string.
455 CFLAGS is a series of bits which affect compilation.
457 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
458 use POSIX basic syntax.
460 If REG_NEWLINE is set, then . and [^...] don't match newline.
461 Also, regexec will try a match beginning after every newline.
463 If REG_ICASE is set, then we considers upper- and lowercase
464 versions of letters to be equivalent when matching.
466 If REG_NOSUB is set, then when PREG is passed to regexec, that
467 routine will report only success or failure, and nothing about the
468 registers.
470 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
471 the return codes and their meanings.) */
474 regcomp (preg, pattern, cflags)
475 regex_t *_Restrict_ preg;
476 const char *_Restrict_ pattern;
477 int cflags;
479 reg_errcode_t ret;
480 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
481 : RE_SYNTAX_POSIX_BASIC);
483 preg->buffer = NULL;
484 preg->allocated = 0;
485 preg->used = 0;
487 /* Try to allocate space for the fastmap. */
488 preg->fastmap = re_malloc (char, SBC_MAX);
489 if (BE (preg->fastmap == NULL, 0))
490 return REG_ESPACE;
492 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
494 /* If REG_NEWLINE is set, newlines are treated differently. */
495 if (cflags & REG_NEWLINE)
496 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
497 syntax &= ~RE_DOT_NEWLINE;
498 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
499 /* It also changes the matching behavior. */
500 preg->newline_anchor = 1;
502 else
503 preg->newline_anchor = 0;
504 preg->no_sub = !!(cflags & REG_NOSUB);
505 preg->translate = NULL;
507 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
509 /* POSIX doesn't distinguish between an unmatched open-group and an
510 unmatched close-group: both are REG_EPAREN. */
511 if (ret == REG_ERPAREN)
512 ret = REG_EPAREN;
514 /* We have already checked preg->fastmap != NULL. */
515 if (BE (ret == REG_NOERROR, 1))
516 /* Compute the fastmap now, since regexec cannot modify the pattern
517 buffer. This function never fails in this implementation. */
518 (void) re_compile_fastmap (preg);
519 else
521 /* Some error occurred while compiling the expression. */
522 re_free (preg->fastmap);
523 preg->fastmap = NULL;
526 return (int) ret;
528 #ifdef _LIBC
529 weak_alias (__regcomp, regcomp)
530 #endif
532 /* Returns a message corresponding to an error code, ERRCODE, returned
533 from either regcomp or regexec. We don't use PREG here. */
535 #ifdef _LIBC
536 size_t
537 regerror (errcode, preg, errbuf, errbuf_size)
538 int errcode;
539 const regex_t *_Restrict_ preg;
540 char *_Restrict_ errbuf;
541 size_t errbuf_size;
542 #else /* size_t might promote */
543 size_t
544 regerror (int errcode, const regex_t *_Restrict_ preg,
545 char *_Restrict_ errbuf, size_t errbuf_size)
546 #endif
548 const char *msg;
549 size_t msg_size;
551 if (BE (errcode < 0
552 || errcode >= (int) (sizeof (__re_error_msgid_idx)
553 / sizeof (__re_error_msgid_idx[0])), 0))
554 /* Only error codes returned by the rest of the code should be passed
555 to this routine. If we are given anything else, or if other regex
556 code generates an invalid error code, then the program has a bug.
557 Dump core so we can fix it. */
558 abort ();
560 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
562 msg_size = strlen (msg) + 1; /* Includes the null. */
564 if (BE (errbuf_size != 0, 1))
566 size_t cpy_size = msg_size;
567 if (BE (msg_size > errbuf_size, 0))
569 cpy_size = errbuf_size - 1;
570 errbuf[cpy_size] = '\0';
572 memcpy (errbuf, msg, cpy_size);
575 return msg_size;
577 #ifdef _LIBC
578 weak_alias (__regerror, regerror)
579 #endif
582 #ifdef RE_ENABLE_I18N
583 /* This static array is used for the map to single-byte characters when
584 UTF-8 is used. Otherwise we would allocate memory just to initialize
585 it the same all the time. UTF-8 is the preferred encoding so this is
586 a worthwhile optimization. */
587 static const bitset_t utf8_sb_map =
589 /* Set the first 128 bits. */
590 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
591 # error "bitset_word_t is narrower than 32 bits"
592 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
593 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
594 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
595 BITSET_WORD_MAX, BITSET_WORD_MAX,
596 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
597 BITSET_WORD_MAX,
598 # endif
599 (BITSET_WORD_MAX
600 >> (SBC_MAX % BITSET_WORD_BITS == 0
602 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
604 #endif
607 static void
608 free_dfa_content (re_dfa_t *dfa)
610 Idx i, j;
612 if (dfa->nodes)
613 for (i = 0; i < dfa->nodes_len; ++i)
614 free_token (dfa->nodes + i);
615 re_free (dfa->nexts);
616 for (i = 0; i < dfa->nodes_len; ++i)
618 if (dfa->eclosures != NULL)
619 re_node_set_free (dfa->eclosures + i);
620 if (dfa->inveclosures != NULL)
621 re_node_set_free (dfa->inveclosures + i);
622 if (dfa->edests != NULL)
623 re_node_set_free (dfa->edests + i);
625 re_free (dfa->edests);
626 re_free (dfa->eclosures);
627 re_free (dfa->inveclosures);
628 re_free (dfa->nodes);
630 if (dfa->state_table)
631 for (i = 0; i <= dfa->state_hash_mask; ++i)
633 struct re_state_table_entry *entry = dfa->state_table + i;
634 for (j = 0; j < entry->num; ++j)
636 re_dfastate_t *state = entry->array[j];
637 free_state (state);
639 re_free (entry->array);
641 re_free (dfa->state_table);
642 #ifdef RE_ENABLE_I18N
643 if (dfa->sb_char != utf8_sb_map)
644 re_free (dfa->sb_char);
645 #endif
646 re_free (dfa->subexp_map);
647 #ifdef DEBUG
648 re_free (dfa->re_str);
649 #endif
651 re_free (dfa);
655 /* Free dynamically allocated space used by PREG. */
657 void
658 regfree (preg)
659 regex_t *preg;
661 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
662 if (BE (dfa != NULL, 1))
663 free_dfa_content (dfa);
664 preg->buffer = NULL;
665 preg->allocated = 0;
667 re_free (preg->fastmap);
668 preg->fastmap = NULL;
670 re_free (preg->translate);
671 preg->translate = NULL;
673 #ifdef _LIBC
674 weak_alias (__regfree, regfree)
675 #endif
677 /* Entry points compatible with 4.2 BSD regex library. We don't define
678 them unless specifically requested. */
680 #if defined _REGEX_RE_COMP || defined _LIBC
682 /* BSD has one and only one pattern buffer. */
683 static struct re_pattern_buffer re_comp_buf;
685 char *
686 # ifdef _LIBC
687 /* Make these definitions weak in libc, so POSIX programs can redefine
688 these names if they don't use our functions, and still use
689 regcomp/regexec above without link errors. */
690 weak_function
691 # endif
692 re_comp (s)
693 const char *s;
695 reg_errcode_t ret;
696 char *fastmap;
698 if (!s)
700 if (!re_comp_buf.buffer)
701 return gettext ("No previous regular expression");
702 return 0;
705 if (re_comp_buf.buffer)
707 fastmap = re_comp_buf.fastmap;
708 re_comp_buf.fastmap = NULL;
709 __regfree (&re_comp_buf);
710 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
711 re_comp_buf.fastmap = fastmap;
714 if (re_comp_buf.fastmap == NULL)
716 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
717 if (re_comp_buf.fastmap == NULL)
718 return (char *) gettext (__re_error_msgid
719 + __re_error_msgid_idx[(int) REG_ESPACE]);
722 /* Since `re_exec' always passes NULL for the `regs' argument, we
723 don't need to initialize the pattern buffer fields which affect it. */
725 /* Match anchors at newlines. */
726 re_comp_buf.newline_anchor = 1;
728 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
730 if (!ret)
731 return NULL;
733 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
734 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
737 #ifdef _LIBC
738 libc_freeres_fn (free_mem)
740 __regfree (&re_comp_buf);
742 #endif
744 #endif /* _REGEX_RE_COMP */
746 /* Internal entry point.
747 Compile the regular expression PATTERN, whose length is LENGTH.
748 SYNTAX indicate regular expression's syntax. */
750 static reg_errcode_t
751 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
752 reg_syntax_t syntax)
754 reg_errcode_t err = REG_NOERROR;
755 re_dfa_t *dfa;
756 re_string_t regexp;
758 /* Initialize the pattern buffer. */
759 preg->fastmap_accurate = 0;
760 preg->syntax = syntax;
761 preg->not_bol = preg->not_eol = 0;
762 preg->used = 0;
763 preg->re_nsub = 0;
764 preg->can_be_null = 0;
765 preg->regs_allocated = REGS_UNALLOCATED;
767 /* Initialize the dfa. */
768 dfa = (re_dfa_t *) preg->buffer;
769 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
771 /* If zero allocated, but buffer is non-null, try to realloc
772 enough space. This loses if buffer's address is bogus, but
773 that is the user's responsibility. If ->buffer is NULL this
774 is a simple allocation. */
775 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
776 if (dfa == NULL)
777 return REG_ESPACE;
778 preg->allocated = sizeof (re_dfa_t);
779 preg->buffer = (unsigned char *) dfa;
781 preg->used = sizeof (re_dfa_t);
783 err = init_dfa (dfa, length);
784 if (BE (err != REG_NOERROR, 0))
786 free_dfa_content (dfa);
787 preg->buffer = NULL;
788 preg->allocated = 0;
789 return err;
791 #ifdef DEBUG
792 /* Note: length+1 will not overflow since it is checked in init_dfa. */
793 dfa->re_str = re_malloc (char, length + 1);
794 strncpy (dfa->re_str, pattern, length + 1);
795 #endif
797 __libc_lock_init (dfa->lock);
799 err = re_string_construct (&regexp, pattern, length, preg->translate,
800 (syntax & RE_ICASE) != 0, dfa);
801 if (BE (err != REG_NOERROR, 0))
803 re_compile_internal_free_return:
804 free_workarea_compile (preg);
805 re_string_destruct (&regexp);
806 free_dfa_content (dfa);
807 preg->buffer = NULL;
808 preg->allocated = 0;
809 return err;
812 /* Parse the regular expression, and build a structure tree. */
813 preg->re_nsub = 0;
814 dfa->str_tree = parse (&regexp, preg, syntax, &err);
815 if (BE (dfa->str_tree == NULL, 0))
816 goto re_compile_internal_free_return;
818 /* Analyze the tree and create the nfa. */
819 err = analyze (preg);
820 if (BE (err != REG_NOERROR, 0))
821 goto re_compile_internal_free_return;
823 #ifdef RE_ENABLE_I18N
824 /* If possible, do searching in single byte encoding to speed things up. */
825 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
826 optimize_utf8 (dfa);
827 #endif
829 /* Then create the initial state of the dfa. */
830 err = create_initial_state (dfa);
832 /* Release work areas. */
833 free_workarea_compile (preg);
834 re_string_destruct (&regexp);
836 if (BE (err != REG_NOERROR, 0))
838 free_dfa_content (dfa);
839 preg->buffer = NULL;
840 preg->allocated = 0;
843 return err;
846 /* Initialize DFA. We use the length of the regular expression PAT_LEN
847 as the initial length of some arrays. */
849 static reg_errcode_t
850 init_dfa (re_dfa_t *dfa, size_t pat_len)
852 __re_size_t table_size;
853 #ifdef RE_ENABLE_I18N
854 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
855 #else
856 size_t max_i18n_object_size = 0;
857 #endif
858 size_t max_object_size =
859 MAX (sizeof (struct re_state_table_entry),
860 MAX (sizeof (re_token_t),
861 MAX (sizeof (re_node_set),
862 MAX (sizeof (regmatch_t),
863 max_i18n_object_size))));
865 memset (dfa, '\0', sizeof (re_dfa_t));
867 /* Force allocation of str_tree_storage the first time. */
868 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
870 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
871 calculation below, and for similar doubling calculations
872 elsewhere. And it's <= rather than <, because some of the
873 doubling calculations add 1 afterwards. */
874 if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
875 return REG_ESPACE;
877 dfa->nodes_alloc = pat_len + 1;
878 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
880 /* table_size = 2 ^ ceil(log pat_len) */
881 for (table_size = 1; ; table_size <<= 1)
882 if (table_size > pat_len)
883 break;
885 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
886 dfa->state_hash_mask = table_size - 1;
888 dfa->mb_cur_max = MB_CUR_MAX;
889 #ifdef _LIBC
890 if (dfa->mb_cur_max == 6
891 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
892 dfa->is_utf8 = 1;
893 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
894 != 0);
895 #else
896 if (strcmp (locale_charset (), "UTF-8") == 0)
897 dfa->is_utf8 = 1;
899 /* We check exhaustively in the loop below if this charset is a
900 superset of ASCII. */
901 dfa->map_notascii = 0;
902 #endif
904 #ifdef RE_ENABLE_I18N
905 if (dfa->mb_cur_max > 1)
907 if (dfa->is_utf8)
908 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
909 else
911 int i, j, ch;
913 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
914 if (BE (dfa->sb_char == NULL, 0))
915 return REG_ESPACE;
917 /* Set the bits corresponding to single byte chars. */
918 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
919 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
921 wint_t wch = __btowc (ch);
922 if (wch != WEOF)
923 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
924 # ifndef _LIBC
925 if (isascii (ch) && wch != ch)
926 dfa->map_notascii = 1;
927 # endif
931 #endif
933 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
934 return REG_ESPACE;
935 return REG_NOERROR;
938 /* Initialize WORD_CHAR table, which indicate which character is
939 "word". In this case "word" means that it is the word construction
940 character used by some operators like "\<", "\>", etc. */
942 static void
943 internal_function
944 init_word_char (re_dfa_t *dfa)
946 int i, j, ch;
947 dfa->word_ops_used = 1;
948 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
949 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
950 if (isalnum (ch) || ch == '_')
951 dfa->word_char[i] |= (bitset_word_t) 1 << j;
954 /* Free the work area which are only used while compiling. */
956 static void
957 free_workarea_compile (regex_t *preg)
959 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
960 bin_tree_storage_t *storage, *next;
961 for (storage = dfa->str_tree_storage; storage; storage = next)
963 next = storage->next;
964 re_free (storage);
966 dfa->str_tree_storage = NULL;
967 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
968 dfa->str_tree = NULL;
969 re_free (dfa->org_indices);
970 dfa->org_indices = NULL;
973 /* Create initial states for all contexts. */
975 static reg_errcode_t
976 create_initial_state (re_dfa_t *dfa)
978 Idx first, i;
979 reg_errcode_t err;
980 re_node_set init_nodes;
982 /* Initial states have the epsilon closure of the node which is
983 the first node of the regular expression. */
984 first = dfa->str_tree->first->node_idx;
985 dfa->init_node = first;
986 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
987 if (BE (err != REG_NOERROR, 0))
988 return err;
990 /* The back-references which are in initial states can epsilon transit,
991 since in this case all of the subexpressions can be null.
992 Then we add epsilon closures of the nodes which are the next nodes of
993 the back-references. */
994 if (dfa->nbackref > 0)
995 for (i = 0; i < init_nodes.nelem; ++i)
997 Idx node_idx = init_nodes.elems[i];
998 re_token_type_t type = dfa->nodes[node_idx].type;
1000 Idx clexp_idx;
1001 if (type != OP_BACK_REF)
1002 continue;
1003 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1005 re_token_t *clexp_node;
1006 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1007 if (clexp_node->type == OP_CLOSE_SUBEXP
1008 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1009 break;
1011 if (clexp_idx == init_nodes.nelem)
1012 continue;
1014 if (type == OP_BACK_REF)
1016 Idx dest_idx = dfa->edests[node_idx].elems[0];
1017 if (!re_node_set_contains (&init_nodes, dest_idx))
1019 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1020 i = 0;
1025 /* It must be the first time to invoke acquire_state. */
1026 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1027 /* We don't check ERR here, since the initial state must not be NULL. */
1028 if (BE (dfa->init_state == NULL, 0))
1029 return err;
1030 if (dfa->init_state->has_constraint)
1032 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1033 CONTEXT_WORD);
1034 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1035 CONTEXT_NEWLINE);
1036 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1037 &init_nodes,
1038 CONTEXT_NEWLINE
1039 | CONTEXT_BEGBUF);
1040 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1041 || dfa->init_state_begbuf == NULL, 0))
1042 return err;
1044 else
1045 dfa->init_state_word = dfa->init_state_nl
1046 = dfa->init_state_begbuf = dfa->init_state;
1048 re_node_set_free (&init_nodes);
1049 return REG_NOERROR;
1052 #ifdef RE_ENABLE_I18N
1053 /* If it is possible to do searching in single byte encoding instead of UTF-8
1054 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1055 DFA nodes where needed. */
1057 static void
1058 optimize_utf8 (re_dfa_t *dfa)
1060 Idx node;
1061 int i;
1062 bool mb_chars = false;
1063 bool has_period = false;
1065 for (node = 0; node < dfa->nodes_len; ++node)
1066 switch (dfa->nodes[node].type)
1068 case CHARACTER:
1069 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1070 mb_chars = true;
1071 break;
1072 case ANCHOR:
1073 switch (dfa->nodes[node].opr.ctx_type)
1075 case LINE_FIRST:
1076 case LINE_LAST:
1077 case BUF_FIRST:
1078 case BUF_LAST:
1079 break;
1080 default:
1081 /* Word anchors etc. cannot be handled. It's okay to test
1082 opr.ctx_type since constraints (for all DFA nodes) are
1083 created by ORing one or more opr.ctx_type values. */
1084 return;
1086 break;
1087 case OP_PERIOD:
1088 has_period = true;
1089 break;
1090 case OP_BACK_REF:
1091 case OP_ALT:
1092 case END_OF_RE:
1093 case OP_DUP_ASTERISK:
1094 case OP_OPEN_SUBEXP:
1095 case OP_CLOSE_SUBEXP:
1096 break;
1097 case COMPLEX_BRACKET:
1098 return;
1099 case SIMPLE_BRACKET:
1100 /* Just double check. */
1102 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1104 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1105 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1107 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1108 return;
1109 rshift = 0;
1112 break;
1113 default:
1114 abort ();
1117 if (mb_chars || has_period)
1118 for (node = 0; node < dfa->nodes_len; ++node)
1120 if (dfa->nodes[node].type == CHARACTER
1121 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1122 dfa->nodes[node].mb_partial = 0;
1123 else if (dfa->nodes[node].type == OP_PERIOD)
1124 dfa->nodes[node].type = OP_UTF8_PERIOD;
1127 /* The search can be in single byte locale. */
1128 dfa->mb_cur_max = 1;
1129 dfa->is_utf8 = 0;
1130 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1132 #endif
1134 /* Analyze the structure tree, and calculate "first", "next", "edest",
1135 "eclosure", and "inveclosure". */
1137 static reg_errcode_t
1138 analyze (regex_t *preg)
1140 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1141 reg_errcode_t ret;
1143 /* Allocate arrays. */
1144 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1145 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1146 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1147 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1148 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1149 || dfa->eclosures == NULL, 0))
1150 return REG_ESPACE;
1152 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1153 if (dfa->subexp_map != NULL)
1155 Idx i;
1156 for (i = 0; i < preg->re_nsub; i++)
1157 dfa->subexp_map[i] = i;
1158 preorder (dfa->str_tree, optimize_subexps, dfa);
1159 for (i = 0; i < preg->re_nsub; i++)
1160 if (dfa->subexp_map[i] != i)
1161 break;
1162 if (i == preg->re_nsub)
1164 free (dfa->subexp_map);
1165 dfa->subexp_map = NULL;
1169 ret = postorder (dfa->str_tree, lower_subexps, preg);
1170 if (BE (ret != REG_NOERROR, 0))
1171 return ret;
1172 ret = postorder (dfa->str_tree, calc_first, dfa);
1173 if (BE (ret != REG_NOERROR, 0))
1174 return ret;
1175 preorder (dfa->str_tree, calc_next, dfa);
1176 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1177 if (BE (ret != REG_NOERROR, 0))
1178 return ret;
1179 ret = calc_eclosure (dfa);
1180 if (BE (ret != REG_NOERROR, 0))
1181 return ret;
1183 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1184 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1185 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1186 || dfa->nbackref)
1188 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1189 if (BE (dfa->inveclosures == NULL, 0))
1190 return REG_ESPACE;
1191 ret = calc_inveclosure (dfa);
1194 return ret;
1197 /* Our parse trees are very unbalanced, so we cannot use a stack to
1198 implement parse tree visits. Instead, we use parent pointers and
1199 some hairy code in these two functions. */
1200 static reg_errcode_t
1201 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1202 void *extra)
1204 bin_tree_t *node, *prev;
1206 for (node = root; ; )
1208 /* Descend down the tree, preferably to the left (or to the right
1209 if that's the only child). */
1210 while (node->left || node->right)
1211 if (node->left)
1212 node = node->left;
1213 else
1214 node = node->right;
1218 reg_errcode_t err = fn (extra, node);
1219 if (BE (err != REG_NOERROR, 0))
1220 return err;
1221 if (node->parent == NULL)
1222 return REG_NOERROR;
1223 prev = node;
1224 node = node->parent;
1226 /* Go up while we have a node that is reached from the right. */
1227 while (node->right == prev || node->right == NULL);
1228 node = node->right;
1232 static reg_errcode_t
1233 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1234 void *extra)
1236 bin_tree_t *node;
1238 for (node = root; ; )
1240 reg_errcode_t err = fn (extra, node);
1241 if (BE (err != REG_NOERROR, 0))
1242 return err;
1244 /* Go to the left node, or up and to the right. */
1245 if (node->left)
1246 node = node->left;
1247 else
1249 bin_tree_t *prev = NULL;
1250 while (node->right == prev || node->right == NULL)
1252 prev = node;
1253 node = node->parent;
1254 if (!node)
1255 return REG_NOERROR;
1257 node = node->right;
1262 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1263 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1264 backreferences as well. Requires a preorder visit. */
1265 static reg_errcode_t
1266 optimize_subexps (void *extra, bin_tree_t *node)
1268 re_dfa_t *dfa = (re_dfa_t *) extra;
1270 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1272 int idx = node->token.opr.idx;
1273 node->token.opr.idx = dfa->subexp_map[idx];
1274 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1277 else if (node->token.type == SUBEXP
1278 && node->left && node->left->token.type == SUBEXP)
1280 Idx other_idx = node->left->token.opr.idx;
1282 node->left = node->left->left;
1283 if (node->left)
1284 node->left->parent = node;
1286 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1287 if (other_idx < BITSET_WORD_BITS)
1288 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1291 return REG_NOERROR;
1294 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1295 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1296 static reg_errcode_t
1297 lower_subexps (void *extra, bin_tree_t *node)
1299 regex_t *preg = (regex_t *) extra;
1300 reg_errcode_t err = REG_NOERROR;
1302 if (node->left && node->left->token.type == SUBEXP)
1304 node->left = lower_subexp (&err, preg, node->left);
1305 if (node->left)
1306 node->left->parent = node;
1308 if (node->right && node->right->token.type == SUBEXP)
1310 node->right = lower_subexp (&err, preg, node->right);
1311 if (node->right)
1312 node->right->parent = node;
1315 return err;
1318 static bin_tree_t *
1319 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1321 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1322 bin_tree_t *body = node->left;
1323 bin_tree_t *op, *cls, *tree1, *tree;
1325 if (preg->no_sub
1326 /* We do not optimize empty subexpressions, because otherwise we may
1327 have bad CONCAT nodes with NULL children. This is obviously not
1328 very common, so we do not lose much. An example that triggers
1329 this case is the sed "script" /\(\)/x. */
1330 && node->left != NULL
1331 && (node->token.opr.idx >= BITSET_WORD_BITS
1332 || !(dfa->used_bkref_map
1333 & ((bitset_word_t) 1 << node->token.opr.idx))))
1334 return node->left;
1336 /* Convert the SUBEXP node to the concatenation of an
1337 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1338 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1339 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1340 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1341 tree = create_tree (dfa, op, tree1, CONCAT);
1342 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1344 *err = REG_ESPACE;
1345 return NULL;
1348 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1349 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1350 return tree;
1353 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1354 nodes. Requires a postorder visit. */
1355 static reg_errcode_t
1356 calc_first (void *extra, bin_tree_t *node)
1358 re_dfa_t *dfa = (re_dfa_t *) extra;
1359 if (node->token.type == CONCAT)
1361 node->first = node->left->first;
1362 node->node_idx = node->left->node_idx;
1364 else
1366 node->first = node;
1367 node->node_idx = re_dfa_add_node (dfa, node->token);
1368 if (BE (node->node_idx == REG_MISSING, 0))
1369 return REG_ESPACE;
1370 if (node->token.type == ANCHOR)
1371 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1373 return REG_NOERROR;
1376 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1377 static reg_errcode_t
1378 calc_next (void *extra, bin_tree_t *node)
1380 switch (node->token.type)
1382 case OP_DUP_ASTERISK:
1383 node->left->next = node;
1384 break;
1385 case CONCAT:
1386 node->left->next = node->right->first;
1387 node->right->next = node->next;
1388 break;
1389 default:
1390 if (node->left)
1391 node->left->next = node->next;
1392 if (node->right)
1393 node->right->next = node->next;
1394 break;
1396 return REG_NOERROR;
1399 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1400 static reg_errcode_t
1401 link_nfa_nodes (void *extra, bin_tree_t *node)
1403 re_dfa_t *dfa = (re_dfa_t *) extra;
1404 Idx idx = node->node_idx;
1405 reg_errcode_t err = REG_NOERROR;
1407 switch (node->token.type)
1409 case CONCAT:
1410 break;
1412 case END_OF_RE:
1413 assert (node->next == NULL);
1414 break;
1416 case OP_DUP_ASTERISK:
1417 case OP_ALT:
1419 Idx left, right;
1420 dfa->has_plural_match = 1;
1421 if (node->left != NULL)
1422 left = node->left->first->node_idx;
1423 else
1424 left = node->next->node_idx;
1425 if (node->right != NULL)
1426 right = node->right->first->node_idx;
1427 else
1428 right = node->next->node_idx;
1429 assert (REG_VALID_INDEX (left));
1430 assert (REG_VALID_INDEX (right));
1431 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1433 break;
1435 case ANCHOR:
1436 case OP_OPEN_SUBEXP:
1437 case OP_CLOSE_SUBEXP:
1438 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1439 break;
1441 case OP_BACK_REF:
1442 dfa->nexts[idx] = node->next->node_idx;
1443 if (node->token.type == OP_BACK_REF)
1444 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1445 break;
1447 default:
1448 assert (!IS_EPSILON_NODE (node->token.type));
1449 dfa->nexts[idx] = node->next->node_idx;
1450 break;
1453 return err;
1456 /* Duplicate the epsilon closure of the node ROOT_NODE.
1457 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1458 to their own constraint. */
1460 static reg_errcode_t
1461 internal_function
1462 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1463 Idx root_node, unsigned int init_constraint)
1465 Idx org_node, clone_node;
1466 bool ok;
1467 unsigned int constraint = init_constraint;
1468 for (org_node = top_org_node, clone_node = top_clone_node;;)
1470 Idx org_dest, clone_dest;
1471 if (dfa->nodes[org_node].type == OP_BACK_REF)
1473 /* If the back reference epsilon-transit, its destination must
1474 also have the constraint. Then duplicate the epsilon closure
1475 of the destination of the back reference, and store it in
1476 edests of the back reference. */
1477 org_dest = dfa->nexts[org_node];
1478 re_node_set_empty (dfa->edests + clone_node);
1479 clone_dest = duplicate_node (dfa, org_dest, constraint);
1480 if (BE (clone_dest == REG_MISSING, 0))
1481 return REG_ESPACE;
1482 dfa->nexts[clone_node] = dfa->nexts[org_node];
1483 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1484 if (BE (! ok, 0))
1485 return REG_ESPACE;
1487 else if (dfa->edests[org_node].nelem == 0)
1489 /* In case of the node can't epsilon-transit, don't duplicate the
1490 destination and store the original destination as the
1491 destination of the node. */
1492 dfa->nexts[clone_node] = dfa->nexts[org_node];
1493 break;
1495 else if (dfa->edests[org_node].nelem == 1)
1497 /* In case of the node can epsilon-transit, and it has only one
1498 destination. */
1499 org_dest = dfa->edests[org_node].elems[0];
1500 re_node_set_empty (dfa->edests + clone_node);
1501 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1502 /* If the node is root_node itself, it means the epsilon closure
1503 has a loop. Then tie it to the destination of the root_node. */
1504 if (org_node == root_node && clone_node != org_node)
1506 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1507 if (BE (! ok, 0))
1508 return REG_ESPACE;
1509 break;
1511 /* In case the node has another constraint, append it. */
1512 constraint |= dfa->nodes[org_node].constraint;
1513 clone_dest = duplicate_node (dfa, org_dest, constraint);
1514 if (BE (clone_dest == REG_MISSING, 0))
1515 return REG_ESPACE;
1516 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1517 if (BE (! ok, 0))
1518 return REG_ESPACE;
1520 else /* dfa->edests[org_node].nelem == 2 */
1522 /* In case of the node can epsilon-transit, and it has two
1523 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1524 org_dest = dfa->edests[org_node].elems[0];
1525 re_node_set_empty (dfa->edests + clone_node);
1526 /* Search for a duplicated node which satisfies the constraint. */
1527 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1528 if (clone_dest == REG_MISSING)
1530 /* There is no such duplicated node, create a new one. */
1531 reg_errcode_t err;
1532 clone_dest = duplicate_node (dfa, org_dest, constraint);
1533 if (BE (clone_dest == REG_MISSING, 0))
1534 return REG_ESPACE;
1535 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1536 if (BE (! ok, 0))
1537 return REG_ESPACE;
1538 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1539 root_node, constraint);
1540 if (BE (err != REG_NOERROR, 0))
1541 return err;
1543 else
1545 /* There is a duplicated node which satisfy the constraint,
1546 use it to avoid infinite loop. */
1547 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1548 if (BE (! ok, 0))
1549 return REG_ESPACE;
1552 org_dest = dfa->edests[org_node].elems[1];
1553 clone_dest = duplicate_node (dfa, org_dest, constraint);
1554 if (BE (clone_dest == REG_MISSING, 0))
1555 return REG_ESPACE;
1556 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1557 if (BE (! ok, 0))
1558 return REG_ESPACE;
1560 org_node = org_dest;
1561 clone_node = clone_dest;
1563 return REG_NOERROR;
1566 /* Search for a node which is duplicated from the node ORG_NODE, and
1567 satisfies the constraint CONSTRAINT. */
1569 static Idx
1570 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1571 unsigned int constraint)
1573 Idx idx;
1574 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1576 if (org_node == dfa->org_indices[idx]
1577 && constraint == dfa->nodes[idx].constraint)
1578 return idx; /* Found. */
1580 return REG_MISSING; /* Not found. */
1583 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1584 Return the index of the new node, or REG_MISSING if insufficient storage is
1585 available. */
1587 static Idx
1588 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1590 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1591 if (BE (dup_idx != REG_MISSING, 1))
1593 dfa->nodes[dup_idx].constraint = constraint;
1594 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1595 dfa->nodes[dup_idx].duplicated = 1;
1597 /* Store the index of the original node. */
1598 dfa->org_indices[dup_idx] = org_idx;
1600 return dup_idx;
1603 static reg_errcode_t
1604 calc_inveclosure (re_dfa_t *dfa)
1606 Idx src, idx;
1607 bool ok;
1608 for (idx = 0; idx < dfa->nodes_len; ++idx)
1609 re_node_set_init_empty (dfa->inveclosures + idx);
1611 for (src = 0; src < dfa->nodes_len; ++src)
1613 Idx *elems = dfa->eclosures[src].elems;
1614 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1616 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1617 if (BE (! ok, 0))
1618 return REG_ESPACE;
1622 return REG_NOERROR;
1625 /* Calculate "eclosure" for all the node in DFA. */
1627 static reg_errcode_t
1628 calc_eclosure (re_dfa_t *dfa)
1630 Idx node_idx;
1631 bool incomplete;
1632 #ifdef DEBUG
1633 assert (dfa->nodes_len > 0);
1634 #endif
1635 incomplete = false;
1636 /* For each nodes, calculate epsilon closure. */
1637 for (node_idx = 0; ; ++node_idx)
1639 reg_errcode_t err;
1640 re_node_set eclosure_elem;
1641 if (node_idx == dfa->nodes_len)
1643 if (!incomplete)
1644 break;
1645 incomplete = false;
1646 node_idx = 0;
1649 #ifdef DEBUG
1650 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1651 #endif
1653 /* If we have already calculated, skip it. */
1654 if (dfa->eclosures[node_idx].nelem != 0)
1655 continue;
1656 /* Calculate epsilon closure of `node_idx'. */
1657 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1658 if (BE (err != REG_NOERROR, 0))
1659 return err;
1661 if (dfa->eclosures[node_idx].nelem == 0)
1663 incomplete = true;
1664 re_node_set_free (&eclosure_elem);
1667 return REG_NOERROR;
1670 /* Calculate epsilon closure of NODE. */
1672 static reg_errcode_t
1673 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1675 reg_errcode_t err;
1676 Idx i;
1677 bool incomplete;
1678 bool ok;
1679 re_node_set eclosure;
1680 incomplete = false;
1681 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1682 if (BE (err != REG_NOERROR, 0))
1683 return err;
1685 /* This indicates that we are calculating this node now.
1686 We reference this value to avoid infinite loop. */
1687 dfa->eclosures[node].nelem = REG_MISSING;
1689 /* If the current node has constraints, duplicate all nodes
1690 since they must inherit the constraints. */
1691 if (dfa->nodes[node].constraint
1692 && dfa->edests[node].nelem
1693 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1695 err = duplicate_node_closure (dfa, node, node, node,
1696 dfa->nodes[node].constraint);
1697 if (BE (err != REG_NOERROR, 0))
1698 return err;
1701 /* Expand each epsilon destination nodes. */
1702 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1703 for (i = 0; i < dfa->edests[node].nelem; ++i)
1705 re_node_set eclosure_elem;
1706 Idx edest = dfa->edests[node].elems[i];
1707 /* If calculating the epsilon closure of `edest' is in progress,
1708 return intermediate result. */
1709 if (dfa->eclosures[edest].nelem == REG_MISSING)
1711 incomplete = true;
1712 continue;
1714 /* If we haven't calculated the epsilon closure of `edest' yet,
1715 calculate now. Otherwise use calculated epsilon closure. */
1716 if (dfa->eclosures[edest].nelem == 0)
1718 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1719 if (BE (err != REG_NOERROR, 0))
1720 return err;
1722 else
1723 eclosure_elem = dfa->eclosures[edest];
1724 /* Merge the epsilon closure of `edest'. */
1725 re_node_set_merge (&eclosure, &eclosure_elem);
1726 /* If the epsilon closure of `edest' is incomplete,
1727 the epsilon closure of this node is also incomplete. */
1728 if (dfa->eclosures[edest].nelem == 0)
1730 incomplete = true;
1731 re_node_set_free (&eclosure_elem);
1735 /* Epsilon closures include itself. */
1736 ok = re_node_set_insert (&eclosure, node);
1737 if (BE (! ok, 0))
1738 return REG_ESPACE;
1739 if (incomplete && !root)
1740 dfa->eclosures[node].nelem = 0;
1741 else
1742 dfa->eclosures[node] = eclosure;
1743 *new_set = eclosure;
1744 return REG_NOERROR;
1747 /* Functions for token which are used in the parser. */
1749 /* Fetch a token from INPUT.
1750 We must not use this function inside bracket expressions. */
1752 static void
1753 internal_function
1754 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1756 re_string_skip_bytes (input, peek_token (result, input, syntax));
1759 /* Peek a token from INPUT, and return the length of the token.
1760 We must not use this function inside bracket expressions. */
1762 static int
1763 internal_function
1764 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1766 unsigned char c;
1768 if (re_string_eoi (input))
1770 token->type = END_OF_RE;
1771 return 0;
1774 c = re_string_peek_byte (input, 0);
1775 token->opr.c = c;
1777 token->word_char = 0;
1778 #ifdef RE_ENABLE_I18N
1779 token->mb_partial = 0;
1780 if (input->mb_cur_max > 1 &&
1781 !re_string_first_byte (input, re_string_cur_idx (input)))
1783 token->type = CHARACTER;
1784 token->mb_partial = 1;
1785 return 1;
1787 #endif
1788 if (c == '\\')
1790 unsigned char c2;
1791 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1793 token->type = BACK_SLASH;
1794 return 1;
1797 c2 = re_string_peek_byte_case (input, 1);
1798 token->opr.c = c2;
1799 token->type = CHARACTER;
1800 #ifdef RE_ENABLE_I18N
1801 if (input->mb_cur_max > 1)
1803 wint_t wc = re_string_wchar_at (input,
1804 re_string_cur_idx (input) + 1);
1805 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1807 else
1808 #endif
1809 token->word_char = IS_WORD_CHAR (c2) != 0;
1811 switch (c2)
1813 case '|':
1814 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1815 token->type = OP_ALT;
1816 break;
1817 case '1': case '2': case '3': case '4': case '5':
1818 case '6': case '7': case '8': case '9':
1819 if (!(syntax & RE_NO_BK_REFS))
1821 token->type = OP_BACK_REF;
1822 token->opr.idx = c2 - '1';
1824 break;
1825 case '<':
1826 if (!(syntax & RE_NO_GNU_OPS))
1828 token->type = ANCHOR;
1829 token->opr.ctx_type = WORD_FIRST;
1831 break;
1832 case '>':
1833 if (!(syntax & RE_NO_GNU_OPS))
1835 token->type = ANCHOR;
1836 token->opr.ctx_type = WORD_LAST;
1838 break;
1839 case 'b':
1840 if (!(syntax & RE_NO_GNU_OPS))
1842 token->type = ANCHOR;
1843 token->opr.ctx_type = WORD_DELIM;
1845 break;
1846 case 'B':
1847 if (!(syntax & RE_NO_GNU_OPS))
1849 token->type = ANCHOR;
1850 token->opr.ctx_type = NOT_WORD_DELIM;
1852 break;
1853 case 'w':
1854 if (!(syntax & RE_NO_GNU_OPS))
1855 token->type = OP_WORD;
1856 break;
1857 case 'W':
1858 if (!(syntax & RE_NO_GNU_OPS))
1859 token->type = OP_NOTWORD;
1860 break;
1861 case 's':
1862 if (!(syntax & RE_NO_GNU_OPS))
1863 token->type = OP_SPACE;
1864 break;
1865 case 'S':
1866 if (!(syntax & RE_NO_GNU_OPS))
1867 token->type = OP_NOTSPACE;
1868 break;
1869 case '`':
1870 if (!(syntax & RE_NO_GNU_OPS))
1872 token->type = ANCHOR;
1873 token->opr.ctx_type = BUF_FIRST;
1875 break;
1876 case '\'':
1877 if (!(syntax & RE_NO_GNU_OPS))
1879 token->type = ANCHOR;
1880 token->opr.ctx_type = BUF_LAST;
1882 break;
1883 case '(':
1884 if (!(syntax & RE_NO_BK_PARENS))
1885 token->type = OP_OPEN_SUBEXP;
1886 break;
1887 case ')':
1888 if (!(syntax & RE_NO_BK_PARENS))
1889 token->type = OP_CLOSE_SUBEXP;
1890 break;
1891 case '+':
1892 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1893 token->type = OP_DUP_PLUS;
1894 break;
1895 case '?':
1896 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1897 token->type = OP_DUP_QUESTION;
1898 break;
1899 case '{':
1900 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1901 token->type = OP_OPEN_DUP_NUM;
1902 break;
1903 case '}':
1904 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1905 token->type = OP_CLOSE_DUP_NUM;
1906 break;
1907 default:
1908 break;
1910 return 2;
1913 token->type = CHARACTER;
1914 #ifdef RE_ENABLE_I18N
1915 if (input->mb_cur_max > 1)
1917 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1918 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1920 else
1921 #endif
1922 token->word_char = IS_WORD_CHAR (token->opr.c);
1924 switch (c)
1926 case '\n':
1927 if (syntax & RE_NEWLINE_ALT)
1928 token->type = OP_ALT;
1929 break;
1930 case '|':
1931 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1932 token->type = OP_ALT;
1933 break;
1934 case '*':
1935 token->type = OP_DUP_ASTERISK;
1936 break;
1937 case '+':
1938 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1939 token->type = OP_DUP_PLUS;
1940 break;
1941 case '?':
1942 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1943 token->type = OP_DUP_QUESTION;
1944 break;
1945 case '{':
1946 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1947 token->type = OP_OPEN_DUP_NUM;
1948 break;
1949 case '}':
1950 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1951 token->type = OP_CLOSE_DUP_NUM;
1952 break;
1953 case '(':
1954 if (syntax & RE_NO_BK_PARENS)
1955 token->type = OP_OPEN_SUBEXP;
1956 break;
1957 case ')':
1958 if (syntax & RE_NO_BK_PARENS)
1959 token->type = OP_CLOSE_SUBEXP;
1960 break;
1961 case '[':
1962 token->type = OP_OPEN_BRACKET;
1963 break;
1964 case '.':
1965 token->type = OP_PERIOD;
1966 break;
1967 case '^':
1968 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1969 re_string_cur_idx (input) != 0)
1971 char prev = re_string_peek_byte (input, -1);
1972 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1973 break;
1975 token->type = ANCHOR;
1976 token->opr.ctx_type = LINE_FIRST;
1977 break;
1978 case '$':
1979 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1980 re_string_cur_idx (input) + 1 != re_string_length (input))
1982 re_token_t next;
1983 re_string_skip_bytes (input, 1);
1984 peek_token (&next, input, syntax);
1985 re_string_skip_bytes (input, -1);
1986 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1987 break;
1989 token->type = ANCHOR;
1990 token->opr.ctx_type = LINE_LAST;
1991 break;
1992 default:
1993 break;
1995 return 1;
1998 /* Peek a token from INPUT, and return the length of the token.
1999 We must not use this function out of bracket expressions. */
2001 static int
2002 internal_function
2003 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2005 unsigned char c;
2006 if (re_string_eoi (input))
2008 token->type = END_OF_RE;
2009 return 0;
2011 c = re_string_peek_byte (input, 0);
2012 token->opr.c = c;
2014 #ifdef RE_ENABLE_I18N
2015 if (input->mb_cur_max > 1 &&
2016 !re_string_first_byte (input, re_string_cur_idx (input)))
2018 token->type = CHARACTER;
2019 return 1;
2021 #endif /* RE_ENABLE_I18N */
2023 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2024 && re_string_cur_idx (input) + 1 < re_string_length (input))
2026 /* In this case, '\' escape a character. */
2027 unsigned char c2;
2028 re_string_skip_bytes (input, 1);
2029 c2 = re_string_peek_byte (input, 0);
2030 token->opr.c = c2;
2031 token->type = CHARACTER;
2032 return 1;
2034 if (c == '[') /* '[' is a special char in a bracket exps. */
2036 unsigned char c2;
2037 int token_len;
2038 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2039 c2 = re_string_peek_byte (input, 1);
2040 else
2041 c2 = 0;
2042 token->opr.c = c2;
2043 token_len = 2;
2044 switch (c2)
2046 case '.':
2047 token->type = OP_OPEN_COLL_ELEM;
2048 break;
2049 case '=':
2050 token->type = OP_OPEN_EQUIV_CLASS;
2051 break;
2052 case ':':
2053 if (syntax & RE_CHAR_CLASSES)
2055 token->type = OP_OPEN_CHAR_CLASS;
2056 break;
2058 /* else fall through. */
2059 default:
2060 token->type = CHARACTER;
2061 token->opr.c = c;
2062 token_len = 1;
2063 break;
2065 return token_len;
2067 switch (c)
2069 case '-':
2070 token->type = OP_CHARSET_RANGE;
2071 break;
2072 case ']':
2073 token->type = OP_CLOSE_BRACKET;
2074 break;
2075 case '^':
2076 token->type = OP_NON_MATCH_LIST;
2077 break;
2078 default:
2079 token->type = CHARACTER;
2081 return 1;
2084 /* Functions for parser. */
2086 /* Entry point of the parser.
2087 Parse the regular expression REGEXP and return the structure tree.
2088 If an error is occured, ERR is set by error code, and return NULL.
2089 This function build the following tree, from regular expression <reg_exp>:
2093 <reg_exp> EOR
2095 CAT means concatenation.
2096 EOR means end of regular expression. */
2098 static bin_tree_t *
2099 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2100 reg_errcode_t *err)
2102 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2103 bin_tree_t *tree, *eor, *root;
2104 re_token_t current_token;
2105 dfa->syntax = syntax;
2106 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2107 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2108 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2109 return NULL;
2110 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2111 if (tree != NULL)
2112 root = create_tree (dfa, tree, eor, CONCAT);
2113 else
2114 root = eor;
2115 if (BE (eor == NULL || root == NULL, 0))
2117 *err = REG_ESPACE;
2118 return NULL;
2120 return root;
2123 /* This function build the following tree, from regular expression
2124 <branch1>|<branch2>:
2128 <branch1> <branch2>
2130 ALT means alternative, which represents the operator `|'. */
2132 static bin_tree_t *
2133 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2134 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2136 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2137 bin_tree_t *tree, *branch = NULL;
2138 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2139 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2140 return NULL;
2142 while (token->type == OP_ALT)
2144 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2145 if (token->type != OP_ALT && token->type != END_OF_RE
2146 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2148 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2149 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2150 return NULL;
2152 else
2153 branch = NULL;
2154 tree = create_tree (dfa, tree, branch, OP_ALT);
2155 if (BE (tree == NULL, 0))
2157 *err = REG_ESPACE;
2158 return NULL;
2161 return tree;
2164 /* This function build the following tree, from regular expression
2165 <exp1><exp2>:
2169 <exp1> <exp2>
2171 CAT means concatenation. */
2173 static bin_tree_t *
2174 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2175 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2177 bin_tree_t *tree, *expr;
2178 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2179 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2180 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2181 return NULL;
2183 while (token->type != OP_ALT && token->type != END_OF_RE
2184 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2186 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2187 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2189 return NULL;
2191 if (tree != NULL && expr != NULL)
2193 tree = create_tree (dfa, tree, expr, CONCAT);
2194 if (tree == NULL)
2196 *err = REG_ESPACE;
2197 return NULL;
2200 else if (tree == NULL)
2201 tree = expr;
2202 /* Otherwise expr == NULL, we don't need to create new tree. */
2204 return tree;
2207 /* This function build the following tree, from regular expression a*:
2213 static bin_tree_t *
2214 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2215 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2217 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2218 bin_tree_t *tree;
2219 switch (token->type)
2221 case CHARACTER:
2222 tree = create_token_tree (dfa, NULL, NULL, token);
2223 if (BE (tree == NULL, 0))
2225 *err = REG_ESPACE;
2226 return NULL;
2228 #ifdef RE_ENABLE_I18N
2229 if (dfa->mb_cur_max > 1)
2231 while (!re_string_eoi (regexp)
2232 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2234 bin_tree_t *mbc_remain;
2235 fetch_token (token, regexp, syntax);
2236 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2237 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2238 if (BE (mbc_remain == NULL || tree == NULL, 0))
2240 *err = REG_ESPACE;
2241 return NULL;
2245 #endif
2246 break;
2247 case OP_OPEN_SUBEXP:
2248 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2249 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2250 return NULL;
2251 break;
2252 case OP_OPEN_BRACKET:
2253 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2254 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2255 return NULL;
2256 break;
2257 case OP_BACK_REF:
2258 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2260 *err = REG_ESUBREG;
2261 return NULL;
2263 dfa->used_bkref_map |= 1 << token->opr.idx;
2264 tree = create_token_tree (dfa, NULL, NULL, token);
2265 if (BE (tree == NULL, 0))
2267 *err = REG_ESPACE;
2268 return NULL;
2270 ++dfa->nbackref;
2271 dfa->has_mb_node = 1;
2272 break;
2273 case OP_OPEN_DUP_NUM:
2274 if (syntax & RE_CONTEXT_INVALID_DUP)
2276 *err = REG_BADRPT;
2277 return NULL;
2279 /* FALLTHROUGH */
2280 case OP_DUP_ASTERISK:
2281 case OP_DUP_PLUS:
2282 case OP_DUP_QUESTION:
2283 if (syntax & RE_CONTEXT_INVALID_OPS)
2285 *err = REG_BADRPT;
2286 return NULL;
2288 else if (syntax & RE_CONTEXT_INDEP_OPS)
2290 fetch_token (token, regexp, syntax);
2291 return parse_expression (regexp, preg, token, syntax, nest, err);
2293 /* else fall through */
2294 case OP_CLOSE_SUBEXP:
2295 if ((token->type == OP_CLOSE_SUBEXP) &&
2296 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2298 *err = REG_ERPAREN;
2299 return NULL;
2301 /* else fall through */
2302 case OP_CLOSE_DUP_NUM:
2303 /* We treat it as a normal character. */
2305 /* Then we can these characters as normal characters. */
2306 token->type = CHARACTER;
2307 /* mb_partial and word_char bits should be initialized already
2308 by peek_token. */
2309 tree = create_token_tree (dfa, NULL, NULL, token);
2310 if (BE (tree == NULL, 0))
2312 *err = REG_ESPACE;
2313 return NULL;
2315 break;
2316 case ANCHOR:
2317 if ((token->opr.ctx_type
2318 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2319 && dfa->word_ops_used == 0)
2320 init_word_char (dfa);
2321 if (token->opr.ctx_type == WORD_DELIM
2322 || token->opr.ctx_type == NOT_WORD_DELIM)
2324 bin_tree_t *tree_first, *tree_last;
2325 if (token->opr.ctx_type == WORD_DELIM)
2327 token->opr.ctx_type = WORD_FIRST;
2328 tree_first = create_token_tree (dfa, NULL, NULL, token);
2329 token->opr.ctx_type = WORD_LAST;
2331 else
2333 token->opr.ctx_type = INSIDE_WORD;
2334 tree_first = create_token_tree (dfa, NULL, NULL, token);
2335 token->opr.ctx_type = INSIDE_NOTWORD;
2337 tree_last = create_token_tree (dfa, NULL, NULL, token);
2338 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2339 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2341 *err = REG_ESPACE;
2342 return NULL;
2345 else
2347 tree = create_token_tree (dfa, NULL, NULL, token);
2348 if (BE (tree == NULL, 0))
2350 *err = REG_ESPACE;
2351 return NULL;
2354 /* We must return here, since ANCHORs can't be followed
2355 by repetition operators.
2356 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2357 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2358 fetch_token (token, regexp, syntax);
2359 return tree;
2360 case OP_PERIOD:
2361 tree = create_token_tree (dfa, NULL, NULL, token);
2362 if (BE (tree == NULL, 0))
2364 *err = REG_ESPACE;
2365 return NULL;
2367 if (dfa->mb_cur_max > 1)
2368 dfa->has_mb_node = 1;
2369 break;
2370 case OP_WORD:
2371 case OP_NOTWORD:
2372 tree = build_charclass_op (dfa, regexp->trans,
2373 (const unsigned char *) "alnum",
2374 (const unsigned char *) "_",
2375 token->type == OP_NOTWORD, err);
2376 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2377 return NULL;
2378 break;
2379 case OP_SPACE:
2380 case OP_NOTSPACE:
2381 tree = build_charclass_op (dfa, regexp->trans,
2382 (const unsigned char *) "space",
2383 (const unsigned char *) "",
2384 token->type == OP_NOTSPACE, err);
2385 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2386 return NULL;
2387 break;
2388 case OP_ALT:
2389 case END_OF_RE:
2390 return NULL;
2391 case BACK_SLASH:
2392 *err = REG_EESCAPE;
2393 return NULL;
2394 default:
2395 /* Must not happen? */
2396 #ifdef DEBUG
2397 assert (0);
2398 #endif
2399 return NULL;
2401 fetch_token (token, regexp, syntax);
2403 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2404 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2406 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2407 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2408 return NULL;
2409 /* In BRE consecutive duplications are not allowed. */
2410 if ((syntax & RE_CONTEXT_INVALID_DUP)
2411 && (token->type == OP_DUP_ASTERISK
2412 || token->type == OP_OPEN_DUP_NUM))
2414 *err = REG_BADRPT;
2415 return NULL;
2419 return tree;
2422 /* This function build the following tree, from regular expression
2423 (<reg_exp>):
2424 SUBEXP
2426 <reg_exp>
2429 static bin_tree_t *
2430 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2431 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2433 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2434 bin_tree_t *tree;
2435 size_t cur_nsub;
2436 cur_nsub = preg->re_nsub++;
2438 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2440 /* The subexpression may be a null string. */
2441 if (token->type == OP_CLOSE_SUBEXP)
2442 tree = NULL;
2443 else
2445 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2446 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2447 *err = REG_EPAREN;
2448 if (BE (*err != REG_NOERROR, 0))
2449 return NULL;
2452 if (cur_nsub <= '9' - '1')
2453 dfa->completed_bkref_map |= 1 << cur_nsub;
2455 tree = create_tree (dfa, tree, NULL, SUBEXP);
2456 if (BE (tree == NULL, 0))
2458 *err = REG_ESPACE;
2459 return NULL;
2461 tree->token.opr.idx = cur_nsub;
2462 return tree;
2465 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2467 static bin_tree_t *
2468 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2469 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2471 bin_tree_t *tree = NULL, *old_tree = NULL;
2472 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2473 re_token_t start_token = *token;
2475 if (token->type == OP_OPEN_DUP_NUM)
2477 end = 0;
2478 start = fetch_number (regexp, token, syntax);
2479 if (start == REG_MISSING)
2481 if (token->type == CHARACTER && token->opr.c == ',')
2482 start = 0; /* We treat "{,m}" as "{0,m}". */
2483 else
2485 *err = REG_BADBR; /* <re>{} is invalid. */
2486 return NULL;
2489 if (BE (start != REG_ERROR, 1))
2491 /* We treat "{n}" as "{n,n}". */
2492 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2493 : ((token->type == CHARACTER && token->opr.c == ',')
2494 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2496 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2498 /* Invalid sequence. */
2499 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2501 if (token->type == END_OF_RE)
2502 *err = REG_EBRACE;
2503 else
2504 *err = REG_BADBR;
2506 return NULL;
2509 /* If the syntax bit is set, rollback. */
2510 re_string_set_index (regexp, start_idx);
2511 *token = start_token;
2512 token->type = CHARACTER;
2513 /* mb_partial and word_char bits should be already initialized by
2514 peek_token. */
2515 return elem;
2518 if (BE (end != REG_MISSING && start > end, 0))
2520 /* First number greater than second. */
2521 *err = REG_BADBR;
2522 return NULL;
2525 else
2527 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2528 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2531 fetch_token (token, regexp, syntax);
2533 if (BE (elem == NULL, 0))
2534 return NULL;
2535 if (BE (start == 0 && end == 0, 0))
2537 postorder (elem, free_tree, NULL);
2538 return NULL;
2541 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2542 if (BE (start > 0, 0))
2544 tree = elem;
2545 for (i = 2; i <= start; ++i)
2547 elem = duplicate_tree (elem, dfa);
2548 tree = create_tree (dfa, tree, elem, CONCAT);
2549 if (BE (elem == NULL || tree == NULL, 0))
2550 goto parse_dup_op_espace;
2553 if (start == end)
2554 return tree;
2556 /* Duplicate ELEM before it is marked optional. */
2557 elem = duplicate_tree (elem, dfa);
2558 old_tree = tree;
2560 else
2561 old_tree = NULL;
2563 if (elem->token.type == SUBEXP)
2564 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2566 tree = create_tree (dfa, elem, NULL,
2567 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2568 if (BE (tree == NULL, 0))
2569 goto parse_dup_op_espace;
2571 /* This loop is actually executed only when end != REG_MISSING,
2572 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2573 already created the start+1-th copy. */
2574 if ((Idx) -1 < 0 || end != REG_MISSING)
2575 for (i = start + 2; i <= end; ++i)
2577 elem = duplicate_tree (elem, dfa);
2578 tree = create_tree (dfa, tree, elem, CONCAT);
2579 if (BE (elem == NULL || tree == NULL, 0))
2580 goto parse_dup_op_espace;
2582 tree = create_tree (dfa, tree, NULL, OP_ALT);
2583 if (BE (tree == NULL, 0))
2584 goto parse_dup_op_espace;
2587 if (old_tree)
2588 tree = create_tree (dfa, old_tree, tree, CONCAT);
2590 return tree;
2592 parse_dup_op_espace:
2593 *err = REG_ESPACE;
2594 return NULL;
2597 /* Size of the names for collating symbol/equivalence_class/character_class.
2598 I'm not sure, but maybe enough. */
2599 #define BRACKET_NAME_BUF_SIZE 32
2601 #ifndef _LIBC
2602 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2603 Build the range expression which starts from START_ELEM, and ends
2604 at END_ELEM. The result are written to MBCSET and SBCSET.
2605 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2606 mbcset->range_ends, is a pointer argument sinse we may
2607 update it. */
2609 static reg_errcode_t
2610 internal_function
2611 # ifdef RE_ENABLE_I18N
2612 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2613 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2614 # else /* not RE_ENABLE_I18N */
2615 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2616 bracket_elem_t *end_elem)
2617 # endif /* not RE_ENABLE_I18N */
2619 unsigned int start_ch, end_ch;
2620 /* Equivalence Classes and Character Classes can't be a range start/end. */
2621 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2622 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2624 return REG_ERANGE;
2626 /* We can handle no multi character collating elements without libc
2627 support. */
2628 if (BE ((start_elem->type == COLL_SYM
2629 && strlen ((char *) start_elem->opr.name) > 1)
2630 || (end_elem->type == COLL_SYM
2631 && strlen ((char *) end_elem->opr.name) > 1), 0))
2632 return REG_ECOLLATE;
2634 # ifdef RE_ENABLE_I18N
2636 wchar_t wc;
2637 wint_t start_wc;
2638 wint_t end_wc;
2639 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2641 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2642 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2643 : 0));
2644 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2645 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2646 : 0));
2647 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2648 ? __btowc (start_ch) : start_elem->opr.wch);
2649 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2650 ? __btowc (end_ch) : end_elem->opr.wch);
2651 if (start_wc == WEOF || end_wc == WEOF)
2652 return REG_ECOLLATE;
2653 cmp_buf[0] = start_wc;
2654 cmp_buf[4] = end_wc;
2655 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2656 return REG_ERANGE;
2658 /* Got valid collation sequence values, add them as a new entry.
2659 However, for !_LIBC we have no collation elements: if the
2660 character set is single byte, the single byte character set
2661 that we build below suffices. parse_bracket_exp passes
2662 no MBCSET if dfa->mb_cur_max == 1. */
2663 if (mbcset)
2665 /* Check the space of the arrays. */
2666 if (BE (*range_alloc == mbcset->nranges, 0))
2668 /* There is not enough space, need realloc. */
2669 wchar_t *new_array_start, *new_array_end;
2670 Idx new_nranges;
2672 /* +1 in case of mbcset->nranges is 0. */
2673 new_nranges = 2 * mbcset->nranges + 1;
2674 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2675 are NULL if *range_alloc == 0. */
2676 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2677 new_nranges);
2678 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2679 new_nranges);
2681 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2682 return REG_ESPACE;
2684 mbcset->range_starts = new_array_start;
2685 mbcset->range_ends = new_array_end;
2686 *range_alloc = new_nranges;
2689 mbcset->range_starts[mbcset->nranges] = start_wc;
2690 mbcset->range_ends[mbcset->nranges++] = end_wc;
2693 /* Build the table for single byte characters. */
2694 for (wc = 0; wc < SBC_MAX; ++wc)
2696 cmp_buf[2] = wc;
2697 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2698 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2699 bitset_set (sbcset, wc);
2702 # else /* not RE_ENABLE_I18N */
2704 unsigned int ch;
2705 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2706 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2707 : 0));
2708 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2709 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2710 : 0));
2711 if (start_ch > end_ch)
2712 return REG_ERANGE;
2713 /* Build the table for single byte characters. */
2714 for (ch = 0; ch < SBC_MAX; ++ch)
2715 if (start_ch <= ch && ch <= end_ch)
2716 bitset_set (sbcset, ch);
2718 # endif /* not RE_ENABLE_I18N */
2719 return REG_NOERROR;
2721 #endif /* not _LIBC */
2723 #ifndef _LIBC
2724 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2725 Build the collating element which is represented by NAME.
2726 The result are written to MBCSET and SBCSET.
2727 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2728 pointer argument since we may update it. */
2730 static reg_errcode_t
2731 internal_function
2732 build_collating_symbol (bitset_t sbcset,
2733 # ifdef RE_ENABLE_I18N
2734 re_charset_t *mbcset, Idx *coll_sym_alloc,
2735 # endif
2736 const unsigned char *name)
2738 size_t name_len = strlen ((const char *) name);
2739 if (BE (name_len != 1, 0))
2740 return REG_ECOLLATE;
2741 else
2743 bitset_set (sbcset, name[0]);
2744 return REG_NOERROR;
2747 #endif /* not _LIBC */
2749 /* This function parse bracket expression like "[abc]", "[a-c]",
2750 "[[.a-a.]]" etc. */
2752 static bin_tree_t *
2753 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2754 reg_syntax_t syntax, reg_errcode_t *err)
2756 #ifdef _LIBC
2757 const unsigned char *collseqmb;
2758 const char *collseqwc;
2759 uint32_t nrules;
2760 int32_t table_size;
2761 const int32_t *symb_table;
2762 const unsigned char *extra;
2764 /* Local function for parse_bracket_exp used in _LIBC environement.
2765 Seek the collating symbol entry correspondings to NAME.
2766 Return the index of the symbol in the SYMB_TABLE. */
2768 auto inline int32_t
2769 __attribute ((always_inline))
2770 seek_collating_symbol_entry (name, name_len)
2771 const unsigned char *name;
2772 size_t name_len;
2774 int32_t hash = elem_hash ((const char *) name, name_len);
2775 int32_t elem = hash % table_size;
2776 if (symb_table[2 * elem] != 0)
2778 int32_t second = hash % (table_size - 2) + 1;
2782 /* First compare the hashing value. */
2783 if (symb_table[2 * elem] == hash
2784 /* Compare the length of the name. */
2785 && name_len == extra[symb_table[2 * elem + 1]]
2786 /* Compare the name. */
2787 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2788 name_len) == 0)
2790 /* Yep, this is the entry. */
2791 break;
2794 /* Next entry. */
2795 elem += second;
2797 while (symb_table[2 * elem] != 0);
2799 return elem;
2802 /* Local function for parse_bracket_exp used in _LIBC environement.
2803 Look up the collation sequence value of BR_ELEM.
2804 Return the value if succeeded, UINT_MAX otherwise. */
2806 auto inline unsigned int
2807 __attribute ((always_inline))
2808 lookup_collation_sequence_value (br_elem)
2809 bracket_elem_t *br_elem;
2811 if (br_elem->type == SB_CHAR)
2814 if (MB_CUR_MAX == 1)
2816 if (nrules == 0)
2817 return collseqmb[br_elem->opr.ch];
2818 else
2820 wint_t wc = __btowc (br_elem->opr.ch);
2821 return __collseq_table_lookup (collseqwc, wc);
2824 else if (br_elem->type == MB_CHAR)
2826 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2828 else if (br_elem->type == COLL_SYM)
2830 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2831 if (nrules != 0)
2833 int32_t elem, idx;
2834 elem = seek_collating_symbol_entry (br_elem->opr.name,
2835 sym_name_len);
2836 if (symb_table[2 * elem] != 0)
2838 /* We found the entry. */
2839 idx = symb_table[2 * elem + 1];
2840 /* Skip the name of collating element name. */
2841 idx += 1 + extra[idx];
2842 /* Skip the byte sequence of the collating element. */
2843 idx += 1 + extra[idx];
2844 /* Adjust for the alignment. */
2845 idx = (idx + 3) & ~3;
2846 /* Skip the multibyte collation sequence value. */
2847 idx += sizeof (unsigned int);
2848 /* Skip the wide char sequence of the collating element. */
2849 idx += sizeof (unsigned int) *
2850 (1 + *(unsigned int *) (extra + idx));
2851 /* Return the collation sequence value. */
2852 return *(unsigned int *) (extra + idx);
2854 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2856 /* No valid character. Match it as a single byte
2857 character. */
2858 return collseqmb[br_elem->opr.name[0]];
2861 else if (sym_name_len == 1)
2862 return collseqmb[br_elem->opr.name[0]];
2864 return UINT_MAX;
2867 /* Local function for parse_bracket_exp used in _LIBC environement.
2868 Build the range expression which starts from START_ELEM, and ends
2869 at END_ELEM. The result are written to MBCSET and SBCSET.
2870 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2871 mbcset->range_ends, is a pointer argument sinse we may
2872 update it. */
2874 auto inline reg_errcode_t
2875 __attribute ((always_inline))
2876 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2877 re_charset_t *mbcset;
2878 Idx *range_alloc;
2879 bitset_t sbcset;
2880 bracket_elem_t *start_elem, *end_elem;
2882 unsigned int ch;
2883 uint32_t start_collseq;
2884 uint32_t end_collseq;
2886 /* Equivalence Classes and Character Classes can't be a range
2887 start/end. */
2888 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2889 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2891 return REG_ERANGE;
2893 start_collseq = lookup_collation_sequence_value (start_elem);
2894 end_collseq = lookup_collation_sequence_value (end_elem);
2895 /* Check start/end collation sequence values. */
2896 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2897 return REG_ECOLLATE;
2898 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2899 return REG_ERANGE;
2901 /* Got valid collation sequence values, add them as a new entry.
2902 However, if we have no collation elements, and the character set
2903 is single byte, the single byte character set that we
2904 build below suffices. */
2905 if (nrules > 0 || dfa->mb_cur_max > 1)
2907 /* Check the space of the arrays. */
2908 if (BE (*range_alloc == mbcset->nranges, 0))
2910 /* There is not enough space, need realloc. */
2911 uint32_t *new_array_start;
2912 uint32_t *new_array_end;
2913 Idx new_nranges;
2915 /* +1 in case of mbcset->nranges is 0. */
2916 new_nranges = 2 * mbcset->nranges + 1;
2917 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2918 new_nranges);
2919 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2920 new_nranges);
2922 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2923 return REG_ESPACE;
2925 mbcset->range_starts = new_array_start;
2926 mbcset->range_ends = new_array_end;
2927 *range_alloc = new_nranges;
2930 mbcset->range_starts[mbcset->nranges] = start_collseq;
2931 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2934 /* Build the table for single byte characters. */
2935 for (ch = 0; ch < SBC_MAX; ch++)
2937 uint32_t ch_collseq;
2939 if (MB_CUR_MAX == 1)
2941 if (nrules == 0)
2942 ch_collseq = collseqmb[ch];
2943 else
2944 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2945 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2946 bitset_set (sbcset, ch);
2948 return REG_NOERROR;
2951 /* Local function for parse_bracket_exp used in _LIBC environement.
2952 Build the collating element which is represented by NAME.
2953 The result are written to MBCSET and SBCSET.
2954 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2955 pointer argument sinse we may update it. */
2957 auto inline reg_errcode_t
2958 __attribute ((always_inline))
2959 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2960 re_charset_t *mbcset;
2961 Idx *coll_sym_alloc;
2962 bitset_t sbcset;
2963 const unsigned char *name;
2965 int32_t elem, idx;
2966 size_t name_len = strlen ((const char *) name);
2967 if (nrules != 0)
2969 elem = seek_collating_symbol_entry (name, name_len);
2970 if (symb_table[2 * elem] != 0)
2972 /* We found the entry. */
2973 idx = symb_table[2 * elem + 1];
2974 /* Skip the name of collating element name. */
2975 idx += 1 + extra[idx];
2977 else if (symb_table[2 * elem] == 0 && name_len == 1)
2979 /* No valid character, treat it as a normal
2980 character. */
2981 bitset_set (sbcset, name[0]);
2982 return REG_NOERROR;
2984 else
2985 return REG_ECOLLATE;
2987 /* Got valid collation sequence, add it as a new entry. */
2988 /* Check the space of the arrays. */
2989 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2991 /* Not enough, realloc it. */
2992 /* +1 in case of mbcset->ncoll_syms is 0. */
2993 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2994 /* Use realloc since mbcset->coll_syms is NULL
2995 if *alloc == 0. */
2996 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2997 new_coll_sym_alloc);
2998 if (BE (new_coll_syms == NULL, 0))
2999 return REG_ESPACE;
3000 mbcset->coll_syms = new_coll_syms;
3001 *coll_sym_alloc = new_coll_sym_alloc;
3003 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3004 return REG_NOERROR;
3006 else
3008 if (BE (name_len != 1, 0))
3009 return REG_ECOLLATE;
3010 else
3012 bitset_set (sbcset, name[0]);
3013 return REG_NOERROR;
3017 #endif
3019 re_token_t br_token;
3020 re_bitset_ptr_t sbcset;
3021 #ifdef RE_ENABLE_I18N
3022 re_charset_t *mbcset;
3023 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3024 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3025 #endif /* not RE_ENABLE_I18N */
3026 bool non_match = false;
3027 bin_tree_t *work_tree;
3028 int token_len;
3029 bool first_round = true;
3030 #ifdef _LIBC
3031 collseqmb = (const unsigned char *)
3032 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3033 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3034 if (nrules)
3037 if (MB_CUR_MAX > 1)
3039 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3040 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3041 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3042 _NL_COLLATE_SYMB_TABLEMB);
3043 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3044 _NL_COLLATE_SYMB_EXTRAMB);
3046 #endif
3047 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3048 #ifdef RE_ENABLE_I18N
3049 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3050 #endif /* RE_ENABLE_I18N */
3051 #ifdef RE_ENABLE_I18N
3052 if (BE (sbcset == NULL || mbcset == NULL, 0))
3053 #else
3054 if (BE (sbcset == NULL, 0))
3055 #endif /* RE_ENABLE_I18N */
3057 *err = REG_ESPACE;
3058 return NULL;
3061 token_len = peek_token_bracket (token, regexp, syntax);
3062 if (BE (token->type == END_OF_RE, 0))
3064 *err = REG_BADPAT;
3065 goto parse_bracket_exp_free_return;
3067 if (token->type == OP_NON_MATCH_LIST)
3069 #ifdef RE_ENABLE_I18N
3070 mbcset->non_match = 1;
3071 #endif /* not RE_ENABLE_I18N */
3072 non_match = true;
3073 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3074 bitset_set (sbcset, '\n');
3075 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3076 token_len = peek_token_bracket (token, regexp, syntax);
3077 if (BE (token->type == END_OF_RE, 0))
3079 *err = REG_BADPAT;
3080 goto parse_bracket_exp_free_return;
3084 /* We treat the first ']' as a normal character. */
3085 if (token->type == OP_CLOSE_BRACKET)
3086 token->type = CHARACTER;
3088 while (1)
3090 bracket_elem_t start_elem, end_elem;
3091 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3092 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3093 reg_errcode_t ret;
3094 int token_len2 = 0;
3095 bool is_range_exp = false;
3096 re_token_t token2;
3098 start_elem.opr.name = start_name_buf;
3099 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3100 syntax, first_round);
3101 if (BE (ret != REG_NOERROR, 0))
3103 *err = ret;
3104 goto parse_bracket_exp_free_return;
3106 first_round = false;
3108 /* Get information about the next token. We need it in any case. */
3109 token_len = peek_token_bracket (token, regexp, syntax);
3111 /* Do not check for ranges if we know they are not allowed. */
3112 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3114 if (BE (token->type == END_OF_RE, 0))
3116 *err = REG_EBRACK;
3117 goto parse_bracket_exp_free_return;
3119 if (token->type == OP_CHARSET_RANGE)
3121 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3122 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3123 if (BE (token2.type == END_OF_RE, 0))
3125 *err = REG_EBRACK;
3126 goto parse_bracket_exp_free_return;
3128 if (token2.type == OP_CLOSE_BRACKET)
3130 /* We treat the last '-' as a normal character. */
3131 re_string_skip_bytes (regexp, -token_len);
3132 token->type = CHARACTER;
3134 else
3135 is_range_exp = true;
3139 if (is_range_exp == true)
3141 end_elem.opr.name = end_name_buf;
3142 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3143 dfa, syntax, true);
3144 if (BE (ret != REG_NOERROR, 0))
3146 *err = ret;
3147 goto parse_bracket_exp_free_return;
3150 token_len = peek_token_bracket (token, regexp, syntax);
3152 #ifdef _LIBC
3153 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3154 &start_elem, &end_elem);
3155 #else
3156 # ifdef RE_ENABLE_I18N
3157 *err = build_range_exp (sbcset,
3158 dfa->mb_cur_max > 1 ? mbcset : NULL,
3159 &range_alloc, &start_elem, &end_elem);
3160 # else
3161 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3162 # endif
3163 #endif /* RE_ENABLE_I18N */
3164 if (BE (*err != REG_NOERROR, 0))
3165 goto parse_bracket_exp_free_return;
3167 else
3169 switch (start_elem.type)
3171 case SB_CHAR:
3172 bitset_set (sbcset, start_elem.opr.ch);
3173 break;
3174 #ifdef RE_ENABLE_I18N
3175 case MB_CHAR:
3176 /* Check whether the array has enough space. */
3177 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3179 wchar_t *new_mbchars;
3180 /* Not enough, realloc it. */
3181 /* +1 in case of mbcset->nmbchars is 0. */
3182 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3183 /* Use realloc since array is NULL if *alloc == 0. */
3184 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3185 mbchar_alloc);
3186 if (BE (new_mbchars == NULL, 0))
3187 goto parse_bracket_exp_espace;
3188 mbcset->mbchars = new_mbchars;
3190 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3191 break;
3192 #endif /* RE_ENABLE_I18N */
3193 case EQUIV_CLASS:
3194 *err = build_equiv_class (sbcset,
3195 #ifdef RE_ENABLE_I18N
3196 mbcset, &equiv_class_alloc,
3197 #endif /* RE_ENABLE_I18N */
3198 start_elem.opr.name);
3199 if (BE (*err != REG_NOERROR, 0))
3200 goto parse_bracket_exp_free_return;
3201 break;
3202 case COLL_SYM:
3203 *err = build_collating_symbol (sbcset,
3204 #ifdef RE_ENABLE_I18N
3205 mbcset, &coll_sym_alloc,
3206 #endif /* RE_ENABLE_I18N */
3207 start_elem.opr.name);
3208 if (BE (*err != REG_NOERROR, 0))
3209 goto parse_bracket_exp_free_return;
3210 break;
3211 case CHAR_CLASS:
3212 *err = build_charclass (regexp->trans, sbcset,
3213 #ifdef RE_ENABLE_I18N
3214 mbcset, &char_class_alloc,
3215 #endif /* RE_ENABLE_I18N */
3216 start_elem.opr.name, syntax);
3217 if (BE (*err != REG_NOERROR, 0))
3218 goto parse_bracket_exp_free_return;
3219 break;
3220 default:
3221 assert (0);
3222 break;
3225 if (BE (token->type == END_OF_RE, 0))
3227 *err = REG_EBRACK;
3228 goto parse_bracket_exp_free_return;
3230 if (token->type == OP_CLOSE_BRACKET)
3231 break;
3234 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3236 /* If it is non-matching list. */
3237 if (non_match)
3238 bitset_not (sbcset);
3240 #ifdef RE_ENABLE_I18N
3241 /* Ensure only single byte characters are set. */
3242 if (dfa->mb_cur_max > 1)
3243 bitset_mask (sbcset, dfa->sb_char);
3245 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3246 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3247 || mbcset->non_match)))
3249 bin_tree_t *mbc_tree;
3250 int sbc_idx;
3251 /* Build a tree for complex bracket. */
3252 dfa->has_mb_node = 1;
3253 br_token.type = COMPLEX_BRACKET;
3254 br_token.opr.mbcset = mbcset;
3255 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3256 if (BE (mbc_tree == NULL, 0))
3257 goto parse_bracket_exp_espace;
3258 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3259 if (sbcset[sbc_idx])
3260 break;
3261 /* If there are no bits set in sbcset, there is no point
3262 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3263 if (sbc_idx < BITSET_WORDS)
3265 /* Build a tree for simple bracket. */
3266 br_token.type = SIMPLE_BRACKET;
3267 br_token.opr.sbcset = sbcset;
3268 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3269 if (BE (work_tree == NULL, 0))
3270 goto parse_bracket_exp_espace;
3272 /* Then join them by ALT node. */
3273 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3274 if (BE (work_tree == NULL, 0))
3275 goto parse_bracket_exp_espace;
3277 else
3279 re_free (sbcset);
3280 work_tree = mbc_tree;
3283 else
3284 #endif /* not RE_ENABLE_I18N */
3286 #ifdef RE_ENABLE_I18N
3287 free_charset (mbcset);
3288 #endif
3289 /* Build a tree for simple bracket. */
3290 br_token.type = SIMPLE_BRACKET;
3291 br_token.opr.sbcset = sbcset;
3292 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3293 if (BE (work_tree == NULL, 0))
3294 goto parse_bracket_exp_espace;
3296 return work_tree;
3298 parse_bracket_exp_espace:
3299 *err = REG_ESPACE;
3300 parse_bracket_exp_free_return:
3301 re_free (sbcset);
3302 #ifdef RE_ENABLE_I18N
3303 free_charset (mbcset);
3304 #endif /* RE_ENABLE_I18N */
3305 return NULL;
3308 /* Parse an element in the bracket expression. */
3310 static reg_errcode_t
3311 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3312 re_token_t *token, int token_len, re_dfa_t *dfa,
3313 reg_syntax_t syntax, bool accept_hyphen)
3315 #ifdef RE_ENABLE_I18N
3316 int cur_char_size;
3317 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3318 if (cur_char_size > 1)
3320 elem->type = MB_CHAR;
3321 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3322 re_string_skip_bytes (regexp, cur_char_size);
3323 return REG_NOERROR;
3325 #endif /* RE_ENABLE_I18N */
3326 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3327 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3328 || token->type == OP_OPEN_EQUIV_CLASS)
3329 return parse_bracket_symbol (elem, regexp, token);
3330 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3332 /* A '-' must only appear as anything but a range indicator before
3333 the closing bracket. Everything else is an error. */
3334 re_token_t token2;
3335 (void) peek_token_bracket (&token2, regexp, syntax);
3336 if (token2.type != OP_CLOSE_BRACKET)
3337 /* The actual error value is not standardized since this whole
3338 case is undefined. But ERANGE makes good sense. */
3339 return REG_ERANGE;
3341 elem->type = SB_CHAR;
3342 elem->opr.ch = token->opr.c;
3343 return REG_NOERROR;
3346 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3347 such as [:<character_class>:], [.<collating_element>.], and
3348 [=<equivalent_class>=]. */
3350 static reg_errcode_t
3351 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3352 re_token_t *token)
3354 unsigned char ch, delim = token->opr.c;
3355 int i = 0;
3356 if (re_string_eoi(regexp))
3357 return REG_EBRACK;
3358 for (;; ++i)
3360 if (i >= BRACKET_NAME_BUF_SIZE)
3361 return REG_EBRACK;
3362 if (token->type == OP_OPEN_CHAR_CLASS)
3363 ch = re_string_fetch_byte_case (regexp);
3364 else
3365 ch = re_string_fetch_byte (regexp);
3366 if (re_string_eoi(regexp))
3367 return REG_EBRACK;
3368 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3369 break;
3370 elem->opr.name[i] = ch;
3372 re_string_skip_bytes (regexp, 1);
3373 elem->opr.name[i] = '\0';
3374 switch (token->type)
3376 case OP_OPEN_COLL_ELEM:
3377 elem->type = COLL_SYM;
3378 break;
3379 case OP_OPEN_EQUIV_CLASS:
3380 elem->type = EQUIV_CLASS;
3381 break;
3382 case OP_OPEN_CHAR_CLASS:
3383 elem->type = CHAR_CLASS;
3384 break;
3385 default:
3386 break;
3388 return REG_NOERROR;
3391 /* Helper function for parse_bracket_exp.
3392 Build the equivalence class which is represented by NAME.
3393 The result are written to MBCSET and SBCSET.
3394 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3395 is a pointer argument sinse we may update it. */
3397 static reg_errcode_t
3398 #ifdef RE_ENABLE_I18N
3399 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3400 Idx *equiv_class_alloc, const unsigned char *name)
3401 #else /* not RE_ENABLE_I18N */
3402 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3403 #endif /* not RE_ENABLE_I18N */
3405 #ifdef _LIBC
3406 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3407 if (nrules != 0)
3409 const int32_t *table, *indirect;
3410 const unsigned char *weights, *extra, *cp;
3411 unsigned char char_buf[2];
3412 int32_t idx1, idx2;
3413 unsigned int ch;
3414 size_t len;
3415 /* This #include defines a local function! */
3416 # include <locale/weight.h>
3417 /* Calculate the index for equivalence class. */
3418 cp = name;
3419 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3420 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3421 _NL_COLLATE_WEIGHTMB);
3422 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3423 _NL_COLLATE_EXTRAMB);
3424 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3425 _NL_COLLATE_INDIRECTMB);
3426 idx1 = findidx (&cp);
3427 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3428 /* This isn't a valid character. */
3429 return REG_ECOLLATE;
3431 /* Build single byte matcing table for this equivalence class. */
3432 char_buf[1] = (unsigned char) '\0';
3433 len = weights[idx1];
3434 for (ch = 0; ch < SBC_MAX; ++ch)
3436 char_buf[0] = ch;
3437 cp = char_buf;
3438 idx2 = findidx (&cp);
3440 idx2 = table[ch];
3442 if (idx2 == 0)
3443 /* This isn't a valid character. */
3444 continue;
3445 if (len == weights[idx2])
3447 int cnt = 0;
3448 while (cnt <= len &&
3449 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3450 ++cnt;
3452 if (cnt > len)
3453 bitset_set (sbcset, ch);
3456 /* Check whether the array has enough space. */
3457 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3459 /* Not enough, realloc it. */
3460 /* +1 in case of mbcset->nequiv_classes is 0. */
3461 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3462 /* Use realloc since the array is NULL if *alloc == 0. */
3463 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3464 int32_t,
3465 new_equiv_class_alloc);
3466 if (BE (new_equiv_classes == NULL, 0))
3467 return REG_ESPACE;
3468 mbcset->equiv_classes = new_equiv_classes;
3469 *equiv_class_alloc = new_equiv_class_alloc;
3471 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3473 else
3474 #endif /* _LIBC */
3476 if (BE (strlen ((const char *) name) != 1, 0))
3477 return REG_ECOLLATE;
3478 bitset_set (sbcset, *name);
3480 return REG_NOERROR;
3483 /* Helper function for parse_bracket_exp.
3484 Build the character class which is represented by NAME.
3485 The result are written to MBCSET and SBCSET.
3486 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3487 is a pointer argument sinse we may update it. */
3489 static reg_errcode_t
3490 #ifdef RE_ENABLE_I18N
3491 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3492 re_charset_t *mbcset, Idx *char_class_alloc,
3493 const unsigned char *class_name, reg_syntax_t syntax)
3494 #else /* not RE_ENABLE_I18N */
3495 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3496 const unsigned char *class_name, reg_syntax_t syntax)
3497 #endif /* not RE_ENABLE_I18N */
3499 int i;
3500 const char *name = (const char *) class_name;
3502 /* In case of REG_ICASE "upper" and "lower" match the both of
3503 upper and lower cases. */
3504 if ((syntax & RE_ICASE)
3505 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3506 name = "alpha";
3508 #ifdef RE_ENABLE_I18N
3509 /* Check the space of the arrays. */
3510 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3512 /* Not enough, realloc it. */
3513 /* +1 in case of mbcset->nchar_classes is 0. */
3514 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3515 /* Use realloc since array is NULL if *alloc == 0. */
3516 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3517 new_char_class_alloc);
3518 if (BE (new_char_classes == NULL, 0))
3519 return REG_ESPACE;
3520 mbcset->char_classes = new_char_classes;
3521 *char_class_alloc = new_char_class_alloc;
3523 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3524 #endif /* RE_ENABLE_I18N */
3526 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3527 do { \
3528 if (BE (trans != NULL, 0)) \
3530 for (i = 0; i < SBC_MAX; ++i) \
3531 if (ctype_func (i)) \
3532 bitset_set (sbcset, trans[i]); \
3534 else \
3536 for (i = 0; i < SBC_MAX; ++i) \
3537 if (ctype_func (i)) \
3538 bitset_set (sbcset, i); \
3540 } while (0)
3542 if (strcmp (name, "alnum") == 0)
3543 BUILD_CHARCLASS_LOOP (isalnum);
3544 else if (strcmp (name, "cntrl") == 0)
3545 BUILD_CHARCLASS_LOOP (iscntrl);
3546 else if (strcmp (name, "lower") == 0)
3547 BUILD_CHARCLASS_LOOP (islower);
3548 else if (strcmp (name, "space") == 0)
3549 BUILD_CHARCLASS_LOOP (isspace);
3550 else if (strcmp (name, "alpha") == 0)
3551 BUILD_CHARCLASS_LOOP (isalpha);
3552 else if (strcmp (name, "digit") == 0)
3553 BUILD_CHARCLASS_LOOP (isdigit);
3554 else if (strcmp (name, "print") == 0)
3555 BUILD_CHARCLASS_LOOP (isprint);
3556 else if (strcmp (name, "upper") == 0)
3557 BUILD_CHARCLASS_LOOP (isupper);
3558 else if (strcmp (name, "blank") == 0)
3559 BUILD_CHARCLASS_LOOP (isblank);
3560 else if (strcmp (name, "graph") == 0)
3561 BUILD_CHARCLASS_LOOP (isgraph);
3562 else if (strcmp (name, "punct") == 0)
3563 BUILD_CHARCLASS_LOOP (ispunct);
3564 else if (strcmp (name, "xdigit") == 0)
3565 BUILD_CHARCLASS_LOOP (isxdigit);
3566 else
3567 return REG_ECTYPE;
3569 return REG_NOERROR;
3572 static bin_tree_t *
3573 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3574 const unsigned char *class_name,
3575 const unsigned char *extra, bool non_match,
3576 reg_errcode_t *err)
3578 re_bitset_ptr_t sbcset;
3579 #ifdef RE_ENABLE_I18N
3580 re_charset_t *mbcset;
3581 Idx alloc = 0;
3582 #endif /* not RE_ENABLE_I18N */
3583 reg_errcode_t ret;
3584 re_token_t br_token;
3585 bin_tree_t *tree;
3587 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3588 #ifdef RE_ENABLE_I18N
3589 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3590 #endif /* RE_ENABLE_I18N */
3592 #ifdef RE_ENABLE_I18N
3593 if (BE (sbcset == NULL || mbcset == NULL, 0))
3594 #else /* not RE_ENABLE_I18N */
3595 if (BE (sbcset == NULL, 0))
3596 #endif /* not RE_ENABLE_I18N */
3598 *err = REG_ESPACE;
3599 return NULL;
3602 if (non_match)
3604 #ifdef RE_ENABLE_I18N
3605 mbcset->non_match = 1;
3606 #endif /* not RE_ENABLE_I18N */
3609 /* We don't care the syntax in this case. */
3610 ret = build_charclass (trans, sbcset,
3611 #ifdef RE_ENABLE_I18N
3612 mbcset, &alloc,
3613 #endif /* RE_ENABLE_I18N */
3614 class_name, 0);
3616 if (BE (ret != REG_NOERROR, 0))
3618 re_free (sbcset);
3619 #ifdef RE_ENABLE_I18N
3620 free_charset (mbcset);
3621 #endif /* RE_ENABLE_I18N */
3622 *err = ret;
3623 return NULL;
3625 /* \w match '_' also. */
3626 for (; *extra; extra++)
3627 bitset_set (sbcset, *extra);
3629 /* If it is non-matching list. */
3630 if (non_match)
3631 bitset_not (sbcset);
3633 #ifdef RE_ENABLE_I18N
3634 /* Ensure only single byte characters are set. */
3635 if (dfa->mb_cur_max > 1)
3636 bitset_mask (sbcset, dfa->sb_char);
3637 #endif
3639 /* Build a tree for simple bracket. */
3640 br_token.type = SIMPLE_BRACKET;
3641 br_token.opr.sbcset = sbcset;
3642 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3643 if (BE (tree == NULL, 0))
3644 goto build_word_op_espace;
3646 #ifdef RE_ENABLE_I18N
3647 if (dfa->mb_cur_max > 1)
3649 bin_tree_t *mbc_tree;
3650 /* Build a tree for complex bracket. */
3651 br_token.type = COMPLEX_BRACKET;
3652 br_token.opr.mbcset = mbcset;
3653 dfa->has_mb_node = 1;
3654 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3655 if (BE (mbc_tree == NULL, 0))
3656 goto build_word_op_espace;
3657 /* Then join them by ALT node. */
3658 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3659 if (BE (mbc_tree != NULL, 1))
3660 return tree;
3662 else
3664 free_charset (mbcset);
3665 return tree;
3667 #else /* not RE_ENABLE_I18N */
3668 return tree;
3669 #endif /* not RE_ENABLE_I18N */
3671 build_word_op_espace:
3672 re_free (sbcset);
3673 #ifdef RE_ENABLE_I18N
3674 free_charset (mbcset);
3675 #endif /* RE_ENABLE_I18N */
3676 *err = REG_ESPACE;
3677 return NULL;
3680 /* This is intended for the expressions like "a{1,3}".
3681 Fetch a number from `input', and return the number.
3682 Return REG_MISSING if the number field is empty like "{,1}".
3683 Return REG_ERROR if an error occurred. */
3685 static Idx
3686 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3688 Idx num = REG_MISSING;
3689 unsigned char c;
3690 while (1)
3692 fetch_token (token, input, syntax);
3693 c = token->opr.c;
3694 if (BE (token->type == END_OF_RE, 0))
3695 return REG_ERROR;
3696 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3697 break;
3698 num = ((token->type != CHARACTER || c < '0' || '9' < c
3699 || num == REG_ERROR)
3700 ? REG_ERROR
3701 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3702 num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3704 return num;
3707 #ifdef RE_ENABLE_I18N
3708 static void
3709 free_charset (re_charset_t *cset)
3711 re_free (cset->mbchars);
3712 # ifdef _LIBC
3713 re_free (cset->coll_syms);
3714 re_free (cset->equiv_classes);
3715 re_free (cset->range_starts);
3716 re_free (cset->range_ends);
3717 # endif
3718 re_free (cset->char_classes);
3719 re_free (cset);
3721 #endif /* RE_ENABLE_I18N */
3723 /* Functions for binary tree operation. */
3725 /* Create a tree node. */
3727 static bin_tree_t *
3728 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3729 re_token_type_t type)
3731 re_token_t t;
3732 t.type = type;
3733 return create_token_tree (dfa, left, right, &t);
3736 static bin_tree_t *
3737 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3738 const re_token_t *token)
3740 bin_tree_t *tree;
3741 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3743 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3745 if (storage == NULL)
3746 return NULL;
3747 storage->next = dfa->str_tree_storage;
3748 dfa->str_tree_storage = storage;
3749 dfa->str_tree_storage_idx = 0;
3751 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3753 tree->parent = NULL;
3754 tree->left = left;
3755 tree->right = right;
3756 tree->token = *token;
3757 tree->token.duplicated = 0;
3758 tree->token.opt_subexp = 0;
3759 tree->first = NULL;
3760 tree->next = NULL;
3761 tree->node_idx = REG_MISSING;
3763 if (left != NULL)
3764 left->parent = tree;
3765 if (right != NULL)
3766 right->parent = tree;
3767 return tree;
3770 /* Mark the tree SRC as an optional subexpression.
3771 To be called from preorder or postorder. */
3773 static reg_errcode_t
3774 mark_opt_subexp (void *extra, bin_tree_t *node)
3776 Idx idx = (Idx) (long) extra;
3777 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3778 node->token.opt_subexp = 1;
3780 return REG_NOERROR;
3783 /* Free the allocated memory inside NODE. */
3785 static void
3786 free_token (re_token_t *node)
3788 #ifdef RE_ENABLE_I18N
3789 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3790 free_charset (node->opr.mbcset);
3791 else
3792 #endif /* RE_ENABLE_I18N */
3793 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3794 re_free (node->opr.sbcset);
3797 /* Worker function for tree walking. Free the allocated memory inside NODE
3798 and its children. */
3800 static reg_errcode_t
3801 free_tree (void *extra, bin_tree_t *node)
3803 free_token (&node->token);
3804 return REG_NOERROR;
3808 /* Duplicate the node SRC, and return new node. This is a preorder
3809 visit similar to the one implemented by the generic visitor, but
3810 we need more infrastructure to maintain two parallel trees --- so,
3811 it's easier to duplicate. */
3813 static bin_tree_t *
3814 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3816 const bin_tree_t *node;
3817 bin_tree_t *dup_root;
3818 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3820 for (node = root; ; )
3822 /* Create a new tree and link it back to the current parent. */
3823 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3824 if (*p_new == NULL)
3825 return NULL;
3826 (*p_new)->parent = dup_node;
3827 (*p_new)->token.duplicated = 1;
3828 dup_node = *p_new;
3830 /* Go to the left node, or up and to the right. */
3831 if (node->left)
3833 node = node->left;
3834 p_new = &dup_node->left;
3836 else
3838 const bin_tree_t *prev = NULL;
3839 while (node->right == prev || node->right == NULL)
3841 prev = node;
3842 node = node->parent;
3843 dup_node = dup_node->parent;
3844 if (!node)
3845 return dup_root;
3847 node = node->right;
3848 p_new = &dup_node->right;