Hurd: fix mode type for openat
[glibc.git] / posix / regcomp.c
blob373a52ecffebc127b7a776371f407d18623580d2
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010,2011,2012 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 size_t length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
24 char *fastmap);
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26 #ifdef RE_ENABLE_I18N
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31 #ifdef RE_ENABLE_I18N
32 static void optimize_utf8 (re_dfa_t *dfa);
33 #endif
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
37 void *extra);
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 bin_tree_t *node);
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
49 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 int node, int root);
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static int fetch_number (re_string_t *input, re_token_t *token,
56 reg_syntax_t syntax);
57 static int peek_token (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax) internal_function;
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 int nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 int nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
78 reg_errcode_t *err);
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 re_string_t *regexp,
81 re_token_t *token, int token_len,
82 re_dfa_t *dfa,
83 reg_syntax_t syntax,
84 int accept_hyphen);
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86 re_string_t *regexp,
87 re_token_t *token);
88 #ifdef RE_ENABLE_I18N
89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
90 re_charset_t *mbcset,
91 int *equiv_class_alloc,
92 const unsigned char *name);
93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
94 bitset_t sbcset,
95 re_charset_t *mbcset,
96 int *char_class_alloc,
97 const unsigned char *class_name,
98 reg_syntax_t syntax);
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
101 const unsigned char *name);
102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103 bitset_t sbcset,
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 RE_TRANSLATE_TYPE trans,
109 const unsigned char *class_name,
110 const unsigned char *extra,
111 int non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 const char __re_error_msgid[] attribute_hidden =
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
132 "\0"
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
135 "\0"
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138 "\0"
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141 "\0"
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144 "\0"
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147 "\0"
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150 "\0"
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
153 "\0"
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156 "\0"
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159 "\0"
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162 "\0"
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
165 "\0"
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
168 "\0"
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171 "\0"
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
174 "\0"
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
177 "\0"
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 const size_t __re_error_msgid_idx[] attribute_hidden =
184 REG_NOERROR_IDX,
185 REG_NOMATCH_IDX,
186 REG_BADPAT_IDX,
187 REG_ECOLLATE_IDX,
188 REG_ECTYPE_IDX,
189 REG_EESCAPE_IDX,
190 REG_ESUBREG_IDX,
191 REG_EBRACK_IDX,
192 REG_EPAREN_IDX,
193 REG_EBRACE_IDX,
194 REG_BADBR_IDX,
195 REG_ERANGE_IDX,
196 REG_ESPACE_IDX,
197 REG_BADRPT_IDX,
198 REG_EEND_IDX,
199 REG_ESIZE_IDX,
200 REG_ERPAREN_IDX
203 /* Entry points for GNU code. */
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
209 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
210 are set in BUFP on entry. */
212 const char *
213 re_compile_pattern (pattern, length, bufp)
214 const char *pattern;
215 size_t length;
216 struct re_pattern_buffer *bufp;
218 reg_errcode_t ret;
220 /* And GNU code determines whether or not to get register information
221 by passing null for the REGS argument to re_match, etc., not by
222 setting no_sub, unless RE_NO_SUB is set. */
223 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
225 /* Match anchors at newline. */
226 bufp->newline_anchor = 1;
228 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
230 if (!ret)
231 return NULL;
232 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
234 #ifdef _LIBC
235 weak_alias (__re_compile_pattern, re_compile_pattern)
236 #endif
238 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
239 also be assigned to arbitrarily: each pattern buffer stores its own
240 syntax, so it can be changed between regex compilations. */
241 /* This has no initializer because initialized variables in Emacs
242 become read-only after dumping. */
243 reg_syntax_t re_syntax_options;
246 /* Specify the precise syntax of regexps for compilation. This provides
247 for compatibility for various utilities which historically have
248 different, incompatible syntaxes.
250 The argument SYNTAX is a bit mask comprised of the various bits
251 defined in regex.h. We return the old syntax. */
253 reg_syntax_t
254 re_set_syntax (syntax)
255 reg_syntax_t syntax;
257 reg_syntax_t ret = re_syntax_options;
259 re_syntax_options = syntax;
260 return ret;
262 #ifdef _LIBC
263 weak_alias (__re_set_syntax, re_set_syntax)
264 #endif
267 re_compile_fastmap (bufp)
268 struct re_pattern_buffer *bufp;
270 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
271 char *fastmap = bufp->fastmap;
273 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
274 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
275 if (dfa->init_state != dfa->init_state_word)
276 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
277 if (dfa->init_state != dfa->init_state_nl)
278 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
279 if (dfa->init_state != dfa->init_state_begbuf)
280 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
281 bufp->fastmap_accurate = 1;
282 return 0;
284 #ifdef _LIBC
285 weak_alias (__re_compile_fastmap, re_compile_fastmap)
286 #endif
288 static inline void
289 __attribute ((always_inline))
290 re_set_fastmap (char *fastmap, int icase, int ch)
292 fastmap[ch] = 1;
293 if (icase)
294 fastmap[tolower (ch)] = 1;
297 /* Helper function for re_compile_fastmap.
298 Compile fastmap for the initial_state INIT_STATE. */
300 static void
301 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
302 char *fastmap)
304 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
305 int node_cnt;
306 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
307 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
309 int node = init_state->nodes.elems[node_cnt];
310 re_token_type_t type = dfa->nodes[node].type;
312 if (type == CHARACTER)
314 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
315 #ifdef RE_ENABLE_I18N
316 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
318 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
319 wchar_t wc;
320 mbstate_t state;
322 p = buf;
323 *p++ = dfa->nodes[node].opr.c;
324 while (++node < dfa->nodes_len
325 && dfa->nodes[node].type == CHARACTER
326 && dfa->nodes[node].mb_partial)
327 *p++ = dfa->nodes[node].opr.c;
328 memset (&state, '\0', sizeof (state));
329 if (__mbrtowc (&wc, (const char *) buf, p - buf,
330 &state) == p - buf
331 && (__wcrtomb ((char *) buf, towlower (wc), &state)
332 != (size_t) -1))
333 re_set_fastmap (fastmap, 0, buf[0]);
335 #endif
337 else if (type == SIMPLE_BRACKET)
339 int i, ch;
340 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
342 int j;
343 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
344 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
345 if (w & ((bitset_word_t) 1 << j))
346 re_set_fastmap (fastmap, icase, ch);
349 #ifdef RE_ENABLE_I18N
350 else if (type == COMPLEX_BRACKET)
352 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
353 int i;
355 # ifdef _LIBC
356 /* See if we have to try all bytes which start multiple collation
357 elements.
358 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
359 collation element, and don't catch 'b' since 'b' is
360 the only collation element which starts from 'b' (and
361 it is caught by SIMPLE_BRACKET). */
362 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
363 && (cset->ncoll_syms || cset->nranges))
365 const int32_t *table = (const int32_t *)
366 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
367 for (i = 0; i < SBC_MAX; ++i)
368 if (table[i] < 0)
369 re_set_fastmap (fastmap, icase, i);
371 # endif /* _LIBC */
373 /* See if we have to start the match at all multibyte characters,
374 i.e. where we would not find an invalid sequence. This only
375 applies to multibyte character sets; for single byte character
376 sets, the SIMPLE_BRACKET again suffices. */
377 if (dfa->mb_cur_max > 1
378 && (cset->nchar_classes || cset->non_match || cset->nranges
379 # ifdef _LIBC
380 || cset->nequiv_classes
381 # endif /* _LIBC */
384 unsigned char c = 0;
387 mbstate_t mbs;
388 memset (&mbs, 0, sizeof (mbs));
389 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
390 re_set_fastmap (fastmap, false, (int) c);
392 while (++c != 0);
395 else
397 /* ... Else catch all bytes which can start the mbchars. */
398 for (i = 0; i < cset->nmbchars; ++i)
400 char buf[256];
401 mbstate_t state;
402 memset (&state, '\0', sizeof (state));
403 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
404 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
405 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
407 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
408 != (size_t) -1)
409 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
414 #endif /* RE_ENABLE_I18N */
415 else if (type == OP_PERIOD
416 #ifdef RE_ENABLE_I18N
417 || type == OP_UTF8_PERIOD
418 #endif /* RE_ENABLE_I18N */
419 || type == END_OF_RE)
421 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
422 if (type == END_OF_RE)
423 bufp->can_be_null = 1;
424 return;
429 /* Entry point for POSIX code. */
430 /* regcomp takes a regular expression as a string and compiles it.
432 PREG is a regex_t *. We do not expect any fields to be initialized,
433 since POSIX says we shouldn't. Thus, we set
435 `buffer' to the compiled pattern;
436 `used' to the length of the compiled pattern;
437 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
438 REG_EXTENDED bit in CFLAGS is set; otherwise, to
439 RE_SYNTAX_POSIX_BASIC;
440 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
441 `fastmap' to an allocated space for the fastmap;
442 `fastmap_accurate' to zero;
443 `re_nsub' to the number of subexpressions in PATTERN.
445 PATTERN is the address of the pattern string.
447 CFLAGS is a series of bits which affect compilation.
449 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
450 use POSIX basic syntax.
452 If REG_NEWLINE is set, then . and [^...] don't match newline.
453 Also, regexec will try a match beginning after every newline.
455 If REG_ICASE is set, then we considers upper- and lowercase
456 versions of letters to be equivalent when matching.
458 If REG_NOSUB is set, then when PREG is passed to regexec, that
459 routine will report only success or failure, and nothing about the
460 registers.
462 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
463 the return codes and their meanings.) */
466 regcomp (preg, pattern, cflags)
467 regex_t *__restrict preg;
468 const char *__restrict pattern;
469 int cflags;
471 reg_errcode_t ret;
472 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
473 : RE_SYNTAX_POSIX_BASIC);
475 preg->buffer = NULL;
476 preg->allocated = 0;
477 preg->used = 0;
479 /* Try to allocate space for the fastmap. */
480 preg->fastmap = re_malloc (char, SBC_MAX);
481 if (BE (preg->fastmap == NULL, 0))
482 return REG_ESPACE;
484 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
486 /* If REG_NEWLINE is set, newlines are treated differently. */
487 if (cflags & REG_NEWLINE)
488 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
489 syntax &= ~RE_DOT_NEWLINE;
490 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
491 /* It also changes the matching behavior. */
492 preg->newline_anchor = 1;
494 else
495 preg->newline_anchor = 0;
496 preg->no_sub = !!(cflags & REG_NOSUB);
497 preg->translate = NULL;
499 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
501 /* POSIX doesn't distinguish between an unmatched open-group and an
502 unmatched close-group: both are REG_EPAREN. */
503 if (ret == REG_ERPAREN)
504 ret = REG_EPAREN;
506 /* We have already checked preg->fastmap != NULL. */
507 if (BE (ret == REG_NOERROR, 1))
508 /* Compute the fastmap now, since regexec cannot modify the pattern
509 buffer. This function never fails in this implementation. */
510 (void) re_compile_fastmap (preg);
511 else
513 /* Some error occurred while compiling the expression. */
514 re_free (preg->fastmap);
515 preg->fastmap = NULL;
518 return (int) ret;
520 #ifdef _LIBC
521 weak_alias (__regcomp, regcomp)
522 #endif
524 /* Returns a message corresponding to an error code, ERRCODE, returned
525 from either regcomp or regexec. We don't use PREG here. */
527 size_t
528 regerror (errcode, preg, errbuf, errbuf_size)
529 int errcode;
530 const regex_t *__restrict preg;
531 char *__restrict errbuf;
532 size_t errbuf_size;
534 const char *msg;
535 size_t msg_size;
537 if (BE (errcode < 0
538 || errcode >= (int) (sizeof (__re_error_msgid_idx)
539 / sizeof (__re_error_msgid_idx[0])), 0))
540 /* Only error codes returned by the rest of the code should be passed
541 to this routine. If we are given anything else, or if other regex
542 code generates an invalid error code, then the program has a bug.
543 Dump core so we can fix it. */
544 abort ();
546 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
548 msg_size = strlen (msg) + 1; /* Includes the null. */
550 if (BE (errbuf_size != 0, 1))
552 if (BE (msg_size > errbuf_size, 0))
554 #if defined HAVE_MEMPCPY || defined _LIBC
555 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
556 #else
557 memcpy (errbuf, msg, errbuf_size - 1);
558 errbuf[errbuf_size - 1] = 0;
559 #endif
561 else
562 memcpy (errbuf, msg, msg_size);
565 return msg_size;
567 #ifdef _LIBC
568 weak_alias (__regerror, regerror)
569 #endif
572 #ifdef RE_ENABLE_I18N
573 /* This static array is used for the map to single-byte characters when
574 UTF-8 is used. Otherwise we would allocate memory just to initialize
575 it the same all the time. UTF-8 is the preferred encoding so this is
576 a worthwhile optimization. */
577 static const bitset_t utf8_sb_map =
579 /* Set the first 128 bits. */
580 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
582 #endif
585 static void
586 free_dfa_content (re_dfa_t *dfa)
588 int i, j;
590 if (dfa->nodes)
591 for (i = 0; i < dfa->nodes_len; ++i)
592 free_token (dfa->nodes + i);
593 re_free (dfa->nexts);
594 for (i = 0; i < dfa->nodes_len; ++i)
596 if (dfa->eclosures != NULL)
597 re_node_set_free (dfa->eclosures + i);
598 if (dfa->inveclosures != NULL)
599 re_node_set_free (dfa->inveclosures + i);
600 if (dfa->edests != NULL)
601 re_node_set_free (dfa->edests + i);
603 re_free (dfa->edests);
604 re_free (dfa->eclosures);
605 re_free (dfa->inveclosures);
606 re_free (dfa->nodes);
608 if (dfa->state_table)
609 for (i = 0; i <= dfa->state_hash_mask; ++i)
611 struct re_state_table_entry *entry = dfa->state_table + i;
612 for (j = 0; j < entry->num; ++j)
614 re_dfastate_t *state = entry->array[j];
615 free_state (state);
617 re_free (entry->array);
619 re_free (dfa->state_table);
620 #ifdef RE_ENABLE_I18N
621 if (dfa->sb_char != utf8_sb_map)
622 re_free (dfa->sb_char);
623 #endif
624 re_free (dfa->subexp_map);
625 #ifdef DEBUG
626 re_free (dfa->re_str);
627 #endif
629 re_free (dfa);
633 /* Free dynamically allocated space used by PREG. */
635 void
636 regfree (preg)
637 regex_t *preg;
639 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
640 if (BE (dfa != NULL, 1))
641 free_dfa_content (dfa);
642 preg->buffer = NULL;
643 preg->allocated = 0;
645 re_free (preg->fastmap);
646 preg->fastmap = NULL;
648 re_free (preg->translate);
649 preg->translate = NULL;
651 #ifdef _LIBC
652 weak_alias (__regfree, regfree)
653 #endif
655 /* Entry points compatible with 4.2 BSD regex library. We don't define
656 them unless specifically requested. */
658 #if defined _REGEX_RE_COMP || defined _LIBC
660 /* BSD has one and only one pattern buffer. */
661 static struct re_pattern_buffer re_comp_buf;
663 char *
664 # ifdef _LIBC
665 /* Make these definitions weak in libc, so POSIX programs can redefine
666 these names if they don't use our functions, and still use
667 regcomp/regexec above without link errors. */
668 weak_function
669 # endif
670 re_comp (s)
671 const char *s;
673 reg_errcode_t ret;
674 char *fastmap;
676 if (!s)
678 if (!re_comp_buf.buffer)
679 return gettext ("No previous regular expression");
680 return 0;
683 if (re_comp_buf.buffer)
685 fastmap = re_comp_buf.fastmap;
686 re_comp_buf.fastmap = NULL;
687 __regfree (&re_comp_buf);
688 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
689 re_comp_buf.fastmap = fastmap;
692 if (re_comp_buf.fastmap == NULL)
694 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
695 if (re_comp_buf.fastmap == NULL)
696 return (char *) gettext (__re_error_msgid
697 + __re_error_msgid_idx[(int) REG_ESPACE]);
700 /* Since `re_exec' always passes NULL for the `regs' argument, we
701 don't need to initialize the pattern buffer fields which affect it. */
703 /* Match anchors at newlines. */
704 re_comp_buf.newline_anchor = 1;
706 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
708 if (!ret)
709 return NULL;
711 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
712 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
715 #ifdef _LIBC
716 libc_freeres_fn (free_mem)
718 __regfree (&re_comp_buf);
720 #endif
722 #endif /* _REGEX_RE_COMP */
724 /* Internal entry point.
725 Compile the regular expression PATTERN, whose length is LENGTH.
726 SYNTAX indicate regular expression's syntax. */
728 static reg_errcode_t
729 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
730 reg_syntax_t syntax)
732 reg_errcode_t err = REG_NOERROR;
733 re_dfa_t *dfa;
734 re_string_t regexp;
736 /* Initialize the pattern buffer. */
737 preg->fastmap_accurate = 0;
738 preg->syntax = syntax;
739 preg->not_bol = preg->not_eol = 0;
740 preg->used = 0;
741 preg->re_nsub = 0;
742 preg->can_be_null = 0;
743 preg->regs_allocated = REGS_UNALLOCATED;
745 /* Initialize the dfa. */
746 dfa = (re_dfa_t *) preg->buffer;
747 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
749 /* If zero allocated, but buffer is non-null, try to realloc
750 enough space. This loses if buffer's address is bogus, but
751 that is the user's responsibility. If ->buffer is NULL this
752 is a simple allocation. */
753 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
754 if (dfa == NULL)
755 return REG_ESPACE;
756 preg->allocated = sizeof (re_dfa_t);
757 preg->buffer = (unsigned char *) dfa;
759 preg->used = sizeof (re_dfa_t);
761 err = init_dfa (dfa, length);
762 if (BE (err != REG_NOERROR, 0))
764 free_dfa_content (dfa);
765 preg->buffer = NULL;
766 preg->allocated = 0;
767 return err;
769 #ifdef DEBUG
770 /* Note: length+1 will not overflow since it is checked in init_dfa. */
771 dfa->re_str = re_malloc (char, length + 1);
772 strncpy (dfa->re_str, pattern, length + 1);
773 #endif
775 __libc_lock_init (dfa->lock);
777 err = re_string_construct (&regexp, pattern, length, preg->translate,
778 syntax & RE_ICASE, dfa);
779 if (BE (err != REG_NOERROR, 0))
781 re_compile_internal_free_return:
782 free_workarea_compile (preg);
783 re_string_destruct (&regexp);
784 free_dfa_content (dfa);
785 preg->buffer = NULL;
786 preg->allocated = 0;
787 return err;
790 /* Parse the regular expression, and build a structure tree. */
791 preg->re_nsub = 0;
792 dfa->str_tree = parse (&regexp, preg, syntax, &err);
793 if (BE (dfa->str_tree == NULL, 0))
794 goto re_compile_internal_free_return;
796 /* Analyze the tree and create the nfa. */
797 err = analyze (preg);
798 if (BE (err != REG_NOERROR, 0))
799 goto re_compile_internal_free_return;
801 #ifdef RE_ENABLE_I18N
802 /* If possible, do searching in single byte encoding to speed things up. */
803 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
804 optimize_utf8 (dfa);
805 #endif
807 /* Then create the initial state of the dfa. */
808 err = create_initial_state (dfa);
810 /* Release work areas. */
811 free_workarea_compile (preg);
812 re_string_destruct (&regexp);
814 if (BE (err != REG_NOERROR, 0))
816 free_dfa_content (dfa);
817 preg->buffer = NULL;
818 preg->allocated = 0;
821 return err;
824 /* Initialize DFA. We use the length of the regular expression PAT_LEN
825 as the initial length of some arrays. */
827 static reg_errcode_t
828 init_dfa (re_dfa_t *dfa, size_t pat_len)
830 unsigned int table_size;
831 #ifndef _LIBC
832 char *codeset_name;
833 #endif
835 memset (dfa, '\0', sizeof (re_dfa_t));
837 /* Force allocation of str_tree_storage the first time. */
838 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
840 /* Avoid overflows. */
841 if (pat_len == SIZE_MAX)
842 return REG_ESPACE;
844 dfa->nodes_alloc = pat_len + 1;
845 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
847 /* table_size = 2 ^ ceil(log pat_len) */
848 for (table_size = 1; ; table_size <<= 1)
849 if (table_size > pat_len)
850 break;
852 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
853 dfa->state_hash_mask = table_size - 1;
855 dfa->mb_cur_max = MB_CUR_MAX;
856 #ifdef _LIBC
857 if (dfa->mb_cur_max == 6
858 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
859 dfa->is_utf8 = 1;
860 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
861 != 0);
862 #else
863 # ifdef HAVE_LANGINFO_CODESET
864 codeset_name = nl_langinfo (CODESET);
865 # else
866 codeset_name = getenv ("LC_ALL");
867 if (codeset_name == NULL || codeset_name[0] == '\0')
868 codeset_name = getenv ("LC_CTYPE");
869 if (codeset_name == NULL || codeset_name[0] == '\0')
870 codeset_name = getenv ("LANG");
871 if (codeset_name == NULL)
872 codeset_name = "";
873 else if (strchr (codeset_name, '.') != NULL)
874 codeset_name = strchr (codeset_name, '.') + 1;
875 # endif
877 if (strcasecmp (codeset_name, "UTF-8") == 0
878 || strcasecmp (codeset_name, "UTF8") == 0)
879 dfa->is_utf8 = 1;
881 /* We check exhaustively in the loop below if this charset is a
882 superset of ASCII. */
883 dfa->map_notascii = 0;
884 #endif
886 #ifdef RE_ENABLE_I18N
887 if (dfa->mb_cur_max > 1)
889 if (dfa->is_utf8)
890 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
891 else
893 int i, j, ch;
895 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
896 if (BE (dfa->sb_char == NULL, 0))
897 return REG_ESPACE;
899 /* Set the bits corresponding to single byte chars. */
900 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
901 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
903 wint_t wch = __btowc (ch);
904 if (wch != WEOF)
905 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
906 # ifndef _LIBC
907 if (isascii (ch) && wch != ch)
908 dfa->map_notascii = 1;
909 # endif
913 #endif
915 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
916 return REG_ESPACE;
917 return REG_NOERROR;
920 /* Initialize WORD_CHAR table, which indicate which character is
921 "word". In this case "word" means that it is the word construction
922 character used by some operators like "\<", "\>", etc. */
924 static void
925 internal_function
926 init_word_char (re_dfa_t *dfa)
928 dfa->word_ops_used = 1;
929 int i = 0;
930 int ch = 0;
931 if (BE (dfa->map_notascii == 0, 1))
933 if (sizeof (dfa->word_char[0]) == 8)
935 dfa->word_char[0] = UINT64_C (0x03ff000000000000);
936 dfa->word_char[1] = UINT64_C (0x07fffffe87fffffe);
937 i = 2;
939 else if (sizeof (dfa->word_char[0]) == 4)
941 dfa->word_char[0] = UINT32_C (0x00000000);
942 dfa->word_char[1] = UINT32_C (0x03ff0000);
943 dfa->word_char[2] = UINT32_C (0x87fffffe);
944 dfa->word_char[3] = UINT32_C (0x07fffffe);
945 i = 4;
947 else
948 abort ();
949 ch = 128;
951 if (BE (dfa->is_utf8, 1))
953 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
954 return;
958 for (; i < BITSET_WORDS; ++i)
959 for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
960 if (isalnum (ch) || ch == '_')
961 dfa->word_char[i] |= (bitset_word_t) 1 << j;
964 /* Free the work area which are only used while compiling. */
966 static void
967 free_workarea_compile (regex_t *preg)
969 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
970 bin_tree_storage_t *storage, *next;
971 for (storage = dfa->str_tree_storage; storage; storage = next)
973 next = storage->next;
974 re_free (storage);
976 dfa->str_tree_storage = NULL;
977 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
978 dfa->str_tree = NULL;
979 re_free (dfa->org_indices);
980 dfa->org_indices = NULL;
983 /* Create initial states for all contexts. */
985 static reg_errcode_t
986 create_initial_state (re_dfa_t *dfa)
988 int first, i;
989 reg_errcode_t err;
990 re_node_set init_nodes;
992 /* Initial states have the epsilon closure of the node which is
993 the first node of the regular expression. */
994 first = dfa->str_tree->first->node_idx;
995 dfa->init_node = first;
996 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
997 if (BE (err != REG_NOERROR, 0))
998 return err;
1000 /* The back-references which are in initial states can epsilon transit,
1001 since in this case all of the subexpressions can be null.
1002 Then we add epsilon closures of the nodes which are the next nodes of
1003 the back-references. */
1004 if (dfa->nbackref > 0)
1005 for (i = 0; i < init_nodes.nelem; ++i)
1007 int node_idx = init_nodes.elems[i];
1008 re_token_type_t type = dfa->nodes[node_idx].type;
1010 int clexp_idx;
1011 if (type != OP_BACK_REF)
1012 continue;
1013 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1015 re_token_t *clexp_node;
1016 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1017 if (clexp_node->type == OP_CLOSE_SUBEXP
1018 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1019 break;
1021 if (clexp_idx == init_nodes.nelem)
1022 continue;
1024 if (type == OP_BACK_REF)
1026 int dest_idx = dfa->edests[node_idx].elems[0];
1027 if (!re_node_set_contains (&init_nodes, dest_idx))
1029 reg_errcode_t err = re_node_set_merge (&init_nodes,
1030 dfa->eclosures
1031 + dest_idx);
1032 if (err != REG_NOERROR)
1033 return err;
1034 i = 0;
1039 /* It must be the first time to invoke acquire_state. */
1040 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1041 /* We don't check ERR here, since the initial state must not be NULL. */
1042 if (BE (dfa->init_state == NULL, 0))
1043 return err;
1044 if (dfa->init_state->has_constraint)
1046 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1047 CONTEXT_WORD);
1048 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1049 CONTEXT_NEWLINE);
1050 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1051 &init_nodes,
1052 CONTEXT_NEWLINE
1053 | CONTEXT_BEGBUF);
1054 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1055 || dfa->init_state_begbuf == NULL, 0))
1056 return err;
1058 else
1059 dfa->init_state_word = dfa->init_state_nl
1060 = dfa->init_state_begbuf = dfa->init_state;
1062 re_node_set_free (&init_nodes);
1063 return REG_NOERROR;
1066 #ifdef RE_ENABLE_I18N
1067 /* If it is possible to do searching in single byte encoding instead of UTF-8
1068 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1069 DFA nodes where needed. */
1071 static void
1072 optimize_utf8 (re_dfa_t *dfa)
1074 int node, i, mb_chars = 0, has_period = 0;
1076 for (node = 0; node < dfa->nodes_len; ++node)
1077 switch (dfa->nodes[node].type)
1079 case CHARACTER:
1080 if (dfa->nodes[node].opr.c >= 0x80)
1081 mb_chars = 1;
1082 break;
1083 case ANCHOR:
1084 switch (dfa->nodes[node].opr.ctx_type)
1086 case LINE_FIRST:
1087 case LINE_LAST:
1088 case BUF_FIRST:
1089 case BUF_LAST:
1090 break;
1091 default:
1092 /* Word anchors etc. cannot be handled. It's okay to test
1093 opr.ctx_type since constraints (for all DFA nodes) are
1094 created by ORing one or more opr.ctx_type values. */
1095 return;
1097 break;
1098 case OP_PERIOD:
1099 has_period = 1;
1100 break;
1101 case OP_BACK_REF:
1102 case OP_ALT:
1103 case END_OF_RE:
1104 case OP_DUP_ASTERISK:
1105 case OP_OPEN_SUBEXP:
1106 case OP_CLOSE_SUBEXP:
1107 break;
1108 case COMPLEX_BRACKET:
1109 return;
1110 case SIMPLE_BRACKET:
1111 /* Just double check. The non-ASCII range starts at 0x80. */
1112 assert (0x80 % BITSET_WORD_BITS == 0);
1113 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1114 if (dfa->nodes[node].opr.sbcset[i])
1115 return;
1116 break;
1117 default:
1118 abort ();
1121 if (mb_chars || has_period)
1122 for (node = 0; node < dfa->nodes_len; ++node)
1124 if (dfa->nodes[node].type == CHARACTER
1125 && dfa->nodes[node].opr.c >= 0x80)
1126 dfa->nodes[node].mb_partial = 0;
1127 else if (dfa->nodes[node].type == OP_PERIOD)
1128 dfa->nodes[node].type = OP_UTF8_PERIOD;
1131 /* The search can be in single byte locale. */
1132 dfa->mb_cur_max = 1;
1133 dfa->is_utf8 = 0;
1134 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1136 #endif
1138 /* Analyze the structure tree, and calculate "first", "next", "edest",
1139 "eclosure", and "inveclosure". */
1141 static reg_errcode_t
1142 analyze (regex_t *preg)
1144 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1145 reg_errcode_t ret;
1147 /* Allocate arrays. */
1148 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1149 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1150 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1151 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1152 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1153 || dfa->eclosures == NULL, 0))
1154 return REG_ESPACE;
1156 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1157 if (dfa->subexp_map != NULL)
1159 int i;
1160 for (i = 0; i < preg->re_nsub; i++)
1161 dfa->subexp_map[i] = i;
1162 preorder (dfa->str_tree, optimize_subexps, dfa);
1163 for (i = 0; i < preg->re_nsub; i++)
1164 if (dfa->subexp_map[i] != i)
1165 break;
1166 if (i == preg->re_nsub)
1168 free (dfa->subexp_map);
1169 dfa->subexp_map = NULL;
1173 ret = postorder (dfa->str_tree, lower_subexps, preg);
1174 if (BE (ret != REG_NOERROR, 0))
1175 return ret;
1176 ret = postorder (dfa->str_tree, calc_first, dfa);
1177 if (BE (ret != REG_NOERROR, 0))
1178 return ret;
1179 preorder (dfa->str_tree, calc_next, dfa);
1180 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1181 if (BE (ret != REG_NOERROR, 0))
1182 return ret;
1183 ret = calc_eclosure (dfa);
1184 if (BE (ret != REG_NOERROR, 0))
1185 return ret;
1187 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1188 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1189 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1190 || dfa->nbackref)
1192 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1193 if (BE (dfa->inveclosures == NULL, 0))
1194 return REG_ESPACE;
1195 ret = calc_inveclosure (dfa);
1198 return ret;
1201 /* Our parse trees are very unbalanced, so we cannot use a stack to
1202 implement parse tree visits. Instead, we use parent pointers and
1203 some hairy code in these two functions. */
1204 static reg_errcode_t
1205 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1206 void *extra)
1208 bin_tree_t *node, *prev;
1210 for (node = root; ; )
1212 /* Descend down the tree, preferably to the left (or to the right
1213 if that's the only child). */
1214 while (node->left || node->right)
1215 if (node->left)
1216 node = node->left;
1217 else
1218 node = node->right;
1222 reg_errcode_t err = fn (extra, node);
1223 if (BE (err != REG_NOERROR, 0))
1224 return err;
1225 if (node->parent == NULL)
1226 return REG_NOERROR;
1227 prev = node;
1228 node = node->parent;
1230 /* Go up while we have a node that is reached from the right. */
1231 while (node->right == prev || node->right == NULL);
1232 node = node->right;
1236 static reg_errcode_t
1237 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1238 void *extra)
1240 bin_tree_t *node;
1242 for (node = root; ; )
1244 reg_errcode_t err = fn (extra, node);
1245 if (BE (err != REG_NOERROR, 0))
1246 return err;
1248 /* Go to the left node, or up and to the right. */
1249 if (node->left)
1250 node = node->left;
1251 else
1253 bin_tree_t *prev = NULL;
1254 while (node->right == prev || node->right == NULL)
1256 prev = node;
1257 node = node->parent;
1258 if (!node)
1259 return REG_NOERROR;
1261 node = node->right;
1266 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1267 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1268 backreferences as well. Requires a preorder visit. */
1269 static reg_errcode_t
1270 optimize_subexps (void *extra, bin_tree_t *node)
1272 re_dfa_t *dfa = (re_dfa_t *) extra;
1274 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1276 int idx = node->token.opr.idx;
1277 node->token.opr.idx = dfa->subexp_map[idx];
1278 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1281 else if (node->token.type == SUBEXP
1282 && node->left && node->left->token.type == SUBEXP)
1284 int other_idx = node->left->token.opr.idx;
1286 node->left = node->left->left;
1287 if (node->left)
1288 node->left->parent = node;
1290 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1291 if (other_idx < BITSET_WORD_BITS)
1292 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1295 return REG_NOERROR;
1298 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1299 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1300 static reg_errcode_t
1301 lower_subexps (void *extra, bin_tree_t *node)
1303 regex_t *preg = (regex_t *) extra;
1304 reg_errcode_t err = REG_NOERROR;
1306 if (node->left && node->left->token.type == SUBEXP)
1308 node->left = lower_subexp (&err, preg, node->left);
1309 if (node->left)
1310 node->left->parent = node;
1312 if (node->right && node->right->token.type == SUBEXP)
1314 node->right = lower_subexp (&err, preg, node->right);
1315 if (node->right)
1316 node->right->parent = node;
1319 return err;
1322 static bin_tree_t *
1323 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1325 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1326 bin_tree_t *body = node->left;
1327 bin_tree_t *op, *cls, *tree1, *tree;
1329 if (preg->no_sub
1330 /* We do not optimize empty subexpressions, because otherwise we may
1331 have bad CONCAT nodes with NULL children. This is obviously not
1332 very common, so we do not lose much. An example that triggers
1333 this case is the sed "script" /\(\)/x. */
1334 && node->left != NULL
1335 && (node->token.opr.idx >= BITSET_WORD_BITS
1336 || !(dfa->used_bkref_map
1337 & ((bitset_word_t) 1 << node->token.opr.idx))))
1338 return node->left;
1340 /* Convert the SUBEXP node to the concatenation of an
1341 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1342 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1343 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1344 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1345 tree = create_tree (dfa, op, tree1, CONCAT);
1346 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1348 *err = REG_ESPACE;
1349 return NULL;
1352 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1353 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1354 return tree;
1357 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1358 nodes. Requires a postorder visit. */
1359 static reg_errcode_t
1360 calc_first (void *extra, bin_tree_t *node)
1362 re_dfa_t *dfa = (re_dfa_t *) extra;
1363 if (node->token.type == CONCAT)
1365 node->first = node->left->first;
1366 node->node_idx = node->left->node_idx;
1368 else
1370 node->first = node;
1371 node->node_idx = re_dfa_add_node (dfa, node->token);
1372 if (BE (node->node_idx == -1, 0))
1373 return REG_ESPACE;
1374 if (node->token.type == ANCHOR)
1375 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1377 return REG_NOERROR;
1380 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1381 static reg_errcode_t
1382 calc_next (void *extra, bin_tree_t *node)
1384 switch (node->token.type)
1386 case OP_DUP_ASTERISK:
1387 node->left->next = node;
1388 break;
1389 case CONCAT:
1390 node->left->next = node->right->first;
1391 node->right->next = node->next;
1392 break;
1393 default:
1394 if (node->left)
1395 node->left->next = node->next;
1396 if (node->right)
1397 node->right->next = node->next;
1398 break;
1400 return REG_NOERROR;
1403 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1404 static reg_errcode_t
1405 link_nfa_nodes (void *extra, bin_tree_t *node)
1407 re_dfa_t *dfa = (re_dfa_t *) extra;
1408 int idx = node->node_idx;
1409 reg_errcode_t err = REG_NOERROR;
1411 switch (node->token.type)
1413 case CONCAT:
1414 break;
1416 case END_OF_RE:
1417 assert (node->next == NULL);
1418 break;
1420 case OP_DUP_ASTERISK:
1421 case OP_ALT:
1423 int left, right;
1424 dfa->has_plural_match = 1;
1425 if (node->left != NULL)
1426 left = node->left->first->node_idx;
1427 else
1428 left = node->next->node_idx;
1429 if (node->right != NULL)
1430 right = node->right->first->node_idx;
1431 else
1432 right = node->next->node_idx;
1433 assert (left > -1);
1434 assert (right > -1);
1435 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1437 break;
1439 case ANCHOR:
1440 case OP_OPEN_SUBEXP:
1441 case OP_CLOSE_SUBEXP:
1442 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1443 break;
1445 case OP_BACK_REF:
1446 dfa->nexts[idx] = node->next->node_idx;
1447 if (node->token.type == OP_BACK_REF)
1448 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1449 break;
1451 default:
1452 assert (!IS_EPSILON_NODE (node->token.type));
1453 dfa->nexts[idx] = node->next->node_idx;
1454 break;
1457 return err;
1460 /* Duplicate the epsilon closure of the node ROOT_NODE.
1461 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1462 to their own constraint. */
1464 static reg_errcode_t
1465 internal_function
1466 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1467 int root_node, unsigned int init_constraint)
1469 int org_node, clone_node, ret;
1470 unsigned int constraint = init_constraint;
1471 for (org_node = top_org_node, clone_node = top_clone_node;;)
1473 int org_dest, clone_dest;
1474 if (dfa->nodes[org_node].type == OP_BACK_REF)
1476 /* If the back reference epsilon-transit, its destination must
1477 also have the constraint. Then duplicate the epsilon closure
1478 of the destination of the back reference, and store it in
1479 edests of the back reference. */
1480 org_dest = dfa->nexts[org_node];
1481 re_node_set_empty (dfa->edests + clone_node);
1482 clone_dest = duplicate_node (dfa, org_dest, constraint);
1483 if (BE (clone_dest == -1, 0))
1484 return REG_ESPACE;
1485 dfa->nexts[clone_node] = dfa->nexts[org_node];
1486 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1487 if (BE (ret < 0, 0))
1488 return REG_ESPACE;
1490 else if (dfa->edests[org_node].nelem == 0)
1492 /* In case of the node can't epsilon-transit, don't duplicate the
1493 destination and store the original destination as the
1494 destination of the node. */
1495 dfa->nexts[clone_node] = dfa->nexts[org_node];
1496 break;
1498 else if (dfa->edests[org_node].nelem == 1)
1500 /* In case of the node can epsilon-transit, and it has only one
1501 destination. */
1502 org_dest = dfa->edests[org_node].elems[0];
1503 re_node_set_empty (dfa->edests + clone_node);
1504 /* If the node is root_node itself, it means the epsilon clsoure
1505 has a loop. Then tie it to the destination of the root_node. */
1506 if (org_node == root_node && clone_node != org_node)
1508 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1509 if (BE (ret < 0, 0))
1510 return REG_ESPACE;
1511 break;
1513 /* In case of the node has another constraint, add it. */
1514 constraint |= dfa->nodes[org_node].constraint;
1515 clone_dest = duplicate_node (dfa, org_dest, constraint);
1516 if (BE (clone_dest == -1, 0))
1517 return REG_ESPACE;
1518 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1519 if (BE (ret < 0, 0))
1520 return REG_ESPACE;
1522 else /* dfa->edests[org_node].nelem == 2 */
1524 /* In case of the node can epsilon-transit, and it has two
1525 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1526 org_dest = dfa->edests[org_node].elems[0];
1527 re_node_set_empty (dfa->edests + clone_node);
1528 /* Search for a duplicated node which satisfies the constraint. */
1529 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1530 if (clone_dest == -1)
1532 /* There is no such duplicated node, create a new one. */
1533 reg_errcode_t err;
1534 clone_dest = duplicate_node (dfa, org_dest, constraint);
1535 if (BE (clone_dest == -1, 0))
1536 return REG_ESPACE;
1537 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1538 if (BE (ret < 0, 0))
1539 return REG_ESPACE;
1540 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1541 root_node, constraint);
1542 if (BE (err != REG_NOERROR, 0))
1543 return err;
1545 else
1547 /* There is a duplicated node which satisfies the constraint,
1548 use it to avoid infinite loop. */
1549 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1550 if (BE (ret < 0, 0))
1551 return REG_ESPACE;
1554 org_dest = dfa->edests[org_node].elems[1];
1555 clone_dest = duplicate_node (dfa, org_dest, constraint);
1556 if (BE (clone_dest == -1, 0))
1557 return REG_ESPACE;
1558 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1559 if (BE (ret < 0, 0))
1560 return REG_ESPACE;
1562 org_node = org_dest;
1563 clone_node = clone_dest;
1565 return REG_NOERROR;
1568 /* Search for a node which is duplicated from the node ORG_NODE, and
1569 satisfies the constraint CONSTRAINT. */
1571 static int
1572 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1573 unsigned int constraint)
1575 int idx;
1576 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1578 if (org_node == dfa->org_indices[idx]
1579 && constraint == dfa->nodes[idx].constraint)
1580 return idx; /* Found. */
1582 return -1; /* Not found. */
1585 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1586 Return the index of the new node, or -1 if insufficient storage is
1587 available. */
1589 static int
1590 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1592 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1593 if (BE (dup_idx != -1, 1))
1595 dfa->nodes[dup_idx].constraint = constraint;
1596 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1597 dfa->nodes[dup_idx].duplicated = 1;
1599 /* Store the index of the original node. */
1600 dfa->org_indices[dup_idx] = org_idx;
1602 return dup_idx;
1605 static reg_errcode_t
1606 calc_inveclosure (re_dfa_t *dfa)
1608 int src, idx, ret;
1609 for (idx = 0; idx < dfa->nodes_len; ++idx)
1610 re_node_set_init_empty (dfa->inveclosures + idx);
1612 for (src = 0; src < dfa->nodes_len; ++src)
1614 int *elems = dfa->eclosures[src].elems;
1615 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1617 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1618 if (BE (ret == -1, 0))
1619 return REG_ESPACE;
1623 return REG_NOERROR;
1626 /* Calculate "eclosure" for all the node in DFA. */
1628 static reg_errcode_t
1629 calc_eclosure (re_dfa_t *dfa)
1631 int node_idx, incomplete;
1632 #ifdef DEBUG
1633 assert (dfa->nodes_len > 0);
1634 #endif
1635 incomplete = 0;
1636 /* For each nodes, calculate epsilon closure. */
1637 for (node_idx = 0; ; ++node_idx)
1639 reg_errcode_t err;
1640 re_node_set eclosure_elem;
1641 if (node_idx == dfa->nodes_len)
1643 if (!incomplete)
1644 break;
1645 incomplete = 0;
1646 node_idx = 0;
1649 #ifdef DEBUG
1650 assert (dfa->eclosures[node_idx].nelem != -1);
1651 #endif
1653 /* If we have already calculated, skip it. */
1654 if (dfa->eclosures[node_idx].nelem != 0)
1655 continue;
1656 /* Calculate epsilon closure of `node_idx'. */
1657 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1658 if (BE (err != REG_NOERROR, 0))
1659 return err;
1661 if (dfa->eclosures[node_idx].nelem == 0)
1663 incomplete = 1;
1664 re_node_set_free (&eclosure_elem);
1667 return REG_NOERROR;
1670 /* Calculate epsilon closure of NODE. */
1672 static reg_errcode_t
1673 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1675 reg_errcode_t err;
1676 int i;
1677 re_node_set eclosure;
1678 int ret;
1679 int incomplete = 0;
1680 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1681 if (BE (err != REG_NOERROR, 0))
1682 return err;
1684 /* This indicates that we are calculating this node now.
1685 We reference this value to avoid infinite loop. */
1686 dfa->eclosures[node].nelem = -1;
1688 /* If the current node has constraints, duplicate all nodes
1689 since they must inherit the constraints. */
1690 if (dfa->nodes[node].constraint
1691 && dfa->edests[node].nelem
1692 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1694 err = duplicate_node_closure (dfa, node, node, node,
1695 dfa->nodes[node].constraint);
1696 if (BE (err != REG_NOERROR, 0))
1697 return err;
1700 /* Expand each epsilon destination nodes. */
1701 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1702 for (i = 0; i < dfa->edests[node].nelem; ++i)
1704 re_node_set eclosure_elem;
1705 int edest = dfa->edests[node].elems[i];
1706 /* If calculating the epsilon closure of `edest' is in progress,
1707 return intermediate result. */
1708 if (dfa->eclosures[edest].nelem == -1)
1710 incomplete = 1;
1711 continue;
1713 /* If we haven't calculated the epsilon closure of `edest' yet,
1714 calculate now. Otherwise use calculated epsilon closure. */
1715 if (dfa->eclosures[edest].nelem == 0)
1717 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1718 if (BE (err != REG_NOERROR, 0))
1719 return err;
1721 else
1722 eclosure_elem = dfa->eclosures[edest];
1723 /* Merge the epsilon closure of `edest'. */
1724 err = re_node_set_merge (&eclosure, &eclosure_elem);
1725 if (BE (err != REG_NOERROR, 0))
1726 return err;
1727 /* If the epsilon closure of `edest' is incomplete,
1728 the epsilon closure of this node is also incomplete. */
1729 if (dfa->eclosures[edest].nelem == 0)
1731 incomplete = 1;
1732 re_node_set_free (&eclosure_elem);
1736 /* An epsilon closure includes itself. */
1737 ret = re_node_set_insert (&eclosure, node);
1738 if (BE (ret < 0, 0))
1739 return REG_ESPACE;
1740 if (incomplete && !root)
1741 dfa->eclosures[node].nelem = 0;
1742 else
1743 dfa->eclosures[node] = eclosure;
1744 *new_set = eclosure;
1745 return REG_NOERROR;
1748 /* Functions for token which are used in the parser. */
1750 /* Fetch a token from INPUT.
1751 We must not use this function inside bracket expressions. */
1753 static void
1754 internal_function
1755 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1757 re_string_skip_bytes (input, peek_token (result, input, syntax));
1760 /* Peek a token from INPUT, and return the length of the token.
1761 We must not use this function inside bracket expressions. */
1763 static int
1764 internal_function
1765 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1767 unsigned char c;
1769 if (re_string_eoi (input))
1771 token->type = END_OF_RE;
1772 return 0;
1775 c = re_string_peek_byte (input, 0);
1776 token->opr.c = c;
1778 token->word_char = 0;
1779 #ifdef RE_ENABLE_I18N
1780 token->mb_partial = 0;
1781 if (input->mb_cur_max > 1 &&
1782 !re_string_first_byte (input, re_string_cur_idx (input)))
1784 token->type = CHARACTER;
1785 token->mb_partial = 1;
1786 return 1;
1788 #endif
1789 if (c == '\\')
1791 unsigned char c2;
1792 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1794 token->type = BACK_SLASH;
1795 return 1;
1798 c2 = re_string_peek_byte_case (input, 1);
1799 token->opr.c = c2;
1800 token->type = CHARACTER;
1801 #ifdef RE_ENABLE_I18N
1802 if (input->mb_cur_max > 1)
1804 wint_t wc = re_string_wchar_at (input,
1805 re_string_cur_idx (input) + 1);
1806 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1808 else
1809 #endif
1810 token->word_char = IS_WORD_CHAR (c2) != 0;
1812 switch (c2)
1814 case '|':
1815 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1816 token->type = OP_ALT;
1817 break;
1818 case '1': case '2': case '3': case '4': case '5':
1819 case '6': case '7': case '8': case '9':
1820 if (!(syntax & RE_NO_BK_REFS))
1822 token->type = OP_BACK_REF;
1823 token->opr.idx = c2 - '1';
1825 break;
1826 case '<':
1827 if (!(syntax & RE_NO_GNU_OPS))
1829 token->type = ANCHOR;
1830 token->opr.ctx_type = WORD_FIRST;
1832 break;
1833 case '>':
1834 if (!(syntax & RE_NO_GNU_OPS))
1836 token->type = ANCHOR;
1837 token->opr.ctx_type = WORD_LAST;
1839 break;
1840 case 'b':
1841 if (!(syntax & RE_NO_GNU_OPS))
1843 token->type = ANCHOR;
1844 token->opr.ctx_type = WORD_DELIM;
1846 break;
1847 case 'B':
1848 if (!(syntax & RE_NO_GNU_OPS))
1850 token->type = ANCHOR;
1851 token->opr.ctx_type = NOT_WORD_DELIM;
1853 break;
1854 case 'w':
1855 if (!(syntax & RE_NO_GNU_OPS))
1856 token->type = OP_WORD;
1857 break;
1858 case 'W':
1859 if (!(syntax & RE_NO_GNU_OPS))
1860 token->type = OP_NOTWORD;
1861 break;
1862 case 's':
1863 if (!(syntax & RE_NO_GNU_OPS))
1864 token->type = OP_SPACE;
1865 break;
1866 case 'S':
1867 if (!(syntax & RE_NO_GNU_OPS))
1868 token->type = OP_NOTSPACE;
1869 break;
1870 case '`':
1871 if (!(syntax & RE_NO_GNU_OPS))
1873 token->type = ANCHOR;
1874 token->opr.ctx_type = BUF_FIRST;
1876 break;
1877 case '\'':
1878 if (!(syntax & RE_NO_GNU_OPS))
1880 token->type = ANCHOR;
1881 token->opr.ctx_type = BUF_LAST;
1883 break;
1884 case '(':
1885 if (!(syntax & RE_NO_BK_PARENS))
1886 token->type = OP_OPEN_SUBEXP;
1887 break;
1888 case ')':
1889 if (!(syntax & RE_NO_BK_PARENS))
1890 token->type = OP_CLOSE_SUBEXP;
1891 break;
1892 case '+':
1893 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1894 token->type = OP_DUP_PLUS;
1895 break;
1896 case '?':
1897 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1898 token->type = OP_DUP_QUESTION;
1899 break;
1900 case '{':
1901 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1902 token->type = OP_OPEN_DUP_NUM;
1903 break;
1904 case '}':
1905 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1906 token->type = OP_CLOSE_DUP_NUM;
1907 break;
1908 default:
1909 break;
1911 return 2;
1914 token->type = CHARACTER;
1915 #ifdef RE_ENABLE_I18N
1916 if (input->mb_cur_max > 1)
1918 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1919 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1921 else
1922 #endif
1923 token->word_char = IS_WORD_CHAR (token->opr.c);
1925 switch (c)
1927 case '\n':
1928 if (syntax & RE_NEWLINE_ALT)
1929 token->type = OP_ALT;
1930 break;
1931 case '|':
1932 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1933 token->type = OP_ALT;
1934 break;
1935 case '*':
1936 token->type = OP_DUP_ASTERISK;
1937 break;
1938 case '+':
1939 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1940 token->type = OP_DUP_PLUS;
1941 break;
1942 case '?':
1943 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1944 token->type = OP_DUP_QUESTION;
1945 break;
1946 case '{':
1947 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1948 token->type = OP_OPEN_DUP_NUM;
1949 break;
1950 case '}':
1951 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1952 token->type = OP_CLOSE_DUP_NUM;
1953 break;
1954 case '(':
1955 if (syntax & RE_NO_BK_PARENS)
1956 token->type = OP_OPEN_SUBEXP;
1957 break;
1958 case ')':
1959 if (syntax & RE_NO_BK_PARENS)
1960 token->type = OP_CLOSE_SUBEXP;
1961 break;
1962 case '[':
1963 token->type = OP_OPEN_BRACKET;
1964 break;
1965 case '.':
1966 token->type = OP_PERIOD;
1967 break;
1968 case '^':
1969 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1970 re_string_cur_idx (input) != 0)
1972 char prev = re_string_peek_byte (input, -1);
1973 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1974 break;
1976 token->type = ANCHOR;
1977 token->opr.ctx_type = LINE_FIRST;
1978 break;
1979 case '$':
1980 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1981 re_string_cur_idx (input) + 1 != re_string_length (input))
1983 re_token_t next;
1984 re_string_skip_bytes (input, 1);
1985 peek_token (&next, input, syntax);
1986 re_string_skip_bytes (input, -1);
1987 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1988 break;
1990 token->type = ANCHOR;
1991 token->opr.ctx_type = LINE_LAST;
1992 break;
1993 default:
1994 break;
1996 return 1;
1999 /* Peek a token from INPUT, and return the length of the token.
2000 We must not use this function out of bracket expressions. */
2002 static int
2003 internal_function
2004 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2006 unsigned char c;
2007 if (re_string_eoi (input))
2009 token->type = END_OF_RE;
2010 return 0;
2012 c = re_string_peek_byte (input, 0);
2013 token->opr.c = c;
2015 #ifdef RE_ENABLE_I18N
2016 if (input->mb_cur_max > 1 &&
2017 !re_string_first_byte (input, re_string_cur_idx (input)))
2019 token->type = CHARACTER;
2020 return 1;
2022 #endif /* RE_ENABLE_I18N */
2024 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2025 && re_string_cur_idx (input) + 1 < re_string_length (input))
2027 /* In this case, '\' escape a character. */
2028 unsigned char c2;
2029 re_string_skip_bytes (input, 1);
2030 c2 = re_string_peek_byte (input, 0);
2031 token->opr.c = c2;
2032 token->type = CHARACTER;
2033 return 1;
2035 if (c == '[') /* '[' is a special char in a bracket exps. */
2037 unsigned char c2;
2038 int token_len;
2039 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2040 c2 = re_string_peek_byte (input, 1);
2041 else
2042 c2 = 0;
2043 token->opr.c = c2;
2044 token_len = 2;
2045 switch (c2)
2047 case '.':
2048 token->type = OP_OPEN_COLL_ELEM;
2049 break;
2050 case '=':
2051 token->type = OP_OPEN_EQUIV_CLASS;
2052 break;
2053 case ':':
2054 if (syntax & RE_CHAR_CLASSES)
2056 token->type = OP_OPEN_CHAR_CLASS;
2057 break;
2059 /* else fall through. */
2060 default:
2061 token->type = CHARACTER;
2062 token->opr.c = c;
2063 token_len = 1;
2064 break;
2066 return token_len;
2068 switch (c)
2070 case '-':
2071 token->type = OP_CHARSET_RANGE;
2072 break;
2073 case ']':
2074 token->type = OP_CLOSE_BRACKET;
2075 break;
2076 case '^':
2077 token->type = OP_NON_MATCH_LIST;
2078 break;
2079 default:
2080 token->type = CHARACTER;
2082 return 1;
2085 /* Functions for parser. */
2087 /* Entry point of the parser.
2088 Parse the regular expression REGEXP and return the structure tree.
2089 If an error is occured, ERR is set by error code, and return NULL.
2090 This function build the following tree, from regular expression <reg_exp>:
2094 <reg_exp> EOR
2096 CAT means concatenation.
2097 EOR means end of regular expression. */
2099 static bin_tree_t *
2100 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2101 reg_errcode_t *err)
2103 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2104 bin_tree_t *tree, *eor, *root;
2105 re_token_t current_token;
2106 dfa->syntax = syntax;
2107 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2108 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2109 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2110 return NULL;
2111 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2112 if (tree != NULL)
2113 root = create_tree (dfa, tree, eor, CONCAT);
2114 else
2115 root = eor;
2116 if (BE (eor == NULL || root == NULL, 0))
2118 *err = REG_ESPACE;
2119 return NULL;
2121 return root;
2124 /* This function build the following tree, from regular expression
2125 <branch1>|<branch2>:
2129 <branch1> <branch2>
2131 ALT means alternative, which represents the operator `|'. */
2133 static bin_tree_t *
2134 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2135 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2137 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2138 bin_tree_t *tree, *branch = NULL;
2139 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2140 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2141 return NULL;
2143 while (token->type == OP_ALT)
2145 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2146 if (token->type != OP_ALT && token->type != END_OF_RE
2147 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2149 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2150 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2151 return NULL;
2153 else
2154 branch = NULL;
2155 tree = create_tree (dfa, tree, branch, OP_ALT);
2156 if (BE (tree == NULL, 0))
2158 *err = REG_ESPACE;
2159 return NULL;
2162 return tree;
2165 /* This function build the following tree, from regular expression
2166 <exp1><exp2>:
2170 <exp1> <exp2>
2172 CAT means concatenation. */
2174 static bin_tree_t *
2175 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2176 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2178 bin_tree_t *tree, *exp;
2179 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2180 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2181 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2182 return NULL;
2184 while (token->type != OP_ALT && token->type != END_OF_RE
2185 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2187 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2188 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2190 if (tree != NULL)
2191 postorder (tree, free_tree, NULL);
2192 return NULL;
2194 if (tree != NULL && exp != NULL)
2196 bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2197 if (newtree == NULL)
2199 postorder (exp, free_tree, NULL);
2200 postorder (tree, free_tree, NULL);
2201 *err = REG_ESPACE;
2202 return NULL;
2204 tree = newtree;
2206 else if (tree == NULL)
2207 tree = exp;
2208 /* Otherwise exp == NULL, we don't need to create new tree. */
2210 return tree;
2213 /* This function build the following tree, from regular expression a*:
2219 static bin_tree_t *
2220 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2221 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2223 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2224 bin_tree_t *tree;
2225 switch (token->type)
2227 case CHARACTER:
2228 tree = create_token_tree (dfa, NULL, NULL, token);
2229 if (BE (tree == NULL, 0))
2231 *err = REG_ESPACE;
2232 return NULL;
2234 #ifdef RE_ENABLE_I18N
2235 if (dfa->mb_cur_max > 1)
2237 while (!re_string_eoi (regexp)
2238 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2240 bin_tree_t *mbc_remain;
2241 fetch_token (token, regexp, syntax);
2242 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2243 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2244 if (BE (mbc_remain == NULL || tree == NULL, 0))
2246 *err = REG_ESPACE;
2247 return NULL;
2251 #endif
2252 break;
2253 case OP_OPEN_SUBEXP:
2254 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2255 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2256 return NULL;
2257 break;
2258 case OP_OPEN_BRACKET:
2259 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2260 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2261 return NULL;
2262 break;
2263 case OP_BACK_REF:
2264 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2266 *err = REG_ESUBREG;
2267 return NULL;
2269 dfa->used_bkref_map |= 1 << token->opr.idx;
2270 tree = create_token_tree (dfa, NULL, NULL, token);
2271 if (BE (tree == NULL, 0))
2273 *err = REG_ESPACE;
2274 return NULL;
2276 ++dfa->nbackref;
2277 dfa->has_mb_node = 1;
2278 break;
2279 case OP_OPEN_DUP_NUM:
2280 if (syntax & RE_CONTEXT_INVALID_DUP)
2282 *err = REG_BADRPT;
2283 return NULL;
2285 /* FALLTHROUGH */
2286 case OP_DUP_ASTERISK:
2287 case OP_DUP_PLUS:
2288 case OP_DUP_QUESTION:
2289 if (syntax & RE_CONTEXT_INVALID_OPS)
2291 *err = REG_BADRPT;
2292 return NULL;
2294 else if (syntax & RE_CONTEXT_INDEP_OPS)
2296 fetch_token (token, regexp, syntax);
2297 return parse_expression (regexp, preg, token, syntax, nest, err);
2299 /* else fall through */
2300 case OP_CLOSE_SUBEXP:
2301 if ((token->type == OP_CLOSE_SUBEXP) &&
2302 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2304 *err = REG_ERPAREN;
2305 return NULL;
2307 /* else fall through */
2308 case OP_CLOSE_DUP_NUM:
2309 /* We treat it as a normal character. */
2311 /* Then we can these characters as normal characters. */
2312 token->type = CHARACTER;
2313 /* mb_partial and word_char bits should be initialized already
2314 by peek_token. */
2315 tree = create_token_tree (dfa, NULL, NULL, token);
2316 if (BE (tree == NULL, 0))
2318 *err = REG_ESPACE;
2319 return NULL;
2321 break;
2322 case ANCHOR:
2323 if ((token->opr.ctx_type
2324 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2325 && dfa->word_ops_used == 0)
2326 init_word_char (dfa);
2327 if (token->opr.ctx_type == WORD_DELIM
2328 || token->opr.ctx_type == NOT_WORD_DELIM)
2330 bin_tree_t *tree_first, *tree_last;
2331 if (token->opr.ctx_type == WORD_DELIM)
2333 token->opr.ctx_type = WORD_FIRST;
2334 tree_first = create_token_tree (dfa, NULL, NULL, token);
2335 token->opr.ctx_type = WORD_LAST;
2337 else
2339 token->opr.ctx_type = INSIDE_WORD;
2340 tree_first = create_token_tree (dfa, NULL, NULL, token);
2341 token->opr.ctx_type = INSIDE_NOTWORD;
2343 tree_last = create_token_tree (dfa, NULL, NULL, token);
2344 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2345 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2347 *err = REG_ESPACE;
2348 return NULL;
2351 else
2353 tree = create_token_tree (dfa, NULL, NULL, token);
2354 if (BE (tree == NULL, 0))
2356 *err = REG_ESPACE;
2357 return NULL;
2360 /* We must return here, since ANCHORs can't be followed
2361 by repetition operators.
2362 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2363 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2364 fetch_token (token, regexp, syntax);
2365 return tree;
2366 case OP_PERIOD:
2367 tree = create_token_tree (dfa, NULL, NULL, token);
2368 if (BE (tree == NULL, 0))
2370 *err = REG_ESPACE;
2371 return NULL;
2373 if (dfa->mb_cur_max > 1)
2374 dfa->has_mb_node = 1;
2375 break;
2376 case OP_WORD:
2377 case OP_NOTWORD:
2378 tree = build_charclass_op (dfa, regexp->trans,
2379 (const unsigned char *) "alnum",
2380 (const unsigned char *) "_",
2381 token->type == OP_NOTWORD, err);
2382 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2383 return NULL;
2384 break;
2385 case OP_SPACE:
2386 case OP_NOTSPACE:
2387 tree = build_charclass_op (dfa, regexp->trans,
2388 (const unsigned char *) "space",
2389 (const unsigned char *) "",
2390 token->type == OP_NOTSPACE, err);
2391 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2392 return NULL;
2393 break;
2394 case OP_ALT:
2395 case END_OF_RE:
2396 return NULL;
2397 case BACK_SLASH:
2398 *err = REG_EESCAPE;
2399 return NULL;
2400 default:
2401 /* Must not happen? */
2402 #ifdef DEBUG
2403 assert (0);
2404 #endif
2405 return NULL;
2407 fetch_token (token, regexp, syntax);
2409 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2410 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2412 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2413 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2414 return NULL;
2415 /* In BRE consecutive duplications are not allowed. */
2416 if ((syntax & RE_CONTEXT_INVALID_DUP)
2417 && (token->type == OP_DUP_ASTERISK
2418 || token->type == OP_OPEN_DUP_NUM))
2420 *err = REG_BADRPT;
2421 return NULL;
2425 return tree;
2428 /* This function build the following tree, from regular expression
2429 (<reg_exp>):
2430 SUBEXP
2432 <reg_exp>
2435 static bin_tree_t *
2436 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2437 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2439 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2440 bin_tree_t *tree;
2441 size_t cur_nsub;
2442 cur_nsub = preg->re_nsub++;
2444 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2446 /* The subexpression may be a null string. */
2447 if (token->type == OP_CLOSE_SUBEXP)
2448 tree = NULL;
2449 else
2451 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2452 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2454 if (tree != NULL)
2455 postorder (tree, free_tree, NULL);
2456 *err = REG_EPAREN;
2458 if (BE (*err != REG_NOERROR, 0))
2459 return NULL;
2462 if (cur_nsub <= '9' - '1')
2463 dfa->completed_bkref_map |= 1 << cur_nsub;
2465 tree = create_tree (dfa, tree, NULL, SUBEXP);
2466 if (BE (tree == NULL, 0))
2468 *err = REG_ESPACE;
2469 return NULL;
2471 tree->token.opr.idx = cur_nsub;
2472 return tree;
2475 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2477 static bin_tree_t *
2478 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2479 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2481 bin_tree_t *tree = NULL, *old_tree = NULL;
2482 int i, start, end, start_idx = re_string_cur_idx (regexp);
2483 re_token_t start_token = *token;
2485 if (token->type == OP_OPEN_DUP_NUM)
2487 end = 0;
2488 start = fetch_number (regexp, token, syntax);
2489 if (start == -1)
2491 if (token->type == CHARACTER && token->opr.c == ',')
2492 start = 0; /* We treat "{,m}" as "{0,m}". */
2493 else
2495 *err = REG_BADBR; /* <re>{} is invalid. */
2496 return NULL;
2499 if (BE (start != -2, 1))
2501 /* We treat "{n}" as "{n,n}". */
2502 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2503 : ((token->type == CHARACTER && token->opr.c == ',')
2504 ? fetch_number (regexp, token, syntax) : -2));
2506 if (BE (start == -2 || end == -2, 0))
2508 /* Invalid sequence. */
2509 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2511 if (token->type == END_OF_RE)
2512 *err = REG_EBRACE;
2513 else
2514 *err = REG_BADBR;
2516 return NULL;
2519 /* If the syntax bit is set, rollback. */
2520 re_string_set_index (regexp, start_idx);
2521 *token = start_token;
2522 token->type = CHARACTER;
2523 /* mb_partial and word_char bits should be already initialized by
2524 peek_token. */
2525 return elem;
2528 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2530 /* First number greater than second. */
2531 *err = REG_BADBR;
2532 return NULL;
2535 else
2537 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2538 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2541 fetch_token (token, regexp, syntax);
2543 if (BE (elem == NULL, 0))
2544 return NULL;
2545 if (BE (start == 0 && end == 0, 0))
2547 postorder (elem, free_tree, NULL);
2548 return NULL;
2551 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2552 if (BE (start > 0, 0))
2554 tree = elem;
2555 for (i = 2; i <= start; ++i)
2557 elem = duplicate_tree (elem, dfa);
2558 tree = create_tree (dfa, tree, elem, CONCAT);
2559 if (BE (elem == NULL || tree == NULL, 0))
2560 goto parse_dup_op_espace;
2563 if (start == end)
2564 return tree;
2566 /* Duplicate ELEM before it is marked optional. */
2567 elem = duplicate_tree (elem, dfa);
2568 old_tree = tree;
2570 else
2571 old_tree = NULL;
2573 if (elem->token.type == SUBEXP)
2574 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2576 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2577 if (BE (tree == NULL, 0))
2578 goto parse_dup_op_espace;
2580 /* This loop is actually executed only when end != -1,
2581 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2582 already created the start+1-th copy. */
2583 for (i = start + 2; i <= end; ++i)
2585 elem = duplicate_tree (elem, dfa);
2586 tree = create_tree (dfa, tree, elem, CONCAT);
2587 if (BE (elem == NULL || tree == NULL, 0))
2588 goto parse_dup_op_espace;
2590 tree = create_tree (dfa, tree, NULL, OP_ALT);
2591 if (BE (tree == NULL, 0))
2592 goto parse_dup_op_espace;
2595 if (old_tree)
2596 tree = create_tree (dfa, old_tree, tree, CONCAT);
2598 return tree;
2600 parse_dup_op_espace:
2601 *err = REG_ESPACE;
2602 return NULL;
2605 /* Size of the names for collating symbol/equivalence_class/character_class.
2606 I'm not sure, but maybe enough. */
2607 #define BRACKET_NAME_BUF_SIZE 32
2609 #ifndef _LIBC
2610 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2611 Build the range expression which starts from START_ELEM, and ends
2612 at END_ELEM. The result are written to MBCSET and SBCSET.
2613 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2614 mbcset->range_ends, is a pointer argument sinse we may
2615 update it. */
2617 static reg_errcode_t
2618 internal_function
2619 # ifdef RE_ENABLE_I18N
2620 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2621 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2622 # else /* not RE_ENABLE_I18N */
2623 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2624 bracket_elem_t *end_elem)
2625 # endif /* not RE_ENABLE_I18N */
2627 unsigned int start_ch, end_ch;
2628 /* Equivalence Classes and Character Classes can't be a range start/end. */
2629 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2630 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2632 return REG_ERANGE;
2634 /* We can handle no multi character collating elements without libc
2635 support. */
2636 if (BE ((start_elem->type == COLL_SYM
2637 && strlen ((char *) start_elem->opr.name) > 1)
2638 || (end_elem->type == COLL_SYM
2639 && strlen ((char *) end_elem->opr.name) > 1), 0))
2640 return REG_ECOLLATE;
2642 # ifdef RE_ENABLE_I18N
2644 wchar_t wc;
2645 wint_t start_wc;
2646 wint_t end_wc;
2647 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2649 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2650 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2651 : 0));
2652 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2653 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2654 : 0));
2655 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2656 ? __btowc (start_ch) : start_elem->opr.wch);
2657 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2658 ? __btowc (end_ch) : end_elem->opr.wch);
2659 if (start_wc == WEOF || end_wc == WEOF)
2660 return REG_ECOLLATE;
2661 cmp_buf[0] = start_wc;
2662 cmp_buf[4] = end_wc;
2663 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2664 return REG_ERANGE;
2666 /* Got valid collation sequence values, add them as a new entry.
2667 However, for !_LIBC we have no collation elements: if the
2668 character set is single byte, the single byte character set
2669 that we build below suffices. parse_bracket_exp passes
2670 no MBCSET if dfa->mb_cur_max == 1. */
2671 if (mbcset)
2673 /* Check the space of the arrays. */
2674 if (BE (*range_alloc == mbcset->nranges, 0))
2676 /* There is not enough space, need realloc. */
2677 wchar_t *new_array_start, *new_array_end;
2678 int new_nranges;
2680 /* +1 in case of mbcset->nranges is 0. */
2681 new_nranges = 2 * mbcset->nranges + 1;
2682 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2683 are NULL if *range_alloc == 0. */
2684 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2685 new_nranges);
2686 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2687 new_nranges);
2689 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2690 return REG_ESPACE;
2692 mbcset->range_starts = new_array_start;
2693 mbcset->range_ends = new_array_end;
2694 *range_alloc = new_nranges;
2697 mbcset->range_starts[mbcset->nranges] = start_wc;
2698 mbcset->range_ends[mbcset->nranges++] = end_wc;
2701 /* Build the table for single byte characters. */
2702 for (wc = 0; wc < SBC_MAX; ++wc)
2704 cmp_buf[2] = wc;
2705 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2706 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2707 bitset_set (sbcset, wc);
2710 # else /* not RE_ENABLE_I18N */
2712 unsigned int ch;
2713 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2714 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2715 : 0));
2716 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2717 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2718 : 0));
2719 if (start_ch > end_ch)
2720 return REG_ERANGE;
2721 /* Build the table for single byte characters. */
2722 for (ch = 0; ch < SBC_MAX; ++ch)
2723 if (start_ch <= ch && ch <= end_ch)
2724 bitset_set (sbcset, ch);
2726 # endif /* not RE_ENABLE_I18N */
2727 return REG_NOERROR;
2729 #endif /* not _LIBC */
2731 #ifndef _LIBC
2732 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2733 Build the collating element which is represented by NAME.
2734 The result are written to MBCSET and SBCSET.
2735 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2736 pointer argument since we may update it. */
2738 static reg_errcode_t
2739 internal_function
2740 # ifdef RE_ENABLE_I18N
2741 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2742 int *coll_sym_alloc, const unsigned char *name)
2743 # else /* not RE_ENABLE_I18N */
2744 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2745 # endif /* not RE_ENABLE_I18N */
2747 size_t name_len = strlen ((const char *) name);
2748 if (BE (name_len != 1, 0))
2749 return REG_ECOLLATE;
2750 else
2752 bitset_set (sbcset, name[0]);
2753 return REG_NOERROR;
2756 #endif /* not _LIBC */
2758 /* This function parse bracket expression like "[abc]", "[a-c]",
2759 "[[.a-a.]]" etc. */
2761 static bin_tree_t *
2762 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2763 reg_syntax_t syntax, reg_errcode_t *err)
2765 #ifdef _LIBC
2766 const unsigned char *collseqmb;
2767 const char *collseqwc;
2768 uint32_t nrules;
2769 int32_t table_size;
2770 const int32_t *symb_table;
2771 const unsigned char *extra;
2773 /* Local function for parse_bracket_exp used in _LIBC environement.
2774 Seek the collating symbol entry correspondings to NAME.
2775 Return the index of the symbol in the SYMB_TABLE. */
2777 auto inline int32_t
2778 __attribute ((always_inline))
2779 seek_collating_symbol_entry (name, name_len)
2780 const unsigned char *name;
2781 size_t name_len;
2783 int32_t hash = elem_hash ((const char *) name, name_len);
2784 int32_t elem = hash % table_size;
2785 if (symb_table[2 * elem] != 0)
2787 int32_t second = hash % (table_size - 2) + 1;
2791 /* First compare the hashing value. */
2792 if (symb_table[2 * elem] == hash
2793 /* Compare the length of the name. */
2794 && name_len == extra[symb_table[2 * elem + 1]]
2795 /* Compare the name. */
2796 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2797 name_len) == 0)
2799 /* Yep, this is the entry. */
2800 break;
2803 /* Next entry. */
2804 elem += second;
2806 while (symb_table[2 * elem] != 0);
2808 return elem;
2811 /* Local function for parse_bracket_exp used in _LIBC environment.
2812 Look up the collation sequence value of BR_ELEM.
2813 Return the value if succeeded, UINT_MAX otherwise. */
2815 auto inline unsigned int
2816 __attribute ((always_inline))
2817 lookup_collation_sequence_value (br_elem)
2818 bracket_elem_t *br_elem;
2820 if (br_elem->type == SB_CHAR)
2823 if (MB_CUR_MAX == 1)
2825 if (nrules == 0)
2826 return collseqmb[br_elem->opr.ch];
2827 else
2829 wint_t wc = __btowc (br_elem->opr.ch);
2830 return __collseq_table_lookup (collseqwc, wc);
2833 else if (br_elem->type == MB_CHAR)
2835 if (nrules != 0)
2836 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2838 else if (br_elem->type == COLL_SYM)
2840 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2841 if (nrules != 0)
2843 int32_t elem, idx;
2844 elem = seek_collating_symbol_entry (br_elem->opr.name,
2845 sym_name_len);
2846 if (symb_table[2 * elem] != 0)
2848 /* We found the entry. */
2849 idx = symb_table[2 * elem + 1];
2850 /* Skip the name of collating element name. */
2851 idx += 1 + extra[idx];
2852 /* Skip the byte sequence of the collating element. */
2853 idx += 1 + extra[idx];
2854 /* Adjust for the alignment. */
2855 idx = (idx + 3) & ~3;
2856 /* Skip the multibyte collation sequence value. */
2857 idx += sizeof (unsigned int);
2858 /* Skip the wide char sequence of the collating element. */
2859 idx += sizeof (unsigned int) *
2860 (1 + *(unsigned int *) (extra + idx));
2861 /* Return the collation sequence value. */
2862 return *(unsigned int *) (extra + idx);
2864 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2866 /* No valid character. Match it as a single byte
2867 character. */
2868 return collseqmb[br_elem->opr.name[0]];
2871 else if (sym_name_len == 1)
2872 return collseqmb[br_elem->opr.name[0]];
2874 return UINT_MAX;
2877 /* Local function for parse_bracket_exp used in _LIBC environement.
2878 Build the range expression which starts from START_ELEM, and ends
2879 at END_ELEM. The result are written to MBCSET and SBCSET.
2880 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2881 mbcset->range_ends, is a pointer argument sinse we may
2882 update it. */
2884 auto inline reg_errcode_t
2885 __attribute ((always_inline))
2886 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2887 re_charset_t *mbcset;
2888 int *range_alloc;
2889 bitset_t sbcset;
2890 bracket_elem_t *start_elem, *end_elem;
2892 unsigned int ch;
2893 uint32_t start_collseq;
2894 uint32_t end_collseq;
2896 /* Equivalence Classes and Character Classes can't be a range
2897 start/end. */
2898 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2899 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2901 return REG_ERANGE;
2903 start_collseq = lookup_collation_sequence_value (start_elem);
2904 end_collseq = lookup_collation_sequence_value (end_elem);
2905 /* Check start/end collation sequence values. */
2906 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2907 return REG_ECOLLATE;
2908 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2909 return REG_ERANGE;
2911 /* Got valid collation sequence values, add them as a new entry.
2912 However, if we have no collation elements, and the character set
2913 is single byte, the single byte character set that we
2914 build below suffices. */
2915 if (nrules > 0 || dfa->mb_cur_max > 1)
2917 /* Check the space of the arrays. */
2918 if (BE (*range_alloc == mbcset->nranges, 0))
2920 /* There is not enough space, need realloc. */
2921 uint32_t *new_array_start;
2922 uint32_t *new_array_end;
2923 int new_nranges;
2925 /* +1 in case of mbcset->nranges is 0. */
2926 new_nranges = 2 * mbcset->nranges + 1;
2927 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2928 new_nranges);
2929 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2930 new_nranges);
2932 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2933 return REG_ESPACE;
2935 mbcset->range_starts = new_array_start;
2936 mbcset->range_ends = new_array_end;
2937 *range_alloc = new_nranges;
2940 mbcset->range_starts[mbcset->nranges] = start_collseq;
2941 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2944 /* Build the table for single byte characters. */
2945 for (ch = 0; ch < SBC_MAX; ch++)
2947 uint32_t ch_collseq;
2949 if (MB_CUR_MAX == 1)
2951 if (nrules == 0)
2952 ch_collseq = collseqmb[ch];
2953 else
2954 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2955 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2956 bitset_set (sbcset, ch);
2958 return REG_NOERROR;
2961 /* Local function for parse_bracket_exp used in _LIBC environement.
2962 Build the collating element which is represented by NAME.
2963 The result are written to MBCSET and SBCSET.
2964 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2965 pointer argument sinse we may update it. */
2967 auto inline reg_errcode_t
2968 __attribute ((always_inline))
2969 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2970 re_charset_t *mbcset;
2971 int *coll_sym_alloc;
2972 bitset_t sbcset;
2973 const unsigned char *name;
2975 int32_t elem, idx;
2976 size_t name_len = strlen ((const char *) name);
2977 if (nrules != 0)
2979 elem = seek_collating_symbol_entry (name, name_len);
2980 if (symb_table[2 * elem] != 0)
2982 /* We found the entry. */
2983 idx = symb_table[2 * elem + 1];
2984 /* Skip the name of collating element name. */
2985 idx += 1 + extra[idx];
2987 else if (symb_table[2 * elem] == 0 && name_len == 1)
2989 /* No valid character, treat it as a normal
2990 character. */
2991 bitset_set (sbcset, name[0]);
2992 return REG_NOERROR;
2994 else
2995 return REG_ECOLLATE;
2997 /* Got valid collation sequence, add it as a new entry. */
2998 /* Check the space of the arrays. */
2999 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3001 /* Not enough, realloc it. */
3002 /* +1 in case of mbcset->ncoll_syms is 0. */
3003 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3004 /* Use realloc since mbcset->coll_syms is NULL
3005 if *alloc == 0. */
3006 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3007 new_coll_sym_alloc);
3008 if (BE (new_coll_syms == NULL, 0))
3009 return REG_ESPACE;
3010 mbcset->coll_syms = new_coll_syms;
3011 *coll_sym_alloc = new_coll_sym_alloc;
3013 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3014 return REG_NOERROR;
3016 else
3018 if (BE (name_len != 1, 0))
3019 return REG_ECOLLATE;
3020 else
3022 bitset_set (sbcset, name[0]);
3023 return REG_NOERROR;
3027 #endif
3029 re_token_t br_token;
3030 re_bitset_ptr_t sbcset;
3031 #ifdef RE_ENABLE_I18N
3032 re_charset_t *mbcset;
3033 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3034 int equiv_class_alloc = 0, char_class_alloc = 0;
3035 #endif /* not RE_ENABLE_I18N */
3036 int non_match = 0;
3037 bin_tree_t *work_tree;
3038 int token_len;
3039 int first_round = 1;
3040 #ifdef _LIBC
3041 collseqmb = (const unsigned char *)
3042 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3043 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3044 if (nrules)
3047 if (MB_CUR_MAX > 1)
3049 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3050 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3051 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3052 _NL_COLLATE_SYMB_TABLEMB);
3053 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3054 _NL_COLLATE_SYMB_EXTRAMB);
3056 #endif
3057 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3058 #ifdef RE_ENABLE_I18N
3059 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3060 #endif /* RE_ENABLE_I18N */
3061 #ifdef RE_ENABLE_I18N
3062 if (BE (sbcset == NULL || mbcset == NULL, 0))
3063 #else
3064 if (BE (sbcset == NULL, 0))
3065 #endif /* RE_ENABLE_I18N */
3067 re_free (sbcset);
3068 #ifdef RE_ENABLE_I18N
3069 re_free (mbcset);
3070 #endif
3071 *err = REG_ESPACE;
3072 return NULL;
3075 token_len = peek_token_bracket (token, regexp, syntax);
3076 if (BE (token->type == END_OF_RE, 0))
3078 *err = REG_BADPAT;
3079 goto parse_bracket_exp_free_return;
3081 if (token->type == OP_NON_MATCH_LIST)
3083 #ifdef RE_ENABLE_I18N
3084 mbcset->non_match = 1;
3085 #endif /* not RE_ENABLE_I18N */
3086 non_match = 1;
3087 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3088 bitset_set (sbcset, '\n');
3089 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3090 token_len = peek_token_bracket (token, regexp, syntax);
3091 if (BE (token->type == END_OF_RE, 0))
3093 *err = REG_BADPAT;
3094 goto parse_bracket_exp_free_return;
3098 /* We treat the first ']' as a normal character. */
3099 if (token->type == OP_CLOSE_BRACKET)
3100 token->type = CHARACTER;
3102 while (1)
3104 bracket_elem_t start_elem, end_elem;
3105 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3106 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3107 reg_errcode_t ret;
3108 int token_len2 = 0, is_range_exp = 0;
3109 re_token_t token2;
3111 start_elem.opr.name = start_name_buf;
3112 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3113 syntax, first_round);
3114 if (BE (ret != REG_NOERROR, 0))
3116 *err = ret;
3117 goto parse_bracket_exp_free_return;
3119 first_round = 0;
3121 /* Get information about the next token. We need it in any case. */
3122 token_len = peek_token_bracket (token, regexp, syntax);
3124 /* Do not check for ranges if we know they are not allowed. */
3125 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3127 if (BE (token->type == END_OF_RE, 0))
3129 *err = REG_EBRACK;
3130 goto parse_bracket_exp_free_return;
3132 if (token->type == OP_CHARSET_RANGE)
3134 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3135 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3136 if (BE (token2.type == END_OF_RE, 0))
3138 *err = REG_EBRACK;
3139 goto parse_bracket_exp_free_return;
3141 if (token2.type == OP_CLOSE_BRACKET)
3143 /* We treat the last '-' as a normal character. */
3144 re_string_skip_bytes (regexp, -token_len);
3145 token->type = CHARACTER;
3147 else
3148 is_range_exp = 1;
3152 if (is_range_exp == 1)
3154 end_elem.opr.name = end_name_buf;
3155 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3156 dfa, syntax, 1);
3157 if (BE (ret != REG_NOERROR, 0))
3159 *err = ret;
3160 goto parse_bracket_exp_free_return;
3163 token_len = peek_token_bracket (token, regexp, syntax);
3165 #ifdef _LIBC
3166 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3167 &start_elem, &end_elem);
3168 #else
3169 # ifdef RE_ENABLE_I18N
3170 *err = build_range_exp (sbcset,
3171 dfa->mb_cur_max > 1 ? mbcset : NULL,
3172 &range_alloc, &start_elem, &end_elem);
3173 # else
3174 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3175 # endif
3176 #endif /* RE_ENABLE_I18N */
3177 if (BE (*err != REG_NOERROR, 0))
3178 goto parse_bracket_exp_free_return;
3180 else
3182 switch (start_elem.type)
3184 case SB_CHAR:
3185 bitset_set (sbcset, start_elem.opr.ch);
3186 break;
3187 #ifdef RE_ENABLE_I18N
3188 case MB_CHAR:
3189 /* Check whether the array has enough space. */
3190 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3192 wchar_t *new_mbchars;
3193 /* Not enough, realloc it. */
3194 /* +1 in case of mbcset->nmbchars is 0. */
3195 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3196 /* Use realloc since array is NULL if *alloc == 0. */
3197 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3198 mbchar_alloc);
3199 if (BE (new_mbchars == NULL, 0))
3200 goto parse_bracket_exp_espace;
3201 mbcset->mbchars = new_mbchars;
3203 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3204 break;
3205 #endif /* RE_ENABLE_I18N */
3206 case EQUIV_CLASS:
3207 *err = build_equiv_class (sbcset,
3208 #ifdef RE_ENABLE_I18N
3209 mbcset, &equiv_class_alloc,
3210 #endif /* RE_ENABLE_I18N */
3211 start_elem.opr.name);
3212 if (BE (*err != REG_NOERROR, 0))
3213 goto parse_bracket_exp_free_return;
3214 break;
3215 case COLL_SYM:
3216 *err = build_collating_symbol (sbcset,
3217 #ifdef RE_ENABLE_I18N
3218 mbcset, &coll_sym_alloc,
3219 #endif /* RE_ENABLE_I18N */
3220 start_elem.opr.name);
3221 if (BE (*err != REG_NOERROR, 0))
3222 goto parse_bracket_exp_free_return;
3223 break;
3224 case CHAR_CLASS:
3225 *err = build_charclass (regexp->trans, sbcset,
3226 #ifdef RE_ENABLE_I18N
3227 mbcset, &char_class_alloc,
3228 #endif /* RE_ENABLE_I18N */
3229 start_elem.opr.name, syntax);
3230 if (BE (*err != REG_NOERROR, 0))
3231 goto parse_bracket_exp_free_return;
3232 break;
3233 default:
3234 assert (0);
3235 break;
3238 if (BE (token->type == END_OF_RE, 0))
3240 *err = REG_EBRACK;
3241 goto parse_bracket_exp_free_return;
3243 if (token->type == OP_CLOSE_BRACKET)
3244 break;
3247 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3249 /* If it is non-matching list. */
3250 if (non_match)
3251 bitset_not (sbcset);
3253 #ifdef RE_ENABLE_I18N
3254 /* Ensure only single byte characters are set. */
3255 if (dfa->mb_cur_max > 1)
3256 bitset_mask (sbcset, dfa->sb_char);
3258 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3259 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3260 || mbcset->non_match)))
3262 bin_tree_t *mbc_tree;
3263 int sbc_idx;
3264 /* Build a tree for complex bracket. */
3265 dfa->has_mb_node = 1;
3266 br_token.type = COMPLEX_BRACKET;
3267 br_token.opr.mbcset = mbcset;
3268 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3269 if (BE (mbc_tree == NULL, 0))
3270 goto parse_bracket_exp_espace;
3271 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3272 if (sbcset[sbc_idx])
3273 break;
3274 /* If there are no bits set in sbcset, there is no point
3275 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3276 if (sbc_idx < BITSET_WORDS)
3278 /* Build a tree for simple bracket. */
3279 br_token.type = SIMPLE_BRACKET;
3280 br_token.opr.sbcset = sbcset;
3281 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3282 if (BE (work_tree == NULL, 0))
3283 goto parse_bracket_exp_espace;
3285 /* Then join them by ALT node. */
3286 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3287 if (BE (work_tree == NULL, 0))
3288 goto parse_bracket_exp_espace;
3290 else
3292 re_free (sbcset);
3293 work_tree = mbc_tree;
3296 else
3297 #endif /* not RE_ENABLE_I18N */
3299 #ifdef RE_ENABLE_I18N
3300 free_charset (mbcset);
3301 #endif
3302 /* Build a tree for simple bracket. */
3303 br_token.type = SIMPLE_BRACKET;
3304 br_token.opr.sbcset = sbcset;
3305 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3306 if (BE (work_tree == NULL, 0))
3307 goto parse_bracket_exp_espace;
3309 return work_tree;
3311 parse_bracket_exp_espace:
3312 *err = REG_ESPACE;
3313 parse_bracket_exp_free_return:
3314 re_free (sbcset);
3315 #ifdef RE_ENABLE_I18N
3316 free_charset (mbcset);
3317 #endif /* RE_ENABLE_I18N */
3318 return NULL;
3321 /* Parse an element in the bracket expression. */
3323 static reg_errcode_t
3324 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3325 re_token_t *token, int token_len, re_dfa_t *dfa,
3326 reg_syntax_t syntax, int accept_hyphen)
3328 #ifdef RE_ENABLE_I18N
3329 int cur_char_size;
3330 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3331 if (cur_char_size > 1)
3333 elem->type = MB_CHAR;
3334 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3335 re_string_skip_bytes (regexp, cur_char_size);
3336 return REG_NOERROR;
3338 #endif /* RE_ENABLE_I18N */
3339 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3340 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3341 || token->type == OP_OPEN_EQUIV_CLASS)
3342 return parse_bracket_symbol (elem, regexp, token);
3343 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3345 /* A '-' must only appear as anything but a range indicator before
3346 the closing bracket. Everything else is an error. */
3347 re_token_t token2;
3348 (void) peek_token_bracket (&token2, regexp, syntax);
3349 if (token2.type != OP_CLOSE_BRACKET)
3350 /* The actual error value is not standardized since this whole
3351 case is undefined. But ERANGE makes good sense. */
3352 return REG_ERANGE;
3354 elem->type = SB_CHAR;
3355 elem->opr.ch = token->opr.c;
3356 return REG_NOERROR;
3359 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3360 such as [:<character_class>:], [.<collating_element>.], and
3361 [=<equivalent_class>=]. */
3363 static reg_errcode_t
3364 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3365 re_token_t *token)
3367 unsigned char ch, delim = token->opr.c;
3368 int i = 0;
3369 if (re_string_eoi(regexp))
3370 return REG_EBRACK;
3371 for (;; ++i)
3373 if (i >= BRACKET_NAME_BUF_SIZE)
3374 return REG_EBRACK;
3375 if (token->type == OP_OPEN_CHAR_CLASS)
3376 ch = re_string_fetch_byte_case (regexp);
3377 else
3378 ch = re_string_fetch_byte (regexp);
3379 if (re_string_eoi(regexp))
3380 return REG_EBRACK;
3381 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3382 break;
3383 elem->opr.name[i] = ch;
3385 re_string_skip_bytes (regexp, 1);
3386 elem->opr.name[i] = '\0';
3387 switch (token->type)
3389 case OP_OPEN_COLL_ELEM:
3390 elem->type = COLL_SYM;
3391 break;
3392 case OP_OPEN_EQUIV_CLASS:
3393 elem->type = EQUIV_CLASS;
3394 break;
3395 case OP_OPEN_CHAR_CLASS:
3396 elem->type = CHAR_CLASS;
3397 break;
3398 default:
3399 break;
3401 return REG_NOERROR;
3404 /* Helper function for parse_bracket_exp.
3405 Build the equivalence class which is represented by NAME.
3406 The result are written to MBCSET and SBCSET.
3407 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3408 is a pointer argument sinse we may update it. */
3410 static reg_errcode_t
3411 #ifdef RE_ENABLE_I18N
3412 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3413 int *equiv_class_alloc, const unsigned char *name)
3414 #else /* not RE_ENABLE_I18N */
3415 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3416 #endif /* not RE_ENABLE_I18N */
3418 #ifdef _LIBC
3419 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3420 if (nrules != 0)
3422 const int32_t *table, *indirect;
3423 const unsigned char *weights, *extra, *cp;
3424 unsigned char char_buf[2];
3425 int32_t idx1, idx2;
3426 unsigned int ch;
3427 size_t len;
3428 /* This #include defines a local function! */
3429 # include <locale/weight.h>
3430 /* Calculate the index for equivalence class. */
3431 cp = name;
3432 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3433 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3434 _NL_COLLATE_WEIGHTMB);
3435 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3436 _NL_COLLATE_EXTRAMB);
3437 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3438 _NL_COLLATE_INDIRECTMB);
3439 idx1 = findidx (&cp, -1);
3440 if (BE (idx1 == 0 || *cp != '\0', 0))
3441 /* This isn't a valid character. */
3442 return REG_ECOLLATE;
3444 /* Build single byte matcing table for this equivalence class. */
3445 len = weights[idx1 & 0xffffff];
3446 for (ch = 0; ch < SBC_MAX; ++ch)
3448 char_buf[0] = ch;
3449 cp = char_buf;
3450 idx2 = findidx (&cp, 1);
3452 idx2 = table[ch];
3454 if (idx2 == 0)
3455 /* This isn't a valid character. */
3456 continue;
3457 /* Compare only if the length matches and the collation rule
3458 index is the same. */
3459 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3461 int cnt = 0;
3463 while (cnt <= len &&
3464 weights[(idx1 & 0xffffff) + 1 + cnt]
3465 == weights[(idx2 & 0xffffff) + 1 + cnt])
3466 ++cnt;
3468 if (cnt > len)
3469 bitset_set (sbcset, ch);
3472 /* Check whether the array has enough space. */
3473 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3475 /* Not enough, realloc it. */
3476 /* +1 in case of mbcset->nequiv_classes is 0. */
3477 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3478 /* Use realloc since the array is NULL if *alloc == 0. */
3479 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3480 int32_t,
3481 new_equiv_class_alloc);
3482 if (BE (new_equiv_classes == NULL, 0))
3483 return REG_ESPACE;
3484 mbcset->equiv_classes = new_equiv_classes;
3485 *equiv_class_alloc = new_equiv_class_alloc;
3487 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3489 else
3490 #endif /* _LIBC */
3492 if (BE (strlen ((const char *) name) != 1, 0))
3493 return REG_ECOLLATE;
3494 bitset_set (sbcset, *name);
3496 return REG_NOERROR;
3499 /* Helper function for parse_bracket_exp.
3500 Build the character class which is represented by NAME.
3501 The result are written to MBCSET and SBCSET.
3502 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3503 is a pointer argument sinse we may update it. */
3505 static reg_errcode_t
3506 #ifdef RE_ENABLE_I18N
3507 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3508 re_charset_t *mbcset, int *char_class_alloc,
3509 const unsigned char *class_name, reg_syntax_t syntax)
3510 #else /* not RE_ENABLE_I18N */
3511 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3512 const unsigned char *class_name, reg_syntax_t syntax)
3513 #endif /* not RE_ENABLE_I18N */
3515 int i;
3516 const char *name = (const char *) class_name;
3518 /* In case of REG_ICASE "upper" and "lower" match the both of
3519 upper and lower cases. */
3520 if ((syntax & RE_ICASE)
3521 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3522 name = "alpha";
3524 #ifdef RE_ENABLE_I18N
3525 /* Check the space of the arrays. */
3526 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3528 /* Not enough, realloc it. */
3529 /* +1 in case of mbcset->nchar_classes is 0. */
3530 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3531 /* Use realloc since array is NULL if *alloc == 0. */
3532 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3533 new_char_class_alloc);
3534 if (BE (new_char_classes == NULL, 0))
3535 return REG_ESPACE;
3536 mbcset->char_classes = new_char_classes;
3537 *char_class_alloc = new_char_class_alloc;
3539 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3540 #endif /* RE_ENABLE_I18N */
3542 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3543 do { \
3544 if (BE (trans != NULL, 0)) \
3546 for (i = 0; i < SBC_MAX; ++i) \
3547 if (ctype_func (i)) \
3548 bitset_set (sbcset, trans[i]); \
3550 else \
3552 for (i = 0; i < SBC_MAX; ++i) \
3553 if (ctype_func (i)) \
3554 bitset_set (sbcset, i); \
3556 } while (0)
3558 if (strcmp (name, "alnum") == 0)
3559 BUILD_CHARCLASS_LOOP (isalnum);
3560 else if (strcmp (name, "cntrl") == 0)
3561 BUILD_CHARCLASS_LOOP (iscntrl);
3562 else if (strcmp (name, "lower") == 0)
3563 BUILD_CHARCLASS_LOOP (islower);
3564 else if (strcmp (name, "space") == 0)
3565 BUILD_CHARCLASS_LOOP (isspace);
3566 else if (strcmp (name, "alpha") == 0)
3567 BUILD_CHARCLASS_LOOP (isalpha);
3568 else if (strcmp (name, "digit") == 0)
3569 BUILD_CHARCLASS_LOOP (isdigit);
3570 else if (strcmp (name, "print") == 0)
3571 BUILD_CHARCLASS_LOOP (isprint);
3572 else if (strcmp (name, "upper") == 0)
3573 BUILD_CHARCLASS_LOOP (isupper);
3574 else if (strcmp (name, "blank") == 0)
3575 BUILD_CHARCLASS_LOOP (isblank);
3576 else if (strcmp (name, "graph") == 0)
3577 BUILD_CHARCLASS_LOOP (isgraph);
3578 else if (strcmp (name, "punct") == 0)
3579 BUILD_CHARCLASS_LOOP (ispunct);
3580 else if (strcmp (name, "xdigit") == 0)
3581 BUILD_CHARCLASS_LOOP (isxdigit);
3582 else
3583 return REG_ECTYPE;
3585 return REG_NOERROR;
3588 static bin_tree_t *
3589 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3590 const unsigned char *class_name,
3591 const unsigned char *extra, int non_match,
3592 reg_errcode_t *err)
3594 re_bitset_ptr_t sbcset;
3595 #ifdef RE_ENABLE_I18N
3596 re_charset_t *mbcset;
3597 int alloc = 0;
3598 #endif /* not RE_ENABLE_I18N */
3599 reg_errcode_t ret;
3600 re_token_t br_token;
3601 bin_tree_t *tree;
3603 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3604 #ifdef RE_ENABLE_I18N
3605 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3606 #endif /* RE_ENABLE_I18N */
3608 #ifdef RE_ENABLE_I18N
3609 if (BE (sbcset == NULL || mbcset == NULL, 0))
3610 #else /* not RE_ENABLE_I18N */
3611 if (BE (sbcset == NULL, 0))
3612 #endif /* not RE_ENABLE_I18N */
3614 *err = REG_ESPACE;
3615 return NULL;
3618 if (non_match)
3620 #ifdef RE_ENABLE_I18N
3621 mbcset->non_match = 1;
3622 #endif /* not RE_ENABLE_I18N */
3625 /* We don't care the syntax in this case. */
3626 ret = build_charclass (trans, sbcset,
3627 #ifdef RE_ENABLE_I18N
3628 mbcset, &alloc,
3629 #endif /* RE_ENABLE_I18N */
3630 class_name, 0);
3632 if (BE (ret != REG_NOERROR, 0))
3634 re_free (sbcset);
3635 #ifdef RE_ENABLE_I18N
3636 free_charset (mbcset);
3637 #endif /* RE_ENABLE_I18N */
3638 *err = ret;
3639 return NULL;
3641 /* \w match '_' also. */
3642 for (; *extra; extra++)
3643 bitset_set (sbcset, *extra);
3645 /* If it is non-matching list. */
3646 if (non_match)
3647 bitset_not (sbcset);
3649 #ifdef RE_ENABLE_I18N
3650 /* Ensure only single byte characters are set. */
3651 if (dfa->mb_cur_max > 1)
3652 bitset_mask (sbcset, dfa->sb_char);
3653 #endif
3655 /* Build a tree for simple bracket. */
3656 br_token.type = SIMPLE_BRACKET;
3657 br_token.opr.sbcset = sbcset;
3658 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3659 if (BE (tree == NULL, 0))
3660 goto build_word_op_espace;
3662 #ifdef RE_ENABLE_I18N
3663 if (dfa->mb_cur_max > 1)
3665 bin_tree_t *mbc_tree;
3666 /* Build a tree for complex bracket. */
3667 br_token.type = COMPLEX_BRACKET;
3668 br_token.opr.mbcset = mbcset;
3669 dfa->has_mb_node = 1;
3670 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3671 if (BE (mbc_tree == NULL, 0))
3672 goto build_word_op_espace;
3673 /* Then join them by ALT node. */
3674 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3675 if (BE (mbc_tree != NULL, 1))
3676 return tree;
3678 else
3680 free_charset (mbcset);
3681 return tree;
3683 #else /* not RE_ENABLE_I18N */
3684 return tree;
3685 #endif /* not RE_ENABLE_I18N */
3687 build_word_op_espace:
3688 re_free (sbcset);
3689 #ifdef RE_ENABLE_I18N
3690 free_charset (mbcset);
3691 #endif /* RE_ENABLE_I18N */
3692 *err = REG_ESPACE;
3693 return NULL;
3696 /* This is intended for the expressions like "a{1,3}".
3697 Fetch a number from `input', and return the number.
3698 Return -1, if the number field is empty like "{,1}".
3699 Return -2, If an error is occured. */
3701 static int
3702 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3704 int num = -1;
3705 unsigned char c;
3706 while (1)
3708 fetch_token (token, input, syntax);
3709 c = token->opr.c;
3710 if (BE (token->type == END_OF_RE, 0))
3711 return -2;
3712 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3713 break;
3714 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3715 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3716 num = (num > RE_DUP_MAX) ? -2 : num;
3718 return num;
3721 #ifdef RE_ENABLE_I18N
3722 static void
3723 free_charset (re_charset_t *cset)
3725 re_free (cset->mbchars);
3726 # ifdef _LIBC
3727 re_free (cset->coll_syms);
3728 re_free (cset->equiv_classes);
3729 re_free (cset->range_starts);
3730 re_free (cset->range_ends);
3731 # endif
3732 re_free (cset->char_classes);
3733 re_free (cset);
3735 #endif /* RE_ENABLE_I18N */
3737 /* Functions for binary tree operation. */
3739 /* Create a tree node. */
3741 static bin_tree_t *
3742 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3743 re_token_type_t type)
3745 re_token_t t;
3746 t.type = type;
3747 return create_token_tree (dfa, left, right, &t);
3750 static bin_tree_t *
3751 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3752 const re_token_t *token)
3754 bin_tree_t *tree;
3755 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3757 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3759 if (storage == NULL)
3760 return NULL;
3761 storage->next = dfa->str_tree_storage;
3762 dfa->str_tree_storage = storage;
3763 dfa->str_tree_storage_idx = 0;
3765 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3767 tree->parent = NULL;
3768 tree->left = left;
3769 tree->right = right;
3770 tree->token = *token;
3771 tree->token.duplicated = 0;
3772 tree->token.opt_subexp = 0;
3773 tree->first = NULL;
3774 tree->next = NULL;
3775 tree->node_idx = -1;
3777 if (left != NULL)
3778 left->parent = tree;
3779 if (right != NULL)
3780 right->parent = tree;
3781 return tree;
3784 /* Mark the tree SRC as an optional subexpression.
3785 To be called from preorder or postorder. */
3787 static reg_errcode_t
3788 mark_opt_subexp (void *extra, bin_tree_t *node)
3790 int idx = (int) (long) extra;
3791 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3792 node->token.opt_subexp = 1;
3794 return REG_NOERROR;
3797 /* Free the allocated memory inside NODE. */
3799 static void
3800 free_token (re_token_t *node)
3802 #ifdef RE_ENABLE_I18N
3803 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3804 free_charset (node->opr.mbcset);
3805 else
3806 #endif /* RE_ENABLE_I18N */
3807 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3808 re_free (node->opr.sbcset);
3811 /* Worker function for tree walking. Free the allocated memory inside NODE
3812 and its children. */
3814 static reg_errcode_t
3815 free_tree (void *extra, bin_tree_t *node)
3817 free_token (&node->token);
3818 return REG_NOERROR;
3822 /* Duplicate the node SRC, and return new node. This is a preorder
3823 visit similar to the one implemented by the generic visitor, but
3824 we need more infrastructure to maintain two parallel trees --- so,
3825 it's easier to duplicate. */
3827 static bin_tree_t *
3828 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3830 const bin_tree_t *node;
3831 bin_tree_t *dup_root;
3832 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3834 for (node = root; ; )
3836 /* Create a new tree and link it back to the current parent. */
3837 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3838 if (*p_new == NULL)
3839 return NULL;
3840 (*p_new)->parent = dup_node;
3841 (*p_new)->token.duplicated = 1;
3842 dup_node = *p_new;
3844 /* Go to the left node, or up and to the right. */
3845 if (node->left)
3847 node = node->left;
3848 p_new = &dup_node->left;
3850 else
3852 const bin_tree_t *prev = NULL;
3853 while (node->right == prev || node->right == NULL)
3855 prev = node;
3856 node = node->parent;
3857 dup_node = dup_node->parent;
3858 if (!node)
3859 return dup_root;
3861 node = node->right;
3862 p_new = &dup_node->right;