Include bits/cmathcalls.h for more _FloatN, _FloatNx types.
[glibc.git] / posix / regcomp.c
blob871ae2ffab5ffc57a9d3b594c42feec5cdd5a4f6
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 #include <stdint.h>
22 #ifdef _LIBC
23 # include <locale/weight.h>
24 #endif
26 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
27 size_t length, reg_syntax_t syntax);
28 static void re_compile_fastmap_iter (regex_t *bufp,
29 const re_dfastate_t *init_state,
30 char *fastmap);
31 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
32 #ifdef RE_ENABLE_I18N
33 static void free_charset (re_charset_t *cset);
34 #endif /* RE_ENABLE_I18N */
35 static void free_workarea_compile (regex_t *preg);
36 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
37 #ifdef RE_ENABLE_I18N
38 static void optimize_utf8 (re_dfa_t *dfa);
39 #endif
40 static reg_errcode_t analyze (regex_t *preg);
41 static reg_errcode_t preorder (bin_tree_t *root,
42 reg_errcode_t (fn (void *, bin_tree_t *)),
43 void *extra);
44 static reg_errcode_t postorder (bin_tree_t *root,
45 reg_errcode_t (fn (void *, bin_tree_t *)),
46 void *extra);
47 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
48 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
49 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
50 bin_tree_t *node);
51 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
52 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
53 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
54 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
55 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
56 unsigned int constraint);
57 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
58 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
59 int node, int root);
60 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
61 static int fetch_number (re_string_t *input, re_token_t *token,
62 reg_syntax_t syntax);
63 static int peek_token (re_token_t *token, re_string_t *input,
64 reg_syntax_t syntax);
65 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
66 reg_syntax_t syntax, reg_errcode_t *err);
67 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
74 re_token_t *token, reg_syntax_t syntax,
75 int nest, reg_errcode_t *err);
76 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
77 re_token_t *token, reg_syntax_t syntax,
78 int nest, reg_errcode_t *err);
79 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
80 re_dfa_t *dfa, re_token_t *token,
81 reg_syntax_t syntax, reg_errcode_t *err);
82 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
83 re_token_t *token, reg_syntax_t syntax,
84 reg_errcode_t *err);
85 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
86 re_string_t *regexp,
87 re_token_t *token, int token_len,
88 re_dfa_t *dfa,
89 reg_syntax_t syntax,
90 int accept_hyphen);
91 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
92 re_string_t *regexp,
93 re_token_t *token);
94 #ifdef RE_ENABLE_I18N
95 static reg_errcode_t build_equiv_class (bitset_t sbcset,
96 re_charset_t *mbcset,
97 int *equiv_class_alloc,
98 const unsigned char *name);
99 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
100 bitset_t sbcset,
101 re_charset_t *mbcset,
102 int *char_class_alloc,
103 const unsigned char *class_name,
104 reg_syntax_t syntax);
105 #else /* not RE_ENABLE_I18N */
106 static reg_errcode_t build_equiv_class (bitset_t sbcset,
107 const unsigned char *name);
108 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
109 bitset_t sbcset,
110 const unsigned char *class_name,
111 reg_syntax_t syntax);
112 #endif /* not RE_ENABLE_I18N */
113 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
114 RE_TRANSLATE_TYPE trans,
115 const unsigned char *class_name,
116 const unsigned char *extra,
117 int non_match, reg_errcode_t *err);
118 static bin_tree_t *create_tree (re_dfa_t *dfa,
119 bin_tree_t *left, bin_tree_t *right,
120 re_token_type_t type);
121 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
122 bin_tree_t *left, bin_tree_t *right,
123 const re_token_t *token);
124 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
125 static void free_token (re_token_t *node);
126 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
127 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
129 /* This table gives an error message for each of the error codes listed
130 in regex.h. Obviously the order here has to be same as there.
131 POSIX doesn't require that we do anything for REG_NOERROR,
132 but why not be nice? */
134 const char __re_error_msgid[] attribute_hidden =
136 #define REG_NOERROR_IDX 0
137 gettext_noop ("Success") /* REG_NOERROR */
138 "\0"
139 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
140 gettext_noop ("No match") /* REG_NOMATCH */
141 "\0"
142 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
143 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
144 "\0"
145 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
146 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
147 "\0"
148 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
149 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
150 "\0"
151 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
152 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
153 "\0"
154 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
155 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
156 "\0"
157 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
158 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
159 "\0"
160 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
161 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
162 "\0"
163 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
164 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
165 "\0"
166 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
167 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
168 "\0"
169 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
170 gettext_noop ("Invalid range end") /* REG_ERANGE */
171 "\0"
172 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
173 gettext_noop ("Memory exhausted") /* REG_ESPACE */
174 "\0"
175 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
176 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
177 "\0"
178 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
179 gettext_noop ("Premature end of regular expression") /* REG_EEND */
180 "\0"
181 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
182 gettext_noop ("Regular expression too big") /* REG_ESIZE */
183 "\0"
184 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
185 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
188 const size_t __re_error_msgid_idx[] attribute_hidden =
190 REG_NOERROR_IDX,
191 REG_NOMATCH_IDX,
192 REG_BADPAT_IDX,
193 REG_ECOLLATE_IDX,
194 REG_ECTYPE_IDX,
195 REG_EESCAPE_IDX,
196 REG_ESUBREG_IDX,
197 REG_EBRACK_IDX,
198 REG_EPAREN_IDX,
199 REG_EBRACE_IDX,
200 REG_BADBR_IDX,
201 REG_ERANGE_IDX,
202 REG_ESPACE_IDX,
203 REG_BADRPT_IDX,
204 REG_EEND_IDX,
205 REG_ESIZE_IDX,
206 REG_ERPAREN_IDX
209 /* Entry points for GNU code. */
211 /* re_compile_pattern is the GNU regular expression compiler: it
212 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
213 Returns 0 if the pattern was valid, otherwise an error string.
215 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
216 are set in BUFP on entry. */
218 const char *
219 re_compile_pattern (const char *pattern, size_t length,
220 struct re_pattern_buffer *bufp)
222 reg_errcode_t ret;
224 /* And GNU code determines whether or not to get register information
225 by passing null for the REGS argument to re_match, etc., not by
226 setting no_sub, unless RE_NO_SUB is set. */
227 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
229 /* Match anchors at newline. */
230 bufp->newline_anchor = 1;
232 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
234 if (!ret)
235 return NULL;
236 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
238 #ifdef _LIBC
239 weak_alias (__re_compile_pattern, re_compile_pattern)
240 #endif
242 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
243 also be assigned to arbitrarily: each pattern buffer stores its own
244 syntax, so it can be changed between regex compilations. */
245 /* This has no initializer because initialized variables in Emacs
246 become read-only after dumping. */
247 reg_syntax_t re_syntax_options;
250 /* Specify the precise syntax of regexps for compilation. This provides
251 for compatibility for various utilities which historically have
252 different, incompatible syntaxes.
254 The argument SYNTAX is a bit mask comprised of the various bits
255 defined in regex.h. We return the old syntax. */
257 reg_syntax_t
258 re_set_syntax (reg_syntax_t syntax)
260 reg_syntax_t ret = re_syntax_options;
262 re_syntax_options = syntax;
263 return ret;
265 #ifdef _LIBC
266 weak_alias (__re_set_syntax, re_set_syntax)
267 #endif
270 re_compile_fastmap (struct re_pattern_buffer *bufp)
272 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
273 char *fastmap = bufp->fastmap;
275 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
276 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
277 if (dfa->init_state != dfa->init_state_word)
278 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
279 if (dfa->init_state != dfa->init_state_nl)
280 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
281 if (dfa->init_state != dfa->init_state_begbuf)
282 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
283 bufp->fastmap_accurate = 1;
284 return 0;
286 #ifdef _LIBC
287 weak_alias (__re_compile_fastmap, re_compile_fastmap)
288 #endif
290 static inline void
291 __attribute__ ((always_inline))
292 re_set_fastmap (char *fastmap, bool icase, int ch)
294 fastmap[ch] = 1;
295 if (icase)
296 fastmap[tolower (ch)] = 1;
299 /* Helper function for re_compile_fastmap.
300 Compile fastmap for the initial_state INIT_STATE. */
302 static void
303 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
304 char *fastmap)
306 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
307 int node_cnt;
308 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
309 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
311 int node = init_state->nodes.elems[node_cnt];
312 re_token_type_t type = dfa->nodes[node].type;
314 if (type == CHARACTER)
316 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
317 #ifdef RE_ENABLE_I18N
318 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
320 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
321 wchar_t wc;
322 mbstate_t state;
324 p = buf;
325 *p++ = dfa->nodes[node].opr.c;
326 while (++node < dfa->nodes_len
327 && dfa->nodes[node].type == CHARACTER
328 && dfa->nodes[node].mb_partial)
329 *p++ = dfa->nodes[node].opr.c;
330 memset (&state, '\0', sizeof (state));
331 if (__mbrtowc (&wc, (const char *) buf, p - buf,
332 &state) == p - buf
333 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
334 != (size_t) -1))
335 re_set_fastmap (fastmap, 0, buf[0]);
337 #endif
339 else if (type == SIMPLE_BRACKET)
341 int i, ch;
342 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
344 int j;
345 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
346 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
347 if (w & ((bitset_word_t) 1 << j))
348 re_set_fastmap (fastmap, icase, ch);
351 #ifdef RE_ENABLE_I18N
352 else if (type == COMPLEX_BRACKET)
354 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
355 int i;
357 # ifdef _LIBC
358 /* See if we have to try all bytes which start multiple collation
359 elements.
360 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
361 collation element, and don't catch 'b' since 'b' is
362 the only collation element which starts from 'b' (and
363 it is caught by SIMPLE_BRACKET). */
364 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
365 && (cset->ncoll_syms || cset->nranges))
367 const int32_t *table = (const int32_t *)
368 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
369 for (i = 0; i < SBC_MAX; ++i)
370 if (table[i] < 0)
371 re_set_fastmap (fastmap, icase, i);
373 # endif /* _LIBC */
375 /* See if we have to start the match at all multibyte characters,
376 i.e. where we would not find an invalid sequence. This only
377 applies to multibyte character sets; for single byte character
378 sets, the SIMPLE_BRACKET again suffices. */
379 if (dfa->mb_cur_max > 1
380 && (cset->nchar_classes || cset->non_match || cset->nranges
381 # ifdef _LIBC
382 || cset->nequiv_classes
383 # endif /* _LIBC */
386 unsigned char c = 0;
389 mbstate_t mbs;
390 memset (&mbs, 0, sizeof (mbs));
391 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
392 re_set_fastmap (fastmap, false, (int) c);
394 while (++c != 0);
397 else
399 /* ... Else catch all bytes which can start the mbchars. */
400 for (i = 0; i < cset->nmbchars; ++i)
402 char buf[256];
403 mbstate_t state;
404 memset (&state, '\0', sizeof (state));
405 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
406 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
407 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
409 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
410 != (size_t) -1)
411 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
416 #endif /* RE_ENABLE_I18N */
417 else if (type == OP_PERIOD
418 #ifdef RE_ENABLE_I18N
419 || type == OP_UTF8_PERIOD
420 #endif /* RE_ENABLE_I18N */
421 || type == END_OF_RE)
423 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
424 if (type == END_OF_RE)
425 bufp->can_be_null = 1;
426 return;
431 /* Entry point for POSIX code. */
432 /* regcomp takes a regular expression as a string and compiles it.
434 PREG is a regex_t *. We do not expect any fields to be initialized,
435 since POSIX says we shouldn't. Thus, we set
437 'buffer' to the compiled pattern;
438 'used' to the length of the compiled pattern;
439 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
440 REG_EXTENDED bit in CFLAGS is set; otherwise, to
441 RE_SYNTAX_POSIX_BASIC;
442 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
443 'fastmap' to an allocated space for the fastmap;
444 'fastmap_accurate' to zero;
445 're_nsub' to the number of subexpressions in PATTERN.
447 PATTERN is the address of the pattern string.
449 CFLAGS is a series of bits which affect compilation.
451 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
452 use POSIX basic syntax.
454 If REG_NEWLINE is set, then . and [^...] don't match newline.
455 Also, regexec will try a match beginning after every newline.
457 If REG_ICASE is set, then we considers upper- and lowercase
458 versions of letters to be equivalent when matching.
460 If REG_NOSUB is set, then when PREG is passed to regexec, that
461 routine will report only success or failure, and nothing about the
462 registers.
464 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
465 the return codes and their meanings.) */
468 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
470 reg_errcode_t ret;
471 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
472 : RE_SYNTAX_POSIX_BASIC);
474 preg->buffer = NULL;
475 preg->allocated = 0;
476 preg->used = 0;
478 /* Try to allocate space for the fastmap. */
479 preg->fastmap = re_malloc (char, SBC_MAX);
480 if (BE (preg->fastmap == NULL, 0))
481 return REG_ESPACE;
483 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
485 /* If REG_NEWLINE is set, newlines are treated differently. */
486 if (cflags & REG_NEWLINE)
487 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
488 syntax &= ~RE_DOT_NEWLINE;
489 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
490 /* It also changes the matching behavior. */
491 preg->newline_anchor = 1;
493 else
494 preg->newline_anchor = 0;
495 preg->no_sub = !!(cflags & REG_NOSUB);
496 preg->translate = NULL;
498 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
500 /* POSIX doesn't distinguish between an unmatched open-group and an
501 unmatched close-group: both are REG_EPAREN. */
502 if (ret == REG_ERPAREN)
503 ret = REG_EPAREN;
505 /* We have already checked preg->fastmap != NULL. */
506 if (BE (ret == REG_NOERROR, 1))
507 /* Compute the fastmap now, since regexec cannot modify the pattern
508 buffer. This function never fails in this implementation. */
509 (void) re_compile_fastmap (preg);
510 else
512 /* Some error occurred while compiling the expression. */
513 re_free (preg->fastmap);
514 preg->fastmap = NULL;
517 return (int) ret;
519 #ifdef _LIBC
520 libc_hidden_def (__regcomp)
521 weak_alias (__regcomp, regcomp)
522 #endif
524 /* Returns a message corresponding to an error code, ERRCODE, returned
525 from either regcomp or regexec. We don't use PREG here. */
527 size_t
528 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
529 size_t errbuf_size)
531 const char *msg;
532 size_t msg_size;
534 if (BE (errcode < 0
535 || errcode >= (int) (sizeof (__re_error_msgid_idx)
536 / sizeof (__re_error_msgid_idx[0])), 0))
537 /* Only error codes returned by the rest of the code should be passed
538 to this routine. If we are given anything else, or if other regex
539 code generates an invalid error code, then the program has a bug.
540 Dump core so we can fix it. */
541 abort ();
543 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
545 msg_size = strlen (msg) + 1; /* Includes the null. */
547 if (BE (errbuf_size != 0, 1))
549 if (BE (msg_size > errbuf_size, 0))
551 #if defined HAVE_MEMPCPY || defined _LIBC
552 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
553 #else
554 memcpy (errbuf, msg, errbuf_size - 1);
555 errbuf[errbuf_size - 1] = 0;
556 #endif
558 else
559 memcpy (errbuf, msg, msg_size);
562 return msg_size;
564 #ifdef _LIBC
565 weak_alias (__regerror, regerror)
566 #endif
569 #ifdef RE_ENABLE_I18N
570 /* This static array is used for the map to single-byte characters when
571 UTF-8 is used. Otherwise we would allocate memory just to initialize
572 it the same all the time. UTF-8 is the preferred encoding so this is
573 a worthwhile optimization. */
574 static const bitset_t utf8_sb_map =
576 /* Set the first 128 bits. */
577 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
579 #endif
582 static void
583 free_dfa_content (re_dfa_t *dfa)
585 int i, j;
587 if (dfa->nodes)
588 for (i = 0; i < dfa->nodes_len; ++i)
589 free_token (dfa->nodes + i);
590 re_free (dfa->nexts);
591 for (i = 0; i < dfa->nodes_len; ++i)
593 if (dfa->eclosures != NULL)
594 re_node_set_free (dfa->eclosures + i);
595 if (dfa->inveclosures != NULL)
596 re_node_set_free (dfa->inveclosures + i);
597 if (dfa->edests != NULL)
598 re_node_set_free (dfa->edests + i);
600 re_free (dfa->edests);
601 re_free (dfa->eclosures);
602 re_free (dfa->inveclosures);
603 re_free (dfa->nodes);
605 if (dfa->state_table)
606 for (i = 0; i <= dfa->state_hash_mask; ++i)
608 struct re_state_table_entry *entry = dfa->state_table + i;
609 for (j = 0; j < entry->num; ++j)
611 re_dfastate_t *state = entry->array[j];
612 free_state (state);
614 re_free (entry->array);
616 re_free (dfa->state_table);
617 #ifdef RE_ENABLE_I18N
618 if (dfa->sb_char != utf8_sb_map)
619 re_free (dfa->sb_char);
620 #endif
621 re_free (dfa->subexp_map);
622 #ifdef DEBUG
623 re_free (dfa->re_str);
624 #endif
626 re_free (dfa);
630 /* Free dynamically allocated space used by PREG. */
632 void
633 regfree (regex_t *preg)
635 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
636 if (BE (dfa != NULL, 1))
637 free_dfa_content (dfa);
638 preg->buffer = NULL;
639 preg->allocated = 0;
641 re_free (preg->fastmap);
642 preg->fastmap = NULL;
644 re_free (preg->translate);
645 preg->translate = NULL;
647 #ifdef _LIBC
648 libc_hidden_def (__regfree)
649 weak_alias (__regfree, regfree)
650 #endif
652 /* Entry points compatible with 4.2 BSD regex library. We don't define
653 them unless specifically requested. */
655 #if defined _REGEX_RE_COMP || defined _LIBC
657 /* BSD has one and only one pattern buffer. */
658 static struct re_pattern_buffer re_comp_buf;
660 char *
661 # ifdef _LIBC
662 /* Make these definitions weak in libc, so POSIX programs can redefine
663 these names if they don't use our functions, and still use
664 regcomp/regexec above without link errors. */
665 weak_function
666 # endif
667 re_comp (const char *s)
669 reg_errcode_t ret;
670 char *fastmap;
672 if (!s)
674 if (!re_comp_buf.buffer)
675 return gettext ("No previous regular expression");
676 return 0;
679 if (re_comp_buf.buffer)
681 fastmap = re_comp_buf.fastmap;
682 re_comp_buf.fastmap = NULL;
683 __regfree (&re_comp_buf);
684 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
685 re_comp_buf.fastmap = fastmap;
688 if (re_comp_buf.fastmap == NULL)
690 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
691 if (re_comp_buf.fastmap == NULL)
692 return (char *) gettext (__re_error_msgid
693 + __re_error_msgid_idx[(int) REG_ESPACE]);
696 /* Since 're_exec' always passes NULL for the 'regs' argument, we
697 don't need to initialize the pattern buffer fields which affect it. */
699 /* Match anchors at newlines. */
700 re_comp_buf.newline_anchor = 1;
702 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
704 if (!ret)
705 return NULL;
707 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
708 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
711 #ifdef _LIBC
712 libc_freeres_fn (free_mem)
714 __regfree (&re_comp_buf);
716 #endif
718 #endif /* _REGEX_RE_COMP */
720 /* Internal entry point.
721 Compile the regular expression PATTERN, whose length is LENGTH.
722 SYNTAX indicate regular expression's syntax. */
724 static reg_errcode_t
725 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
726 reg_syntax_t syntax)
728 reg_errcode_t err = REG_NOERROR;
729 re_dfa_t *dfa;
730 re_string_t regexp;
732 /* Initialize the pattern buffer. */
733 preg->fastmap_accurate = 0;
734 preg->syntax = syntax;
735 preg->not_bol = preg->not_eol = 0;
736 preg->used = 0;
737 preg->re_nsub = 0;
738 preg->can_be_null = 0;
739 preg->regs_allocated = REGS_UNALLOCATED;
741 /* Initialize the dfa. */
742 dfa = (re_dfa_t *) preg->buffer;
743 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
745 /* If zero allocated, but buffer is non-null, try to realloc
746 enough space. This loses if buffer's address is bogus, but
747 that is the user's responsibility. If ->buffer is NULL this
748 is a simple allocation. */
749 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
750 if (dfa == NULL)
751 return REG_ESPACE;
752 preg->allocated = sizeof (re_dfa_t);
753 preg->buffer = (unsigned char *) dfa;
755 preg->used = sizeof (re_dfa_t);
757 err = init_dfa (dfa, length);
758 if (BE (err != REG_NOERROR, 0))
760 free_dfa_content (dfa);
761 preg->buffer = NULL;
762 preg->allocated = 0;
763 return err;
765 #ifdef DEBUG
766 /* Note: length+1 will not overflow since it is checked in init_dfa. */
767 dfa->re_str = re_malloc (char, length + 1);
768 strncpy (dfa->re_str, pattern, length + 1);
769 #endif
771 __libc_lock_init (dfa->lock);
773 err = re_string_construct (&regexp, pattern, length, preg->translate,
774 syntax & RE_ICASE, dfa);
775 if (BE (err != REG_NOERROR, 0))
777 re_compile_internal_free_return:
778 free_workarea_compile (preg);
779 re_string_destruct (&regexp);
780 free_dfa_content (dfa);
781 preg->buffer = NULL;
782 preg->allocated = 0;
783 return err;
786 /* Parse the regular expression, and build a structure tree. */
787 preg->re_nsub = 0;
788 dfa->str_tree = parse (&regexp, preg, syntax, &err);
789 if (BE (dfa->str_tree == NULL, 0))
790 goto re_compile_internal_free_return;
792 /* Analyze the tree and create the nfa. */
793 err = analyze (preg);
794 if (BE (err != REG_NOERROR, 0))
795 goto re_compile_internal_free_return;
797 #ifdef RE_ENABLE_I18N
798 /* If possible, do searching in single byte encoding to speed things up. */
799 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
800 optimize_utf8 (dfa);
801 #endif
803 /* Then create the initial state of the dfa. */
804 err = create_initial_state (dfa);
806 /* Release work areas. */
807 free_workarea_compile (preg);
808 re_string_destruct (&regexp);
810 if (BE (err != REG_NOERROR, 0))
812 free_dfa_content (dfa);
813 preg->buffer = NULL;
814 preg->allocated = 0;
817 return err;
820 /* Initialize DFA. We use the length of the regular expression PAT_LEN
821 as the initial length of some arrays. */
823 static reg_errcode_t
824 init_dfa (re_dfa_t *dfa, size_t pat_len)
826 unsigned int table_size;
827 #ifndef _LIBC
828 char *codeset_name;
829 #endif
831 memset (dfa, '\0', sizeof (re_dfa_t));
833 /* Force allocation of str_tree_storage the first time. */
834 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
836 /* Avoid overflows. */
837 if (pat_len == SIZE_MAX)
838 return REG_ESPACE;
840 dfa->nodes_alloc = pat_len + 1;
841 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
843 /* table_size = 2 ^ ceil(log pat_len) */
844 for (table_size = 1; ; table_size <<= 1)
845 if (table_size > pat_len)
846 break;
848 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
849 dfa->state_hash_mask = table_size - 1;
851 dfa->mb_cur_max = MB_CUR_MAX;
852 #ifdef _LIBC
853 if (dfa->mb_cur_max == 6
854 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
855 dfa->is_utf8 = 1;
856 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
857 != 0);
858 #else
859 # ifdef HAVE_LANGINFO_CODESET
860 codeset_name = nl_langinfo (CODESET);
861 # else
862 codeset_name = getenv ("LC_ALL");
863 if (codeset_name == NULL || codeset_name[0] == '\0')
864 codeset_name = getenv ("LC_CTYPE");
865 if (codeset_name == NULL || codeset_name[0] == '\0')
866 codeset_name = getenv ("LANG");
867 if (codeset_name == NULL)
868 codeset_name = "";
869 else if (strchr (codeset_name, '.') != NULL)
870 codeset_name = strchr (codeset_name, '.') + 1;
871 # endif
873 if (strcasecmp (codeset_name, "UTF-8") == 0
874 || strcasecmp (codeset_name, "UTF8") == 0)
875 dfa->is_utf8 = 1;
877 /* We check exhaustively in the loop below if this charset is a
878 superset of ASCII. */
879 dfa->map_notascii = 0;
880 #endif
882 #ifdef RE_ENABLE_I18N
883 if (dfa->mb_cur_max > 1)
885 if (dfa->is_utf8)
886 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
887 else
889 int i, j, ch;
891 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
892 if (BE (dfa->sb_char == NULL, 0))
893 return REG_ESPACE;
895 /* Set the bits corresponding to single byte chars. */
896 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
897 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
899 wint_t wch = __btowc (ch);
900 if (wch != WEOF)
901 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
902 # ifndef _LIBC
903 if (isascii (ch) && wch != ch)
904 dfa->map_notascii = 1;
905 # endif
909 #endif
911 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
912 return REG_ESPACE;
913 return REG_NOERROR;
916 /* Initialize WORD_CHAR table, which indicate which character is
917 "word". In this case "word" means that it is the word construction
918 character used by some operators like "\<", "\>", etc. */
920 static void
921 init_word_char (re_dfa_t *dfa)
923 dfa->word_ops_used = 1;
924 int i = 0;
925 int ch = 0;
926 if (BE (dfa->map_notascii == 0, 1))
928 if (sizeof (dfa->word_char[0]) == 8)
930 /* The extra temporaries here avoid "implicitly truncated"
931 warnings in the case when this is dead code, i.e. 32-bit. */
932 const uint64_t wc0 = UINT64_C (0x03ff000000000000);
933 const uint64_t wc1 = UINT64_C (0x07fffffe87fffffe);
934 dfa->word_char[0] = wc0;
935 dfa->word_char[1] = wc1;
936 i = 2;
938 else if (sizeof (dfa->word_char[0]) == 4)
940 dfa->word_char[0] = UINT32_C (0x00000000);
941 dfa->word_char[1] = UINT32_C (0x03ff0000);
942 dfa->word_char[2] = UINT32_C (0x87fffffe);
943 dfa->word_char[3] = UINT32_C (0x07fffffe);
944 i = 4;
946 else
947 abort ();
948 ch = 128;
950 if (BE (dfa->is_utf8, 1))
952 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
953 return;
957 for (; i < BITSET_WORDS; ++i)
958 for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
959 if (isalnum (ch) || ch == '_')
960 dfa->word_char[i] |= (bitset_word_t) 1 << j;
963 /* Free the work area which are only used while compiling. */
965 static void
966 free_workarea_compile (regex_t *preg)
968 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
969 bin_tree_storage_t *storage, *next;
970 for (storage = dfa->str_tree_storage; storage; storage = next)
972 next = storage->next;
973 re_free (storage);
975 dfa->str_tree_storage = NULL;
976 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
977 dfa->str_tree = NULL;
978 re_free (dfa->org_indices);
979 dfa->org_indices = NULL;
982 /* Create initial states for all contexts. */
984 static reg_errcode_t
985 create_initial_state (re_dfa_t *dfa)
987 int first, i;
988 reg_errcode_t err;
989 re_node_set init_nodes;
991 /* Initial states have the epsilon closure of the node which is
992 the first node of the regular expression. */
993 first = dfa->str_tree->first->node_idx;
994 dfa->init_node = first;
995 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
996 if (BE (err != REG_NOERROR, 0))
997 return err;
999 /* The back-references which are in initial states can epsilon transit,
1000 since in this case all of the subexpressions can be null.
1001 Then we add epsilon closures of the nodes which are the next nodes of
1002 the back-references. */
1003 if (dfa->nbackref > 0)
1004 for (i = 0; i < init_nodes.nelem; ++i)
1006 int node_idx = init_nodes.elems[i];
1007 re_token_type_t type = dfa->nodes[node_idx].type;
1009 int clexp_idx;
1010 if (type != OP_BACK_REF)
1011 continue;
1012 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1014 re_token_t *clexp_node;
1015 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1016 if (clexp_node->type == OP_CLOSE_SUBEXP
1017 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1018 break;
1020 if (clexp_idx == init_nodes.nelem)
1021 continue;
1023 if (type == OP_BACK_REF)
1025 int dest_idx = dfa->edests[node_idx].elems[0];
1026 if (!re_node_set_contains (&init_nodes, dest_idx))
1028 reg_errcode_t err = re_node_set_merge (&init_nodes,
1029 dfa->eclosures
1030 + dest_idx);
1031 if (err != REG_NOERROR)
1032 return err;
1033 i = 0;
1038 /* It must be the first time to invoke acquire_state. */
1039 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1040 /* We don't check ERR here, since the initial state must not be NULL. */
1041 if (BE (dfa->init_state == NULL, 0))
1042 return err;
1043 if (dfa->init_state->has_constraint)
1045 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1046 CONTEXT_WORD);
1047 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1048 CONTEXT_NEWLINE);
1049 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1050 &init_nodes,
1051 CONTEXT_NEWLINE
1052 | CONTEXT_BEGBUF);
1053 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1054 || dfa->init_state_begbuf == NULL, 0))
1055 return err;
1057 else
1058 dfa->init_state_word = dfa->init_state_nl
1059 = dfa->init_state_begbuf = dfa->init_state;
1061 re_node_set_free (&init_nodes);
1062 return REG_NOERROR;
1065 #ifdef RE_ENABLE_I18N
1066 /* If it is possible to do searching in single byte encoding instead of UTF-8
1067 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1068 DFA nodes where needed. */
1070 static void
1071 optimize_utf8 (re_dfa_t *dfa)
1073 int node, i, mb_chars = 0, has_period = 0;
1075 for (node = 0; node < dfa->nodes_len; ++node)
1076 switch (dfa->nodes[node].type)
1078 case CHARACTER:
1079 if (dfa->nodes[node].opr.c >= 0x80)
1080 mb_chars = 1;
1081 break;
1082 case ANCHOR:
1083 switch (dfa->nodes[node].opr.ctx_type)
1085 case LINE_FIRST:
1086 case LINE_LAST:
1087 case BUF_FIRST:
1088 case BUF_LAST:
1089 break;
1090 default:
1091 /* Word anchors etc. cannot be handled. It's okay to test
1092 opr.ctx_type since constraints (for all DFA nodes) are
1093 created by ORing one or more opr.ctx_type values. */
1094 return;
1096 break;
1097 case OP_PERIOD:
1098 has_period = 1;
1099 break;
1100 case OP_BACK_REF:
1101 case OP_ALT:
1102 case END_OF_RE:
1103 case OP_DUP_ASTERISK:
1104 case OP_OPEN_SUBEXP:
1105 case OP_CLOSE_SUBEXP:
1106 break;
1107 case COMPLEX_BRACKET:
1108 return;
1109 case SIMPLE_BRACKET:
1110 /* Just double check. The non-ASCII range starts at 0x80. */
1111 assert (0x80 % BITSET_WORD_BITS == 0);
1112 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1113 if (dfa->nodes[node].opr.sbcset[i])
1114 return;
1115 break;
1116 default:
1117 abort ();
1120 if (mb_chars || has_period)
1121 for (node = 0; node < dfa->nodes_len; ++node)
1123 if (dfa->nodes[node].type == CHARACTER
1124 && dfa->nodes[node].opr.c >= 0x80)
1125 dfa->nodes[node].mb_partial = 0;
1126 else if (dfa->nodes[node].type == OP_PERIOD)
1127 dfa->nodes[node].type = OP_UTF8_PERIOD;
1130 /* The search can be in single byte locale. */
1131 dfa->mb_cur_max = 1;
1132 dfa->is_utf8 = 0;
1133 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1135 #endif
1137 /* Analyze the structure tree, and calculate "first", "next", "edest",
1138 "eclosure", and "inveclosure". */
1140 static reg_errcode_t
1141 analyze (regex_t *preg)
1143 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1144 reg_errcode_t ret;
1146 /* Allocate arrays. */
1147 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1148 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1149 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1150 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1151 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1152 || dfa->eclosures == NULL, 0))
1153 return REG_ESPACE;
1155 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1156 if (dfa->subexp_map != NULL)
1158 int i;
1159 for (i = 0; i < preg->re_nsub; i++)
1160 dfa->subexp_map[i] = i;
1161 preorder (dfa->str_tree, optimize_subexps, dfa);
1162 for (i = 0; i < preg->re_nsub; i++)
1163 if (dfa->subexp_map[i] != i)
1164 break;
1165 if (i == preg->re_nsub)
1167 free (dfa->subexp_map);
1168 dfa->subexp_map = NULL;
1172 ret = postorder (dfa->str_tree, lower_subexps, preg);
1173 if (BE (ret != REG_NOERROR, 0))
1174 return ret;
1175 ret = postorder (dfa->str_tree, calc_first, dfa);
1176 if (BE (ret != REG_NOERROR, 0))
1177 return ret;
1178 preorder (dfa->str_tree, calc_next, dfa);
1179 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1180 if (BE (ret != REG_NOERROR, 0))
1181 return ret;
1182 ret = calc_eclosure (dfa);
1183 if (BE (ret != REG_NOERROR, 0))
1184 return ret;
1186 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1187 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1188 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1189 || dfa->nbackref)
1191 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1192 if (BE (dfa->inveclosures == NULL, 0))
1193 return REG_ESPACE;
1194 ret = calc_inveclosure (dfa);
1197 return ret;
1200 /* Our parse trees are very unbalanced, so we cannot use a stack to
1201 implement parse tree visits. Instead, we use parent pointers and
1202 some hairy code in these two functions. */
1203 static reg_errcode_t
1204 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1205 void *extra)
1207 bin_tree_t *node, *prev;
1209 for (node = root; ; )
1211 /* Descend down the tree, preferably to the left (or to the right
1212 if that's the only child). */
1213 while (node->left || node->right)
1214 if (node->left)
1215 node = node->left;
1216 else
1217 node = node->right;
1221 reg_errcode_t err = fn (extra, node);
1222 if (BE (err != REG_NOERROR, 0))
1223 return err;
1224 if (node->parent == NULL)
1225 return REG_NOERROR;
1226 prev = node;
1227 node = node->parent;
1229 /* Go up while we have a node that is reached from the right. */
1230 while (node->right == prev || node->right == NULL);
1231 node = node->right;
1235 static reg_errcode_t
1236 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1237 void *extra)
1239 bin_tree_t *node;
1241 for (node = root; ; )
1243 reg_errcode_t err = fn (extra, node);
1244 if (BE (err != REG_NOERROR, 0))
1245 return err;
1247 /* Go to the left node, or up and to the right. */
1248 if (node->left)
1249 node = node->left;
1250 else
1252 bin_tree_t *prev = NULL;
1253 while (node->right == prev || node->right == NULL)
1255 prev = node;
1256 node = node->parent;
1257 if (!node)
1258 return REG_NOERROR;
1260 node = node->right;
1265 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1266 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1267 backreferences as well. Requires a preorder visit. */
1268 static reg_errcode_t
1269 optimize_subexps (void *extra, bin_tree_t *node)
1271 re_dfa_t *dfa = (re_dfa_t *) extra;
1273 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1275 int idx = node->token.opr.idx;
1276 node->token.opr.idx = dfa->subexp_map[idx];
1277 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1280 else if (node->token.type == SUBEXP
1281 && node->left && node->left->token.type == SUBEXP)
1283 int other_idx = node->left->token.opr.idx;
1285 node->left = node->left->left;
1286 if (node->left)
1287 node->left->parent = node;
1289 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1290 if (other_idx < BITSET_WORD_BITS)
1291 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1294 return REG_NOERROR;
1297 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1298 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1299 static reg_errcode_t
1300 lower_subexps (void *extra, bin_tree_t *node)
1302 regex_t *preg = (regex_t *) extra;
1303 reg_errcode_t err = REG_NOERROR;
1305 if (node->left && node->left->token.type == SUBEXP)
1307 node->left = lower_subexp (&err, preg, node->left);
1308 if (node->left)
1309 node->left->parent = node;
1311 if (node->right && node->right->token.type == SUBEXP)
1313 node->right = lower_subexp (&err, preg, node->right);
1314 if (node->right)
1315 node->right->parent = node;
1318 return err;
1321 static bin_tree_t *
1322 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1324 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1325 bin_tree_t *body = node->left;
1326 bin_tree_t *op, *cls, *tree1, *tree;
1328 if (preg->no_sub
1329 /* We do not optimize empty subexpressions, because otherwise we may
1330 have bad CONCAT nodes with NULL children. This is obviously not
1331 very common, so we do not lose much. An example that triggers
1332 this case is the sed "script" /\(\)/x. */
1333 && node->left != NULL
1334 && (node->token.opr.idx >= BITSET_WORD_BITS
1335 || !(dfa->used_bkref_map
1336 & ((bitset_word_t) 1 << node->token.opr.idx))))
1337 return node->left;
1339 /* Convert the SUBEXP node to the concatenation of an
1340 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1341 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1342 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1343 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1344 tree = create_tree (dfa, op, tree1, CONCAT);
1345 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1347 *err = REG_ESPACE;
1348 return NULL;
1351 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1352 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1353 return tree;
1356 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1357 nodes. Requires a postorder visit. */
1358 static reg_errcode_t
1359 calc_first (void *extra, bin_tree_t *node)
1361 re_dfa_t *dfa = (re_dfa_t *) extra;
1362 if (node->token.type == CONCAT)
1364 node->first = node->left->first;
1365 node->node_idx = node->left->node_idx;
1367 else
1369 node->first = node;
1370 node->node_idx = re_dfa_add_node (dfa, node->token);
1371 if (BE (node->node_idx == -1, 0))
1372 return REG_ESPACE;
1373 if (node->token.type == ANCHOR)
1374 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1376 return REG_NOERROR;
1379 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1380 static reg_errcode_t
1381 calc_next (void *extra, bin_tree_t *node)
1383 switch (node->token.type)
1385 case OP_DUP_ASTERISK:
1386 node->left->next = node;
1387 break;
1388 case CONCAT:
1389 node->left->next = node->right->first;
1390 node->right->next = node->next;
1391 break;
1392 default:
1393 if (node->left)
1394 node->left->next = node->next;
1395 if (node->right)
1396 node->right->next = node->next;
1397 break;
1399 return REG_NOERROR;
1402 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1403 static reg_errcode_t
1404 link_nfa_nodes (void *extra, bin_tree_t *node)
1406 re_dfa_t *dfa = (re_dfa_t *) extra;
1407 int idx = node->node_idx;
1408 reg_errcode_t err = REG_NOERROR;
1410 switch (node->token.type)
1412 case CONCAT:
1413 break;
1415 case END_OF_RE:
1416 assert (node->next == NULL);
1417 break;
1419 case OP_DUP_ASTERISK:
1420 case OP_ALT:
1422 int left, right;
1423 dfa->has_plural_match = 1;
1424 if (node->left != NULL)
1425 left = node->left->first->node_idx;
1426 else
1427 left = node->next->node_idx;
1428 if (node->right != NULL)
1429 right = node->right->first->node_idx;
1430 else
1431 right = node->next->node_idx;
1432 assert (left > -1);
1433 assert (right > -1);
1434 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1436 break;
1438 case ANCHOR:
1439 case OP_OPEN_SUBEXP:
1440 case OP_CLOSE_SUBEXP:
1441 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1442 break;
1444 case OP_BACK_REF:
1445 dfa->nexts[idx] = node->next->node_idx;
1446 if (node->token.type == OP_BACK_REF)
1447 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1448 break;
1450 default:
1451 assert (!IS_EPSILON_NODE (node->token.type));
1452 dfa->nexts[idx] = node->next->node_idx;
1453 break;
1456 return err;
1459 /* Duplicate the epsilon closure of the node ROOT_NODE.
1460 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1461 to their own constraint. */
1463 static reg_errcode_t
1464 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1465 int root_node, unsigned int init_constraint)
1467 int org_node, clone_node, ret;
1468 unsigned int constraint = init_constraint;
1469 for (org_node = top_org_node, clone_node = top_clone_node;;)
1471 int org_dest, clone_dest;
1472 if (dfa->nodes[org_node].type == OP_BACK_REF)
1474 /* If the back reference epsilon-transit, its destination must
1475 also have the constraint. Then duplicate the epsilon closure
1476 of the destination of the back reference, and store it in
1477 edests of the back reference. */
1478 org_dest = dfa->nexts[org_node];
1479 re_node_set_empty (dfa->edests + clone_node);
1480 clone_dest = duplicate_node (dfa, org_dest, constraint);
1481 if (BE (clone_dest == -1, 0))
1482 return REG_ESPACE;
1483 dfa->nexts[clone_node] = dfa->nexts[org_node];
1484 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1485 if (BE (ret < 0, 0))
1486 return REG_ESPACE;
1488 else if (dfa->edests[org_node].nelem == 0)
1490 /* In case of the node can't epsilon-transit, don't duplicate the
1491 destination and store the original destination as the
1492 destination of the node. */
1493 dfa->nexts[clone_node] = dfa->nexts[org_node];
1494 break;
1496 else if (dfa->edests[org_node].nelem == 1)
1498 /* In case of the node can epsilon-transit, and it has only one
1499 destination. */
1500 org_dest = dfa->edests[org_node].elems[0];
1501 re_node_set_empty (dfa->edests + clone_node);
1502 /* If the node is root_node itself, it means the epsilon closure
1503 has a loop. Then tie it to the destination of the root_node. */
1504 if (org_node == root_node && clone_node != org_node)
1506 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1507 if (BE (ret < 0, 0))
1508 return REG_ESPACE;
1509 break;
1511 /* In case the node has another constraint, append it. */
1512 constraint |= dfa->nodes[org_node].constraint;
1513 clone_dest = duplicate_node (dfa, org_dest, constraint);
1514 if (BE (clone_dest == -1, 0))
1515 return REG_ESPACE;
1516 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1517 if (BE (ret < 0, 0))
1518 return REG_ESPACE;
1520 else /* dfa->edests[org_node].nelem == 2 */
1522 /* In case of the node can epsilon-transit, and it has two
1523 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1524 org_dest = dfa->edests[org_node].elems[0];
1525 re_node_set_empty (dfa->edests + clone_node);
1526 /* Search for a duplicated node which satisfies the constraint. */
1527 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1528 if (clone_dest == -1)
1530 /* There is no such duplicated node, create a new one. */
1531 reg_errcode_t err;
1532 clone_dest = duplicate_node (dfa, org_dest, constraint);
1533 if (BE (clone_dest == -1, 0))
1534 return REG_ESPACE;
1535 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1536 if (BE (ret < 0, 0))
1537 return REG_ESPACE;
1538 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1539 root_node, constraint);
1540 if (BE (err != REG_NOERROR, 0))
1541 return err;
1543 else
1545 /* There is a duplicated node which satisfies the constraint,
1546 use it to avoid infinite loop. */
1547 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1548 if (BE (ret < 0, 0))
1549 return REG_ESPACE;
1552 org_dest = dfa->edests[org_node].elems[1];
1553 clone_dest = duplicate_node (dfa, org_dest, constraint);
1554 if (BE (clone_dest == -1, 0))
1555 return REG_ESPACE;
1556 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1557 if (BE (ret < 0, 0))
1558 return REG_ESPACE;
1560 org_node = org_dest;
1561 clone_node = clone_dest;
1563 return REG_NOERROR;
1566 /* Search for a node which is duplicated from the node ORG_NODE, and
1567 satisfies the constraint CONSTRAINT. */
1569 static int
1570 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1571 unsigned int constraint)
1573 int idx;
1574 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1576 if (org_node == dfa->org_indices[idx]
1577 && constraint == dfa->nodes[idx].constraint)
1578 return idx; /* Found. */
1580 return -1; /* Not found. */
1583 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1584 Return the index of the new node, or -1 if insufficient storage is
1585 available. */
1587 static int
1588 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1590 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1591 if (BE (dup_idx != -1, 1))
1593 dfa->nodes[dup_idx].constraint = constraint;
1594 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1595 dfa->nodes[dup_idx].duplicated = 1;
1597 /* Store the index of the original node. */
1598 dfa->org_indices[dup_idx] = org_idx;
1600 return dup_idx;
1603 static reg_errcode_t
1604 calc_inveclosure (re_dfa_t *dfa)
1606 int src, idx, ret;
1607 for (idx = 0; idx < dfa->nodes_len; ++idx)
1608 re_node_set_init_empty (dfa->inveclosures + idx);
1610 for (src = 0; src < dfa->nodes_len; ++src)
1612 int *elems = dfa->eclosures[src].elems;
1613 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1615 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1616 if (BE (ret == -1, 0))
1617 return REG_ESPACE;
1621 return REG_NOERROR;
1624 /* Calculate "eclosure" for all the node in DFA. */
1626 static reg_errcode_t
1627 calc_eclosure (re_dfa_t *dfa)
1629 int node_idx, incomplete;
1630 #ifdef DEBUG
1631 assert (dfa->nodes_len > 0);
1632 #endif
1633 incomplete = 0;
1634 /* For each nodes, calculate epsilon closure. */
1635 for (node_idx = 0; ; ++node_idx)
1637 reg_errcode_t err;
1638 re_node_set eclosure_elem;
1639 if (node_idx == dfa->nodes_len)
1641 if (!incomplete)
1642 break;
1643 incomplete = 0;
1644 node_idx = 0;
1647 #ifdef DEBUG
1648 assert (dfa->eclosures[node_idx].nelem != -1);
1649 #endif
1651 /* If we have already calculated, skip it. */
1652 if (dfa->eclosures[node_idx].nelem != 0)
1653 continue;
1654 /* Calculate epsilon closure of 'node_idx'. */
1655 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1656 if (BE (err != REG_NOERROR, 0))
1657 return err;
1659 if (dfa->eclosures[node_idx].nelem == 0)
1661 incomplete = 1;
1662 re_node_set_free (&eclosure_elem);
1665 return REG_NOERROR;
1668 /* Calculate epsilon closure of NODE. */
1670 static reg_errcode_t
1671 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1673 reg_errcode_t err;
1674 int i;
1675 re_node_set eclosure;
1676 int ret;
1677 int incomplete = 0;
1678 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1679 if (BE (err != REG_NOERROR, 0))
1680 return err;
1682 /* This indicates that we are calculating this node now.
1683 We reference this value to avoid infinite loop. */
1684 dfa->eclosures[node].nelem = -1;
1686 /* If the current node has constraints, duplicate all nodes
1687 since they must inherit the constraints. */
1688 if (dfa->nodes[node].constraint
1689 && dfa->edests[node].nelem
1690 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1692 err = duplicate_node_closure (dfa, node, node, node,
1693 dfa->nodes[node].constraint);
1694 if (BE (err != REG_NOERROR, 0))
1695 return err;
1698 /* Expand each epsilon destination nodes. */
1699 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1700 for (i = 0; i < dfa->edests[node].nelem; ++i)
1702 re_node_set eclosure_elem;
1703 int edest = dfa->edests[node].elems[i];
1704 /* If calculating the epsilon closure of `edest' is in progress,
1705 return intermediate result. */
1706 if (dfa->eclosures[edest].nelem == -1)
1708 incomplete = 1;
1709 continue;
1711 /* If we haven't calculated the epsilon closure of `edest' yet,
1712 calculate now. Otherwise use calculated epsilon closure. */
1713 if (dfa->eclosures[edest].nelem == 0)
1715 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1716 if (BE (err != REG_NOERROR, 0))
1717 return err;
1719 else
1720 eclosure_elem = dfa->eclosures[edest];
1721 /* Merge the epsilon closure of 'edest'. */
1722 err = re_node_set_merge (&eclosure, &eclosure_elem);
1723 if (BE (err != REG_NOERROR, 0))
1724 return err;
1725 /* If the epsilon closure of 'edest' is incomplete,
1726 the epsilon closure of this node is also incomplete. */
1727 if (dfa->eclosures[edest].nelem == 0)
1729 incomplete = 1;
1730 re_node_set_free (&eclosure_elem);
1734 /* An epsilon closure includes itself. */
1735 ret = re_node_set_insert (&eclosure, node);
1736 if (BE (ret < 0, 0))
1737 return REG_ESPACE;
1738 if (incomplete && !root)
1739 dfa->eclosures[node].nelem = 0;
1740 else
1741 dfa->eclosures[node] = eclosure;
1742 *new_set = eclosure;
1743 return REG_NOERROR;
1746 /* Functions for token which are used in the parser. */
1748 /* Fetch a token from INPUT.
1749 We must not use this function inside bracket expressions. */
1751 static void
1752 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1754 re_string_skip_bytes (input, peek_token (result, input, syntax));
1757 /* Peek a token from INPUT, and return the length of the token.
1758 We must not use this function inside bracket expressions. */
1760 static int
1761 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1763 unsigned char c;
1765 if (re_string_eoi (input))
1767 token->type = END_OF_RE;
1768 return 0;
1771 c = re_string_peek_byte (input, 0);
1772 token->opr.c = c;
1774 token->word_char = 0;
1775 #ifdef RE_ENABLE_I18N
1776 token->mb_partial = 0;
1777 if (input->mb_cur_max > 1 &&
1778 !re_string_first_byte (input, re_string_cur_idx (input)))
1780 token->type = CHARACTER;
1781 token->mb_partial = 1;
1782 return 1;
1784 #endif
1785 if (c == '\\')
1787 unsigned char c2;
1788 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1790 token->type = BACK_SLASH;
1791 return 1;
1794 c2 = re_string_peek_byte_case (input, 1);
1795 token->opr.c = c2;
1796 token->type = CHARACTER;
1797 #ifdef RE_ENABLE_I18N
1798 if (input->mb_cur_max > 1)
1800 wint_t wc = re_string_wchar_at (input,
1801 re_string_cur_idx (input) + 1);
1802 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1804 else
1805 #endif
1806 token->word_char = IS_WORD_CHAR (c2) != 0;
1808 switch (c2)
1810 case '|':
1811 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1812 token->type = OP_ALT;
1813 break;
1814 case '1': case '2': case '3': case '4': case '5':
1815 case '6': case '7': case '8': case '9':
1816 if (!(syntax & RE_NO_BK_REFS))
1818 token->type = OP_BACK_REF;
1819 token->opr.idx = c2 - '1';
1821 break;
1822 case '<':
1823 if (!(syntax & RE_NO_GNU_OPS))
1825 token->type = ANCHOR;
1826 token->opr.ctx_type = WORD_FIRST;
1828 break;
1829 case '>':
1830 if (!(syntax & RE_NO_GNU_OPS))
1832 token->type = ANCHOR;
1833 token->opr.ctx_type = WORD_LAST;
1835 break;
1836 case 'b':
1837 if (!(syntax & RE_NO_GNU_OPS))
1839 token->type = ANCHOR;
1840 token->opr.ctx_type = WORD_DELIM;
1842 break;
1843 case 'B':
1844 if (!(syntax & RE_NO_GNU_OPS))
1846 token->type = ANCHOR;
1847 token->opr.ctx_type = NOT_WORD_DELIM;
1849 break;
1850 case 'w':
1851 if (!(syntax & RE_NO_GNU_OPS))
1852 token->type = OP_WORD;
1853 break;
1854 case 'W':
1855 if (!(syntax & RE_NO_GNU_OPS))
1856 token->type = OP_NOTWORD;
1857 break;
1858 case 's':
1859 if (!(syntax & RE_NO_GNU_OPS))
1860 token->type = OP_SPACE;
1861 break;
1862 case 'S':
1863 if (!(syntax & RE_NO_GNU_OPS))
1864 token->type = OP_NOTSPACE;
1865 break;
1866 case '`':
1867 if (!(syntax & RE_NO_GNU_OPS))
1869 token->type = ANCHOR;
1870 token->opr.ctx_type = BUF_FIRST;
1872 break;
1873 case '\'':
1874 if (!(syntax & RE_NO_GNU_OPS))
1876 token->type = ANCHOR;
1877 token->opr.ctx_type = BUF_LAST;
1879 break;
1880 case '(':
1881 if (!(syntax & RE_NO_BK_PARENS))
1882 token->type = OP_OPEN_SUBEXP;
1883 break;
1884 case ')':
1885 if (!(syntax & RE_NO_BK_PARENS))
1886 token->type = OP_CLOSE_SUBEXP;
1887 break;
1888 case '+':
1889 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1890 token->type = OP_DUP_PLUS;
1891 break;
1892 case '?':
1893 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1894 token->type = OP_DUP_QUESTION;
1895 break;
1896 case '{':
1897 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1898 token->type = OP_OPEN_DUP_NUM;
1899 break;
1900 case '}':
1901 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1902 token->type = OP_CLOSE_DUP_NUM;
1903 break;
1904 default:
1905 break;
1907 return 2;
1910 token->type = CHARACTER;
1911 #ifdef RE_ENABLE_I18N
1912 if (input->mb_cur_max > 1)
1914 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1915 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1917 else
1918 #endif
1919 token->word_char = IS_WORD_CHAR (token->opr.c);
1921 switch (c)
1923 case '\n':
1924 if (syntax & RE_NEWLINE_ALT)
1925 token->type = OP_ALT;
1926 break;
1927 case '|':
1928 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1929 token->type = OP_ALT;
1930 break;
1931 case '*':
1932 token->type = OP_DUP_ASTERISK;
1933 break;
1934 case '+':
1935 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1936 token->type = OP_DUP_PLUS;
1937 break;
1938 case '?':
1939 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1940 token->type = OP_DUP_QUESTION;
1941 break;
1942 case '{':
1943 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1944 token->type = OP_OPEN_DUP_NUM;
1945 break;
1946 case '}':
1947 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1948 token->type = OP_CLOSE_DUP_NUM;
1949 break;
1950 case '(':
1951 if (syntax & RE_NO_BK_PARENS)
1952 token->type = OP_OPEN_SUBEXP;
1953 break;
1954 case ')':
1955 if (syntax & RE_NO_BK_PARENS)
1956 token->type = OP_CLOSE_SUBEXP;
1957 break;
1958 case '[':
1959 token->type = OP_OPEN_BRACKET;
1960 break;
1961 case '.':
1962 token->type = OP_PERIOD;
1963 break;
1964 case '^':
1965 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1966 re_string_cur_idx (input) != 0)
1968 char prev = re_string_peek_byte (input, -1);
1969 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1970 break;
1972 token->type = ANCHOR;
1973 token->opr.ctx_type = LINE_FIRST;
1974 break;
1975 case '$':
1976 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1977 re_string_cur_idx (input) + 1 != re_string_length (input))
1979 re_token_t next;
1980 re_string_skip_bytes (input, 1);
1981 peek_token (&next, input, syntax);
1982 re_string_skip_bytes (input, -1);
1983 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1984 break;
1986 token->type = ANCHOR;
1987 token->opr.ctx_type = LINE_LAST;
1988 break;
1989 default:
1990 break;
1992 return 1;
1995 /* Peek a token from INPUT, and return the length of the token.
1996 We must not use this function out of bracket expressions. */
1998 static int
1999 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2001 unsigned char c;
2002 if (re_string_eoi (input))
2004 token->type = END_OF_RE;
2005 return 0;
2007 c = re_string_peek_byte (input, 0);
2008 token->opr.c = c;
2010 #ifdef RE_ENABLE_I18N
2011 if (input->mb_cur_max > 1 &&
2012 !re_string_first_byte (input, re_string_cur_idx (input)))
2014 token->type = CHARACTER;
2015 return 1;
2017 #endif /* RE_ENABLE_I18N */
2019 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2020 && re_string_cur_idx (input) + 1 < re_string_length (input))
2022 /* In this case, '\' escape a character. */
2023 unsigned char c2;
2024 re_string_skip_bytes (input, 1);
2025 c2 = re_string_peek_byte (input, 0);
2026 token->opr.c = c2;
2027 token->type = CHARACTER;
2028 return 1;
2030 if (c == '[') /* '[' is a special char in a bracket exps. */
2032 unsigned char c2;
2033 int token_len;
2034 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2035 c2 = re_string_peek_byte (input, 1);
2036 else
2037 c2 = 0;
2038 token->opr.c = c2;
2039 token_len = 2;
2040 switch (c2)
2042 case '.':
2043 token->type = OP_OPEN_COLL_ELEM;
2044 break;
2045 case '=':
2046 token->type = OP_OPEN_EQUIV_CLASS;
2047 break;
2048 case ':':
2049 if (syntax & RE_CHAR_CLASSES)
2051 token->type = OP_OPEN_CHAR_CLASS;
2052 break;
2054 /* else fall through. */
2055 default:
2056 token->type = CHARACTER;
2057 token->opr.c = c;
2058 token_len = 1;
2059 break;
2061 return token_len;
2063 switch (c)
2065 case '-':
2066 token->type = OP_CHARSET_RANGE;
2067 break;
2068 case ']':
2069 token->type = OP_CLOSE_BRACKET;
2070 break;
2071 case '^':
2072 token->type = OP_NON_MATCH_LIST;
2073 break;
2074 default:
2075 token->type = CHARACTER;
2077 return 1;
2080 /* Functions for parser. */
2082 /* Entry point of the parser.
2083 Parse the regular expression REGEXP and return the structure tree.
2084 If an error occurs, ERR is set by error code, and return NULL.
2085 This function build the following tree, from regular expression <reg_exp>:
2089 <reg_exp> EOR
2091 CAT means concatenation.
2092 EOR means end of regular expression. */
2094 static bin_tree_t *
2095 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2096 reg_errcode_t *err)
2098 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2099 bin_tree_t *tree, *eor, *root;
2100 re_token_t current_token;
2101 dfa->syntax = syntax;
2102 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2103 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2104 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2105 return NULL;
2106 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2107 if (tree != NULL)
2108 root = create_tree (dfa, tree, eor, CONCAT);
2109 else
2110 root = eor;
2111 if (BE (eor == NULL || root == NULL, 0))
2113 *err = REG_ESPACE;
2114 return NULL;
2116 return root;
2119 /* This function build the following tree, from regular expression
2120 <branch1>|<branch2>:
2124 <branch1> <branch2>
2126 ALT means alternative, which represents the operator '|'. */
2128 static bin_tree_t *
2129 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2130 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2132 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2133 bin_tree_t *tree, *branch = NULL;
2134 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2135 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2136 return NULL;
2138 while (token->type == OP_ALT)
2140 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2141 if (token->type != OP_ALT && token->type != END_OF_RE
2142 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2144 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2145 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2147 if (tree != NULL)
2148 postorder (tree, free_tree, NULL);
2149 return NULL;
2152 else
2153 branch = NULL;
2154 tree = create_tree (dfa, tree, branch, OP_ALT);
2155 if (BE (tree == NULL, 0))
2157 *err = REG_ESPACE;
2158 return NULL;
2161 return tree;
2164 /* This function build the following tree, from regular expression
2165 <exp1><exp2>:
2169 <exp1> <exp2>
2171 CAT means concatenation. */
2173 static bin_tree_t *
2174 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2175 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2177 bin_tree_t *tree, *exp;
2178 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2179 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2180 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2181 return NULL;
2183 while (token->type != OP_ALT && token->type != END_OF_RE
2184 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2186 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2187 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2189 if (tree != NULL)
2190 postorder (tree, free_tree, NULL);
2191 return NULL;
2193 if (tree != NULL && exp != NULL)
2195 bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2196 if (newtree == NULL)
2198 postorder (exp, free_tree, NULL);
2199 postorder (tree, free_tree, NULL);
2200 *err = REG_ESPACE;
2201 return NULL;
2203 tree = newtree;
2205 else if (tree == NULL)
2206 tree = exp;
2207 /* Otherwise exp == NULL, we don't need to create new tree. */
2209 return tree;
2212 /* This function build the following tree, from regular expression a*:
2218 static bin_tree_t *
2219 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2220 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2222 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2223 bin_tree_t *tree;
2224 switch (token->type)
2226 case CHARACTER:
2227 tree = create_token_tree (dfa, NULL, NULL, token);
2228 if (BE (tree == NULL, 0))
2230 *err = REG_ESPACE;
2231 return NULL;
2233 #ifdef RE_ENABLE_I18N
2234 if (dfa->mb_cur_max > 1)
2236 while (!re_string_eoi (regexp)
2237 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2239 bin_tree_t *mbc_remain;
2240 fetch_token (token, regexp, syntax);
2241 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2242 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2243 if (BE (mbc_remain == NULL || tree == NULL, 0))
2245 *err = REG_ESPACE;
2246 return NULL;
2250 #endif
2251 break;
2252 case OP_OPEN_SUBEXP:
2253 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2254 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2255 return NULL;
2256 break;
2257 case OP_OPEN_BRACKET:
2258 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2259 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2260 return NULL;
2261 break;
2262 case OP_BACK_REF:
2263 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2265 *err = REG_ESUBREG;
2266 return NULL;
2268 dfa->used_bkref_map |= 1 << token->opr.idx;
2269 tree = create_token_tree (dfa, NULL, NULL, token);
2270 if (BE (tree == NULL, 0))
2272 *err = REG_ESPACE;
2273 return NULL;
2275 ++dfa->nbackref;
2276 dfa->has_mb_node = 1;
2277 break;
2278 case OP_OPEN_DUP_NUM:
2279 if (syntax & RE_CONTEXT_INVALID_DUP)
2281 *err = REG_BADRPT;
2282 return NULL;
2284 /* FALLTHROUGH */
2285 case OP_DUP_ASTERISK:
2286 case OP_DUP_PLUS:
2287 case OP_DUP_QUESTION:
2288 if (syntax & RE_CONTEXT_INVALID_OPS)
2290 *err = REG_BADRPT;
2291 return NULL;
2293 else if (syntax & RE_CONTEXT_INDEP_OPS)
2295 fetch_token (token, regexp, syntax);
2296 return parse_expression (regexp, preg, token, syntax, nest, err);
2298 /* else fall through */
2299 case OP_CLOSE_SUBEXP:
2300 if ((token->type == OP_CLOSE_SUBEXP) &&
2301 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2303 *err = REG_ERPAREN;
2304 return NULL;
2306 /* else fall through */
2307 case OP_CLOSE_DUP_NUM:
2308 /* We treat it as a normal character. */
2310 /* Then we can these characters as normal characters. */
2311 token->type = CHARACTER;
2312 /* mb_partial and word_char bits should be initialized already
2313 by peek_token. */
2314 tree = create_token_tree (dfa, NULL, NULL, token);
2315 if (BE (tree == NULL, 0))
2317 *err = REG_ESPACE;
2318 return NULL;
2320 break;
2321 case ANCHOR:
2322 if ((token->opr.ctx_type
2323 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2324 && dfa->word_ops_used == 0)
2325 init_word_char (dfa);
2326 if (token->opr.ctx_type == WORD_DELIM
2327 || token->opr.ctx_type == NOT_WORD_DELIM)
2329 bin_tree_t *tree_first, *tree_last;
2330 if (token->opr.ctx_type == WORD_DELIM)
2332 token->opr.ctx_type = WORD_FIRST;
2333 tree_first = create_token_tree (dfa, NULL, NULL, token);
2334 token->opr.ctx_type = WORD_LAST;
2336 else
2338 token->opr.ctx_type = INSIDE_WORD;
2339 tree_first = create_token_tree (dfa, NULL, NULL, token);
2340 token->opr.ctx_type = INSIDE_NOTWORD;
2342 tree_last = create_token_tree (dfa, NULL, NULL, token);
2343 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2344 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2346 *err = REG_ESPACE;
2347 return NULL;
2350 else
2352 tree = create_token_tree (dfa, NULL, NULL, token);
2353 if (BE (tree == NULL, 0))
2355 *err = REG_ESPACE;
2356 return NULL;
2359 /* We must return here, since ANCHORs can't be followed
2360 by repetition operators.
2361 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2362 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2363 fetch_token (token, regexp, syntax);
2364 return tree;
2365 case OP_PERIOD:
2366 tree = create_token_tree (dfa, NULL, NULL, token);
2367 if (BE (tree == NULL, 0))
2369 *err = REG_ESPACE;
2370 return NULL;
2372 if (dfa->mb_cur_max > 1)
2373 dfa->has_mb_node = 1;
2374 break;
2375 case OP_WORD:
2376 case OP_NOTWORD:
2377 tree = build_charclass_op (dfa, regexp->trans,
2378 (const unsigned char *) "alnum",
2379 (const unsigned char *) "_",
2380 token->type == OP_NOTWORD, err);
2381 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2382 return NULL;
2383 break;
2384 case OP_SPACE:
2385 case OP_NOTSPACE:
2386 tree = build_charclass_op (dfa, regexp->trans,
2387 (const unsigned char *) "space",
2388 (const unsigned char *) "",
2389 token->type == OP_NOTSPACE, err);
2390 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2391 return NULL;
2392 break;
2393 case OP_ALT:
2394 case END_OF_RE:
2395 return NULL;
2396 case BACK_SLASH:
2397 *err = REG_EESCAPE;
2398 return NULL;
2399 default:
2400 /* Must not happen? */
2401 #ifdef DEBUG
2402 assert (0);
2403 #endif
2404 return NULL;
2406 fetch_token (token, regexp, syntax);
2408 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2409 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2411 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2412 if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2414 if (tree != NULL)
2415 postorder (tree, free_tree, NULL);
2416 return NULL;
2418 tree = dup_tree;
2419 /* In BRE consecutive duplications are not allowed. */
2420 if ((syntax & RE_CONTEXT_INVALID_DUP)
2421 && (token->type == OP_DUP_ASTERISK
2422 || token->type == OP_OPEN_DUP_NUM))
2424 if (tree != NULL)
2425 postorder (tree, free_tree, NULL);
2426 *err = REG_BADRPT;
2427 return NULL;
2431 return tree;
2434 /* This function build the following tree, from regular expression
2435 (<reg_exp>):
2436 SUBEXP
2438 <reg_exp>
2441 static bin_tree_t *
2442 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2443 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2445 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2446 bin_tree_t *tree;
2447 size_t cur_nsub;
2448 cur_nsub = preg->re_nsub++;
2450 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2452 /* The subexpression may be a null string. */
2453 if (token->type == OP_CLOSE_SUBEXP)
2454 tree = NULL;
2455 else
2457 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2458 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2460 if (tree != NULL)
2461 postorder (tree, free_tree, NULL);
2462 *err = REG_EPAREN;
2464 if (BE (*err != REG_NOERROR, 0))
2465 return NULL;
2468 if (cur_nsub <= '9' - '1')
2469 dfa->completed_bkref_map |= 1 << cur_nsub;
2471 tree = create_tree (dfa, tree, NULL, SUBEXP);
2472 if (BE (tree == NULL, 0))
2474 *err = REG_ESPACE;
2475 return NULL;
2477 tree->token.opr.idx = cur_nsub;
2478 return tree;
2481 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2483 static bin_tree_t *
2484 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2485 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2487 bin_tree_t *tree = NULL, *old_tree = NULL;
2488 int i, start, end, start_idx = re_string_cur_idx (regexp);
2489 re_token_t start_token = *token;
2491 if (token->type == OP_OPEN_DUP_NUM)
2493 end = 0;
2494 start = fetch_number (regexp, token, syntax);
2495 if (start == -1)
2497 if (token->type == CHARACTER && token->opr.c == ',')
2498 start = 0; /* We treat "{,m}" as "{0,m}". */
2499 else
2501 *err = REG_BADBR; /* <re>{} is invalid. */
2502 return NULL;
2505 if (BE (start != -2, 1))
2507 /* We treat "{n}" as "{n,n}". */
2508 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2509 : ((token->type == CHARACTER && token->opr.c == ',')
2510 ? fetch_number (regexp, token, syntax) : -2));
2512 if (BE (start == -2 || end == -2, 0))
2514 /* Invalid sequence. */
2515 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2517 if (token->type == END_OF_RE)
2518 *err = REG_EBRACE;
2519 else
2520 *err = REG_BADBR;
2522 return NULL;
2525 /* If the syntax bit is set, rollback. */
2526 re_string_set_index (regexp, start_idx);
2527 *token = start_token;
2528 token->type = CHARACTER;
2529 /* mb_partial and word_char bits should be already initialized by
2530 peek_token. */
2531 return elem;
2534 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2536 /* First number greater than second. */
2537 *err = REG_BADBR;
2538 return NULL;
2541 else
2543 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2544 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2547 fetch_token (token, regexp, syntax);
2549 if (BE (elem == NULL, 0))
2550 return NULL;
2551 if (BE (start == 0 && end == 0, 0))
2553 postorder (elem, free_tree, NULL);
2554 return NULL;
2557 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2558 if (BE (start > 0, 0))
2560 tree = elem;
2561 for (i = 2; i <= start; ++i)
2563 elem = duplicate_tree (elem, dfa);
2564 tree = create_tree (dfa, tree, elem, CONCAT);
2565 if (BE (elem == NULL || tree == NULL, 0))
2566 goto parse_dup_op_espace;
2569 if (start == end)
2570 return tree;
2572 /* Duplicate ELEM before it is marked optional. */
2573 elem = duplicate_tree (elem, dfa);
2574 if (BE (elem == NULL, 0))
2575 goto parse_dup_op_espace;
2576 old_tree = tree;
2578 else
2579 old_tree = NULL;
2581 if (elem->token.type == SUBEXP)
2582 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2584 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2585 if (BE (tree == NULL, 0))
2586 goto parse_dup_op_espace;
2588 /* This loop is actually executed only when end != -1,
2589 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2590 already created the start+1-th copy. */
2591 for (i = start + 2; i <= end; ++i)
2593 elem = duplicate_tree (elem, dfa);
2594 tree = create_tree (dfa, tree, elem, CONCAT);
2595 if (BE (elem == NULL || tree == NULL, 0))
2596 goto parse_dup_op_espace;
2598 tree = create_tree (dfa, tree, NULL, OP_ALT);
2599 if (BE (tree == NULL, 0))
2600 goto parse_dup_op_espace;
2603 if (old_tree)
2604 tree = create_tree (dfa, old_tree, tree, CONCAT);
2606 return tree;
2608 parse_dup_op_espace:
2609 *err = REG_ESPACE;
2610 return NULL;
2613 /* Size of the names for collating symbol/equivalence_class/character_class.
2614 I'm not sure, but maybe enough. */
2615 #define BRACKET_NAME_BUF_SIZE 32
2617 #ifndef _LIBC
2618 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2619 Build the range expression which starts from START_ELEM, and ends
2620 at END_ELEM. The result are written to MBCSET and SBCSET.
2621 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2622 mbcset->range_ends, is a pointer argument since we may
2623 update it. */
2625 static reg_errcode_t
2626 # ifdef RE_ENABLE_I18N
2627 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2628 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2629 # else /* not RE_ENABLE_I18N */
2630 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2631 bracket_elem_t *end_elem)
2632 # endif /* not RE_ENABLE_I18N */
2634 unsigned int start_ch, end_ch;
2635 /* Equivalence Classes and Character Classes can't be a range start/end. */
2636 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2637 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2639 return REG_ERANGE;
2641 /* We can handle no multi character collating elements without libc
2642 support. */
2643 if (BE ((start_elem->type == COLL_SYM
2644 && strlen ((char *) start_elem->opr.name) > 1)
2645 || (end_elem->type == COLL_SYM
2646 && strlen ((char *) end_elem->opr.name) > 1), 0))
2647 return REG_ECOLLATE;
2649 # ifdef RE_ENABLE_I18N
2651 wchar_t wc;
2652 wint_t start_wc;
2653 wint_t end_wc;
2654 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2656 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2657 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2658 : 0));
2659 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2660 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2661 : 0));
2662 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2663 ? __btowc (start_ch) : start_elem->opr.wch);
2664 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2665 ? __btowc (end_ch) : end_elem->opr.wch);
2666 if (start_wc == WEOF || end_wc == WEOF)
2667 return REG_ECOLLATE;
2668 cmp_buf[0] = start_wc;
2669 cmp_buf[4] = end_wc;
2670 if (__wcscoll (cmp_buf, cmp_buf + 4) > 0)
2671 return REG_ERANGE;
2673 /* Got valid collation sequence values, add them as a new entry.
2674 However, for !_LIBC we have no collation elements: if the
2675 character set is single byte, the single byte character set
2676 that we build below suffices. parse_bracket_exp passes
2677 no MBCSET if dfa->mb_cur_max == 1. */
2678 if (mbcset)
2680 /* Check the space of the arrays. */
2681 if (BE (*range_alloc == mbcset->nranges, 0))
2683 /* There is not enough space, need realloc. */
2684 wchar_t *new_array_start, *new_array_end;
2685 int new_nranges;
2687 /* +1 in case of mbcset->nranges is 0. */
2688 new_nranges = 2 * mbcset->nranges + 1;
2689 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2690 are NULL if *range_alloc == 0. */
2691 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2692 new_nranges);
2693 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2694 new_nranges);
2696 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2697 return REG_ESPACE;
2699 mbcset->range_starts = new_array_start;
2700 mbcset->range_ends = new_array_end;
2701 *range_alloc = new_nranges;
2704 mbcset->range_starts[mbcset->nranges] = start_wc;
2705 mbcset->range_ends[mbcset->nranges++] = end_wc;
2708 /* Build the table for single byte characters. */
2709 for (wc = 0; wc < SBC_MAX; ++wc)
2711 cmp_buf[2] = wc;
2712 if (__wcscoll (cmp_buf, cmp_buf + 2) <= 0
2713 && __wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2714 bitset_set (sbcset, wc);
2717 # else /* not RE_ENABLE_I18N */
2719 unsigned int ch;
2720 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2721 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2722 : 0));
2723 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2724 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2725 : 0));
2726 if (start_ch > end_ch)
2727 return REG_ERANGE;
2728 /* Build the table for single byte characters. */
2729 for (ch = 0; ch < SBC_MAX; ++ch)
2730 if (start_ch <= ch && ch <= end_ch)
2731 bitset_set (sbcset, ch);
2733 # endif /* not RE_ENABLE_I18N */
2734 return REG_NOERROR;
2736 #endif /* not _LIBC */
2738 #ifndef _LIBC
2739 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2740 Build the collating element which is represented by NAME.
2741 The result are written to MBCSET and SBCSET.
2742 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2743 pointer argument since we may update it. */
2745 static reg_errcode_t
2746 # ifdef RE_ENABLE_I18N
2747 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2748 int *coll_sym_alloc, const unsigned char *name)
2749 # else /* not RE_ENABLE_I18N */
2750 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2751 # endif /* not RE_ENABLE_I18N */
2753 size_t name_len = strlen ((const char *) name);
2754 if (BE (name_len != 1, 0))
2755 return REG_ECOLLATE;
2756 else
2758 bitset_set (sbcset, name[0]);
2759 return REG_NOERROR;
2762 #endif /* not _LIBC */
2764 /* This function parse bracket expression like "[abc]", "[a-c]",
2765 "[[.a-a.]]" etc. */
2767 static bin_tree_t *
2768 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2769 reg_syntax_t syntax, reg_errcode_t *err)
2771 #ifdef _LIBC
2772 const unsigned char *collseqmb;
2773 const char *collseqwc;
2774 uint32_t nrules;
2775 int32_t table_size;
2776 const int32_t *symb_table;
2777 const unsigned char *extra;
2779 /* Local function for parse_bracket_exp used in _LIBC environment.
2780 Seek the collating symbol entry corresponding to NAME.
2781 Return the index of the symbol in the SYMB_TABLE,
2782 or -1 if not found. */
2784 auto inline int32_t
2785 __attribute__ ((always_inline))
2786 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2788 int32_t elem;
2790 for (elem = 0; elem < table_size; elem++)
2791 if (symb_table[2 * elem] != 0)
2793 int32_t idx = symb_table[2 * elem + 1];
2794 /* Skip the name of collating element name. */
2795 idx += 1 + extra[idx];
2796 if (/* Compare the length of the name. */
2797 name_len == extra[idx]
2798 /* Compare the name. */
2799 && memcmp (name, &extra[idx + 1], name_len) == 0)
2800 /* Yep, this is the entry. */
2801 return elem;
2803 return -1;
2806 /* Local function for parse_bracket_exp used in _LIBC environment.
2807 Look up the collation sequence value of BR_ELEM.
2808 Return the value if succeeded, UINT_MAX otherwise. */
2810 auto inline unsigned int
2811 __attribute__ ((always_inline))
2812 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2814 if (br_elem->type == SB_CHAR)
2817 if (MB_CUR_MAX == 1)
2819 if (nrules == 0)
2820 return collseqmb[br_elem->opr.ch];
2821 else
2823 wint_t wc = __btowc (br_elem->opr.ch);
2824 return __collseq_table_lookup (collseqwc, wc);
2827 else if (br_elem->type == MB_CHAR)
2829 if (nrules != 0)
2830 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2832 else if (br_elem->type == COLL_SYM)
2834 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2835 if (nrules != 0)
2837 int32_t elem, idx;
2838 elem = seek_collating_symbol_entry (br_elem->opr.name,
2839 sym_name_len);
2840 if (elem != -1)
2842 /* We found the entry. */
2843 idx = symb_table[2 * elem + 1];
2844 /* Skip the name of collating element name. */
2845 idx += 1 + extra[idx];
2846 /* Skip the byte sequence of the collating element. */
2847 idx += 1 + extra[idx];
2848 /* Adjust for the alignment. */
2849 idx = (idx + 3) & ~3;
2850 /* Skip the multibyte collation sequence value. */
2851 idx += sizeof (unsigned int);
2852 /* Skip the wide char sequence of the collating element. */
2853 idx += sizeof (unsigned int) *
2854 (1 + *(unsigned int *) (extra + idx));
2855 /* Return the collation sequence value. */
2856 return *(unsigned int *) (extra + idx);
2858 else if (sym_name_len == 1)
2860 /* No valid character. Match it as a single byte
2861 character. */
2862 return collseqmb[br_elem->opr.name[0]];
2865 else if (sym_name_len == 1)
2866 return collseqmb[br_elem->opr.name[0]];
2868 return UINT_MAX;
2871 /* Local function for parse_bracket_exp used in _LIBC environment.
2872 Build the range expression which starts from START_ELEM, and ends
2873 at END_ELEM. The result are written to MBCSET and SBCSET.
2874 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2875 mbcset->range_ends, is a pointer argument since we may
2876 update it. */
2878 auto inline reg_errcode_t
2879 __attribute__ ((always_inline))
2880 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2881 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2883 unsigned int ch;
2884 uint32_t start_collseq;
2885 uint32_t end_collseq;
2887 /* Equivalence Classes and Character Classes can't be a range
2888 start/end. */
2889 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2890 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2892 return REG_ERANGE;
2894 start_collseq = lookup_collation_sequence_value (start_elem);
2895 end_collseq = lookup_collation_sequence_value (end_elem);
2896 /* Check start/end collation sequence values. */
2897 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2898 return REG_ECOLLATE;
2899 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2900 return REG_ERANGE;
2902 /* Got valid collation sequence values, add them as a new entry.
2903 However, if we have no collation elements, and the character set
2904 is single byte, the single byte character set that we
2905 build below suffices. */
2906 if (nrules > 0 || dfa->mb_cur_max > 1)
2908 /* Check the space of the arrays. */
2909 if (BE (*range_alloc == mbcset->nranges, 0))
2911 /* There is not enough space, need realloc. */
2912 uint32_t *new_array_start;
2913 uint32_t *new_array_end;
2914 int new_nranges;
2916 /* +1 in case of mbcset->nranges is 0. */
2917 new_nranges = 2 * mbcset->nranges + 1;
2918 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2919 new_nranges);
2920 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2921 new_nranges);
2923 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2924 return REG_ESPACE;
2926 mbcset->range_starts = new_array_start;
2927 mbcset->range_ends = new_array_end;
2928 *range_alloc = new_nranges;
2931 mbcset->range_starts[mbcset->nranges] = start_collseq;
2932 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2935 /* Build the table for single byte characters. */
2936 for (ch = 0; ch < SBC_MAX; ch++)
2938 uint32_t ch_collseq;
2940 if (MB_CUR_MAX == 1)
2942 if (nrules == 0)
2943 ch_collseq = collseqmb[ch];
2944 else
2945 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2946 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2947 bitset_set (sbcset, ch);
2949 return REG_NOERROR;
2952 /* Local function for parse_bracket_exp used in _LIBC environment.
2953 Build the collating element which is represented by NAME.
2954 The result are written to MBCSET and SBCSET.
2955 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2956 pointer argument since we may update it. */
2958 auto inline reg_errcode_t
2959 __attribute__ ((always_inline))
2960 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2961 int *coll_sym_alloc, const unsigned char *name)
2963 int32_t elem, idx;
2964 size_t name_len = strlen ((const char *) name);
2965 if (nrules != 0)
2967 elem = seek_collating_symbol_entry (name, name_len);
2968 if (elem != -1)
2970 /* We found the entry. */
2971 idx = symb_table[2 * elem + 1];
2972 /* Skip the name of collating element name. */
2973 idx += 1 + extra[idx];
2975 else if (name_len == 1)
2977 /* No valid character, treat it as a normal
2978 character. */
2979 bitset_set (sbcset, name[0]);
2980 return REG_NOERROR;
2982 else
2983 return REG_ECOLLATE;
2985 /* Got valid collation sequence, add it as a new entry. */
2986 /* Check the space of the arrays. */
2987 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2989 /* Not enough, realloc it. */
2990 /* +1 in case of mbcset->ncoll_syms is 0. */
2991 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2992 /* Use realloc since mbcset->coll_syms is NULL
2993 if *alloc == 0. */
2994 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2995 new_coll_sym_alloc);
2996 if (BE (new_coll_syms == NULL, 0))
2997 return REG_ESPACE;
2998 mbcset->coll_syms = new_coll_syms;
2999 *coll_sym_alloc = new_coll_sym_alloc;
3001 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3002 return REG_NOERROR;
3004 else
3006 if (BE (name_len != 1, 0))
3007 return REG_ECOLLATE;
3008 else
3010 bitset_set (sbcset, name[0]);
3011 return REG_NOERROR;
3015 #endif
3017 re_token_t br_token;
3018 re_bitset_ptr_t sbcset;
3019 #ifdef RE_ENABLE_I18N
3020 re_charset_t *mbcset;
3021 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3022 int equiv_class_alloc = 0, char_class_alloc = 0;
3023 #endif /* not RE_ENABLE_I18N */
3024 int non_match = 0;
3025 bin_tree_t *work_tree;
3026 int token_len;
3027 int first_round = 1;
3028 #ifdef _LIBC
3029 collseqmb = (const unsigned char *)
3030 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3031 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3032 if (nrules)
3035 if (MB_CUR_MAX > 1)
3037 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3038 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3039 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3040 _NL_COLLATE_SYMB_TABLEMB);
3041 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3042 _NL_COLLATE_SYMB_EXTRAMB);
3044 #endif
3045 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3046 #ifdef RE_ENABLE_I18N
3047 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3048 #endif /* RE_ENABLE_I18N */
3049 #ifdef RE_ENABLE_I18N
3050 if (BE (sbcset == NULL || mbcset == NULL, 0))
3051 #else
3052 if (BE (sbcset == NULL, 0))
3053 #endif /* RE_ENABLE_I18N */
3055 re_free (sbcset);
3056 #ifdef RE_ENABLE_I18N
3057 re_free (mbcset);
3058 #endif
3059 *err = REG_ESPACE;
3060 return NULL;
3063 token_len = peek_token_bracket (token, regexp, syntax);
3064 if (BE (token->type == END_OF_RE, 0))
3066 *err = REG_BADPAT;
3067 goto parse_bracket_exp_free_return;
3069 if (token->type == OP_NON_MATCH_LIST)
3071 #ifdef RE_ENABLE_I18N
3072 mbcset->non_match = 1;
3073 #endif /* not RE_ENABLE_I18N */
3074 non_match = 1;
3075 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3076 bitset_set (sbcset, '\n');
3077 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3078 token_len = peek_token_bracket (token, regexp, syntax);
3079 if (BE (token->type == END_OF_RE, 0))
3081 *err = REG_BADPAT;
3082 goto parse_bracket_exp_free_return;
3086 /* We treat the first ']' as a normal character. */
3087 if (token->type == OP_CLOSE_BRACKET)
3088 token->type = CHARACTER;
3090 while (1)
3092 bracket_elem_t start_elem, end_elem;
3093 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3094 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3095 reg_errcode_t ret;
3096 int token_len2 = 0, is_range_exp = 0;
3097 re_token_t token2;
3099 start_elem.opr.name = start_name_buf;
3100 start_elem.type = COLL_SYM;
3101 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3102 syntax, first_round);
3103 if (BE (ret != REG_NOERROR, 0))
3105 *err = ret;
3106 goto parse_bracket_exp_free_return;
3108 first_round = 0;
3110 /* Get information about the next token. We need it in any case. */
3111 token_len = peek_token_bracket (token, regexp, syntax);
3113 /* Do not check for ranges if we know they are not allowed. */
3114 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3116 if (BE (token->type == END_OF_RE, 0))
3118 *err = REG_EBRACK;
3119 goto parse_bracket_exp_free_return;
3121 if (token->type == OP_CHARSET_RANGE)
3123 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3124 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3125 if (BE (token2.type == END_OF_RE, 0))
3127 *err = REG_EBRACK;
3128 goto parse_bracket_exp_free_return;
3130 if (token2.type == OP_CLOSE_BRACKET)
3132 /* We treat the last '-' as a normal character. */
3133 re_string_skip_bytes (regexp, -token_len);
3134 token->type = CHARACTER;
3136 else
3137 is_range_exp = 1;
3141 if (is_range_exp == 1)
3143 end_elem.opr.name = end_name_buf;
3144 end_elem.type = COLL_SYM;
3145 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3146 dfa, syntax, 1);
3147 if (BE (ret != REG_NOERROR, 0))
3149 *err = ret;
3150 goto parse_bracket_exp_free_return;
3153 token_len = peek_token_bracket (token, regexp, syntax);
3155 #ifdef _LIBC
3156 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3157 &start_elem, &end_elem);
3158 #else
3159 # ifdef RE_ENABLE_I18N
3160 *err = build_range_exp (sbcset,
3161 dfa->mb_cur_max > 1 ? mbcset : NULL,
3162 &range_alloc, &start_elem, &end_elem);
3163 # else
3164 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3165 # endif
3166 #endif /* RE_ENABLE_I18N */
3167 if (BE (*err != REG_NOERROR, 0))
3168 goto parse_bracket_exp_free_return;
3170 else
3172 switch (start_elem.type)
3174 case SB_CHAR:
3175 bitset_set (sbcset, start_elem.opr.ch);
3176 break;
3177 #ifdef RE_ENABLE_I18N
3178 case MB_CHAR:
3179 /* Check whether the array has enough space. */
3180 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3182 wchar_t *new_mbchars;
3183 /* Not enough, realloc it. */
3184 /* +1 in case of mbcset->nmbchars is 0. */
3185 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3186 /* Use realloc since array is NULL if *alloc == 0. */
3187 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3188 mbchar_alloc);
3189 if (BE (new_mbchars == NULL, 0))
3190 goto parse_bracket_exp_espace;
3191 mbcset->mbchars = new_mbchars;
3193 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3194 break;
3195 #endif /* RE_ENABLE_I18N */
3196 case EQUIV_CLASS:
3197 *err = build_equiv_class (sbcset,
3198 #ifdef RE_ENABLE_I18N
3199 mbcset, &equiv_class_alloc,
3200 #endif /* RE_ENABLE_I18N */
3201 start_elem.opr.name);
3202 if (BE (*err != REG_NOERROR, 0))
3203 goto parse_bracket_exp_free_return;
3204 break;
3205 case COLL_SYM:
3206 *err = build_collating_symbol (sbcset,
3207 #ifdef RE_ENABLE_I18N
3208 mbcset, &coll_sym_alloc,
3209 #endif /* RE_ENABLE_I18N */
3210 start_elem.opr.name);
3211 if (BE (*err != REG_NOERROR, 0))
3212 goto parse_bracket_exp_free_return;
3213 break;
3214 case CHAR_CLASS:
3215 *err = build_charclass (regexp->trans, sbcset,
3216 #ifdef RE_ENABLE_I18N
3217 mbcset, &char_class_alloc,
3218 #endif /* RE_ENABLE_I18N */
3219 start_elem.opr.name, syntax);
3220 if (BE (*err != REG_NOERROR, 0))
3221 goto parse_bracket_exp_free_return;
3222 break;
3223 default:
3224 assert (0);
3225 break;
3228 if (BE (token->type == END_OF_RE, 0))
3230 *err = REG_EBRACK;
3231 goto parse_bracket_exp_free_return;
3233 if (token->type == OP_CLOSE_BRACKET)
3234 break;
3237 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3239 /* If it is non-matching list. */
3240 if (non_match)
3241 bitset_not (sbcset);
3243 #ifdef RE_ENABLE_I18N
3244 /* Ensure only single byte characters are set. */
3245 if (dfa->mb_cur_max > 1)
3246 bitset_mask (sbcset, dfa->sb_char);
3248 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3249 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3250 || mbcset->non_match)))
3252 bin_tree_t *mbc_tree;
3253 int sbc_idx;
3254 /* Build a tree for complex bracket. */
3255 dfa->has_mb_node = 1;
3256 br_token.type = COMPLEX_BRACKET;
3257 br_token.opr.mbcset = mbcset;
3258 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3259 if (BE (mbc_tree == NULL, 0))
3260 goto parse_bracket_exp_espace;
3261 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3262 if (sbcset[sbc_idx])
3263 break;
3264 /* If there are no bits set in sbcset, there is no point
3265 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3266 if (sbc_idx < BITSET_WORDS)
3268 /* Build a tree for simple bracket. */
3269 br_token.type = SIMPLE_BRACKET;
3270 br_token.opr.sbcset = sbcset;
3271 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3272 if (BE (work_tree == NULL, 0))
3273 goto parse_bracket_exp_espace;
3275 /* Then join them by ALT node. */
3276 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3277 if (BE (work_tree == NULL, 0))
3278 goto parse_bracket_exp_espace;
3280 else
3282 re_free (sbcset);
3283 work_tree = mbc_tree;
3286 else
3287 #endif /* not RE_ENABLE_I18N */
3289 #ifdef RE_ENABLE_I18N
3290 free_charset (mbcset);
3291 #endif
3292 /* Build a tree for simple bracket. */
3293 br_token.type = SIMPLE_BRACKET;
3294 br_token.opr.sbcset = sbcset;
3295 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3296 if (BE (work_tree == NULL, 0))
3297 goto parse_bracket_exp_espace;
3299 return work_tree;
3301 parse_bracket_exp_espace:
3302 *err = REG_ESPACE;
3303 parse_bracket_exp_free_return:
3304 re_free (sbcset);
3305 #ifdef RE_ENABLE_I18N
3306 free_charset (mbcset);
3307 #endif /* RE_ENABLE_I18N */
3308 return NULL;
3311 /* Parse an element in the bracket expression. */
3313 static reg_errcode_t
3314 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3315 re_token_t *token, int token_len, re_dfa_t *dfa,
3316 reg_syntax_t syntax, int accept_hyphen)
3318 #ifdef RE_ENABLE_I18N
3319 int cur_char_size;
3320 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3321 if (cur_char_size > 1)
3323 elem->type = MB_CHAR;
3324 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3325 re_string_skip_bytes (regexp, cur_char_size);
3326 return REG_NOERROR;
3328 #endif /* RE_ENABLE_I18N */
3329 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3330 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3331 || token->type == OP_OPEN_EQUIV_CLASS)
3332 return parse_bracket_symbol (elem, regexp, token);
3333 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3335 /* A '-' must only appear as anything but a range indicator before
3336 the closing bracket. Everything else is an error. */
3337 re_token_t token2;
3338 (void) peek_token_bracket (&token2, regexp, syntax);
3339 if (token2.type != OP_CLOSE_BRACKET)
3340 /* The actual error value is not standardized since this whole
3341 case is undefined. But ERANGE makes good sense. */
3342 return REG_ERANGE;
3344 elem->type = SB_CHAR;
3345 elem->opr.ch = token->opr.c;
3346 return REG_NOERROR;
3349 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3350 such as [:<character_class>:], [.<collating_element>.], and
3351 [=<equivalent_class>=]. */
3353 static reg_errcode_t
3354 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3355 re_token_t *token)
3357 unsigned char ch, delim = token->opr.c;
3358 int i = 0;
3359 if (re_string_eoi(regexp))
3360 return REG_EBRACK;
3361 for (;; ++i)
3363 if (i >= BRACKET_NAME_BUF_SIZE)
3364 return REG_EBRACK;
3365 if (token->type == OP_OPEN_CHAR_CLASS)
3366 ch = re_string_fetch_byte_case (regexp);
3367 else
3368 ch = re_string_fetch_byte (regexp);
3369 if (re_string_eoi(regexp))
3370 return REG_EBRACK;
3371 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3372 break;
3373 elem->opr.name[i] = ch;
3375 re_string_skip_bytes (regexp, 1);
3376 elem->opr.name[i] = '\0';
3377 switch (token->type)
3379 case OP_OPEN_COLL_ELEM:
3380 elem->type = COLL_SYM;
3381 break;
3382 case OP_OPEN_EQUIV_CLASS:
3383 elem->type = EQUIV_CLASS;
3384 break;
3385 case OP_OPEN_CHAR_CLASS:
3386 elem->type = CHAR_CLASS;
3387 break;
3388 default:
3389 break;
3391 return REG_NOERROR;
3394 /* Helper function for parse_bracket_exp.
3395 Build the equivalence class which is represented by NAME.
3396 The result are written to MBCSET and SBCSET.
3397 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3398 is a pointer argument since we may update it. */
3400 static reg_errcode_t
3401 #ifdef RE_ENABLE_I18N
3402 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3403 int *equiv_class_alloc, const unsigned char *name)
3404 #else /* not RE_ENABLE_I18N */
3405 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3406 #endif /* not RE_ENABLE_I18N */
3408 #ifdef _LIBC
3409 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3410 if (nrules != 0)
3412 const int32_t *table, *indirect;
3413 const unsigned char *weights, *extra, *cp;
3414 unsigned char char_buf[2];
3415 int32_t idx1, idx2;
3416 unsigned int ch;
3417 size_t len;
3418 /* Calculate the index for equivalence class. */
3419 cp = name;
3420 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3421 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3422 _NL_COLLATE_WEIGHTMB);
3423 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3424 _NL_COLLATE_EXTRAMB);
3425 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3426 _NL_COLLATE_INDIRECTMB);
3427 idx1 = findidx (table, indirect, extra, &cp, -1);
3428 if (BE (idx1 == 0 || *cp != '\0', 0))
3429 /* This isn't a valid character. */
3430 return REG_ECOLLATE;
3432 /* Build single byte matcing table for this equivalence class. */
3433 len = weights[idx1 & 0xffffff];
3434 for (ch = 0; ch < SBC_MAX; ++ch)
3436 char_buf[0] = ch;
3437 cp = char_buf;
3438 idx2 = findidx (table, indirect, extra, &cp, 1);
3440 idx2 = table[ch];
3442 if (idx2 == 0)
3443 /* This isn't a valid character. */
3444 continue;
3445 /* Compare only if the length matches and the collation rule
3446 index is the same. */
3447 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3449 int cnt = 0;
3451 while (cnt <= len &&
3452 weights[(idx1 & 0xffffff) + 1 + cnt]
3453 == weights[(idx2 & 0xffffff) + 1 + cnt])
3454 ++cnt;
3456 if (cnt > len)
3457 bitset_set (sbcset, ch);
3460 /* Check whether the array has enough space. */
3461 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3463 /* Not enough, realloc it. */
3464 /* +1 in case of mbcset->nequiv_classes is 0. */
3465 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3466 /* Use realloc since the array is NULL if *alloc == 0. */
3467 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3468 int32_t,
3469 new_equiv_class_alloc);
3470 if (BE (new_equiv_classes == NULL, 0))
3471 return REG_ESPACE;
3472 mbcset->equiv_classes = new_equiv_classes;
3473 *equiv_class_alloc = new_equiv_class_alloc;
3475 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3477 else
3478 #endif /* _LIBC */
3480 if (BE (strlen ((const char *) name) != 1, 0))
3481 return REG_ECOLLATE;
3482 bitset_set (sbcset, *name);
3484 return REG_NOERROR;
3487 /* Helper function for parse_bracket_exp.
3488 Build the character class which is represented by NAME.
3489 The result are written to MBCSET and SBCSET.
3490 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3491 is a pointer argument since we may update it. */
3493 static reg_errcode_t
3494 #ifdef RE_ENABLE_I18N
3495 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3496 re_charset_t *mbcset, int *char_class_alloc,
3497 const unsigned char *class_name, reg_syntax_t syntax)
3498 #else /* not RE_ENABLE_I18N */
3499 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3500 const unsigned char *class_name, reg_syntax_t syntax)
3501 #endif /* not RE_ENABLE_I18N */
3503 int i;
3504 const char *name = (const char *) class_name;
3506 /* In case of REG_ICASE "upper" and "lower" match the both of
3507 upper and lower cases. */
3508 if ((syntax & RE_ICASE)
3509 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3510 name = "alpha";
3512 #ifdef RE_ENABLE_I18N
3513 /* Check the space of the arrays. */
3514 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3516 /* Not enough, realloc it. */
3517 /* +1 in case of mbcset->nchar_classes is 0. */
3518 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3519 /* Use realloc since array is NULL if *alloc == 0. */
3520 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3521 new_char_class_alloc);
3522 if (BE (new_char_classes == NULL, 0))
3523 return REG_ESPACE;
3524 mbcset->char_classes = new_char_classes;
3525 *char_class_alloc = new_char_class_alloc;
3527 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3528 #endif /* RE_ENABLE_I18N */
3530 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3531 do { \
3532 if (BE (trans != NULL, 0)) \
3534 for (i = 0; i < SBC_MAX; ++i) \
3535 if (ctype_func (i)) \
3536 bitset_set (sbcset, trans[i]); \
3538 else \
3540 for (i = 0; i < SBC_MAX; ++i) \
3541 if (ctype_func (i)) \
3542 bitset_set (sbcset, i); \
3544 } while (0)
3546 if (strcmp (name, "alnum") == 0)
3547 BUILD_CHARCLASS_LOOP (isalnum);
3548 else if (strcmp (name, "cntrl") == 0)
3549 BUILD_CHARCLASS_LOOP (iscntrl);
3550 else if (strcmp (name, "lower") == 0)
3551 BUILD_CHARCLASS_LOOP (islower);
3552 else if (strcmp (name, "space") == 0)
3553 BUILD_CHARCLASS_LOOP (isspace);
3554 else if (strcmp (name, "alpha") == 0)
3555 BUILD_CHARCLASS_LOOP (isalpha);
3556 else if (strcmp (name, "digit") == 0)
3557 BUILD_CHARCLASS_LOOP (isdigit);
3558 else if (strcmp (name, "print") == 0)
3559 BUILD_CHARCLASS_LOOP (isprint);
3560 else if (strcmp (name, "upper") == 0)
3561 BUILD_CHARCLASS_LOOP (isupper);
3562 else if (strcmp (name, "blank") == 0)
3563 BUILD_CHARCLASS_LOOP (isblank);
3564 else if (strcmp (name, "graph") == 0)
3565 BUILD_CHARCLASS_LOOP (isgraph);
3566 else if (strcmp (name, "punct") == 0)
3567 BUILD_CHARCLASS_LOOP (ispunct);
3568 else if (strcmp (name, "xdigit") == 0)
3569 BUILD_CHARCLASS_LOOP (isxdigit);
3570 else
3571 return REG_ECTYPE;
3573 return REG_NOERROR;
3576 static bin_tree_t *
3577 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3578 const unsigned char *class_name,
3579 const unsigned char *extra, int non_match,
3580 reg_errcode_t *err)
3582 re_bitset_ptr_t sbcset;
3583 #ifdef RE_ENABLE_I18N
3584 re_charset_t *mbcset;
3585 int alloc = 0;
3586 #endif /* not RE_ENABLE_I18N */
3587 reg_errcode_t ret;
3588 re_token_t br_token;
3589 bin_tree_t *tree;
3591 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3592 #ifdef RE_ENABLE_I18N
3593 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3594 #endif /* RE_ENABLE_I18N */
3596 #ifdef RE_ENABLE_I18N
3597 if (BE (sbcset == NULL || mbcset == NULL, 0))
3598 #else /* not RE_ENABLE_I18N */
3599 if (BE (sbcset == NULL, 0))
3600 #endif /* not RE_ENABLE_I18N */
3602 *err = REG_ESPACE;
3603 return NULL;
3606 if (non_match)
3608 #ifdef RE_ENABLE_I18N
3609 mbcset->non_match = 1;
3610 #endif /* not RE_ENABLE_I18N */
3613 /* We don't care the syntax in this case. */
3614 ret = build_charclass (trans, sbcset,
3615 #ifdef RE_ENABLE_I18N
3616 mbcset, &alloc,
3617 #endif /* RE_ENABLE_I18N */
3618 class_name, 0);
3620 if (BE (ret != REG_NOERROR, 0))
3622 re_free (sbcset);
3623 #ifdef RE_ENABLE_I18N
3624 free_charset (mbcset);
3625 #endif /* RE_ENABLE_I18N */
3626 *err = ret;
3627 return NULL;
3629 /* \w match '_' also. */
3630 for (; *extra; extra++)
3631 bitset_set (sbcset, *extra);
3633 /* If it is non-matching list. */
3634 if (non_match)
3635 bitset_not (sbcset);
3637 #ifdef RE_ENABLE_I18N
3638 /* Ensure only single byte characters are set. */
3639 if (dfa->mb_cur_max > 1)
3640 bitset_mask (sbcset, dfa->sb_char);
3641 #endif
3643 /* Build a tree for simple bracket. */
3644 br_token.type = SIMPLE_BRACKET;
3645 br_token.opr.sbcset = sbcset;
3646 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3647 if (BE (tree == NULL, 0))
3648 goto build_word_op_espace;
3650 #ifdef RE_ENABLE_I18N
3651 if (dfa->mb_cur_max > 1)
3653 bin_tree_t *mbc_tree;
3654 /* Build a tree for complex bracket. */
3655 br_token.type = COMPLEX_BRACKET;
3656 br_token.opr.mbcset = mbcset;
3657 dfa->has_mb_node = 1;
3658 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3659 if (BE (mbc_tree == NULL, 0))
3660 goto build_word_op_espace;
3661 /* Then join them by ALT node. */
3662 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3663 if (BE (mbc_tree != NULL, 1))
3664 return tree;
3666 else
3668 free_charset (mbcset);
3669 return tree;
3671 #else /* not RE_ENABLE_I18N */
3672 return tree;
3673 #endif /* not RE_ENABLE_I18N */
3675 build_word_op_espace:
3676 re_free (sbcset);
3677 #ifdef RE_ENABLE_I18N
3678 free_charset (mbcset);
3679 #endif /* RE_ENABLE_I18N */
3680 *err = REG_ESPACE;
3681 return NULL;
3684 /* This is intended for the expressions like "a{1,3}".
3685 Fetch a number from `input', and return the number.
3686 Return -1, if the number field is empty like "{,1}".
3687 Return -2, If an error is occured. */
3689 static int
3690 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3692 int num = -1;
3693 unsigned char c;
3694 while (1)
3696 fetch_token (token, input, syntax);
3697 c = token->opr.c;
3698 if (BE (token->type == END_OF_RE, 0))
3699 return -2;
3700 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3701 break;
3702 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3703 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3704 num = (num > RE_DUP_MAX) ? -2 : num;
3706 return num;
3709 #ifdef RE_ENABLE_I18N
3710 static void
3711 free_charset (re_charset_t *cset)
3713 re_free (cset->mbchars);
3714 # ifdef _LIBC
3715 re_free (cset->coll_syms);
3716 re_free (cset->equiv_classes);
3717 re_free (cset->range_starts);
3718 re_free (cset->range_ends);
3719 # endif
3720 re_free (cset->char_classes);
3721 re_free (cset);
3723 #endif /* RE_ENABLE_I18N */
3725 /* Functions for binary tree operation. */
3727 /* Create a tree node. */
3729 static bin_tree_t *
3730 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3731 re_token_type_t type)
3733 re_token_t t;
3734 t.type = type;
3735 return create_token_tree (dfa, left, right, &t);
3738 static bin_tree_t *
3739 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3740 const re_token_t *token)
3742 bin_tree_t *tree;
3743 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3745 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3747 if (storage == NULL)
3748 return NULL;
3749 storage->next = dfa->str_tree_storage;
3750 dfa->str_tree_storage = storage;
3751 dfa->str_tree_storage_idx = 0;
3753 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3755 tree->parent = NULL;
3756 tree->left = left;
3757 tree->right = right;
3758 tree->token = *token;
3759 tree->token.duplicated = 0;
3760 tree->token.opt_subexp = 0;
3761 tree->first = NULL;
3762 tree->next = NULL;
3763 tree->node_idx = -1;
3765 if (left != NULL)
3766 left->parent = tree;
3767 if (right != NULL)
3768 right->parent = tree;
3769 return tree;
3772 /* Mark the tree SRC as an optional subexpression.
3773 To be called from preorder or postorder. */
3775 static reg_errcode_t
3776 mark_opt_subexp (void *extra, bin_tree_t *node)
3778 int idx = (int) (long) extra;
3779 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3780 node->token.opt_subexp = 1;
3782 return REG_NOERROR;
3785 /* Free the allocated memory inside NODE. */
3787 static void
3788 free_token (re_token_t *node)
3790 #ifdef RE_ENABLE_I18N
3791 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3792 free_charset (node->opr.mbcset);
3793 else
3794 #endif /* RE_ENABLE_I18N */
3795 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3796 re_free (node->opr.sbcset);
3799 /* Worker function for tree walking. Free the allocated memory inside NODE
3800 and its children. */
3802 static reg_errcode_t
3803 free_tree (void *extra, bin_tree_t *node)
3805 free_token (&node->token);
3806 return REG_NOERROR;
3810 /* Duplicate the node SRC, and return new node. This is a preorder
3811 visit similar to the one implemented by the generic visitor, but
3812 we need more infrastructure to maintain two parallel trees --- so,
3813 it's easier to duplicate. */
3815 static bin_tree_t *
3816 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3818 const bin_tree_t *node;
3819 bin_tree_t *dup_root;
3820 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3822 for (node = root; ; )
3824 /* Create a new tree and link it back to the current parent. */
3825 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3826 if (*p_new == NULL)
3827 return NULL;
3828 (*p_new)->parent = dup_node;
3829 (*p_new)->token.duplicated = 1;
3830 dup_node = *p_new;
3832 /* Go to the left node, or up and to the right. */
3833 if (node->left)
3835 node = node->left;
3836 p_new = &dup_node->left;
3838 else
3840 const bin_tree_t *prev = NULL;
3841 while (node->right == prev || node->right == NULL)
3843 prev = node;
3844 node = node->parent;
3845 dup_node = dup_node->parent;
3846 if (!node)
3847 return dup_root;
3849 node = node->right;
3850 p_new = &dup_node->right;