time: Use 64-bit time values for time zone parsing
[glibc.git] / posix / regcomp.c
blobf5c09febb914d69d9bc5b4440d17032c548a58a2
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 #include <stdint.h>
22 #ifdef _LIBC
23 # include <locale/weight.h>
24 #endif
26 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
27 size_t length, reg_syntax_t syntax);
28 static void re_compile_fastmap_iter (regex_t *bufp,
29 const re_dfastate_t *init_state,
30 char *fastmap);
31 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
32 #ifdef RE_ENABLE_I18N
33 static void free_charset (re_charset_t *cset);
34 #endif /* RE_ENABLE_I18N */
35 static void free_workarea_compile (regex_t *preg);
36 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
37 #ifdef RE_ENABLE_I18N
38 static void optimize_utf8 (re_dfa_t *dfa);
39 #endif
40 static reg_errcode_t analyze (regex_t *preg);
41 static reg_errcode_t preorder (bin_tree_t *root,
42 reg_errcode_t (fn (void *, bin_tree_t *)),
43 void *extra);
44 static reg_errcode_t postorder (bin_tree_t *root,
45 reg_errcode_t (fn (void *, bin_tree_t *)),
46 void *extra);
47 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
48 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
49 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
50 bin_tree_t *node);
51 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
52 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
53 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
54 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
55 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
56 unsigned int constraint);
57 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
58 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
59 int node, int root);
60 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
61 static int fetch_number (re_string_t *input, re_token_t *token,
62 reg_syntax_t syntax);
63 static int peek_token (re_token_t *token, re_string_t *input,
64 reg_syntax_t syntax);
65 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
66 reg_syntax_t syntax, reg_errcode_t *err);
67 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
74 re_token_t *token, reg_syntax_t syntax,
75 int nest, reg_errcode_t *err);
76 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
77 re_token_t *token, reg_syntax_t syntax,
78 int nest, reg_errcode_t *err);
79 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
80 re_dfa_t *dfa, re_token_t *token,
81 reg_syntax_t syntax, reg_errcode_t *err);
82 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
83 re_token_t *token, reg_syntax_t syntax,
84 reg_errcode_t *err);
85 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
86 re_string_t *regexp,
87 re_token_t *token, int token_len,
88 re_dfa_t *dfa,
89 reg_syntax_t syntax,
90 int accept_hyphen);
91 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
92 re_string_t *regexp,
93 re_token_t *token);
94 #ifdef RE_ENABLE_I18N
95 static reg_errcode_t build_equiv_class (bitset_t sbcset,
96 re_charset_t *mbcset,
97 int *equiv_class_alloc,
98 const unsigned char *name);
99 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
100 bitset_t sbcset,
101 re_charset_t *mbcset,
102 int *char_class_alloc,
103 const unsigned char *class_name,
104 reg_syntax_t syntax);
105 #else /* not RE_ENABLE_I18N */
106 static reg_errcode_t build_equiv_class (bitset_t sbcset,
107 const unsigned char *name);
108 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
109 bitset_t sbcset,
110 const unsigned char *class_name,
111 reg_syntax_t syntax);
112 #endif /* not RE_ENABLE_I18N */
113 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
114 RE_TRANSLATE_TYPE trans,
115 const unsigned char *class_name,
116 const unsigned char *extra,
117 int non_match, reg_errcode_t *err);
118 static bin_tree_t *create_tree (re_dfa_t *dfa,
119 bin_tree_t *left, bin_tree_t *right,
120 re_token_type_t type);
121 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
122 bin_tree_t *left, bin_tree_t *right,
123 const re_token_t *token);
124 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
125 static void free_token (re_token_t *node);
126 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
127 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
129 /* This table gives an error message for each of the error codes listed
130 in regex.h. Obviously the order here has to be same as there.
131 POSIX doesn't require that we do anything for REG_NOERROR,
132 but why not be nice? */
134 const char __re_error_msgid[] attribute_hidden =
136 #define REG_NOERROR_IDX 0
137 gettext_noop ("Success") /* REG_NOERROR */
138 "\0"
139 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
140 gettext_noop ("No match") /* REG_NOMATCH */
141 "\0"
142 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
143 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
144 "\0"
145 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
146 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
147 "\0"
148 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
149 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
150 "\0"
151 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
152 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
153 "\0"
154 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
155 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
156 "\0"
157 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
158 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
159 "\0"
160 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
161 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
162 "\0"
163 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
164 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
165 "\0"
166 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
167 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
168 "\0"
169 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
170 gettext_noop ("Invalid range end") /* REG_ERANGE */
171 "\0"
172 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
173 gettext_noop ("Memory exhausted") /* REG_ESPACE */
174 "\0"
175 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
176 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
177 "\0"
178 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
179 gettext_noop ("Premature end of regular expression") /* REG_EEND */
180 "\0"
181 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
182 gettext_noop ("Regular expression too big") /* REG_ESIZE */
183 "\0"
184 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
185 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
188 const size_t __re_error_msgid_idx[] attribute_hidden =
190 REG_NOERROR_IDX,
191 REG_NOMATCH_IDX,
192 REG_BADPAT_IDX,
193 REG_ECOLLATE_IDX,
194 REG_ECTYPE_IDX,
195 REG_EESCAPE_IDX,
196 REG_ESUBREG_IDX,
197 REG_EBRACK_IDX,
198 REG_EPAREN_IDX,
199 REG_EBRACE_IDX,
200 REG_BADBR_IDX,
201 REG_ERANGE_IDX,
202 REG_ESPACE_IDX,
203 REG_BADRPT_IDX,
204 REG_EEND_IDX,
205 REG_ESIZE_IDX,
206 REG_ERPAREN_IDX
209 /* Entry points for GNU code. */
211 /* re_compile_pattern is the GNU regular expression compiler: it
212 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
213 Returns 0 if the pattern was valid, otherwise an error string.
215 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
216 are set in BUFP on entry. */
218 const char *
219 re_compile_pattern (const char *pattern, size_t length,
220 struct re_pattern_buffer *bufp)
222 reg_errcode_t ret;
224 /* And GNU code determines whether or not to get register information
225 by passing null for the REGS argument to re_match, etc., not by
226 setting no_sub, unless RE_NO_SUB is set. */
227 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
229 /* Match anchors at newline. */
230 bufp->newline_anchor = 1;
232 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
234 if (!ret)
235 return NULL;
236 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
238 #ifdef _LIBC
239 weak_alias (__re_compile_pattern, re_compile_pattern)
240 #endif
242 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
243 also be assigned to arbitrarily: each pattern buffer stores its own
244 syntax, so it can be changed between regex compilations. */
245 /* This has no initializer because initialized variables in Emacs
246 become read-only after dumping. */
247 reg_syntax_t re_syntax_options;
250 /* Specify the precise syntax of regexps for compilation. This provides
251 for compatibility for various utilities which historically have
252 different, incompatible syntaxes.
254 The argument SYNTAX is a bit mask comprised of the various bits
255 defined in regex.h. We return the old syntax. */
257 reg_syntax_t
258 re_set_syntax (reg_syntax_t syntax)
260 reg_syntax_t ret = re_syntax_options;
262 re_syntax_options = syntax;
263 return ret;
265 #ifdef _LIBC
266 weak_alias (__re_set_syntax, re_set_syntax)
267 #endif
270 re_compile_fastmap (struct re_pattern_buffer *bufp)
272 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
273 char *fastmap = bufp->fastmap;
275 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
276 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
277 if (dfa->init_state != dfa->init_state_word)
278 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
279 if (dfa->init_state != dfa->init_state_nl)
280 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
281 if (dfa->init_state != dfa->init_state_begbuf)
282 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
283 bufp->fastmap_accurate = 1;
284 return 0;
286 #ifdef _LIBC
287 weak_alias (__re_compile_fastmap, re_compile_fastmap)
288 #endif
/* Mark byte CH in FASTMAP.  When ICASE is set, the entry for
   tolower (CH) is marked as well.  */
static inline void
__attribute__ ((always_inline))
re_set_fastmap (char *fastmap, bool icase, int ch)
{
  fastmap[ch] = 1;
  if (!icase)
    return;
  fastmap[tolower (ch)] = 1;
}
299 /* Helper function for re_compile_fastmap.
300 Compile fastmap for the initial_state INIT_STATE. */
302 static void
303 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
304 char *fastmap)
306 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
307 int node_cnt;
308 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
309 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
311 int node = init_state->nodes.elems[node_cnt];
312 re_token_type_t type = dfa->nodes[node].type;
314 if (type == CHARACTER)
316 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
317 #ifdef RE_ENABLE_I18N
318 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
320 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
321 wchar_t wc;
322 mbstate_t state;
324 p = buf;
325 *p++ = dfa->nodes[node].opr.c;
326 while (++node < dfa->nodes_len
327 && dfa->nodes[node].type == CHARACTER
328 && dfa->nodes[node].mb_partial)
329 *p++ = dfa->nodes[node].opr.c;
330 memset (&state, '\0', sizeof (state));
331 if (__mbrtowc (&wc, (const char *) buf, p - buf,
332 &state) == p - buf
333 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
334 != (size_t) -1))
335 re_set_fastmap (fastmap, 0, buf[0]);
337 #endif
339 else if (type == SIMPLE_BRACKET)
341 int i, ch;
342 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
344 int j;
345 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
346 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
347 if (w & ((bitset_word_t) 1 << j))
348 re_set_fastmap (fastmap, icase, ch);
351 #ifdef RE_ENABLE_I18N
352 else if (type == COMPLEX_BRACKET)
354 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
355 int i;
357 # ifdef _LIBC
358 /* See if we have to try all bytes which start multiple collation
359 elements.
360 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
361 collation element, and don't catch 'b' since 'b' is
362 the only collation element which starts from 'b' (and
363 it is caught by SIMPLE_BRACKET). */
364 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
365 && (cset->ncoll_syms || cset->nranges))
367 const int32_t *table = (const int32_t *)
368 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
369 for (i = 0; i < SBC_MAX; ++i)
370 if (table[i] < 0)
371 re_set_fastmap (fastmap, icase, i);
373 # endif /* _LIBC */
375 /* See if we have to start the match at all multibyte characters,
376 i.e. where we would not find an invalid sequence. This only
377 applies to multibyte character sets; for single byte character
378 sets, the SIMPLE_BRACKET again suffices. */
379 if (dfa->mb_cur_max > 1
380 && (cset->nchar_classes || cset->non_match || cset->nranges
381 # ifdef _LIBC
382 || cset->nequiv_classes
383 # endif /* _LIBC */
386 unsigned char c = 0;
389 mbstate_t mbs;
390 memset (&mbs, 0, sizeof (mbs));
391 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
392 re_set_fastmap (fastmap, false, (int) c);
394 while (++c != 0);
397 else
399 /* ... Else catch all bytes which can start the mbchars. */
400 for (i = 0; i < cset->nmbchars; ++i)
402 char buf[256];
403 mbstate_t state;
404 memset (&state, '\0', sizeof (state));
405 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
406 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
407 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
409 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
410 != (size_t) -1)
411 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
416 #endif /* RE_ENABLE_I18N */
417 else if (type == OP_PERIOD
418 #ifdef RE_ENABLE_I18N
419 || type == OP_UTF8_PERIOD
420 #endif /* RE_ENABLE_I18N */
421 || type == END_OF_RE)
423 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
424 if (type == END_OF_RE)
425 bufp->can_be_null = 1;
426 return;
431 /* Entry point for POSIX code. */
432 /* regcomp takes a regular expression as a string and compiles it.
434 PREG is a regex_t *. We do not expect any fields to be initialized,
435 since POSIX says we shouldn't. Thus, we set
437 'buffer' to the compiled pattern;
438 'used' to the length of the compiled pattern;
439 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
440 REG_EXTENDED bit in CFLAGS is set; otherwise, to
441 RE_SYNTAX_POSIX_BASIC;
442 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
443 'fastmap' to an allocated space for the fastmap;
444 'fastmap_accurate' to zero;
445 're_nsub' to the number of subexpressions in PATTERN.
447 PATTERN is the address of the pattern string.
449 CFLAGS is a series of bits which affect compilation.
451 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
452 use POSIX basic syntax.
454 If REG_NEWLINE is set, then . and [^...] don't match newline.
455 Also, regexec will try a match beginning after every newline.
457 If REG_ICASE is set, then we considers upper- and lowercase
458 versions of letters to be equivalent when matching.
460 If REG_NOSUB is set, then when PREG is passed to regexec, that
461 routine will report only success or failure, and nothing about the
462 registers.
464 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
465 the return codes and their meanings.) */
468 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
470 reg_errcode_t ret;
471 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
472 : RE_SYNTAX_POSIX_BASIC);
474 preg->buffer = NULL;
475 preg->allocated = 0;
476 preg->used = 0;
478 /* Try to allocate space for the fastmap. */
479 preg->fastmap = re_malloc (char, SBC_MAX);
480 if (BE (preg->fastmap == NULL, 0))
481 return REG_ESPACE;
483 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
485 /* If REG_NEWLINE is set, newlines are treated differently. */
486 if (cflags & REG_NEWLINE)
487 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
488 syntax &= ~RE_DOT_NEWLINE;
489 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
490 /* It also changes the matching behavior. */
491 preg->newline_anchor = 1;
493 else
494 preg->newline_anchor = 0;
495 preg->no_sub = !!(cflags & REG_NOSUB);
496 preg->translate = NULL;
498 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
500 /* POSIX doesn't distinguish between an unmatched open-group and an
501 unmatched close-group: both are REG_EPAREN. */
502 if (ret == REG_ERPAREN)
503 ret = REG_EPAREN;
505 /* We have already checked preg->fastmap != NULL. */
506 if (BE (ret == REG_NOERROR, 1))
507 /* Compute the fastmap now, since regexec cannot modify the pattern
508 buffer. This function never fails in this implementation. */
509 (void) re_compile_fastmap (preg);
510 else
512 /* Some error occurred while compiling the expression. */
513 re_free (preg->fastmap);
514 preg->fastmap = NULL;
517 return (int) ret;
519 #ifdef _LIBC
520 libc_hidden_def (__regcomp)
521 weak_alias (__regcomp, regcomp)
522 #endif
524 /* Returns a message corresponding to an error code, ERRCODE, returned
525 from either regcomp or regexec. We don't use PREG here. */
527 size_t
528 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
529 size_t errbuf_size)
531 const char *msg;
532 size_t msg_size;
534 if (BE (errcode < 0
535 || errcode >= (int) (sizeof (__re_error_msgid_idx)
536 / sizeof (__re_error_msgid_idx[0])), 0))
537 /* Only error codes returned by the rest of the code should be passed
538 to this routine. If we are given anything else, or if other regex
539 code generates an invalid error code, then the program has a bug.
540 Dump core so we can fix it. */
541 abort ();
543 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
545 msg_size = strlen (msg) + 1; /* Includes the null. */
547 if (BE (errbuf_size != 0, 1))
549 if (BE (msg_size > errbuf_size, 0))
551 #if defined HAVE_MEMPCPY || defined _LIBC
552 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
553 #else
554 memcpy (errbuf, msg, errbuf_size - 1);
555 errbuf[errbuf_size - 1] = 0;
556 #endif
558 else
559 memcpy (errbuf, msg, msg_size);
562 return msg_size;
564 #ifdef _LIBC
565 weak_alias (__regerror, regerror)
566 #endif
#ifdef RE_ENABLE_I18N
/* Shared, read-only map of the single-byte characters under UTF-8.
   Keeping it static avoids allocating and filling an identical bitset
   for every pattern; UTF-8 is the preferred encoding, so the saving is
   worthwhile.  */
static const bitset_t utf8_sb_map =
{
  /* Only the first 128 bits are set.  */
  [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
};
#endif
582 static void
583 free_dfa_content (re_dfa_t *dfa)
585 int i, j;
587 if (dfa->nodes)
588 for (i = 0; i < dfa->nodes_len; ++i)
589 free_token (dfa->nodes + i);
590 re_free (dfa->nexts);
591 for (i = 0; i < dfa->nodes_len; ++i)
593 if (dfa->eclosures != NULL)
594 re_node_set_free (dfa->eclosures + i);
595 if (dfa->inveclosures != NULL)
596 re_node_set_free (dfa->inveclosures + i);
597 if (dfa->edests != NULL)
598 re_node_set_free (dfa->edests + i);
600 re_free (dfa->edests);
601 re_free (dfa->eclosures);
602 re_free (dfa->inveclosures);
603 re_free (dfa->nodes);
605 if (dfa->state_table)
606 for (i = 0; i <= dfa->state_hash_mask; ++i)
608 struct re_state_table_entry *entry = dfa->state_table + i;
609 for (j = 0; j < entry->num; ++j)
611 re_dfastate_t *state = entry->array[j];
612 free_state (state);
614 re_free (entry->array);
616 re_free (dfa->state_table);
617 #ifdef RE_ENABLE_I18N
618 if (dfa->sb_char != utf8_sb_map)
619 re_free (dfa->sb_char);
620 #endif
621 re_free (dfa->subexp_map);
622 #ifdef DEBUG
623 re_free (dfa->re_str);
624 #endif
626 re_free (dfa);
630 /* Free dynamically allocated space used by PREG. */
632 void
633 regfree (regex_t *preg)
635 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
636 if (BE (dfa != NULL, 1))
637 free_dfa_content (dfa);
638 preg->buffer = NULL;
639 preg->allocated = 0;
641 re_free (preg->fastmap);
642 preg->fastmap = NULL;
644 re_free (preg->translate);
645 preg->translate = NULL;
647 #ifdef _LIBC
648 libc_hidden_def (__regfree)
649 weak_alias (__regfree, regfree)
650 #endif
652 /* Entry points compatible with 4.2 BSD regex library. We don't define
653 them unless specifically requested. */
655 #if defined _REGEX_RE_COMP || defined _LIBC
657 /* BSD has one and only one pattern buffer. */
658 static struct re_pattern_buffer re_comp_buf;
660 char *
661 # ifdef _LIBC
662 /* Make these definitions weak in libc, so POSIX programs can redefine
663 these names if they don't use our functions, and still use
664 regcomp/regexec above without link errors. */
665 weak_function
666 # endif
667 re_comp (const char *s)
669 reg_errcode_t ret;
670 char *fastmap;
672 if (!s)
674 if (!re_comp_buf.buffer)
675 return gettext ("No previous regular expression");
676 return 0;
679 if (re_comp_buf.buffer)
681 fastmap = re_comp_buf.fastmap;
682 re_comp_buf.fastmap = NULL;
683 __regfree (&re_comp_buf);
684 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
685 re_comp_buf.fastmap = fastmap;
688 if (re_comp_buf.fastmap == NULL)
690 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
691 if (re_comp_buf.fastmap == NULL)
692 return (char *) gettext (__re_error_msgid
693 + __re_error_msgid_idx[(int) REG_ESPACE]);
696 /* Since 're_exec' always passes NULL for the 'regs' argument, we
697 don't need to initialize the pattern buffer fields which affect it. */
699 /* Match anchors at newlines. */
700 re_comp_buf.newline_anchor = 1;
702 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
704 if (!ret)
705 return NULL;
707 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
708 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
#ifdef _LIBC
/* Resource-release hook: free the pattern held in the shared BSD
   pattern buffer.  */
libc_freeres_fn (free_mem)
{
  __regfree (&re_comp_buf);
}
#endif
718 #endif /* _REGEX_RE_COMP */
720 /* Internal entry point.
721 Compile the regular expression PATTERN, whose length is LENGTH.
722 SYNTAX indicate regular expression's syntax. */
724 static reg_errcode_t
725 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
726 reg_syntax_t syntax)
728 reg_errcode_t err = REG_NOERROR;
729 re_dfa_t *dfa;
730 re_string_t regexp;
732 /* Initialize the pattern buffer. */
733 preg->fastmap_accurate = 0;
734 preg->syntax = syntax;
735 preg->not_bol = preg->not_eol = 0;
736 preg->used = 0;
737 preg->re_nsub = 0;
738 preg->can_be_null = 0;
739 preg->regs_allocated = REGS_UNALLOCATED;
741 /* Initialize the dfa. */
742 dfa = (re_dfa_t *) preg->buffer;
743 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
745 /* If zero allocated, but buffer is non-null, try to realloc
746 enough space. This loses if buffer's address is bogus, but
747 that is the user's responsibility. If ->buffer is NULL this
748 is a simple allocation. */
749 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
750 if (dfa == NULL)
751 return REG_ESPACE;
752 preg->allocated = sizeof (re_dfa_t);
753 preg->buffer = (unsigned char *) dfa;
755 preg->used = sizeof (re_dfa_t);
757 err = init_dfa (dfa, length);
758 if (BE (err != REG_NOERROR, 0))
760 free_dfa_content (dfa);
761 preg->buffer = NULL;
762 preg->allocated = 0;
763 return err;
765 #ifdef DEBUG
766 /* Note: length+1 will not overflow since it is checked in init_dfa. */
767 dfa->re_str = re_malloc (char, length + 1);
768 strncpy (dfa->re_str, pattern, length + 1);
769 #endif
771 __libc_lock_init (dfa->lock);
773 err = re_string_construct (&regexp, pattern, length, preg->translate,
774 syntax & RE_ICASE, dfa);
775 if (BE (err != REG_NOERROR, 0))
777 re_compile_internal_free_return:
778 free_workarea_compile (preg);
779 re_string_destruct (&regexp);
780 free_dfa_content (dfa);
781 preg->buffer = NULL;
782 preg->allocated = 0;
783 return err;
786 /* Parse the regular expression, and build a structure tree. */
787 preg->re_nsub = 0;
788 dfa->str_tree = parse (&regexp, preg, syntax, &err);
789 if (BE (dfa->str_tree == NULL, 0))
790 goto re_compile_internal_free_return;
792 /* Analyze the tree and create the nfa. */
793 err = analyze (preg);
794 if (BE (err != REG_NOERROR, 0))
795 goto re_compile_internal_free_return;
797 #ifdef RE_ENABLE_I18N
798 /* If possible, do searching in single byte encoding to speed things up. */
799 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
800 optimize_utf8 (dfa);
801 #endif
803 /* Then create the initial state of the dfa. */
804 err = create_initial_state (dfa);
806 /* Release work areas. */
807 free_workarea_compile (preg);
808 re_string_destruct (&regexp);
810 if (BE (err != REG_NOERROR, 0))
812 free_dfa_content (dfa);
813 preg->buffer = NULL;
814 preg->allocated = 0;
817 return err;
820 /* Initialize DFA. We use the length of the regular expression PAT_LEN
821 as the initial length of some arrays. */
823 static reg_errcode_t
824 init_dfa (re_dfa_t *dfa, size_t pat_len)
826 unsigned int table_size;
827 #ifndef _LIBC
828 char *codeset_name;
829 #endif
831 memset (dfa, '\0', sizeof (re_dfa_t));
833 /* Force allocation of str_tree_storage the first time. */
834 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
836 /* Avoid overflows. */
837 if (pat_len == SIZE_MAX)
838 return REG_ESPACE;
840 dfa->nodes_alloc = pat_len + 1;
841 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
843 /* table_size = 2 ^ ceil(log pat_len) */
844 for (table_size = 1; ; table_size <<= 1)
845 if (table_size > pat_len)
846 break;
848 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
849 dfa->state_hash_mask = table_size - 1;
851 dfa->mb_cur_max = MB_CUR_MAX;
852 #ifdef _LIBC
853 if (dfa->mb_cur_max == 6
854 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
855 dfa->is_utf8 = 1;
856 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
857 != 0);
858 #else
859 # ifdef HAVE_LANGINFO_CODESET
860 codeset_name = nl_langinfo (CODESET);
861 # else
862 codeset_name = getenv ("LC_ALL");
863 if (codeset_name == NULL || codeset_name[0] == '\0')
864 codeset_name = getenv ("LC_CTYPE");
865 if (codeset_name == NULL || codeset_name[0] == '\0')
866 codeset_name = getenv ("LANG");
867 if (codeset_name == NULL)
868 codeset_name = "";
869 else if (strchr (codeset_name, '.') != NULL)
870 codeset_name = strchr (codeset_name, '.') + 1;
871 # endif
873 if (strcasecmp (codeset_name, "UTF-8") == 0
874 || strcasecmp (codeset_name, "UTF8") == 0)
875 dfa->is_utf8 = 1;
877 /* We check exhaustively in the loop below if this charset is a
878 superset of ASCII. */
879 dfa->map_notascii = 0;
880 #endif
882 #ifdef RE_ENABLE_I18N
883 if (dfa->mb_cur_max > 1)
885 if (dfa->is_utf8)
886 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
887 else
889 int i, j, ch;
891 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
892 if (BE (dfa->sb_char == NULL, 0))
893 return REG_ESPACE;
895 /* Set the bits corresponding to single byte chars. */
896 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
897 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
899 wint_t wch = __btowc (ch);
900 if (wch != WEOF)
901 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
902 # ifndef _LIBC
903 if (isascii (ch) && wch != ch)
904 dfa->map_notascii = 1;
905 # endif
909 #endif
911 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
912 return REG_ESPACE;
913 return REG_NOERROR;
916 /* Initialize WORD_CHAR table, which indicate which character is
917 "word". In this case "word" means that it is the word construction
918 character used by some operators like "\<", "\>", etc. */
920 static void
921 init_word_char (re_dfa_t *dfa)
923 dfa->word_ops_used = 1;
924 int i = 0;
925 int ch = 0;
926 if (BE (dfa->map_notascii == 0, 1))
928 /* Avoid uint32_t and uint64_t as some non-GCC platforms lack
929 them, an issue when this code is used in Gnulib. */
930 bitset_word_t bits0 = 0x00000000;
931 bitset_word_t bits1 = 0x03ff0000;
932 bitset_word_t bits2 = 0x87fffffe;
933 bitset_word_t bits3 = 0x07fffffe;
934 if (BITSET_WORD_BITS == 64)
936 /* Pacify gcc -Woverflow on 32-bit platformns. */
937 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
938 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
939 i = 2;
941 else if (BITSET_WORD_BITS == 32)
943 dfa->word_char[0] = bits0;
944 dfa->word_char[1] = bits1;
945 dfa->word_char[2] = bits2;
946 dfa->word_char[3] = bits3;
947 i = 4;
949 else
950 goto general_case;
951 ch = 128;
953 if (BE (dfa->is_utf8, 1))
955 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
956 return;
960 general_case:
961 for (; i < BITSET_WORDS; ++i)
962 for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
963 if (isalnum (ch) || ch == '_')
964 dfa->word_char[i] |= (bitset_word_t) 1 << j;
967 /* Free the work area which are only used while compiling. */
969 static void
970 free_workarea_compile (regex_t *preg)
972 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
973 bin_tree_storage_t *storage, *next;
974 for (storage = dfa->str_tree_storage; storage; storage = next)
976 next = storage->next;
977 re_free (storage);
979 dfa->str_tree_storage = NULL;
980 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
981 dfa->str_tree = NULL;
982 re_free (dfa->org_indices);
983 dfa->org_indices = NULL;
986 /* Create initial states for all contexts. */
988 static reg_errcode_t
989 create_initial_state (re_dfa_t *dfa)
991 int first, i;
992 reg_errcode_t err;
993 re_node_set init_nodes;
995 /* Initial states have the epsilon closure of the node which is
996 the first node of the regular expression. */
997 first = dfa->str_tree->first->node_idx;
998 dfa->init_node = first;
999 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1000 if (BE (err != REG_NOERROR, 0))
1001 return err;
1003 /* The back-references which are in initial states can epsilon transit,
1004 since in this case all of the subexpressions can be null.
1005 Then we add epsilon closures of the nodes which are the next nodes of
1006 the back-references. */
1007 if (dfa->nbackref > 0)
1008 for (i = 0; i < init_nodes.nelem; ++i)
1010 int node_idx = init_nodes.elems[i];
1011 re_token_type_t type = dfa->nodes[node_idx].type;
1013 int clexp_idx;
1014 if (type != OP_BACK_REF)
1015 continue;
1016 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1018 re_token_t *clexp_node;
1019 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1020 if (clexp_node->type == OP_CLOSE_SUBEXP
1021 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1022 break;
1024 if (clexp_idx == init_nodes.nelem)
1025 continue;
1027 if (type == OP_BACK_REF)
1029 int dest_idx = dfa->edests[node_idx].elems[0];
1030 if (!re_node_set_contains (&init_nodes, dest_idx))
1032 reg_errcode_t err = re_node_set_merge (&init_nodes,
1033 dfa->eclosures
1034 + dest_idx);
1035 if (err != REG_NOERROR)
1036 return err;
1037 i = 0;
1042 /* It must be the first time to invoke acquire_state. */
1043 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1044 /* We don't check ERR here, since the initial state must not be NULL. */
1045 if (BE (dfa->init_state == NULL, 0))
1046 return err;
1047 if (dfa->init_state->has_constraint)
1049 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1050 CONTEXT_WORD);
1051 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1052 CONTEXT_NEWLINE);
1053 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1054 &init_nodes,
1055 CONTEXT_NEWLINE
1056 | CONTEXT_BEGBUF);
1057 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1058 || dfa->init_state_begbuf == NULL, 0))
1059 return err;
1061 else
1062 dfa->init_state_word = dfa->init_state_nl
1063 = dfa->init_state_begbuf = dfa->init_state;
1065 re_node_set_free (&init_nodes);
1066 return REG_NOERROR;
#ifdef RE_ENABLE_I18N
/* If every node of DFA can be handled by byte-wise matching, switch
   the automaton from UTF-8 to single-byte searching to speed things
   up: set dfa->mb_cur_max to 1, clear dfa->is_utf8 and adjust the
   nodes that need it.  If any node rules this out, return without
   changing anything.  */

static void
optimize_utf8 (re_dfa_t *dfa)
{
  int idx, word;
  int saw_high_byte = 0, saw_period = 0;

  /* First pass: inspect every node and bail out on anything that
     cannot be matched byte by byte.  */
  for (idx = 0; idx < dfa->nodes_len; ++idx)
    {
      const re_token_t *tok = dfa->nodes + idx;

      switch (tok->type)
        {
        case CHARACTER:
          if (tok->opr.c >= 0x80)
            saw_high_byte = 1;
          break;

        case ANCHOR:
          switch (tok->opr.ctx_type)
            {
            case LINE_FIRST:
            case LINE_LAST:
            case BUF_FIRST:
            case BUF_LAST:
              break;
            default:
              /* Word anchors etc. cannot be handled.  It's okay to test
                 opr.ctx_type since constraints (for all DFA nodes) are
                 created by ORing one or more opr.ctx_type values.  */
              return;
            }
          break;

        case OP_PERIOD:
          saw_period = 1;
          break;

        case OP_BACK_REF:
        case OP_ALT:
        case END_OF_RE:
        case OP_DUP_ASTERISK:
        case OP_OPEN_SUBEXP:
        case OP_CLOSE_SUBEXP:
          break;

        case COMPLEX_BRACKET:
          return;

        case SIMPLE_BRACKET:
          /* Just double check.  The non-ASCII range starts at 0x80.  */
          assert (0x80 % BITSET_WORD_BITS == 0);
          /* Give up if any bit for a byte value of 0x80 or above is
             set.  */
          for (word = 0x80 / BITSET_WORD_BITS; word < BITSET_WORDS; ++word)
            if (tok->opr.sbcset[word])
              return;
          break;

        default:
          abort ();
        }
    }

  /* Second pass: only needed when something has to be rewritten.  */
  if (saw_high_byte || saw_period)
    for (idx = 0; idx < dfa->nodes_len; ++idx)
      {
        re_token_t *tok = dfa->nodes + idx;

        if (tok->type == CHARACTER && tok->opr.c >= 0x80)
          tok->mb_partial = 0;
        else if (tok->type == OP_PERIOD)
          tok->type = OP_UTF8_PERIOD;
      }

  /* The search can be in single byte locale.  */
  dfa->mb_cur_max = 1;
  dfa->is_utf8 = 0;
  dfa->has_mb_node = dfa->nbackref > 0 || saw_period;
}
#endif
1141 /* Analyze the structure tree, and calculate "first", "next", "edest",
1142 "eclosure", and "inveclosure". */
1144 static reg_errcode_t
1145 analyze (regex_t *preg)
1147 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1148 reg_errcode_t ret;
1150 /* Allocate arrays. */
1151 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1152 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1153 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1154 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1155 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1156 || dfa->eclosures == NULL, 0))
1157 return REG_ESPACE;
1159 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1160 if (dfa->subexp_map != NULL)
1162 int i;
1163 for (i = 0; i < preg->re_nsub; i++)
1164 dfa->subexp_map[i] = i;
1165 preorder (dfa->str_tree, optimize_subexps, dfa);
1166 for (i = 0; i < preg->re_nsub; i++)
1167 if (dfa->subexp_map[i] != i)
1168 break;
1169 if (i == preg->re_nsub)
1171 free (dfa->subexp_map);
1172 dfa->subexp_map = NULL;
1176 ret = postorder (dfa->str_tree, lower_subexps, preg);
1177 if (BE (ret != REG_NOERROR, 0))
1178 return ret;
1179 ret = postorder (dfa->str_tree, calc_first, dfa);
1180 if (BE (ret != REG_NOERROR, 0))
1181 return ret;
1182 preorder (dfa->str_tree, calc_next, dfa);
1183 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1184 if (BE (ret != REG_NOERROR, 0))
1185 return ret;
1186 ret = calc_eclosure (dfa);
1187 if (BE (ret != REG_NOERROR, 0))
1188 return ret;
1190 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1191 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1192 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1193 || dfa->nbackref)
1195 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1196 if (BE (dfa->inveclosures == NULL, 0))
1197 return REG_ESPACE;
1198 ret = calc_inveclosure (dfa);
1201 return ret;
1204 /* Our parse trees are very unbalanced, so we cannot use a stack to
1205 implement parse tree visits. Instead, we use parent pointers and
1206 some hairy code in these two functions. */
1207 static reg_errcode_t
1208 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1209 void *extra)
1211 bin_tree_t *node, *prev;
1213 for (node = root; ; )
1215 /* Descend down the tree, preferably to the left (or to the right
1216 if that's the only child). */
1217 while (node->left || node->right)
1218 if (node->left)
1219 node = node->left;
1220 else
1221 node = node->right;
1225 reg_errcode_t err = fn (extra, node);
1226 if (BE (err != REG_NOERROR, 0))
1227 return err;
1228 if (node->parent == NULL)
1229 return REG_NOERROR;
1230 prev = node;
1231 node = node->parent;
1233 /* Go up while we have a node that is reached from the right. */
1234 while (node->right == prev || node->right == NULL);
1235 node = node->right;
1239 static reg_errcode_t
1240 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1241 void *extra)
1243 bin_tree_t *node;
1245 for (node = root; ; )
1247 reg_errcode_t err = fn (extra, node);
1248 if (BE (err != REG_NOERROR, 0))
1249 return err;
1251 /* Go to the left node, or up and to the right. */
1252 if (node->left)
1253 node = node->left;
1254 else
1256 bin_tree_t *prev = NULL;
1257 while (node->right == prev || node->right == NULL)
1259 prev = node;
1260 node = node->parent;
1261 if (!node)
1262 return REG_NOERROR;
1264 node = node->right;
1269 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1270 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1271 backreferences as well. Requires a preorder visit. */
1272 static reg_errcode_t
1273 optimize_subexps (void *extra, bin_tree_t *node)
1275 re_dfa_t *dfa = (re_dfa_t *) extra;
1277 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1279 int idx = node->token.opr.idx;
1280 node->token.opr.idx = dfa->subexp_map[idx];
1281 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1284 else if (node->token.type == SUBEXP
1285 && node->left && node->left->token.type == SUBEXP)
1287 int other_idx = node->left->token.opr.idx;
1289 node->left = node->left->left;
1290 if (node->left)
1291 node->left->parent = node;
1293 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1294 if (other_idx < BITSET_WORD_BITS)
1295 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1298 return REG_NOERROR;
1301 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1302 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1303 static reg_errcode_t
1304 lower_subexps (void *extra, bin_tree_t *node)
1306 regex_t *preg = (regex_t *) extra;
1307 reg_errcode_t err = REG_NOERROR;
1309 if (node->left && node->left->token.type == SUBEXP)
1311 node->left = lower_subexp (&err, preg, node->left);
1312 if (node->left)
1313 node->left->parent = node;
1315 if (node->right && node->right->token.type == SUBEXP)
1317 node->right = lower_subexp (&err, preg, node->right);
1318 if (node->right)
1319 node->right->parent = node;
1322 return err;
1325 static bin_tree_t *
1326 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1328 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1329 bin_tree_t *body = node->left;
1330 bin_tree_t *op, *cls, *tree1, *tree;
1332 if (preg->no_sub
1333 /* We do not optimize empty subexpressions, because otherwise we may
1334 have bad CONCAT nodes with NULL children. This is obviously not
1335 very common, so we do not lose much. An example that triggers
1336 this case is the sed "script" /\(\)/x. */
1337 && node->left != NULL
1338 && (node->token.opr.idx >= BITSET_WORD_BITS
1339 || !(dfa->used_bkref_map
1340 & ((bitset_word_t) 1 << node->token.opr.idx))))
1341 return node->left;
1343 /* Convert the SUBEXP node to the concatenation of an
1344 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1345 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1346 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1347 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1348 tree = create_tree (dfa, op, tree1, CONCAT);
1349 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1351 *err = REG_ESPACE;
1352 return NULL;
1355 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1356 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1357 return tree;
1360 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1361 nodes. Requires a postorder visit. */
1362 static reg_errcode_t
1363 calc_first (void *extra, bin_tree_t *node)
1365 re_dfa_t *dfa = (re_dfa_t *) extra;
1366 if (node->token.type == CONCAT)
1368 node->first = node->left->first;
1369 node->node_idx = node->left->node_idx;
1371 else
1373 node->first = node;
1374 node->node_idx = re_dfa_add_node (dfa, node->token);
1375 if (BE (node->node_idx == -1, 0))
1376 return REG_ESPACE;
1377 if (node->token.type == ANCHOR)
1378 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1380 return REG_NOERROR;
1383 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1384 static reg_errcode_t
1385 calc_next (void *extra, bin_tree_t *node)
1387 switch (node->token.type)
1389 case OP_DUP_ASTERISK:
1390 node->left->next = node;
1391 break;
1392 case CONCAT:
1393 node->left->next = node->right->first;
1394 node->right->next = node->next;
1395 break;
1396 default:
1397 if (node->left)
1398 node->left->next = node->next;
1399 if (node->right)
1400 node->right->next = node->next;
1401 break;
1403 return REG_NOERROR;
1406 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1407 static reg_errcode_t
1408 link_nfa_nodes (void *extra, bin_tree_t *node)
1410 re_dfa_t *dfa = (re_dfa_t *) extra;
1411 int idx = node->node_idx;
1412 reg_errcode_t err = REG_NOERROR;
1414 switch (node->token.type)
1416 case CONCAT:
1417 break;
1419 case END_OF_RE:
1420 assert (node->next == NULL);
1421 break;
1423 case OP_DUP_ASTERISK:
1424 case OP_ALT:
1426 int left, right;
1427 dfa->has_plural_match = 1;
1428 if (node->left != NULL)
1429 left = node->left->first->node_idx;
1430 else
1431 left = node->next->node_idx;
1432 if (node->right != NULL)
1433 right = node->right->first->node_idx;
1434 else
1435 right = node->next->node_idx;
1436 assert (left > -1);
1437 assert (right > -1);
1438 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1440 break;
1442 case ANCHOR:
1443 case OP_OPEN_SUBEXP:
1444 case OP_CLOSE_SUBEXP:
1445 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1446 break;
1448 case OP_BACK_REF:
1449 dfa->nexts[idx] = node->next->node_idx;
1450 if (node->token.type == OP_BACK_REF)
1451 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1452 break;
1454 default:
1455 assert (!IS_EPSILON_NODE (node->token.type));
1456 dfa->nexts[idx] = node->next->node_idx;
1457 break;
1460 return err;
1463 /* Duplicate the epsilon closure of the node ROOT_NODE.
1464 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1465 to their own constraint. */
1467 static reg_errcode_t
1468 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1469 int root_node, unsigned int init_constraint)
1471 int org_node, clone_node, ret;
1472 unsigned int constraint = init_constraint;
1473 for (org_node = top_org_node, clone_node = top_clone_node;;)
1475 int org_dest, clone_dest;
1476 if (dfa->nodes[org_node].type == OP_BACK_REF)
1478 /* If the back reference epsilon-transit, its destination must
1479 also have the constraint. Then duplicate the epsilon closure
1480 of the destination of the back reference, and store it in
1481 edests of the back reference. */
1482 org_dest = dfa->nexts[org_node];
1483 re_node_set_empty (dfa->edests + clone_node);
1484 clone_dest = duplicate_node (dfa, org_dest, constraint);
1485 if (BE (clone_dest == -1, 0))
1486 return REG_ESPACE;
1487 dfa->nexts[clone_node] = dfa->nexts[org_node];
1488 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1489 if (BE (ret < 0, 0))
1490 return REG_ESPACE;
1492 else if (dfa->edests[org_node].nelem == 0)
1494 /* In case of the node can't epsilon-transit, don't duplicate the
1495 destination and store the original destination as the
1496 destination of the node. */
1497 dfa->nexts[clone_node] = dfa->nexts[org_node];
1498 break;
1500 else if (dfa->edests[org_node].nelem == 1)
1502 /* In case of the node can epsilon-transit, and it has only one
1503 destination. */
1504 org_dest = dfa->edests[org_node].elems[0];
1505 re_node_set_empty (dfa->edests + clone_node);
1506 /* If the node is root_node itself, it means the epsilon closure
1507 has a loop. Then tie it to the destination of the root_node. */
1508 if (org_node == root_node && clone_node != org_node)
1510 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1511 if (BE (ret < 0, 0))
1512 return REG_ESPACE;
1513 break;
1515 /* In case the node has another constraint, append it. */
1516 constraint |= dfa->nodes[org_node].constraint;
1517 clone_dest = duplicate_node (dfa, org_dest, constraint);
1518 if (BE (clone_dest == -1, 0))
1519 return REG_ESPACE;
1520 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1521 if (BE (ret < 0, 0))
1522 return REG_ESPACE;
1524 else /* dfa->edests[org_node].nelem == 2 */
1526 /* In case of the node can epsilon-transit, and it has two
1527 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1528 org_dest = dfa->edests[org_node].elems[0];
1529 re_node_set_empty (dfa->edests + clone_node);
1530 /* Search for a duplicated node which satisfies the constraint. */
1531 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1532 if (clone_dest == -1)
1534 /* There is no such duplicated node, create a new one. */
1535 reg_errcode_t err;
1536 clone_dest = duplicate_node (dfa, org_dest, constraint);
1537 if (BE (clone_dest == -1, 0))
1538 return REG_ESPACE;
1539 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1540 if (BE (ret < 0, 0))
1541 return REG_ESPACE;
1542 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1543 root_node, constraint);
1544 if (BE (err != REG_NOERROR, 0))
1545 return err;
1547 else
1549 /* There is a duplicated node which satisfies the constraint,
1550 use it to avoid infinite loop. */
1551 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1552 if (BE (ret < 0, 0))
1553 return REG_ESPACE;
1556 org_dest = dfa->edests[org_node].elems[1];
1557 clone_dest = duplicate_node (dfa, org_dest, constraint);
1558 if (BE (clone_dest == -1, 0))
1559 return REG_ESPACE;
1560 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1561 if (BE (ret < 0, 0))
1562 return REG_ESPACE;
1564 org_node = org_dest;
1565 clone_node = clone_dest;
1567 return REG_NOERROR;
1570 /* Search for a node which is duplicated from the node ORG_NODE, and
1571 satisfies the constraint CONSTRAINT. */
1573 static int
1574 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1575 unsigned int constraint)
1577 int idx;
1578 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1580 if (org_node == dfa->org_indices[idx]
1581 && constraint == dfa->nodes[idx].constraint)
1582 return idx; /* Found. */
1584 return -1; /* Not found. */
1587 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1588 Return the index of the new node, or -1 if insufficient storage is
1589 available. */
1591 static int
1592 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1594 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1595 if (BE (dup_idx != -1, 1))
1597 dfa->nodes[dup_idx].constraint = constraint;
1598 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1599 dfa->nodes[dup_idx].duplicated = 1;
1601 /* Store the index of the original node. */
1602 dfa->org_indices[dup_idx] = org_idx;
1604 return dup_idx;
1607 static reg_errcode_t
1608 calc_inveclosure (re_dfa_t *dfa)
1610 int src, idx, ret;
1611 for (idx = 0; idx < dfa->nodes_len; ++idx)
1612 re_node_set_init_empty (dfa->inveclosures + idx);
1614 for (src = 0; src < dfa->nodes_len; ++src)
1616 int *elems = dfa->eclosures[src].elems;
1617 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1619 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1620 if (BE (ret == -1, 0))
1621 return REG_ESPACE;
1625 return REG_NOERROR;
1628 /* Calculate "eclosure" for all the node in DFA. */
1630 static reg_errcode_t
1631 calc_eclosure (re_dfa_t *dfa)
1633 int node_idx, incomplete;
1634 #ifdef DEBUG
1635 assert (dfa->nodes_len > 0);
1636 #endif
1637 incomplete = 0;
1638 /* For each nodes, calculate epsilon closure. */
1639 for (node_idx = 0; ; ++node_idx)
1641 reg_errcode_t err;
1642 re_node_set eclosure_elem;
1643 if (node_idx == dfa->nodes_len)
1645 if (!incomplete)
1646 break;
1647 incomplete = 0;
1648 node_idx = 0;
1651 #ifdef DEBUG
1652 assert (dfa->eclosures[node_idx].nelem != -1);
1653 #endif
1655 /* If we have already calculated, skip it. */
1656 if (dfa->eclosures[node_idx].nelem != 0)
1657 continue;
1658 /* Calculate epsilon closure of 'node_idx'. */
1659 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1660 if (BE (err != REG_NOERROR, 0))
1661 return err;
1663 if (dfa->eclosures[node_idx].nelem == 0)
1665 incomplete = 1;
1666 re_node_set_free (&eclosure_elem);
1669 return REG_NOERROR;
1672 /* Calculate epsilon closure of NODE. */
1674 static reg_errcode_t
1675 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1677 reg_errcode_t err;
1678 int i;
1679 re_node_set eclosure;
1680 int ret;
1681 int incomplete = 0;
1682 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1683 if (BE (err != REG_NOERROR, 0))
1684 return err;
1686 /* This indicates that we are calculating this node now.
1687 We reference this value to avoid infinite loop. */
1688 dfa->eclosures[node].nelem = -1;
1690 /* If the current node has constraints, duplicate all nodes
1691 since they must inherit the constraints. */
1692 if (dfa->nodes[node].constraint
1693 && dfa->edests[node].nelem
1694 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1696 err = duplicate_node_closure (dfa, node, node, node,
1697 dfa->nodes[node].constraint);
1698 if (BE (err != REG_NOERROR, 0))
1699 return err;
1702 /* Expand each epsilon destination nodes. */
1703 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1704 for (i = 0; i < dfa->edests[node].nelem; ++i)
1706 re_node_set eclosure_elem;
1707 int edest = dfa->edests[node].elems[i];
1708 /* If calculating the epsilon closure of `edest' is in progress,
1709 return intermediate result. */
1710 if (dfa->eclosures[edest].nelem == -1)
1712 incomplete = 1;
1713 continue;
1715 /* If we haven't calculated the epsilon closure of `edest' yet,
1716 calculate now. Otherwise use calculated epsilon closure. */
1717 if (dfa->eclosures[edest].nelem == 0)
1719 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1720 if (BE (err != REG_NOERROR, 0))
1721 return err;
1723 else
1724 eclosure_elem = dfa->eclosures[edest];
1725 /* Merge the epsilon closure of 'edest'. */
1726 err = re_node_set_merge (&eclosure, &eclosure_elem);
1727 if (BE (err != REG_NOERROR, 0))
1728 return err;
1729 /* If the epsilon closure of 'edest' is incomplete,
1730 the epsilon closure of this node is also incomplete. */
1731 if (dfa->eclosures[edest].nelem == 0)
1733 incomplete = 1;
1734 re_node_set_free (&eclosure_elem);
1738 /* An epsilon closure includes itself. */
1739 ret = re_node_set_insert (&eclosure, node);
1740 if (BE (ret < 0, 0))
1741 return REG_ESPACE;
1742 if (incomplete && !root)
1743 dfa->eclosures[node].nelem = 0;
1744 else
1745 dfa->eclosures[node] = eclosure;
1746 *new_set = eclosure;
1747 return REG_NOERROR;
1750 /* Functions for token which are used in the parser. */
1752 /* Fetch a token from INPUT.
1753 We must not use this function inside bracket expressions. */
1755 static void
1756 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1758 re_string_skip_bytes (input, peek_token (result, input, syntax));
1761 /* Peek a token from INPUT, and return the length of the token.
1762 We must not use this function inside bracket expressions. */
1764 static int
1765 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1767 unsigned char c;
1769 if (re_string_eoi (input))
1771 token->type = END_OF_RE;
1772 return 0;
1775 c = re_string_peek_byte (input, 0);
1776 token->opr.c = c;
1778 token->word_char = 0;
1779 #ifdef RE_ENABLE_I18N
1780 token->mb_partial = 0;
1781 if (input->mb_cur_max > 1 &&
1782 !re_string_first_byte (input, re_string_cur_idx (input)))
1784 token->type = CHARACTER;
1785 token->mb_partial = 1;
1786 return 1;
1788 #endif
1789 if (c == '\\')
1791 unsigned char c2;
1792 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1794 token->type = BACK_SLASH;
1795 return 1;
1798 c2 = re_string_peek_byte_case (input, 1);
1799 token->opr.c = c2;
1800 token->type = CHARACTER;
1801 #ifdef RE_ENABLE_I18N
1802 if (input->mb_cur_max > 1)
1804 wint_t wc = re_string_wchar_at (input,
1805 re_string_cur_idx (input) + 1);
1806 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1808 else
1809 #endif
1810 token->word_char = IS_WORD_CHAR (c2) != 0;
1812 switch (c2)
1814 case '|':
1815 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1816 token->type = OP_ALT;
1817 break;
1818 case '1': case '2': case '3': case '4': case '5':
1819 case '6': case '7': case '8': case '9':
1820 if (!(syntax & RE_NO_BK_REFS))
1822 token->type = OP_BACK_REF;
1823 token->opr.idx = c2 - '1';
1825 break;
1826 case '<':
1827 if (!(syntax & RE_NO_GNU_OPS))
1829 token->type = ANCHOR;
1830 token->opr.ctx_type = WORD_FIRST;
1832 break;
1833 case '>':
1834 if (!(syntax & RE_NO_GNU_OPS))
1836 token->type = ANCHOR;
1837 token->opr.ctx_type = WORD_LAST;
1839 break;
1840 case 'b':
1841 if (!(syntax & RE_NO_GNU_OPS))
1843 token->type = ANCHOR;
1844 token->opr.ctx_type = WORD_DELIM;
1846 break;
1847 case 'B':
1848 if (!(syntax & RE_NO_GNU_OPS))
1850 token->type = ANCHOR;
1851 token->opr.ctx_type = NOT_WORD_DELIM;
1853 break;
1854 case 'w':
1855 if (!(syntax & RE_NO_GNU_OPS))
1856 token->type = OP_WORD;
1857 break;
1858 case 'W':
1859 if (!(syntax & RE_NO_GNU_OPS))
1860 token->type = OP_NOTWORD;
1861 break;
1862 case 's':
1863 if (!(syntax & RE_NO_GNU_OPS))
1864 token->type = OP_SPACE;
1865 break;
1866 case 'S':
1867 if (!(syntax & RE_NO_GNU_OPS))
1868 token->type = OP_NOTSPACE;
1869 break;
1870 case '`':
1871 if (!(syntax & RE_NO_GNU_OPS))
1873 token->type = ANCHOR;
1874 token->opr.ctx_type = BUF_FIRST;
1876 break;
1877 case '\'':
1878 if (!(syntax & RE_NO_GNU_OPS))
1880 token->type = ANCHOR;
1881 token->opr.ctx_type = BUF_LAST;
1883 break;
1884 case '(':
1885 if (!(syntax & RE_NO_BK_PARENS))
1886 token->type = OP_OPEN_SUBEXP;
1887 break;
1888 case ')':
1889 if (!(syntax & RE_NO_BK_PARENS))
1890 token->type = OP_CLOSE_SUBEXP;
1891 break;
1892 case '+':
1893 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1894 token->type = OP_DUP_PLUS;
1895 break;
1896 case '?':
1897 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1898 token->type = OP_DUP_QUESTION;
1899 break;
1900 case '{':
1901 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1902 token->type = OP_OPEN_DUP_NUM;
1903 break;
1904 case '}':
1905 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1906 token->type = OP_CLOSE_DUP_NUM;
1907 break;
1908 default:
1909 break;
1911 return 2;
1914 token->type = CHARACTER;
1915 #ifdef RE_ENABLE_I18N
1916 if (input->mb_cur_max > 1)
1918 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1919 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1921 else
1922 #endif
1923 token->word_char = IS_WORD_CHAR (token->opr.c);
1925 switch (c)
1927 case '\n':
1928 if (syntax & RE_NEWLINE_ALT)
1929 token->type = OP_ALT;
1930 break;
1931 case '|':
1932 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1933 token->type = OP_ALT;
1934 break;
1935 case '*':
1936 token->type = OP_DUP_ASTERISK;
1937 break;
1938 case '+':
1939 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1940 token->type = OP_DUP_PLUS;
1941 break;
1942 case '?':
1943 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1944 token->type = OP_DUP_QUESTION;
1945 break;
1946 case '{':
1947 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1948 token->type = OP_OPEN_DUP_NUM;
1949 break;
1950 case '}':
1951 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1952 token->type = OP_CLOSE_DUP_NUM;
1953 break;
1954 case '(':
1955 if (syntax & RE_NO_BK_PARENS)
1956 token->type = OP_OPEN_SUBEXP;
1957 break;
1958 case ')':
1959 if (syntax & RE_NO_BK_PARENS)
1960 token->type = OP_CLOSE_SUBEXP;
1961 break;
1962 case '[':
1963 token->type = OP_OPEN_BRACKET;
1964 break;
1965 case '.':
1966 token->type = OP_PERIOD;
1967 break;
1968 case '^':
1969 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1970 re_string_cur_idx (input) != 0)
1972 char prev = re_string_peek_byte (input, -1);
1973 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1974 break;
1976 token->type = ANCHOR;
1977 token->opr.ctx_type = LINE_FIRST;
1978 break;
1979 case '$':
1980 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1981 re_string_cur_idx (input) + 1 != re_string_length (input))
1983 re_token_t next;
1984 re_string_skip_bytes (input, 1);
1985 peek_token (&next, input, syntax);
1986 re_string_skip_bytes (input, -1);
1987 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1988 break;
1990 token->type = ANCHOR;
1991 token->opr.ctx_type = LINE_LAST;
1992 break;
1993 default:
1994 break;
1996 return 1;
1999 /* Peek a token from INPUT, and return the length of the token.
2000 We must not use this function out of bracket expressions. */
2002 static int
2003 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2005 unsigned char c;
2006 if (re_string_eoi (input))
2008 token->type = END_OF_RE;
2009 return 0;
2011 c = re_string_peek_byte (input, 0);
2012 token->opr.c = c;
2014 #ifdef RE_ENABLE_I18N
2015 if (input->mb_cur_max > 1 &&
2016 !re_string_first_byte (input, re_string_cur_idx (input)))
2018 token->type = CHARACTER;
2019 return 1;
2021 #endif /* RE_ENABLE_I18N */
2023 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2024 && re_string_cur_idx (input) + 1 < re_string_length (input))
2026 /* In this case, '\' escape a character. */
2027 unsigned char c2;
2028 re_string_skip_bytes (input, 1);
2029 c2 = re_string_peek_byte (input, 0);
2030 token->opr.c = c2;
2031 token->type = CHARACTER;
2032 return 1;
2034 if (c == '[') /* '[' is a special char in a bracket exps. */
2036 unsigned char c2;
2037 int token_len;
2038 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2039 c2 = re_string_peek_byte (input, 1);
2040 else
2041 c2 = 0;
2042 token->opr.c = c2;
2043 token_len = 2;
2044 switch (c2)
2046 case '.':
2047 token->type = OP_OPEN_COLL_ELEM;
2048 break;
2049 case '=':
2050 token->type = OP_OPEN_EQUIV_CLASS;
2051 break;
2052 case ':':
2053 if (syntax & RE_CHAR_CLASSES)
2055 token->type = OP_OPEN_CHAR_CLASS;
2056 break;
2058 /* else fall through. */
2059 default:
2060 token->type = CHARACTER;
2061 token->opr.c = c;
2062 token_len = 1;
2063 break;
2065 return token_len;
2067 switch (c)
2069 case '-':
2070 token->type = OP_CHARSET_RANGE;
2071 break;
2072 case ']':
2073 token->type = OP_CLOSE_BRACKET;
2074 break;
2075 case '^':
2076 token->type = OP_NON_MATCH_LIST;
2077 break;
2078 default:
2079 token->type = CHARACTER;
2081 return 1;
2084 /* Functions for parser. */
2086 /* Entry point of the parser.
2087 Parse the regular expression REGEXP and return the structure tree.
2088 If an error occurs, ERR is set by error code, and return NULL.
2089 This function build the following tree, from regular expression <reg_exp>:
2093 <reg_exp> EOR
2095 CAT means concatenation.
2096 EOR means end of regular expression. */
2098 static bin_tree_t *
2099 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2100 reg_errcode_t *err)
2102 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2103 bin_tree_t *tree, *eor, *root;
2104 re_token_t current_token;
2105 dfa->syntax = syntax;
2106 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2107 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2108 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2109 return NULL;
2110 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2111 if (tree != NULL)
2112 root = create_tree (dfa, tree, eor, CONCAT);
2113 else
2114 root = eor;
2115 if (BE (eor == NULL || root == NULL, 0))
2117 *err = REG_ESPACE;
2118 return NULL;
2120 return root;
2123 /* This function build the following tree, from regular expression
2124 <branch1>|<branch2>:
2128 <branch1> <branch2>
2130 ALT means alternative, which represents the operator '|'. */
2132 static bin_tree_t *
2133 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2134 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2136 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2137 bin_tree_t *tree, *branch = NULL;
2138 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2139 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2140 return NULL;
2142 while (token->type == OP_ALT)
2144 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2145 if (token->type != OP_ALT && token->type != END_OF_RE
2146 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2148 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2149 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2151 if (tree != NULL)
2152 postorder (tree, free_tree, NULL);
2153 return NULL;
2156 else
2157 branch = NULL;
2158 tree = create_tree (dfa, tree, branch, OP_ALT);
2159 if (BE (tree == NULL, 0))
2161 *err = REG_ESPACE;
2162 return NULL;
2165 return tree;
2168 /* This function build the following tree, from regular expression
2169 <exp1><exp2>:
2173 <exp1> <exp2>
2175 CAT means concatenation. */
2177 static bin_tree_t *
2178 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2179 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2181 bin_tree_t *tree, *exp;
2182 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2183 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2184 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2185 return NULL;
2187 while (token->type != OP_ALT && token->type != END_OF_RE
2188 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2190 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2191 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2193 if (tree != NULL)
2194 postorder (tree, free_tree, NULL);
2195 return NULL;
2197 if (tree != NULL && exp != NULL)
2199 bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2200 if (newtree == NULL)
2202 postorder (exp, free_tree, NULL);
2203 postorder (tree, free_tree, NULL);
2204 *err = REG_ESPACE;
2205 return NULL;
2207 tree = newtree;
2209 else if (tree == NULL)
2210 tree = exp;
2211 /* Otherwise exp == NULL, we don't need to create new tree. */
2213 return tree;
2216 /* This function build the following tree, from regular expression a*:
2222 static bin_tree_t *
2223 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2224 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2226 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2227 bin_tree_t *tree;
2228 switch (token->type)
2230 case CHARACTER:
2231 tree = create_token_tree (dfa, NULL, NULL, token);
2232 if (BE (tree == NULL, 0))
2234 *err = REG_ESPACE;
2235 return NULL;
2237 #ifdef RE_ENABLE_I18N
2238 if (dfa->mb_cur_max > 1)
2240 while (!re_string_eoi (regexp)
2241 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2243 bin_tree_t *mbc_remain;
2244 fetch_token (token, regexp, syntax);
2245 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2246 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2247 if (BE (mbc_remain == NULL || tree == NULL, 0))
2249 *err = REG_ESPACE;
2250 return NULL;
2254 #endif
2255 break;
2256 case OP_OPEN_SUBEXP:
2257 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2258 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2259 return NULL;
2260 break;
2261 case OP_OPEN_BRACKET:
2262 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2263 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2264 return NULL;
2265 break;
2266 case OP_BACK_REF:
2267 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2269 *err = REG_ESUBREG;
2270 return NULL;
2272 dfa->used_bkref_map |= 1 << token->opr.idx;
2273 tree = create_token_tree (dfa, NULL, NULL, token);
2274 if (BE (tree == NULL, 0))
2276 *err = REG_ESPACE;
2277 return NULL;
2279 ++dfa->nbackref;
2280 dfa->has_mb_node = 1;
2281 break;
2282 case OP_OPEN_DUP_NUM:
2283 if (syntax & RE_CONTEXT_INVALID_DUP)
2285 *err = REG_BADRPT;
2286 return NULL;
2288 /* FALLTHROUGH */
2289 case OP_DUP_ASTERISK:
2290 case OP_DUP_PLUS:
2291 case OP_DUP_QUESTION:
2292 if (syntax & RE_CONTEXT_INVALID_OPS)
2294 *err = REG_BADRPT;
2295 return NULL;
2297 else if (syntax & RE_CONTEXT_INDEP_OPS)
2299 fetch_token (token, regexp, syntax);
2300 return parse_expression (regexp, preg, token, syntax, nest, err);
2302 /* else fall through */
2303 case OP_CLOSE_SUBEXP:
2304 if ((token->type == OP_CLOSE_SUBEXP) &&
2305 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2307 *err = REG_ERPAREN;
2308 return NULL;
2310 /* else fall through */
2311 case OP_CLOSE_DUP_NUM:
2312 /* We treat it as a normal character. */
2314 /* Then we can these characters as normal characters. */
2315 token->type = CHARACTER;
2316 /* mb_partial and word_char bits should be initialized already
2317 by peek_token. */
2318 tree = create_token_tree (dfa, NULL, NULL, token);
2319 if (BE (tree == NULL, 0))
2321 *err = REG_ESPACE;
2322 return NULL;
2324 break;
2325 case ANCHOR:
2326 if ((token->opr.ctx_type
2327 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2328 && dfa->word_ops_used == 0)
2329 init_word_char (dfa);
2330 if (token->opr.ctx_type == WORD_DELIM
2331 || token->opr.ctx_type == NOT_WORD_DELIM)
2333 bin_tree_t *tree_first, *tree_last;
2334 if (token->opr.ctx_type == WORD_DELIM)
2336 token->opr.ctx_type = WORD_FIRST;
2337 tree_first = create_token_tree (dfa, NULL, NULL, token);
2338 token->opr.ctx_type = WORD_LAST;
2340 else
2342 token->opr.ctx_type = INSIDE_WORD;
2343 tree_first = create_token_tree (dfa, NULL, NULL, token);
2344 token->opr.ctx_type = INSIDE_NOTWORD;
2346 tree_last = create_token_tree (dfa, NULL, NULL, token);
2347 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2348 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2350 *err = REG_ESPACE;
2351 return NULL;
2354 else
2356 tree = create_token_tree (dfa, NULL, NULL, token);
2357 if (BE (tree == NULL, 0))
2359 *err = REG_ESPACE;
2360 return NULL;
2363 /* We must return here, since ANCHORs can't be followed
2364 by repetition operators.
2365 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2366 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2367 fetch_token (token, regexp, syntax);
2368 return tree;
2369 case OP_PERIOD:
2370 tree = create_token_tree (dfa, NULL, NULL, token);
2371 if (BE (tree == NULL, 0))
2373 *err = REG_ESPACE;
2374 return NULL;
2376 if (dfa->mb_cur_max > 1)
2377 dfa->has_mb_node = 1;
2378 break;
2379 case OP_WORD:
2380 case OP_NOTWORD:
2381 tree = build_charclass_op (dfa, regexp->trans,
2382 (const unsigned char *) "alnum",
2383 (const unsigned char *) "_",
2384 token->type == OP_NOTWORD, err);
2385 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2386 return NULL;
2387 break;
2388 case OP_SPACE:
2389 case OP_NOTSPACE:
2390 tree = build_charclass_op (dfa, regexp->trans,
2391 (const unsigned char *) "space",
2392 (const unsigned char *) "",
2393 token->type == OP_NOTSPACE, err);
2394 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2395 return NULL;
2396 break;
2397 case OP_ALT:
2398 case END_OF_RE:
2399 return NULL;
2400 case BACK_SLASH:
2401 *err = REG_EESCAPE;
2402 return NULL;
2403 default:
2404 /* Must not happen? */
2405 #ifdef DEBUG
2406 assert (0);
2407 #endif
2408 return NULL;
2410 fetch_token (token, regexp, syntax);
2412 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2413 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2415 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2416 if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2418 if (tree != NULL)
2419 postorder (tree, free_tree, NULL);
2420 return NULL;
2422 tree = dup_tree;
2423 /* In BRE consecutive duplications are not allowed. */
2424 if ((syntax & RE_CONTEXT_INVALID_DUP)
2425 && (token->type == OP_DUP_ASTERISK
2426 || token->type == OP_OPEN_DUP_NUM))
2428 if (tree != NULL)
2429 postorder (tree, free_tree, NULL);
2430 *err = REG_BADRPT;
2431 return NULL;
2435 return tree;
2438 /* This function build the following tree, from regular expression
2439 (<reg_exp>):
2440 SUBEXP
2442 <reg_exp>
2445 static bin_tree_t *
2446 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2447 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2449 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2450 bin_tree_t *tree;
2451 size_t cur_nsub;
2452 cur_nsub = preg->re_nsub++;
2454 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2456 /* The subexpression may be a null string. */
2457 if (token->type == OP_CLOSE_SUBEXP)
2458 tree = NULL;
2459 else
2461 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2462 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2464 if (tree != NULL)
2465 postorder (tree, free_tree, NULL);
2466 *err = REG_EPAREN;
2468 if (BE (*err != REG_NOERROR, 0))
2469 return NULL;
2472 if (cur_nsub <= '9' - '1')
2473 dfa->completed_bkref_map |= 1 << cur_nsub;
2475 tree = create_tree (dfa, tree, NULL, SUBEXP);
2476 if (BE (tree == NULL, 0))
2478 *err = REG_ESPACE;
2479 return NULL;
2481 tree->token.opr.idx = cur_nsub;
2482 return tree;
2485 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2487 static bin_tree_t *
2488 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2489 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2491 bin_tree_t *tree = NULL, *old_tree = NULL;
2492 int i, start, end, start_idx = re_string_cur_idx (regexp);
2493 re_token_t start_token = *token;
2495 if (token->type == OP_OPEN_DUP_NUM)
2497 end = 0;
2498 start = fetch_number (regexp, token, syntax);
2499 if (start == -1)
2501 if (token->type == CHARACTER && token->opr.c == ',')
2502 start = 0; /* We treat "{,m}" as "{0,m}". */
2503 else
2505 *err = REG_BADBR; /* <re>{} is invalid. */
2506 return NULL;
2509 if (BE (start != -2, 1))
2511 /* We treat "{n}" as "{n,n}". */
2512 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2513 : ((token->type == CHARACTER && token->opr.c == ',')
2514 ? fetch_number (regexp, token, syntax) : -2));
2516 if (BE (start == -2 || end == -2, 0))
2518 /* Invalid sequence. */
2519 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2521 if (token->type == END_OF_RE)
2522 *err = REG_EBRACE;
2523 else
2524 *err = REG_BADBR;
2526 return NULL;
2529 /* If the syntax bit is set, rollback. */
2530 re_string_set_index (regexp, start_idx);
2531 *token = start_token;
2532 token->type = CHARACTER;
2533 /* mb_partial and word_char bits should be already initialized by
2534 peek_token. */
2535 return elem;
2538 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2540 /* First number greater than second. */
2541 *err = REG_BADBR;
2542 return NULL;
2545 else
2547 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2548 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2551 fetch_token (token, regexp, syntax);
2553 if (BE (elem == NULL, 0))
2554 return NULL;
2555 if (BE (start == 0 && end == 0, 0))
2557 postorder (elem, free_tree, NULL);
2558 return NULL;
2561 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2562 if (BE (start > 0, 0))
2564 tree = elem;
2565 for (i = 2; i <= start; ++i)
2567 elem = duplicate_tree (elem, dfa);
2568 tree = create_tree (dfa, tree, elem, CONCAT);
2569 if (BE (elem == NULL || tree == NULL, 0))
2570 goto parse_dup_op_espace;
2573 if (start == end)
2574 return tree;
2576 /* Duplicate ELEM before it is marked optional. */
2577 elem = duplicate_tree (elem, dfa);
2578 if (BE (elem == NULL, 0))
2579 goto parse_dup_op_espace;
2580 old_tree = tree;
2582 else
2583 old_tree = NULL;
2585 if (elem->token.type == SUBEXP)
2586 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2588 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2589 if (BE (tree == NULL, 0))
2590 goto parse_dup_op_espace;
2592 /* This loop is actually executed only when end != -1,
2593 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2594 already created the start+1-th copy. */
2595 for (i = start + 2; i <= end; ++i)
2597 elem = duplicate_tree (elem, dfa);
2598 tree = create_tree (dfa, tree, elem, CONCAT);
2599 if (BE (elem == NULL || tree == NULL, 0))
2600 goto parse_dup_op_espace;
2602 tree = create_tree (dfa, tree, NULL, OP_ALT);
2603 if (BE (tree == NULL, 0))
2604 goto parse_dup_op_espace;
2607 if (old_tree)
2608 tree = create_tree (dfa, old_tree, tree, CONCAT);
2610 return tree;
2612 parse_dup_op_espace:
2613 *err = REG_ESPACE;
2614 return NULL;
/* Size of the buffers that hold the name of a collating symbol,
   equivalence class or character class inside a bracket expression.
   The value is a heuristic upper bound that is assumed to be large
   enough.  */
2619 #define BRACKET_NAME_BUF_SIZE 32
2621 #ifndef _LIBC
2622 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2623 Build the range expression which starts from START_ELEM, and ends
2624 at END_ELEM. The result are written to MBCSET and SBCSET.
2625 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2626 mbcset->range_ends, is a pointer argument since we may
2627 update it. */
2629 static reg_errcode_t
2630 # ifdef RE_ENABLE_I18N
2631 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2632 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2633 # else /* not RE_ENABLE_I18N */
2634 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2635 bracket_elem_t *end_elem)
2636 # endif /* not RE_ENABLE_I18N */
2638 unsigned int start_ch, end_ch;
2639 /* Equivalence Classes and Character Classes can't be a range start/end. */
2640 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2641 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2643 return REG_ERANGE;
2645 /* We can handle no multi character collating elements without libc
2646 support. */
2647 if (BE ((start_elem->type == COLL_SYM
2648 && strlen ((char *) start_elem->opr.name) > 1)
2649 || (end_elem->type == COLL_SYM
2650 && strlen ((char *) end_elem->opr.name) > 1), 0))
2651 return REG_ECOLLATE;
2653 # ifdef RE_ENABLE_I18N
2655 wchar_t wc;
2656 wint_t start_wc;
2657 wint_t end_wc;
2658 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2660 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2661 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2662 : 0));
2663 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2664 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2665 : 0));
2666 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2667 ? __btowc (start_ch) : start_elem->opr.wch);
2668 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2669 ? __btowc (end_ch) : end_elem->opr.wch);
2670 if (start_wc == WEOF || end_wc == WEOF)
2671 return REG_ECOLLATE;
2672 cmp_buf[0] = start_wc;
2673 cmp_buf[4] = end_wc;
2674 if (__wcscoll (cmp_buf, cmp_buf + 4) > 0)
2675 return REG_ERANGE;
2677 /* Got valid collation sequence values, add them as a new entry.
2678 However, for !_LIBC we have no collation elements: if the
2679 character set is single byte, the single byte character set
2680 that we build below suffices. parse_bracket_exp passes
2681 no MBCSET if dfa->mb_cur_max == 1. */
2682 if (mbcset)
2684 /* Check the space of the arrays. */
2685 if (BE (*range_alloc == mbcset->nranges, 0))
2687 /* There is not enough space, need realloc. */
2688 wchar_t *new_array_start, *new_array_end;
2689 int new_nranges;
2691 /* +1 in case of mbcset->nranges is 0. */
2692 new_nranges = 2 * mbcset->nranges + 1;
2693 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2694 are NULL if *range_alloc == 0. */
2695 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2696 new_nranges);
2697 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2698 new_nranges);
2700 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2701 return REG_ESPACE;
2703 mbcset->range_starts = new_array_start;
2704 mbcset->range_ends = new_array_end;
2705 *range_alloc = new_nranges;
2708 mbcset->range_starts[mbcset->nranges] = start_wc;
2709 mbcset->range_ends[mbcset->nranges++] = end_wc;
2712 /* Build the table for single byte characters. */
2713 for (wc = 0; wc < SBC_MAX; ++wc)
2715 cmp_buf[2] = wc;
2716 if (__wcscoll (cmp_buf, cmp_buf + 2) <= 0
2717 && __wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2718 bitset_set (sbcset, wc);
2721 # else /* not RE_ENABLE_I18N */
2723 unsigned int ch;
2724 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2725 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2726 : 0));
2727 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2728 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2729 : 0));
2730 if (start_ch > end_ch)
2731 return REG_ERANGE;
2732 /* Build the table for single byte characters. */
2733 for (ch = 0; ch < SBC_MAX; ++ch)
2734 if (start_ch <= ch && ch <= end_ch)
2735 bitset_set (sbcset, ch);
2737 # endif /* not RE_ENABLE_I18N */
2738 return REG_NOERROR;
2740 #endif /* not _LIBC */
2742 #ifndef _LIBC
2743 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2744 Build the collating element which is represented by NAME.
2745 The result are written to MBCSET and SBCSET.
2746 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2747 pointer argument since we may update it. */
2749 static reg_errcode_t
2750 # ifdef RE_ENABLE_I18N
2751 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2752 int *coll_sym_alloc, const unsigned char *name)
2753 # else /* not RE_ENABLE_I18N */
2754 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2755 # endif /* not RE_ENABLE_I18N */
2757 size_t name_len = strlen ((const char *) name);
2758 if (BE (name_len != 1, 0))
2759 return REG_ECOLLATE;
2760 else
2762 bitset_set (sbcset, name[0]);
2763 return REG_NOERROR;
2766 #endif /* not _LIBC */
2768 /* This function parse bracket expression like "[abc]", "[a-c]",
2769 "[[.a-a.]]" etc. */
2771 static bin_tree_t *
2772 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2773 reg_syntax_t syntax, reg_errcode_t *err)
2775 #ifdef _LIBC
2776 const unsigned char *collseqmb;
2777 const char *collseqwc;
2778 uint32_t nrules;
2779 int32_t table_size;
2780 const int32_t *symb_table;
2781 const unsigned char *extra;
2783 /* Local function for parse_bracket_exp used in _LIBC environment.
2784 Seek the collating symbol entry corresponding to NAME.
2785 Return the index of the symbol in the SYMB_TABLE,
2786 or -1 if not found. */
2788 auto inline int32_t
2789 __attribute__ ((always_inline))
2790 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2792 int32_t elem;
2794 for (elem = 0; elem < table_size; elem++)
2795 if (symb_table[2 * elem] != 0)
2797 int32_t idx = symb_table[2 * elem + 1];
2798 /* Skip the name of collating element name. */
2799 idx += 1 + extra[idx];
2800 if (/* Compare the length of the name. */
2801 name_len == extra[idx]
2802 /* Compare the name. */
2803 && memcmp (name, &extra[idx + 1], name_len) == 0)
2804 /* Yep, this is the entry. */
2805 return elem;
2807 return -1;
2810 /* Local function for parse_bracket_exp used in _LIBC environment.
2811 Look up the collation sequence value of BR_ELEM.
2812 Return the value if succeeded, UINT_MAX otherwise. */
2814 auto inline unsigned int
2815 __attribute__ ((always_inline))
2816 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2818 if (br_elem->type == SB_CHAR)
2821 if (MB_CUR_MAX == 1)
2823 if (nrules == 0)
2824 return collseqmb[br_elem->opr.ch];
2825 else
2827 wint_t wc = __btowc (br_elem->opr.ch);
2828 return __collseq_table_lookup (collseqwc, wc);
2831 else if (br_elem->type == MB_CHAR)
2833 if (nrules != 0)
2834 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2836 else if (br_elem->type == COLL_SYM)
2838 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2839 if (nrules != 0)
2841 int32_t elem, idx;
2842 elem = seek_collating_symbol_entry (br_elem->opr.name,
2843 sym_name_len);
2844 if (elem != -1)
2846 /* We found the entry. */
2847 idx = symb_table[2 * elem + 1];
2848 /* Skip the name of collating element name. */
2849 idx += 1 + extra[idx];
2850 /* Skip the byte sequence of the collating element. */
2851 idx += 1 + extra[idx];
2852 /* Adjust for the alignment. */
2853 idx = (idx + 3) & ~3;
2854 /* Skip the multibyte collation sequence value. */
2855 idx += sizeof (unsigned int);
2856 /* Skip the wide char sequence of the collating element. */
2857 idx += sizeof (unsigned int) *
2858 (1 + *(unsigned int *) (extra + idx));
2859 /* Return the collation sequence value. */
2860 return *(unsigned int *) (extra + idx);
2862 else if (sym_name_len == 1)
2864 /* No valid character. Match it as a single byte
2865 character. */
2866 return collseqmb[br_elem->opr.name[0]];
2869 else if (sym_name_len == 1)
2870 return collseqmb[br_elem->opr.name[0]];
2872 return UINT_MAX;
2875 /* Local function for parse_bracket_exp used in _LIBC environment.
2876 Build the range expression which starts from START_ELEM, and ends
2877 at END_ELEM. The result are written to MBCSET and SBCSET.
2878 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2879 mbcset->range_ends, is a pointer argument since we may
2880 update it. */
2882 auto inline reg_errcode_t
2883 __attribute__ ((always_inline))
2884 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2885 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2887 unsigned int ch;
2888 uint32_t start_collseq;
2889 uint32_t end_collseq;
2891 /* Equivalence Classes and Character Classes can't be a range
2892 start/end. */
2893 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2894 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2896 return REG_ERANGE;
2898 start_collseq = lookup_collation_sequence_value (start_elem);
2899 end_collseq = lookup_collation_sequence_value (end_elem);
2900 /* Check start/end collation sequence values. */
2901 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2902 return REG_ECOLLATE;
2903 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2904 return REG_ERANGE;
2906 /* Got valid collation sequence values, add them as a new entry.
2907 However, if we have no collation elements, and the character set
2908 is single byte, the single byte character set that we
2909 build below suffices. */
2910 if (nrules > 0 || dfa->mb_cur_max > 1)
2912 /* Check the space of the arrays. */
2913 if (BE (*range_alloc == mbcset->nranges, 0))
2915 /* There is not enough space, need realloc. */
2916 uint32_t *new_array_start;
2917 uint32_t *new_array_end;
2918 int new_nranges;
2920 /* +1 in case of mbcset->nranges is 0. */
2921 new_nranges = 2 * mbcset->nranges + 1;
2922 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2923 new_nranges);
2924 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2925 new_nranges);
2927 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2928 return REG_ESPACE;
2930 mbcset->range_starts = new_array_start;
2931 mbcset->range_ends = new_array_end;
2932 *range_alloc = new_nranges;
2935 mbcset->range_starts[mbcset->nranges] = start_collseq;
2936 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2939 /* Build the table for single byte characters. */
2940 for (ch = 0; ch < SBC_MAX; ch++)
2942 uint32_t ch_collseq;
2944 if (MB_CUR_MAX == 1)
2946 if (nrules == 0)
2947 ch_collseq = collseqmb[ch];
2948 else
2949 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2950 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2951 bitset_set (sbcset, ch);
2953 return REG_NOERROR;
2956 /* Local function for parse_bracket_exp used in _LIBC environment.
2957 Build the collating element which is represented by NAME.
2958 The result are written to MBCSET and SBCSET.
2959 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2960 pointer argument since we may update it. */
2962 auto inline reg_errcode_t
2963 __attribute__ ((always_inline))
2964 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2965 int *coll_sym_alloc, const unsigned char *name)
2967 int32_t elem, idx;
2968 size_t name_len = strlen ((const char *) name);
2969 if (nrules != 0)
2971 elem = seek_collating_symbol_entry (name, name_len);
2972 if (elem != -1)
2974 /* We found the entry. */
2975 idx = symb_table[2 * elem + 1];
2976 /* Skip the name of collating element name. */
2977 idx += 1 + extra[idx];
2979 else if (name_len == 1)
2981 /* No valid character, treat it as a normal
2982 character. */
2983 bitset_set (sbcset, name[0]);
2984 return REG_NOERROR;
2986 else
2987 return REG_ECOLLATE;
2989 /* Got valid collation sequence, add it as a new entry. */
2990 /* Check the space of the arrays. */
2991 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2993 /* Not enough, realloc it. */
2994 /* +1 in case of mbcset->ncoll_syms is 0. */
2995 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2996 /* Use realloc since mbcset->coll_syms is NULL
2997 if *alloc == 0. */
2998 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2999 new_coll_sym_alloc);
3000 if (BE (new_coll_syms == NULL, 0))
3001 return REG_ESPACE;
3002 mbcset->coll_syms = new_coll_syms;
3003 *coll_sym_alloc = new_coll_sym_alloc;
3005 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3006 return REG_NOERROR;
3008 else
3010 if (BE (name_len != 1, 0))
3011 return REG_ECOLLATE;
3012 else
3014 bitset_set (sbcset, name[0]);
3015 return REG_NOERROR;
3019 #endif
3021 re_token_t br_token;
3022 re_bitset_ptr_t sbcset;
3023 #ifdef RE_ENABLE_I18N
3024 re_charset_t *mbcset;
3025 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3026 int equiv_class_alloc = 0, char_class_alloc = 0;
3027 #endif /* not RE_ENABLE_I18N */
3028 int non_match = 0;
3029 bin_tree_t *work_tree;
3030 int token_len;
3031 int first_round = 1;
3032 #ifdef _LIBC
3033 collseqmb = (const unsigned char *)
3034 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3035 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3036 if (nrules)
3039 if (MB_CUR_MAX > 1)
3041 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3042 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3043 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3044 _NL_COLLATE_SYMB_TABLEMB);
3045 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3046 _NL_COLLATE_SYMB_EXTRAMB);
3048 #endif
3049 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3050 #ifdef RE_ENABLE_I18N
3051 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3052 #endif /* RE_ENABLE_I18N */
3053 #ifdef RE_ENABLE_I18N
3054 if (BE (sbcset == NULL || mbcset == NULL, 0))
3055 #else
3056 if (BE (sbcset == NULL, 0))
3057 #endif /* RE_ENABLE_I18N */
3059 re_free (sbcset);
3060 #ifdef RE_ENABLE_I18N
3061 re_free (mbcset);
3062 #endif
3063 *err = REG_ESPACE;
3064 return NULL;
3067 token_len = peek_token_bracket (token, regexp, syntax);
3068 if (BE (token->type == END_OF_RE, 0))
3070 *err = REG_BADPAT;
3071 goto parse_bracket_exp_free_return;
3073 if (token->type == OP_NON_MATCH_LIST)
3075 #ifdef RE_ENABLE_I18N
3076 mbcset->non_match = 1;
3077 #endif /* not RE_ENABLE_I18N */
3078 non_match = 1;
3079 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3080 bitset_set (sbcset, '\n');
3081 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3082 token_len = peek_token_bracket (token, regexp, syntax);
3083 if (BE (token->type == END_OF_RE, 0))
3085 *err = REG_BADPAT;
3086 goto parse_bracket_exp_free_return;
3090 /* We treat the first ']' as a normal character. */
3091 if (token->type == OP_CLOSE_BRACKET)
3092 token->type = CHARACTER;
3094 while (1)
3096 bracket_elem_t start_elem, end_elem;
3097 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3098 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3099 reg_errcode_t ret;
3100 int token_len2 = 0, is_range_exp = 0;
3101 re_token_t token2;
3103 start_elem.opr.name = start_name_buf;
3104 start_elem.type = COLL_SYM;
3105 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3106 syntax, first_round);
3107 if (BE (ret != REG_NOERROR, 0))
3109 *err = ret;
3110 goto parse_bracket_exp_free_return;
3112 first_round = 0;
3114 /* Get information about the next token. We need it in any case. */
3115 token_len = peek_token_bracket (token, regexp, syntax);
3117 /* Do not check for ranges if we know they are not allowed. */
3118 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3120 if (BE (token->type == END_OF_RE, 0))
3122 *err = REG_EBRACK;
3123 goto parse_bracket_exp_free_return;
3125 if (token->type == OP_CHARSET_RANGE)
3127 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3128 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3129 if (BE (token2.type == END_OF_RE, 0))
3131 *err = REG_EBRACK;
3132 goto parse_bracket_exp_free_return;
3134 if (token2.type == OP_CLOSE_BRACKET)
3136 /* We treat the last '-' as a normal character. */
3137 re_string_skip_bytes (regexp, -token_len);
3138 token->type = CHARACTER;
3140 else
3141 is_range_exp = 1;
3145 if (is_range_exp == 1)
3147 end_elem.opr.name = end_name_buf;
3148 end_elem.type = COLL_SYM;
3149 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3150 dfa, syntax, 1);
3151 if (BE (ret != REG_NOERROR, 0))
3153 *err = ret;
3154 goto parse_bracket_exp_free_return;
3157 token_len = peek_token_bracket (token, regexp, syntax);
3159 #ifdef _LIBC
3160 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3161 &start_elem, &end_elem);
3162 #else
3163 # ifdef RE_ENABLE_I18N
3164 *err = build_range_exp (sbcset,
3165 dfa->mb_cur_max > 1 ? mbcset : NULL,
3166 &range_alloc, &start_elem, &end_elem);
3167 # else
3168 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3169 # endif
3170 #endif /* RE_ENABLE_I18N */
3171 if (BE (*err != REG_NOERROR, 0))
3172 goto parse_bracket_exp_free_return;
3174 else
3176 switch (start_elem.type)
3178 case SB_CHAR:
3179 bitset_set (sbcset, start_elem.opr.ch);
3180 break;
3181 #ifdef RE_ENABLE_I18N
3182 case MB_CHAR:
3183 /* Check whether the array has enough space. */
3184 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3186 wchar_t *new_mbchars;
3187 /* Not enough, realloc it. */
3188 /* +1 in case of mbcset->nmbchars is 0. */
3189 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3190 /* Use realloc since array is NULL if *alloc == 0. */
3191 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3192 mbchar_alloc);
3193 if (BE (new_mbchars == NULL, 0))
3194 goto parse_bracket_exp_espace;
3195 mbcset->mbchars = new_mbchars;
3197 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3198 break;
3199 #endif /* RE_ENABLE_I18N */
3200 case EQUIV_CLASS:
3201 *err = build_equiv_class (sbcset,
3202 #ifdef RE_ENABLE_I18N
3203 mbcset, &equiv_class_alloc,
3204 #endif /* RE_ENABLE_I18N */
3205 start_elem.opr.name);
3206 if (BE (*err != REG_NOERROR, 0))
3207 goto parse_bracket_exp_free_return;
3208 break;
3209 case COLL_SYM:
3210 *err = build_collating_symbol (sbcset,
3211 #ifdef RE_ENABLE_I18N
3212 mbcset, &coll_sym_alloc,
3213 #endif /* RE_ENABLE_I18N */
3214 start_elem.opr.name);
3215 if (BE (*err != REG_NOERROR, 0))
3216 goto parse_bracket_exp_free_return;
3217 break;
3218 case CHAR_CLASS:
3219 *err = build_charclass (regexp->trans, sbcset,
3220 #ifdef RE_ENABLE_I18N
3221 mbcset, &char_class_alloc,
3222 #endif /* RE_ENABLE_I18N */
3223 start_elem.opr.name, syntax);
3224 if (BE (*err != REG_NOERROR, 0))
3225 goto parse_bracket_exp_free_return;
3226 break;
3227 default:
3228 assert (0);
3229 break;
3232 if (BE (token->type == END_OF_RE, 0))
3234 *err = REG_EBRACK;
3235 goto parse_bracket_exp_free_return;
3237 if (token->type == OP_CLOSE_BRACKET)
3238 break;
3241 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3243 /* If it is non-matching list. */
3244 if (non_match)
3245 bitset_not (sbcset);
3247 #ifdef RE_ENABLE_I18N
3248 /* Ensure only single byte characters are set. */
3249 if (dfa->mb_cur_max > 1)
3250 bitset_mask (sbcset, dfa->sb_char);
3252 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3253 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3254 || mbcset->non_match)))
3256 bin_tree_t *mbc_tree;
3257 int sbc_idx;
3258 /* Build a tree for complex bracket. */
3259 dfa->has_mb_node = 1;
3260 br_token.type = COMPLEX_BRACKET;
3261 br_token.opr.mbcset = mbcset;
3262 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3263 if (BE (mbc_tree == NULL, 0))
3264 goto parse_bracket_exp_espace;
3265 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3266 if (sbcset[sbc_idx])
3267 break;
3268 /* If there are no bits set in sbcset, there is no point
3269 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3270 if (sbc_idx < BITSET_WORDS)
3272 /* Build a tree for simple bracket. */
3273 br_token.type = SIMPLE_BRACKET;
3274 br_token.opr.sbcset = sbcset;
3275 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3276 if (BE (work_tree == NULL, 0))
3277 goto parse_bracket_exp_espace;
3279 /* Then join them by ALT node. */
3280 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3281 if (BE (work_tree == NULL, 0))
3282 goto parse_bracket_exp_espace;
3284 else
3286 re_free (sbcset);
3287 work_tree = mbc_tree;
3290 else
3291 #endif /* not RE_ENABLE_I18N */
3293 #ifdef RE_ENABLE_I18N
3294 free_charset (mbcset);
3295 #endif
3296 /* Build a tree for simple bracket. */
3297 br_token.type = SIMPLE_BRACKET;
3298 br_token.opr.sbcset = sbcset;
3299 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3300 if (BE (work_tree == NULL, 0))
3301 goto parse_bracket_exp_espace;
3303 return work_tree;
3305 parse_bracket_exp_espace:
3306 *err = REG_ESPACE;
3307 parse_bracket_exp_free_return:
3308 re_free (sbcset);
3309 #ifdef RE_ENABLE_I18N
3310 free_charset (mbcset);
3311 #endif /* RE_ENABLE_I18N */
3312 return NULL;
3315 /* Parse an element in the bracket expression. */
3317 static reg_errcode_t
3318 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3319 re_token_t *token, int token_len, re_dfa_t *dfa,
3320 reg_syntax_t syntax, int accept_hyphen)
3322 #ifdef RE_ENABLE_I18N
3323 int cur_char_size;
3324 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3325 if (cur_char_size > 1)
3327 elem->type = MB_CHAR;
3328 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3329 re_string_skip_bytes (regexp, cur_char_size);
3330 return REG_NOERROR;
3332 #endif /* RE_ENABLE_I18N */
3333 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3334 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3335 || token->type == OP_OPEN_EQUIV_CLASS)
3336 return parse_bracket_symbol (elem, regexp, token);
3337 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3339 /* A '-' must only appear as anything but a range indicator before
3340 the closing bracket. Everything else is an error. */
3341 re_token_t token2;
3342 (void) peek_token_bracket (&token2, regexp, syntax);
3343 if (token2.type != OP_CLOSE_BRACKET)
3344 /* The actual error value is not standardized since this whole
3345 case is undefined. But ERANGE makes good sense. */
3346 return REG_ERANGE;
3348 elem->type = SB_CHAR;
3349 elem->opr.ch = token->opr.c;
3350 return REG_NOERROR;
3353 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3354 such as [:<character_class>:], [.<collating_element>.], and
3355 [=<equivalent_class>=]. */
3357 static reg_errcode_t
3358 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3359 re_token_t *token)
3361 unsigned char ch, delim = token->opr.c;
3362 int i = 0;
3363 if (re_string_eoi(regexp))
3364 return REG_EBRACK;
3365 for (;; ++i)
3367 if (i >= BRACKET_NAME_BUF_SIZE)
3368 return REG_EBRACK;
3369 if (token->type == OP_OPEN_CHAR_CLASS)
3370 ch = re_string_fetch_byte_case (regexp);
3371 else
3372 ch = re_string_fetch_byte (regexp);
3373 if (re_string_eoi(regexp))
3374 return REG_EBRACK;
3375 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3376 break;
3377 elem->opr.name[i] = ch;
3379 re_string_skip_bytes (regexp, 1);
3380 elem->opr.name[i] = '\0';
3381 switch (token->type)
3383 case OP_OPEN_COLL_ELEM:
3384 elem->type = COLL_SYM;
3385 break;
3386 case OP_OPEN_EQUIV_CLASS:
3387 elem->type = EQUIV_CLASS;
3388 break;
3389 case OP_OPEN_CHAR_CLASS:
3390 elem->type = CHAR_CLASS;
3391 break;
3392 default:
3393 break;
3395 return REG_NOERROR;
3398 /* Helper function for parse_bracket_exp.
3399 Build the equivalence class which is represented by NAME.
3400 The result are written to MBCSET and SBCSET.
3401 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3402 is a pointer argument since we may update it. */
3404 static reg_errcode_t
3405 #ifdef RE_ENABLE_I18N
3406 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3407 int *equiv_class_alloc, const unsigned char *name)
3408 #else /* not RE_ENABLE_I18N */
3409 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3410 #endif /* not RE_ENABLE_I18N */
3412 #ifdef _LIBC
3413 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3414 if (nrules != 0)
3416 const int32_t *table, *indirect;
3417 const unsigned char *weights, *extra, *cp;
3418 unsigned char char_buf[2];
3419 int32_t idx1, idx2;
3420 unsigned int ch;
3421 size_t len;
3422 /* Calculate the index for equivalence class. */
3423 cp = name;
3424 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3425 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3426 _NL_COLLATE_WEIGHTMB);
3427 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3428 _NL_COLLATE_EXTRAMB);
3429 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3430 _NL_COLLATE_INDIRECTMB);
3431 idx1 = findidx (table, indirect, extra, &cp, -1);
3432 if (BE (idx1 == 0 || *cp != '\0', 0))
3433 /* This isn't a valid character. */
3434 return REG_ECOLLATE;
3436 /* Build single byte matching table for this equivalence class. */
3437 len = weights[idx1 & 0xffffff];
3438 for (ch = 0; ch < SBC_MAX; ++ch)
3440 char_buf[0] = ch;
3441 cp = char_buf;
3442 idx2 = findidx (table, indirect, extra, &cp, 1);
3444 idx2 = table[ch];
3446 if (idx2 == 0)
3447 /* This isn't a valid character. */
3448 continue;
3449 /* Compare only if the length matches and the collation rule
3450 index is the same. */
3451 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3453 int cnt = 0;
3455 while (cnt <= len &&
3456 weights[(idx1 & 0xffffff) + 1 + cnt]
3457 == weights[(idx2 & 0xffffff) + 1 + cnt])
3458 ++cnt;
3460 if (cnt > len)
3461 bitset_set (sbcset, ch);
3464 /* Check whether the array has enough space. */
3465 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3467 /* Not enough, realloc it. */
3468 /* +1 in case of mbcset->nequiv_classes is 0. */
3469 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3470 /* Use realloc since the array is NULL if *alloc == 0. */
3471 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3472 int32_t,
3473 new_equiv_class_alloc);
3474 if (BE (new_equiv_classes == NULL, 0))
3475 return REG_ESPACE;
3476 mbcset->equiv_classes = new_equiv_classes;
3477 *equiv_class_alloc = new_equiv_class_alloc;
3479 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3481 else
3482 #endif /* _LIBC */
3484 if (BE (strlen ((const char *) name) != 1, 0))
3485 return REG_ECOLLATE;
3486 bitset_set (sbcset, *name);
3488 return REG_NOERROR;
3491 /* Helper function for parse_bracket_exp.
3492 Build the character class which is represented by NAME.
3493 The result are written to MBCSET and SBCSET.
3494 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3495 is a pointer argument since we may update it. */
3497 static reg_errcode_t
3498 #ifdef RE_ENABLE_I18N
3499 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3500 re_charset_t *mbcset, int *char_class_alloc,
3501 const unsigned char *class_name, reg_syntax_t syntax)
3502 #else /* not RE_ENABLE_I18N */
3503 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3504 const unsigned char *class_name, reg_syntax_t syntax)
3505 #endif /* not RE_ENABLE_I18N */
3507 int i;
3508 const char *name = (const char *) class_name;
3510 /* In case of REG_ICASE "upper" and "lower" match the both of
3511 upper and lower cases. */
3512 if ((syntax & RE_ICASE)
3513 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3514 name = "alpha";
3516 #ifdef RE_ENABLE_I18N
3517 /* Check the space of the arrays. */
3518 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3520 /* Not enough, realloc it. */
3521 /* +1 in case of mbcset->nchar_classes is 0. */
3522 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3523 /* Use realloc since array is NULL if *alloc == 0. */
3524 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3525 new_char_class_alloc);
3526 if (BE (new_char_classes == NULL, 0))
3527 return REG_ESPACE;
3528 mbcset->char_classes = new_char_classes;
3529 *char_class_alloc = new_char_class_alloc;
3531 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3532 #endif /* RE_ENABLE_I18N */
3534 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3535 do { \
3536 if (BE (trans != NULL, 0)) \
3538 for (i = 0; i < SBC_MAX; ++i) \
3539 if (ctype_func (i)) \
3540 bitset_set (sbcset, trans[i]); \
3542 else \
3544 for (i = 0; i < SBC_MAX; ++i) \
3545 if (ctype_func (i)) \
3546 bitset_set (sbcset, i); \
3548 } while (0)
3550 if (strcmp (name, "alnum") == 0)
3551 BUILD_CHARCLASS_LOOP (isalnum);
3552 else if (strcmp (name, "cntrl") == 0)
3553 BUILD_CHARCLASS_LOOP (iscntrl);
3554 else if (strcmp (name, "lower") == 0)
3555 BUILD_CHARCLASS_LOOP (islower);
3556 else if (strcmp (name, "space") == 0)
3557 BUILD_CHARCLASS_LOOP (isspace);
3558 else if (strcmp (name, "alpha") == 0)
3559 BUILD_CHARCLASS_LOOP (isalpha);
3560 else if (strcmp (name, "digit") == 0)
3561 BUILD_CHARCLASS_LOOP (isdigit);
3562 else if (strcmp (name, "print") == 0)
3563 BUILD_CHARCLASS_LOOP (isprint);
3564 else if (strcmp (name, "upper") == 0)
3565 BUILD_CHARCLASS_LOOP (isupper);
3566 else if (strcmp (name, "blank") == 0)
3567 BUILD_CHARCLASS_LOOP (isblank);
3568 else if (strcmp (name, "graph") == 0)
3569 BUILD_CHARCLASS_LOOP (isgraph);
3570 else if (strcmp (name, "punct") == 0)
3571 BUILD_CHARCLASS_LOOP (ispunct);
3572 else if (strcmp (name, "xdigit") == 0)
3573 BUILD_CHARCLASS_LOOP (isxdigit);
3574 else
3575 return REG_ECTYPE;
3577 return REG_NOERROR;
3580 static bin_tree_t *
3581 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3582 const unsigned char *class_name,
3583 const unsigned char *extra, int non_match,
3584 reg_errcode_t *err)
3586 re_bitset_ptr_t sbcset;
3587 #ifdef RE_ENABLE_I18N
3588 re_charset_t *mbcset;
3589 int alloc = 0;
3590 #endif /* not RE_ENABLE_I18N */
3591 reg_errcode_t ret;
3592 re_token_t br_token;
3593 bin_tree_t *tree;
3595 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3596 #ifdef RE_ENABLE_I18N
3597 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3598 #endif /* RE_ENABLE_I18N */
3600 #ifdef RE_ENABLE_I18N
3601 if (BE (sbcset == NULL || mbcset == NULL, 0))
3602 #else /* not RE_ENABLE_I18N */
3603 if (BE (sbcset == NULL, 0))
3604 #endif /* not RE_ENABLE_I18N */
3606 *err = REG_ESPACE;
3607 return NULL;
3610 if (non_match)
3612 #ifdef RE_ENABLE_I18N
3613 mbcset->non_match = 1;
3614 #endif /* not RE_ENABLE_I18N */
3617 /* We don't care the syntax in this case. */
3618 ret = build_charclass (trans, sbcset,
3619 #ifdef RE_ENABLE_I18N
3620 mbcset, &alloc,
3621 #endif /* RE_ENABLE_I18N */
3622 class_name, 0);
3624 if (BE (ret != REG_NOERROR, 0))
3626 re_free (sbcset);
3627 #ifdef RE_ENABLE_I18N
3628 free_charset (mbcset);
3629 #endif /* RE_ENABLE_I18N */
3630 *err = ret;
3631 return NULL;
3633 /* \w match '_' also. */
3634 for (; *extra; extra++)
3635 bitset_set (sbcset, *extra);
3637 /* If it is non-matching list. */
3638 if (non_match)
3639 bitset_not (sbcset);
3641 #ifdef RE_ENABLE_I18N
3642 /* Ensure only single byte characters are set. */
3643 if (dfa->mb_cur_max > 1)
3644 bitset_mask (sbcset, dfa->sb_char);
3645 #endif
3647 /* Build a tree for simple bracket. */
3648 br_token.type = SIMPLE_BRACKET;
3649 br_token.opr.sbcset = sbcset;
3650 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3651 if (BE (tree == NULL, 0))
3652 goto build_word_op_espace;
3654 #ifdef RE_ENABLE_I18N
3655 if (dfa->mb_cur_max > 1)
3657 bin_tree_t *mbc_tree;
3658 /* Build a tree for complex bracket. */
3659 br_token.type = COMPLEX_BRACKET;
3660 br_token.opr.mbcset = mbcset;
3661 dfa->has_mb_node = 1;
3662 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3663 if (BE (mbc_tree == NULL, 0))
3664 goto build_word_op_espace;
3665 /* Then join them by ALT node. */
3666 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3667 if (BE (mbc_tree != NULL, 1))
3668 return tree;
3670 else
3672 free_charset (mbcset);
3673 return tree;
3675 #else /* not RE_ENABLE_I18N */
3676 return tree;
3677 #endif /* not RE_ENABLE_I18N */
3679 build_word_op_espace:
3680 re_free (sbcset);
3681 #ifdef RE_ENABLE_I18N
3682 free_charset (mbcset);
3683 #endif /* RE_ENABLE_I18N */
3684 *err = REG_ESPACE;
3685 return NULL;
3688 /* This is intended for the expressions like "a{1,3}".
3689 Fetch a number from `input', and return the number.
3690 Return -1, if the number field is empty like "{,1}".
3691 Return -2, If an error is occured. */
3693 static int
3694 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3696 int num = -1;
3697 unsigned char c;
3698 while (1)
3700 fetch_token (token, input, syntax);
3701 c = token->opr.c;
3702 if (BE (token->type == END_OF_RE, 0))
3703 return -2;
3704 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3705 break;
3706 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3707 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3708 num = (num > RE_DUP_MAX) ? -2 : num;
3710 return num;
#ifdef RE_ENABLE_I18N
/* Release the arrays owned by CSET and then CSET itself.  The
   collation-related arrays are released only in the _LIBC build.  */

static void
free_charset (re_charset_t *cset)
{
  re_free (cset->char_classes);
# ifdef _LIBC
  re_free (cset->range_ends);
  re_free (cset->range_starts);
  re_free (cset->equiv_classes);
  re_free (cset->coll_syms);
# endif
  re_free (cset->mbchars);
  re_free (cset);
}
#endif /* RE_ENABLE_I18N */
3729 /* Functions for binary tree operation. */
3731 /* Create a tree node. */
3733 static bin_tree_t *
3734 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3735 re_token_type_t type)
3737 re_token_t t;
3738 t.type = type;
3739 return create_token_tree (dfa, left, right, &t);
3742 static bin_tree_t *
3743 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3744 const re_token_t *token)
3746 bin_tree_t *tree;
3747 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3749 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3751 if (storage == NULL)
3752 return NULL;
3753 storage->next = dfa->str_tree_storage;
3754 dfa->str_tree_storage = storage;
3755 dfa->str_tree_storage_idx = 0;
3757 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3759 tree->parent = NULL;
3760 tree->left = left;
3761 tree->right = right;
3762 tree->token = *token;
3763 tree->token.duplicated = 0;
3764 tree->token.opt_subexp = 0;
3765 tree->first = NULL;
3766 tree->next = NULL;
3767 tree->node_idx = -1;
3769 if (left != NULL)
3770 left->parent = tree;
3771 if (right != NULL)
3772 right->parent = tree;
3773 return tree;
3776 /* Mark the tree SRC as an optional subexpression.
3777 To be called from preorder or postorder. */
3779 static reg_errcode_t
3780 mark_opt_subexp (void *extra, bin_tree_t *node)
3782 int idx = (int) (long) extra;
3783 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3784 node->token.opt_subexp = 1;
3786 return REG_NOERROR;
3789 /* Free the allocated memory inside NODE. */
3791 static void
3792 free_token (re_token_t *node)
3794 #ifdef RE_ENABLE_I18N
3795 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3796 free_charset (node->opr.mbcset);
3797 else
3798 #endif /* RE_ENABLE_I18N */
3799 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3800 re_free (node->opr.sbcset);
3803 /* Worker function for tree walking. Free the allocated memory inside NODE
3804 and its children. */
3806 static reg_errcode_t
3807 free_tree (void *extra, bin_tree_t *node)
3809 free_token (&node->token);
3810 return REG_NOERROR;
3814 /* Duplicate the node SRC, and return new node. This is a preorder
3815 visit similar to the one implemented by the generic visitor, but
3816 we need more infrastructure to maintain two parallel trees --- so,
3817 it's easier to duplicate. */
3819 static bin_tree_t *
3820 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3822 const bin_tree_t *node;
3823 bin_tree_t *dup_root;
3824 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3826 for (node = root; ; )
3828 /* Create a new tree and link it back to the current parent. */
3829 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3830 if (*p_new == NULL)
3831 return NULL;
3832 (*p_new)->parent = dup_node;
3833 (*p_new)->token.duplicated = 1;
3834 dup_node = *p_new;
3836 /* Go to the left node, or up and to the right. */
3837 if (node->left)
3839 node = node->left;
3840 p_new = &dup_node->left;
3842 else
3844 const bin_tree_t *prev = NULL;
3845 while (node->right == prev || node->right == NULL)
3847 prev = node;
3848 node = node->parent;
3849 dup_node = dup_node->parent;
3850 if (!node)
3851 return dup_root;
3853 node = node->right;
3854 p_new = &dup_node->right;