posix: Remove internal_function attribute
[glibc.git] / posix / regcomp.c
bloba5b46139a96f2f41eabacc023784144038bd46e9
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 #include <stdint.h>
22 #ifdef _LIBC
23 # include <locale/weight.h>
24 #endif
26 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
27 size_t length, reg_syntax_t syntax);
28 static void re_compile_fastmap_iter (regex_t *bufp,
29 const re_dfastate_t *init_state,
30 char *fastmap);
31 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
32 #ifdef RE_ENABLE_I18N
33 static void free_charset (re_charset_t *cset);
34 #endif /* RE_ENABLE_I18N */
35 static void free_workarea_compile (regex_t *preg);
36 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
37 #ifdef RE_ENABLE_I18N
38 static void optimize_utf8 (re_dfa_t *dfa);
39 #endif
40 static reg_errcode_t analyze (regex_t *preg);
41 static reg_errcode_t preorder (bin_tree_t *root,
42 reg_errcode_t (fn (void *, bin_tree_t *)),
43 void *extra);
44 static reg_errcode_t postorder (bin_tree_t *root,
45 reg_errcode_t (fn (void *, bin_tree_t *)),
46 void *extra);
47 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
48 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
49 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
50 bin_tree_t *node);
51 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
52 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
53 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
54 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
55 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
56 unsigned int constraint);
57 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
58 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
59 int node, int root);
60 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
61 static int fetch_number (re_string_t *input, re_token_t *token,
62 reg_syntax_t syntax);
63 static int peek_token (re_token_t *token, re_string_t *input,
64 reg_syntax_t syntax);
65 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
66 reg_syntax_t syntax, reg_errcode_t *err);
67 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
74 re_token_t *token, reg_syntax_t syntax,
75 int nest, reg_errcode_t *err);
76 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
77 re_token_t *token, reg_syntax_t syntax,
78 int nest, reg_errcode_t *err);
79 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
80 re_dfa_t *dfa, re_token_t *token,
81 reg_syntax_t syntax, reg_errcode_t *err);
82 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
83 re_token_t *token, reg_syntax_t syntax,
84 reg_errcode_t *err);
85 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
86 re_string_t *regexp,
87 re_token_t *token, int token_len,
88 re_dfa_t *dfa,
89 reg_syntax_t syntax,
90 int accept_hyphen);
91 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
92 re_string_t *regexp,
93 re_token_t *token);
94 #ifdef RE_ENABLE_I18N
95 static reg_errcode_t build_equiv_class (bitset_t sbcset,
96 re_charset_t *mbcset,
97 int *equiv_class_alloc,
98 const unsigned char *name);
99 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
100 bitset_t sbcset,
101 re_charset_t *mbcset,
102 int *char_class_alloc,
103 const unsigned char *class_name,
104 reg_syntax_t syntax);
105 #else /* not RE_ENABLE_I18N */
106 static reg_errcode_t build_equiv_class (bitset_t sbcset,
107 const unsigned char *name);
108 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
109 bitset_t sbcset,
110 const unsigned char *class_name,
111 reg_syntax_t syntax);
112 #endif /* not RE_ENABLE_I18N */
113 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
114 RE_TRANSLATE_TYPE trans,
115 const unsigned char *class_name,
116 const unsigned char *extra,
117 int non_match, reg_errcode_t *err);
118 static bin_tree_t *create_tree (re_dfa_t *dfa,
119 bin_tree_t *left, bin_tree_t *right,
120 re_token_type_t type);
121 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
122 bin_tree_t *left, bin_tree_t *right,
123 const re_token_t *token);
124 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
125 static void free_token (re_token_t *node);
126 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
127 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
129 /* This table gives an error message for each of the error codes listed
130 in regex.h. Obviously the order here has to be same as there.
131 POSIX doesn't require that we do anything for REG_NOERROR,
132 but why not be nice? */
134 const char __re_error_msgid[] attribute_hidden =
136 #define REG_NOERROR_IDX 0
137 gettext_noop ("Success") /* REG_NOERROR */
138 "\0"
139 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
140 gettext_noop ("No match") /* REG_NOMATCH */
141 "\0"
142 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
143 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
144 "\0"
145 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
146 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
147 "\0"
148 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
149 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
150 "\0"
151 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
152 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
153 "\0"
154 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
155 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
156 "\0"
157 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
158 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
159 "\0"
160 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
161 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
162 "\0"
163 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
164 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
165 "\0"
166 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
167 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
168 "\0"
169 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
170 gettext_noop ("Invalid range end") /* REG_ERANGE */
171 "\0"
172 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
173 gettext_noop ("Memory exhausted") /* REG_ESPACE */
174 "\0"
175 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
176 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
177 "\0"
178 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
179 gettext_noop ("Premature end of regular expression") /* REG_EEND */
180 "\0"
181 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
182 gettext_noop ("Regular expression too big") /* REG_ESIZE */
183 "\0"
184 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
185 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
188 const size_t __re_error_msgid_idx[] attribute_hidden =
190 REG_NOERROR_IDX,
191 REG_NOMATCH_IDX,
192 REG_BADPAT_IDX,
193 REG_ECOLLATE_IDX,
194 REG_ECTYPE_IDX,
195 REG_EESCAPE_IDX,
196 REG_ESUBREG_IDX,
197 REG_EBRACK_IDX,
198 REG_EPAREN_IDX,
199 REG_EBRACE_IDX,
200 REG_BADBR_IDX,
201 REG_ERANGE_IDX,
202 REG_ESPACE_IDX,
203 REG_BADRPT_IDX,
204 REG_EEND_IDX,
205 REG_ESIZE_IDX,
206 REG_ERPAREN_IDX
209 /* Entry points for GNU code. */
211 /* re_compile_pattern is the GNU regular expression compiler: it
212 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
213 Returns 0 if the pattern was valid, otherwise an error string.
215 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
216 are set in BUFP on entry. */
218 const char *
219 re_compile_pattern (const char *pattern, size_t length,
220 struct re_pattern_buffer *bufp)
222 reg_errcode_t ret;
224 /* And GNU code determines whether or not to get register information
225 by passing null for the REGS argument to re_match, etc., not by
226 setting no_sub, unless RE_NO_SUB is set. */
227 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
229 /* Match anchors at newline. */
230 bufp->newline_anchor = 1;
232 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
234 if (!ret)
235 return NULL;
236 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
238 #ifdef _LIBC
239 weak_alias (__re_compile_pattern, re_compile_pattern)
240 #endif
242 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
243 also be assigned to arbitrarily: each pattern buffer stores its own
244 syntax, so it can be changed between regex compilations. */
245 /* This has no initializer because initialized variables in Emacs
246 become read-only after dumping. */
247 reg_syntax_t re_syntax_options;
250 /* Specify the precise syntax of regexps for compilation. This provides
251 for compatibility for various utilities which historically have
252 different, incompatible syntaxes.
254 The argument SYNTAX is a bit mask comprised of the various bits
255 defined in regex.h. We return the old syntax. */
257 reg_syntax_t
258 re_set_syntax (reg_syntax_t syntax)
260 reg_syntax_t ret = re_syntax_options;
262 re_syntax_options = syntax;
263 return ret;
265 #ifdef _LIBC
266 weak_alias (__re_set_syntax, re_set_syntax)
267 #endif
270 re_compile_fastmap (struct re_pattern_buffer *bufp)
272 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
273 char *fastmap = bufp->fastmap;
275 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
276 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
277 if (dfa->init_state != dfa->init_state_word)
278 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
279 if (dfa->init_state != dfa->init_state_nl)
280 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
281 if (dfa->init_state != dfa->init_state_begbuf)
282 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
283 bufp->fastmap_accurate = 1;
284 return 0;
286 #ifdef _LIBC
287 weak_alias (__re_compile_fastmap, re_compile_fastmap)
288 #endif
290 static inline void
291 __attribute__ ((always_inline))
292 re_set_fastmap (char *fastmap, bool icase, int ch)
294 fastmap[ch] = 1;
295 if (icase)
296 fastmap[tolower (ch)] = 1;
299 /* Helper function for re_compile_fastmap.
300 Compile fastmap for the initial_state INIT_STATE. */
302 static void
303 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
304 char *fastmap)
306 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
307 int node_cnt;
308 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
309 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
311 int node = init_state->nodes.elems[node_cnt];
312 re_token_type_t type = dfa->nodes[node].type;
314 if (type == CHARACTER)
316 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
317 #ifdef RE_ENABLE_I18N
318 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
320 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
321 wchar_t wc;
322 mbstate_t state;
324 p = buf;
325 *p++ = dfa->nodes[node].opr.c;
326 while (++node < dfa->nodes_len
327 && dfa->nodes[node].type == CHARACTER
328 && dfa->nodes[node].mb_partial)
329 *p++ = dfa->nodes[node].opr.c;
330 memset (&state, '\0', sizeof (state));
331 if (__mbrtowc (&wc, (const char *) buf, p - buf,
332 &state) == p - buf
333 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
334 != (size_t) -1))
335 re_set_fastmap (fastmap, 0, buf[0]);
337 #endif
339 else if (type == SIMPLE_BRACKET)
341 int i, ch;
342 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
344 int j;
345 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
346 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
347 if (w & ((bitset_word_t) 1 << j))
348 re_set_fastmap (fastmap, icase, ch);
351 #ifdef RE_ENABLE_I18N
352 else if (type == COMPLEX_BRACKET)
354 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
355 int i;
357 # ifdef _LIBC
358 /* See if we have to try all bytes which start multiple collation
359 elements.
360 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
361 collation element, and don't catch 'b' since 'b' is
362 the only collation element which starts from 'b' (and
363 it is caught by SIMPLE_BRACKET). */
364 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
365 && (cset->ncoll_syms || cset->nranges))
367 const int32_t *table = (const int32_t *)
368 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
369 for (i = 0; i < SBC_MAX; ++i)
370 if (table[i] < 0)
371 re_set_fastmap (fastmap, icase, i);
373 # endif /* _LIBC */
375 /* See if we have to start the match at all multibyte characters,
376 i.e. where we would not find an invalid sequence. This only
377 applies to multibyte character sets; for single byte character
378 sets, the SIMPLE_BRACKET again suffices. */
379 if (dfa->mb_cur_max > 1
380 && (cset->nchar_classes || cset->non_match || cset->nranges
381 # ifdef _LIBC
382 || cset->nequiv_classes
383 # endif /* _LIBC */
386 unsigned char c = 0;
389 mbstate_t mbs;
390 memset (&mbs, 0, sizeof (mbs));
391 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
392 re_set_fastmap (fastmap, false, (int) c);
394 while (++c != 0);
397 else
399 /* ... Else catch all bytes which can start the mbchars. */
400 for (i = 0; i < cset->nmbchars; ++i)
402 char buf[256];
403 mbstate_t state;
404 memset (&state, '\0', sizeof (state));
405 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
406 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
407 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
409 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
410 != (size_t) -1)
411 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
416 #endif /* RE_ENABLE_I18N */
417 else if (type == OP_PERIOD
418 #ifdef RE_ENABLE_I18N
419 || type == OP_UTF8_PERIOD
420 #endif /* RE_ENABLE_I18N */
421 || type == END_OF_RE)
423 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
424 if (type == END_OF_RE)
425 bufp->can_be_null = 1;
426 return;
431 /* Entry point for POSIX code. */
432 /* regcomp takes a regular expression as a string and compiles it.
434 PREG is a regex_t *. We do not expect any fields to be initialized,
435 since POSIX says we shouldn't. Thus, we set
437 'buffer' to the compiled pattern;
438 'used' to the length of the compiled pattern;
439 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
440 REG_EXTENDED bit in CFLAGS is set; otherwise, to
441 RE_SYNTAX_POSIX_BASIC;
442 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
443 'fastmap' to an allocated space for the fastmap;
444 'fastmap_accurate' to zero;
445 're_nsub' to the number of subexpressions in PATTERN.
447 PATTERN is the address of the pattern string.
449 CFLAGS is a series of bits which affect compilation.
451 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
452 use POSIX basic syntax.
454 If REG_NEWLINE is set, then . and [^...] don't match newline.
455 Also, regexec will try a match beginning after every newline.
457 If REG_ICASE is set, then we considers upper- and lowercase
458 versions of letters to be equivalent when matching.
460 If REG_NOSUB is set, then when PREG is passed to regexec, that
461 routine will report only success or failure, and nothing about the
462 registers.
464 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
465 the return codes and their meanings.) */
468 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
470 reg_errcode_t ret;
471 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
472 : RE_SYNTAX_POSIX_BASIC);
474 preg->buffer = NULL;
475 preg->allocated = 0;
476 preg->used = 0;
478 /* Try to allocate space for the fastmap. */
479 preg->fastmap = re_malloc (char, SBC_MAX);
480 if (BE (preg->fastmap == NULL, 0))
481 return REG_ESPACE;
483 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
485 /* If REG_NEWLINE is set, newlines are treated differently. */
486 if (cflags & REG_NEWLINE)
487 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
488 syntax &= ~RE_DOT_NEWLINE;
489 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
490 /* It also changes the matching behavior. */
491 preg->newline_anchor = 1;
493 else
494 preg->newline_anchor = 0;
495 preg->no_sub = !!(cflags & REG_NOSUB);
496 preg->translate = NULL;
498 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
500 /* POSIX doesn't distinguish between an unmatched open-group and an
501 unmatched close-group: both are REG_EPAREN. */
502 if (ret == REG_ERPAREN)
503 ret = REG_EPAREN;
505 /* We have already checked preg->fastmap != NULL. */
506 if (BE (ret == REG_NOERROR, 1))
507 /* Compute the fastmap now, since regexec cannot modify the pattern
508 buffer. This function never fails in this implementation. */
509 (void) re_compile_fastmap (preg);
510 else
512 /* Some error occurred while compiling the expression. */
513 re_free (preg->fastmap);
514 preg->fastmap = NULL;
517 return (int) ret;
519 #ifdef _LIBC
520 weak_alias (__regcomp, regcomp)
521 #endif
523 /* Returns a message corresponding to an error code, ERRCODE, returned
524 from either regcomp or regexec. We don't use PREG here. */
526 size_t
527 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
528 size_t errbuf_size)
530 const char *msg;
531 size_t msg_size;
533 if (BE (errcode < 0
534 || errcode >= (int) (sizeof (__re_error_msgid_idx)
535 / sizeof (__re_error_msgid_idx[0])), 0))
536 /* Only error codes returned by the rest of the code should be passed
537 to this routine. If we are given anything else, or if other regex
538 code generates an invalid error code, then the program has a bug.
539 Dump core so we can fix it. */
540 abort ();
542 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
544 msg_size = strlen (msg) + 1; /* Includes the null. */
546 if (BE (errbuf_size != 0, 1))
548 if (BE (msg_size > errbuf_size, 0))
550 #if defined HAVE_MEMPCPY || defined _LIBC
551 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
552 #else
553 memcpy (errbuf, msg, errbuf_size - 1);
554 errbuf[errbuf_size - 1] = 0;
555 #endif
557 else
558 memcpy (errbuf, msg, msg_size);
561 return msg_size;
563 #ifdef _LIBC
564 weak_alias (__regerror, regerror)
565 #endif
568 #ifdef RE_ENABLE_I18N
569 /* This static array is used for the map to single-byte characters when
570 UTF-8 is used. Otherwise we would allocate memory just to initialize
571 it the same all the time. UTF-8 is the preferred encoding so this is
572 a worthwhile optimization. */
573 static const bitset_t utf8_sb_map =
575 /* Set the first 128 bits. */
576 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
578 #endif
581 static void
582 free_dfa_content (re_dfa_t *dfa)
584 int i, j;
586 if (dfa->nodes)
587 for (i = 0; i < dfa->nodes_len; ++i)
588 free_token (dfa->nodes + i);
589 re_free (dfa->nexts);
590 for (i = 0; i < dfa->nodes_len; ++i)
592 if (dfa->eclosures != NULL)
593 re_node_set_free (dfa->eclosures + i);
594 if (dfa->inveclosures != NULL)
595 re_node_set_free (dfa->inveclosures + i);
596 if (dfa->edests != NULL)
597 re_node_set_free (dfa->edests + i);
599 re_free (dfa->edests);
600 re_free (dfa->eclosures);
601 re_free (dfa->inveclosures);
602 re_free (dfa->nodes);
604 if (dfa->state_table)
605 for (i = 0; i <= dfa->state_hash_mask; ++i)
607 struct re_state_table_entry *entry = dfa->state_table + i;
608 for (j = 0; j < entry->num; ++j)
610 re_dfastate_t *state = entry->array[j];
611 free_state (state);
613 re_free (entry->array);
615 re_free (dfa->state_table);
616 #ifdef RE_ENABLE_I18N
617 if (dfa->sb_char != utf8_sb_map)
618 re_free (dfa->sb_char);
619 #endif
620 re_free (dfa->subexp_map);
621 #ifdef DEBUG
622 re_free (dfa->re_str);
623 #endif
625 re_free (dfa);
629 /* Free dynamically allocated space used by PREG. */
631 void
632 regfree (regex_t *preg)
634 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
635 if (BE (dfa != NULL, 1))
636 free_dfa_content (dfa);
637 preg->buffer = NULL;
638 preg->allocated = 0;
640 re_free (preg->fastmap);
641 preg->fastmap = NULL;
643 re_free (preg->translate);
644 preg->translate = NULL;
646 #ifdef _LIBC
647 weak_alias (__regfree, regfree)
648 #endif
650 /* Entry points compatible with 4.2 BSD regex library. We don't define
651 them unless specifically requested. */
653 #if defined _REGEX_RE_COMP || defined _LIBC
655 /* BSD has one and only one pattern buffer. */
656 static struct re_pattern_buffer re_comp_buf;
658 char *
659 # ifdef _LIBC
660 /* Make these definitions weak in libc, so POSIX programs can redefine
661 these names if they don't use our functions, and still use
662 regcomp/regexec above without link errors. */
663 weak_function
664 # endif
665 re_comp (const char *s)
667 reg_errcode_t ret;
668 char *fastmap;
670 if (!s)
672 if (!re_comp_buf.buffer)
673 return gettext ("No previous regular expression");
674 return 0;
677 if (re_comp_buf.buffer)
679 fastmap = re_comp_buf.fastmap;
680 re_comp_buf.fastmap = NULL;
681 __regfree (&re_comp_buf);
682 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
683 re_comp_buf.fastmap = fastmap;
686 if (re_comp_buf.fastmap == NULL)
688 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
689 if (re_comp_buf.fastmap == NULL)
690 return (char *) gettext (__re_error_msgid
691 + __re_error_msgid_idx[(int) REG_ESPACE]);
694 /* Since 're_exec' always passes NULL for the 'regs' argument, we
695 don't need to initialize the pattern buffer fields which affect it. */
697 /* Match anchors at newlines. */
698 re_comp_buf.newline_anchor = 1;
700 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
702 if (!ret)
703 return NULL;
705 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
706 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
709 #ifdef _LIBC
710 libc_freeres_fn (free_mem)
712 __regfree (&re_comp_buf);
714 #endif
716 #endif /* _REGEX_RE_COMP */
718 /* Internal entry point.
719 Compile the regular expression PATTERN, whose length is LENGTH.
720 SYNTAX indicate regular expression's syntax. */
722 static reg_errcode_t
723 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
724 reg_syntax_t syntax)
726 reg_errcode_t err = REG_NOERROR;
727 re_dfa_t *dfa;
728 re_string_t regexp;
730 /* Initialize the pattern buffer. */
731 preg->fastmap_accurate = 0;
732 preg->syntax = syntax;
733 preg->not_bol = preg->not_eol = 0;
734 preg->used = 0;
735 preg->re_nsub = 0;
736 preg->can_be_null = 0;
737 preg->regs_allocated = REGS_UNALLOCATED;
739 /* Initialize the dfa. */
740 dfa = (re_dfa_t *) preg->buffer;
741 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
743 /* If zero allocated, but buffer is non-null, try to realloc
744 enough space. This loses if buffer's address is bogus, but
745 that is the user's responsibility. If ->buffer is NULL this
746 is a simple allocation. */
747 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
748 if (dfa == NULL)
749 return REG_ESPACE;
750 preg->allocated = sizeof (re_dfa_t);
751 preg->buffer = (unsigned char *) dfa;
753 preg->used = sizeof (re_dfa_t);
755 err = init_dfa (dfa, length);
756 if (BE (err != REG_NOERROR, 0))
758 free_dfa_content (dfa);
759 preg->buffer = NULL;
760 preg->allocated = 0;
761 return err;
763 #ifdef DEBUG
764 /* Note: length+1 will not overflow since it is checked in init_dfa. */
765 dfa->re_str = re_malloc (char, length + 1);
766 strncpy (dfa->re_str, pattern, length + 1);
767 #endif
769 __libc_lock_init (dfa->lock);
771 err = re_string_construct (&regexp, pattern, length, preg->translate,
772 syntax & RE_ICASE, dfa);
773 if (BE (err != REG_NOERROR, 0))
775 re_compile_internal_free_return:
776 free_workarea_compile (preg);
777 re_string_destruct (&regexp);
778 free_dfa_content (dfa);
779 preg->buffer = NULL;
780 preg->allocated = 0;
781 return err;
784 /* Parse the regular expression, and build a structure tree. */
785 preg->re_nsub = 0;
786 dfa->str_tree = parse (&regexp, preg, syntax, &err);
787 if (BE (dfa->str_tree == NULL, 0))
788 goto re_compile_internal_free_return;
790 /* Analyze the tree and create the nfa. */
791 err = analyze (preg);
792 if (BE (err != REG_NOERROR, 0))
793 goto re_compile_internal_free_return;
795 #ifdef RE_ENABLE_I18N
796 /* If possible, do searching in single byte encoding to speed things up. */
797 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
798 optimize_utf8 (dfa);
799 #endif
801 /* Then create the initial state of the dfa. */
802 err = create_initial_state (dfa);
804 /* Release work areas. */
805 free_workarea_compile (preg);
806 re_string_destruct (&regexp);
808 if (BE (err != REG_NOERROR, 0))
810 free_dfa_content (dfa);
811 preg->buffer = NULL;
812 preg->allocated = 0;
815 return err;
818 /* Initialize DFA. We use the length of the regular expression PAT_LEN
819 as the initial length of some arrays. */
821 static reg_errcode_t
822 init_dfa (re_dfa_t *dfa, size_t pat_len)
824 unsigned int table_size;
825 #ifndef _LIBC
826 char *codeset_name;
827 #endif
829 memset (dfa, '\0', sizeof (re_dfa_t));
831 /* Force allocation of str_tree_storage the first time. */
832 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
834 /* Avoid overflows. */
835 if (pat_len == SIZE_MAX)
836 return REG_ESPACE;
838 dfa->nodes_alloc = pat_len + 1;
839 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
841 /* table_size = 2 ^ ceil(log pat_len) */
842 for (table_size = 1; ; table_size <<= 1)
843 if (table_size > pat_len)
844 break;
846 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
847 dfa->state_hash_mask = table_size - 1;
849 dfa->mb_cur_max = MB_CUR_MAX;
850 #ifdef _LIBC
851 if (dfa->mb_cur_max == 6
852 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
853 dfa->is_utf8 = 1;
854 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
855 != 0);
856 #else
857 # ifdef HAVE_LANGINFO_CODESET
858 codeset_name = nl_langinfo (CODESET);
859 # else
860 codeset_name = getenv ("LC_ALL");
861 if (codeset_name == NULL || codeset_name[0] == '\0')
862 codeset_name = getenv ("LC_CTYPE");
863 if (codeset_name == NULL || codeset_name[0] == '\0')
864 codeset_name = getenv ("LANG");
865 if (codeset_name == NULL)
866 codeset_name = "";
867 else if (strchr (codeset_name, '.') != NULL)
868 codeset_name = strchr (codeset_name, '.') + 1;
869 # endif
871 if (strcasecmp (codeset_name, "UTF-8") == 0
872 || strcasecmp (codeset_name, "UTF8") == 0)
873 dfa->is_utf8 = 1;
875 /* We check exhaustively in the loop below if this charset is a
876 superset of ASCII. */
877 dfa->map_notascii = 0;
878 #endif
880 #ifdef RE_ENABLE_I18N
881 if (dfa->mb_cur_max > 1)
883 if (dfa->is_utf8)
884 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
885 else
887 int i, j, ch;
889 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
890 if (BE (dfa->sb_char == NULL, 0))
891 return REG_ESPACE;
893 /* Set the bits corresponding to single byte chars. */
894 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
895 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
897 wint_t wch = __btowc (ch);
898 if (wch != WEOF)
899 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
900 # ifndef _LIBC
901 if (isascii (ch) && wch != ch)
902 dfa->map_notascii = 1;
903 # endif
907 #endif
909 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
910 return REG_ESPACE;
911 return REG_NOERROR;
914 /* Initialize WORD_CHAR table, which indicate which character is
915 "word". In this case "word" means that it is the word construction
916 character used by some operators like "\<", "\>", etc. */
918 static void
919 init_word_char (re_dfa_t *dfa)
921 dfa->word_ops_used = 1;
922 int i = 0;
923 int ch = 0;
924 if (BE (dfa->map_notascii == 0, 1))
926 if (sizeof (dfa->word_char[0]) == 8)
928 /* The extra temporaries here avoid "implicitly truncated"
929 warnings in the case when this is dead code, i.e. 32-bit. */
930 const uint64_t wc0 = UINT64_C (0x03ff000000000000);
931 const uint64_t wc1 = UINT64_C (0x07fffffe87fffffe);
932 dfa->word_char[0] = wc0;
933 dfa->word_char[1] = wc1;
934 i = 2;
936 else if (sizeof (dfa->word_char[0]) == 4)
938 dfa->word_char[0] = UINT32_C (0x00000000);
939 dfa->word_char[1] = UINT32_C (0x03ff0000);
940 dfa->word_char[2] = UINT32_C (0x87fffffe);
941 dfa->word_char[3] = UINT32_C (0x07fffffe);
942 i = 4;
944 else
945 abort ();
946 ch = 128;
948 if (BE (dfa->is_utf8, 1))
950 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
951 return;
955 for (; i < BITSET_WORDS; ++i)
956 for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
957 if (isalnum (ch) || ch == '_')
958 dfa->word_char[i] |= (bitset_word_t) 1 << j;
961 /* Free the work area which are only used while compiling. */
963 static void
964 free_workarea_compile (regex_t *preg)
966 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
967 bin_tree_storage_t *storage, *next;
968 for (storage = dfa->str_tree_storage; storage; storage = next)
970 next = storage->next;
971 re_free (storage);
973 dfa->str_tree_storage = NULL;
974 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
975 dfa->str_tree = NULL;
976 re_free (dfa->org_indices);
977 dfa->org_indices = NULL;
980 /* Create initial states for all contexts. */
982 static reg_errcode_t
983 create_initial_state (re_dfa_t *dfa)
985 int first, i;
986 reg_errcode_t err;
987 re_node_set init_nodes;
989 /* Initial states have the epsilon closure of the node which is
990 the first node of the regular expression. */
991 first = dfa->str_tree->first->node_idx;
992 dfa->init_node = first;
993 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
994 if (BE (err != REG_NOERROR, 0))
995 return err;
997 /* The back-references which are in initial states can epsilon transit,
998 since in this case all of the subexpressions can be null.
999 Then we add epsilon closures of the nodes which are the next nodes of
1000 the back-references. */
1001 if (dfa->nbackref > 0)
1002 for (i = 0; i < init_nodes.nelem; ++i)
1004 int node_idx = init_nodes.elems[i];
1005 re_token_type_t type = dfa->nodes[node_idx].type;
1007 int clexp_idx;
1008 if (type != OP_BACK_REF)
1009 continue;
1010 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1012 re_token_t *clexp_node;
1013 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1014 if (clexp_node->type == OP_CLOSE_SUBEXP
1015 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1016 break;
1018 if (clexp_idx == init_nodes.nelem)
1019 continue;
1021 if (type == OP_BACK_REF)
1023 int dest_idx = dfa->edests[node_idx].elems[0];
1024 if (!re_node_set_contains (&init_nodes, dest_idx))
1026 reg_errcode_t err = re_node_set_merge (&init_nodes,
1027 dfa->eclosures
1028 + dest_idx);
1029 if (err != REG_NOERROR)
1030 return err;
1031 i = 0;
1036 /* It must be the first time to invoke acquire_state. */
1037 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1038 /* We don't check ERR here, since the initial state must not be NULL. */
1039 if (BE (dfa->init_state == NULL, 0))
1040 return err;
1041 if (dfa->init_state->has_constraint)
1043 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1044 CONTEXT_WORD);
1045 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1046 CONTEXT_NEWLINE);
1047 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1048 &init_nodes,
1049 CONTEXT_NEWLINE
1050 | CONTEXT_BEGBUF);
1051 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1052 || dfa->init_state_begbuf == NULL, 0))
1053 return err;
1055 else
1056 dfa->init_state_word = dfa->init_state_nl
1057 = dfa->init_state_begbuf = dfa->init_state;
1059 re_node_set_free (&init_nodes);
1060 return REG_NOERROR;
1063 #ifdef RE_ENABLE_I18N
1064 /* If it is possible to do searching in single byte encoding instead of UTF-8
1065 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1066 DFA nodes where needed. */
1068 static void
1069 optimize_utf8 (re_dfa_t *dfa)
1071 int node, i, mb_chars = 0, has_period = 0;
1073 for (node = 0; node < dfa->nodes_len; ++node)
1074 switch (dfa->nodes[node].type)
1076 case CHARACTER:
1077 if (dfa->nodes[node].opr.c >= 0x80)
1078 mb_chars = 1;
1079 break;
1080 case ANCHOR:
1081 switch (dfa->nodes[node].opr.ctx_type)
1083 case LINE_FIRST:
1084 case LINE_LAST:
1085 case BUF_FIRST:
1086 case BUF_LAST:
1087 break;
1088 default:
1089 /* Word anchors etc. cannot be handled. It's okay to test
1090 opr.ctx_type since constraints (for all DFA nodes) are
1091 created by ORing one or more opr.ctx_type values. */
1092 return;
1094 break;
1095 case OP_PERIOD:
1096 has_period = 1;
1097 break;
1098 case OP_BACK_REF:
1099 case OP_ALT:
1100 case END_OF_RE:
1101 case OP_DUP_ASTERISK:
1102 case OP_OPEN_SUBEXP:
1103 case OP_CLOSE_SUBEXP:
1104 break;
1105 case COMPLEX_BRACKET:
1106 return;
1107 case SIMPLE_BRACKET:
1108 /* Just double check. The non-ASCII range starts at 0x80. */
1109 assert (0x80 % BITSET_WORD_BITS == 0);
1110 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1111 if (dfa->nodes[node].opr.sbcset[i])
1112 return;
1113 break;
1114 default:
1115 abort ();
1118 if (mb_chars || has_period)
1119 for (node = 0; node < dfa->nodes_len; ++node)
1121 if (dfa->nodes[node].type == CHARACTER
1122 && dfa->nodes[node].opr.c >= 0x80)
1123 dfa->nodes[node].mb_partial = 0;
1124 else if (dfa->nodes[node].type == OP_PERIOD)
1125 dfa->nodes[node].type = OP_UTF8_PERIOD;
1128 /* The search can be in single byte locale. */
1129 dfa->mb_cur_max = 1;
1130 dfa->is_utf8 = 0;
1131 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1133 #endif
1135 /* Analyze the structure tree, and calculate "first", "next", "edest",
1136 "eclosure", and "inveclosure". */
1138 static reg_errcode_t
1139 analyze (regex_t *preg)
1141 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1142 reg_errcode_t ret;
1144 /* Allocate arrays. */
1145 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1146 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1147 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1148 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1149 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1150 || dfa->eclosures == NULL, 0))
1151 return REG_ESPACE;
1153 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1154 if (dfa->subexp_map != NULL)
1156 int i;
1157 for (i = 0; i < preg->re_nsub; i++)
1158 dfa->subexp_map[i] = i;
1159 preorder (dfa->str_tree, optimize_subexps, dfa);
1160 for (i = 0; i < preg->re_nsub; i++)
1161 if (dfa->subexp_map[i] != i)
1162 break;
1163 if (i == preg->re_nsub)
1165 free (dfa->subexp_map);
1166 dfa->subexp_map = NULL;
1170 ret = postorder (dfa->str_tree, lower_subexps, preg);
1171 if (BE (ret != REG_NOERROR, 0))
1172 return ret;
1173 ret = postorder (dfa->str_tree, calc_first, dfa);
1174 if (BE (ret != REG_NOERROR, 0))
1175 return ret;
1176 preorder (dfa->str_tree, calc_next, dfa);
1177 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1178 if (BE (ret != REG_NOERROR, 0))
1179 return ret;
1180 ret = calc_eclosure (dfa);
1181 if (BE (ret != REG_NOERROR, 0))
1182 return ret;
1184 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1185 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1186 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1187 || dfa->nbackref)
1189 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1190 if (BE (dfa->inveclosures == NULL, 0))
1191 return REG_ESPACE;
1192 ret = calc_inveclosure (dfa);
1195 return ret;
1198 /* Our parse trees are very unbalanced, so we cannot use a stack to
1199 implement parse tree visits. Instead, we use parent pointers and
1200 some hairy code in these two functions. */
1201 static reg_errcode_t
1202 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1203 void *extra)
1205 bin_tree_t *node, *prev;
1207 for (node = root; ; )
1209 /* Descend down the tree, preferably to the left (or to the right
1210 if that's the only child). */
1211 while (node->left || node->right)
1212 if (node->left)
1213 node = node->left;
1214 else
1215 node = node->right;
1219 reg_errcode_t err = fn (extra, node);
1220 if (BE (err != REG_NOERROR, 0))
1221 return err;
1222 if (node->parent == NULL)
1223 return REG_NOERROR;
1224 prev = node;
1225 node = node->parent;
1227 /* Go up while we have a node that is reached from the right. */
1228 while (node->right == prev || node->right == NULL);
1229 node = node->right;
1233 static reg_errcode_t
1234 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1235 void *extra)
1237 bin_tree_t *node;
1239 for (node = root; ; )
1241 reg_errcode_t err = fn (extra, node);
1242 if (BE (err != REG_NOERROR, 0))
1243 return err;
1245 /* Go to the left node, or up and to the right. */
1246 if (node->left)
1247 node = node->left;
1248 else
1250 bin_tree_t *prev = NULL;
1251 while (node->right == prev || node->right == NULL)
1253 prev = node;
1254 node = node->parent;
1255 if (!node)
1256 return REG_NOERROR;
1258 node = node->right;
1263 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1264 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1265 backreferences as well. Requires a preorder visit. */
1266 static reg_errcode_t
1267 optimize_subexps (void *extra, bin_tree_t *node)
1269 re_dfa_t *dfa = (re_dfa_t *) extra;
1271 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1273 int idx = node->token.opr.idx;
1274 node->token.opr.idx = dfa->subexp_map[idx];
1275 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1278 else if (node->token.type == SUBEXP
1279 && node->left && node->left->token.type == SUBEXP)
1281 int other_idx = node->left->token.opr.idx;
1283 node->left = node->left->left;
1284 if (node->left)
1285 node->left->parent = node;
1287 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1288 if (other_idx < BITSET_WORD_BITS)
1289 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1292 return REG_NOERROR;
1295 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1296 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1297 static reg_errcode_t
1298 lower_subexps (void *extra, bin_tree_t *node)
1300 regex_t *preg = (regex_t *) extra;
1301 reg_errcode_t err = REG_NOERROR;
1303 if (node->left && node->left->token.type == SUBEXP)
1305 node->left = lower_subexp (&err, preg, node->left);
1306 if (node->left)
1307 node->left->parent = node;
1309 if (node->right && node->right->token.type == SUBEXP)
1311 node->right = lower_subexp (&err, preg, node->right);
1312 if (node->right)
1313 node->right->parent = node;
1316 return err;
1319 static bin_tree_t *
1320 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1322 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1323 bin_tree_t *body = node->left;
1324 bin_tree_t *op, *cls, *tree1, *tree;
1326 if (preg->no_sub
1327 /* We do not optimize empty subexpressions, because otherwise we may
1328 have bad CONCAT nodes with NULL children. This is obviously not
1329 very common, so we do not lose much. An example that triggers
1330 this case is the sed "script" /\(\)/x. */
1331 && node->left != NULL
1332 && (node->token.opr.idx >= BITSET_WORD_BITS
1333 || !(dfa->used_bkref_map
1334 & ((bitset_word_t) 1 << node->token.opr.idx))))
1335 return node->left;
1337 /* Convert the SUBEXP node to the concatenation of an
1338 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1339 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1340 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1341 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1342 tree = create_tree (dfa, op, tree1, CONCAT);
1343 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1345 *err = REG_ESPACE;
1346 return NULL;
1349 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1350 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1351 return tree;
1354 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1355 nodes. Requires a postorder visit. */
1356 static reg_errcode_t
1357 calc_first (void *extra, bin_tree_t *node)
1359 re_dfa_t *dfa = (re_dfa_t *) extra;
1360 if (node->token.type == CONCAT)
1362 node->first = node->left->first;
1363 node->node_idx = node->left->node_idx;
1365 else
1367 node->first = node;
1368 node->node_idx = re_dfa_add_node (dfa, node->token);
1369 if (BE (node->node_idx == -1, 0))
1370 return REG_ESPACE;
1371 if (node->token.type == ANCHOR)
1372 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1374 return REG_NOERROR;
1377 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1378 static reg_errcode_t
1379 calc_next (void *extra, bin_tree_t *node)
1381 switch (node->token.type)
1383 case OP_DUP_ASTERISK:
1384 node->left->next = node;
1385 break;
1386 case CONCAT:
1387 node->left->next = node->right->first;
1388 node->right->next = node->next;
1389 break;
1390 default:
1391 if (node->left)
1392 node->left->next = node->next;
1393 if (node->right)
1394 node->right->next = node->next;
1395 break;
1397 return REG_NOERROR;
1400 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1401 static reg_errcode_t
1402 link_nfa_nodes (void *extra, bin_tree_t *node)
1404 re_dfa_t *dfa = (re_dfa_t *) extra;
1405 int idx = node->node_idx;
1406 reg_errcode_t err = REG_NOERROR;
1408 switch (node->token.type)
1410 case CONCAT:
1411 break;
1413 case END_OF_RE:
1414 assert (node->next == NULL);
1415 break;
1417 case OP_DUP_ASTERISK:
1418 case OP_ALT:
1420 int left, right;
1421 dfa->has_plural_match = 1;
1422 if (node->left != NULL)
1423 left = node->left->first->node_idx;
1424 else
1425 left = node->next->node_idx;
1426 if (node->right != NULL)
1427 right = node->right->first->node_idx;
1428 else
1429 right = node->next->node_idx;
1430 assert (left > -1);
1431 assert (right > -1);
1432 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1434 break;
1436 case ANCHOR:
1437 case OP_OPEN_SUBEXP:
1438 case OP_CLOSE_SUBEXP:
1439 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1440 break;
1442 case OP_BACK_REF:
1443 dfa->nexts[idx] = node->next->node_idx;
1444 if (node->token.type == OP_BACK_REF)
1445 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1446 break;
1448 default:
1449 assert (!IS_EPSILON_NODE (node->token.type));
1450 dfa->nexts[idx] = node->next->node_idx;
1451 break;
1454 return err;
1457 /* Duplicate the epsilon closure of the node ROOT_NODE.
1458 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1459 to their own constraint. */
1461 static reg_errcode_t
1462 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1463 int root_node, unsigned int init_constraint)
1465 int org_node, clone_node, ret;
1466 unsigned int constraint = init_constraint;
1467 for (org_node = top_org_node, clone_node = top_clone_node;;)
1469 int org_dest, clone_dest;
1470 if (dfa->nodes[org_node].type == OP_BACK_REF)
1472 /* If the back reference epsilon-transit, its destination must
1473 also have the constraint. Then duplicate the epsilon closure
1474 of the destination of the back reference, and store it in
1475 edests of the back reference. */
1476 org_dest = dfa->nexts[org_node];
1477 re_node_set_empty (dfa->edests + clone_node);
1478 clone_dest = duplicate_node (dfa, org_dest, constraint);
1479 if (BE (clone_dest == -1, 0))
1480 return REG_ESPACE;
1481 dfa->nexts[clone_node] = dfa->nexts[org_node];
1482 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1483 if (BE (ret < 0, 0))
1484 return REG_ESPACE;
1486 else if (dfa->edests[org_node].nelem == 0)
1488 /* In case of the node can't epsilon-transit, don't duplicate the
1489 destination and store the original destination as the
1490 destination of the node. */
1491 dfa->nexts[clone_node] = dfa->nexts[org_node];
1492 break;
1494 else if (dfa->edests[org_node].nelem == 1)
1496 /* In case of the node can epsilon-transit, and it has only one
1497 destination. */
1498 org_dest = dfa->edests[org_node].elems[0];
1499 re_node_set_empty (dfa->edests + clone_node);
1500 /* If the node is root_node itself, it means the epsilon closure
1501 has a loop. Then tie it to the destination of the root_node. */
1502 if (org_node == root_node && clone_node != org_node)
1504 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1505 if (BE (ret < 0, 0))
1506 return REG_ESPACE;
1507 break;
1509 /* In case the node has another constraint, append it. */
1510 constraint |= dfa->nodes[org_node].constraint;
1511 clone_dest = duplicate_node (dfa, org_dest, constraint);
1512 if (BE (clone_dest == -1, 0))
1513 return REG_ESPACE;
1514 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1515 if (BE (ret < 0, 0))
1516 return REG_ESPACE;
1518 else /* dfa->edests[org_node].nelem == 2 */
1520 /* In case of the node can epsilon-transit, and it has two
1521 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1522 org_dest = dfa->edests[org_node].elems[0];
1523 re_node_set_empty (dfa->edests + clone_node);
1524 /* Search for a duplicated node which satisfies the constraint. */
1525 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1526 if (clone_dest == -1)
1528 /* There is no such duplicated node, create a new one. */
1529 reg_errcode_t err;
1530 clone_dest = duplicate_node (dfa, org_dest, constraint);
1531 if (BE (clone_dest == -1, 0))
1532 return REG_ESPACE;
1533 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1534 if (BE (ret < 0, 0))
1535 return REG_ESPACE;
1536 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1537 root_node, constraint);
1538 if (BE (err != REG_NOERROR, 0))
1539 return err;
1541 else
1543 /* There is a duplicated node which satisfies the constraint,
1544 use it to avoid infinite loop. */
1545 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1546 if (BE (ret < 0, 0))
1547 return REG_ESPACE;
1550 org_dest = dfa->edests[org_node].elems[1];
1551 clone_dest = duplicate_node (dfa, org_dest, constraint);
1552 if (BE (clone_dest == -1, 0))
1553 return REG_ESPACE;
1554 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1555 if (BE (ret < 0, 0))
1556 return REG_ESPACE;
1558 org_node = org_dest;
1559 clone_node = clone_dest;
1561 return REG_NOERROR;
1564 /* Search for a node which is duplicated from the node ORG_NODE, and
1565 satisfies the constraint CONSTRAINT. */
1567 static int
1568 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1569 unsigned int constraint)
1571 int idx;
1572 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1574 if (org_node == dfa->org_indices[idx]
1575 && constraint == dfa->nodes[idx].constraint)
1576 return idx; /* Found. */
1578 return -1; /* Not found. */
1581 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1582 Return the index of the new node, or -1 if insufficient storage is
1583 available. */
1585 static int
1586 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1588 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1589 if (BE (dup_idx != -1, 1))
1591 dfa->nodes[dup_idx].constraint = constraint;
1592 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1593 dfa->nodes[dup_idx].duplicated = 1;
1595 /* Store the index of the original node. */
1596 dfa->org_indices[dup_idx] = org_idx;
1598 return dup_idx;
1601 static reg_errcode_t
1602 calc_inveclosure (re_dfa_t *dfa)
1604 int src, idx, ret;
1605 for (idx = 0; idx < dfa->nodes_len; ++idx)
1606 re_node_set_init_empty (dfa->inveclosures + idx);
1608 for (src = 0; src < dfa->nodes_len; ++src)
1610 int *elems = dfa->eclosures[src].elems;
1611 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1613 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1614 if (BE (ret == -1, 0))
1615 return REG_ESPACE;
1619 return REG_NOERROR;
1622 /* Calculate "eclosure" for all the node in DFA. */
1624 static reg_errcode_t
1625 calc_eclosure (re_dfa_t *dfa)
1627 int node_idx, incomplete;
1628 #ifdef DEBUG
1629 assert (dfa->nodes_len > 0);
1630 #endif
1631 incomplete = 0;
1632 /* For each nodes, calculate epsilon closure. */
1633 for (node_idx = 0; ; ++node_idx)
1635 reg_errcode_t err;
1636 re_node_set eclosure_elem;
1637 if (node_idx == dfa->nodes_len)
1639 if (!incomplete)
1640 break;
1641 incomplete = 0;
1642 node_idx = 0;
1645 #ifdef DEBUG
1646 assert (dfa->eclosures[node_idx].nelem != -1);
1647 #endif
1649 /* If we have already calculated, skip it. */
1650 if (dfa->eclosures[node_idx].nelem != 0)
1651 continue;
1652 /* Calculate epsilon closure of 'node_idx'. */
1653 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1654 if (BE (err != REG_NOERROR, 0))
1655 return err;
1657 if (dfa->eclosures[node_idx].nelem == 0)
1659 incomplete = 1;
1660 re_node_set_free (&eclosure_elem);
1663 return REG_NOERROR;
1666 /* Calculate epsilon closure of NODE. */
1668 static reg_errcode_t
1669 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1671 reg_errcode_t err;
1672 int i;
1673 re_node_set eclosure;
1674 int ret;
1675 int incomplete = 0;
1676 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1677 if (BE (err != REG_NOERROR, 0))
1678 return err;
1680 /* This indicates that we are calculating this node now.
1681 We reference this value to avoid infinite loop. */
1682 dfa->eclosures[node].nelem = -1;
1684 /* If the current node has constraints, duplicate all nodes
1685 since they must inherit the constraints. */
1686 if (dfa->nodes[node].constraint
1687 && dfa->edests[node].nelem
1688 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1690 err = duplicate_node_closure (dfa, node, node, node,
1691 dfa->nodes[node].constraint);
1692 if (BE (err != REG_NOERROR, 0))
1693 return err;
1696 /* Expand each epsilon destination nodes. */
1697 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1698 for (i = 0; i < dfa->edests[node].nelem; ++i)
1700 re_node_set eclosure_elem;
1701 int edest = dfa->edests[node].elems[i];
1702 /* If calculating the epsilon closure of `edest' is in progress,
1703 return intermediate result. */
1704 if (dfa->eclosures[edest].nelem == -1)
1706 incomplete = 1;
1707 continue;
1709 /* If we haven't calculated the epsilon closure of `edest' yet,
1710 calculate now. Otherwise use calculated epsilon closure. */
1711 if (dfa->eclosures[edest].nelem == 0)
1713 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1714 if (BE (err != REG_NOERROR, 0))
1715 return err;
1717 else
1718 eclosure_elem = dfa->eclosures[edest];
1719 /* Merge the epsilon closure of 'edest'. */
1720 err = re_node_set_merge (&eclosure, &eclosure_elem);
1721 if (BE (err != REG_NOERROR, 0))
1722 return err;
1723 /* If the epsilon closure of 'edest' is incomplete,
1724 the epsilon closure of this node is also incomplete. */
1725 if (dfa->eclosures[edest].nelem == 0)
1727 incomplete = 1;
1728 re_node_set_free (&eclosure_elem);
1732 /* An epsilon closure includes itself. */
1733 ret = re_node_set_insert (&eclosure, node);
1734 if (BE (ret < 0, 0))
1735 return REG_ESPACE;
1736 if (incomplete && !root)
1737 dfa->eclosures[node].nelem = 0;
1738 else
1739 dfa->eclosures[node] = eclosure;
1740 *new_set = eclosure;
1741 return REG_NOERROR;
1744 /* Functions for token which are used in the parser. */
1746 /* Fetch a token from INPUT.
1747 We must not use this function inside bracket expressions. */
1749 static void
1750 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1752 re_string_skip_bytes (input, peek_token (result, input, syntax));
1755 /* Peek a token from INPUT, and return the length of the token.
1756 We must not use this function inside bracket expressions. */
1758 static int
1759 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1761 unsigned char c;
1763 if (re_string_eoi (input))
1765 token->type = END_OF_RE;
1766 return 0;
1769 c = re_string_peek_byte (input, 0);
1770 token->opr.c = c;
1772 token->word_char = 0;
1773 #ifdef RE_ENABLE_I18N
1774 token->mb_partial = 0;
1775 if (input->mb_cur_max > 1 &&
1776 !re_string_first_byte (input, re_string_cur_idx (input)))
1778 token->type = CHARACTER;
1779 token->mb_partial = 1;
1780 return 1;
1782 #endif
1783 if (c == '\\')
1785 unsigned char c2;
1786 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1788 token->type = BACK_SLASH;
1789 return 1;
1792 c2 = re_string_peek_byte_case (input, 1);
1793 token->opr.c = c2;
1794 token->type = CHARACTER;
1795 #ifdef RE_ENABLE_I18N
1796 if (input->mb_cur_max > 1)
1798 wint_t wc = re_string_wchar_at (input,
1799 re_string_cur_idx (input) + 1);
1800 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1802 else
1803 #endif
1804 token->word_char = IS_WORD_CHAR (c2) != 0;
1806 switch (c2)
1808 case '|':
1809 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1810 token->type = OP_ALT;
1811 break;
1812 case '1': case '2': case '3': case '4': case '5':
1813 case '6': case '7': case '8': case '9':
1814 if (!(syntax & RE_NO_BK_REFS))
1816 token->type = OP_BACK_REF;
1817 token->opr.idx = c2 - '1';
1819 break;
1820 case '<':
1821 if (!(syntax & RE_NO_GNU_OPS))
1823 token->type = ANCHOR;
1824 token->opr.ctx_type = WORD_FIRST;
1826 break;
1827 case '>':
1828 if (!(syntax & RE_NO_GNU_OPS))
1830 token->type = ANCHOR;
1831 token->opr.ctx_type = WORD_LAST;
1833 break;
1834 case 'b':
1835 if (!(syntax & RE_NO_GNU_OPS))
1837 token->type = ANCHOR;
1838 token->opr.ctx_type = WORD_DELIM;
1840 break;
1841 case 'B':
1842 if (!(syntax & RE_NO_GNU_OPS))
1844 token->type = ANCHOR;
1845 token->opr.ctx_type = NOT_WORD_DELIM;
1847 break;
1848 case 'w':
1849 if (!(syntax & RE_NO_GNU_OPS))
1850 token->type = OP_WORD;
1851 break;
1852 case 'W':
1853 if (!(syntax & RE_NO_GNU_OPS))
1854 token->type = OP_NOTWORD;
1855 break;
1856 case 's':
1857 if (!(syntax & RE_NO_GNU_OPS))
1858 token->type = OP_SPACE;
1859 break;
1860 case 'S':
1861 if (!(syntax & RE_NO_GNU_OPS))
1862 token->type = OP_NOTSPACE;
1863 break;
1864 case '`':
1865 if (!(syntax & RE_NO_GNU_OPS))
1867 token->type = ANCHOR;
1868 token->opr.ctx_type = BUF_FIRST;
1870 break;
1871 case '\'':
1872 if (!(syntax & RE_NO_GNU_OPS))
1874 token->type = ANCHOR;
1875 token->opr.ctx_type = BUF_LAST;
1877 break;
1878 case '(':
1879 if (!(syntax & RE_NO_BK_PARENS))
1880 token->type = OP_OPEN_SUBEXP;
1881 break;
1882 case ')':
1883 if (!(syntax & RE_NO_BK_PARENS))
1884 token->type = OP_CLOSE_SUBEXP;
1885 break;
1886 case '+':
1887 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1888 token->type = OP_DUP_PLUS;
1889 break;
1890 case '?':
1891 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1892 token->type = OP_DUP_QUESTION;
1893 break;
1894 case '{':
1895 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1896 token->type = OP_OPEN_DUP_NUM;
1897 break;
1898 case '}':
1899 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1900 token->type = OP_CLOSE_DUP_NUM;
1901 break;
1902 default:
1903 break;
1905 return 2;
1908 token->type = CHARACTER;
1909 #ifdef RE_ENABLE_I18N
1910 if (input->mb_cur_max > 1)
1912 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1913 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1915 else
1916 #endif
1917 token->word_char = IS_WORD_CHAR (token->opr.c);
1919 switch (c)
1921 case '\n':
1922 if (syntax & RE_NEWLINE_ALT)
1923 token->type = OP_ALT;
1924 break;
1925 case '|':
1926 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1927 token->type = OP_ALT;
1928 break;
1929 case '*':
1930 token->type = OP_DUP_ASTERISK;
1931 break;
1932 case '+':
1933 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1934 token->type = OP_DUP_PLUS;
1935 break;
1936 case '?':
1937 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1938 token->type = OP_DUP_QUESTION;
1939 break;
1940 case '{':
1941 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1942 token->type = OP_OPEN_DUP_NUM;
1943 break;
1944 case '}':
1945 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1946 token->type = OP_CLOSE_DUP_NUM;
1947 break;
1948 case '(':
1949 if (syntax & RE_NO_BK_PARENS)
1950 token->type = OP_OPEN_SUBEXP;
1951 break;
1952 case ')':
1953 if (syntax & RE_NO_BK_PARENS)
1954 token->type = OP_CLOSE_SUBEXP;
1955 break;
1956 case '[':
1957 token->type = OP_OPEN_BRACKET;
1958 break;
1959 case '.':
1960 token->type = OP_PERIOD;
1961 break;
1962 case '^':
1963 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1964 re_string_cur_idx (input) != 0)
1966 char prev = re_string_peek_byte (input, -1);
1967 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1968 break;
1970 token->type = ANCHOR;
1971 token->opr.ctx_type = LINE_FIRST;
1972 break;
1973 case '$':
1974 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1975 re_string_cur_idx (input) + 1 != re_string_length (input))
1977 re_token_t next;
1978 re_string_skip_bytes (input, 1);
1979 peek_token (&next, input, syntax);
1980 re_string_skip_bytes (input, -1);
1981 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1982 break;
1984 token->type = ANCHOR;
1985 token->opr.ctx_type = LINE_LAST;
1986 break;
1987 default:
1988 break;
1990 return 1;
1993 /* Peek a token from INPUT, and return the length of the token.
1994 We must not use this function out of bracket expressions. */
1996 static int
1997 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1999 unsigned char c;
2000 if (re_string_eoi (input))
2002 token->type = END_OF_RE;
2003 return 0;
2005 c = re_string_peek_byte (input, 0);
2006 token->opr.c = c;
2008 #ifdef RE_ENABLE_I18N
2009 if (input->mb_cur_max > 1 &&
2010 !re_string_first_byte (input, re_string_cur_idx (input)))
2012 token->type = CHARACTER;
2013 return 1;
2015 #endif /* RE_ENABLE_I18N */
2017 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2018 && re_string_cur_idx (input) + 1 < re_string_length (input))
2020 /* In this case, '\' escape a character. */
2021 unsigned char c2;
2022 re_string_skip_bytes (input, 1);
2023 c2 = re_string_peek_byte (input, 0);
2024 token->opr.c = c2;
2025 token->type = CHARACTER;
2026 return 1;
2028 if (c == '[') /* '[' is a special char in a bracket exps. */
2030 unsigned char c2;
2031 int token_len;
2032 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2033 c2 = re_string_peek_byte (input, 1);
2034 else
2035 c2 = 0;
2036 token->opr.c = c2;
2037 token_len = 2;
2038 switch (c2)
2040 case '.':
2041 token->type = OP_OPEN_COLL_ELEM;
2042 break;
2043 case '=':
2044 token->type = OP_OPEN_EQUIV_CLASS;
2045 break;
2046 case ':':
2047 if (syntax & RE_CHAR_CLASSES)
2049 token->type = OP_OPEN_CHAR_CLASS;
2050 break;
2052 /* else fall through. */
2053 default:
2054 token->type = CHARACTER;
2055 token->opr.c = c;
2056 token_len = 1;
2057 break;
2059 return token_len;
2061 switch (c)
2063 case '-':
2064 token->type = OP_CHARSET_RANGE;
2065 break;
2066 case ']':
2067 token->type = OP_CLOSE_BRACKET;
2068 break;
2069 case '^':
2070 token->type = OP_NON_MATCH_LIST;
2071 break;
2072 default:
2073 token->type = CHARACTER;
2075 return 1;
2078 /* Functions for parser. */
2080 /* Entry point of the parser.
2081 Parse the regular expression REGEXP and return the structure tree.
2082 If an error occurs, ERR is set by error code, and return NULL.
2083 This function build the following tree, from regular expression <reg_exp>:
2087 <reg_exp> EOR
2089 CAT means concatenation.
2090 EOR means end of regular expression. */
2092 static bin_tree_t *
2093 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2094 reg_errcode_t *err)
2096 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2097 bin_tree_t *tree, *eor, *root;
2098 re_token_t current_token;
2099 dfa->syntax = syntax;
2100 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2101 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2102 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2103 return NULL;
2104 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2105 if (tree != NULL)
2106 root = create_tree (dfa, tree, eor, CONCAT);
2107 else
2108 root = eor;
2109 if (BE (eor == NULL || root == NULL, 0))
2111 *err = REG_ESPACE;
2112 return NULL;
2114 return root;
2117 /* This function build the following tree, from regular expression
2118 <branch1>|<branch2>:
2122 <branch1> <branch2>
2124 ALT means alternative, which represents the operator '|'. */
2126 static bin_tree_t *
2127 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2128 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2130 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2131 bin_tree_t *tree, *branch = NULL;
2132 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2133 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2134 return NULL;
2136 while (token->type == OP_ALT)
2138 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2139 if (token->type != OP_ALT && token->type != END_OF_RE
2140 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2142 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2143 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2145 if (tree != NULL)
2146 postorder (tree, free_tree, NULL);
2147 return NULL;
2150 else
2151 branch = NULL;
2152 tree = create_tree (dfa, tree, branch, OP_ALT);
2153 if (BE (tree == NULL, 0))
2155 *err = REG_ESPACE;
2156 return NULL;
2159 return tree;
2162 /* This function build the following tree, from regular expression
2163 <exp1><exp2>:
2167 <exp1> <exp2>
2169 CAT means concatenation. */
2171 static bin_tree_t *
2172 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2173 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2175 bin_tree_t *tree, *exp;
2176 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2177 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2178 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2179 return NULL;
2181 while (token->type != OP_ALT && token->type != END_OF_RE
2182 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2184 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2185 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2187 if (tree != NULL)
2188 postorder (tree, free_tree, NULL);
2189 return NULL;
2191 if (tree != NULL && exp != NULL)
2193 bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2194 if (newtree == NULL)
2196 postorder (exp, free_tree, NULL);
2197 postorder (tree, free_tree, NULL);
2198 *err = REG_ESPACE;
2199 return NULL;
2201 tree = newtree;
2203 else if (tree == NULL)
2204 tree = exp;
2205 /* Otherwise exp == NULL, we don't need to create new tree. */
2207 return tree;
2210 /* This function build the following tree, from regular expression a*:
2216 static bin_tree_t *
2217 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2218 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2220 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2221 bin_tree_t *tree;
2222 switch (token->type)
2224 case CHARACTER:
2225 tree = create_token_tree (dfa, NULL, NULL, token);
2226 if (BE (tree == NULL, 0))
2228 *err = REG_ESPACE;
2229 return NULL;
2231 #ifdef RE_ENABLE_I18N
2232 if (dfa->mb_cur_max > 1)
2234 while (!re_string_eoi (regexp)
2235 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2237 bin_tree_t *mbc_remain;
2238 fetch_token (token, regexp, syntax);
2239 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2240 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2241 if (BE (mbc_remain == NULL || tree == NULL, 0))
2243 *err = REG_ESPACE;
2244 return NULL;
2248 #endif
2249 break;
2250 case OP_OPEN_SUBEXP:
2251 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2252 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2253 return NULL;
2254 break;
2255 case OP_OPEN_BRACKET:
2256 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2257 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2258 return NULL;
2259 break;
2260 case OP_BACK_REF:
2261 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2263 *err = REG_ESUBREG;
2264 return NULL;
2266 dfa->used_bkref_map |= 1 << token->opr.idx;
2267 tree = create_token_tree (dfa, NULL, NULL, token);
2268 if (BE (tree == NULL, 0))
2270 *err = REG_ESPACE;
2271 return NULL;
2273 ++dfa->nbackref;
2274 dfa->has_mb_node = 1;
2275 break;
2276 case OP_OPEN_DUP_NUM:
2277 if (syntax & RE_CONTEXT_INVALID_DUP)
2279 *err = REG_BADRPT;
2280 return NULL;
2282 /* FALLTHROUGH */
2283 case OP_DUP_ASTERISK:
2284 case OP_DUP_PLUS:
2285 case OP_DUP_QUESTION:
2286 if (syntax & RE_CONTEXT_INVALID_OPS)
2288 *err = REG_BADRPT;
2289 return NULL;
2291 else if (syntax & RE_CONTEXT_INDEP_OPS)
2293 fetch_token (token, regexp, syntax);
2294 return parse_expression (regexp, preg, token, syntax, nest, err);
2296 /* else fall through */
2297 case OP_CLOSE_SUBEXP:
2298 if ((token->type == OP_CLOSE_SUBEXP) &&
2299 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2301 *err = REG_ERPAREN;
2302 return NULL;
2304 /* else fall through */
2305 case OP_CLOSE_DUP_NUM:
2306 /* We treat it as a normal character. */
2308 /* Then we can these characters as normal characters. */
2309 token->type = CHARACTER;
2310 /* mb_partial and word_char bits should be initialized already
2311 by peek_token. */
2312 tree = create_token_tree (dfa, NULL, NULL, token);
2313 if (BE (tree == NULL, 0))
2315 *err = REG_ESPACE;
2316 return NULL;
2318 break;
2319 case ANCHOR:
2320 if ((token->opr.ctx_type
2321 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2322 && dfa->word_ops_used == 0)
2323 init_word_char (dfa);
2324 if (token->opr.ctx_type == WORD_DELIM
2325 || token->opr.ctx_type == NOT_WORD_DELIM)
2327 bin_tree_t *tree_first, *tree_last;
2328 if (token->opr.ctx_type == WORD_DELIM)
2330 token->opr.ctx_type = WORD_FIRST;
2331 tree_first = create_token_tree (dfa, NULL, NULL, token);
2332 token->opr.ctx_type = WORD_LAST;
2334 else
2336 token->opr.ctx_type = INSIDE_WORD;
2337 tree_first = create_token_tree (dfa, NULL, NULL, token);
2338 token->opr.ctx_type = INSIDE_NOTWORD;
2340 tree_last = create_token_tree (dfa, NULL, NULL, token);
2341 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2342 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2344 *err = REG_ESPACE;
2345 return NULL;
2348 else
2350 tree = create_token_tree (dfa, NULL, NULL, token);
2351 if (BE (tree == NULL, 0))
2353 *err = REG_ESPACE;
2354 return NULL;
2357 /* We must return here, since ANCHORs can't be followed
2358 by repetition operators.
2359 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2360 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2361 fetch_token (token, regexp, syntax);
2362 return tree;
2363 case OP_PERIOD:
2364 tree = create_token_tree (dfa, NULL, NULL, token);
2365 if (BE (tree == NULL, 0))
2367 *err = REG_ESPACE;
2368 return NULL;
2370 if (dfa->mb_cur_max > 1)
2371 dfa->has_mb_node = 1;
2372 break;
2373 case OP_WORD:
2374 case OP_NOTWORD:
2375 tree = build_charclass_op (dfa, regexp->trans,
2376 (const unsigned char *) "alnum",
2377 (const unsigned char *) "_",
2378 token->type == OP_NOTWORD, err);
2379 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2380 return NULL;
2381 break;
2382 case OP_SPACE:
2383 case OP_NOTSPACE:
2384 tree = build_charclass_op (dfa, regexp->trans,
2385 (const unsigned char *) "space",
2386 (const unsigned char *) "",
2387 token->type == OP_NOTSPACE, err);
2388 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2389 return NULL;
2390 break;
2391 case OP_ALT:
2392 case END_OF_RE:
2393 return NULL;
2394 case BACK_SLASH:
2395 *err = REG_EESCAPE;
2396 return NULL;
2397 default:
2398 /* Must not happen? */
2399 #ifdef DEBUG
2400 assert (0);
2401 #endif
2402 return NULL;
2404 fetch_token (token, regexp, syntax);
2406 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2407 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2409 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2410 if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2412 if (tree != NULL)
2413 postorder (tree, free_tree, NULL);
2414 return NULL;
2416 tree = dup_tree;
2417 /* In BRE consecutive duplications are not allowed. */
2418 if ((syntax & RE_CONTEXT_INVALID_DUP)
2419 && (token->type == OP_DUP_ASTERISK
2420 || token->type == OP_OPEN_DUP_NUM))
2422 if (tree != NULL)
2423 postorder (tree, free_tree, NULL);
2424 *err = REG_BADRPT;
2425 return NULL;
2429 return tree;
2432 /* This function build the following tree, from regular expression
2433 (<reg_exp>):
2434 SUBEXP
2436 <reg_exp>
2439 static bin_tree_t *
2440 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2441 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2443 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2444 bin_tree_t *tree;
2445 size_t cur_nsub;
2446 cur_nsub = preg->re_nsub++;
2448 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2450 /* The subexpression may be a null string. */
2451 if (token->type == OP_CLOSE_SUBEXP)
2452 tree = NULL;
2453 else
2455 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2456 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2458 if (tree != NULL)
2459 postorder (tree, free_tree, NULL);
2460 *err = REG_EPAREN;
2462 if (BE (*err != REG_NOERROR, 0))
2463 return NULL;
2466 if (cur_nsub <= '9' - '1')
2467 dfa->completed_bkref_map |= 1 << cur_nsub;
2469 tree = create_tree (dfa, tree, NULL, SUBEXP);
2470 if (BE (tree == NULL, 0))
2472 *err = REG_ESPACE;
2473 return NULL;
2475 tree->token.opr.idx = cur_nsub;
2476 return tree;
2479 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2481 static bin_tree_t *
2482 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2483 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2485 bin_tree_t *tree = NULL, *old_tree = NULL;
2486 int i, start, end, start_idx = re_string_cur_idx (regexp);
2487 re_token_t start_token = *token;
2489 if (token->type == OP_OPEN_DUP_NUM)
2491 end = 0;
2492 start = fetch_number (regexp, token, syntax);
2493 if (start == -1)
2495 if (token->type == CHARACTER && token->opr.c == ',')
2496 start = 0; /* We treat "{,m}" as "{0,m}". */
2497 else
2499 *err = REG_BADBR; /* <re>{} is invalid. */
2500 return NULL;
2503 if (BE (start != -2, 1))
2505 /* We treat "{n}" as "{n,n}". */
2506 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2507 : ((token->type == CHARACTER && token->opr.c == ',')
2508 ? fetch_number (regexp, token, syntax) : -2));
2510 if (BE (start == -2 || end == -2, 0))
2512 /* Invalid sequence. */
2513 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2515 if (token->type == END_OF_RE)
2516 *err = REG_EBRACE;
2517 else
2518 *err = REG_BADBR;
2520 return NULL;
2523 /* If the syntax bit is set, rollback. */
2524 re_string_set_index (regexp, start_idx);
2525 *token = start_token;
2526 token->type = CHARACTER;
2527 /* mb_partial and word_char bits should be already initialized by
2528 peek_token. */
2529 return elem;
2532 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2534 /* First number greater than second. */
2535 *err = REG_BADBR;
2536 return NULL;
2539 else
2541 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2542 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2545 fetch_token (token, regexp, syntax);
2547 if (BE (elem == NULL, 0))
2548 return NULL;
2549 if (BE (start == 0 && end == 0, 0))
2551 postorder (elem, free_tree, NULL);
2552 return NULL;
2555 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2556 if (BE (start > 0, 0))
2558 tree = elem;
2559 for (i = 2; i <= start; ++i)
2561 elem = duplicate_tree (elem, dfa);
2562 tree = create_tree (dfa, tree, elem, CONCAT);
2563 if (BE (elem == NULL || tree == NULL, 0))
2564 goto parse_dup_op_espace;
2567 if (start == end)
2568 return tree;
2570 /* Duplicate ELEM before it is marked optional. */
2571 elem = duplicate_tree (elem, dfa);
2572 if (BE (elem == NULL, 0))
2573 goto parse_dup_op_espace;
2574 old_tree = tree;
2576 else
2577 old_tree = NULL;
2579 if (elem->token.type == SUBEXP)
2580 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2582 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2583 if (BE (tree == NULL, 0))
2584 goto parse_dup_op_espace;
2586 /* This loop is actually executed only when end != -1,
2587 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2588 already created the start+1-th copy. */
2589 for (i = start + 2; i <= end; ++i)
2591 elem = duplicate_tree (elem, dfa);
2592 tree = create_tree (dfa, tree, elem, CONCAT);
2593 if (BE (elem == NULL || tree == NULL, 0))
2594 goto parse_dup_op_espace;
2596 tree = create_tree (dfa, tree, NULL, OP_ALT);
2597 if (BE (tree == NULL, 0))
2598 goto parse_dup_op_espace;
2601 if (old_tree)
2602 tree = create_tree (dfa, old_tree, tree, CONCAT);
2604 return tree;
2606 parse_dup_op_espace:
2607 *err = REG_ESPACE;
2608 return NULL;
2611 /* Size of the names for collating symbol/equivalence_class/character_class.
2612 I'm not sure, but maybe enough. */
2613 #define BRACKET_NAME_BUF_SIZE 32
2615 #ifndef _LIBC
2616 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2617 Build the range expression which starts from START_ELEM, and ends
2618 at END_ELEM. The result are written to MBCSET and SBCSET.
2619 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2620 mbcset->range_ends, is a pointer argument since we may
2621 update it. */
2623 static reg_errcode_t
2624 # ifdef RE_ENABLE_I18N
2625 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2626 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2627 # else /* not RE_ENABLE_I18N */
2628 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2629 bracket_elem_t *end_elem)
2630 # endif /* not RE_ENABLE_I18N */
2632 unsigned int start_ch, end_ch;
2633 /* Equivalence Classes and Character Classes can't be a range start/end. */
2634 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2635 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2637 return REG_ERANGE;
2639 /* We can handle no multi character collating elements without libc
2640 support. */
2641 if (BE ((start_elem->type == COLL_SYM
2642 && strlen ((char *) start_elem->opr.name) > 1)
2643 || (end_elem->type == COLL_SYM
2644 && strlen ((char *) end_elem->opr.name) > 1), 0))
2645 return REG_ECOLLATE;
2647 # ifdef RE_ENABLE_I18N
2649 wchar_t wc;
2650 wint_t start_wc;
2651 wint_t end_wc;
2652 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2654 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2655 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2656 : 0));
2657 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2658 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2659 : 0));
2660 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2661 ? __btowc (start_ch) : start_elem->opr.wch);
2662 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2663 ? __btowc (end_ch) : end_elem->opr.wch);
2664 if (start_wc == WEOF || end_wc == WEOF)
2665 return REG_ECOLLATE;
2666 cmp_buf[0] = start_wc;
2667 cmp_buf[4] = end_wc;
2668 if (__wcscoll (cmp_buf, cmp_buf + 4) > 0)
2669 return REG_ERANGE;
2671 /* Got valid collation sequence values, add them as a new entry.
2672 However, for !_LIBC we have no collation elements: if the
2673 character set is single byte, the single byte character set
2674 that we build below suffices. parse_bracket_exp passes
2675 no MBCSET if dfa->mb_cur_max == 1. */
2676 if (mbcset)
2678 /* Check the space of the arrays. */
2679 if (BE (*range_alloc == mbcset->nranges, 0))
2681 /* There is not enough space, need realloc. */
2682 wchar_t *new_array_start, *new_array_end;
2683 int new_nranges;
2685 /* +1 in case of mbcset->nranges is 0. */
2686 new_nranges = 2 * mbcset->nranges + 1;
2687 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2688 are NULL if *range_alloc == 0. */
2689 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2690 new_nranges);
2691 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2692 new_nranges);
2694 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2695 return REG_ESPACE;
2697 mbcset->range_starts = new_array_start;
2698 mbcset->range_ends = new_array_end;
2699 *range_alloc = new_nranges;
2702 mbcset->range_starts[mbcset->nranges] = start_wc;
2703 mbcset->range_ends[mbcset->nranges++] = end_wc;
2706 /* Build the table for single byte characters. */
2707 for (wc = 0; wc < SBC_MAX; ++wc)
2709 cmp_buf[2] = wc;
2710 if (__wcscoll (cmp_buf, cmp_buf + 2) <= 0
2711 && __wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2712 bitset_set (sbcset, wc);
2715 # else /* not RE_ENABLE_I18N */
2717 unsigned int ch;
2718 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2719 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2720 : 0));
2721 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2722 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2723 : 0));
2724 if (start_ch > end_ch)
2725 return REG_ERANGE;
2726 /* Build the table for single byte characters. */
2727 for (ch = 0; ch < SBC_MAX; ++ch)
2728 if (start_ch <= ch && ch <= end_ch)
2729 bitset_set (sbcset, ch);
2731 # endif /* not RE_ENABLE_I18N */
2732 return REG_NOERROR;
2734 #endif /* not _LIBC */
2736 #ifndef _LIBC
2737 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2738 Build the collating element which is represented by NAME.
2739 The result are written to MBCSET and SBCSET.
2740 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2741 pointer argument since we may update it. */
2743 static reg_errcode_t
2744 # ifdef RE_ENABLE_I18N
2745 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2746 int *coll_sym_alloc, const unsigned char *name)
2747 # else /* not RE_ENABLE_I18N */
2748 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2749 # endif /* not RE_ENABLE_I18N */
2751 size_t name_len = strlen ((const char *) name);
2752 if (BE (name_len != 1, 0))
2753 return REG_ECOLLATE;
2754 else
2756 bitset_set (sbcset, name[0]);
2757 return REG_NOERROR;
2760 #endif /* not _LIBC */
2762 /* This function parse bracket expression like "[abc]", "[a-c]",
2763 "[[.a-a.]]" etc. */
2765 static bin_tree_t *
2766 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2767 reg_syntax_t syntax, reg_errcode_t *err)
2769 #ifdef _LIBC
2770 const unsigned char *collseqmb;
2771 const char *collseqwc;
2772 uint32_t nrules;
2773 int32_t table_size;
2774 const int32_t *symb_table;
2775 const unsigned char *extra;
2777 /* Local function for parse_bracket_exp used in _LIBC environment.
2778 Seek the collating symbol entry corresponding to NAME.
2779 Return the index of the symbol in the SYMB_TABLE,
2780 or -1 if not found. */
2782 auto inline int32_t
2783 __attribute__ ((always_inline))
2784 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2786 int32_t elem;
2788 for (elem = 0; elem < table_size; elem++)
2789 if (symb_table[2 * elem] != 0)
2791 int32_t idx = symb_table[2 * elem + 1];
2792 /* Skip the name of collating element name. */
2793 idx += 1 + extra[idx];
2794 if (/* Compare the length of the name. */
2795 name_len == extra[idx]
2796 /* Compare the name. */
2797 && memcmp (name, &extra[idx + 1], name_len) == 0)
2798 /* Yep, this is the entry. */
2799 return elem;
2801 return -1;
2804 /* Local function for parse_bracket_exp used in _LIBC environment.
2805 Look up the collation sequence value of BR_ELEM.
2806 Return the value if succeeded, UINT_MAX otherwise. */
2808 auto inline unsigned int
2809 __attribute__ ((always_inline))
2810 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2812 if (br_elem->type == SB_CHAR)
2815 if (MB_CUR_MAX == 1)
2817 if (nrules == 0)
2818 return collseqmb[br_elem->opr.ch];
2819 else
2821 wint_t wc = __btowc (br_elem->opr.ch);
2822 return __collseq_table_lookup (collseqwc, wc);
2825 else if (br_elem->type == MB_CHAR)
2827 if (nrules != 0)
2828 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2830 else if (br_elem->type == COLL_SYM)
2832 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2833 if (nrules != 0)
2835 int32_t elem, idx;
2836 elem = seek_collating_symbol_entry (br_elem->opr.name,
2837 sym_name_len);
2838 if (elem != -1)
2840 /* We found the entry. */
2841 idx = symb_table[2 * elem + 1];
2842 /* Skip the name of collating element name. */
2843 idx += 1 + extra[idx];
2844 /* Skip the byte sequence of the collating element. */
2845 idx += 1 + extra[idx];
2846 /* Adjust for the alignment. */
2847 idx = (idx + 3) & ~3;
2848 /* Skip the multibyte collation sequence value. */
2849 idx += sizeof (unsigned int);
2850 /* Skip the wide char sequence of the collating element. */
2851 idx += sizeof (unsigned int) *
2852 (1 + *(unsigned int *) (extra + idx));
2853 /* Return the collation sequence value. */
2854 return *(unsigned int *) (extra + idx);
2856 else if (sym_name_len == 1)
2858 /* No valid character. Match it as a single byte
2859 character. */
2860 return collseqmb[br_elem->opr.name[0]];
2863 else if (sym_name_len == 1)
2864 return collseqmb[br_elem->opr.name[0]];
2866 return UINT_MAX;
2869 /* Local function for parse_bracket_exp used in _LIBC environment.
2870 Build the range expression which starts from START_ELEM, and ends
2871 at END_ELEM. The result are written to MBCSET and SBCSET.
2872 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2873 mbcset->range_ends, is a pointer argument since we may
2874 update it. */
2876 auto inline reg_errcode_t
2877 __attribute__ ((always_inline))
2878 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2879 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2881 unsigned int ch;
2882 uint32_t start_collseq;
2883 uint32_t end_collseq;
2885 /* Equivalence Classes and Character Classes can't be a range
2886 start/end. */
2887 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2888 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2890 return REG_ERANGE;
2892 start_collseq = lookup_collation_sequence_value (start_elem);
2893 end_collseq = lookup_collation_sequence_value (end_elem);
2894 /* Check start/end collation sequence values. */
2895 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2896 return REG_ECOLLATE;
2897 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2898 return REG_ERANGE;
2900 /* Got valid collation sequence values, add them as a new entry.
2901 However, if we have no collation elements, and the character set
2902 is single byte, the single byte character set that we
2903 build below suffices. */
2904 if (nrules > 0 || dfa->mb_cur_max > 1)
2906 /* Check the space of the arrays. */
2907 if (BE (*range_alloc == mbcset->nranges, 0))
2909 /* There is not enough space, need realloc. */
2910 uint32_t *new_array_start;
2911 uint32_t *new_array_end;
2912 int new_nranges;
2914 /* +1 in case of mbcset->nranges is 0. */
2915 new_nranges = 2 * mbcset->nranges + 1;
2916 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2917 new_nranges);
2918 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2919 new_nranges);
2921 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2922 return REG_ESPACE;
2924 mbcset->range_starts = new_array_start;
2925 mbcset->range_ends = new_array_end;
2926 *range_alloc = new_nranges;
2929 mbcset->range_starts[mbcset->nranges] = start_collseq;
2930 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2933 /* Build the table for single byte characters. */
2934 for (ch = 0; ch < SBC_MAX; ch++)
2936 uint32_t ch_collseq;
2938 if (MB_CUR_MAX == 1)
2940 if (nrules == 0)
2941 ch_collseq = collseqmb[ch];
2942 else
2943 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2944 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2945 bitset_set (sbcset, ch);
2947 return REG_NOERROR;
2950 /* Local function for parse_bracket_exp used in _LIBC environment.
2951 Build the collating element which is represented by NAME.
2952 The result are written to MBCSET and SBCSET.
2953 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2954 pointer argument since we may update it. */
2956 auto inline reg_errcode_t
2957 __attribute__ ((always_inline))
2958 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2959 int *coll_sym_alloc, const unsigned char *name)
2961 int32_t elem, idx;
2962 size_t name_len = strlen ((const char *) name);
2963 if (nrules != 0)
2965 elem = seek_collating_symbol_entry (name, name_len);
2966 if (elem != -1)
2968 /* We found the entry. */
2969 idx = symb_table[2 * elem + 1];
2970 /* Skip the name of collating element name. */
2971 idx += 1 + extra[idx];
2973 else if (name_len == 1)
2975 /* No valid character, treat it as a normal
2976 character. */
2977 bitset_set (sbcset, name[0]);
2978 return REG_NOERROR;
2980 else
2981 return REG_ECOLLATE;
2983 /* Got valid collation sequence, add it as a new entry. */
2984 /* Check the space of the arrays. */
2985 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2987 /* Not enough, realloc it. */
2988 /* +1 in case of mbcset->ncoll_syms is 0. */
2989 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2990 /* Use realloc since mbcset->coll_syms is NULL
2991 if *alloc == 0. */
2992 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2993 new_coll_sym_alloc);
2994 if (BE (new_coll_syms == NULL, 0))
2995 return REG_ESPACE;
2996 mbcset->coll_syms = new_coll_syms;
2997 *coll_sym_alloc = new_coll_sym_alloc;
2999 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3000 return REG_NOERROR;
3002 else
3004 if (BE (name_len != 1, 0))
3005 return REG_ECOLLATE;
3006 else
3008 bitset_set (sbcset, name[0]);
3009 return REG_NOERROR;
3013 #endif
3015 re_token_t br_token;
3016 re_bitset_ptr_t sbcset;
3017 #ifdef RE_ENABLE_I18N
3018 re_charset_t *mbcset;
3019 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3020 int equiv_class_alloc = 0, char_class_alloc = 0;
3021 #endif /* not RE_ENABLE_I18N */
3022 int non_match = 0;
3023 bin_tree_t *work_tree;
3024 int token_len;
3025 int first_round = 1;
3026 #ifdef _LIBC
3027 collseqmb = (const unsigned char *)
3028 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3029 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3030 if (nrules)
3033 if (MB_CUR_MAX > 1)
3035 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3036 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3037 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3038 _NL_COLLATE_SYMB_TABLEMB);
3039 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3040 _NL_COLLATE_SYMB_EXTRAMB);
3042 #endif
3043 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3044 #ifdef RE_ENABLE_I18N
3045 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3046 #endif /* RE_ENABLE_I18N */
3047 #ifdef RE_ENABLE_I18N
3048 if (BE (sbcset == NULL || mbcset == NULL, 0))
3049 #else
3050 if (BE (sbcset == NULL, 0))
3051 #endif /* RE_ENABLE_I18N */
3053 re_free (sbcset);
3054 #ifdef RE_ENABLE_I18N
3055 re_free (mbcset);
3056 #endif
3057 *err = REG_ESPACE;
3058 return NULL;
3061 token_len = peek_token_bracket (token, regexp, syntax);
3062 if (BE (token->type == END_OF_RE, 0))
3064 *err = REG_BADPAT;
3065 goto parse_bracket_exp_free_return;
3067 if (token->type == OP_NON_MATCH_LIST)
3069 #ifdef RE_ENABLE_I18N
3070 mbcset->non_match = 1;
3071 #endif /* not RE_ENABLE_I18N */
3072 non_match = 1;
3073 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3074 bitset_set (sbcset, '\n');
3075 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3076 token_len = peek_token_bracket (token, regexp, syntax);
3077 if (BE (token->type == END_OF_RE, 0))
3079 *err = REG_BADPAT;
3080 goto parse_bracket_exp_free_return;
3084 /* We treat the first ']' as a normal character. */
3085 if (token->type == OP_CLOSE_BRACKET)
3086 token->type = CHARACTER;
3088 while (1)
3090 bracket_elem_t start_elem, end_elem;
3091 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3092 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3093 reg_errcode_t ret;
3094 int token_len2 = 0, is_range_exp = 0;
3095 re_token_t token2;
3097 start_elem.opr.name = start_name_buf;
3098 start_elem.type = COLL_SYM;
3099 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3100 syntax, first_round);
3101 if (BE (ret != REG_NOERROR, 0))
3103 *err = ret;
3104 goto parse_bracket_exp_free_return;
3106 first_round = 0;
3108 /* Get information about the next token. We need it in any case. */
3109 token_len = peek_token_bracket (token, regexp, syntax);
3111 /* Do not check for ranges if we know they are not allowed. */
3112 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3114 if (BE (token->type == END_OF_RE, 0))
3116 *err = REG_EBRACK;
3117 goto parse_bracket_exp_free_return;
3119 if (token->type == OP_CHARSET_RANGE)
3121 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3122 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3123 if (BE (token2.type == END_OF_RE, 0))
3125 *err = REG_EBRACK;
3126 goto parse_bracket_exp_free_return;
3128 if (token2.type == OP_CLOSE_BRACKET)
3130 /* We treat the last '-' as a normal character. */
3131 re_string_skip_bytes (regexp, -token_len);
3132 token->type = CHARACTER;
3134 else
3135 is_range_exp = 1;
3139 if (is_range_exp == 1)
3141 end_elem.opr.name = end_name_buf;
3142 end_elem.type = COLL_SYM;
3143 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3144 dfa, syntax, 1);
3145 if (BE (ret != REG_NOERROR, 0))
3147 *err = ret;
3148 goto parse_bracket_exp_free_return;
3151 token_len = peek_token_bracket (token, regexp, syntax);
3153 #ifdef _LIBC
3154 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3155 &start_elem, &end_elem);
3156 #else
3157 # ifdef RE_ENABLE_I18N
3158 *err = build_range_exp (sbcset,
3159 dfa->mb_cur_max > 1 ? mbcset : NULL,
3160 &range_alloc, &start_elem, &end_elem);
3161 # else
3162 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3163 # endif
3164 #endif /* RE_ENABLE_I18N */
3165 if (BE (*err != REG_NOERROR, 0))
3166 goto parse_bracket_exp_free_return;
3168 else
3170 switch (start_elem.type)
3172 case SB_CHAR:
3173 bitset_set (sbcset, start_elem.opr.ch);
3174 break;
3175 #ifdef RE_ENABLE_I18N
3176 case MB_CHAR:
3177 /* Check whether the array has enough space. */
3178 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3180 wchar_t *new_mbchars;
3181 /* Not enough, realloc it. */
3182 /* +1 in case of mbcset->nmbchars is 0. */
3183 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3184 /* Use realloc since array is NULL if *alloc == 0. */
3185 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3186 mbchar_alloc);
3187 if (BE (new_mbchars == NULL, 0))
3188 goto parse_bracket_exp_espace;
3189 mbcset->mbchars = new_mbchars;
3191 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3192 break;
3193 #endif /* RE_ENABLE_I18N */
3194 case EQUIV_CLASS:
3195 *err = build_equiv_class (sbcset,
3196 #ifdef RE_ENABLE_I18N
3197 mbcset, &equiv_class_alloc,
3198 #endif /* RE_ENABLE_I18N */
3199 start_elem.opr.name);
3200 if (BE (*err != REG_NOERROR, 0))
3201 goto parse_bracket_exp_free_return;
3202 break;
3203 case COLL_SYM:
3204 *err = build_collating_symbol (sbcset,
3205 #ifdef RE_ENABLE_I18N
3206 mbcset, &coll_sym_alloc,
3207 #endif /* RE_ENABLE_I18N */
3208 start_elem.opr.name);
3209 if (BE (*err != REG_NOERROR, 0))
3210 goto parse_bracket_exp_free_return;
3211 break;
3212 case CHAR_CLASS:
3213 *err = build_charclass (regexp->trans, sbcset,
3214 #ifdef RE_ENABLE_I18N
3215 mbcset, &char_class_alloc,
3216 #endif /* RE_ENABLE_I18N */
3217 start_elem.opr.name, syntax);
3218 if (BE (*err != REG_NOERROR, 0))
3219 goto parse_bracket_exp_free_return;
3220 break;
3221 default:
3222 assert (0);
3223 break;
3226 if (BE (token->type == END_OF_RE, 0))
3228 *err = REG_EBRACK;
3229 goto parse_bracket_exp_free_return;
3231 if (token->type == OP_CLOSE_BRACKET)
3232 break;
3235 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3237 /* If it is non-matching list. */
3238 if (non_match)
3239 bitset_not (sbcset);
3241 #ifdef RE_ENABLE_I18N
3242 /* Ensure only single byte characters are set. */
3243 if (dfa->mb_cur_max > 1)
3244 bitset_mask (sbcset, dfa->sb_char);
3246 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3247 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3248 || mbcset->non_match)))
3250 bin_tree_t *mbc_tree;
3251 int sbc_idx;
3252 /* Build a tree for complex bracket. */
3253 dfa->has_mb_node = 1;
3254 br_token.type = COMPLEX_BRACKET;
3255 br_token.opr.mbcset = mbcset;
3256 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3257 if (BE (mbc_tree == NULL, 0))
3258 goto parse_bracket_exp_espace;
3259 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3260 if (sbcset[sbc_idx])
3261 break;
3262 /* If there are no bits set in sbcset, there is no point
3263 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3264 if (sbc_idx < BITSET_WORDS)
3266 /* Build a tree for simple bracket. */
3267 br_token.type = SIMPLE_BRACKET;
3268 br_token.opr.sbcset = sbcset;
3269 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3270 if (BE (work_tree == NULL, 0))
3271 goto parse_bracket_exp_espace;
3273 /* Then join them by ALT node. */
3274 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3275 if (BE (work_tree == NULL, 0))
3276 goto parse_bracket_exp_espace;
3278 else
3280 re_free (sbcset);
3281 work_tree = mbc_tree;
3284 else
3285 #endif /* not RE_ENABLE_I18N */
3287 #ifdef RE_ENABLE_I18N
3288 free_charset (mbcset);
3289 #endif
3290 /* Build a tree for simple bracket. */
3291 br_token.type = SIMPLE_BRACKET;
3292 br_token.opr.sbcset = sbcset;
3293 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3294 if (BE (work_tree == NULL, 0))
3295 goto parse_bracket_exp_espace;
3297 return work_tree;
3299 parse_bracket_exp_espace:
3300 *err = REG_ESPACE;
3301 parse_bracket_exp_free_return:
3302 re_free (sbcset);
3303 #ifdef RE_ENABLE_I18N
3304 free_charset (mbcset);
3305 #endif /* RE_ENABLE_I18N */
3306 return NULL;
3309 /* Parse an element in the bracket expression. */
3311 static reg_errcode_t
3312 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3313 re_token_t *token, int token_len, re_dfa_t *dfa,
3314 reg_syntax_t syntax, int accept_hyphen)
3316 #ifdef RE_ENABLE_I18N
3317 int cur_char_size;
3318 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3319 if (cur_char_size > 1)
3321 elem->type = MB_CHAR;
3322 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3323 re_string_skip_bytes (regexp, cur_char_size);
3324 return REG_NOERROR;
3326 #endif /* RE_ENABLE_I18N */
3327 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3328 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3329 || token->type == OP_OPEN_EQUIV_CLASS)
3330 return parse_bracket_symbol (elem, regexp, token);
3331 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3333 /* A '-' must only appear as anything but a range indicator before
3334 the closing bracket. Everything else is an error. */
3335 re_token_t token2;
3336 (void) peek_token_bracket (&token2, regexp, syntax);
3337 if (token2.type != OP_CLOSE_BRACKET)
3338 /* The actual error value is not standardized since this whole
3339 case is undefined. But ERANGE makes good sense. */
3340 return REG_ERANGE;
3342 elem->type = SB_CHAR;
3343 elem->opr.ch = token->opr.c;
3344 return REG_NOERROR;
3347 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3348 such as [:<character_class>:], [.<collating_element>.], and
3349 [=<equivalent_class>=]. */
3351 static reg_errcode_t
3352 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3353 re_token_t *token)
3355 unsigned char ch, delim = token->opr.c;
3356 int i = 0;
3357 if (re_string_eoi(regexp))
3358 return REG_EBRACK;
3359 for (;; ++i)
3361 if (i >= BRACKET_NAME_BUF_SIZE)
3362 return REG_EBRACK;
3363 if (token->type == OP_OPEN_CHAR_CLASS)
3364 ch = re_string_fetch_byte_case (regexp);
3365 else
3366 ch = re_string_fetch_byte (regexp);
3367 if (re_string_eoi(regexp))
3368 return REG_EBRACK;
3369 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3370 break;
3371 elem->opr.name[i] = ch;
3373 re_string_skip_bytes (regexp, 1);
3374 elem->opr.name[i] = '\0';
3375 switch (token->type)
3377 case OP_OPEN_COLL_ELEM:
3378 elem->type = COLL_SYM;
3379 break;
3380 case OP_OPEN_EQUIV_CLASS:
3381 elem->type = EQUIV_CLASS;
3382 break;
3383 case OP_OPEN_CHAR_CLASS:
3384 elem->type = CHAR_CLASS;
3385 break;
3386 default:
3387 break;
3389 return REG_NOERROR;
3392 /* Helper function for parse_bracket_exp.
3393 Build the equivalence class which is represented by NAME.
3394 The result are written to MBCSET and SBCSET.
3395 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3396 is a pointer argument since we may update it. */
3398 static reg_errcode_t
3399 #ifdef RE_ENABLE_I18N
3400 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3401 int *equiv_class_alloc, const unsigned char *name)
3402 #else /* not RE_ENABLE_I18N */
3403 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3404 #endif /* not RE_ENABLE_I18N */
3406 #ifdef _LIBC
3407 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3408 if (nrules != 0)
3410 const int32_t *table, *indirect;
3411 const unsigned char *weights, *extra, *cp;
3412 unsigned char char_buf[2];
3413 int32_t idx1, idx2;
3414 unsigned int ch;
3415 size_t len;
3416 /* Calculate the index for equivalence class. */
3417 cp = name;
3418 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3419 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3420 _NL_COLLATE_WEIGHTMB);
3421 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3422 _NL_COLLATE_EXTRAMB);
3423 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3424 _NL_COLLATE_INDIRECTMB);
3425 idx1 = findidx (table, indirect, extra, &cp, -1);
3426 if (BE (idx1 == 0 || *cp != '\0', 0))
3427 /* This isn't a valid character. */
3428 return REG_ECOLLATE;
3430 /* Build single byte matcing table for this equivalence class. */
3431 len = weights[idx1 & 0xffffff];
3432 for (ch = 0; ch < SBC_MAX; ++ch)
3434 char_buf[0] = ch;
3435 cp = char_buf;
3436 idx2 = findidx (table, indirect, extra, &cp, 1);
3438 idx2 = table[ch];
3440 if (idx2 == 0)
3441 /* This isn't a valid character. */
3442 continue;
3443 /* Compare only if the length matches and the collation rule
3444 index is the same. */
3445 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3447 int cnt = 0;
3449 while (cnt <= len &&
3450 weights[(idx1 & 0xffffff) + 1 + cnt]
3451 == weights[(idx2 & 0xffffff) + 1 + cnt])
3452 ++cnt;
3454 if (cnt > len)
3455 bitset_set (sbcset, ch);
3458 /* Check whether the array has enough space. */
3459 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3461 /* Not enough, realloc it. */
3462 /* +1 in case of mbcset->nequiv_classes is 0. */
3463 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3464 /* Use realloc since the array is NULL if *alloc == 0. */
3465 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3466 int32_t,
3467 new_equiv_class_alloc);
3468 if (BE (new_equiv_classes == NULL, 0))
3469 return REG_ESPACE;
3470 mbcset->equiv_classes = new_equiv_classes;
3471 *equiv_class_alloc = new_equiv_class_alloc;
3473 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3475 else
3476 #endif /* _LIBC */
3478 if (BE (strlen ((const char *) name) != 1, 0))
3479 return REG_ECOLLATE;
3480 bitset_set (sbcset, *name);
3482 return REG_NOERROR;
3485 /* Helper function for parse_bracket_exp.
3486 Build the character class which is represented by NAME.
3487 The result are written to MBCSET and SBCSET.
3488 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3489 is a pointer argument since we may update it. */
3491 static reg_errcode_t
3492 #ifdef RE_ENABLE_I18N
3493 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3494 re_charset_t *mbcset, int *char_class_alloc,
3495 const unsigned char *class_name, reg_syntax_t syntax)
3496 #else /* not RE_ENABLE_I18N */
3497 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3498 const unsigned char *class_name, reg_syntax_t syntax)
3499 #endif /* not RE_ENABLE_I18N */
3501 int i;
3502 const char *name = (const char *) class_name;
3504 /* In case of REG_ICASE "upper" and "lower" match the both of
3505 upper and lower cases. */
3506 if ((syntax & RE_ICASE)
3507 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3508 name = "alpha";
3510 #ifdef RE_ENABLE_I18N
3511 /* Check the space of the arrays. */
3512 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3514 /* Not enough, realloc it. */
3515 /* +1 in case of mbcset->nchar_classes is 0. */
3516 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3517 /* Use realloc since array is NULL if *alloc == 0. */
3518 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3519 new_char_class_alloc);
3520 if (BE (new_char_classes == NULL, 0))
3521 return REG_ESPACE;
3522 mbcset->char_classes = new_char_classes;
3523 *char_class_alloc = new_char_class_alloc;
3525 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3526 #endif /* RE_ENABLE_I18N */
3528 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3529 do { \
3530 if (BE (trans != NULL, 0)) \
3532 for (i = 0; i < SBC_MAX; ++i) \
3533 if (ctype_func (i)) \
3534 bitset_set (sbcset, trans[i]); \
3536 else \
3538 for (i = 0; i < SBC_MAX; ++i) \
3539 if (ctype_func (i)) \
3540 bitset_set (sbcset, i); \
3542 } while (0)
3544 if (strcmp (name, "alnum") == 0)
3545 BUILD_CHARCLASS_LOOP (isalnum);
3546 else if (strcmp (name, "cntrl") == 0)
3547 BUILD_CHARCLASS_LOOP (iscntrl);
3548 else if (strcmp (name, "lower") == 0)
3549 BUILD_CHARCLASS_LOOP (islower);
3550 else if (strcmp (name, "space") == 0)
3551 BUILD_CHARCLASS_LOOP (isspace);
3552 else if (strcmp (name, "alpha") == 0)
3553 BUILD_CHARCLASS_LOOP (isalpha);
3554 else if (strcmp (name, "digit") == 0)
3555 BUILD_CHARCLASS_LOOP (isdigit);
3556 else if (strcmp (name, "print") == 0)
3557 BUILD_CHARCLASS_LOOP (isprint);
3558 else if (strcmp (name, "upper") == 0)
3559 BUILD_CHARCLASS_LOOP (isupper);
3560 else if (strcmp (name, "blank") == 0)
3561 BUILD_CHARCLASS_LOOP (isblank);
3562 else if (strcmp (name, "graph") == 0)
3563 BUILD_CHARCLASS_LOOP (isgraph);
3564 else if (strcmp (name, "punct") == 0)
3565 BUILD_CHARCLASS_LOOP (ispunct);
3566 else if (strcmp (name, "xdigit") == 0)
3567 BUILD_CHARCLASS_LOOP (isxdigit);
3568 else
3569 return REG_ECTYPE;
3571 return REG_NOERROR;
3574 static bin_tree_t *
3575 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3576 const unsigned char *class_name,
3577 const unsigned char *extra, int non_match,
3578 reg_errcode_t *err)
3580 re_bitset_ptr_t sbcset;
3581 #ifdef RE_ENABLE_I18N
3582 re_charset_t *mbcset;
3583 int alloc = 0;
3584 #endif /* not RE_ENABLE_I18N */
3585 reg_errcode_t ret;
3586 re_token_t br_token;
3587 bin_tree_t *tree;
3589 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3590 #ifdef RE_ENABLE_I18N
3591 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3592 #endif /* RE_ENABLE_I18N */
3594 #ifdef RE_ENABLE_I18N
3595 if (BE (sbcset == NULL || mbcset == NULL, 0))
3596 #else /* not RE_ENABLE_I18N */
3597 if (BE (sbcset == NULL, 0))
3598 #endif /* not RE_ENABLE_I18N */
3600 *err = REG_ESPACE;
3601 return NULL;
3604 if (non_match)
3606 #ifdef RE_ENABLE_I18N
3607 mbcset->non_match = 1;
3608 #endif /* not RE_ENABLE_I18N */
3611 /* We don't care the syntax in this case. */
3612 ret = build_charclass (trans, sbcset,
3613 #ifdef RE_ENABLE_I18N
3614 mbcset, &alloc,
3615 #endif /* RE_ENABLE_I18N */
3616 class_name, 0);
3618 if (BE (ret != REG_NOERROR, 0))
3620 re_free (sbcset);
3621 #ifdef RE_ENABLE_I18N
3622 free_charset (mbcset);
3623 #endif /* RE_ENABLE_I18N */
3624 *err = ret;
3625 return NULL;
3627 /* \w match '_' also. */
3628 for (; *extra; extra++)
3629 bitset_set (sbcset, *extra);
3631 /* If it is non-matching list. */
3632 if (non_match)
3633 bitset_not (sbcset);
3635 #ifdef RE_ENABLE_I18N
3636 /* Ensure only single byte characters are set. */
3637 if (dfa->mb_cur_max > 1)
3638 bitset_mask (sbcset, dfa->sb_char);
3639 #endif
3641 /* Build a tree for simple bracket. */
3642 br_token.type = SIMPLE_BRACKET;
3643 br_token.opr.sbcset = sbcset;
3644 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3645 if (BE (tree == NULL, 0))
3646 goto build_word_op_espace;
3648 #ifdef RE_ENABLE_I18N
3649 if (dfa->mb_cur_max > 1)
3651 bin_tree_t *mbc_tree;
3652 /* Build a tree for complex bracket. */
3653 br_token.type = COMPLEX_BRACKET;
3654 br_token.opr.mbcset = mbcset;
3655 dfa->has_mb_node = 1;
3656 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3657 if (BE (mbc_tree == NULL, 0))
3658 goto build_word_op_espace;
3659 /* Then join them by ALT node. */
3660 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3661 if (BE (mbc_tree != NULL, 1))
3662 return tree;
3664 else
3666 free_charset (mbcset);
3667 return tree;
3669 #else /* not RE_ENABLE_I18N */
3670 return tree;
3671 #endif /* not RE_ENABLE_I18N */
3673 build_word_op_espace:
3674 re_free (sbcset);
3675 #ifdef RE_ENABLE_I18N
3676 free_charset (mbcset);
3677 #endif /* RE_ENABLE_I18N */
3678 *err = REG_ESPACE;
3679 return NULL;
3682 /* This is intended for the expressions like "a{1,3}".
3683 Fetch a number from `input', and return the number.
3684 Return -1, if the number field is empty like "{,1}".
3685 Return -2, If an error is occured. */
3687 static int
3688 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3690 int num = -1;
3691 unsigned char c;
3692 while (1)
3694 fetch_token (token, input, syntax);
3695 c = token->opr.c;
3696 if (BE (token->type == END_OF_RE, 0))
3697 return -2;
3698 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3699 break;
3700 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3701 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3702 num = (num > RE_DUP_MAX) ? -2 : num;
3704 return num;
3707 #ifdef RE_ENABLE_I18N
3708 static void
3709 free_charset (re_charset_t *cset)
3711 re_free (cset->mbchars);
3712 # ifdef _LIBC
3713 re_free (cset->coll_syms);
3714 re_free (cset->equiv_classes);
3715 re_free (cset->range_starts);
3716 re_free (cset->range_ends);
3717 # endif
3718 re_free (cset->char_classes);
3719 re_free (cset);
3721 #endif /* RE_ENABLE_I18N */
3723 /* Functions for binary tree operation. */
3725 /* Create a tree node. */
3727 static bin_tree_t *
3728 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3729 re_token_type_t type)
3731 re_token_t t;
3732 t.type = type;
3733 return create_token_tree (dfa, left, right, &t);
3736 static bin_tree_t *
3737 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3738 const re_token_t *token)
3740 bin_tree_t *tree;
3741 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3743 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3745 if (storage == NULL)
3746 return NULL;
3747 storage->next = dfa->str_tree_storage;
3748 dfa->str_tree_storage = storage;
3749 dfa->str_tree_storage_idx = 0;
3751 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3753 tree->parent = NULL;
3754 tree->left = left;
3755 tree->right = right;
3756 tree->token = *token;
3757 tree->token.duplicated = 0;
3758 tree->token.opt_subexp = 0;
3759 tree->first = NULL;
3760 tree->next = NULL;
3761 tree->node_idx = -1;
3763 if (left != NULL)
3764 left->parent = tree;
3765 if (right != NULL)
3766 right->parent = tree;
3767 return tree;
3770 /* Mark the tree SRC as an optional subexpression.
3771 To be called from preorder or postorder. */
3773 static reg_errcode_t
3774 mark_opt_subexp (void *extra, bin_tree_t *node)
3776 int idx = (int) (long) extra;
3777 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3778 node->token.opt_subexp = 1;
3780 return REG_NOERROR;
3783 /* Free the allocated memory inside NODE. */
3785 static void
3786 free_token (re_token_t *node)
3788 #ifdef RE_ENABLE_I18N
3789 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3790 free_charset (node->opr.mbcset);
3791 else
3792 #endif /* RE_ENABLE_I18N */
3793 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3794 re_free (node->opr.sbcset);
3797 /* Worker function for tree walking. Free the allocated memory inside NODE
3798 and its children. */
3800 static reg_errcode_t
3801 free_tree (void *extra, bin_tree_t *node)
3803 free_token (&node->token);
3804 return REG_NOERROR;
3808 /* Duplicate the node SRC, and return new node. This is a preorder
3809 visit similar to the one implemented by the generic visitor, but
3810 we need more infrastructure to maintain two parallel trees --- so,
3811 it's easier to duplicate. */
3813 static bin_tree_t *
3814 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3816 const bin_tree_t *node;
3817 bin_tree_t *dup_root;
3818 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3820 for (node = root; ; )
3822 /* Create a new tree and link it back to the current parent. */
3823 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3824 if (*p_new == NULL)
3825 return NULL;
3826 (*p_new)->parent = dup_node;
3827 (*p_new)->token.duplicated = 1;
3828 dup_node = *p_new;
3830 /* Go to the left node, or up and to the right. */
3831 if (node->left)
3833 node = node->left;
3834 p_new = &dup_node->left;
3836 else
3838 const bin_tree_t *prev = NULL;
3839 while (node->right == prev || node->right == NULL)
3841 prev = node;
3842 node = node->parent;
3843 dup_node = dup_node->parent;
3844 if (!node)
3845 return dup_root;
3847 node = node->right;
3848 p_new = &dup_node->right;