havelib: Support overriding the result of AC_LIB_PREPARE_MULTILIB.
[gnulib.git] / lib / regcomp.c
blob9fd4fed99943ede3e23899e032b57e6a01032618
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2017 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 #ifdef _LIBC
21 # include <locale/weight.h>
22 #endif
24 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
25 size_t length, reg_syntax_t syntax);
26 static void re_compile_fastmap_iter (regex_t *bufp,
27 const re_dfastate_t *init_state,
28 char *fastmap);
29 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
30 #ifdef RE_ENABLE_I18N
31 static void free_charset (re_charset_t *cset);
32 #endif /* RE_ENABLE_I18N */
33 static void free_workarea_compile (regex_t *preg);
34 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
35 #ifdef RE_ENABLE_I18N
36 static void optimize_utf8 (re_dfa_t *dfa);
37 #endif
38 static reg_errcode_t analyze (regex_t *preg);
39 static reg_errcode_t preorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42 static reg_errcode_t postorder (bin_tree_t *root,
43 reg_errcode_t (fn (void *, bin_tree_t *)),
44 void *extra);
45 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
46 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
47 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
48 bin_tree_t *node);
49 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
50 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
51 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
52 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
53 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
54 unsigned int constraint);
55 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
56 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
57 Idx node, bool root);
58 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
59 static Idx fetch_number (re_string_t *input, re_token_t *token,
60 reg_syntax_t syntax);
61 static int peek_token (re_token_t *token, re_string_t *input,
62 reg_syntax_t syntax) internal_function;
63 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
64 reg_syntax_t syntax, reg_errcode_t *err);
65 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 Idx nest, reg_errcode_t *err);
68 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 Idx nest, reg_errcode_t *err);
71 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 Idx nest, reg_errcode_t *err);
74 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
75 re_token_t *token, reg_syntax_t syntax,
76 Idx nest, reg_errcode_t *err);
77 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
78 re_dfa_t *dfa, re_token_t *token,
79 reg_syntax_t syntax, reg_errcode_t *err);
80 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
81 re_token_t *token, reg_syntax_t syntax,
82 reg_errcode_t *err);
83 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
84 re_string_t *regexp,
85 re_token_t *token, int token_len,
86 re_dfa_t *dfa,
87 reg_syntax_t syntax,
88 bool accept_hyphen);
89 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
90 re_string_t *regexp,
91 re_token_t *token);
92 #ifdef RE_ENABLE_I18N
93 static reg_errcode_t build_equiv_class (bitset_t sbcset,
94 re_charset_t *mbcset,
95 Idx *equiv_class_alloc,
96 const unsigned char *name);
97 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
98 bitset_t sbcset,
99 re_charset_t *mbcset,
100 Idx *char_class_alloc,
101 const char *class_name,
102 reg_syntax_t syntax);
103 #else /* not RE_ENABLE_I18N */
104 static reg_errcode_t build_equiv_class (bitset_t sbcset,
105 const unsigned char *name);
106 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
107 bitset_t sbcset,
108 const char *class_name,
109 reg_syntax_t syntax);
110 #endif /* not RE_ENABLE_I18N */
111 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
112 RE_TRANSLATE_TYPE trans,
113 const char *class_name,
114 const char *extra,
115 bool non_match, reg_errcode_t *err);
116 static bin_tree_t *create_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 re_token_type_t type);
119 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
120 bin_tree_t *left, bin_tree_t *right,
121 const re_token_t *token);
122 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
123 static void free_token (re_token_t *node);
124 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
125 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
127 /* This table gives an error message for each of the error codes listed
128 in regex.h. Obviously the order here has to be same as there.
129 POSIX doesn't require that we do anything for REG_NOERROR,
130 but why not be nice? */
132 static const char __re_error_msgid[] =
134 #define REG_NOERROR_IDX 0
135 gettext_noop ("Success") /* REG_NOERROR */
136 "\0"
137 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
138 gettext_noop ("No match") /* REG_NOMATCH */
139 "\0"
140 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
141 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
142 "\0"
143 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
144 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
145 "\0"
146 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
147 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
148 "\0"
149 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
150 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
151 "\0"
152 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
153 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
154 "\0"
155 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
156 gettext_noop ("Unmatched [, [^, [:, [., or [=") /* REG_EBRACK */
157 "\0"
158 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [, [^, [:, [., or [=")
159 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
160 "\0"
161 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
162 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
163 "\0"
164 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
165 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
166 "\0"
167 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
168 gettext_noop ("Invalid range end") /* REG_ERANGE */
169 "\0"
170 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
171 gettext_noop ("Memory exhausted") /* REG_ESPACE */
172 "\0"
173 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
174 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
175 "\0"
176 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
177 gettext_noop ("Premature end of regular expression") /* REG_EEND */
178 "\0"
179 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
180 gettext_noop ("Regular expression too big") /* REG_ESIZE */
181 "\0"
182 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
183 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
186 static const size_t __re_error_msgid_idx[] =
188 REG_NOERROR_IDX,
189 REG_NOMATCH_IDX,
190 REG_BADPAT_IDX,
191 REG_ECOLLATE_IDX,
192 REG_ECTYPE_IDX,
193 REG_EESCAPE_IDX,
194 REG_ESUBREG_IDX,
195 REG_EBRACK_IDX,
196 REG_EPAREN_IDX,
197 REG_EBRACE_IDX,
198 REG_BADBR_IDX,
199 REG_ERANGE_IDX,
200 REG_ESPACE_IDX,
201 REG_BADRPT_IDX,
202 REG_EEND_IDX,
203 REG_ESIZE_IDX,
204 REG_ERPAREN_IDX
207 /* Entry points for GNU code. */
209 /* re_compile_pattern is the GNU regular expression compiler: it
210 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
211 Returns 0 if the pattern was valid, otherwise an error string.
213 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
214 are set in BUFP on entry. */
216 const char *
217 re_compile_pattern (const char *pattern, size_t length,
218 struct re_pattern_buffer *bufp)
220 reg_errcode_t ret;
222 /* And GNU code determines whether or not to get register information
223 by passing null for the REGS argument to re_match, etc., not by
224 setting no_sub, unless RE_NO_SUB is set. */
225 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
227 /* Match anchors at newline. */
228 bufp->newline_anchor = 1;
230 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
232 if (!ret)
233 return NULL;
234 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
236 #ifdef _LIBC
237 weak_alias (__re_compile_pattern, re_compile_pattern)
238 #endif
240 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
241 also be assigned to arbitrarily: each pattern buffer stores its own
242 syntax, so it can be changed between regex compilations. */
243 /* This has no initializer because initialized variables in Emacs
244 become read-only after dumping. */
245 reg_syntax_t re_syntax_options;
248 /* Specify the precise syntax of regexps for compilation. This provides
249 for compatibility for various utilities which historically have
250 different, incompatible syntaxes.
252 The argument SYNTAX is a bit mask comprised of the various bits
253 defined in regex.h. We return the old syntax. */
255 reg_syntax_t
256 re_set_syntax (reg_syntax_t syntax)
258 reg_syntax_t ret = re_syntax_options;
260 re_syntax_options = syntax;
261 return ret;
263 #ifdef _LIBC
264 weak_alias (__re_set_syntax, re_set_syntax)
265 #endif
268 re_compile_fastmap (struct re_pattern_buffer *bufp)
270 re_dfa_t *dfa = bufp->buffer;
271 char *fastmap = bufp->fastmap;
273 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
274 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
275 if (dfa->init_state != dfa->init_state_word)
276 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
277 if (dfa->init_state != dfa->init_state_nl)
278 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
279 if (dfa->init_state != dfa->init_state_begbuf)
280 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
281 bufp->fastmap_accurate = 1;
282 return 0;
284 #ifdef _LIBC
285 weak_alias (__re_compile_fastmap, re_compile_fastmap)
286 #endif
288 static inline void
289 __attribute__ ((always_inline))
290 re_set_fastmap (char *fastmap, bool icase, int ch)
292 fastmap[ch] = 1;
293 if (icase)
294 fastmap[tolower (ch)] = 1;
297 /* Helper function for re_compile_fastmap.
298 Compile fastmap for the initial_state INIT_STATE. */
300 static void
301 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
302 char *fastmap)
304 re_dfa_t *dfa = bufp->buffer;
305 Idx node_cnt;
306 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
307 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
309 Idx node = init_state->nodes.elems[node_cnt];
310 re_token_type_t type = dfa->nodes[node].type;
312 if (type == CHARACTER)
314 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
315 #ifdef RE_ENABLE_I18N
316 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
318 unsigned char buf[MB_LEN_MAX];
319 unsigned char *p;
320 wchar_t wc;
321 mbstate_t state;
323 p = buf;
324 *p++ = dfa->nodes[node].opr.c;
325 while (++node < dfa->nodes_len
326 && dfa->nodes[node].type == CHARACTER
327 && dfa->nodes[node].mb_partial)
328 *p++ = dfa->nodes[node].opr.c;
329 memset (&state, '\0', sizeof (state));
330 if (__mbrtowc (&wc, (const char *) buf, p - buf,
331 &state) == p - buf
332 && (__wcrtomb ((char *) buf, __towlower (wc), &state)
333 != (size_t) -1))
334 re_set_fastmap (fastmap, false, buf[0]);
336 #endif
338 else if (type == SIMPLE_BRACKET)
340 int i, ch;
341 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
343 int j;
344 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
345 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
346 if (w & ((bitset_word_t) 1 << j))
347 re_set_fastmap (fastmap, icase, ch);
350 #ifdef RE_ENABLE_I18N
351 else if (type == COMPLEX_BRACKET)
353 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
354 Idx i;
356 # ifdef _LIBC
357 /* See if we have to try all bytes which start multiple collation
358 elements.
359 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
360 collation element, and don't catch 'b' since 'b' is
361 the only collation element which starts from 'b' (and
362 it is caught by SIMPLE_BRACKET). */
363 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
364 && (cset->ncoll_syms || cset->nranges))
366 const int32_t *table = (const int32_t *)
367 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
368 for (i = 0; i < SBC_MAX; ++i)
369 if (table[i] < 0)
370 re_set_fastmap (fastmap, icase, i);
372 # endif /* _LIBC */
374 /* See if we have to start the match at all multibyte characters,
375 i.e. where we would not find an invalid sequence. This only
376 applies to multibyte character sets; for single byte character
377 sets, the SIMPLE_BRACKET again suffices. */
378 if (dfa->mb_cur_max > 1
379 && (cset->nchar_classes || cset->non_match || cset->nranges
380 # ifdef _LIBC
381 || cset->nequiv_classes
382 # endif /* _LIBC */
385 unsigned char c = 0;
388 mbstate_t mbs;
389 memset (&mbs, 0, sizeof (mbs));
390 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
391 re_set_fastmap (fastmap, false, (int) c);
393 while (++c != 0);
396 else
398 /* ... Else catch all bytes which can start the mbchars. */
399 for (i = 0; i < cset->nmbchars; ++i)
401 char buf[256];
402 mbstate_t state;
403 memset (&state, '\0', sizeof (state));
404 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
405 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
406 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
408 if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
409 != (size_t) -1)
410 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
415 #endif /* RE_ENABLE_I18N */
416 else if (type == OP_PERIOD
417 #ifdef RE_ENABLE_I18N
418 || type == OP_UTF8_PERIOD
419 #endif /* RE_ENABLE_I18N */
420 || type == END_OF_RE)
422 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
423 if (type == END_OF_RE)
424 bufp->can_be_null = 1;
425 return;
430 /* Entry point for POSIX code. */
431 /* regcomp takes a regular expression as a string and compiles it.
433 PREG is a regex_t *. We do not expect any fields to be initialized,
434 since POSIX says we shouldn't. Thus, we set
436 'buffer' to the compiled pattern;
437 'used' to the length of the compiled pattern;
438 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
439 REG_EXTENDED bit in CFLAGS is set; otherwise, to
440 RE_SYNTAX_POSIX_BASIC;
441 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
442 'fastmap' to an allocated space for the fastmap;
443 'fastmap_accurate' to zero;
444 're_nsub' to the number of subexpressions in PATTERN.
446 PATTERN is the address of the pattern string.
448 CFLAGS is a series of bits which affect compilation.
450 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
451 use POSIX basic syntax.
453 If REG_NEWLINE is set, then . and [^...] don't match newline.
454 Also, regexec will try a match beginning after every newline.
456 If REG_ICASE is set, then we considers upper- and lowercase
457 versions of letters to be equivalent when matching.
459 If REG_NOSUB is set, then when PREG is passed to regexec, that
460 routine will report only success or failure, and nothing about the
461 registers.
463 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
464 the return codes and their meanings.) */
467 regcomp (regex_t *_Restrict_ preg, const char *_Restrict_ pattern, int cflags)
469 reg_errcode_t ret;
470 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
471 : RE_SYNTAX_POSIX_BASIC);
473 preg->buffer = NULL;
474 preg->allocated = 0;
475 preg->used = 0;
477 /* Try to allocate space for the fastmap. */
478 preg->fastmap = re_malloc (char, SBC_MAX);
479 if (BE (preg->fastmap == NULL, 0))
480 return REG_ESPACE;
482 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
484 /* If REG_NEWLINE is set, newlines are treated differently. */
485 if (cflags & REG_NEWLINE)
486 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
487 syntax &= ~RE_DOT_NEWLINE;
488 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
489 /* It also changes the matching behavior. */
490 preg->newline_anchor = 1;
492 else
493 preg->newline_anchor = 0;
494 preg->no_sub = !!(cflags & REG_NOSUB);
495 preg->translate = NULL;
497 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
499 /* POSIX doesn't distinguish between an unmatched open-group and an
500 unmatched close-group: both are REG_EPAREN. */
501 if (ret == REG_ERPAREN)
502 ret = REG_EPAREN;
504 /* We have already checked preg->fastmap != NULL. */
505 if (BE (ret == REG_NOERROR, 1))
506 /* Compute the fastmap now, since regexec cannot modify the pattern
507 buffer. This function never fails in this implementation. */
508 (void) re_compile_fastmap (preg);
509 else
511 /* Some error occurred while compiling the expression. */
512 re_free (preg->fastmap);
513 preg->fastmap = NULL;
516 return (int) ret;
518 #ifdef _LIBC
519 weak_alias (__regcomp, regcomp)
520 #endif
522 /* Returns a message corresponding to an error code, ERRCODE, returned
523 from either regcomp or regexec. We don't use PREG here. */
525 size_t
526 regerror (int errcode, const regex_t *_Restrict_ preg, char *_Restrict_ errbuf,
527 size_t errbuf_size)
529 const char *msg;
530 size_t msg_size;
532 if (BE (errcode < 0
533 || errcode >= (int) (sizeof (__re_error_msgid_idx)
534 / sizeof (__re_error_msgid_idx[0])), 0))
535 /* Only error codes returned by the rest of the code should be passed
536 to this routine. If we are given anything else, or if other regex
537 code generates an invalid error code, then the program has a bug.
538 Dump core so we can fix it. */
539 abort ();
541 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
543 msg_size = strlen (msg) + 1; /* Includes the null. */
545 if (BE (errbuf_size != 0, 1))
547 size_t cpy_size = msg_size;
548 if (BE (msg_size > errbuf_size, 0))
550 cpy_size = errbuf_size - 1;
551 errbuf[cpy_size] = '\0';
553 memcpy (errbuf, msg, cpy_size);
556 return msg_size;
558 #ifdef _LIBC
559 weak_alias (__regerror, regerror)
560 #endif
563 #ifdef RE_ENABLE_I18N
564 /* This static array is used for the map to single-byte characters when
565 UTF-8 is used. Otherwise we would allocate memory just to initialize
566 it the same all the time. UTF-8 is the preferred encoding so this is
567 a worthwhile optimization. */
568 static const bitset_t utf8_sb_map =
570 /* Set the first 128 bits. */
571 # if defined __GNUC__ && !defined __STRICT_ANSI__
572 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
573 # else
574 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
575 # error "bitset_word_t is narrower than 32 bits"
576 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
577 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
578 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
579 BITSET_WORD_MAX, BITSET_WORD_MAX,
580 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
581 BITSET_WORD_MAX,
582 # endif
583 (BITSET_WORD_MAX
584 >> (SBC_MAX % BITSET_WORD_BITS == 0
586 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
587 # endif
589 #endif
592 static void
593 free_dfa_content (re_dfa_t *dfa)
595 Idx i, j;
597 if (dfa->nodes)
598 for (i = 0; i < dfa->nodes_len; ++i)
599 free_token (dfa->nodes + i);
600 re_free (dfa->nexts);
601 for (i = 0; i < dfa->nodes_len; ++i)
603 if (dfa->eclosures != NULL)
604 re_node_set_free (dfa->eclosures + i);
605 if (dfa->inveclosures != NULL)
606 re_node_set_free (dfa->inveclosures + i);
607 if (dfa->edests != NULL)
608 re_node_set_free (dfa->edests + i);
610 re_free (dfa->edests);
611 re_free (dfa->eclosures);
612 re_free (dfa->inveclosures);
613 re_free (dfa->nodes);
615 if (dfa->state_table)
616 for (i = 0; i <= dfa->state_hash_mask; ++i)
618 struct re_state_table_entry *entry = dfa->state_table + i;
619 for (j = 0; j < entry->num; ++j)
621 re_dfastate_t *state = entry->array[j];
622 free_state (state);
624 re_free (entry->array);
626 re_free (dfa->state_table);
627 #ifdef RE_ENABLE_I18N
628 if (dfa->sb_char != utf8_sb_map)
629 re_free (dfa->sb_char);
630 #endif
631 re_free (dfa->subexp_map);
632 #ifdef DEBUG
633 re_free (dfa->re_str);
634 #endif
636 re_free (dfa);
640 /* Free dynamically allocated space used by PREG. */
642 void
643 regfree (regex_t *preg)
645 re_dfa_t *dfa = preg->buffer;
646 if (BE (dfa != NULL, 1))
648 lock_fini (dfa->lock);
649 free_dfa_content (dfa);
651 preg->buffer = NULL;
652 preg->allocated = 0;
654 re_free (preg->fastmap);
655 preg->fastmap = NULL;
657 re_free (preg->translate);
658 preg->translate = NULL;
660 #ifdef _LIBC
661 weak_alias (__regfree, regfree)
662 #endif
664 /* Entry points compatible with 4.2 BSD regex library. We don't define
665 them unless specifically requested. */
667 #if defined _REGEX_RE_COMP || defined _LIBC
669 /* BSD has one and only one pattern buffer. */
670 static struct re_pattern_buffer re_comp_buf;
672 char *
673 # ifdef _LIBC
674 /* Make these definitions weak in libc, so POSIX programs can redefine
675 these names if they don't use our functions, and still use
676 regcomp/regexec above without link errors. */
677 weak_function
678 # endif
679 re_comp (const char *s)
681 reg_errcode_t ret;
682 char *fastmap;
684 if (!s)
686 if (!re_comp_buf.buffer)
687 return gettext ("No previous regular expression");
688 return 0;
691 if (re_comp_buf.buffer)
693 fastmap = re_comp_buf.fastmap;
694 re_comp_buf.fastmap = NULL;
695 __regfree (&re_comp_buf);
696 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
697 re_comp_buf.fastmap = fastmap;
700 if (re_comp_buf.fastmap == NULL)
702 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
703 if (re_comp_buf.fastmap == NULL)
704 return (char *) gettext (__re_error_msgid
705 + __re_error_msgid_idx[(int) REG_ESPACE]);
708 /* Since 're_exec' always passes NULL for the 'regs' argument, we
709 don't need to initialize the pattern buffer fields which affect it. */
711 /* Match anchors at newlines. */
712 re_comp_buf.newline_anchor = 1;
714 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
716 if (!ret)
717 return NULL;
719 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
720 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
723 #ifdef _LIBC
724 libc_freeres_fn (free_mem)
726 __regfree (&re_comp_buf);
728 #endif
730 #endif /* _REGEX_RE_COMP */
732 /* Internal entry point.
733 Compile the regular expression PATTERN, whose length is LENGTH.
734 SYNTAX indicate regular expression's syntax. */
736 static reg_errcode_t
737 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
738 reg_syntax_t syntax)
740 reg_errcode_t err = REG_NOERROR;
741 re_dfa_t *dfa;
742 re_string_t regexp;
744 /* Initialize the pattern buffer. */
745 preg->fastmap_accurate = 0;
746 preg->syntax = syntax;
747 preg->not_bol = preg->not_eol = 0;
748 preg->used = 0;
749 preg->re_nsub = 0;
750 preg->can_be_null = 0;
751 preg->regs_allocated = REGS_UNALLOCATED;
753 /* Initialize the dfa. */
754 dfa = preg->buffer;
755 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
757 /* If zero allocated, but buffer is non-null, try to realloc
758 enough space. This loses if buffer's address is bogus, but
759 that is the user's responsibility. If ->buffer is NULL this
760 is a simple allocation. */
761 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
762 if (dfa == NULL)
763 return REG_ESPACE;
764 preg->allocated = sizeof (re_dfa_t);
765 preg->buffer = dfa;
767 preg->used = sizeof (re_dfa_t);
769 err = init_dfa (dfa, length);
770 if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
771 err = REG_ESPACE;
772 if (BE (err != REG_NOERROR, 0))
774 free_dfa_content (dfa);
775 preg->buffer = NULL;
776 preg->allocated = 0;
777 return err;
779 #ifdef DEBUG
780 /* Note: length+1 will not overflow since it is checked in init_dfa. */
781 dfa->re_str = re_malloc (char, length + 1);
782 strncpy (dfa->re_str, pattern, length + 1);
783 #endif
785 err = re_string_construct (&regexp, pattern, length, preg->translate,
786 (syntax & RE_ICASE) != 0, dfa);
787 if (BE (err != REG_NOERROR, 0))
789 re_compile_internal_free_return:
790 free_workarea_compile (preg);
791 re_string_destruct (&regexp);
792 lock_fini (dfa->lock);
793 free_dfa_content (dfa);
794 preg->buffer = NULL;
795 preg->allocated = 0;
796 return err;
799 /* Parse the regular expression, and build a structure tree. */
800 preg->re_nsub = 0;
801 dfa->str_tree = parse (&regexp, preg, syntax, &err);
802 if (BE (dfa->str_tree == NULL, 0))
803 goto re_compile_internal_free_return;
805 /* Analyze the tree and create the nfa. */
806 err = analyze (preg);
807 if (BE (err != REG_NOERROR, 0))
808 goto re_compile_internal_free_return;
810 #ifdef RE_ENABLE_I18N
811 /* If possible, do searching in single byte encoding to speed things up. */
812 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
813 optimize_utf8 (dfa);
814 #endif
816 /* Then create the initial state of the dfa. */
817 err = create_initial_state (dfa);
819 /* Release work areas. */
820 free_workarea_compile (preg);
821 re_string_destruct (&regexp);
823 if (BE (err != REG_NOERROR, 0))
825 lock_fini (dfa->lock);
826 free_dfa_content (dfa);
827 preg->buffer = NULL;
828 preg->allocated = 0;
831 return err;
834 /* Initialize DFA. We use the length of the regular expression PAT_LEN
835 as the initial length of some arrays. */
837 static reg_errcode_t
838 init_dfa (re_dfa_t *dfa, size_t pat_len)
840 __re_size_t table_size;
841 #ifndef _LIBC
842 const char *codeset_name;
843 #endif
844 #ifdef RE_ENABLE_I18N
845 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
846 #else
847 size_t max_i18n_object_size = 0;
848 #endif
849 size_t max_object_size =
850 MAX (sizeof (struct re_state_table_entry),
851 MAX (sizeof (re_token_t),
852 MAX (sizeof (re_node_set),
853 MAX (sizeof (regmatch_t),
854 max_i18n_object_size))));
856 memset (dfa, '\0', sizeof (re_dfa_t));
858 /* Force allocation of str_tree_storage the first time. */
859 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
861 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
862 calculation below, and for similar doubling calculations
863 elsewhere. And it's <= rather than <, because some of the
864 doubling calculations add 1 afterwards. */
865 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
866 return REG_ESPACE;
868 dfa->nodes_alloc = pat_len + 1;
869 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
871 /* table_size = 2 ^ ceil(log pat_len) */
872 for (table_size = 1; ; table_size <<= 1)
873 if (table_size > pat_len)
874 break;
876 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
877 dfa->state_hash_mask = table_size - 1;
879 dfa->mb_cur_max = MB_CUR_MAX;
880 #ifdef _LIBC
881 if (dfa->mb_cur_max == 6
882 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
883 dfa->is_utf8 = 1;
884 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
885 != 0);
886 #else
887 codeset_name = nl_langinfo (CODESET);
888 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
889 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
890 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
891 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
892 dfa->is_utf8 = 1;
894 /* We check exhaustively in the loop below if this charset is a
895 superset of ASCII. */
896 dfa->map_notascii = 0;
897 #endif
899 #ifdef RE_ENABLE_I18N
900 if (dfa->mb_cur_max > 1)
902 if (dfa->is_utf8)
903 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
904 else
906 int i, j, ch;
908 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
909 if (BE (dfa->sb_char == NULL, 0))
910 return REG_ESPACE;
912 /* Set the bits corresponding to single byte chars. */
913 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
914 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
916 wint_t wch = __btowc (ch);
917 if (wch != WEOF)
918 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
919 # ifndef _LIBC
920 if (isascii (ch) && wch != ch)
921 dfa->map_notascii = 1;
922 # endif
926 #endif
928 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
929 return REG_ESPACE;
930 return REG_NOERROR;
933 /* Initialize WORD_CHAR table, which indicate which character is
934 "word". In this case "word" means that it is the word construction
935 character used by some operators like "\<", "\>", etc. */
937 static void
938 internal_function
939 init_word_char (re_dfa_t *dfa)
941 int i = 0;
942 int j;
943 int ch = 0;
944 dfa->word_ops_used = 1;
945 if (BE (dfa->map_notascii == 0, 1))
947 bitset_word_t bits0 = 0x00000000;
948 bitset_word_t bits1 = 0x03ff0000;
949 bitset_word_t bits2 = 0x87fffffe;
950 bitset_word_t bits3 = 0x07fffffe;
951 if (BITSET_WORD_BITS == 64)
953 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
954 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
955 i = 2;
957 else if (BITSET_WORD_BITS == 32)
959 dfa->word_char[0] = bits0;
960 dfa->word_char[1] = bits1;
961 dfa->word_char[2] = bits2;
962 dfa->word_char[3] = bits3;
963 i = 4;
965 else
966 goto general_case;
967 ch = 128;
969 if (BE (dfa->is_utf8, 1))
971 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
972 return;
976 general_case:
977 for (; i < BITSET_WORDS; ++i)
978 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
979 if (isalnum (ch) || ch == '_')
980 dfa->word_char[i] |= (bitset_word_t) 1 << j;
983 /* Free the work area which are only used while compiling. */
985 static void
986 free_workarea_compile (regex_t *preg)
988 re_dfa_t *dfa = preg->buffer;
989 bin_tree_storage_t *storage, *next;
990 for (storage = dfa->str_tree_storage; storage; storage = next)
992 next = storage->next;
993 re_free (storage);
995 dfa->str_tree_storage = NULL;
996 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
997 dfa->str_tree = NULL;
998 re_free (dfa->org_indices);
999 dfa->org_indices = NULL;
1002 /* Create initial states for all contexts. */
1004 static reg_errcode_t
1005 create_initial_state (re_dfa_t *dfa)
1007 Idx first, i;
1008 reg_errcode_t err;
1009 re_node_set init_nodes;
1011 /* Initial states have the epsilon closure of the node which is
1012 the first node of the regular expression. */
1013 first = dfa->str_tree->first->node_idx;
1014 dfa->init_node = first;
1015 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1016 if (BE (err != REG_NOERROR, 0))
1017 return err;
1019 /* The back-references which are in initial states can epsilon transit,
1020 since in this case all of the subexpressions can be null.
1021 Then we add epsilon closures of the nodes which are the next nodes of
1022 the back-references. */
1023 if (dfa->nbackref > 0)
1024 for (i = 0; i < init_nodes.nelem; ++i)
1026 Idx node_idx = init_nodes.elems[i];
1027 re_token_type_t type = dfa->nodes[node_idx].type;
1029 Idx clexp_idx;
1030 if (type != OP_BACK_REF)
1031 continue;
1032 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1034 re_token_t *clexp_node;
1035 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1036 if (clexp_node->type == OP_CLOSE_SUBEXP
1037 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1038 break;
1040 if (clexp_idx == init_nodes.nelem)
1041 continue;
1043 if (type == OP_BACK_REF)
1045 Idx dest_idx = dfa->edests[node_idx].elems[0];
1046 if (!re_node_set_contains (&init_nodes, dest_idx))
1048 reg_errcode_t merge_err
1049 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1050 if (merge_err != REG_NOERROR)
1051 return merge_err;
1052 i = 0;
1057 /* It must be the first time to invoke acquire_state. */
1058 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1059 /* We don't check ERR here, since the initial state must not be NULL. */
1060 if (BE (dfa->init_state == NULL, 0))
1061 return err;
1062 if (dfa->init_state->has_constraint)
1064 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1065 CONTEXT_WORD);
1066 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1067 CONTEXT_NEWLINE);
1068 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1069 &init_nodes,
1070 CONTEXT_NEWLINE
1071 | CONTEXT_BEGBUF);
1072 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1073 || dfa->init_state_begbuf == NULL, 0))
1074 return err;
1076 else
1077 dfa->init_state_word = dfa->init_state_nl
1078 = dfa->init_state_begbuf = dfa->init_state;
1080 re_node_set_free (&init_nodes);
1081 return REG_NOERROR;
1084 #ifdef RE_ENABLE_I18N
1085 /* If it is possible to do searching in single byte encoding instead of UTF-8
1086 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1087 DFA nodes where needed. */
1089 static void
1090 optimize_utf8 (re_dfa_t *dfa)
1092 Idx node;
1093 int i;
1094 bool mb_chars = false;
1095 bool has_period = false;
1097 for (node = 0; node < dfa->nodes_len; ++node)
1098 switch (dfa->nodes[node].type)
1100 case CHARACTER:
1101 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1102 mb_chars = true;
1103 break;
1104 case ANCHOR:
1105 switch (dfa->nodes[node].opr.ctx_type)
1107 case LINE_FIRST:
1108 case LINE_LAST:
1109 case BUF_FIRST:
1110 case BUF_LAST:
1111 break;
1112 default:
1113 /* Word anchors etc. cannot be handled. It's okay to test
1114 opr.ctx_type since constraints (for all DFA nodes) are
1115 created by ORing one or more opr.ctx_type values. */
1116 return;
1118 break;
1119 case OP_PERIOD:
1120 has_period = true;
1121 break;
1122 case OP_BACK_REF:
1123 case OP_ALT:
1124 case END_OF_RE:
1125 case OP_DUP_ASTERISK:
1126 case OP_OPEN_SUBEXP:
1127 case OP_CLOSE_SUBEXP:
1128 break;
1129 case COMPLEX_BRACKET:
1130 return;
1131 case SIMPLE_BRACKET:
1132 /* Just double check. */
1134 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1136 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1137 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1139 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1140 return;
1141 rshift = 0;
1144 break;
1145 default:
1146 abort ();
1149 if (mb_chars || has_period)
1150 for (node = 0; node < dfa->nodes_len; ++node)
1152 if (dfa->nodes[node].type == CHARACTER
1153 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1154 dfa->nodes[node].mb_partial = 0;
1155 else if (dfa->nodes[node].type == OP_PERIOD)
1156 dfa->nodes[node].type = OP_UTF8_PERIOD;
1159 /* The search can be in single byte locale. */
1160 dfa->mb_cur_max = 1;
1161 dfa->is_utf8 = 0;
1162 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1164 #endif
1166 /* Analyze the structure tree, and calculate "first", "next", "edest",
1167 "eclosure", and "inveclosure". */
1169 static reg_errcode_t
1170 analyze (regex_t *preg)
1172 re_dfa_t *dfa = preg->buffer;
1173 reg_errcode_t ret;
1175 /* Allocate arrays. */
1176 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1177 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1178 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1179 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1180 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1181 || dfa->eclosures == NULL, 0))
1182 return REG_ESPACE;
1184 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1185 if (dfa->subexp_map != NULL)
1187 Idx i;
1188 for (i = 0; i < preg->re_nsub; i++)
1189 dfa->subexp_map[i] = i;
1190 preorder (dfa->str_tree, optimize_subexps, dfa);
1191 for (i = 0; i < preg->re_nsub; i++)
1192 if (dfa->subexp_map[i] != i)
1193 break;
1194 if (i == preg->re_nsub)
1196 free (dfa->subexp_map);
1197 dfa->subexp_map = NULL;
1201 ret = postorder (dfa->str_tree, lower_subexps, preg);
1202 if (BE (ret != REG_NOERROR, 0))
1203 return ret;
1204 ret = postorder (dfa->str_tree, calc_first, dfa);
1205 if (BE (ret != REG_NOERROR, 0))
1206 return ret;
1207 preorder (dfa->str_tree, calc_next, dfa);
1208 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1209 if (BE (ret != REG_NOERROR, 0))
1210 return ret;
1211 ret = calc_eclosure (dfa);
1212 if (BE (ret != REG_NOERROR, 0))
1213 return ret;
1215 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1216 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1217 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1218 || dfa->nbackref)
1220 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1221 if (BE (dfa->inveclosures == NULL, 0))
1222 return REG_ESPACE;
1223 ret = calc_inveclosure (dfa);
1226 return ret;
1229 /* Our parse trees are very unbalanced, so we cannot use a stack to
1230 implement parse tree visits. Instead, we use parent pointers and
1231 some hairy code in these two functions. */
1232 static reg_errcode_t
1233 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1234 void *extra)
1236 bin_tree_t *node, *prev;
1238 for (node = root; ; )
1240 /* Descend down the tree, preferably to the left (or to the right
1241 if that's the only child). */
1242 while (node->left || node->right)
1243 if (node->left)
1244 node = node->left;
1245 else
1246 node = node->right;
1250 reg_errcode_t err = fn (extra, node);
1251 if (BE (err != REG_NOERROR, 0))
1252 return err;
1253 if (node->parent == NULL)
1254 return REG_NOERROR;
1255 prev = node;
1256 node = node->parent;
1258 /* Go up while we have a node that is reached from the right. */
1259 while (node->right == prev || node->right == NULL);
1260 node = node->right;
1264 static reg_errcode_t
1265 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1266 void *extra)
1268 bin_tree_t *node;
1270 for (node = root; ; )
1272 reg_errcode_t err = fn (extra, node);
1273 if (BE (err != REG_NOERROR, 0))
1274 return err;
1276 /* Go to the left node, or up and to the right. */
1277 if (node->left)
1278 node = node->left;
1279 else
1281 bin_tree_t *prev = NULL;
1282 while (node->right == prev || node->right == NULL)
1284 prev = node;
1285 node = node->parent;
1286 if (!node)
1287 return REG_NOERROR;
1289 node = node->right;
1294 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1295 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1296 backreferences as well. Requires a preorder visit. */
1297 static reg_errcode_t
1298 optimize_subexps (void *extra, bin_tree_t *node)
1300 re_dfa_t *dfa = (re_dfa_t *) extra;
1302 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1304 int idx = node->token.opr.idx;
1305 node->token.opr.idx = dfa->subexp_map[idx];
1306 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1309 else if (node->token.type == SUBEXP
1310 && node->left && node->left->token.type == SUBEXP)
1312 Idx other_idx = node->left->token.opr.idx;
1314 node->left = node->left->left;
1315 if (node->left)
1316 node->left->parent = node;
1318 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1319 if (other_idx < BITSET_WORD_BITS)
1320 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1323 return REG_NOERROR;
1326 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1327 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1328 static reg_errcode_t
1329 lower_subexps (void *extra, bin_tree_t *node)
1331 regex_t *preg = (regex_t *) extra;
1332 reg_errcode_t err = REG_NOERROR;
1334 if (node->left && node->left->token.type == SUBEXP)
1336 node->left = lower_subexp (&err, preg, node->left);
1337 if (node->left)
1338 node->left->parent = node;
1340 if (node->right && node->right->token.type == SUBEXP)
1342 node->right = lower_subexp (&err, preg, node->right);
1343 if (node->right)
1344 node->right->parent = node;
1347 return err;
1350 static bin_tree_t *
1351 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1353 re_dfa_t *dfa = preg->buffer;
1354 bin_tree_t *body = node->left;
1355 bin_tree_t *op, *cls, *tree1, *tree;
1357 if (preg->no_sub
1358 /* We do not optimize empty subexpressions, because otherwise we may
1359 have bad CONCAT nodes with NULL children. This is obviously not
1360 very common, so we do not lose much. An example that triggers
1361 this case is the sed "script" /\(\)/x. */
1362 && node->left != NULL
1363 && (node->token.opr.idx >= BITSET_WORD_BITS
1364 || !(dfa->used_bkref_map
1365 & ((bitset_word_t) 1 << node->token.opr.idx))))
1366 return node->left;
1368 /* Convert the SUBEXP node to the concatenation of an
1369 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1370 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1371 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1372 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1373 tree = create_tree (dfa, op, tree1, CONCAT);
1374 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1376 *err = REG_ESPACE;
1377 return NULL;
1380 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1381 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1382 return tree;
1385 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1386 nodes. Requires a postorder visit. */
1387 static reg_errcode_t
1388 calc_first (void *extra, bin_tree_t *node)
1390 re_dfa_t *dfa = (re_dfa_t *) extra;
1391 if (node->token.type == CONCAT)
1393 node->first = node->left->first;
1394 node->node_idx = node->left->node_idx;
1396 else
1398 node->first = node;
1399 node->node_idx = re_dfa_add_node (dfa, node->token);
1400 if (BE (node->node_idx == -1, 0))
1401 return REG_ESPACE;
1402 if (node->token.type == ANCHOR)
1403 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1405 return REG_NOERROR;
1408 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1409 static reg_errcode_t
1410 calc_next (void *extra, bin_tree_t *node)
1412 switch (node->token.type)
1414 case OP_DUP_ASTERISK:
1415 node->left->next = node;
1416 break;
1417 case CONCAT:
1418 node->left->next = node->right->first;
1419 node->right->next = node->next;
1420 break;
1421 default:
1422 if (node->left)
1423 node->left->next = node->next;
1424 if (node->right)
1425 node->right->next = node->next;
1426 break;
1428 return REG_NOERROR;
1431 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1432 static reg_errcode_t
1433 link_nfa_nodes (void *extra, bin_tree_t *node)
1435 re_dfa_t *dfa = (re_dfa_t *) extra;
1436 Idx idx = node->node_idx;
1437 reg_errcode_t err = REG_NOERROR;
1439 switch (node->token.type)
1441 case CONCAT:
1442 break;
1444 case END_OF_RE:
1445 assert (node->next == NULL);
1446 break;
1448 case OP_DUP_ASTERISK:
1449 case OP_ALT:
1451 Idx left, right;
1452 dfa->has_plural_match = 1;
1453 if (node->left != NULL)
1454 left = node->left->first->node_idx;
1455 else
1456 left = node->next->node_idx;
1457 if (node->right != NULL)
1458 right = node->right->first->node_idx;
1459 else
1460 right = node->next->node_idx;
1461 assert (left > -1);
1462 assert (right > -1);
1463 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1465 break;
1467 case ANCHOR:
1468 case OP_OPEN_SUBEXP:
1469 case OP_CLOSE_SUBEXP:
1470 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1471 break;
1473 case OP_BACK_REF:
1474 dfa->nexts[idx] = node->next->node_idx;
1475 if (node->token.type == OP_BACK_REF)
1476 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1477 break;
1479 default:
1480 assert (!IS_EPSILON_NODE (node->token.type));
1481 dfa->nexts[idx] = node->next->node_idx;
1482 break;
1485 return err;
1488 /* Duplicate the epsilon closure of the node ROOT_NODE.
1489 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1490 to their own constraint. */
1492 static reg_errcode_t
1493 internal_function
1494 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1495 Idx root_node, unsigned int init_constraint)
1497 Idx org_node, clone_node;
1498 bool ok;
1499 unsigned int constraint = init_constraint;
1500 for (org_node = top_org_node, clone_node = top_clone_node;;)
1502 Idx org_dest, clone_dest;
1503 if (dfa->nodes[org_node].type == OP_BACK_REF)
1505 /* If the back reference epsilon-transit, its destination must
1506 also have the constraint. Then duplicate the epsilon closure
1507 of the destination of the back reference, and store it in
1508 edests of the back reference. */
1509 org_dest = dfa->nexts[org_node];
1510 re_node_set_empty (dfa->edests + clone_node);
1511 clone_dest = duplicate_node (dfa, org_dest, constraint);
1512 if (BE (clone_dest == -1, 0))
1513 return REG_ESPACE;
1514 dfa->nexts[clone_node] = dfa->nexts[org_node];
1515 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1516 if (BE (! ok, 0))
1517 return REG_ESPACE;
1519 else if (dfa->edests[org_node].nelem == 0)
1521 /* In case of the node can't epsilon-transit, don't duplicate the
1522 destination and store the original destination as the
1523 destination of the node. */
1524 dfa->nexts[clone_node] = dfa->nexts[org_node];
1525 break;
1527 else if (dfa->edests[org_node].nelem == 1)
1529 /* In case of the node can epsilon-transit, and it has only one
1530 destination. */
1531 org_dest = dfa->edests[org_node].elems[0];
1532 re_node_set_empty (dfa->edests + clone_node);
1533 /* If the node is root_node itself, it means the epsilon closure
1534 has a loop. Then tie it to the destination of the root_node. */
1535 if (org_node == root_node && clone_node != org_node)
1537 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1538 if (BE (! ok, 0))
1539 return REG_ESPACE;
1540 break;
1542 /* In case the node has another constraint, append it. */
1543 constraint |= dfa->nodes[org_node].constraint;
1544 clone_dest = duplicate_node (dfa, org_dest, constraint);
1545 if (BE (clone_dest == -1, 0))
1546 return REG_ESPACE;
1547 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1548 if (BE (! ok, 0))
1549 return REG_ESPACE;
1551 else /* dfa->edests[org_node].nelem == 2 */
1553 /* In case of the node can epsilon-transit, and it has two
1554 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1555 org_dest = dfa->edests[org_node].elems[0];
1556 re_node_set_empty (dfa->edests + clone_node);
1557 /* Search for a duplicated node which satisfies the constraint. */
1558 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1559 if (clone_dest == -1)
1561 /* There is no such duplicated node, create a new one. */
1562 reg_errcode_t err;
1563 clone_dest = duplicate_node (dfa, org_dest, constraint);
1564 if (BE (clone_dest == -1, 0))
1565 return REG_ESPACE;
1566 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1567 if (BE (! ok, 0))
1568 return REG_ESPACE;
1569 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1570 root_node, constraint);
1571 if (BE (err != REG_NOERROR, 0))
1572 return err;
1574 else
1576 /* There is a duplicated node which satisfies the constraint,
1577 use it to avoid infinite loop. */
1578 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1579 if (BE (! ok, 0))
1580 return REG_ESPACE;
1583 org_dest = dfa->edests[org_node].elems[1];
1584 clone_dest = duplicate_node (dfa, org_dest, constraint);
1585 if (BE (clone_dest == -1, 0))
1586 return REG_ESPACE;
1587 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1588 if (BE (! ok, 0))
1589 return REG_ESPACE;
1591 org_node = org_dest;
1592 clone_node = clone_dest;
1594 return REG_NOERROR;
1597 /* Search for a node which is duplicated from the node ORG_NODE, and
1598 satisfies the constraint CONSTRAINT. */
1600 static Idx
1601 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1602 unsigned int constraint)
1604 Idx idx;
1605 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1607 if (org_node == dfa->org_indices[idx]
1608 && constraint == dfa->nodes[idx].constraint)
1609 return idx; /* Found. */
1611 return -1; /* Not found. */
1614 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1615 Return the index of the new node, or -1 if insufficient storage is
1616 available. */
1618 static Idx
1619 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1621 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1622 if (BE (dup_idx != -1, 1))
1624 dfa->nodes[dup_idx].constraint = constraint;
1625 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1626 dfa->nodes[dup_idx].duplicated = 1;
1628 /* Store the index of the original node. */
1629 dfa->org_indices[dup_idx] = org_idx;
1631 return dup_idx;
1634 static reg_errcode_t
1635 calc_inveclosure (re_dfa_t *dfa)
1637 Idx src, idx;
1638 bool ok;
1639 for (idx = 0; idx < dfa->nodes_len; ++idx)
1640 re_node_set_init_empty (dfa->inveclosures + idx);
1642 for (src = 0; src < dfa->nodes_len; ++src)
1644 Idx *elems = dfa->eclosures[src].elems;
1645 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1647 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1648 if (BE (! ok, 0))
1649 return REG_ESPACE;
1653 return REG_NOERROR;
1656 /* Calculate "eclosure" for all the node in DFA. */
1658 static reg_errcode_t
1659 calc_eclosure (re_dfa_t *dfa)
1661 Idx node_idx;
1662 bool incomplete;
1663 #ifdef DEBUG
1664 assert (dfa->nodes_len > 0);
1665 #endif
1666 incomplete = false;
1667 /* For each nodes, calculate epsilon closure. */
1668 for (node_idx = 0; ; ++node_idx)
1670 reg_errcode_t err;
1671 re_node_set eclosure_elem;
1672 if (node_idx == dfa->nodes_len)
1674 if (!incomplete)
1675 break;
1676 incomplete = false;
1677 node_idx = 0;
1680 #ifdef DEBUG
1681 assert (dfa->eclosures[node_idx].nelem != -1);
1682 #endif
1684 /* If we have already calculated, skip it. */
1685 if (dfa->eclosures[node_idx].nelem != 0)
1686 continue;
1687 /* Calculate epsilon closure of 'node_idx'. */
1688 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1689 if (BE (err != REG_NOERROR, 0))
1690 return err;
1692 if (dfa->eclosures[node_idx].nelem == 0)
1694 incomplete = true;
1695 re_node_set_free (&eclosure_elem);
1698 return REG_NOERROR;
1701 /* Calculate epsilon closure of NODE. */
1703 static reg_errcode_t
1704 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1706 reg_errcode_t err;
1707 Idx i;
1708 re_node_set eclosure;
1709 bool ok;
1710 bool incomplete = false;
1711 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1712 if (BE (err != REG_NOERROR, 0))
1713 return err;
1715 /* This indicates that we are calculating this node now.
1716 We reference this value to avoid infinite loop. */
1717 dfa->eclosures[node].nelem = -1;
1719 /* If the current node has constraints, duplicate all nodes
1720 since they must inherit the constraints. */
1721 if (dfa->nodes[node].constraint
1722 && dfa->edests[node].nelem
1723 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1725 err = duplicate_node_closure (dfa, node, node, node,
1726 dfa->nodes[node].constraint);
1727 if (BE (err != REG_NOERROR, 0))
1728 return err;
1731 /* Expand each epsilon destination nodes. */
1732 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1733 for (i = 0; i < dfa->edests[node].nelem; ++i)
1735 re_node_set eclosure_elem;
1736 Idx edest = dfa->edests[node].elems[i];
1737 /* If calculating the epsilon closure of 'edest' is in progress,
1738 return intermediate result. */
1739 if (dfa->eclosures[edest].nelem == -1)
1741 incomplete = true;
1742 continue;
1744 /* If we haven't calculated the epsilon closure of 'edest' yet,
1745 calculate now. Otherwise use calculated epsilon closure. */
1746 if (dfa->eclosures[edest].nelem == 0)
1748 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1749 if (BE (err != REG_NOERROR, 0))
1750 return err;
1752 else
1753 eclosure_elem = dfa->eclosures[edest];
1754 /* Merge the epsilon closure of 'edest'. */
1755 err = re_node_set_merge (&eclosure, &eclosure_elem);
1756 if (BE (err != REG_NOERROR, 0))
1757 return err;
1758 /* If the epsilon closure of 'edest' is incomplete,
1759 the epsilon closure of this node is also incomplete. */
1760 if (dfa->eclosures[edest].nelem == 0)
1762 incomplete = true;
1763 re_node_set_free (&eclosure_elem);
1767 /* An epsilon closure includes itself. */
1768 ok = re_node_set_insert (&eclosure, node);
1769 if (BE (! ok, 0))
1770 return REG_ESPACE;
1771 if (incomplete && !root)
1772 dfa->eclosures[node].nelem = 0;
1773 else
1774 dfa->eclosures[node] = eclosure;
1775 *new_set = eclosure;
1776 return REG_NOERROR;
1779 /* Functions for token which are used in the parser. */
1781 /* Fetch a token from INPUT.
1782 We must not use this function inside bracket expressions. */
1784 static void
1785 internal_function
1786 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1788 re_string_skip_bytes (input, peek_token (result, input, syntax));
1791 /* Peek a token from INPUT, and return the length of the token.
1792 We must not use this function inside bracket expressions. */
1794 static int
1795 internal_function
1796 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1798 unsigned char c;
1800 if (re_string_eoi (input))
1802 token->type = END_OF_RE;
1803 return 0;
1806 c = re_string_peek_byte (input, 0);
1807 token->opr.c = c;
1809 token->word_char = 0;
1810 #ifdef RE_ENABLE_I18N
1811 token->mb_partial = 0;
1812 if (input->mb_cur_max > 1 &&
1813 !re_string_first_byte (input, re_string_cur_idx (input)))
1815 token->type = CHARACTER;
1816 token->mb_partial = 1;
1817 return 1;
1819 #endif
1820 if (c == '\\')
1822 unsigned char c2;
1823 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1825 token->type = BACK_SLASH;
1826 return 1;
1829 c2 = re_string_peek_byte_case (input, 1);
1830 token->opr.c = c2;
1831 token->type = CHARACTER;
1832 #ifdef RE_ENABLE_I18N
1833 if (input->mb_cur_max > 1)
1835 wint_t wc = re_string_wchar_at (input,
1836 re_string_cur_idx (input) + 1);
1837 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1839 else
1840 #endif
1841 token->word_char = IS_WORD_CHAR (c2) != 0;
1843 switch (c2)
1845 case '|':
1846 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1847 token->type = OP_ALT;
1848 break;
1849 case '1': case '2': case '3': case '4': case '5':
1850 case '6': case '7': case '8': case '9':
1851 if (!(syntax & RE_NO_BK_REFS))
1853 token->type = OP_BACK_REF;
1854 token->opr.idx = c2 - '1';
1856 break;
1857 case '<':
1858 if (!(syntax & RE_NO_GNU_OPS))
1860 token->type = ANCHOR;
1861 token->opr.ctx_type = WORD_FIRST;
1863 break;
1864 case '>':
1865 if (!(syntax & RE_NO_GNU_OPS))
1867 token->type = ANCHOR;
1868 token->opr.ctx_type = WORD_LAST;
1870 break;
1871 case 'b':
1872 if (!(syntax & RE_NO_GNU_OPS))
1874 token->type = ANCHOR;
1875 token->opr.ctx_type = WORD_DELIM;
1877 break;
1878 case 'B':
1879 if (!(syntax & RE_NO_GNU_OPS))
1881 token->type = ANCHOR;
1882 token->opr.ctx_type = NOT_WORD_DELIM;
1884 break;
1885 case 'w':
1886 if (!(syntax & RE_NO_GNU_OPS))
1887 token->type = OP_WORD;
1888 break;
1889 case 'W':
1890 if (!(syntax & RE_NO_GNU_OPS))
1891 token->type = OP_NOTWORD;
1892 break;
1893 case 's':
1894 if (!(syntax & RE_NO_GNU_OPS))
1895 token->type = OP_SPACE;
1896 break;
1897 case 'S':
1898 if (!(syntax & RE_NO_GNU_OPS))
1899 token->type = OP_NOTSPACE;
1900 break;
1901 case '`':
1902 if (!(syntax & RE_NO_GNU_OPS))
1904 token->type = ANCHOR;
1905 token->opr.ctx_type = BUF_FIRST;
1907 break;
1908 case '\'':
1909 if (!(syntax & RE_NO_GNU_OPS))
1911 token->type = ANCHOR;
1912 token->opr.ctx_type = BUF_LAST;
1914 break;
1915 case '(':
1916 if (!(syntax & RE_NO_BK_PARENS))
1917 token->type = OP_OPEN_SUBEXP;
1918 break;
1919 case ')':
1920 if (!(syntax & RE_NO_BK_PARENS))
1921 token->type = OP_CLOSE_SUBEXP;
1922 break;
1923 case '+':
1924 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1925 token->type = OP_DUP_PLUS;
1926 break;
1927 case '?':
1928 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1929 token->type = OP_DUP_QUESTION;
1930 break;
1931 case '{':
1932 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1933 token->type = OP_OPEN_DUP_NUM;
1934 break;
1935 case '}':
1936 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1937 token->type = OP_CLOSE_DUP_NUM;
1938 break;
1939 default:
1940 break;
1942 return 2;
1945 token->type = CHARACTER;
1946 #ifdef RE_ENABLE_I18N
1947 if (input->mb_cur_max > 1)
1949 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1950 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1952 else
1953 #endif
1954 token->word_char = IS_WORD_CHAR (token->opr.c);
1956 switch (c)
1958 case '\n':
1959 if (syntax & RE_NEWLINE_ALT)
1960 token->type = OP_ALT;
1961 break;
1962 case '|':
1963 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1964 token->type = OP_ALT;
1965 break;
1966 case '*':
1967 token->type = OP_DUP_ASTERISK;
1968 break;
1969 case '+':
1970 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1971 token->type = OP_DUP_PLUS;
1972 break;
1973 case '?':
1974 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1975 token->type = OP_DUP_QUESTION;
1976 break;
1977 case '{':
1978 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1979 token->type = OP_OPEN_DUP_NUM;
1980 break;
1981 case '}':
1982 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1983 token->type = OP_CLOSE_DUP_NUM;
1984 break;
1985 case '(':
1986 if (syntax & RE_NO_BK_PARENS)
1987 token->type = OP_OPEN_SUBEXP;
1988 break;
1989 case ')':
1990 if (syntax & RE_NO_BK_PARENS)
1991 token->type = OP_CLOSE_SUBEXP;
1992 break;
1993 case '[':
1994 token->type = OP_OPEN_BRACKET;
1995 break;
1996 case '.':
1997 token->type = OP_PERIOD;
1998 break;
1999 case '^':
2000 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2001 re_string_cur_idx (input) != 0)
2003 char prev = re_string_peek_byte (input, -1);
2004 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2005 break;
2007 token->type = ANCHOR;
2008 token->opr.ctx_type = LINE_FIRST;
2009 break;
2010 case '$':
2011 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2012 re_string_cur_idx (input) + 1 != re_string_length (input))
2014 re_token_t next;
2015 re_string_skip_bytes (input, 1);
2016 peek_token (&next, input, syntax);
2017 re_string_skip_bytes (input, -1);
2018 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2019 break;
2021 token->type = ANCHOR;
2022 token->opr.ctx_type = LINE_LAST;
2023 break;
2024 default:
2025 break;
2027 return 1;
2030 /* Peek a token from INPUT, and return the length of the token.
2031 We must not use this function out of bracket expressions. */
2033 static int
2034 internal_function
2035 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2037 unsigned char c;
2038 if (re_string_eoi (input))
2040 token->type = END_OF_RE;
2041 return 0;
2043 c = re_string_peek_byte (input, 0);
2044 token->opr.c = c;
2046 #ifdef RE_ENABLE_I18N
2047 if (input->mb_cur_max > 1 &&
2048 !re_string_first_byte (input, re_string_cur_idx (input)))
2050 token->type = CHARACTER;
2051 return 1;
2053 #endif /* RE_ENABLE_I18N */
2055 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2056 && re_string_cur_idx (input) + 1 < re_string_length (input))
2058 /* In this case, '\' escape a character. */
2059 unsigned char c2;
2060 re_string_skip_bytes (input, 1);
2061 c2 = re_string_peek_byte (input, 0);
2062 token->opr.c = c2;
2063 token->type = CHARACTER;
2064 return 1;
2066 if (c == '[') /* '[' is a special char in a bracket exps. */
2068 unsigned char c2;
2069 int token_len;
2070 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2071 c2 = re_string_peek_byte (input, 1);
2072 else
2073 c2 = 0;
2074 token->opr.c = c2;
2075 token_len = 2;
2076 switch (c2)
2078 case '.':
2079 token->type = OP_OPEN_COLL_ELEM;
2080 break;
2081 case '=':
2082 token->type = OP_OPEN_EQUIV_CLASS;
2083 break;
2084 case ':':
2085 if (syntax & RE_CHAR_CLASSES)
2087 token->type = OP_OPEN_CHAR_CLASS;
2088 break;
2090 /* else fall through. */
2091 default:
2092 token->type = CHARACTER;
2093 token->opr.c = c;
2094 token_len = 1;
2095 break;
2097 return token_len;
2099 switch (c)
2101 case '-':
2102 token->type = OP_CHARSET_RANGE;
2103 break;
2104 case ']':
2105 token->type = OP_CLOSE_BRACKET;
2106 break;
2107 case '^':
2108 token->type = OP_NON_MATCH_LIST;
2109 break;
2110 default:
2111 token->type = CHARACTER;
2113 return 1;
2116 /* Functions for parser. */
2118 /* Entry point of the parser.
2119 Parse the regular expression REGEXP and return the structure tree.
2120 If an error occurs, ERR is set by error code, and return NULL.
2121 This function build the following tree, from regular expression <reg_exp>:
2125 <reg_exp> EOR
2127 CAT means concatenation.
2128 EOR means end of regular expression. */
2130 static bin_tree_t *
2131 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2132 reg_errcode_t *err)
2134 re_dfa_t *dfa = preg->buffer;
2135 bin_tree_t *tree, *eor, *root;
2136 re_token_t current_token;
2137 dfa->syntax = syntax;
2138 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2139 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2140 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2141 return NULL;
2142 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2143 if (tree != NULL)
2144 root = create_tree (dfa, tree, eor, CONCAT);
2145 else
2146 root = eor;
2147 if (BE (eor == NULL || root == NULL, 0))
2149 *err = REG_ESPACE;
2150 return NULL;
2152 return root;
2155 /* This function build the following tree, from regular expression
2156 <branch1>|<branch2>:
2160 <branch1> <branch2>
2162 ALT means alternative, which represents the operator '|'. */
2164 static bin_tree_t *
2165 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2166 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2168 re_dfa_t *dfa = preg->buffer;
2169 bin_tree_t *tree, *branch = NULL;
2170 bitset_word_t initial_bkref_map = dfa->completed_bkref_map;
2171 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2172 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2173 return NULL;
2175 while (token->type == OP_ALT)
2177 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2178 if (token->type != OP_ALT && token->type != END_OF_RE
2179 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2181 bitset_word_t accumulated_bkref_map = dfa->completed_bkref_map;
2182 dfa->completed_bkref_map = initial_bkref_map;
2183 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2184 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2186 if (tree != NULL)
2187 postorder (tree, free_tree, NULL);
2188 return NULL;
2190 dfa->completed_bkref_map |= accumulated_bkref_map;
2192 else
2193 branch = NULL;
2194 tree = create_tree (dfa, tree, branch, OP_ALT);
2195 if (BE (tree == NULL, 0))
2197 *err = REG_ESPACE;
2198 return NULL;
2201 return tree;
2204 /* This function build the following tree, from regular expression
2205 <exp1><exp2>:
2209 <exp1> <exp2>
2211 CAT means concatenation. */
2213 static bin_tree_t *
2214 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2215 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2217 bin_tree_t *tree, *expr;
2218 re_dfa_t *dfa = preg->buffer;
2219 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2220 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2221 return NULL;
2223 while (token->type != OP_ALT && token->type != END_OF_RE
2224 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2226 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2227 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2229 if (tree != NULL)
2230 postorder (tree, free_tree, NULL);
2231 return NULL;
2233 if (tree != NULL && expr != NULL)
2235 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2236 if (newtree == NULL)
2238 postorder (expr, free_tree, NULL);
2239 postorder (tree, free_tree, NULL);
2240 *err = REG_ESPACE;
2241 return NULL;
2243 tree = newtree;
2245 else if (tree == NULL)
2246 tree = expr;
2247 /* Otherwise expr == NULL, we don't need to create new tree. */
2249 return tree;
2252 /* This function build the following tree, from regular expression a*:
2258 static bin_tree_t *
2259 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2260 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2262 re_dfa_t *dfa = preg->buffer;
2263 bin_tree_t *tree;
2264 switch (token->type)
2266 case CHARACTER:
2267 tree = create_token_tree (dfa, NULL, NULL, token);
2268 if (BE (tree == NULL, 0))
2270 *err = REG_ESPACE;
2271 return NULL;
2273 #ifdef RE_ENABLE_I18N
2274 if (dfa->mb_cur_max > 1)
2276 while (!re_string_eoi (regexp)
2277 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2279 bin_tree_t *mbc_remain;
2280 fetch_token (token, regexp, syntax);
2281 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2282 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2283 if (BE (mbc_remain == NULL || tree == NULL, 0))
2285 *err = REG_ESPACE;
2286 return NULL;
2290 #endif
2291 break;
2292 case OP_OPEN_SUBEXP:
2293 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2294 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2295 return NULL;
2296 break;
2297 case OP_OPEN_BRACKET:
2298 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2299 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2300 return NULL;
2301 break;
2302 case OP_BACK_REF:
2303 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2305 *err = REG_ESUBREG;
2306 return NULL;
2308 dfa->used_bkref_map |= 1 << token->opr.idx;
2309 tree = create_token_tree (dfa, NULL, NULL, token);
2310 if (BE (tree == NULL, 0))
2312 *err = REG_ESPACE;
2313 return NULL;
2315 ++dfa->nbackref;
2316 dfa->has_mb_node = 1;
2317 break;
2318 case OP_OPEN_DUP_NUM:
2319 if (syntax & RE_CONTEXT_INVALID_DUP)
2321 *err = REG_BADRPT;
2322 return NULL;
2324 /* FALLTHROUGH */
2325 case OP_DUP_ASTERISK:
2326 case OP_DUP_PLUS:
2327 case OP_DUP_QUESTION:
2328 if (syntax & RE_CONTEXT_INVALID_OPS)
2330 *err = REG_BADRPT;
2331 return NULL;
2333 else if (syntax & RE_CONTEXT_INDEP_OPS)
2335 fetch_token (token, regexp, syntax);
2336 return parse_expression (regexp, preg, token, syntax, nest, err);
2338 /* else fall through */
2339 case OP_CLOSE_SUBEXP:
2340 if ((token->type == OP_CLOSE_SUBEXP) &&
2341 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2343 *err = REG_ERPAREN;
2344 return NULL;
2346 /* else fall through */
2347 case OP_CLOSE_DUP_NUM:
2348 /* We treat it as a normal character. */
2350 /* Then we can these characters as normal characters. */
2351 token->type = CHARACTER;
2352 /* mb_partial and word_char bits should be initialized already
2353 by peek_token. */
2354 tree = create_token_tree (dfa, NULL, NULL, token);
2355 if (BE (tree == NULL, 0))
2357 *err = REG_ESPACE;
2358 return NULL;
2360 break;
2361 case ANCHOR:
2362 if ((token->opr.ctx_type
2363 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2364 && dfa->word_ops_used == 0)
2365 init_word_char (dfa);
2366 if (token->opr.ctx_type == WORD_DELIM
2367 || token->opr.ctx_type == NOT_WORD_DELIM)
2369 bin_tree_t *tree_first, *tree_last;
2370 if (token->opr.ctx_type == WORD_DELIM)
2372 token->opr.ctx_type = WORD_FIRST;
2373 tree_first = create_token_tree (dfa, NULL, NULL, token);
2374 token->opr.ctx_type = WORD_LAST;
2376 else
2378 token->opr.ctx_type = INSIDE_WORD;
2379 tree_first = create_token_tree (dfa, NULL, NULL, token);
2380 token->opr.ctx_type = INSIDE_NOTWORD;
2382 tree_last = create_token_tree (dfa, NULL, NULL, token);
2383 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2384 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2386 *err = REG_ESPACE;
2387 return NULL;
2390 else
2392 tree = create_token_tree (dfa, NULL, NULL, token);
2393 if (BE (tree == NULL, 0))
2395 *err = REG_ESPACE;
2396 return NULL;
2399 /* We must return here, since ANCHORs can't be followed
2400 by repetition operators.
2401 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2402 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2403 fetch_token (token, regexp, syntax);
2404 return tree;
2405 case OP_PERIOD:
2406 tree = create_token_tree (dfa, NULL, NULL, token);
2407 if (BE (tree == NULL, 0))
2409 *err = REG_ESPACE;
2410 return NULL;
2412 if (dfa->mb_cur_max > 1)
2413 dfa->has_mb_node = 1;
2414 break;
2415 case OP_WORD:
2416 case OP_NOTWORD:
2417 tree = build_charclass_op (dfa, regexp->trans,
2418 "alnum",
2419 "_",
2420 token->type == OP_NOTWORD, err);
2421 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2422 return NULL;
2423 break;
2424 case OP_SPACE:
2425 case OP_NOTSPACE:
2426 tree = build_charclass_op (dfa, regexp->trans,
2427 "space",
2429 token->type == OP_NOTSPACE, err);
2430 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2431 return NULL;
2432 break;
2433 case OP_ALT:
2434 case END_OF_RE:
2435 return NULL;
2436 case BACK_SLASH:
2437 *err = REG_EESCAPE;
2438 return NULL;
2439 default:
2440 /* Must not happen? */
2441 #ifdef DEBUG
2442 assert (0);
2443 #endif
2444 return NULL;
2446 fetch_token (token, regexp, syntax);
2448 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2449 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2451 bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token,
2452 syntax, err);
2453 if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2455 if (tree != NULL)
2456 postorder (tree, free_tree, NULL);
2457 return NULL;
2459 tree = dup_tree;
2460 /* In BRE consecutive duplications are not allowed. */
2461 if ((syntax & RE_CONTEXT_INVALID_DUP)
2462 && (token->type == OP_DUP_ASTERISK
2463 || token->type == OP_OPEN_DUP_NUM))
2465 if (tree != NULL)
2466 postorder (tree, free_tree, NULL);
2467 *err = REG_BADRPT;
2468 return NULL;
2472 return tree;
2475 /* This function build the following tree, from regular expression
2476 (<reg_exp>):
2477 SUBEXP
2479 <reg_exp>
2482 static bin_tree_t *
2483 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2484 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2486 re_dfa_t *dfa = preg->buffer;
2487 bin_tree_t *tree;
2488 size_t cur_nsub;
2489 cur_nsub = preg->re_nsub++;
2491 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2493 /* The subexpression may be a null string. */
2494 if (token->type == OP_CLOSE_SUBEXP)
2495 tree = NULL;
2496 else
2498 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2499 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2501 if (tree != NULL)
2502 postorder (tree, free_tree, NULL);
2503 *err = REG_EPAREN;
2505 if (BE (*err != REG_NOERROR, 0))
2506 return NULL;
2509 if (cur_nsub <= '9' - '1')
2510 dfa->completed_bkref_map |= 1 << cur_nsub;
2512 tree = create_tree (dfa, tree, NULL, SUBEXP);
2513 if (BE (tree == NULL, 0))
2515 *err = REG_ESPACE;
2516 return NULL;
2518 tree->token.opr.idx = cur_nsub;
2519 return tree;
2522 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2524 static bin_tree_t *
2525 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2526 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2528 bin_tree_t *tree = NULL, *old_tree = NULL;
2529 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2530 re_token_t start_token = *token;
2532 if (token->type == OP_OPEN_DUP_NUM)
2534 end = 0;
2535 start = fetch_number (regexp, token, syntax);
2536 if (start == -1)
2538 if (token->type == CHARACTER && token->opr.c == ',')
2539 start = 0; /* We treat "{,m}" as "{0,m}". */
2540 else
2542 *err = REG_BADBR; /* <re>{} is invalid. */
2543 return NULL;
2546 if (BE (start != -2, 1))
2548 /* We treat "{n}" as "{n,n}". */
2549 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2550 : ((token->type == CHARACTER && token->opr.c == ',')
2551 ? fetch_number (regexp, token, syntax) : -2));
2553 if (BE (start == -2 || end == -2, 0))
2555 /* Invalid sequence. */
2556 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2558 if (token->type == END_OF_RE)
2559 *err = REG_EBRACE;
2560 else
2561 *err = REG_BADBR;
2563 return NULL;
2566 /* If the syntax bit is set, rollback. */
2567 re_string_set_index (regexp, start_idx);
2568 *token = start_token;
2569 token->type = CHARACTER;
2570 /* mb_partial and word_char bits should be already initialized by
2571 peek_token. */
2572 return elem;
2575 if (BE ((end != -1 && start > end)
2576 || token->type != OP_CLOSE_DUP_NUM, 0))
2578 /* First number greater than second. */
2579 *err = REG_BADBR;
2580 return NULL;
2583 if (BE (RE_DUP_MAX < (end == -1 ? start : end), 0))
2585 *err = REG_ESIZE;
2586 return NULL;
2589 else
2591 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2592 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2595 fetch_token (token, regexp, syntax);
2597 if (BE (elem == NULL, 0))
2598 return NULL;
2599 if (BE (start == 0 && end == 0, 0))
2601 postorder (elem, free_tree, NULL);
2602 return NULL;
2605 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2606 if (BE (start > 0, 0))
2608 tree = elem;
2609 for (i = 2; i <= start; ++i)
2611 elem = duplicate_tree (elem, dfa);
2612 tree = create_tree (dfa, tree, elem, CONCAT);
2613 if (BE (elem == NULL || tree == NULL, 0))
2614 goto parse_dup_op_espace;
2617 if (start == end)
2618 return tree;
2620 /* Duplicate ELEM before it is marked optional. */
2621 elem = duplicate_tree (elem, dfa);
2622 if (BE (elem == NULL, 0))
2623 goto parse_dup_op_espace;
2624 old_tree = tree;
2626 else
2627 old_tree = NULL;
2629 if (elem->token.type == SUBEXP)
2631 uintptr_t subidx = elem->token.opr.idx;
2632 postorder (elem, mark_opt_subexp, (void *) subidx);
2635 tree = create_tree (dfa, elem, NULL,
2636 (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2637 if (BE (tree == NULL, 0))
2638 goto parse_dup_op_espace;
2640 /* From gnulib's "intprops.h":
2641 True if the arithmetic type T is signed. */
2642 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2644 /* This loop is actually executed only when end != -1,
2645 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2646 already created the start+1-th copy. */
2647 if (TYPE_SIGNED (Idx) || end != -1)
2648 for (i = start + 2; i <= end; ++i)
2650 elem = duplicate_tree (elem, dfa);
2651 tree = create_tree (dfa, tree, elem, CONCAT);
2652 if (BE (elem == NULL || tree == NULL, 0))
2653 goto parse_dup_op_espace;
2655 tree = create_tree (dfa, tree, NULL, OP_ALT);
2656 if (BE (tree == NULL, 0))
2657 goto parse_dup_op_espace;
2660 if (old_tree)
2661 tree = create_tree (dfa, old_tree, tree, CONCAT);
2663 return tree;
2665 parse_dup_op_espace:
2666 *err = REG_ESPACE;
2667 return NULL;
2670 /* Size of the names for collating symbol/equivalence_class/character_class.
2671 I'm not sure, but maybe enough. */
2672 #define BRACKET_NAME_BUF_SIZE 32
2674 #ifndef _LIBC
2676 # ifdef RE_ENABLE_I18N
2677 /* Convert the byte B to the corresponding wide character. In a
2678 unibyte locale, treat B as itself if it is an encoding error.
2679 In a multibyte locale, return WEOF if B is an encoding error. */
2680 static wint_t
2681 parse_byte (unsigned char b, re_charset_t *mbcset)
2683 wint_t wc = __btowc (b);
2684 return wc == WEOF && !mbcset ? b : wc;
2686 #endif
2688 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2689 Build the range expression which starts from START_ELEM, and ends
2690 at END_ELEM. The result are written to MBCSET and SBCSET.
2691 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2692 mbcset->range_ends, is a pointer argument since we may
2693 update it. */
2695 static reg_errcode_t
2696 internal_function
2697 # ifdef RE_ENABLE_I18N
2698 build_range_exp (const reg_syntax_t syntax,
2699 bitset_t sbcset,
2700 re_charset_t *mbcset,
2701 Idx *range_alloc,
2702 const bracket_elem_t *start_elem,
2703 const bracket_elem_t *end_elem)
2704 # else /* not RE_ENABLE_I18N */
2705 build_range_exp (const reg_syntax_t syntax,
2706 bitset_t sbcset,
2707 const bracket_elem_t *start_elem,
2708 const bracket_elem_t *end_elem)
2709 # endif /* not RE_ENABLE_I18N */
2711 unsigned int start_ch, end_ch;
2712 /* Equivalence Classes and Character Classes can't be a range start/end. */
2713 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2714 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2716 return REG_ERANGE;
2718 /* We can handle no multi character collating elements without libc
2719 support. */
2720 if (BE ((start_elem->type == COLL_SYM
2721 && strlen ((char *) start_elem->opr.name) > 1)
2722 || (end_elem->type == COLL_SYM
2723 && strlen ((char *) end_elem->opr.name) > 1), 0))
2724 return REG_ECOLLATE;
2726 # ifdef RE_ENABLE_I18N
2728 wchar_t wc;
2729 wint_t start_wc;
2730 wint_t end_wc;
2732 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2733 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2734 : 0));
2735 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2736 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2737 : 0));
2738 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2739 ? parse_byte (start_ch, mbcset) : start_elem->opr.wch);
2740 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2741 ? parse_byte (end_ch, mbcset) : end_elem->opr.wch);
2742 if (start_wc == WEOF || end_wc == WEOF)
2743 return REG_ECOLLATE;
2744 else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2745 return REG_ERANGE;
2747 /* Got valid collation sequence values, add them as a new entry.
2748 However, for !_LIBC we have no collation elements: if the
2749 character set is single byte, the single byte character set
2750 that we build below suffices. parse_bracket_exp passes
2751 no MBCSET if dfa->mb_cur_max == 1. */
2752 if (mbcset)
2754 /* Check the space of the arrays. */
2755 if (BE (*range_alloc == mbcset->nranges, 0))
2757 /* There is not enough space, need realloc. */
2758 wchar_t *new_array_start, *new_array_end;
2759 Idx new_nranges;
2761 /* +1 in case of mbcset->nranges is 0. */
2762 new_nranges = 2 * mbcset->nranges + 1;
2763 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2764 are NULL if *range_alloc == 0. */
2765 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2766 new_nranges);
2767 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2768 new_nranges);
2770 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2772 re_free (new_array_start);
2773 re_free (new_array_end);
2774 return REG_ESPACE;
2777 mbcset->range_starts = new_array_start;
2778 mbcset->range_ends = new_array_end;
2779 *range_alloc = new_nranges;
2782 mbcset->range_starts[mbcset->nranges] = start_wc;
2783 mbcset->range_ends[mbcset->nranges++] = end_wc;
2786 /* Build the table for single byte characters. */
2787 for (wc = 0; wc < SBC_MAX; ++wc)
2789 if (start_wc <= wc && wc <= end_wc)
2790 bitset_set (sbcset, wc);
2793 # else /* not RE_ENABLE_I18N */
2795 unsigned int ch;
2796 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2797 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2798 : 0));
2799 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2800 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2801 : 0));
2802 if (start_ch > end_ch)
2803 return REG_ERANGE;
2804 /* Build the table for single byte characters. */
2805 for (ch = 0; ch < SBC_MAX; ++ch)
2806 if (start_ch <= ch && ch <= end_ch)
2807 bitset_set (sbcset, ch);
2809 # endif /* not RE_ENABLE_I18N */
2810 return REG_NOERROR;
2812 #endif /* not _LIBC */
2814 #ifndef _LIBC
2815 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2816 Build the collating element which is represented by NAME.
2817 The result are written to MBCSET and SBCSET.
2818 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2819 pointer argument since we may update it. */
2821 static reg_errcode_t
2822 internal_function
2823 # ifdef RE_ENABLE_I18N
2824 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2825 Idx *coll_sym_alloc, const unsigned char *name)
2826 # else /* not RE_ENABLE_I18N */
2827 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2828 # endif /* not RE_ENABLE_I18N */
2830 size_t name_len = strlen ((const char *) name);
2831 if (BE (name_len != 1, 0))
2832 return REG_ECOLLATE;
2833 else
2835 bitset_set (sbcset, name[0]);
2836 return REG_NOERROR;
2839 #endif /* not _LIBC */
2841 /* This function parse bracket expression like "[abc]", "[a-c]",
2842 "[[.a-a.]]" etc. */
2844 static bin_tree_t *
2845 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2846 reg_syntax_t syntax, reg_errcode_t *err)
2848 #ifdef _LIBC
2849 const unsigned char *collseqmb;
2850 const char *collseqwc;
2851 uint32_t nrules;
2852 int32_t table_size;
2853 const int32_t *symb_table;
2854 const unsigned char *extra;
2856 /* Local function for parse_bracket_exp used in _LIBC environment.
2857 Seek the collating symbol entry corresponding to NAME.
2858 Return the index of the symbol in the SYMB_TABLE,
2859 or -1 if not found. */
2861 auto inline int32_t
2862 __attribute__ ((always_inline))
2863 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2865 int32_t elem;
2867 for (elem = 0; elem < table_size; elem++)
2868 if (symb_table[2 * elem] != 0)
2870 int32_t idx = symb_table[2 * elem + 1];
2871 /* Skip the name of collating element name. */
2872 idx += 1 + extra[idx];
2873 if (/* Compare the length of the name. */
2874 name_len == extra[idx]
2875 /* Compare the name. */
2876 && memcmp (name, &extra[idx + 1], name_len) == 0)
2877 /* Yep, this is the entry. */
2878 return elem;
2880 return -1;
2883 /* Local function for parse_bracket_exp used in _LIBC environment.
2884 Look up the collation sequence value of BR_ELEM.
2885 Return the value if succeeded, UINT_MAX otherwise. */
2887 auto inline unsigned int
2888 __attribute__ ((always_inline))
2889 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2891 if (br_elem->type == SB_CHAR)
2894 if (MB_CUR_MAX == 1)
2896 if (nrules == 0)
2897 return collseqmb[br_elem->opr.ch];
2898 else
2900 wint_t wc = __btowc (br_elem->opr.ch);
2901 return __collseq_table_lookup (collseqwc, wc);
2904 else if (br_elem->type == MB_CHAR)
2906 if (nrules != 0)
2907 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2909 else if (br_elem->type == COLL_SYM)
2911 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2912 if (nrules != 0)
2914 int32_t elem, idx;
2915 elem = seek_collating_symbol_entry (br_elem->opr.name,
2916 sym_name_len);
2917 if (elem != -1)
2919 /* We found the entry. */
2920 idx = symb_table[2 * elem + 1];
2921 /* Skip the name of collating element name. */
2922 idx += 1 + extra[idx];
2923 /* Skip the byte sequence of the collating element. */
2924 idx += 1 + extra[idx];
2925 /* Adjust for the alignment. */
2926 idx = (idx + 3) & ~3;
2927 /* Skip the multibyte collation sequence value. */
2928 idx += sizeof (unsigned int);
2929 /* Skip the wide char sequence of the collating element. */
2930 idx += sizeof (unsigned int) *
2931 (1 + *(unsigned int *) (extra + idx));
2932 /* Return the collation sequence value. */
2933 return *(unsigned int *) (extra + idx);
2935 else if (sym_name_len == 1)
2937 /* No valid character. Match it as a single byte
2938 character. */
2939 return collseqmb[br_elem->opr.name[0]];
2942 else if (sym_name_len == 1)
2943 return collseqmb[br_elem->opr.name[0]];
2945 return UINT_MAX;
2948 /* Local function for parse_bracket_exp used in _LIBC environment.
2949 Build the range expression which starts from START_ELEM, and ends
2950 at END_ELEM. The result are written to MBCSET and SBCSET.
2951 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2952 mbcset->range_ends, is a pointer argument since we may
2953 update it. */
2955 auto inline reg_errcode_t
2956 __attribute__ ((always_inline))
2957 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2958 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2960 unsigned int ch;
2961 uint32_t start_collseq;
2962 uint32_t end_collseq;
2964 /* Equivalence Classes and Character Classes can't be a range
2965 start/end. */
2966 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2967 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2969 return REG_ERANGE;
2971 /* FIXME: Implement rational ranges here, too. */
2972 start_collseq = lookup_collation_sequence_value (start_elem);
2973 end_collseq = lookup_collation_sequence_value (end_elem);
2974 /* Check start/end collation sequence values. */
2975 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2976 return REG_ECOLLATE;
2977 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2978 return REG_ERANGE;
2980 /* Got valid collation sequence values, add them as a new entry.
2981 However, if we have no collation elements, and the character set
2982 is single byte, the single byte character set that we
2983 build below suffices. */
2984 if (nrules > 0 || dfa->mb_cur_max > 1)
2986 /* Check the space of the arrays. */
2987 if (BE (*range_alloc == mbcset->nranges, 0))
2989 /* There is not enough space, need realloc. */
2990 uint32_t *new_array_start;
2991 uint32_t *new_array_end;
2992 Idx new_nranges;
2994 /* +1 in case of mbcset->nranges is 0. */
2995 new_nranges = 2 * mbcset->nranges + 1;
2996 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2997 new_nranges);
2998 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2999 new_nranges);
3001 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3002 return REG_ESPACE;
3004 mbcset->range_starts = new_array_start;
3005 mbcset->range_ends = new_array_end;
3006 *range_alloc = new_nranges;
3009 mbcset->range_starts[mbcset->nranges] = start_collseq;
3010 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3013 /* Build the table for single byte characters. */
3014 for (ch = 0; ch < SBC_MAX; ch++)
3016 uint32_t ch_collseq;
3018 if (MB_CUR_MAX == 1)
3020 if (nrules == 0)
3021 ch_collseq = collseqmb[ch];
3022 else
3023 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3024 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3025 bitset_set (sbcset, ch);
3027 return REG_NOERROR;
3030 /* Local function for parse_bracket_exp used in _LIBC environment.
3031 Build the collating element which is represented by NAME.
3032 The result are written to MBCSET and SBCSET.
3033 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3034 pointer argument since we may update it. */
3036 auto inline reg_errcode_t
3037 __attribute__ ((always_inline))
3038 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3039 Idx *coll_sym_alloc, const unsigned char *name)
3041 int32_t elem, idx;
3042 size_t name_len = strlen ((const char *) name);
3043 if (nrules != 0)
3045 elem = seek_collating_symbol_entry (name, name_len);
3046 if (elem != -1)
3048 /* We found the entry. */
3049 idx = symb_table[2 * elem + 1];
3050 /* Skip the name of collating element name. */
3051 idx += 1 + extra[idx];
3053 else if (name_len == 1)
3055 /* No valid character, treat it as a normal
3056 character. */
3057 bitset_set (sbcset, name[0]);
3058 return REG_NOERROR;
3060 else
3061 return REG_ECOLLATE;
3063 /* Got valid collation sequence, add it as a new entry. */
3064 /* Check the space of the arrays. */
3065 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3067 /* Not enough, realloc it. */
3068 /* +1 in case of mbcset->ncoll_syms is 0. */
3069 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3070 /* Use realloc since mbcset->coll_syms is NULL
3071 if *alloc == 0. */
3072 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3073 new_coll_sym_alloc);
3074 if (BE (new_coll_syms == NULL, 0))
3075 return REG_ESPACE;
3076 mbcset->coll_syms = new_coll_syms;
3077 *coll_sym_alloc = new_coll_sym_alloc;
3079 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3080 return REG_NOERROR;
3082 else
3084 if (BE (name_len != 1, 0))
3085 return REG_ECOLLATE;
3086 else
3088 bitset_set (sbcset, name[0]);
3089 return REG_NOERROR;
3093 #endif
3095 re_token_t br_token;
3096 re_bitset_ptr_t sbcset;
3097 #ifdef RE_ENABLE_I18N
3098 re_charset_t *mbcset;
3099 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3100 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3101 #endif /* not RE_ENABLE_I18N */
3102 bool non_match = false;
3103 bin_tree_t *work_tree;
3104 int token_len;
3105 bool first_round = true;
3106 #ifdef _LIBC
3107 collseqmb = (const unsigned char *)
3108 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3109 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3110 if (nrules)
3113 if (MB_CUR_MAX > 1)
3115 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3116 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3117 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3118 _NL_COLLATE_SYMB_TABLEMB);
3119 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3120 _NL_COLLATE_SYMB_EXTRAMB);
3122 #endif
3123 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3124 #ifdef RE_ENABLE_I18N
3125 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3126 #endif /* RE_ENABLE_I18N */
3127 #ifdef RE_ENABLE_I18N
3128 if (BE (sbcset == NULL || mbcset == NULL, 0))
3129 #else
3130 if (BE (sbcset == NULL, 0))
3131 #endif /* RE_ENABLE_I18N */
3133 re_free (sbcset);
3134 #ifdef RE_ENABLE_I18N
3135 re_free (mbcset);
3136 #endif
3137 *err = REG_ESPACE;
3138 return NULL;
3141 token_len = peek_token_bracket (token, regexp, syntax);
3142 if (BE (token->type == END_OF_RE, 0))
3144 *err = REG_BADPAT;
3145 goto parse_bracket_exp_free_return;
3147 if (token->type == OP_NON_MATCH_LIST)
3149 #ifdef RE_ENABLE_I18N
3150 mbcset->non_match = 1;
3151 #endif /* not RE_ENABLE_I18N */
3152 non_match = true;
3153 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3154 bitset_set (sbcset, '\n');
3155 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3156 token_len = peek_token_bracket (token, regexp, syntax);
3157 if (BE (token->type == END_OF_RE, 0))
3159 *err = REG_BADPAT;
3160 goto parse_bracket_exp_free_return;
3164 /* We treat the first ']' as a normal character. */
3165 if (token->type == OP_CLOSE_BRACKET)
3166 token->type = CHARACTER;
3168 while (1)
3170 bracket_elem_t start_elem, end_elem;
3171 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3172 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3173 reg_errcode_t ret;
3174 int token_len2 = 0;
3175 bool is_range_exp = false;
3176 re_token_t token2;
3178 start_elem.opr.name = start_name_buf;
3179 start_elem.type = COLL_SYM;
3180 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3181 syntax, first_round);
3182 if (BE (ret != REG_NOERROR, 0))
3184 *err = ret;
3185 goto parse_bracket_exp_free_return;
3187 first_round = false;
3189 /* Get information about the next token. We need it in any case. */
3190 token_len = peek_token_bracket (token, regexp, syntax);
3192 /* Do not check for ranges if we know they are not allowed. */
3193 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3195 if (BE (token->type == END_OF_RE, 0))
3197 *err = REG_EBRACK;
3198 goto parse_bracket_exp_free_return;
3200 if (token->type == OP_CHARSET_RANGE)
3202 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3203 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3204 if (BE (token2.type == END_OF_RE, 0))
3206 *err = REG_EBRACK;
3207 goto parse_bracket_exp_free_return;
3209 if (token2.type == OP_CLOSE_BRACKET)
3211 /* We treat the last '-' as a normal character. */
3212 re_string_skip_bytes (regexp, -token_len);
3213 token->type = CHARACTER;
3215 else
3216 is_range_exp = true;
3220 if (is_range_exp == true)
3222 end_elem.opr.name = end_name_buf;
3223 end_elem.type = COLL_SYM;
3224 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3225 dfa, syntax, true);
3226 if (BE (ret != REG_NOERROR, 0))
3228 *err = ret;
3229 goto parse_bracket_exp_free_return;
3232 token_len = peek_token_bracket (token, regexp, syntax);
3234 #ifdef _LIBC
3235 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3236 &start_elem, &end_elem);
3237 #else
3238 # ifdef RE_ENABLE_I18N
3239 *err = build_range_exp (syntax, sbcset,
3240 dfa->mb_cur_max > 1 ? mbcset : NULL,
3241 &range_alloc, &start_elem, &end_elem);
3242 # else
3243 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3244 # endif
3245 #endif /* RE_ENABLE_I18N */
3246 if (BE (*err != REG_NOERROR, 0))
3247 goto parse_bracket_exp_free_return;
3249 else
3251 switch (start_elem.type)
3253 case SB_CHAR:
3254 bitset_set (sbcset, start_elem.opr.ch);
3255 break;
3256 #ifdef RE_ENABLE_I18N
3257 case MB_CHAR:
3258 /* Check whether the array has enough space. */
3259 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3261 wchar_t *new_mbchars;
3262 /* Not enough, realloc it. */
3263 /* +1 in case of mbcset->nmbchars is 0. */
3264 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3265 /* Use realloc since array is NULL if *alloc == 0. */
3266 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3267 mbchar_alloc);
3268 if (BE (new_mbchars == NULL, 0))
3269 goto parse_bracket_exp_espace;
3270 mbcset->mbchars = new_mbchars;
3272 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3273 break;
3274 #endif /* RE_ENABLE_I18N */
3275 case EQUIV_CLASS:
3276 *err = build_equiv_class (sbcset,
3277 #ifdef RE_ENABLE_I18N
3278 mbcset, &equiv_class_alloc,
3279 #endif /* RE_ENABLE_I18N */
3280 start_elem.opr.name);
3281 if (BE (*err != REG_NOERROR, 0))
3282 goto parse_bracket_exp_free_return;
3283 break;
3284 case COLL_SYM:
3285 *err = build_collating_symbol (sbcset,
3286 #ifdef RE_ENABLE_I18N
3287 mbcset, &coll_sym_alloc,
3288 #endif /* RE_ENABLE_I18N */
3289 start_elem.opr.name);
3290 if (BE (*err != REG_NOERROR, 0))
3291 goto parse_bracket_exp_free_return;
3292 break;
3293 case CHAR_CLASS:
3294 *err = build_charclass (regexp->trans, sbcset,
3295 #ifdef RE_ENABLE_I18N
3296 mbcset, &char_class_alloc,
3297 #endif /* RE_ENABLE_I18N */
3298 (const char *) start_elem.opr.name,
3299 syntax);
3300 if (BE (*err != REG_NOERROR, 0))
3301 goto parse_bracket_exp_free_return;
3302 break;
3303 default:
3304 assert (0);
3305 break;
3308 if (BE (token->type == END_OF_RE, 0))
3310 *err = REG_EBRACK;
3311 goto parse_bracket_exp_free_return;
3313 if (token->type == OP_CLOSE_BRACKET)
3314 break;
3317 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3319 /* If it is non-matching list. */
3320 if (non_match)
3321 bitset_not (sbcset);
3323 #ifdef RE_ENABLE_I18N
3324 /* Ensure only single byte characters are set. */
3325 if (dfa->mb_cur_max > 1)
3326 bitset_mask (sbcset, dfa->sb_char);
3328 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3329 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3330 || mbcset->non_match)))
3332 bin_tree_t *mbc_tree;
3333 int sbc_idx;
3334 /* Build a tree for complex bracket. */
3335 dfa->has_mb_node = 1;
3336 br_token.type = COMPLEX_BRACKET;
3337 br_token.opr.mbcset = mbcset;
3338 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3339 if (BE (mbc_tree == NULL, 0))
3340 goto parse_bracket_exp_espace;
3341 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3342 if (sbcset[sbc_idx])
3343 break;
3344 /* If there are no bits set in sbcset, there is no point
3345 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3346 if (sbc_idx < BITSET_WORDS)
3348 /* Build a tree for simple bracket. */
3349 br_token.type = SIMPLE_BRACKET;
3350 br_token.opr.sbcset = sbcset;
3351 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3352 if (BE (work_tree == NULL, 0))
3353 goto parse_bracket_exp_espace;
3355 /* Then join them by ALT node. */
3356 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3357 if (BE (work_tree == NULL, 0))
3358 goto parse_bracket_exp_espace;
3360 else
3362 re_free (sbcset);
3363 work_tree = mbc_tree;
3366 else
3367 #endif /* not RE_ENABLE_I18N */
3369 #ifdef RE_ENABLE_I18N
3370 free_charset (mbcset);
3371 #endif
3372 /* Build a tree for simple bracket. */
3373 br_token.type = SIMPLE_BRACKET;
3374 br_token.opr.sbcset = sbcset;
3375 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3376 if (BE (work_tree == NULL, 0))
3377 goto parse_bracket_exp_espace;
3379 return work_tree;
3381 parse_bracket_exp_espace:
3382 *err = REG_ESPACE;
3383 parse_bracket_exp_free_return:
3384 re_free (sbcset);
3385 #ifdef RE_ENABLE_I18N
3386 free_charset (mbcset);
3387 #endif /* RE_ENABLE_I18N */
3388 return NULL;
3391 /* Parse an element in the bracket expression. */
3393 static reg_errcode_t
3394 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3395 re_token_t *token, int token_len, re_dfa_t *dfa,
3396 reg_syntax_t syntax, bool accept_hyphen)
3398 #ifdef RE_ENABLE_I18N
3399 int cur_char_size;
3400 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3401 if (cur_char_size > 1)
3403 elem->type = MB_CHAR;
3404 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3405 re_string_skip_bytes (regexp, cur_char_size);
3406 return REG_NOERROR;
3408 #endif /* RE_ENABLE_I18N */
3409 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3410 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3411 || token->type == OP_OPEN_EQUIV_CLASS)
3412 return parse_bracket_symbol (elem, regexp, token);
3413 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3415 /* A '-' must only appear as anything but a range indicator before
3416 the closing bracket. Everything else is an error. */
3417 re_token_t token2;
3418 (void) peek_token_bracket (&token2, regexp, syntax);
3419 if (token2.type != OP_CLOSE_BRACKET)
3420 /* The actual error value is not standardized since this whole
3421 case is undefined. But ERANGE makes good sense. */
3422 return REG_ERANGE;
3424 elem->type = SB_CHAR;
3425 elem->opr.ch = token->opr.c;
3426 return REG_NOERROR;
3429 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3430 such as [:<character_class>:], [.<collating_element>.], and
3431 [=<equivalent_class>=]. */
3433 static reg_errcode_t
3434 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3435 re_token_t *token)
3437 unsigned char ch, delim = token->opr.c;
3438 int i = 0;
3439 if (re_string_eoi(regexp))
3440 return REG_EBRACK;
3441 for (;; ++i)
3443 if (i >= BRACKET_NAME_BUF_SIZE)
3444 return REG_EBRACK;
3445 if (token->type == OP_OPEN_CHAR_CLASS)
3446 ch = re_string_fetch_byte_case (regexp);
3447 else
3448 ch = re_string_fetch_byte (regexp);
3449 if (re_string_eoi(regexp))
3450 return REG_EBRACK;
3451 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3452 break;
3453 elem->opr.name[i] = ch;
3455 re_string_skip_bytes (regexp, 1);
3456 elem->opr.name[i] = '\0';
3457 switch (token->type)
3459 case OP_OPEN_COLL_ELEM:
3460 elem->type = COLL_SYM;
3461 break;
3462 case OP_OPEN_EQUIV_CLASS:
3463 elem->type = EQUIV_CLASS;
3464 break;
3465 case OP_OPEN_CHAR_CLASS:
3466 elem->type = CHAR_CLASS;
3467 break;
3468 default:
3469 break;
3471 return REG_NOERROR;
3474 /* Helper function for parse_bracket_exp.
3475 Build the equivalence class which is represented by NAME.
3476 The result are written to MBCSET and SBCSET.
3477 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3478 is a pointer argument since we may update it. */
3480 static reg_errcode_t
3481 #ifdef RE_ENABLE_I18N
3482 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3483 Idx *equiv_class_alloc, const unsigned char *name)
3484 #else /* not RE_ENABLE_I18N */
3485 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3486 #endif /* not RE_ENABLE_I18N */
3488 #ifdef _LIBC
3489 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3490 if (nrules != 0)
3492 const int32_t *table, *indirect;
3493 const unsigned char *weights, *extra, *cp;
3494 unsigned char char_buf[2];
3495 int32_t idx1, idx2;
3496 unsigned int ch;
3497 size_t len;
3498 /* Calculate the index for equivalence class. */
3499 cp = name;
3500 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3501 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3502 _NL_COLLATE_WEIGHTMB);
3503 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3504 _NL_COLLATE_EXTRAMB);
3505 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3506 _NL_COLLATE_INDIRECTMB);
3507 idx1 = findidx (table, indirect, extra, &cp, -1);
3508 if (BE (idx1 == 0 || *cp != '\0', 0))
3509 /* This isn't a valid character. */
3510 return REG_ECOLLATE;
3512 /* Build single byte matching table for this equivalence class. */
3513 len = weights[idx1 & 0xffffff];
3514 for (ch = 0; ch < SBC_MAX; ++ch)
3516 char_buf[0] = ch;
3517 cp = char_buf;
3518 idx2 = findidx (table, indirect, extra, &cp, 1);
3520 idx2 = table[ch];
3522 if (idx2 == 0)
3523 /* This isn't a valid character. */
3524 continue;
3525 /* Compare only if the length matches and the collation rule
3526 index is the same. */
3527 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3529 int cnt = 0;
3531 while (cnt <= len &&
3532 weights[(idx1 & 0xffffff) + 1 + cnt]
3533 == weights[(idx2 & 0xffffff) + 1 + cnt])
3534 ++cnt;
3536 if (cnt > len)
3537 bitset_set (sbcset, ch);
3540 /* Check whether the array has enough space. */
3541 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3543 /* Not enough, realloc it. */
3544 /* +1 in case of mbcset->nequiv_classes is 0. */
3545 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3546 /* Use realloc since the array is NULL if *alloc == 0. */
3547 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3548 int32_t,
3549 new_equiv_class_alloc);
3550 if (BE (new_equiv_classes == NULL, 0))
3551 return REG_ESPACE;
3552 mbcset->equiv_classes = new_equiv_classes;
3553 *equiv_class_alloc = new_equiv_class_alloc;
3555 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3557 else
3558 #endif /* _LIBC */
3560 if (BE (strlen ((const char *) name) != 1, 0))
3561 return REG_ECOLLATE;
3562 bitset_set (sbcset, *name);
3564 return REG_NOERROR;
3567 /* Helper function for parse_bracket_exp.
3568 Build the character class which is represented by NAME.
3569 The result are written to MBCSET and SBCSET.
3570 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3571 is a pointer argument since we may update it. */
3573 static reg_errcode_t
3574 #ifdef RE_ENABLE_I18N
3575 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3576 re_charset_t *mbcset, Idx *char_class_alloc,
3577 const char *class_name, reg_syntax_t syntax)
3578 #else /* not RE_ENABLE_I18N */
3579 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3580 const char *class_name, reg_syntax_t syntax)
3581 #endif /* not RE_ENABLE_I18N */
3583 int i;
3584 const char *name = class_name;
3586 /* In case of REG_ICASE "upper" and "lower" match the both of
3587 upper and lower cases. */
3588 if ((syntax & RE_ICASE)
3589 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3590 name = "alpha";
3592 #ifdef RE_ENABLE_I18N
3593 /* Check the space of the arrays. */
3594 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3596 /* Not enough, realloc it. */
3597 /* +1 in case of mbcset->nchar_classes is 0. */
3598 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3599 /* Use realloc since array is NULL if *alloc == 0. */
3600 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3601 new_char_class_alloc);
3602 if (BE (new_char_classes == NULL, 0))
3603 return REG_ESPACE;
3604 mbcset->char_classes = new_char_classes;
3605 *char_class_alloc = new_char_class_alloc;
3607 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3608 #endif /* RE_ENABLE_I18N */
3610 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3611 do { \
3612 if (BE (trans != NULL, 0)) \
3614 for (i = 0; i < SBC_MAX; ++i) \
3615 if (ctype_func (i)) \
3616 bitset_set (sbcset, trans[i]); \
3618 else \
3620 for (i = 0; i < SBC_MAX; ++i) \
3621 if (ctype_func (i)) \
3622 bitset_set (sbcset, i); \
3624 } while (0)
3626 if (strcmp (name, "alnum") == 0)
3627 BUILD_CHARCLASS_LOOP (isalnum);
3628 else if (strcmp (name, "cntrl") == 0)
3629 BUILD_CHARCLASS_LOOP (iscntrl);
3630 else if (strcmp (name, "lower") == 0)
3631 BUILD_CHARCLASS_LOOP (islower);
3632 else if (strcmp (name, "space") == 0)
3633 BUILD_CHARCLASS_LOOP (isspace);
3634 else if (strcmp (name, "alpha") == 0)
3635 BUILD_CHARCLASS_LOOP (isalpha);
3636 else if (strcmp (name, "digit") == 0)
3637 BUILD_CHARCLASS_LOOP (isdigit);
3638 else if (strcmp (name, "print") == 0)
3639 BUILD_CHARCLASS_LOOP (isprint);
3640 else if (strcmp (name, "upper") == 0)
3641 BUILD_CHARCLASS_LOOP (isupper);
3642 else if (strcmp (name, "blank") == 0)
3643 BUILD_CHARCLASS_LOOP (isblank);
3644 else if (strcmp (name, "graph") == 0)
3645 BUILD_CHARCLASS_LOOP (isgraph);
3646 else if (strcmp (name, "punct") == 0)
3647 BUILD_CHARCLASS_LOOP (ispunct);
3648 else if (strcmp (name, "xdigit") == 0)
3649 BUILD_CHARCLASS_LOOP (isxdigit);
3650 else
3651 return REG_ECTYPE;
3653 return REG_NOERROR;
3656 static bin_tree_t *
3657 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3658 const char *class_name,
3659 const char *extra, bool non_match,
3660 reg_errcode_t *err)
3662 re_bitset_ptr_t sbcset;
3663 #ifdef RE_ENABLE_I18N
3664 re_charset_t *mbcset;
3665 Idx alloc = 0;
3666 #endif /* not RE_ENABLE_I18N */
3667 reg_errcode_t ret;
3668 re_token_t br_token;
3669 bin_tree_t *tree;
3671 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3672 if (BE (sbcset == NULL, 0))
3674 *err = REG_ESPACE;
3675 return NULL;
3677 #ifdef RE_ENABLE_I18N
3678 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3679 if (BE (mbcset == NULL, 0))
3681 re_free (sbcset);
3682 *err = REG_ESPACE;
3683 return NULL;
3685 mbcset->non_match = non_match;
3686 #endif /* RE_ENABLE_I18N */
3688 /* We don't care the syntax in this case. */
3689 ret = build_charclass (trans, sbcset,
3690 #ifdef RE_ENABLE_I18N
3691 mbcset, &alloc,
3692 #endif /* RE_ENABLE_I18N */
3693 class_name, 0);
3695 if (BE (ret != REG_NOERROR, 0))
3697 re_free (sbcset);
3698 #ifdef RE_ENABLE_I18N
3699 free_charset (mbcset);
3700 #endif /* RE_ENABLE_I18N */
3701 *err = ret;
3702 return NULL;
3704 /* \w match '_' also. */
3705 for (; *extra; extra++)
3706 bitset_set (sbcset, *extra);
3708 /* If it is non-matching list. */
3709 if (non_match)
3710 bitset_not (sbcset);
3712 #ifdef RE_ENABLE_I18N
3713 /* Ensure only single byte characters are set. */
3714 if (dfa->mb_cur_max > 1)
3715 bitset_mask (sbcset, dfa->sb_char);
3716 #endif
3718 /* Build a tree for simple bracket. */
3719 #if defined GCC_LINT || defined lint
3720 memset (&br_token, 0, sizeof br_token);
3721 #endif
3722 br_token.type = SIMPLE_BRACKET;
3723 br_token.opr.sbcset = sbcset;
3724 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3725 if (BE (tree == NULL, 0))
3726 goto build_word_op_espace;
3728 #ifdef RE_ENABLE_I18N
3729 if (dfa->mb_cur_max > 1)
3731 bin_tree_t *mbc_tree;
3732 /* Build a tree for complex bracket. */
3733 br_token.type = COMPLEX_BRACKET;
3734 br_token.opr.mbcset = mbcset;
3735 dfa->has_mb_node = 1;
3736 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3737 if (BE (mbc_tree == NULL, 0))
3738 goto build_word_op_espace;
3739 /* Then join them by ALT node. */
3740 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3741 if (BE (mbc_tree != NULL, 1))
3742 return tree;
3744 else
3746 free_charset (mbcset);
3747 return tree;
3749 #else /* not RE_ENABLE_I18N */
3750 return tree;
3751 #endif /* not RE_ENABLE_I18N */
3753 build_word_op_espace:
3754 re_free (sbcset);
3755 #ifdef RE_ENABLE_I18N
3756 free_charset (mbcset);
3757 #endif /* RE_ENABLE_I18N */
3758 *err = REG_ESPACE;
3759 return NULL;
3762 /* This is intended for the expressions like "a{1,3}".
3763 Fetch a number from 'input', and return the number.
3764 Return -1 if the number field is empty like "{,1}".
3765 Return RE_DUP_MAX + 1 if the number field is too large.
3766 Return -2 if an error occurred. */
3768 static Idx
3769 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3771 Idx num = -1;
3772 unsigned char c;
3773 while (1)
3775 fetch_token (token, input, syntax);
3776 c = token->opr.c;
3777 if (BE (token->type == END_OF_RE, 0))
3778 return -2;
3779 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3780 break;
3781 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3782 ? -2
3783 : num == -1
3784 ? c - '0'
3785 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3787 return num;
3790 #ifdef RE_ENABLE_I18N
3791 static void
3792 free_charset (re_charset_t *cset)
3794 re_free (cset->mbchars);
3795 # ifdef _LIBC
3796 re_free (cset->coll_syms);
3797 re_free (cset->equiv_classes);
3798 re_free (cset->range_starts);
3799 re_free (cset->range_ends);
3800 # endif
3801 re_free (cset->char_classes);
3802 re_free (cset);
3804 #endif /* RE_ENABLE_I18N */
3806 /* Functions for binary tree operation. */
3808 /* Create a tree node. */
3810 static bin_tree_t *
3811 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3812 re_token_type_t type)
3814 re_token_t t;
3815 #if defined GCC_LINT || defined lint
3816 memset (&t, 0, sizeof t);
3817 #endif
3818 t.type = type;
3819 return create_token_tree (dfa, left, right, &t);
3822 static bin_tree_t *
3823 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3824 const re_token_t *token)
3826 bin_tree_t *tree;
3827 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3829 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3831 if (storage == NULL)
3832 return NULL;
3833 storage->next = dfa->str_tree_storage;
3834 dfa->str_tree_storage = storage;
3835 dfa->str_tree_storage_idx = 0;
3837 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3839 tree->parent = NULL;
3840 tree->left = left;
3841 tree->right = right;
3842 tree->token = *token;
3843 tree->token.duplicated = 0;
3844 tree->token.opt_subexp = 0;
3845 tree->first = NULL;
3846 tree->next = NULL;
3847 tree->node_idx = -1;
3849 if (left != NULL)
3850 left->parent = tree;
3851 if (right != NULL)
3852 right->parent = tree;
3853 return tree;
3856 /* Mark the tree SRC as an optional subexpression.
3857 To be called from preorder or postorder. */
3859 static reg_errcode_t
3860 mark_opt_subexp (void *extra, bin_tree_t *node)
3862 Idx idx = (uintptr_t) extra;
3863 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3864 node->token.opt_subexp = 1;
3866 return REG_NOERROR;
3869 /* Free the allocated memory inside NODE. */
3871 static void
3872 free_token (re_token_t *node)
3874 #ifdef RE_ENABLE_I18N
3875 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3876 free_charset (node->opr.mbcset);
3877 else
3878 #endif /* RE_ENABLE_I18N */
3879 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3880 re_free (node->opr.sbcset);
3883 /* Worker function for tree walking. Free the allocated memory inside NODE
3884 and its children. */
3886 static reg_errcode_t
3887 free_tree (void *extra, bin_tree_t *node)
3889 free_token (&node->token);
3890 return REG_NOERROR;
3894 /* Duplicate the node SRC, and return new node. This is a preorder
3895 visit similar to the one implemented by the generic visitor, but
3896 we need more infrastructure to maintain two parallel trees --- so,
3897 it's easier to duplicate. */
3899 static bin_tree_t *
3900 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3902 const bin_tree_t *node;
3903 bin_tree_t *dup_root;
3904 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3906 for (node = root; ; )
3908 /* Create a new tree and link it back to the current parent. */
3909 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3910 if (*p_new == NULL)
3911 return NULL;
3912 (*p_new)->parent = dup_node;
3913 (*p_new)->token.duplicated = 1;
3914 dup_node = *p_new;
3916 /* Go to the left node, or up and to the right. */
3917 if (node->left)
3919 node = node->left;
3920 p_new = &dup_node->left;
3922 else
3924 const bin_tree_t *prev = NULL;
3925 while (node->right == prev || node->right == NULL)
3927 prev = node;
3928 node = node->parent;
3929 dup_node = dup_node->parent;
3930 if (!node)
3931 return dup_root;
3933 node = node->right;
3934 p_new = &dup_node->right;