* posix/regcomp.c (lower_subexp): Do not optimize empty
[glibc.git] / posix / regcomp.c
bloba7112cffddd7f234db67e223ea99ef2516da4f2f
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 int length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27 static void init_word_char (re_dfa_t *dfa);
28 #ifdef RE_ENABLE_I18N
29 static void free_charset (re_charset_t *cset);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 #ifdef RE_ENABLE_I18N
34 static void optimize_utf8 (re_dfa_t *dfa);
35 #endif
36 static reg_errcode_t analyze (regex_t *preg);
37 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
38 static reg_errcode_t preorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41 static reg_errcode_t postorder (bin_tree_t *root,
42 reg_errcode_t (fn (void *, bin_tree_t *)),
43 void *extra);
44 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
45 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
46 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
47 bin_tree_t *node);
48 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
49 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
50 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
51 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
52 int top_clone_node, int root_node,
53 unsigned int constraint);
54 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
55 unsigned int constraint);
56 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
57 unsigned int constraint);
58 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
59 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
60 int node, int root);
61 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
62 static int fetch_number (re_string_t *input, re_token_t *token,
63 reg_syntax_t syntax);
64 static void fetch_token (re_token_t *result, re_string_t *input,
65 reg_syntax_t syntax);
66 static int peek_token (re_token_t *token, re_string_t *input,
67 reg_syntax_t syntax);
68 static int peek_token_bracket (re_token_t *token, re_string_t *input,
69 reg_syntax_t syntax);
70 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
71 reg_syntax_t syntax, reg_errcode_t *err);
72 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
73 re_token_t *token, reg_syntax_t syntax,
74 int nest, reg_errcode_t *err);
75 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
76 re_token_t *token, reg_syntax_t syntax,
77 int nest, reg_errcode_t *err);
78 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
79 re_token_t *token, reg_syntax_t syntax,
80 int nest, reg_errcode_t *err);
81 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
82 re_token_t *token, reg_syntax_t syntax,
83 int nest, reg_errcode_t *err);
84 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
85 re_dfa_t *dfa, re_token_t *token,
86 reg_syntax_t syntax, reg_errcode_t *err);
87 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
88 re_token_t *token, reg_syntax_t syntax,
89 reg_errcode_t *err);
90 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
91 re_string_t *regexp,
92 re_token_t *token, int token_len,
93 re_dfa_t *dfa,
94 reg_syntax_t syntax,
95 int accept_hyphen);
96 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
97 re_string_t *regexp,
98 re_token_t *token);
99 #ifndef _LIBC
100 # ifdef RE_ENABLE_I18N
101 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
102 re_charset_t *mbcset, int *range_alloc,
103 bracket_elem_t *start_elem,
104 bracket_elem_t *end_elem);
105 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
106 re_charset_t *mbcset,
107 int *coll_sym_alloc,
108 const unsigned char *name);
109 # else /* not RE_ENABLE_I18N */
110 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
111 bracket_elem_t *start_elem,
112 bracket_elem_t *end_elem);
113 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
114 const unsigned char *name);
115 # endif /* not RE_ENABLE_I18N */
116 #endif /* not _LIBC */
117 #ifdef RE_ENABLE_I18N
118 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
119 re_charset_t *mbcset,
120 int *equiv_class_alloc,
121 const unsigned char *name);
122 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
123 re_bitset_ptr_t sbcset,
124 re_charset_t *mbcset,
125 int *char_class_alloc,
126 const unsigned char *class_name,
127 reg_syntax_t syntax);
128 #else /* not RE_ENABLE_I18N */
129 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
130 const unsigned char *name);
131 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
132 re_bitset_ptr_t sbcset,
133 const unsigned char *class_name,
134 reg_syntax_t syntax);
135 #endif /* not RE_ENABLE_I18N */
136 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
137 unsigned RE_TRANSLATE_TYPE trans,
138 const unsigned char *class_name,
139 const unsigned char *extra,
140 int non_match, reg_errcode_t *err);
141 static bin_tree_t *create_tree (re_dfa_t *dfa,
142 bin_tree_t *left, bin_tree_t *right,
143 re_token_type_t type);
144 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
145 bin_tree_t *left, bin_tree_t *right,
146 const re_token_t *token);
147 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
148 static void free_token (re_token_t *node);
149 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
150 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
152 /* This table gives an error message for each of the error codes listed
153 in regex.h. Obviously the order here has to be same as there.
154 POSIX doesn't require that we do anything for REG_NOERROR,
155 but why not be nice? */
157 const char __re_error_msgid[] attribute_hidden =
159 #define REG_NOERROR_IDX 0
160 gettext_noop ("Success") /* REG_NOERROR */
161 "\0"
162 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
163 gettext_noop ("No match") /* REG_NOMATCH */
164 "\0"
165 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
166 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
167 "\0"
168 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
169 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
170 "\0"
171 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
172 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
173 "\0"
174 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
175 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
176 "\0"
177 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
178 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
179 "\0"
180 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
181 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
182 "\0"
183 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
184 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
185 "\0"
186 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
187 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
188 "\0"
189 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
190 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
191 "\0"
192 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
193 gettext_noop ("Invalid range end") /* REG_ERANGE */
194 "\0"
195 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
196 gettext_noop ("Memory exhausted") /* REG_ESPACE */
197 "\0"
198 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
199 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
200 "\0"
201 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
202 gettext_noop ("Premature end of regular expression") /* REG_EEND */
203 "\0"
204 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
205 gettext_noop ("Regular expression too big") /* REG_ESIZE */
206 "\0"
207 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
208 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
211 const size_t __re_error_msgid_idx[] attribute_hidden =
213 REG_NOERROR_IDX,
214 REG_NOMATCH_IDX,
215 REG_BADPAT_IDX,
216 REG_ECOLLATE_IDX,
217 REG_ECTYPE_IDX,
218 REG_EESCAPE_IDX,
219 REG_ESUBREG_IDX,
220 REG_EBRACK_IDX,
221 REG_EPAREN_IDX,
222 REG_EBRACE_IDX,
223 REG_BADBR_IDX,
224 REG_ERANGE_IDX,
225 REG_ESPACE_IDX,
226 REG_BADRPT_IDX,
227 REG_EEND_IDX,
228 REG_ESIZE_IDX,
229 REG_ERPAREN_IDX
232 /* Entry points for GNU code. */
234 /* re_compile_pattern is the GNU regular expression compiler: it
235 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
236 Returns 0 if the pattern was valid, otherwise an error string.
238 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
239 are set in BUFP on entry. */
241 const char *
242 re_compile_pattern (pattern, length, bufp)
243 const char *pattern;
244 size_t length;
245 struct re_pattern_buffer *bufp;
247 reg_errcode_t ret;
249 /* And GNU code determines whether or not to get register information
250 by passing null for the REGS argument to re_match, etc., not by
251 setting no_sub, unless RE_NO_SUB is set. */
252 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
254 /* Match anchors at newline. */
255 bufp->newline_anchor = 1;
257 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
259 if (!ret)
260 return NULL;
261 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
263 #ifdef _LIBC
264 weak_alias (__re_compile_pattern, re_compile_pattern)
265 #endif
267 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
268 also be assigned to arbitrarily: each pattern buffer stores its own
269 syntax, so it can be changed between regex compilations. */
270 /* This has no initializer because initialized variables in Emacs
271 become read-only after dumping. */
272 reg_syntax_t re_syntax_options;
275 /* Specify the precise syntax of regexps for compilation. This provides
276 for compatibility for various utilities which historically have
277 different, incompatible syntaxes.
279 The argument SYNTAX is a bit mask comprised of the various bits
280 defined in regex.h. We return the old syntax. */
282 reg_syntax_t
283 re_set_syntax (syntax)
284 reg_syntax_t syntax;
286 reg_syntax_t ret = re_syntax_options;
288 re_syntax_options = syntax;
289 return ret;
291 #ifdef _LIBC
292 weak_alias (__re_set_syntax, re_set_syntax)
293 #endif
296 re_compile_fastmap (bufp)
297 struct re_pattern_buffer *bufp;
299 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
300 char *fastmap = bufp->fastmap;
302 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
303 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
304 if (dfa->init_state != dfa->init_state_word)
305 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
306 if (dfa->init_state != dfa->init_state_nl)
307 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
308 if (dfa->init_state != dfa->init_state_begbuf)
309 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
310 bufp->fastmap_accurate = 1;
311 return 0;
313 #ifdef _LIBC
314 weak_alias (__re_compile_fastmap, re_compile_fastmap)
315 #endif
317 static inline void
318 __attribute ((always_inline))
319 re_set_fastmap (char *fastmap, int icase, int ch)
321 fastmap[ch] = 1;
322 if (icase)
323 fastmap[tolower (ch)] = 1;
326 /* Helper function for re_compile_fastmap.
327 Compile fastmap for the initial_state INIT_STATE. */
329 static void
330 re_compile_fastmap_iter (bufp, init_state, fastmap)
331 regex_t *bufp;
332 const re_dfastate_t *init_state;
333 char *fastmap;
335 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
336 int node_cnt;
337 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
338 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
340 int node = init_state->nodes.elems[node_cnt];
341 re_token_type_t type = dfa->nodes[node].type;
343 if (type == CHARACTER)
345 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
346 #ifdef RE_ENABLE_I18N
347 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
349 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
350 wchar_t wc;
351 mbstate_t state;
353 p = buf;
354 *p++ = dfa->nodes[node].opr.c;
355 while (++node < dfa->nodes_len
356 && dfa->nodes[node].type == CHARACTER
357 && dfa->nodes[node].mb_partial)
358 *p++ = dfa->nodes[node].opr.c;
359 memset (&state, 0, sizeof (state));
360 if (mbrtowc (&wc, (const char *) buf, p - buf,
361 &state) == p - buf
362 && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
363 re_set_fastmap (fastmap, 0, buf[0]);
365 #endif
367 else if (type == SIMPLE_BRACKET)
369 int i, j, ch;
370 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
371 for (j = 0; j < UINT_BITS; ++j, ++ch)
372 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
373 re_set_fastmap (fastmap, icase, ch);
375 #ifdef RE_ENABLE_I18N
376 else if (type == COMPLEX_BRACKET)
378 int i;
379 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
380 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
381 || cset->nranges || cset->nchar_classes)
383 # ifdef _LIBC
384 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
386 /* In this case we want to catch the bytes which are
387 the first byte of any collation elements.
388 e.g. In da_DK, we want to catch 'a' since "aa"
389 is a valid collation element, and don't catch
390 'b' since 'b' is the only collation element
391 which starts from 'b'. */
392 int j, ch;
393 const int32_t *table = (const int32_t *)
394 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
395 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
396 for (j = 0; j < UINT_BITS; ++j, ++ch)
397 if (table[ch] < 0)
398 re_set_fastmap (fastmap, icase, ch);
400 # else
401 if (dfa->mb_cur_max > 1)
402 for (i = 0; i < SBC_MAX; ++i)
403 if (__btowc (i) == WEOF)
404 re_set_fastmap (fastmap, icase, i);
405 # endif /* not _LIBC */
407 for (i = 0; i < cset->nmbchars; ++i)
409 char buf[256];
410 mbstate_t state;
411 memset (&state, '\0', sizeof (state));
412 __wcrtomb (buf, cset->mbchars[i], &state);
413 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
414 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
416 __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
417 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
421 #endif /* RE_ENABLE_I18N */
422 else if (type == OP_PERIOD
423 #ifdef RE_ENABLE_I18N
424 || type == OP_UTF8_PERIOD
425 #endif /* RE_ENABLE_I18N */
426 || type == END_OF_RE)
428 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
429 if (type == END_OF_RE)
430 bufp->can_be_null = 1;
431 return;
436 /* Entry point for POSIX code. */
437 /* regcomp takes a regular expression as a string and compiles it.
439 PREG is a regex_t *. We do not expect any fields to be initialized,
440 since POSIX says we shouldn't. Thus, we set
442 `buffer' to the compiled pattern;
443 `used' to the length of the compiled pattern;
444 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
445 REG_EXTENDED bit in CFLAGS is set; otherwise, to
446 RE_SYNTAX_POSIX_BASIC;
447 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
448 `fastmap' to an allocated space for the fastmap;
449 `fastmap_accurate' to zero;
450 `re_nsub' to the number of subexpressions in PATTERN.
452 PATTERN is the address of the pattern string.
454 CFLAGS is a series of bits which affect compilation.
456 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
457 use POSIX basic syntax.
459 If REG_NEWLINE is set, then . and [^...] don't match newline.
460 Also, regexec will try a match beginning after every newline.
462 If REG_ICASE is set, then we considers upper- and lowercase
463 versions of letters to be equivalent when matching.
465 If REG_NOSUB is set, then when PREG is passed to regexec, that
466 routine will report only success or failure, and nothing about the
467 registers.
469 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
470 the return codes and their meanings.) */
473 regcomp (preg, pattern, cflags)
474 regex_t *__restrict preg;
475 const char *__restrict pattern;
476 int cflags;
478 reg_errcode_t ret;
479 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
480 : RE_SYNTAX_POSIX_BASIC);
482 preg->buffer = NULL;
483 preg->allocated = 0;
484 preg->used = 0;
486 /* Try to allocate space for the fastmap. */
487 preg->fastmap = re_malloc (char, SBC_MAX);
488 if (BE (preg->fastmap == NULL, 0))
489 return REG_ESPACE;
491 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
493 /* If REG_NEWLINE is set, newlines are treated differently. */
494 if (cflags & REG_NEWLINE)
495 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
496 syntax &= ~RE_DOT_NEWLINE;
497 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
498 /* It also changes the matching behavior. */
499 preg->newline_anchor = 1;
501 else
502 preg->newline_anchor = 0;
503 preg->no_sub = !!(cflags & REG_NOSUB);
504 preg->translate = NULL;
506 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
508 /* POSIX doesn't distinguish between an unmatched open-group and an
509 unmatched close-group: both are REG_EPAREN. */
510 if (ret == REG_ERPAREN)
511 ret = REG_EPAREN;
513 /* We have already checked preg->fastmap != NULL. */
514 if (BE (ret == REG_NOERROR, 1))
515 /* Compute the fastmap now, since regexec cannot modify the pattern
516 buffer. This function never fails in this implementation. */
517 (void) re_compile_fastmap (preg);
518 else
520 /* Some error occurred while compiling the expression. */
521 re_free (preg->fastmap);
522 preg->fastmap = NULL;
525 return (int) ret;
527 #ifdef _LIBC
528 weak_alias (__regcomp, regcomp)
529 #endif
531 /* Returns a message corresponding to an error code, ERRCODE, returned
532 from either regcomp or regexec. We don't use PREG here. */
534 size_t
535 regerror (errcode, preg, errbuf, errbuf_size)
536 int errcode;
537 const regex_t *preg;
538 char *errbuf;
539 size_t errbuf_size;
541 const char *msg;
542 size_t msg_size;
544 if (BE (errcode < 0
545 || errcode >= (int) (sizeof (__re_error_msgid_idx)
546 / sizeof (__re_error_msgid_idx[0])), 0))
547 /* Only error codes returned by the rest of the code should be passed
548 to this routine. If we are given anything else, or if other regex
549 code generates an invalid error code, then the program has a bug.
550 Dump core so we can fix it. */
551 abort ();
553 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
555 msg_size = strlen (msg) + 1; /* Includes the null. */
557 if (BE (errbuf_size != 0, 1))
559 if (BE (msg_size > errbuf_size, 0))
561 #if defined HAVE_MEMPCPY || defined _LIBC
562 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
563 #else
564 memcpy (errbuf, msg, errbuf_size - 1);
565 errbuf[errbuf_size - 1] = 0;
566 #endif
568 else
569 memcpy (errbuf, msg, msg_size);
572 return msg_size;
574 #ifdef _LIBC
575 weak_alias (__regerror, regerror)
576 #endif
579 #ifdef RE_ENABLE_I18N
580 /* This static array is used for the map to single-byte characters when
581 UTF-8 is used. Otherwise we would allocate memory just to initialize
582 it the same all the time. UTF-8 is the preferred encoding so this is
583 a worthwhile optimization. */
584 static const bitset utf8_sb_map =
586 /* Set the first 128 bits. */
587 # if UINT_MAX == 0xffffffff
588 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
589 # else
590 # error "Add case for new unsigned int size"
591 # endif
593 #endif
596 static void
597 free_dfa_content (re_dfa_t *dfa)
599 int i, j;
601 if (dfa->nodes)
602 for (i = 0; i < dfa->nodes_len; ++i)
603 free_token (dfa->nodes + i);
604 re_free (dfa->nexts);
605 for (i = 0; i < dfa->nodes_len; ++i)
607 if (dfa->eclosures != NULL)
608 re_node_set_free (dfa->eclosures + i);
609 if (dfa->inveclosures != NULL)
610 re_node_set_free (dfa->inveclosures + i);
611 if (dfa->edests != NULL)
612 re_node_set_free (dfa->edests + i);
614 re_free (dfa->edests);
615 re_free (dfa->eclosures);
616 re_free (dfa->inveclosures);
617 re_free (dfa->nodes);
619 if (dfa->state_table)
620 for (i = 0; i <= dfa->state_hash_mask; ++i)
622 struct re_state_table_entry *entry = dfa->state_table + i;
623 for (j = 0; j < entry->num; ++j)
625 re_dfastate_t *state = entry->array[j];
626 free_state (state);
628 re_free (entry->array);
630 re_free (dfa->state_table);
631 #ifdef RE_ENABLE_I18N
632 if (dfa->sb_char != utf8_sb_map)
633 re_free (dfa->sb_char);
634 #endif
635 re_free (dfa->subexp_map);
636 #ifdef DEBUG
637 re_free (dfa->re_str);
638 #endif
640 re_free (dfa);
644 /* Free dynamically allocated space used by PREG. */
646 void
647 regfree (preg)
648 regex_t *preg;
650 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
651 if (BE (dfa != NULL, 1))
652 free_dfa_content (dfa);
653 preg->buffer = NULL;
654 preg->allocated = 0;
656 re_free (preg->fastmap);
657 preg->fastmap = NULL;
659 re_free (preg->translate);
660 preg->translate = NULL;
662 #ifdef _LIBC
663 weak_alias (__regfree, regfree)
664 #endif
666 /* Entry points compatible with 4.2 BSD regex library. We don't define
667 them unless specifically requested. */
669 #if defined _REGEX_RE_COMP || defined _LIBC
671 /* BSD has one and only one pattern buffer. */
672 static struct re_pattern_buffer re_comp_buf;
674 char *
675 # ifdef _LIBC
676 /* Make these definitions weak in libc, so POSIX programs can redefine
677 these names if they don't use our functions, and still use
678 regcomp/regexec above without link errors. */
679 weak_function
680 # endif
681 re_comp (s)
682 const char *s;
684 reg_errcode_t ret;
685 char *fastmap;
687 if (!s)
689 if (!re_comp_buf.buffer)
690 return gettext ("No previous regular expression");
691 return 0;
694 if (re_comp_buf.buffer)
696 fastmap = re_comp_buf.fastmap;
697 re_comp_buf.fastmap = NULL;
698 __regfree (&re_comp_buf);
699 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
700 re_comp_buf.fastmap = fastmap;
703 if (re_comp_buf.fastmap == NULL)
705 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
706 if (re_comp_buf.fastmap == NULL)
707 return (char *) gettext (__re_error_msgid
708 + __re_error_msgid_idx[(int) REG_ESPACE]);
711 /* Since `re_exec' always passes NULL for the `regs' argument, we
712 don't need to initialize the pattern buffer fields which affect it. */
714 /* Match anchors at newlines. */
715 re_comp_buf.newline_anchor = 1;
717 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
719 if (!ret)
720 return NULL;
722 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
723 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
726 #ifdef _LIBC
727 libc_freeres_fn (free_mem)
729 __regfree (&re_comp_buf);
731 #endif
733 #endif /* _REGEX_RE_COMP */
735 /* Internal entry point.
736 Compile the regular expression PATTERN, whose length is LENGTH.
737 SYNTAX indicate regular expression's syntax. */
739 static reg_errcode_t
740 re_compile_internal (preg, pattern, length, syntax)
741 regex_t *preg;
742 const char * pattern;
743 int length;
744 reg_syntax_t syntax;
746 reg_errcode_t err = REG_NOERROR;
747 re_dfa_t *dfa;
748 re_string_t regexp;
750 /* Initialize the pattern buffer. */
751 preg->fastmap_accurate = 0;
752 preg->syntax = syntax;
753 preg->not_bol = preg->not_eol = 0;
754 preg->used = 0;
755 preg->re_nsub = 0;
756 preg->can_be_null = 0;
757 preg->regs_allocated = REGS_UNALLOCATED;
759 /* Initialize the dfa. */
760 dfa = (re_dfa_t *) preg->buffer;
761 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
763 /* If zero allocated, but buffer is non-null, try to realloc
764 enough space. This loses if buffer's address is bogus, but
765 that is the user's responsibility. If ->buffer is NULL this
766 is a simple allocation. */
767 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
768 if (dfa == NULL)
769 return REG_ESPACE;
770 preg->allocated = sizeof (re_dfa_t);
771 preg->buffer = (unsigned char *) dfa;
773 preg->used = sizeof (re_dfa_t);
775 err = init_dfa (dfa, length);
776 if (BE (err != REG_NOERROR, 0))
778 free_dfa_content (dfa);
779 preg->buffer = NULL;
780 preg->allocated = 0;
781 return err;
783 #ifdef DEBUG
784 dfa->re_str = re_malloc (char, length + 1);
785 strncpy (dfa->re_str, pattern, length + 1);
786 #endif
788 err = re_string_construct (&regexp, pattern, length, preg->translate,
789 syntax & RE_ICASE, dfa);
790 if (BE (err != REG_NOERROR, 0))
792 re_compile_internal_free_return:
793 free_workarea_compile (preg);
794 re_string_destruct (&regexp);
795 free_dfa_content (dfa);
796 preg->buffer = NULL;
797 preg->allocated = 0;
798 return err;
801 /* Parse the regular expression, and build a structure tree. */
802 preg->re_nsub = 0;
803 dfa->str_tree = parse (&regexp, preg, syntax, &err);
804 if (BE (dfa->str_tree == NULL, 0))
805 goto re_compile_internal_free_return;
807 /* Analyze the tree and create the nfa. */
808 err = analyze (preg);
809 if (BE (err != REG_NOERROR, 0))
810 goto re_compile_internal_free_return;
812 #ifdef RE_ENABLE_I18N
813 /* If possible, do searching in single byte encoding to speed things up. */
814 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
815 optimize_utf8 (dfa);
816 #endif
818 /* Then create the initial state of the dfa. */
819 err = create_initial_state (dfa);
821 /* Release work areas. */
822 free_workarea_compile (preg);
823 re_string_destruct (&regexp);
825 if (BE (err != REG_NOERROR, 0))
827 free_dfa_content (dfa);
828 preg->buffer = NULL;
829 preg->allocated = 0;
832 return err;
835 /* Initialize DFA. We use the length of the regular expression PAT_LEN
836 as the initial length of some arrays. */
838 static reg_errcode_t
839 init_dfa (dfa, pat_len)
840 re_dfa_t *dfa;
841 int pat_len;
843 int table_size;
844 #ifndef _LIBC
845 char *codeset_name;
846 #endif
848 memset (dfa, '\0', sizeof (re_dfa_t));
850 /* Force allocation of str_tree_storage the first time. */
851 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
853 dfa->nodes_alloc = pat_len + 1;
854 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
856 dfa->states_alloc = pat_len + 1;
858 /* table_size = 2 ^ ceil(log pat_len) */
859 for (table_size = 1; table_size > 0; table_size <<= 1)
860 if (table_size > pat_len)
861 break;
863 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
864 dfa->state_hash_mask = table_size - 1;
866 dfa->mb_cur_max = MB_CUR_MAX;
867 #ifdef _LIBC
868 if (dfa->mb_cur_max == 6
869 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
870 dfa->is_utf8 = 1;
871 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
872 != 0);
873 #else
874 # ifdef HAVE_LANGINFO_CODESET
875 codeset_name = nl_langinfo (CODESET);
876 # else
877 codeset_name = getenv ("LC_ALL");
878 if (codeset_name == NULL || codeset_name[0] == '\0')
879 codeset_name = getenv ("LC_CTYPE");
880 if (codeset_name == NULL || codeset_name[0] == '\0')
881 codeset_name = getenv ("LANG");
882 if (codeset_name == NULL)
883 codeset_name = "";
884 else if (strchr (codeset_name, '.') != NULL)
885 codeset_name = strchr (codeset_name, '.') + 1;
886 # endif
888 if (strcasecmp (codeset_name, "UTF-8") == 0
889 || strcasecmp (codeset_name, "UTF8") == 0)
890 dfa->is_utf8 = 1;
892 /* We check exhaustively in the loop below if this charset is a
893 superset of ASCII. */
894 dfa->map_notascii = 0;
895 #endif
897 #ifdef RE_ENABLE_I18N
898 if (dfa->mb_cur_max > 1)
900 if (dfa->is_utf8)
901 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
902 else
904 int i, j, ch;
906 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
907 if (BE (dfa->sb_char == NULL, 0))
908 return REG_ESPACE;
910 /* Clear all bits by, then set those corresponding to single
911 byte chars. */
912 bitset_empty (dfa->sb_char);
914 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
915 for (j = 0; j < UINT_BITS; ++j, ++ch)
917 wchar_t wch = __btowc (ch);
918 if (wch != WEOF)
919 dfa->sb_char[i] |= 1 << j;
920 # ifndef _LIBC
921 if (isascii (ch) && wch != (wchar_t) ch)
922 dfa->map_notascii = 1;
923 # endif
927 #endif
929 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
930 return REG_ESPACE;
931 return REG_NOERROR;
934 /* Initialize WORD_CHAR table, which indicate which character is
935 "word". In this case "word" means that it is the word construction
936 character used by some operators like "\<", "\>", etc. */
938 static void
939 init_word_char (dfa)
940 re_dfa_t *dfa;
942 int i, j, ch;
943 dfa->word_ops_used = 1;
944 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
945 for (j = 0; j < UINT_BITS; ++j, ++ch)
946 if (isalnum (ch) || ch == '_')
947 dfa->word_char[i] |= 1 << j;
950 /* Free the work area which are only used while compiling. */
952 static void
953 free_workarea_compile (preg)
954 regex_t *preg;
956 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
957 bin_tree_storage_t *storage, *next;
958 for (storage = dfa->str_tree_storage; storage; storage = next)
960 next = storage->next;
961 re_free (storage);
963 dfa->str_tree_storage = NULL;
964 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
965 dfa->str_tree = NULL;
966 re_free (dfa->org_indices);
967 dfa->org_indices = NULL;
970 /* Create initial states for all contexts. */
972 static reg_errcode_t
973 create_initial_state (dfa)
974 re_dfa_t *dfa;
976 int first, i;
977 reg_errcode_t err;
978 re_node_set init_nodes;
980 /* Initial states have the epsilon closure of the node which is
981 the first node of the regular expression. */
982 first = dfa->str_tree->first->node_idx;
983 dfa->init_node = first;
984 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
985 if (BE (err != REG_NOERROR, 0))
986 return err;
988 /* The back-references which are in initial states can epsilon transit,
989 since in this case all of the subexpressions can be null.
990 Then we add epsilon closures of the nodes which are the next nodes of
991 the back-references. */
992 if (dfa->nbackref > 0)
993 for (i = 0; i < init_nodes.nelem; ++i)
995 int node_idx = init_nodes.elems[i];
996 re_token_type_t type = dfa->nodes[node_idx].type;
998 int clexp_idx;
999 if (type != OP_BACK_REF)
1000 continue;
1001 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1003 re_token_t *clexp_node;
1004 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1005 if (clexp_node->type == OP_CLOSE_SUBEXP
1006 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1007 break;
1009 if (clexp_idx == init_nodes.nelem)
1010 continue;
1012 if (type == OP_BACK_REF)
1014 int dest_idx = dfa->edests[node_idx].elems[0];
1015 if (!re_node_set_contains (&init_nodes, dest_idx))
1017 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1018 i = 0;
1023 /* It must be the first time to invoke acquire_state. */
1024 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1025 /* We don't check ERR here, since the initial state must not be NULL. */
1026 if (BE (dfa->init_state == NULL, 0))
1027 return err;
1028 if (dfa->init_state->has_constraint)
1030 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1031 CONTEXT_WORD);
1032 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1033 CONTEXT_NEWLINE);
1034 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1035 &init_nodes,
1036 CONTEXT_NEWLINE
1037 | CONTEXT_BEGBUF);
1038 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1039 || dfa->init_state_begbuf == NULL, 0))
1040 return err;
1042 else
1043 dfa->init_state_word = dfa->init_state_nl
1044 = dfa->init_state_begbuf = dfa->init_state;
1046 re_node_set_free (&init_nodes);
1047 return REG_NOERROR;
1050 #ifdef RE_ENABLE_I18N
1051 /* If it is possible to do searching in single byte encoding instead of UTF-8
1052 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1053 DFA nodes where needed. */
1055 static void
1056 optimize_utf8 (dfa)
1057 re_dfa_t *dfa;
1059 int node, i, mb_chars = 0, has_period = 0;
1061 for (node = 0; node < dfa->nodes_len; ++node)
1062 switch (dfa->nodes[node].type)
1064 case CHARACTER:
1065 if (dfa->nodes[node].opr.c >= 0x80)
1066 mb_chars = 1;
1067 break;
1068 case ANCHOR:
1069 switch (dfa->nodes[node].opr.idx)
1071 case LINE_FIRST:
1072 case LINE_LAST:
1073 case BUF_FIRST:
1074 case BUF_LAST:
1075 break;
1076 default:
1077 /* Word anchors etc. cannot be handled. */
1078 return;
1080 break;
1081 case OP_PERIOD:
1082 has_period = 1;
1083 break;
1084 case OP_BACK_REF:
1085 case OP_ALT:
1086 case END_OF_RE:
1087 case OP_DUP_ASTERISK:
1088 case OP_OPEN_SUBEXP:
1089 case OP_CLOSE_SUBEXP:
1090 break;
1091 case COMPLEX_BRACKET:
1092 return;
1093 case SIMPLE_BRACKET:
1094 /* Just double check. */
1095 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1096 if (dfa->nodes[node].opr.sbcset[i])
1097 return;
1098 break;
1099 default:
1100 abort ();
1103 if (mb_chars || has_period)
1104 for (node = 0; node < dfa->nodes_len; ++node)
1106 if (dfa->nodes[node].type == CHARACTER
1107 && dfa->nodes[node].opr.c >= 0x80)
1108 dfa->nodes[node].mb_partial = 0;
1109 else if (dfa->nodes[node].type == OP_PERIOD)
1110 dfa->nodes[node].type = OP_UTF8_PERIOD;
1113 /* The search can be in single byte locale. */
1114 dfa->mb_cur_max = 1;
1115 dfa->is_utf8 = 0;
1116 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1118 #endif
1120 /* Analyze the structure tree, and calculate "first", "next", "edest",
1121 "eclosure", and "inveclosure". */
1123 static reg_errcode_t
1124 analyze (preg)
1125 regex_t *preg;
1127 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1128 reg_errcode_t ret;
1130 /* Allocate arrays. */
1131 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1132 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1133 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1134 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1135 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1136 || dfa->eclosures == NULL, 0))
1137 return REG_ESPACE;
1139 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1140 if (dfa->subexp_map != NULL)
1142 int i;
1143 for (i = 0; i < preg->re_nsub; i++)
1144 dfa->subexp_map[i] = i;
1145 preorder (dfa->str_tree, optimize_subexps, dfa);
1146 for (i = 0; i < preg->re_nsub; i++)
1147 if (dfa->subexp_map[i] != i)
1148 break;
1149 if (i == preg->re_nsub)
1151 free (dfa->subexp_map);
1152 dfa->subexp_map = NULL;
1156 ret = postorder (dfa->str_tree, lower_subexps, preg);
1157 if (BE (ret != REG_NOERROR, 0))
1158 return ret;
1159 ret = postorder (dfa->str_tree, calc_first, dfa);
1160 if (BE (ret != REG_NOERROR, 0))
1161 return ret;
1162 preorder (dfa->str_tree, calc_next, dfa);
1163 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1164 if (BE (ret != REG_NOERROR, 0))
1165 return ret;
1166 ret = calc_eclosure (dfa);
1167 if (BE (ret != REG_NOERROR, 0))
1168 return ret;
1170 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1171 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1172 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1173 || dfa->nbackref)
1175 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1176 if (BE (dfa->inveclosures == NULL, 0))
1177 return REG_ESPACE;
1178 ret = calc_inveclosure (dfa);
1181 return ret;
1184 /* Our parse trees are very unbalanced, so we cannot use a stack to
1185 implement parse tree visits. Instead, we use parent pointers and
1186 some hairy code in these two functions. */
1187 static reg_errcode_t
1188 postorder (root, fn, extra)
1189 bin_tree_t *root;
1190 reg_errcode_t (fn (void *, bin_tree_t *));
1191 void *extra;
1193 bin_tree_t *node, *prev;
1195 for (node = root; ; )
1197 /* Descend down the tree, preferably to the left (or to the right
1198 if that's the only child). */
1199 while (node->left || node->right)
1200 if (node->left)
1201 node = node->left;
1202 else
1203 node = node->right;
1207 reg_errcode_t err = fn (extra, node);
1208 if (BE (err != REG_NOERROR, 0))
1209 return err;
1210 if (node->parent == NULL)
1211 return REG_NOERROR;
1212 prev = node;
1213 node = node->parent;
1215 /* Go up while we have a node that is reached from the right. */
1216 while (node->right == prev || node->right == NULL);
1217 node = node->right;
1221 static reg_errcode_t
1222 preorder (root, fn, extra)
1223 bin_tree_t *root;
1224 reg_errcode_t (fn (void *, bin_tree_t *));
1225 void *extra;
1227 bin_tree_t *node;
1229 for (node = root; ; )
1231 reg_errcode_t err = fn (extra, node);
1232 if (BE (err != REG_NOERROR, 0))
1233 return err;
1235 /* Go to the left node, or up and to the right. */
1236 if (node->left)
1237 node = node->left;
1238 else
1240 bin_tree_t *prev = NULL;
1241 while (node->right == prev || node->right == NULL)
1243 prev = node;
1244 node = node->parent;
1245 if (!node)
1246 return REG_NOERROR;
1248 node = node->right;
1253 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1254 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1255 backreferences as well. Requires a preorder visit. */
1256 static reg_errcode_t
1257 optimize_subexps (extra, node)
1258 void *extra;
1259 bin_tree_t *node;
1261 re_dfa_t *dfa = (re_dfa_t *) extra;
1263 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1265 int idx = node->token.opr.idx;
1266 node->token.opr.idx = dfa->subexp_map[idx];
1267 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1270 else if (node->token.type == SUBEXP
1271 && node->left && node->left->token.type == SUBEXP)
1273 int other_idx = node->left->token.opr.idx;
1275 node->left = node->left->left;
1276 if (node->left)
1277 node->left->parent = node;
1279 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1280 if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1281 dfa->used_bkref_map &= ~(1 << other_idx);
1284 return REG_NOERROR;
1287 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1288 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1289 static reg_errcode_t
1290 lower_subexps (extra, node)
1291 void *extra;
1292 bin_tree_t *node;
1294 regex_t *preg = (regex_t *) extra;
1295 reg_errcode_t err = REG_NOERROR;
1297 if (node->left && node->left->token.type == SUBEXP)
1299 node->left = lower_subexp (&err, preg, node->left);
1300 if (node->left)
1301 node->left->parent = node;
1303 if (node->right && node->right->token.type == SUBEXP)
1305 node->right = lower_subexp (&err, preg, node->right);
1306 if (node->right)
1307 node->right->parent = node;
1310 return err;
1313 static bin_tree_t *
1314 lower_subexp (err, preg, node)
1315 reg_errcode_t *err;
1316 regex_t *preg;
1317 bin_tree_t *node;
1319 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1320 bin_tree_t *body = node->left;
1321 bin_tree_t *op, *cls, *tree1, *tree;
1323 if (preg->no_sub
1324 /* We do not optimize empty subexpressions, because otherwise we may
1325 have bad CONCAT nodes with NULL children. This is obviously not
1326 very common, so we do not lose much. An example that triggers
1327 this case is the sed "script" /\(\)/x. */
1328 && node->left != NULL
1329 && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1330 || !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1331 return node->left;
1333 /* Convert the SUBEXP node to the concatenation of an
1334 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1335 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1336 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1337 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1338 tree = create_tree (dfa, op, tree1, CONCAT);
1339 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1341 *err = REG_ESPACE;
1342 return NULL;
1345 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1346 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1347 return tree;
1350 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1351 nodes. Requires a postorder visit. */
1352 static reg_errcode_t
1353 calc_first (extra, node)
1354 void *extra;
1355 bin_tree_t *node;
1357 re_dfa_t *dfa = (re_dfa_t *) extra;
1358 if (node->token.type == CONCAT)
1360 node->first = node->left->first;
1361 node->node_idx = node->left->node_idx;
1363 else
1365 node->first = node;
1366 node->node_idx = re_dfa_add_node (dfa, node->token);
1367 if (BE (node->node_idx == -1, 0))
1368 return REG_ESPACE;
1370 return REG_NOERROR;
1373 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1374 static reg_errcode_t
1375 calc_next (extra, node)
1376 void *extra;
1377 bin_tree_t *node;
1379 switch (node->token.type)
1381 case OP_DUP_ASTERISK:
1382 node->left->next = node;
1383 break;
1384 case CONCAT:
1385 node->left->next = node->right->first;
1386 node->right->next = node->next;
1387 break;
1388 default:
1389 if (node->left)
1390 node->left->next = node->next;
1391 if (node->right)
1392 node->right->next = node->next;
1393 break;
1395 return REG_NOERROR;
1398 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1399 static reg_errcode_t
1400 link_nfa_nodes (extra, node)
1401 void *extra;
1402 bin_tree_t *node;
1404 re_dfa_t *dfa = (re_dfa_t *) extra;
1405 int idx = node->node_idx;
1406 reg_errcode_t err = REG_NOERROR;
1408 switch (node->token.type)
1410 case CONCAT:
1411 break;
1413 case END_OF_RE:
1414 assert (node->next == NULL);
1415 break;
1417 case OP_DUP_ASTERISK:
1418 case OP_ALT:
1420 int left, right;
1421 dfa->has_plural_match = 1;
1422 if (node->left != NULL)
1423 left = node->left->first->node_idx;
1424 else
1425 left = node->next->node_idx;
1426 if (node->right != NULL)
1427 right = node->right->first->node_idx;
1428 else
1429 right = node->next->node_idx;
1430 assert (left > -1);
1431 assert (right > -1);
1432 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1434 break;
1436 case ANCHOR:
1437 case OP_OPEN_SUBEXP:
1438 case OP_CLOSE_SUBEXP:
1439 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1440 break;
1442 case OP_BACK_REF:
1443 dfa->nexts[idx] = node->next->node_idx;
1444 if (node->token.type == OP_BACK_REF)
1445 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1446 break;
1448 default:
1449 assert (!IS_EPSILON_NODE (node->token.type));
1450 dfa->nexts[idx] = node->next->node_idx;
1451 break;
1454 return err;
1457 /* Duplicate the epsilon closure of the node ROOT_NODE.
1458 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1459 to their own constraint. */
1461 static reg_errcode_t
1462 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1463 init_constraint)
1464 re_dfa_t *dfa;
1465 int top_org_node, top_clone_node, root_node;
1466 unsigned int init_constraint;
1468 reg_errcode_t err;
1469 int org_node, clone_node, ret;
1470 unsigned int constraint = init_constraint;
1471 for (org_node = top_org_node, clone_node = top_clone_node;;)
1473 int org_dest, clone_dest;
1474 if (dfa->nodes[org_node].type == OP_BACK_REF)
1476 /* If the back reference epsilon-transit, its destination must
1477 also have the constraint. Then duplicate the epsilon closure
1478 of the destination of the back reference, and store it in
1479 edests of the back reference. */
1480 org_dest = dfa->nexts[org_node];
1481 re_node_set_empty (dfa->edests + clone_node);
1482 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1483 if (BE (err != REG_NOERROR, 0))
1484 return err;
1485 dfa->nexts[clone_node] = dfa->nexts[org_node];
1486 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1487 if (BE (ret < 0, 0))
1488 return REG_ESPACE;
1490 else if (dfa->edests[org_node].nelem == 0)
1492 /* In case of the node can't epsilon-transit, don't duplicate the
1493 destination and store the original destination as the
1494 destination of the node. */
1495 dfa->nexts[clone_node] = dfa->nexts[org_node];
1496 break;
1498 else if (dfa->edests[org_node].nelem == 1)
1500 /* In case of the node can epsilon-transit, and it has only one
1501 destination. */
1502 org_dest = dfa->edests[org_node].elems[0];
1503 re_node_set_empty (dfa->edests + clone_node);
1504 if (dfa->nodes[org_node].type == ANCHOR)
1506 /* In case of the node has another constraint, append it. */
1507 if (org_node == root_node && clone_node != org_node)
1509 /* ...but if the node is root_node itself, it means the
1510 epsilon closure have a loop, then tie it to the
1511 destination of the root_node. */
1512 ret = re_node_set_insert (dfa->edests + clone_node,
1513 org_dest);
1514 if (BE (ret < 0, 0))
1515 return REG_ESPACE;
1516 break;
1518 constraint |= dfa->nodes[org_node].opr.ctx_type;
1520 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1521 if (BE (err != REG_NOERROR, 0))
1522 return err;
1523 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1524 if (BE (ret < 0, 0))
1525 return REG_ESPACE;
1527 else /* dfa->edests[org_node].nelem == 2 */
1529 /* In case of the node can epsilon-transit, and it has two
1530 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1531 org_dest = dfa->edests[org_node].elems[0];
1532 re_node_set_empty (dfa->edests + clone_node);
1533 /* Search for a duplicated node which satisfies the constraint. */
1534 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1535 if (clone_dest == -1)
1537 /* There are no such a duplicated node, create a new one. */
1538 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1539 if (BE (err != REG_NOERROR, 0))
1540 return err;
1541 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1542 if (BE (ret < 0, 0))
1543 return REG_ESPACE;
1544 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1545 root_node, constraint);
1546 if (BE (err != REG_NOERROR, 0))
1547 return err;
1549 else
1551 /* There are a duplicated node which satisfy the constraint,
1552 use it to avoid infinite loop. */
1553 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1554 if (BE (ret < 0, 0))
1555 return REG_ESPACE;
1558 org_dest = dfa->edests[org_node].elems[1];
1559 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1560 if (BE (err != REG_NOERROR, 0))
1561 return err;
1562 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1563 if (BE (ret < 0, 0))
1564 return REG_ESPACE;
1566 org_node = org_dest;
1567 clone_node = clone_dest;
1569 return REG_NOERROR;
1572 /* Search for a node which is duplicated from the node ORG_NODE, and
1573 satisfies the constraint CONSTRAINT. */
1575 static int
1576 search_duplicated_node (dfa, org_node, constraint)
1577 re_dfa_t *dfa;
1578 int org_node;
1579 unsigned int constraint;
1581 int idx;
1582 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1584 if (org_node == dfa->org_indices[idx]
1585 && constraint == dfa->nodes[idx].constraint)
1586 return idx; /* Found. */
1588 return -1; /* Not found. */
1591 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1592 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1593 otherwise return the error code. */
1595 static reg_errcode_t
1596 duplicate_node (new_idx, dfa, org_idx, constraint)
1597 re_dfa_t *dfa;
1598 int *new_idx, org_idx;
1599 unsigned int constraint;
1601 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1602 if (BE (dup_idx == -1, 0))
1603 return REG_ESPACE;
1604 dfa->nodes[dup_idx].constraint = constraint;
1605 if (dfa->nodes[org_idx].type == ANCHOR)
1606 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1607 dfa->nodes[dup_idx].duplicated = 1;
1609 /* Store the index of the original node. */
1610 dfa->org_indices[dup_idx] = org_idx;
1611 *new_idx = dup_idx;
1612 return REG_NOERROR;
1615 static reg_errcode_t
1616 calc_inveclosure (dfa)
1617 re_dfa_t *dfa;
1619 int src, idx, ret;
1620 for (idx = 0; idx < dfa->nodes_len; ++idx)
1621 re_node_set_init_empty (dfa->inveclosures + idx);
1623 for (src = 0; src < dfa->nodes_len; ++src)
1625 int *elems = dfa->eclosures[src].elems;
1626 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1628 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1629 if (BE (ret == -1, 0))
1630 return REG_ESPACE;
1634 return REG_NOERROR;
1637 /* Calculate "eclosure" for all the node in DFA. */
1639 static reg_errcode_t
1640 calc_eclosure (dfa)
1641 re_dfa_t *dfa;
1643 int node_idx, incomplete;
1644 #ifdef DEBUG
1645 assert (dfa->nodes_len > 0);
1646 #endif
1647 incomplete = 0;
1648 /* For each nodes, calculate epsilon closure. */
1649 for (node_idx = 0; ; ++node_idx)
1651 reg_errcode_t err;
1652 re_node_set eclosure_elem;
1653 if (node_idx == dfa->nodes_len)
1655 if (!incomplete)
1656 break;
1657 incomplete = 0;
1658 node_idx = 0;
1661 #ifdef DEBUG
1662 assert (dfa->eclosures[node_idx].nelem != -1);
1663 #endif
1665 /* If we have already calculated, skip it. */
1666 if (dfa->eclosures[node_idx].nelem != 0)
1667 continue;
1668 /* Calculate epsilon closure of `node_idx'. */
1669 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1670 if (BE (err != REG_NOERROR, 0))
1671 return err;
1673 if (dfa->eclosures[node_idx].nelem == 0)
1675 incomplete = 1;
1676 re_node_set_free (&eclosure_elem);
1679 return REG_NOERROR;
1682 /* Calculate epsilon closure of NODE. */
1684 static reg_errcode_t
1685 calc_eclosure_iter (new_set, dfa, node, root)
1686 re_node_set *new_set;
1687 re_dfa_t *dfa;
1688 int node, root;
1690 reg_errcode_t err;
1691 unsigned int constraint;
1692 int i, incomplete;
1693 re_node_set eclosure;
1694 incomplete = 0;
1695 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1696 if (BE (err != REG_NOERROR, 0))
1697 return err;
1699 /* This indicates that we are calculating this node now.
1700 We reference this value to avoid infinite loop. */
1701 dfa->eclosures[node].nelem = -1;
1703 constraint = ((dfa->nodes[node].type == ANCHOR)
1704 ? dfa->nodes[node].opr.ctx_type : 0);
1705 /* If the current node has constraints, duplicate all nodes.
1706 Since they must inherit the constraints. */
1707 if (constraint
1708 && dfa->edests[node].nelem
1709 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1711 int org_node, cur_node;
1712 org_node = cur_node = node;
1713 err = duplicate_node_closure (dfa, node, node, node, constraint);
1714 if (BE (err != REG_NOERROR, 0))
1715 return err;
1718 /* Expand each epsilon destination nodes. */
1719 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1720 for (i = 0; i < dfa->edests[node].nelem; ++i)
1722 re_node_set eclosure_elem;
1723 int edest = dfa->edests[node].elems[i];
1724 /* If calculating the epsilon closure of `edest' is in progress,
1725 return intermediate result. */
1726 if (dfa->eclosures[edest].nelem == -1)
1728 incomplete = 1;
1729 continue;
1731 /* If we haven't calculated the epsilon closure of `edest' yet,
1732 calculate now. Otherwise use calculated epsilon closure. */
1733 if (dfa->eclosures[edest].nelem == 0)
1735 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1736 if (BE (err != REG_NOERROR, 0))
1737 return err;
1739 else
1740 eclosure_elem = dfa->eclosures[edest];
1741 /* Merge the epsilon closure of `edest'. */
1742 re_node_set_merge (&eclosure, &eclosure_elem);
1743 /* If the epsilon closure of `edest' is incomplete,
1744 the epsilon closure of this node is also incomplete. */
1745 if (dfa->eclosures[edest].nelem == 0)
1747 incomplete = 1;
1748 re_node_set_free (&eclosure_elem);
1752 /* Epsilon closures include itself. */
1753 re_node_set_insert (&eclosure, node);
1754 if (incomplete && !root)
1755 dfa->eclosures[node].nelem = 0;
1756 else
1757 dfa->eclosures[node] = eclosure;
1758 *new_set = eclosure;
1759 return REG_NOERROR;
1762 /* Functions for token which are used in the parser. */
1764 /* Fetch a token from INPUT.
1765 We must not use this function inside bracket expressions. */
1767 static void
1768 fetch_token (result, input, syntax)
1769 re_token_t *result;
1770 re_string_t *input;
1771 reg_syntax_t syntax;
1773 re_string_skip_bytes (input, peek_token (result, input, syntax));
1776 /* Peek a token from INPUT, and return the length of the token.
1777 We must not use this function inside bracket expressions. */
1779 static int
1780 peek_token (token, input, syntax)
1781 re_token_t *token;
1782 re_string_t *input;
1783 reg_syntax_t syntax;
1785 unsigned char c;
1787 if (re_string_eoi (input))
1789 token->type = END_OF_RE;
1790 return 0;
1793 c = re_string_peek_byte (input, 0);
1794 token->opr.c = c;
1796 token->word_char = 0;
1797 #ifdef RE_ENABLE_I18N
1798 token->mb_partial = 0;
1799 if (input->mb_cur_max > 1 &&
1800 !re_string_first_byte (input, re_string_cur_idx (input)))
1802 token->type = CHARACTER;
1803 token->mb_partial = 1;
1804 return 1;
1806 #endif
1807 if (c == '\\')
1809 unsigned char c2;
1810 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1812 token->type = BACK_SLASH;
1813 return 1;
1816 c2 = re_string_peek_byte_case (input, 1);
1817 token->opr.c = c2;
1818 token->type = CHARACTER;
1819 #ifdef RE_ENABLE_I18N
1820 if (input->mb_cur_max > 1)
1822 wint_t wc = re_string_wchar_at (input,
1823 re_string_cur_idx (input) + 1);
1824 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1826 else
1827 #endif
1828 token->word_char = IS_WORD_CHAR (c2) != 0;
1830 switch (c2)
1832 case '|':
1833 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1834 token->type = OP_ALT;
1835 break;
1836 case '1': case '2': case '3': case '4': case '5':
1837 case '6': case '7': case '8': case '9':
1838 if (!(syntax & RE_NO_BK_REFS))
1840 token->type = OP_BACK_REF;
1841 token->opr.idx = c2 - '1';
1843 break;
1844 case '<':
1845 if (!(syntax & RE_NO_GNU_OPS))
1847 token->type = ANCHOR;
1848 token->opr.ctx_type = WORD_FIRST;
1850 break;
1851 case '>':
1852 if (!(syntax & RE_NO_GNU_OPS))
1854 token->type = ANCHOR;
1855 token->opr.ctx_type = WORD_LAST;
1857 break;
1858 case 'b':
1859 if (!(syntax & RE_NO_GNU_OPS))
1861 token->type = ANCHOR;
1862 token->opr.ctx_type = WORD_DELIM;
1864 break;
1865 case 'B':
1866 if (!(syntax & RE_NO_GNU_OPS))
1868 token->type = ANCHOR;
1869 token->opr.ctx_type = NOT_WORD_DELIM;
1871 break;
1872 case 'w':
1873 if (!(syntax & RE_NO_GNU_OPS))
1874 token->type = OP_WORD;
1875 break;
1876 case 'W':
1877 if (!(syntax & RE_NO_GNU_OPS))
1878 token->type = OP_NOTWORD;
1879 break;
1880 case 's':
1881 if (!(syntax & RE_NO_GNU_OPS))
1882 token->type = OP_SPACE;
1883 break;
1884 case 'S':
1885 if (!(syntax & RE_NO_GNU_OPS))
1886 token->type = OP_NOTSPACE;
1887 break;
1888 case '`':
1889 if (!(syntax & RE_NO_GNU_OPS))
1891 token->type = ANCHOR;
1892 token->opr.ctx_type = BUF_FIRST;
1894 break;
1895 case '\'':
1896 if (!(syntax & RE_NO_GNU_OPS))
1898 token->type = ANCHOR;
1899 token->opr.ctx_type = BUF_LAST;
1901 break;
1902 case '(':
1903 if (!(syntax & RE_NO_BK_PARENS))
1904 token->type = OP_OPEN_SUBEXP;
1905 break;
1906 case ')':
1907 if (!(syntax & RE_NO_BK_PARENS))
1908 token->type = OP_CLOSE_SUBEXP;
1909 break;
1910 case '+':
1911 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1912 token->type = OP_DUP_PLUS;
1913 break;
1914 case '?':
1915 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1916 token->type = OP_DUP_QUESTION;
1917 break;
1918 case '{':
1919 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1920 token->type = OP_OPEN_DUP_NUM;
1921 break;
1922 case '}':
1923 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1924 token->type = OP_CLOSE_DUP_NUM;
1925 break;
1926 default:
1927 break;
1929 return 2;
1932 token->type = CHARACTER;
1933 #ifdef RE_ENABLE_I18N
1934 if (input->mb_cur_max > 1)
1936 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1937 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1939 else
1940 #endif
1941 token->word_char = IS_WORD_CHAR (token->opr.c);
1943 switch (c)
1945 case '\n':
1946 if (syntax & RE_NEWLINE_ALT)
1947 token->type = OP_ALT;
1948 break;
1949 case '|':
1950 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1951 token->type = OP_ALT;
1952 break;
1953 case '*':
1954 token->type = OP_DUP_ASTERISK;
1955 break;
1956 case '+':
1957 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1958 token->type = OP_DUP_PLUS;
1959 break;
1960 case '?':
1961 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1962 token->type = OP_DUP_QUESTION;
1963 break;
1964 case '{':
1965 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1966 token->type = OP_OPEN_DUP_NUM;
1967 break;
1968 case '}':
1969 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1970 token->type = OP_CLOSE_DUP_NUM;
1971 break;
1972 case '(':
1973 if (syntax & RE_NO_BK_PARENS)
1974 token->type = OP_OPEN_SUBEXP;
1975 break;
1976 case ')':
1977 if (syntax & RE_NO_BK_PARENS)
1978 token->type = OP_CLOSE_SUBEXP;
1979 break;
1980 case '[':
1981 token->type = OP_OPEN_BRACKET;
1982 break;
1983 case '.':
1984 token->type = OP_PERIOD;
1985 break;
1986 case '^':
1987 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1988 re_string_cur_idx (input) != 0)
1990 char prev = re_string_peek_byte (input, -1);
1991 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1992 break;
1994 token->type = ANCHOR;
1995 token->opr.ctx_type = LINE_FIRST;
1996 break;
1997 case '$':
1998 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1999 re_string_cur_idx (input) + 1 != re_string_length (input))
2001 re_token_t next;
2002 re_string_skip_bytes (input, 1);
2003 peek_token (&next, input, syntax);
2004 re_string_skip_bytes (input, -1);
2005 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2006 break;
2008 token->type = ANCHOR;
2009 token->opr.ctx_type = LINE_LAST;
2010 break;
2011 default:
2012 break;
2014 return 1;
2017 /* Peek a token from INPUT, and return the length of the token.
2018 We must not use this function out of bracket expressions. */
2020 static int
2021 peek_token_bracket (token, input, syntax)
2022 re_token_t *token;
2023 re_string_t *input;
2024 reg_syntax_t syntax;
2026 unsigned char c;
2027 if (re_string_eoi (input))
2029 token->type = END_OF_RE;
2030 return 0;
2032 c = re_string_peek_byte (input, 0);
2033 token->opr.c = c;
2035 #ifdef RE_ENABLE_I18N
2036 if (input->mb_cur_max > 1 &&
2037 !re_string_first_byte (input, re_string_cur_idx (input)))
2039 token->type = CHARACTER;
2040 return 1;
2042 #endif /* RE_ENABLE_I18N */
2044 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2045 && re_string_cur_idx (input) + 1 < re_string_length (input))
2047 /* In this case, '\' escape a character. */
2048 unsigned char c2;
2049 re_string_skip_bytes (input, 1);
2050 c2 = re_string_peek_byte (input, 0);
2051 token->opr.c = c2;
2052 token->type = CHARACTER;
2053 return 1;
2055 if (c == '[') /* '[' is a special char in a bracket exps. */
2057 unsigned char c2;
2058 int token_len;
2059 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2060 c2 = re_string_peek_byte (input, 1);
2061 else
2062 c2 = 0;
2063 token->opr.c = c2;
2064 token_len = 2;
2065 switch (c2)
2067 case '.':
2068 token->type = OP_OPEN_COLL_ELEM;
2069 break;
2070 case '=':
2071 token->type = OP_OPEN_EQUIV_CLASS;
2072 break;
2073 case ':':
2074 if (syntax & RE_CHAR_CLASSES)
2076 token->type = OP_OPEN_CHAR_CLASS;
2077 break;
2079 /* else fall through. */
2080 default:
2081 token->type = CHARACTER;
2082 token->opr.c = c;
2083 token_len = 1;
2084 break;
2086 return token_len;
2088 switch (c)
2090 case '-':
2091 token->type = OP_CHARSET_RANGE;
2092 break;
2093 case ']':
2094 token->type = OP_CLOSE_BRACKET;
2095 break;
2096 case '^':
2097 token->type = OP_NON_MATCH_LIST;
2098 break;
2099 default:
2100 token->type = CHARACTER;
2102 return 1;
2105 /* Functions for parser. */
2107 /* Entry point of the parser.
2108 Parse the regular expression REGEXP and return the structure tree.
2109 If an error is occured, ERR is set by error code, and return NULL.
2110 This function build the following tree, from regular expression <reg_exp>:
2114 <reg_exp> EOR
2116 CAT means concatenation.
2117 EOR means end of regular expression. */
2119 static bin_tree_t *
2120 parse (regexp, preg, syntax, err)
2121 re_string_t *regexp;
2122 regex_t *preg;
2123 reg_syntax_t syntax;
2124 reg_errcode_t *err;
2126 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2127 bin_tree_t *tree, *eor, *root;
2128 re_token_t current_token;
2129 dfa->syntax = syntax;
2130 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2131 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2132 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2133 return NULL;
2134 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2135 if (tree != NULL)
2136 root = create_tree (dfa, tree, eor, CONCAT);
2137 else
2138 root = eor;
2139 if (BE (eor == NULL || root == NULL, 0))
2141 *err = REG_ESPACE;
2142 return NULL;
2144 return root;
2147 /* This function build the following tree, from regular expression
2148 <branch1>|<branch2>:
2152 <branch1> <branch2>
2154 ALT means alternative, which represents the operator `|'. */
2156 static bin_tree_t *
2157 parse_reg_exp (regexp, preg, token, syntax, nest, err)
2158 re_string_t *regexp;
2159 regex_t *preg;
2160 re_token_t *token;
2161 reg_syntax_t syntax;
2162 int nest;
2163 reg_errcode_t *err;
2165 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2166 bin_tree_t *tree, *branch = NULL;
2167 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2168 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2169 return NULL;
2171 while (token->type == OP_ALT)
2173 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2174 if (token->type != OP_ALT && token->type != END_OF_RE
2175 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2177 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2178 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2179 return NULL;
2181 else
2182 branch = NULL;
2183 tree = create_tree (dfa, tree, branch, OP_ALT);
2184 if (BE (tree == NULL, 0))
2186 *err = REG_ESPACE;
2187 return NULL;
2190 return tree;
2193 /* This function build the following tree, from regular expression
2194 <exp1><exp2>:
2198 <exp1> <exp2>
2200 CAT means concatenation. */
2202 static bin_tree_t *
2203 parse_branch (regexp, preg, token, syntax, nest, err)
2204 re_string_t *regexp;
2205 regex_t *preg;
2206 re_token_t *token;
2207 reg_syntax_t syntax;
2208 int nest;
2209 reg_errcode_t *err;
2211 bin_tree_t *tree, *exp;
2212 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2213 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2214 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2215 return NULL;
2217 while (token->type != OP_ALT && token->type != END_OF_RE
2218 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2220 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2221 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2223 return NULL;
2225 if (tree != NULL && exp != NULL)
2227 tree = create_tree (dfa, tree, exp, CONCAT);
2228 if (tree == NULL)
2230 *err = REG_ESPACE;
2231 return NULL;
2234 else if (tree == NULL)
2235 tree = exp;
2236 /* Otherwise exp == NULL, we don't need to create new tree. */
2238 return tree;
2241 /* This function build the following tree, from regular expression a*:
2247 static bin_tree_t *
2248 parse_expression (regexp, preg, token, syntax, nest, err)
2249 re_string_t *regexp;
2250 regex_t *preg;
2251 re_token_t *token;
2252 reg_syntax_t syntax;
2253 int nest;
2254 reg_errcode_t *err;
2256 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2257 bin_tree_t *tree;
2258 switch (token->type)
2260 case CHARACTER:
2261 tree = create_token_tree (dfa, NULL, NULL, token);
2262 if (BE (tree == NULL, 0))
2264 *err = REG_ESPACE;
2265 return NULL;
2267 #ifdef RE_ENABLE_I18N
2268 if (dfa->mb_cur_max > 1)
2270 while (!re_string_eoi (regexp)
2271 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2273 bin_tree_t *mbc_remain;
2274 fetch_token (token, regexp, syntax);
2275 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2276 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2277 if (BE (mbc_remain == NULL || tree == NULL, 0))
2279 *err = REG_ESPACE;
2280 return NULL;
2284 #endif
2285 break;
2286 case OP_OPEN_SUBEXP:
2287 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2288 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2289 return NULL;
2290 break;
2291 case OP_OPEN_BRACKET:
2292 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2293 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2294 return NULL;
2295 break;
2296 case OP_BACK_REF:
2297 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2299 *err = REG_ESUBREG;
2300 return NULL;
2302 dfa->used_bkref_map |= 1 << token->opr.idx;
2303 tree = create_token_tree (dfa, NULL, NULL, token);
2304 if (BE (tree == NULL, 0))
2306 *err = REG_ESPACE;
2307 return NULL;
2309 ++dfa->nbackref;
2310 dfa->has_mb_node = 1;
2311 break;
2312 case OP_OPEN_DUP_NUM:
2313 if (syntax & RE_CONTEXT_INVALID_DUP)
2315 *err = REG_BADRPT;
2316 return NULL;
2318 /* FALLTHROUGH */
2319 case OP_DUP_ASTERISK:
2320 case OP_DUP_PLUS:
2321 case OP_DUP_QUESTION:
2322 if (syntax & RE_CONTEXT_INVALID_OPS)
2324 *err = REG_BADRPT;
2325 return NULL;
2327 else if (syntax & RE_CONTEXT_INDEP_OPS)
2329 fetch_token (token, regexp, syntax);
2330 return parse_expression (regexp, preg, token, syntax, nest, err);
2332 /* else fall through */
2333 case OP_CLOSE_SUBEXP:
2334 if ((token->type == OP_CLOSE_SUBEXP) &&
2335 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2337 *err = REG_ERPAREN;
2338 return NULL;
2340 /* else fall through */
2341 case OP_CLOSE_DUP_NUM:
2342 /* We treat it as a normal character. */
2344 /* Then we can these characters as normal characters. */
2345 token->type = CHARACTER;
2346 /* mb_partial and word_char bits should be initialized already
2347 by peek_token. */
2348 tree = create_token_tree (dfa, NULL, NULL, token);
2349 if (BE (tree == NULL, 0))
2351 *err = REG_ESPACE;
2352 return NULL;
2354 break;
2355 case ANCHOR:
2356 if ((token->opr.ctx_type
2357 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2358 && dfa->word_ops_used == 0)
2359 init_word_char (dfa);
2360 if (token->opr.ctx_type == WORD_DELIM
2361 || token->opr.ctx_type == NOT_WORD_DELIM)
2363 bin_tree_t *tree_first, *tree_last;
2364 if (token->opr.ctx_type == WORD_DELIM)
2366 token->opr.ctx_type = WORD_FIRST;
2367 tree_first = create_token_tree (dfa, NULL, NULL, token);
2368 token->opr.ctx_type = WORD_LAST;
2370 else
2372 token->opr.ctx_type = INSIDE_WORD;
2373 tree_first = create_token_tree (dfa, NULL, NULL, token);
2374 token->opr.ctx_type = INSIDE_NOTWORD;
2376 tree_last = create_token_tree (dfa, NULL, NULL, token);
2377 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2378 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2380 *err = REG_ESPACE;
2381 return NULL;
2384 else
2386 tree = create_token_tree (dfa, NULL, NULL, token);
2387 if (BE (tree == NULL, 0))
2389 *err = REG_ESPACE;
2390 return NULL;
2393 /* We must return here, since ANCHORs can't be followed
2394 by repetition operators.
2395 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2396 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2397 fetch_token (token, regexp, syntax);
2398 return tree;
2399 case OP_PERIOD:
2400 tree = create_token_tree (dfa, NULL, NULL, token);
2401 if (BE (tree == NULL, 0))
2403 *err = REG_ESPACE;
2404 return NULL;
2406 if (dfa->mb_cur_max > 1)
2407 dfa->has_mb_node = 1;
2408 break;
2409 case OP_WORD:
2410 case OP_NOTWORD:
2411 tree = build_charclass_op (dfa, regexp->trans,
2412 (const unsigned char *) "alnum",
2413 (const unsigned char *) "_",
2414 token->type == OP_NOTWORD, err);
2415 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2416 return NULL;
2417 break;
2418 case OP_SPACE:
2419 case OP_NOTSPACE:
2420 tree = build_charclass_op (dfa, regexp->trans,
2421 (const unsigned char *) "space",
2422 (const unsigned char *) "",
2423 token->type == OP_NOTSPACE, err);
2424 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2425 return NULL;
2426 break;
2427 case OP_ALT:
2428 case END_OF_RE:
2429 return NULL;
2430 case BACK_SLASH:
2431 *err = REG_EESCAPE;
2432 return NULL;
2433 default:
2434 /* Must not happen? */
2435 #ifdef DEBUG
2436 assert (0);
2437 #endif
2438 return NULL;
2440 fetch_token (token, regexp, syntax);
2442 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2443 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2445 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2446 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2447 return NULL;
2448 /* In BRE consecutive duplications are not allowed. */
2449 if ((syntax & RE_CONTEXT_INVALID_DUP)
2450 && (token->type == OP_DUP_ASTERISK
2451 || token->type == OP_OPEN_DUP_NUM))
2453 *err = REG_BADRPT;
2454 return NULL;
2458 return tree;
2461 /* This function build the following tree, from regular expression
2462 (<reg_exp>):
2463 SUBEXP
2465 <reg_exp>
2468 static bin_tree_t *
2469 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2470 re_string_t *regexp;
2471 regex_t *preg;
2472 re_token_t *token;
2473 reg_syntax_t syntax;
2474 int nest;
2475 reg_errcode_t *err;
2477 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2478 bin_tree_t *tree;
2479 size_t cur_nsub;
2480 cur_nsub = preg->re_nsub++;
2482 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2484 /* The subexpression may be a null string. */
2485 if (token->type == OP_CLOSE_SUBEXP)
2486 tree = NULL;
2487 else
2489 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2490 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2491 *err = REG_EPAREN;
2492 if (BE (*err != REG_NOERROR, 0))
2493 return NULL;
2495 dfa->completed_bkref_map |= 1 << cur_nsub;
2497 tree = create_tree (dfa, tree, NULL, SUBEXP);
2498 if (BE (tree == NULL, 0))
2500 *err = REG_ESPACE;
2501 return NULL;
2503 tree->token.opr.idx = cur_nsub;
2504 return tree;
2507 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2509 static bin_tree_t *
2510 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2511 bin_tree_t *elem;
2512 re_string_t *regexp;
2513 re_dfa_t *dfa;
2514 re_token_t *token;
2515 reg_syntax_t syntax;
2516 reg_errcode_t *err;
2518 bin_tree_t *tree = NULL, *old_tree = NULL;
2519 int i, start, end, start_idx = re_string_cur_idx (regexp);
2520 re_token_t start_token = *token;
2522 if (token->type == OP_OPEN_DUP_NUM)
2524 end = 0;
2525 start = fetch_number (regexp, token, syntax);
2526 if (start == -1)
2528 if (token->type == CHARACTER && token->opr.c == ',')
2529 start = 0; /* We treat "{,m}" as "{0,m}". */
2530 else
2532 *err = REG_BADBR; /* <re>{} is invalid. */
2533 return NULL;
2536 if (BE (start != -2, 1))
2538 /* We treat "{n}" as "{n,n}". */
2539 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2540 : ((token->type == CHARACTER && token->opr.c == ',')
2541 ? fetch_number (regexp, token, syntax) : -2));
2543 if (BE (start == -2 || end == -2, 0))
2545 /* Invalid sequence. */
2546 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2548 if (token->type == END_OF_RE)
2549 *err = REG_EBRACE;
2550 else
2551 *err = REG_BADBR;
2553 return NULL;
2556 /* If the syntax bit is set, rollback. */
2557 re_string_set_index (regexp, start_idx);
2558 *token = start_token;
2559 token->type = CHARACTER;
2560 /* mb_partial and word_char bits should be already initialized by
2561 peek_token. */
2562 return elem;
2565 if (BE (end != -1 && start > end, 0))
2567 /* First number greater than second. */
2568 *err = REG_BADBR;
2569 return NULL;
2572 else
2574 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2575 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2578 fetch_token (token, regexp, syntax);
2580 if (BE (elem == NULL, 0))
2581 return NULL;
2582 if (BE (start == 0 && end == 0, 0))
2584 postorder (elem, free_tree, NULL);
2585 return NULL;
2588 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2589 if (BE (start > 0, 0))
2591 tree = elem;
2592 for (i = 2; i <= start; ++i)
2594 elem = duplicate_tree (elem, dfa);
2595 tree = create_tree (dfa, tree, elem, CONCAT);
2596 if (BE (elem == NULL || tree == NULL, 0))
2597 goto parse_dup_op_espace;
2600 if (start == end)
2601 return tree;
2603 /* Duplicate ELEM before it is marked optional. */
2604 elem = duplicate_tree (elem, dfa);
2605 old_tree = tree;
2607 else
2608 old_tree = NULL;
2610 if (elem->token.type == SUBEXP)
2611 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2613 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2614 if (BE (tree == NULL, 0))
2615 goto parse_dup_op_espace;
2617 /* This loop is actually executed only when end != -1,
2618 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2619 already created the start+1-th copy. */
2620 for (i = start + 2; i <= end; ++i)
2622 elem = duplicate_tree (elem, dfa);
2623 tree = create_tree (dfa, tree, elem, CONCAT);
2624 if (BE (elem == NULL || tree == NULL, 0))
2625 goto parse_dup_op_espace;
2627 tree = create_tree (dfa, tree, NULL, OP_ALT);
2628 if (BE (tree == NULL, 0))
2629 goto parse_dup_op_espace;
2632 if (old_tree)
2633 tree = create_tree (dfa, old_tree, tree, CONCAT);
2635 return tree;
2637 parse_dup_op_espace:
2638 *err = REG_ESPACE;
2639 return NULL;
2642 /* Size of the names for collating symbol/equivalence_class/character_class.
2643 I'm not sure, but maybe enough. */
2644 #define BRACKET_NAME_BUF_SIZE 32
2646 #ifndef _LIBC
2647 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2648 Build the range expression which starts from START_ELEM, and ends
2649 at END_ELEM. The result are written to MBCSET and SBCSET.
2650 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2651 mbcset->range_ends, is a pointer argument sinse we may
2652 update it. */
2654 static reg_errcode_t
2655 # ifdef RE_ENABLE_I18N
2656 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2657 re_charset_t *mbcset;
2658 int *range_alloc;
2659 # else /* not RE_ENABLE_I18N */
2660 build_range_exp (sbcset, start_elem, end_elem)
2661 # endif /* not RE_ENABLE_I18N */
2662 re_bitset_ptr_t sbcset;
2663 bracket_elem_t *start_elem, *end_elem;
2665 unsigned int start_ch, end_ch;
2666 /* Equivalence Classes and Character Classes can't be a range start/end. */
2667 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2668 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2670 return REG_ERANGE;
2672 /* We can handle no multi character collating elements without libc
2673 support. */
2674 if (BE ((start_elem->type == COLL_SYM
2675 && strlen ((char *) start_elem->opr.name) > 1)
2676 || (end_elem->type == COLL_SYM
2677 && strlen ((char *) end_elem->opr.name) > 1), 0))
2678 return REG_ECOLLATE;
2680 # ifdef RE_ENABLE_I18N
2682 wchar_t wc, start_wc, end_wc;
2683 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2685 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2686 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2687 : 0));
2688 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2689 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2690 : 0));
2691 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2692 ? __btowc (start_ch) : start_elem->opr.wch);
2693 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2694 ? __btowc (end_ch) : end_elem->opr.wch);
2695 if (start_wc == WEOF || end_wc == WEOF)
2696 return REG_ECOLLATE;
2697 cmp_buf[0] = start_wc;
2698 cmp_buf[4] = end_wc;
2699 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2700 return REG_ERANGE;
2702 /* Got valid collation sequence values, add them as a new entry.
2703 However, for !_LIBC we have no collation elements: if the
2704 character set is single byte, the single byte character set
2705 that we build below suffices. parse_bracket_exp passes
2706 no MBCSET if dfa->mb_cur_max == 1. */
2707 if (mbcset)
2709 /* Check the space of the arrays. */
2710 if (BE (*range_alloc == mbcset->nranges, 0))
2712 /* There is not enough space, need realloc. */
2713 wchar_t *new_array_start, *new_array_end;
2714 int new_nranges;
2716 /* +1 in case of mbcset->nranges is 0. */
2717 new_nranges = 2 * mbcset->nranges + 1;
2718 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2719 are NULL if *range_alloc == 0. */
2720 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2721 new_nranges);
2722 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2723 new_nranges);
2725 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2726 return REG_ESPACE;
2728 mbcset->range_starts = new_array_start;
2729 mbcset->range_ends = new_array_end;
2730 *range_alloc = new_nranges;
2733 mbcset->range_starts[mbcset->nranges] = start_wc;
2734 mbcset->range_ends[mbcset->nranges++] = end_wc;
2737 /* Build the table for single byte characters. */
2738 for (wc = 0; wc < SBC_MAX; ++wc)
2740 cmp_buf[2] = wc;
2741 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2742 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2743 bitset_set (sbcset, wc);
2746 # else /* not RE_ENABLE_I18N */
2748 unsigned int ch;
2749 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2750 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2751 : 0));
2752 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2753 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2754 : 0));
2755 if (start_ch > end_ch)
2756 return REG_ERANGE;
2757 /* Build the table for single byte characters. */
2758 for (ch = 0; ch < SBC_MAX; ++ch)
2759 if (start_ch <= ch && ch <= end_ch)
2760 bitset_set (sbcset, ch);
2762 # endif /* not RE_ENABLE_I18N */
2763 return REG_NOERROR;
2765 #endif /* not _LIBC */
2767 #ifndef _LIBC
2768 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2769 Build the collating element which is represented by NAME.
2770 The result are written to MBCSET and SBCSET.
2771 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2772 pointer argument since we may update it. */
2774 static reg_errcode_t
2775 # ifdef RE_ENABLE_I18N
2776 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2777 re_charset_t *mbcset;
2778 int *coll_sym_alloc;
2779 # else /* not RE_ENABLE_I18N */
2780 build_collating_symbol (sbcset, name)
2781 # endif /* not RE_ENABLE_I18N */
2782 re_bitset_ptr_t sbcset;
2783 const unsigned char *name;
2785 size_t name_len = strlen ((const char *) name);
2786 if (BE (name_len != 1, 0))
2787 return REG_ECOLLATE;
2788 else
2790 bitset_set (sbcset, name[0]);
2791 return REG_NOERROR;
2794 #endif /* not _LIBC */
2796 /* This function parse bracket expression like "[abc]", "[a-c]",
2797 "[[.a-a.]]" etc. */
2799 static bin_tree_t *
2800 parse_bracket_exp (regexp, dfa, token, syntax, err)
2801 re_string_t *regexp;
2802 re_dfa_t *dfa;
2803 re_token_t *token;
2804 reg_syntax_t syntax;
2805 reg_errcode_t *err;
2807 #ifdef _LIBC
2808 const unsigned char *collseqmb;
2809 const char *collseqwc;
2810 uint32_t nrules;
2811 int32_t table_size;
2812 const int32_t *symb_table;
2813 const unsigned char *extra;
2815 /* Local function for parse_bracket_exp used in _LIBC environement.
2816 Seek the collating symbol entry correspondings to NAME.
2817 Return the index of the symbol in the SYMB_TABLE. */
2819 auto inline int32_t
2820 __attribute ((always_inline))
2821 seek_collating_symbol_entry (name, name_len)
2822 const unsigned char *name;
2823 size_t name_len;
2825 int32_t hash = elem_hash ((const char *) name, name_len);
2826 int32_t elem = hash % table_size;
2827 int32_t second = hash % (table_size - 2);
2828 while (symb_table[2 * elem] != 0)
2830 /* First compare the hashing value. */
2831 if (symb_table[2 * elem] == hash
2832 /* Compare the length of the name. */
2833 && name_len == extra[symb_table[2 * elem + 1]]
2834 /* Compare the name. */
2835 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2836 name_len) == 0)
2838 /* Yep, this is the entry. */
2839 break;
2842 /* Next entry. */
2843 elem += second;
2845 return elem;
2848 /* Local function for parse_bracket_exp used in _LIBC environement.
2849 Look up the collation sequence value of BR_ELEM.
2850 Return the value if succeeded, UINT_MAX otherwise. */
2852 auto inline unsigned int
2853 __attribute ((always_inline))
2854 lookup_collation_sequence_value (br_elem)
2855 bracket_elem_t *br_elem;
2857 if (br_elem->type == SB_CHAR)
2860 if (MB_CUR_MAX == 1)
2862 if (nrules == 0)
2863 return collseqmb[br_elem->opr.ch];
2864 else
2866 wint_t wc = __btowc (br_elem->opr.ch);
2867 return __collseq_table_lookup (collseqwc, wc);
2870 else if (br_elem->type == MB_CHAR)
2872 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2874 else if (br_elem->type == COLL_SYM)
2876 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2877 if (nrules != 0)
2879 int32_t elem, idx;
2880 elem = seek_collating_symbol_entry (br_elem->opr.name,
2881 sym_name_len);
2882 if (symb_table[2 * elem] != 0)
2884 /* We found the entry. */
2885 idx = symb_table[2 * elem + 1];
2886 /* Skip the name of collating element name. */
2887 idx += 1 + extra[idx];
2888 /* Skip the byte sequence of the collating element. */
2889 idx += 1 + extra[idx];
2890 /* Adjust for the alignment. */
2891 idx = (idx + 3) & ~3;
2892 /* Skip the multibyte collation sequence value. */
2893 idx += sizeof (unsigned int);
2894 /* Skip the wide char sequence of the collating element. */
2895 idx += sizeof (unsigned int) *
2896 (1 + *(unsigned int *) (extra + idx));
2897 /* Return the collation sequence value. */
2898 return *(unsigned int *) (extra + idx);
2900 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2902 /* No valid character. Match it as a single byte
2903 character. */
2904 return collseqmb[br_elem->opr.name[0]];
2907 else if (sym_name_len == 1)
2908 return collseqmb[br_elem->opr.name[0]];
2910 return UINT_MAX;
2913 /* Local function for parse_bracket_exp used in _LIBC environement.
2914 Build the range expression which starts from START_ELEM, and ends
2915 at END_ELEM. The result are written to MBCSET and SBCSET.
2916 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2917 mbcset->range_ends, is a pointer argument sinse we may
2918 update it. */
2920 auto inline reg_errcode_t
2921 __attribute ((always_inline))
2922 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2923 re_charset_t *mbcset;
2924 int *range_alloc;
2925 re_bitset_ptr_t sbcset;
2926 bracket_elem_t *start_elem, *end_elem;
2928 unsigned int ch;
2929 uint32_t start_collseq;
2930 uint32_t end_collseq;
2932 /* Equivalence Classes and Character Classes can't be a range
2933 start/end. */
2934 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2935 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2937 return REG_ERANGE;
2939 start_collseq = lookup_collation_sequence_value (start_elem);
2940 end_collseq = lookup_collation_sequence_value (end_elem);
2941 /* Check start/end collation sequence values. */
2942 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2943 return REG_ECOLLATE;
2944 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2945 return REG_ERANGE;
2947 /* Got valid collation sequence values, add them as a new entry.
2948 However, if we have no collation elements, and the character set
2949 is single byte, the single byte character set that we
2950 build below suffices. */
2951 if (nrules > 0 || dfa->mb_cur_max > 1)
2953 /* Check the space of the arrays. */
2954 if (BE (*range_alloc == mbcset->nranges, 0))
2956 /* There is not enough space, need realloc. */
2957 uint32_t *new_array_start;
2958 uint32_t *new_array_end;
2959 int new_nranges;
2961 /* +1 in case of mbcset->nranges is 0. */
2962 new_nranges = 2 * mbcset->nranges + 1;
2963 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2964 new_nranges);
2965 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2966 new_nranges);
2968 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2969 return REG_ESPACE;
2971 mbcset->range_starts = new_array_start;
2972 mbcset->range_ends = new_array_end;
2973 *range_alloc = new_nranges;
2976 mbcset->range_starts[mbcset->nranges] = start_collseq;
2977 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2980 /* Build the table for single byte characters. */
2981 for (ch = 0; ch < SBC_MAX; ch++)
2983 uint32_t ch_collseq;
2985 if (MB_CUR_MAX == 1)
2987 if (nrules == 0)
2988 ch_collseq = collseqmb[ch];
2989 else
2990 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2991 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2992 bitset_set (sbcset, ch);
2994 return REG_NOERROR;
2997 /* Local function for parse_bracket_exp used in _LIBC environement.
2998 Build the collating element which is represented by NAME.
2999 The result are written to MBCSET and SBCSET.
3000 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3001 pointer argument sinse we may update it. */
3003 auto inline reg_errcode_t
3004 __attribute ((always_inline))
3005 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3006 re_charset_t *mbcset;
3007 int *coll_sym_alloc;
3008 re_bitset_ptr_t sbcset;
3009 const unsigned char *name;
3011 int32_t elem, idx;
3012 size_t name_len = strlen ((const char *) name);
3013 if (nrules != 0)
3015 elem = seek_collating_symbol_entry (name, name_len);
3016 if (symb_table[2 * elem] != 0)
3018 /* We found the entry. */
3019 idx = symb_table[2 * elem + 1];
3020 /* Skip the name of collating element name. */
3021 idx += 1 + extra[idx];
3023 else if (symb_table[2 * elem] == 0 && name_len == 1)
3025 /* No valid character, treat it as a normal
3026 character. */
3027 bitset_set (sbcset, name[0]);
3028 return REG_NOERROR;
3030 else
3031 return REG_ECOLLATE;
3033 /* Got valid collation sequence, add it as a new entry. */
3034 /* Check the space of the arrays. */
3035 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3037 /* Not enough, realloc it. */
3038 /* +1 in case of mbcset->ncoll_syms is 0. */
3039 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3040 /* Use realloc since mbcset->coll_syms is NULL
3041 if *alloc == 0. */
3042 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3043 new_coll_sym_alloc);
3044 if (BE (new_coll_syms == NULL, 0))
3045 return REG_ESPACE;
3046 mbcset->coll_syms = new_coll_syms;
3047 *coll_sym_alloc = new_coll_sym_alloc;
3049 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3050 return REG_NOERROR;
3052 else
3054 if (BE (name_len != 1, 0))
3055 return REG_ECOLLATE;
3056 else
3058 bitset_set (sbcset, name[0]);
3059 return REG_NOERROR;
3063 #endif
3065 re_token_t br_token;
3066 re_bitset_ptr_t sbcset;
3067 #ifdef RE_ENABLE_I18N
3068 re_charset_t *mbcset;
3069 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3070 int equiv_class_alloc = 0, char_class_alloc = 0;
3071 #endif /* not RE_ENABLE_I18N */
3072 int non_match = 0;
3073 bin_tree_t *work_tree;
3074 int token_len;
3075 int first_round = 1;
3076 #ifdef _LIBC
3077 collseqmb = (const unsigned char *)
3078 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3079 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3080 if (nrules)
3083 if (MB_CUR_MAX > 1)
3085 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3086 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3087 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3088 _NL_COLLATE_SYMB_TABLEMB);
3089 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3090 _NL_COLLATE_SYMB_EXTRAMB);
3092 #endif
3093 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3094 #ifdef RE_ENABLE_I18N
3095 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3096 #endif /* RE_ENABLE_I18N */
3097 #ifdef RE_ENABLE_I18N
3098 if (BE (sbcset == NULL || mbcset == NULL, 0))
3099 #else
3100 if (BE (sbcset == NULL, 0))
3101 #endif /* RE_ENABLE_I18N */
3103 *err = REG_ESPACE;
3104 return NULL;
3107 token_len = peek_token_bracket (token, regexp, syntax);
3108 if (BE (token->type == END_OF_RE, 0))
3110 *err = REG_BADPAT;
3111 goto parse_bracket_exp_free_return;
3113 if (token->type == OP_NON_MATCH_LIST)
3115 #ifdef RE_ENABLE_I18N
3116 mbcset->non_match = 1;
3117 #endif /* not RE_ENABLE_I18N */
3118 non_match = 1;
3119 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3120 bitset_set (sbcset, '\0');
3121 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3122 token_len = peek_token_bracket (token, regexp, syntax);
3123 if (BE (token->type == END_OF_RE, 0))
3125 *err = REG_BADPAT;
3126 goto parse_bracket_exp_free_return;
3130 /* We treat the first ']' as a normal character. */
3131 if (token->type == OP_CLOSE_BRACKET)
3132 token->type = CHARACTER;
3134 while (1)
3136 bracket_elem_t start_elem, end_elem;
3137 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3138 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3139 reg_errcode_t ret;
3140 int token_len2 = 0, is_range_exp = 0;
3141 re_token_t token2;
3143 start_elem.opr.name = start_name_buf;
3144 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3145 syntax, first_round);
3146 if (BE (ret != REG_NOERROR, 0))
3148 *err = ret;
3149 goto parse_bracket_exp_free_return;
3151 first_round = 0;
3153 /* Get information about the next token. We need it in any case. */
3154 token_len = peek_token_bracket (token, regexp, syntax);
3156 /* Do not check for ranges if we know they are not allowed. */
3157 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3159 if (BE (token->type == END_OF_RE, 0))
3161 *err = REG_EBRACK;
3162 goto parse_bracket_exp_free_return;
3164 if (token->type == OP_CHARSET_RANGE)
3166 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3167 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3168 if (BE (token2.type == END_OF_RE, 0))
3170 *err = REG_EBRACK;
3171 goto parse_bracket_exp_free_return;
3173 if (token2.type == OP_CLOSE_BRACKET)
3175 /* We treat the last '-' as a normal character. */
3176 re_string_skip_bytes (regexp, -token_len);
3177 token->type = CHARACTER;
3179 else
3180 is_range_exp = 1;
3184 if (is_range_exp == 1)
3186 end_elem.opr.name = end_name_buf;
3187 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3188 dfa, syntax, 1);
3189 if (BE (ret != REG_NOERROR, 0))
3191 *err = ret;
3192 goto parse_bracket_exp_free_return;
3195 token_len = peek_token_bracket (token, regexp, syntax);
3197 #ifdef _LIBC
3198 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3199 &start_elem, &end_elem);
3200 #else
3201 # ifdef RE_ENABLE_I18N
3202 *err = build_range_exp (sbcset,
3203 dfa->mb_cur_max > 1 ? mbcset : NULL,
3204 &range_alloc, &start_elem, &end_elem);
3205 # else
3206 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3207 # endif
3208 #endif /* RE_ENABLE_I18N */
3209 if (BE (*err != REG_NOERROR, 0))
3210 goto parse_bracket_exp_free_return;
3212 else
3214 switch (start_elem.type)
3216 case SB_CHAR:
3217 bitset_set (sbcset, start_elem.opr.ch);
3218 break;
3219 #ifdef RE_ENABLE_I18N
3220 case MB_CHAR:
3221 /* Check whether the array has enough space. */
3222 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3224 wchar_t *new_mbchars;
3225 /* Not enough, realloc it. */
3226 /* +1 in case of mbcset->nmbchars is 0. */
3227 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3228 /* Use realloc since array is NULL if *alloc == 0. */
3229 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3230 mbchar_alloc);
3231 if (BE (new_mbchars == NULL, 0))
3232 goto parse_bracket_exp_espace;
3233 mbcset->mbchars = new_mbchars;
3235 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3236 break;
3237 #endif /* RE_ENABLE_I18N */
3238 case EQUIV_CLASS:
3239 *err = build_equiv_class (sbcset,
3240 #ifdef RE_ENABLE_I18N
3241 mbcset, &equiv_class_alloc,
3242 #endif /* RE_ENABLE_I18N */
3243 start_elem.opr.name);
3244 if (BE (*err != REG_NOERROR, 0))
3245 goto parse_bracket_exp_free_return;
3246 break;
3247 case COLL_SYM:
3248 *err = build_collating_symbol (sbcset,
3249 #ifdef RE_ENABLE_I18N
3250 mbcset, &coll_sym_alloc,
3251 #endif /* RE_ENABLE_I18N */
3252 start_elem.opr.name);
3253 if (BE (*err != REG_NOERROR, 0))
3254 goto parse_bracket_exp_free_return;
3255 break;
3256 case CHAR_CLASS:
3257 *err = build_charclass (regexp->trans, sbcset,
3258 #ifdef RE_ENABLE_I18N
3259 mbcset, &char_class_alloc,
3260 #endif /* RE_ENABLE_I18N */
3261 start_elem.opr.name, syntax);
3262 if (BE (*err != REG_NOERROR, 0))
3263 goto parse_bracket_exp_free_return;
3264 break;
3265 default:
3266 assert (0);
3267 break;
3270 if (BE (token->type == END_OF_RE, 0))
3272 *err = REG_EBRACK;
3273 goto parse_bracket_exp_free_return;
3275 if (token->type == OP_CLOSE_BRACKET)
3276 break;
3279 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3281 /* If it is non-matching list. */
3282 if (non_match)
3283 bitset_not (sbcset);
3285 #ifdef RE_ENABLE_I18N
3286 /* Ensure only single byte characters are set. */
3287 if (dfa->mb_cur_max > 1)
3288 bitset_mask (sbcset, dfa->sb_char);
3290 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3291 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3292 || mbcset->non_match)))
3294 bin_tree_t *mbc_tree;
3295 int sbc_idx;
3296 /* Build a tree for complex bracket. */
3297 dfa->has_mb_node = 1;
3298 br_token.type = COMPLEX_BRACKET;
3299 br_token.opr.mbcset = mbcset;
3300 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3301 if (BE (mbc_tree == NULL, 0))
3302 goto parse_bracket_exp_espace;
3303 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3304 if (sbcset[sbc_idx])
3305 break;
3306 /* If there are no bits set in sbcset, there is no point
3307 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3308 if (sbc_idx < BITSET_UINTS)
3310 /* Build a tree for simple bracket. */
3311 br_token.type = SIMPLE_BRACKET;
3312 br_token.opr.sbcset = sbcset;
3313 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3314 if (BE (work_tree == NULL, 0))
3315 goto parse_bracket_exp_espace;
3317 /* Then join them by ALT node. */
3318 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3319 if (BE (work_tree == NULL, 0))
3320 goto parse_bracket_exp_espace;
3322 else
3324 re_free (sbcset);
3325 work_tree = mbc_tree;
3328 else
3329 #endif /* not RE_ENABLE_I18N */
3331 #ifdef RE_ENABLE_I18N
3332 free_charset (mbcset);
3333 #endif
3334 /* Build a tree for simple bracket. */
3335 br_token.type = SIMPLE_BRACKET;
3336 br_token.opr.sbcset = sbcset;
3337 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3338 if (BE (work_tree == NULL, 0))
3339 goto parse_bracket_exp_espace;
3341 return work_tree;
3343 parse_bracket_exp_espace:
3344 *err = REG_ESPACE;
3345 parse_bracket_exp_free_return:
3346 re_free (sbcset);
3347 #ifdef RE_ENABLE_I18N
3348 free_charset (mbcset);
3349 #endif /* RE_ENABLE_I18N */
3350 return NULL;
3353 /* Parse an element in the bracket expression. */
3355 static reg_errcode_t
3356 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3357 accept_hyphen)
3358 bracket_elem_t *elem;
3359 re_string_t *regexp;
3360 re_token_t *token;
3361 int token_len;
3362 re_dfa_t *dfa;
3363 reg_syntax_t syntax;
3364 int accept_hyphen;
3366 #ifdef RE_ENABLE_I18N
3367 int cur_char_size;
3368 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3369 if (cur_char_size > 1)
3371 elem->type = MB_CHAR;
3372 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3373 re_string_skip_bytes (regexp, cur_char_size);
3374 return REG_NOERROR;
3376 #endif /* RE_ENABLE_I18N */
3377 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3378 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3379 || token->type == OP_OPEN_EQUIV_CLASS)
3380 return parse_bracket_symbol (elem, regexp, token);
3381 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3383 /* A '-' must only appear as anything but a range indicator before
3384 the closing bracket. Everything else is an error. */
3385 re_token_t token2;
3386 (void) peek_token_bracket (&token2, regexp, syntax);
3387 if (token2.type != OP_CLOSE_BRACKET)
3388 /* The actual error value is not standardized since this whole
3389 case is undefined. But ERANGE makes good sense. */
3390 return REG_ERANGE;
3392 elem->type = SB_CHAR;
3393 elem->opr.ch = token->opr.c;
3394 return REG_NOERROR;
3397 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3398 such as [:<character_class>:], [.<collating_element>.], and
3399 [=<equivalent_class>=]. */
3401 static reg_errcode_t
3402 parse_bracket_symbol (elem, regexp, token)
3403 bracket_elem_t *elem;
3404 re_string_t *regexp;
3405 re_token_t *token;
3407 unsigned char ch, delim = token->opr.c;
3408 int i = 0;
3409 if (re_string_eoi(regexp))
3410 return REG_EBRACK;
3411 for (;; ++i)
3413 if (i >= BRACKET_NAME_BUF_SIZE)
3414 return REG_EBRACK;
3415 if (token->type == OP_OPEN_CHAR_CLASS)
3416 ch = re_string_fetch_byte_case (regexp);
3417 else
3418 ch = re_string_fetch_byte (regexp);
3419 if (re_string_eoi(regexp))
3420 return REG_EBRACK;
3421 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3422 break;
3423 elem->opr.name[i] = ch;
3425 re_string_skip_bytes (regexp, 1);
3426 elem->opr.name[i] = '\0';
3427 switch (token->type)
3429 case OP_OPEN_COLL_ELEM:
3430 elem->type = COLL_SYM;
3431 break;
3432 case OP_OPEN_EQUIV_CLASS:
3433 elem->type = EQUIV_CLASS;
3434 break;
3435 case OP_OPEN_CHAR_CLASS:
3436 elem->type = CHAR_CLASS;
3437 break;
3438 default:
3439 break;
3441 return REG_NOERROR;
3444 /* Helper function for parse_bracket_exp.
3445 Build the equivalence class which is represented by NAME.
3446 The result are written to MBCSET and SBCSET.
3447 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3448 is a pointer argument sinse we may update it. */
3450 static reg_errcode_t
3451 #ifdef RE_ENABLE_I18N
3452 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3453 re_charset_t *mbcset;
3454 int *equiv_class_alloc;
3455 #else /* not RE_ENABLE_I18N */
3456 build_equiv_class (sbcset, name)
3457 #endif /* not RE_ENABLE_I18N */
3458 re_bitset_ptr_t sbcset;
3459 const unsigned char *name;
3461 #if defined _LIBC
3462 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3463 if (nrules != 0)
3465 const int32_t *table, *indirect;
3466 const unsigned char *weights, *extra, *cp;
3467 unsigned char char_buf[2];
3468 int32_t idx1, idx2;
3469 unsigned int ch;
3470 size_t len;
3471 /* This #include defines a local function! */
3472 # include <locale/weight.h>
3473 /* Calculate the index for equivalence class. */
3474 cp = name;
3475 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3476 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3477 _NL_COLLATE_WEIGHTMB);
3478 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3479 _NL_COLLATE_EXTRAMB);
3480 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3481 _NL_COLLATE_INDIRECTMB);
3482 idx1 = findidx (&cp);
3483 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3484 /* This isn't a valid character. */
3485 return REG_ECOLLATE;
3487 /* Build single byte matcing table for this equivalence class. */
3488 char_buf[1] = (unsigned char) '\0';
3489 len = weights[idx1];
3490 for (ch = 0; ch < SBC_MAX; ++ch)
3492 char_buf[0] = ch;
3493 cp = char_buf;
3494 idx2 = findidx (&cp);
3496 idx2 = table[ch];
3498 if (idx2 == 0)
3499 /* This isn't a valid character. */
3500 continue;
3501 if (len == weights[idx2])
3503 int cnt = 0;
3504 while (cnt <= len &&
3505 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3506 ++cnt;
3508 if (cnt > len)
3509 bitset_set (sbcset, ch);
3512 /* Check whether the array has enough space. */
3513 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3515 /* Not enough, realloc it. */
3516 /* +1 in case of mbcset->nequiv_classes is 0. */
3517 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3518 /* Use realloc since the array is NULL if *alloc == 0. */
3519 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3520 int32_t,
3521 new_equiv_class_alloc);
3522 if (BE (new_equiv_classes == NULL, 0))
3523 return REG_ESPACE;
3524 mbcset->equiv_classes = new_equiv_classes;
3525 *equiv_class_alloc = new_equiv_class_alloc;
3527 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3529 else
3530 #endif /* _LIBC */
3532 if (BE (strlen ((const char *) name) != 1, 0))
3533 return REG_ECOLLATE;
3534 bitset_set (sbcset, *name);
3536 return REG_NOERROR;
3539 /* Helper function for parse_bracket_exp.
3540 Build the character class which is represented by NAME.
3541 The result are written to MBCSET and SBCSET.
3542 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3543 is a pointer argument sinse we may update it. */
3545 static reg_errcode_t
3546 #ifdef RE_ENABLE_I18N
3547 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3548 re_charset_t *mbcset;
3549 int *char_class_alloc;
3550 #else /* not RE_ENABLE_I18N */
3551 build_charclass (trans, sbcset, class_name, syntax)
3552 #endif /* not RE_ENABLE_I18N */
3553 unsigned RE_TRANSLATE_TYPE trans;
3554 re_bitset_ptr_t sbcset;
3555 const unsigned char *class_name;
3556 reg_syntax_t syntax;
3558 int i;
3559 const char *name = (const char *) class_name;
3561 /* In case of REG_ICASE "upper" and "lower" match the both of
3562 upper and lower cases. */
3563 if ((syntax & RE_ICASE)
3564 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3565 name = "alpha";
3567 #ifdef RE_ENABLE_I18N
3568 /* Check the space of the arrays. */
3569 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3571 /* Not enough, realloc it. */
3572 /* +1 in case of mbcset->nchar_classes is 0. */
3573 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3574 /* Use realloc since array is NULL if *alloc == 0. */
3575 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3576 new_char_class_alloc);
3577 if (BE (new_char_classes == NULL, 0))
3578 return REG_ESPACE;
3579 mbcset->char_classes = new_char_classes;
3580 *char_class_alloc = new_char_class_alloc;
3582 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3583 #endif /* RE_ENABLE_I18N */
3585 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3586 for (i = 0; i < SBC_MAX; ++i) \
3588 if (ctype_func (i)) \
3590 int ch = trans ? trans[i] : i; \
3591 bitset_set (sbcset, ch); \
3595 if (strcmp (name, "alnum") == 0)
3596 BUILD_CHARCLASS_LOOP (isalnum)
3597 else if (strcmp (name, "cntrl") == 0)
3598 BUILD_CHARCLASS_LOOP (iscntrl)
3599 else if (strcmp (name, "lower") == 0)
3600 BUILD_CHARCLASS_LOOP (islower)
3601 else if (strcmp (name, "space") == 0)
3602 BUILD_CHARCLASS_LOOP (isspace)
3603 else if (strcmp (name, "alpha") == 0)
3604 BUILD_CHARCLASS_LOOP (isalpha)
3605 else if (strcmp (name, "digit") == 0)
3606 BUILD_CHARCLASS_LOOP (isdigit)
3607 else if (strcmp (name, "print") == 0)
3608 BUILD_CHARCLASS_LOOP (isprint)
3609 else if (strcmp (name, "upper") == 0)
3610 BUILD_CHARCLASS_LOOP (isupper)
3611 else if (strcmp (name, "blank") == 0)
3612 BUILD_CHARCLASS_LOOP (isblank)
3613 else if (strcmp (name, "graph") == 0)
3614 BUILD_CHARCLASS_LOOP (isgraph)
3615 else if (strcmp (name, "punct") == 0)
3616 BUILD_CHARCLASS_LOOP (ispunct)
3617 else if (strcmp (name, "xdigit") == 0)
3618 BUILD_CHARCLASS_LOOP (isxdigit)
3619 else
3620 return REG_ECTYPE;
3622 return REG_NOERROR;
3625 static bin_tree_t *
3626 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3627 re_dfa_t *dfa;
3628 unsigned RE_TRANSLATE_TYPE trans;
3629 const unsigned char *class_name;
3630 const unsigned char *extra;
3631 int non_match;
3632 reg_errcode_t *err;
3634 re_bitset_ptr_t sbcset;
3635 #ifdef RE_ENABLE_I18N
3636 re_charset_t *mbcset;
3637 int alloc = 0;
3638 #endif /* not RE_ENABLE_I18N */
3639 reg_errcode_t ret;
3640 re_token_t br_token;
3641 bin_tree_t *tree;
3643 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3644 #ifdef RE_ENABLE_I18N
3645 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3646 #endif /* RE_ENABLE_I18N */
3648 #ifdef RE_ENABLE_I18N
3649 if (BE (sbcset == NULL || mbcset == NULL, 0))
3650 #else /* not RE_ENABLE_I18N */
3651 if (BE (sbcset == NULL, 0))
3652 #endif /* not RE_ENABLE_I18N */
3654 *err = REG_ESPACE;
3655 return NULL;
3658 if (non_match)
3660 #ifdef RE_ENABLE_I18N
3662 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3663 bitset_set(cset->sbcset, '\0');
3665 mbcset->non_match = 1;
3666 #endif /* not RE_ENABLE_I18N */
3669 /* We don't care the syntax in this case. */
3670 ret = build_charclass (trans, sbcset,
3671 #ifdef RE_ENABLE_I18N
3672 mbcset, &alloc,
3673 #endif /* RE_ENABLE_I18N */
3674 class_name, 0);
3676 if (BE (ret != REG_NOERROR, 0))
3678 re_free (sbcset);
3679 #ifdef RE_ENABLE_I18N
3680 free_charset (mbcset);
3681 #endif /* RE_ENABLE_I18N */
3682 *err = ret;
3683 return NULL;
3685 /* \w match '_' also. */
3686 for (; *extra; extra++)
3687 bitset_set (sbcset, *extra);
3689 /* If it is non-matching list. */
3690 if (non_match)
3691 bitset_not (sbcset);
3693 #ifdef RE_ENABLE_I18N
3694 /* Ensure only single byte characters are set. */
3695 if (dfa->mb_cur_max > 1)
3696 bitset_mask (sbcset, dfa->sb_char);
3697 #endif
3699 /* Build a tree for simple bracket. */
3700 br_token.type = SIMPLE_BRACKET;
3701 br_token.opr.sbcset = sbcset;
3702 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3703 if (BE (tree == NULL, 0))
3704 goto build_word_op_espace;
3706 #ifdef RE_ENABLE_I18N
3707 if (dfa->mb_cur_max > 1)
3709 bin_tree_t *mbc_tree;
3710 /* Build a tree for complex bracket. */
3711 br_token.type = COMPLEX_BRACKET;
3712 br_token.opr.mbcset = mbcset;
3713 dfa->has_mb_node = 1;
3714 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3715 if (BE (mbc_tree == NULL, 0))
3716 goto build_word_op_espace;
3717 /* Then join them by ALT node. */
3718 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3719 if (BE (mbc_tree != NULL, 1))
3720 return tree;
3722 else
3724 free_charset (mbcset);
3725 return tree;
3727 #else /* not RE_ENABLE_I18N */
3728 return tree;
3729 #endif /* not RE_ENABLE_I18N */
3731 build_word_op_espace:
3732 re_free (sbcset);
3733 #ifdef RE_ENABLE_I18N
3734 free_charset (mbcset);
3735 #endif /* RE_ENABLE_I18N */
3736 *err = REG_ESPACE;
3737 return NULL;
3740 /* This is intended for the expressions like "a{1,3}".
3741 Fetch a number from `input', and return the number.
3742 Return -1, if the number field is empty like "{,1}".
3743 Return -2, If an error is occured. */
3745 static int
3746 fetch_number (input, token, syntax)
3747 re_string_t *input;
3748 re_token_t *token;
3749 reg_syntax_t syntax;
3751 int num = -1;
3752 unsigned char c;
3753 while (1)
3755 fetch_token (token, input, syntax);
3756 c = token->opr.c;
3757 if (BE (token->type == END_OF_RE, 0))
3758 return -2;
3759 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3760 break;
3761 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3762 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3763 num = (num > RE_DUP_MAX) ? -2 : num;
3765 return num;
3768 #ifdef RE_ENABLE_I18N
3769 static void
3770 free_charset (re_charset_t *cset)
3772 re_free (cset->mbchars);
3773 # ifdef _LIBC
3774 re_free (cset->coll_syms);
3775 re_free (cset->equiv_classes);
3776 re_free (cset->range_starts);
3777 re_free (cset->range_ends);
3778 # endif
3779 re_free (cset->char_classes);
3780 re_free (cset);
3782 #endif /* RE_ENABLE_I18N */
3784 /* Functions for binary tree operation. */
3786 /* Create a tree node. */
3788 static bin_tree_t *
3789 create_tree (dfa, left, right, type)
3790 re_dfa_t *dfa;
3791 bin_tree_t *left;
3792 bin_tree_t *right;
3793 re_token_type_t type;
3795 re_token_t t;
3796 t.type = type;
3797 return create_token_tree (dfa, left, right, &t);
3800 static bin_tree_t *
3801 create_token_tree (dfa, left, right, token)
3802 re_dfa_t *dfa;
3803 bin_tree_t *left;
3804 bin_tree_t *right;
3805 const re_token_t *token;
3807 bin_tree_t *tree;
3808 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3810 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3812 if (storage == NULL)
3813 return NULL;
3814 storage->next = dfa->str_tree_storage;
3815 dfa->str_tree_storage = storage;
3816 dfa->str_tree_storage_idx = 0;
3818 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3820 tree->parent = NULL;
3821 tree->left = left;
3822 tree->right = right;
3823 tree->token = *token;
3824 tree->token.duplicated = 0;
3825 tree->token.opt_subexp = 0;
3826 tree->first = NULL;
3827 tree->next = NULL;
3828 tree->node_idx = -1;
3830 if (left != NULL)
3831 left->parent = tree;
3832 if (right != NULL)
3833 right->parent = tree;
3834 return tree;
3837 /* Mark the tree SRC as an optional subexpression.
3838 To be called from preorder or postorder. */
3840 static reg_errcode_t
3841 mark_opt_subexp (extra, node)
3842 void *extra;
3843 bin_tree_t *node;
3845 int idx = (int) (long) extra;
3846 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3847 node->token.opt_subexp = 1;
3849 return REG_NOERROR;
3852 /* Free the allocated memory inside NODE. */
3854 static void
3855 free_token (re_token_t *node)
3857 #ifdef RE_ENABLE_I18N
3858 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3859 free_charset (node->opr.mbcset);
3860 else
3861 #endif /* RE_ENABLE_I18N */
3862 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3863 re_free (node->opr.sbcset);
3866 /* Worker function for tree walking. Free the allocated memory inside NODE
3867 and its children. */
3869 static reg_errcode_t
3870 free_tree (void *extra, bin_tree_t *node)
3872 free_token (&node->token);
3873 return REG_NOERROR;
3877 /* Duplicate the node SRC, and return new node. This is a preorder
3878 visit similar to the one implemented by the generic visitor, but
3879 we need more infrastructure to maintain two parallel trees --- so,
3880 it's easier to duplicate. */
3882 static bin_tree_t *
3883 duplicate_tree (root, dfa)
3884 const bin_tree_t *root;
3885 re_dfa_t *dfa;
3887 const bin_tree_t *node;
3888 bin_tree_t *dup_root;
3889 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3891 for (node = root; ; )
3893 /* Create a new tree and link it back to the current parent. */
3894 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3895 if (*p_new == NULL)
3896 return NULL;
3897 (*p_new)->parent = dup_node;
3898 (*p_new)->token.duplicated = 1;
3899 dup_node = *p_new;
3901 /* Go to the left node, or up and to the right. */
3902 if (node->left)
3904 node = node->left;
3905 p_new = &dup_node->left;
3907 else
3909 const bin_tree_t *prev = NULL;
3910 while (node->right == prev || node->right == NULL)
3912 prev = node;
3913 node = node->parent;
3914 dup_node = dup_node->parent;
3915 if (!node)
3916 return dup_root;
3918 node = node->right;
3919 p_new = &dup_node->right;