* crypt/md5.h (MD5_DIGEST_SIZE, MD5_BLOCK_SIZE): New macros.
[glibc.git] / posix / regcomp.c
blobfde262b83ca1fcbff9d529f47c8d5382b89bfe3e
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 static void init_word_char (re_dfa_t *dfa);
28 #ifdef RE_ENABLE_I18N
29 static void free_charset (re_charset_t *cset);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 #ifdef RE_ENABLE_I18N
34 static void optimize_utf8 (re_dfa_t *dfa);
35 #endif
36 static reg_errcode_t analyze (regex_t *preg);
37 static reg_errcode_t preorder (bin_tree_t *root,
38 reg_errcode_t (fn (void *, bin_tree_t *)),
39 void *extra);
40 static reg_errcode_t postorder (bin_tree_t *root,
41 reg_errcode_t (fn (void *, bin_tree_t *)),
42 void *extra);
43 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
44 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
45 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
46 bin_tree_t *node);
47 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
48 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
49 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
50 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
51 int top_clone_node, int root_node,
52 unsigned int constraint);
53 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
54 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
55 unsigned int constraint);
56 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
57 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
58 int node, int root);
59 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
60 static int fetch_number (re_string_t *input, re_token_t *token,
61 reg_syntax_t syntax);
62 static void fetch_token (re_token_t *result, re_string_t *input,
63 reg_syntax_t syntax);
64 static int peek_token (re_token_t *token, re_string_t *input,
65 reg_syntax_t syntax);
66 static int peek_token_bracket (re_token_t *token, re_string_t *input,
67 reg_syntax_t syntax);
68 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
69 reg_syntax_t syntax, reg_errcode_t *err);
70 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
74 re_token_t *token, reg_syntax_t syntax,
75 int nest, reg_errcode_t *err);
76 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
77 re_token_t *token, reg_syntax_t syntax,
78 int nest, reg_errcode_t *err);
79 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
80 re_token_t *token, reg_syntax_t syntax,
81 int nest, reg_errcode_t *err);
82 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
83 re_dfa_t *dfa, re_token_t *token,
84 reg_syntax_t syntax, reg_errcode_t *err);
85 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
86 re_token_t *token, reg_syntax_t syntax,
87 reg_errcode_t *err);
88 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
89 re_string_t *regexp,
90 re_token_t *token, int token_len,
91 re_dfa_t *dfa,
92 reg_syntax_t syntax,
93 int accept_hyphen);
94 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
95 re_string_t *regexp,
96 re_token_t *token);
97 #ifndef _LIBC
98 # ifdef RE_ENABLE_I18N
99 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
100 re_charset_t *mbcset, int *range_alloc,
101 bracket_elem_t *start_elem,
102 bracket_elem_t *end_elem);
103 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
104 re_charset_t *mbcset,
105 int *coll_sym_alloc,
106 const unsigned char *name);
107 # else /* not RE_ENABLE_I18N */
108 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
109 bracket_elem_t *start_elem,
110 bracket_elem_t *end_elem);
111 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
112 const unsigned char *name);
113 # endif /* not RE_ENABLE_I18N */
114 #endif /* not _LIBC */
115 #ifdef RE_ENABLE_I18N
116 static reg_errcode_t build_equiv_class (bitset_t sbcset,
117 re_charset_t *mbcset,
118 int *equiv_class_alloc,
119 const unsigned char *name);
120 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
121 bitset_t sbcset,
122 re_charset_t *mbcset,
123 int *char_class_alloc,
124 const unsigned char *class_name,
125 reg_syntax_t syntax);
126 #else /* not RE_ENABLE_I18N */
127 static reg_errcode_t build_equiv_class (bitset_t sbcset,
128 const unsigned char *name);
129 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
130 bitset_t sbcset,
131 const unsigned char *class_name,
132 reg_syntax_t syntax);
133 #endif /* not RE_ENABLE_I18N */
134 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
135 RE_TRANSLATE_TYPE trans,
136 const unsigned char *class_name,
137 const unsigned char *extra,
138 int non_match, reg_errcode_t *err);
139 static bin_tree_t *create_tree (re_dfa_t *dfa,
140 bin_tree_t *left, bin_tree_t *right,
141 re_token_type_t type);
142 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
143 bin_tree_t *left, bin_tree_t *right,
144 const re_token_t *token);
145 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
146 static void free_token (re_token_t *node);
147 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
148 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
150 /* This table gives an error message for each of the error codes listed
151 in regex.h. Obviously the order here has to be same as there.
152 POSIX doesn't require that we do anything for REG_NOERROR,
153 but why not be nice? */
155 const char __re_error_msgid[] attribute_hidden =
157 #define REG_NOERROR_IDX 0
158 gettext_noop ("Success") /* REG_NOERROR */
159 "\0"
160 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
161 gettext_noop ("No match") /* REG_NOMATCH */
162 "\0"
163 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
164 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
165 "\0"
166 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
167 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
168 "\0"
169 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
170 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
171 "\0"
172 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
173 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
174 "\0"
175 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
176 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
177 "\0"
178 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
179 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
180 "\0"
181 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
182 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
183 "\0"
184 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
185 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
186 "\0"
187 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
188 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
189 "\0"
190 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
191 gettext_noop ("Invalid range end") /* REG_ERANGE */
192 "\0"
193 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
194 gettext_noop ("Memory exhausted") /* REG_ESPACE */
195 "\0"
196 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
197 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
198 "\0"
199 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
200 gettext_noop ("Premature end of regular expression") /* REG_EEND */
201 "\0"
202 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
203 gettext_noop ("Regular expression too big") /* REG_ESIZE */
204 "\0"
205 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
206 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
209 const size_t __re_error_msgid_idx[] attribute_hidden =
211 REG_NOERROR_IDX,
212 REG_NOMATCH_IDX,
213 REG_BADPAT_IDX,
214 REG_ECOLLATE_IDX,
215 REG_ECTYPE_IDX,
216 REG_EESCAPE_IDX,
217 REG_ESUBREG_IDX,
218 REG_EBRACK_IDX,
219 REG_EPAREN_IDX,
220 REG_EBRACE_IDX,
221 REG_BADBR_IDX,
222 REG_ERANGE_IDX,
223 REG_ESPACE_IDX,
224 REG_BADRPT_IDX,
225 REG_EEND_IDX,
226 REG_ESIZE_IDX,
227 REG_ERPAREN_IDX
230 /* Entry points for GNU code. */
232 /* re_compile_pattern is the GNU regular expression compiler: it
233 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
234 Returns 0 if the pattern was valid, otherwise an error string.
236 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
237 are set in BUFP on entry. */
239 const char *
240 re_compile_pattern (pattern, length, bufp)
241 const char *pattern;
242 size_t length;
243 struct re_pattern_buffer *bufp;
245 reg_errcode_t ret;
247 /* And GNU code determines whether or not to get register information
248 by passing null for the REGS argument to re_match, etc., not by
249 setting no_sub, unless RE_NO_SUB is set. */
250 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
252 /* Match anchors at newline. */
253 bufp->newline_anchor = 1;
255 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
257 if (!ret)
258 return NULL;
259 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
261 #ifdef _LIBC
262 weak_alias (__re_compile_pattern, re_compile_pattern)
263 #endif
265 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
266 also be assigned to arbitrarily: each pattern buffer stores its own
267 syntax, so it can be changed between regex compilations. */
268 /* This has no initializer because initialized variables in Emacs
269 become read-only after dumping. */
270 reg_syntax_t re_syntax_options;
273 /* Specify the precise syntax of regexps for compilation. This provides
274 for compatibility for various utilities which historically have
275 different, incompatible syntaxes.
277 The argument SYNTAX is a bit mask comprised of the various bits
278 defined in regex.h. We return the old syntax. */
280 reg_syntax_t
281 re_set_syntax (syntax)
282 reg_syntax_t syntax;
284 reg_syntax_t ret = re_syntax_options;
286 re_syntax_options = syntax;
287 return ret;
289 #ifdef _LIBC
290 weak_alias (__re_set_syntax, re_set_syntax)
291 #endif
294 re_compile_fastmap (bufp)
295 struct re_pattern_buffer *bufp;
297 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
298 char *fastmap = bufp->fastmap;
300 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
301 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
302 if (dfa->init_state != dfa->init_state_word)
303 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
304 if (dfa->init_state != dfa->init_state_nl)
305 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
306 if (dfa->init_state != dfa->init_state_begbuf)
307 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
308 bufp->fastmap_accurate = 1;
309 return 0;
311 #ifdef _LIBC
312 weak_alias (__re_compile_fastmap, re_compile_fastmap)
313 #endif
315 static inline void
316 __attribute ((always_inline))
317 re_set_fastmap (char *fastmap, int icase, int ch)
319 fastmap[ch] = 1;
320 if (icase)
321 fastmap[tolower (ch)] = 1;
324 /* Helper function for re_compile_fastmap.
325 Compile fastmap for the initial_state INIT_STATE. */
327 static void
328 re_compile_fastmap_iter (bufp, init_state, fastmap)
329 regex_t *bufp;
330 const re_dfastate_t *init_state;
331 char *fastmap;
333 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
334 int node_cnt;
335 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
336 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
338 int node = init_state->nodes.elems[node_cnt];
339 re_token_type_t type = dfa->nodes[node].type;
341 if (type == CHARACTER)
343 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
344 #ifdef RE_ENABLE_I18N
345 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
347 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
348 wchar_t wc;
349 mbstate_t state;
351 p = buf;
352 *p++ = dfa->nodes[node].opr.c;
353 while (++node < dfa->nodes_len
354 && dfa->nodes[node].type == CHARACTER
355 && dfa->nodes[node].mb_partial)
356 *p++ = dfa->nodes[node].opr.c;
357 memset (&state, '\0', sizeof (state));
358 if (mbrtowc (&wc, (const char *) buf, p - buf,
359 &state) == p - buf
360 && (__wcrtomb ((char *) buf, towlower (wc), &state)
361 != (size_t) -1))
362 re_set_fastmap (fastmap, 0, buf[0]);
364 #endif
366 else if (type == SIMPLE_BRACKET)
368 int i, ch;
369 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
371 int j;
372 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
373 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
374 if (w & ((bitset_word_t) 1 << j))
375 re_set_fastmap (fastmap, icase, ch);
378 #ifdef RE_ENABLE_I18N
379 else if (type == COMPLEX_BRACKET)
381 int i;
382 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
383 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
384 || cset->nranges || cset->nchar_classes)
386 # ifdef _LIBC
387 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
389 /* In this case we want to catch the bytes which are
390 the first byte of any collation elements.
391 e.g. In da_DK, we want to catch 'a' since "aa"
392 is a valid collation element, and don't catch
393 'b' since 'b' is the only collation element
394 which starts from 'b'. */
395 const int32_t *table = (const int32_t *)
396 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
397 for (i = 0; i < SBC_MAX; ++i)
398 if (table[i] < 0)
399 re_set_fastmap (fastmap, icase, i);
401 # else
402 if (dfa->mb_cur_max > 1)
403 for (i = 0; i < SBC_MAX; ++i)
404 if (__btowc (i) == WEOF)
405 re_set_fastmap (fastmap, icase, i);
406 # endif /* not _LIBC */
408 for (i = 0; i < cset->nmbchars; ++i)
410 char buf[256];
411 mbstate_t state;
412 memset (&state, '\0', sizeof (state));
413 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
414 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
415 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
417 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
418 != (size_t) -1)
419 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
423 #endif /* RE_ENABLE_I18N */
424 else if (type == OP_PERIOD
425 #ifdef RE_ENABLE_I18N
426 || type == OP_UTF8_PERIOD
427 #endif /* RE_ENABLE_I18N */
428 || type == END_OF_RE)
430 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
431 if (type == END_OF_RE)
432 bufp->can_be_null = 1;
433 return;
438 /* Entry point for POSIX code. */
439 /* regcomp takes a regular expression as a string and compiles it.
441 PREG is a regex_t *. We do not expect any fields to be initialized,
442 since POSIX says we shouldn't. Thus, we set
444 `buffer' to the compiled pattern;
445 `used' to the length of the compiled pattern;
446 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
447 REG_EXTENDED bit in CFLAGS is set; otherwise, to
448 RE_SYNTAX_POSIX_BASIC;
449 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
450 `fastmap' to an allocated space for the fastmap;
451 `fastmap_accurate' to zero;
452 `re_nsub' to the number of subexpressions in PATTERN.
454 PATTERN is the address of the pattern string.
456 CFLAGS is a series of bits which affect compilation.
458 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
459 use POSIX basic syntax.
461 If REG_NEWLINE is set, then . and [^...] don't match newline.
462 Also, regexec will try a match beginning after every newline.
464 If REG_ICASE is set, then we considers upper- and lowercase
465 versions of letters to be equivalent when matching.
467 If REG_NOSUB is set, then when PREG is passed to regexec, that
468 routine will report only success or failure, and nothing about the
469 registers.
471 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
472 the return codes and their meanings.) */
475 regcomp (preg, pattern, cflags)
476 regex_t *__restrict preg;
477 const char *__restrict pattern;
478 int cflags;
480 reg_errcode_t ret;
481 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
482 : RE_SYNTAX_POSIX_BASIC);
484 preg->buffer = NULL;
485 preg->allocated = 0;
486 preg->used = 0;
488 /* Try to allocate space for the fastmap. */
489 preg->fastmap = re_malloc (char, SBC_MAX);
490 if (BE (preg->fastmap == NULL, 0))
491 return REG_ESPACE;
493 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
495 /* If REG_NEWLINE is set, newlines are treated differently. */
496 if (cflags & REG_NEWLINE)
497 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
498 syntax &= ~RE_DOT_NEWLINE;
499 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
500 /* It also changes the matching behavior. */
501 preg->newline_anchor = 1;
503 else
504 preg->newline_anchor = 0;
505 preg->no_sub = !!(cflags & REG_NOSUB);
506 preg->translate = NULL;
508 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
510 /* POSIX doesn't distinguish between an unmatched open-group and an
511 unmatched close-group: both are REG_EPAREN. */
512 if (ret == REG_ERPAREN)
513 ret = REG_EPAREN;
515 /* We have already checked preg->fastmap != NULL. */
516 if (BE (ret == REG_NOERROR, 1))
517 /* Compute the fastmap now, since regexec cannot modify the pattern
518 buffer. This function never fails in this implementation. */
519 (void) re_compile_fastmap (preg);
520 else
522 /* Some error occurred while compiling the expression. */
523 re_free (preg->fastmap);
524 preg->fastmap = NULL;
527 return (int) ret;
529 #ifdef _LIBC
530 weak_alias (__regcomp, regcomp)
531 #endif
533 /* Returns a message corresponding to an error code, ERRCODE, returned
534 from either regcomp or regexec. We don't use PREG here. */
536 size_t
537 regerror (errcode, preg, errbuf, errbuf_size)
538 int errcode;
539 const regex_t *__restrict preg;
540 char *__restrict errbuf;
541 size_t errbuf_size;
543 const char *msg;
544 size_t msg_size;
546 if (BE (errcode < 0
547 || errcode >= (int) (sizeof (__re_error_msgid_idx)
548 / sizeof (__re_error_msgid_idx[0])), 0))
549 /* Only error codes returned by the rest of the code should be passed
550 to this routine. If we are given anything else, or if other regex
551 code generates an invalid error code, then the program has a bug.
552 Dump core so we can fix it. */
553 abort ();
555 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
557 msg_size = strlen (msg) + 1; /* Includes the null. */
559 if (BE (errbuf_size != 0, 1))
561 if (BE (msg_size > errbuf_size, 0))
563 #if defined HAVE_MEMPCPY || defined _LIBC
564 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
565 #else
566 memcpy (errbuf, msg, errbuf_size - 1);
567 errbuf[errbuf_size - 1] = 0;
568 #endif
570 else
571 memcpy (errbuf, msg, msg_size);
574 return msg_size;
576 #ifdef _LIBC
577 weak_alias (__regerror, regerror)
578 #endif
581 #ifdef RE_ENABLE_I18N
582 /* This static array is used for the map to single-byte characters when
583 UTF-8 is used. Otherwise we would allocate memory just to initialize
584 it the same all the time. UTF-8 is the preferred encoding so this is
585 a worthwhile optimization. */
586 static const bitset_t utf8_sb_map =
588 /* Set the first 128 bits. */
589 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
591 #endif
594 static void
595 free_dfa_content (re_dfa_t *dfa)
597 int i, j;
599 if (dfa->nodes)
600 for (i = 0; i < dfa->nodes_len; ++i)
601 free_token (dfa->nodes + i);
602 re_free (dfa->nexts);
603 for (i = 0; i < dfa->nodes_len; ++i)
605 if (dfa->eclosures != NULL)
606 re_node_set_free (dfa->eclosures + i);
607 if (dfa->inveclosures != NULL)
608 re_node_set_free (dfa->inveclosures + i);
609 if (dfa->edests != NULL)
610 re_node_set_free (dfa->edests + i);
612 re_free (dfa->edests);
613 re_free (dfa->eclosures);
614 re_free (dfa->inveclosures);
615 re_free (dfa->nodes);
617 if (dfa->state_table)
618 for (i = 0; i <= dfa->state_hash_mask; ++i)
620 struct re_state_table_entry *entry = dfa->state_table + i;
621 for (j = 0; j < entry->num; ++j)
623 re_dfastate_t *state = entry->array[j];
624 free_state (state);
626 re_free (entry->array);
628 re_free (dfa->state_table);
629 #ifdef RE_ENABLE_I18N
630 if (dfa->sb_char != utf8_sb_map)
631 re_free (dfa->sb_char);
632 #endif
633 re_free (dfa->subexp_map);
634 #ifdef DEBUG
635 re_free (dfa->re_str);
636 #endif
638 re_free (dfa);
642 /* Free dynamically allocated space used by PREG. */
644 void
645 regfree (preg)
646 regex_t *preg;
648 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
649 if (BE (dfa != NULL, 1))
650 free_dfa_content (dfa);
651 preg->buffer = NULL;
652 preg->allocated = 0;
654 re_free (preg->fastmap);
655 preg->fastmap = NULL;
657 re_free (preg->translate);
658 preg->translate = NULL;
660 #ifdef _LIBC
661 weak_alias (__regfree, regfree)
662 #endif
664 /* Entry points compatible with 4.2 BSD regex library. We don't define
665 them unless specifically requested. */
667 #if defined _REGEX_RE_COMP || defined _LIBC
669 /* BSD has one and only one pattern buffer. */
670 static struct re_pattern_buffer re_comp_buf;
672 char *
673 # ifdef _LIBC
674 /* Make these definitions weak in libc, so POSIX programs can redefine
675 these names if they don't use our functions, and still use
676 regcomp/regexec above without link errors. */
677 weak_function
678 # endif
679 re_comp (s)
680 const char *s;
682 reg_errcode_t ret;
683 char *fastmap;
685 if (!s)
687 if (!re_comp_buf.buffer)
688 return gettext ("No previous regular expression");
689 return 0;
692 if (re_comp_buf.buffer)
694 fastmap = re_comp_buf.fastmap;
695 re_comp_buf.fastmap = NULL;
696 __regfree (&re_comp_buf);
697 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
698 re_comp_buf.fastmap = fastmap;
701 if (re_comp_buf.fastmap == NULL)
703 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
704 if (re_comp_buf.fastmap == NULL)
705 return (char *) gettext (__re_error_msgid
706 + __re_error_msgid_idx[(int) REG_ESPACE]);
709 /* Since `re_exec' always passes NULL for the `regs' argument, we
710 don't need to initialize the pattern buffer fields which affect it. */
712 /* Match anchors at newlines. */
713 re_comp_buf.newline_anchor = 1;
715 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
717 if (!ret)
718 return NULL;
720 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
721 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
724 #ifdef _LIBC
725 libc_freeres_fn (free_mem)
727 __regfree (&re_comp_buf);
729 #endif
731 #endif /* _REGEX_RE_COMP */
733 /* Internal entry point.
734 Compile the regular expression PATTERN, whose length is LENGTH.
735 SYNTAX indicate regular expression's syntax. */
737 static reg_errcode_t
738 re_compile_internal (preg, pattern, length, syntax)
739 regex_t *preg;
740 const char * pattern;
741 size_t length;
742 reg_syntax_t syntax;
744 reg_errcode_t err = REG_NOERROR;
745 re_dfa_t *dfa;
746 re_string_t regexp;
748 /* Initialize the pattern buffer. */
749 preg->fastmap_accurate = 0;
750 preg->syntax = syntax;
751 preg->not_bol = preg->not_eol = 0;
752 preg->used = 0;
753 preg->re_nsub = 0;
754 preg->can_be_null = 0;
755 preg->regs_allocated = REGS_UNALLOCATED;
757 /* Initialize the dfa. */
758 dfa = (re_dfa_t *) preg->buffer;
759 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
761 /* If zero allocated, but buffer is non-null, try to realloc
762 enough space. This loses if buffer's address is bogus, but
763 that is the user's responsibility. If ->buffer is NULL this
764 is a simple allocation. */
765 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
766 if (dfa == NULL)
767 return REG_ESPACE;
768 preg->allocated = sizeof (re_dfa_t);
769 preg->buffer = (unsigned char *) dfa;
771 preg->used = sizeof (re_dfa_t);
773 err = init_dfa (dfa, length);
774 if (BE (err != REG_NOERROR, 0))
776 free_dfa_content (dfa);
777 preg->buffer = NULL;
778 preg->allocated = 0;
779 return err;
781 #ifdef DEBUG
782 /* Note: length+1 will not overflow since it is checked in init_dfa. */
783 dfa->re_str = re_malloc (char, length + 1);
784 strncpy (dfa->re_str, pattern, length + 1);
785 #endif
787 __libc_lock_init (dfa->lock);
789 err = re_string_construct (&regexp, pattern, length, preg->translate,
790 syntax & RE_ICASE, dfa);
791 if (BE (err != REG_NOERROR, 0))
793 re_compile_internal_free_return:
794 free_workarea_compile (preg);
795 re_string_destruct (&regexp);
796 free_dfa_content (dfa);
797 preg->buffer = NULL;
798 preg->allocated = 0;
799 return err;
802 /* Parse the regular expression, and build a structure tree. */
803 preg->re_nsub = 0;
804 dfa->str_tree = parse (&regexp, preg, syntax, &err);
805 if (BE (dfa->str_tree == NULL, 0))
806 goto re_compile_internal_free_return;
808 /* Analyze the tree and create the nfa. */
809 err = analyze (preg);
810 if (BE (err != REG_NOERROR, 0))
811 goto re_compile_internal_free_return;
813 #ifdef RE_ENABLE_I18N
814 /* If possible, do searching in single byte encoding to speed things up. */
815 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
816 optimize_utf8 (dfa);
817 #endif
819 /* Then create the initial state of the dfa. */
820 err = create_initial_state (dfa);
822 /* Release work areas. */
823 free_workarea_compile (preg);
824 re_string_destruct (&regexp);
826 if (BE (err != REG_NOERROR, 0))
828 free_dfa_content (dfa);
829 preg->buffer = NULL;
830 preg->allocated = 0;
833 return err;
836 /* Initialize DFA. We use the length of the regular expression PAT_LEN
837 as the initial length of some arrays. */
839 static reg_errcode_t
840 init_dfa (dfa, pat_len)
841 re_dfa_t *dfa;
842 size_t pat_len;
844 unsigned int table_size;
845 #ifndef _LIBC
846 char *codeset_name;
847 #endif
849 memset (dfa, '\0', sizeof (re_dfa_t));
851 /* Force allocation of str_tree_storage the first time. */
852 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
854 /* Avoid overflows. */
855 if (pat_len == SIZE_MAX)
856 return REG_ESPACE;
858 dfa->nodes_alloc = pat_len + 1;
859 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
861 /* table_size = 2 ^ ceil(log pat_len) */
862 for (table_size = 1; ; table_size <<= 1)
863 if (table_size > pat_len)
864 break;
866 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
867 dfa->state_hash_mask = table_size - 1;
869 dfa->mb_cur_max = MB_CUR_MAX;
870 #ifdef _LIBC
871 if (dfa->mb_cur_max == 6
872 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
873 dfa->is_utf8 = 1;
874 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
875 != 0);
876 #else
877 # ifdef HAVE_LANGINFO_CODESET
878 codeset_name = nl_langinfo (CODESET);
879 # else
880 codeset_name = getenv ("LC_ALL");
881 if (codeset_name == NULL || codeset_name[0] == '\0')
882 codeset_name = getenv ("LC_CTYPE");
883 if (codeset_name == NULL || codeset_name[0] == '\0')
884 codeset_name = getenv ("LANG");
885 if (codeset_name == NULL)
886 codeset_name = "";
887 else if (strchr (codeset_name, '.') != NULL)
888 codeset_name = strchr (codeset_name, '.') + 1;
889 # endif
891 if (strcasecmp (codeset_name, "UTF-8") == 0
892 || strcasecmp (codeset_name, "UTF8") == 0)
893 dfa->is_utf8 = 1;
895 /* We check exhaustively in the loop below if this charset is a
896 superset of ASCII. */
897 dfa->map_notascii = 0;
898 #endif
900 #ifdef RE_ENABLE_I18N
901 if (dfa->mb_cur_max > 1)
903 if (dfa->is_utf8)
904 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
905 else
907 int i, j, ch;
909 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
910 if (BE (dfa->sb_char == NULL, 0))
911 return REG_ESPACE;
913 /* Set the bits corresponding to single byte chars. */
914 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
915 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
917 wint_t wch = __btowc (ch);
918 if (wch != WEOF)
919 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
920 # ifndef _LIBC
921 if (isascii (ch) && wch != ch)
922 dfa->map_notascii = 1;
923 # endif
927 #endif
929 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
930 return REG_ESPACE;
931 return REG_NOERROR;
934 /* Initialize WORD_CHAR table, which indicate which character is
935 "word". In this case "word" means that it is the word construction
936 character used by some operators like "\<", "\>", etc. */
938 static void
939 init_word_char (dfa)
940 re_dfa_t *dfa;
942 int i, j, ch;
943 dfa->word_ops_used = 1;
944 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
945 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
946 if (isalnum (ch) || ch == '_')
947 dfa->word_char[i] |= (bitset_word_t) 1 << j;
950 /* Free the work area which are only used while compiling. */
952 static void
953 free_workarea_compile (preg)
954 regex_t *preg;
956 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
957 bin_tree_storage_t *storage, *next;
958 for (storage = dfa->str_tree_storage; storage; storage = next)
960 next = storage->next;
961 re_free (storage);
963 dfa->str_tree_storage = NULL;
964 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
965 dfa->str_tree = NULL;
966 re_free (dfa->org_indices);
967 dfa->org_indices = NULL;
970 /* Create initial states for all contexts. */
972 static reg_errcode_t
973 create_initial_state (dfa)
974 re_dfa_t *dfa;
976 int first, i;
977 reg_errcode_t err;
978 re_node_set init_nodes;
980 /* Initial states have the epsilon closure of the node which is
981 the first node of the regular expression. */
982 first = dfa->str_tree->first->node_idx;
983 dfa->init_node = first;
984 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
985 if (BE (err != REG_NOERROR, 0))
986 return err;
988 /* The back-references which are in initial states can epsilon transit,
989 since in this case all of the subexpressions can be null.
990 Then we add epsilon closures of the nodes which are the next nodes of
991 the back-references. */
992 if (dfa->nbackref > 0)
993 for (i = 0; i < init_nodes.nelem; ++i)
995 int node_idx = init_nodes.elems[i];
996 re_token_type_t type = dfa->nodes[node_idx].type;
998 int clexp_idx;
999 if (type != OP_BACK_REF)
1000 continue;
1001 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1003 re_token_t *clexp_node;
1004 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1005 if (clexp_node->type == OP_CLOSE_SUBEXP
1006 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1007 break;
1009 if (clexp_idx == init_nodes.nelem)
1010 continue;
1012 if (type == OP_BACK_REF)
1014 int dest_idx = dfa->edests[node_idx].elems[0];
1015 if (!re_node_set_contains (&init_nodes, dest_idx))
1017 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1018 i = 0;
1023 /* It must be the first time to invoke acquire_state. */
1024 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1025 /* We don't check ERR here, since the initial state must not be NULL. */
1026 if (BE (dfa->init_state == NULL, 0))
1027 return err;
1028 if (dfa->init_state->has_constraint)
1030 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1031 CONTEXT_WORD);
1032 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1033 CONTEXT_NEWLINE);
1034 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1035 &init_nodes,
1036 CONTEXT_NEWLINE
1037 | CONTEXT_BEGBUF);
1038 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1039 || dfa->init_state_begbuf == NULL, 0))
1040 return err;
1042 else
1043 dfa->init_state_word = dfa->init_state_nl
1044 = dfa->init_state_begbuf = dfa->init_state;
1046 re_node_set_free (&init_nodes);
1047 return REG_NOERROR;
1050 #ifdef RE_ENABLE_I18N
1051 /* If it is possible to do searching in single byte encoding instead of UTF-8
1052 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1053 DFA nodes where needed. */
1055 static void
1056 optimize_utf8 (dfa)
1057 re_dfa_t *dfa;
1059 int node, i, mb_chars = 0, has_period = 0;
1061 for (node = 0; node < dfa->nodes_len; ++node)
1062 switch (dfa->nodes[node].type)
1064 case CHARACTER:
1065 if (dfa->nodes[node].opr.c >= 0x80)
1066 mb_chars = 1;
1067 break;
1068 case ANCHOR:
1069 switch (dfa->nodes[node].opr.idx)
1071 case LINE_FIRST:
1072 case LINE_LAST:
1073 case BUF_FIRST:
1074 case BUF_LAST:
1075 break;
1076 default:
1077 /* Word anchors etc. cannot be handled. */
1078 return;
1080 break;
1081 case OP_PERIOD:
1082 has_period = 1;
1083 break;
1084 case OP_BACK_REF:
1085 case OP_ALT:
1086 case END_OF_RE:
1087 case OP_DUP_ASTERISK:
1088 case OP_OPEN_SUBEXP:
1089 case OP_CLOSE_SUBEXP:
1090 break;
1091 case COMPLEX_BRACKET:
1092 return;
1093 case SIMPLE_BRACKET:
1094 /* Just double check. The non-ASCII range starts at 0x80. */
1095 assert (0x80 % BITSET_WORD_BITS == 0);
1096 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1097 if (dfa->nodes[node].opr.sbcset[i])
1098 return;
1099 break;
1100 default:
1101 abort ();
1104 if (mb_chars || has_period)
1105 for (node = 0; node < dfa->nodes_len; ++node)
1107 if (dfa->nodes[node].type == CHARACTER
1108 && dfa->nodes[node].opr.c >= 0x80)
1109 dfa->nodes[node].mb_partial = 0;
1110 else if (dfa->nodes[node].type == OP_PERIOD)
1111 dfa->nodes[node].type = OP_UTF8_PERIOD;
1114 /* The search can be in single byte locale. */
1115 dfa->mb_cur_max = 1;
1116 dfa->is_utf8 = 0;
1117 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1119 #endif
1121 /* Analyze the structure tree, and calculate "first", "next", "edest",
1122 "eclosure", and "inveclosure". */
1124 static reg_errcode_t
1125 analyze (preg)
1126 regex_t *preg;
1128 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1129 reg_errcode_t ret;
1131 /* Allocate arrays. */
1132 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1133 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1134 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1135 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1136 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1137 || dfa->eclosures == NULL, 0))
1138 return REG_ESPACE;
1140 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1141 if (dfa->subexp_map != NULL)
1143 int i;
1144 for (i = 0; i < preg->re_nsub; i++)
1145 dfa->subexp_map[i] = i;
1146 preorder (dfa->str_tree, optimize_subexps, dfa);
1147 for (i = 0; i < preg->re_nsub; i++)
1148 if (dfa->subexp_map[i] != i)
1149 break;
1150 if (i == preg->re_nsub)
1152 free (dfa->subexp_map);
1153 dfa->subexp_map = NULL;
1157 ret = postorder (dfa->str_tree, lower_subexps, preg);
1158 if (BE (ret != REG_NOERROR, 0))
1159 return ret;
1160 ret = postorder (dfa->str_tree, calc_first, dfa);
1161 if (BE (ret != REG_NOERROR, 0))
1162 return ret;
1163 preorder (dfa->str_tree, calc_next, dfa);
1164 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1165 if (BE (ret != REG_NOERROR, 0))
1166 return ret;
1167 ret = calc_eclosure (dfa);
1168 if (BE (ret != REG_NOERROR, 0))
1169 return ret;
1171 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1172 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1173 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1174 || dfa->nbackref)
1176 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1177 if (BE (dfa->inveclosures == NULL, 0))
1178 return REG_ESPACE;
1179 ret = calc_inveclosure (dfa);
1182 return ret;
1185 /* Our parse trees are very unbalanced, so we cannot use a stack to
1186 implement parse tree visits. Instead, we use parent pointers and
1187 some hairy code in these two functions. */
1188 static reg_errcode_t
1189 postorder (root, fn, extra)
1190 bin_tree_t *root;
1191 reg_errcode_t (fn (void *, bin_tree_t *));
1192 void *extra;
1194 bin_tree_t *node, *prev;
1196 for (node = root; ; )
1198 /* Descend down the tree, preferably to the left (or to the right
1199 if that's the only child). */
1200 while (node->left || node->right)
1201 if (node->left)
1202 node = node->left;
1203 else
1204 node = node->right;
1208 reg_errcode_t err = fn (extra, node);
1209 if (BE (err != REG_NOERROR, 0))
1210 return err;
1211 if (node->parent == NULL)
1212 return REG_NOERROR;
1213 prev = node;
1214 node = node->parent;
1216 /* Go up while we have a node that is reached from the right. */
1217 while (node->right == prev || node->right == NULL);
1218 node = node->right;
1222 static reg_errcode_t
1223 preorder (root, fn, extra)
1224 bin_tree_t *root;
1225 reg_errcode_t (fn (void *, bin_tree_t *));
1226 void *extra;
1228 bin_tree_t *node;
1230 for (node = root; ; )
1232 reg_errcode_t err = fn (extra, node);
1233 if (BE (err != REG_NOERROR, 0))
1234 return err;
1236 /* Go to the left node, or up and to the right. */
1237 if (node->left)
1238 node = node->left;
1239 else
1241 bin_tree_t *prev = NULL;
1242 while (node->right == prev || node->right == NULL)
1244 prev = node;
1245 node = node->parent;
1246 if (!node)
1247 return REG_NOERROR;
1249 node = node->right;
1254 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1255 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1256 backreferences as well. Requires a preorder visit. */
1257 static reg_errcode_t
1258 optimize_subexps (extra, node)
1259 void *extra;
1260 bin_tree_t *node;
1262 re_dfa_t *dfa = (re_dfa_t *) extra;
1264 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1266 int idx = node->token.opr.idx;
1267 node->token.opr.idx = dfa->subexp_map[idx];
1268 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1271 else if (node->token.type == SUBEXP
1272 && node->left && node->left->token.type == SUBEXP)
1274 int other_idx = node->left->token.opr.idx;
1276 node->left = node->left->left;
1277 if (node->left)
1278 node->left->parent = node;
1280 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1281 if (other_idx < BITSET_WORD_BITS)
1282 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1285 return REG_NOERROR;
1288 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1289 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1290 static reg_errcode_t
1291 lower_subexps (extra, node)
1292 void *extra;
1293 bin_tree_t *node;
1295 regex_t *preg = (regex_t *) extra;
1296 reg_errcode_t err = REG_NOERROR;
1298 if (node->left && node->left->token.type == SUBEXP)
1300 node->left = lower_subexp (&err, preg, node->left);
1301 if (node->left)
1302 node->left->parent = node;
1304 if (node->right && node->right->token.type == SUBEXP)
1306 node->right = lower_subexp (&err, preg, node->right);
1307 if (node->right)
1308 node->right->parent = node;
1311 return err;
1314 static bin_tree_t *
1315 lower_subexp (err, preg, node)
1316 reg_errcode_t *err;
1317 regex_t *preg;
1318 bin_tree_t *node;
1320 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1321 bin_tree_t *body = node->left;
1322 bin_tree_t *op, *cls, *tree1, *tree;
1324 if (preg->no_sub
1325 /* We do not optimize empty subexpressions, because otherwise we may
1326 have bad CONCAT nodes with NULL children. This is obviously not
1327 very common, so we do not lose much. An example that triggers
1328 this case is the sed "script" /\(\)/x. */
1329 && node->left != NULL
1330 && (node->token.opr.idx >= BITSET_WORD_BITS
1331 || !(dfa->used_bkref_map
1332 & ((bitset_word_t) 1 << node->token.opr.idx))))
1333 return node->left;
1335 /* Convert the SUBEXP node to the concatenation of an
1336 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1337 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1338 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1339 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1340 tree = create_tree (dfa, op, tree1, CONCAT);
1341 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1343 *err = REG_ESPACE;
1344 return NULL;
1347 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1348 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1349 return tree;
1352 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1353 nodes. Requires a postorder visit. */
1354 static reg_errcode_t
1355 calc_first (extra, node)
1356 void *extra;
1357 bin_tree_t *node;
1359 re_dfa_t *dfa = (re_dfa_t *) extra;
1360 if (node->token.type == CONCAT)
1362 node->first = node->left->first;
1363 node->node_idx = node->left->node_idx;
1365 else
1367 node->first = node;
1368 node->node_idx = re_dfa_add_node (dfa, node->token);
1369 if (BE (node->node_idx == -1, 0))
1370 return REG_ESPACE;
1372 return REG_NOERROR;
1375 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1376 static reg_errcode_t
1377 calc_next (extra, node)
1378 void *extra;
1379 bin_tree_t *node;
1381 switch (node->token.type)
1383 case OP_DUP_ASTERISK:
1384 node->left->next = node;
1385 break;
1386 case CONCAT:
1387 node->left->next = node->right->first;
1388 node->right->next = node->next;
1389 break;
1390 default:
1391 if (node->left)
1392 node->left->next = node->next;
1393 if (node->right)
1394 node->right->next = node->next;
1395 break;
1397 return REG_NOERROR;
1400 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1401 static reg_errcode_t
1402 link_nfa_nodes (extra, node)
1403 void *extra;
1404 bin_tree_t *node;
1406 re_dfa_t *dfa = (re_dfa_t *) extra;
1407 int idx = node->node_idx;
1408 reg_errcode_t err = REG_NOERROR;
1410 switch (node->token.type)
1412 case CONCAT:
1413 break;
1415 case END_OF_RE:
1416 assert (node->next == NULL);
1417 break;
1419 case OP_DUP_ASTERISK:
1420 case OP_ALT:
1422 int left, right;
1423 dfa->has_plural_match = 1;
1424 if (node->left != NULL)
1425 left = node->left->first->node_idx;
1426 else
1427 left = node->next->node_idx;
1428 if (node->right != NULL)
1429 right = node->right->first->node_idx;
1430 else
1431 right = node->next->node_idx;
1432 assert (left > -1);
1433 assert (right > -1);
1434 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1436 break;
1438 case ANCHOR:
1439 case OP_OPEN_SUBEXP:
1440 case OP_CLOSE_SUBEXP:
1441 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1442 break;
1444 case OP_BACK_REF:
1445 dfa->nexts[idx] = node->next->node_idx;
1446 if (node->token.type == OP_BACK_REF)
1447 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1448 break;
1450 default:
1451 assert (!IS_EPSILON_NODE (node->token.type));
1452 dfa->nexts[idx] = node->next->node_idx;
1453 break;
1456 return err;
1459 /* Duplicate the epsilon closure of the node ROOT_NODE.
1460 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1461 to their own constraint. */
1463 static reg_errcode_t
1464 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1465 init_constraint)
1466 re_dfa_t *dfa;
1467 int top_org_node, top_clone_node, root_node;
1468 unsigned int init_constraint;
1470 int org_node, clone_node, ret;
1471 unsigned int constraint = init_constraint;
1472 for (org_node = top_org_node, clone_node = top_clone_node;;)
1474 int org_dest, clone_dest;
1475 if (dfa->nodes[org_node].type == OP_BACK_REF)
1477 /* If the back reference epsilon-transit, its destination must
1478 also have the constraint. Then duplicate the epsilon closure
1479 of the destination of the back reference, and store it in
1480 edests of the back reference. */
1481 org_dest = dfa->nexts[org_node];
1482 re_node_set_empty (dfa->edests + clone_node);
1483 clone_dest = duplicate_node (dfa, org_dest, constraint);
1484 if (BE (clone_dest == -1, 0))
1485 return REG_ESPACE;
1486 dfa->nexts[clone_node] = dfa->nexts[org_node];
1487 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1488 if (BE (ret < 0, 0))
1489 return REG_ESPACE;
1491 else if (dfa->edests[org_node].nelem == 0)
1493 /* In case of the node can't epsilon-transit, don't duplicate the
1494 destination and store the original destination as the
1495 destination of the node. */
1496 dfa->nexts[clone_node] = dfa->nexts[org_node];
1497 break;
1499 else if (dfa->edests[org_node].nelem == 1)
1501 /* In case of the node can epsilon-transit, and it has only one
1502 destination. */
1503 org_dest = dfa->edests[org_node].elems[0];
1504 re_node_set_empty (dfa->edests + clone_node);
1505 if (dfa->nodes[org_node].type == ANCHOR)
1507 /* In case of the node has another constraint, append it. */
1508 if (org_node == root_node && clone_node != org_node)
1510 /* ...but if the node is root_node itself, it means the
1511 epsilon closure have a loop, then tie it to the
1512 destination of the root_node. */
1513 ret = re_node_set_insert (dfa->edests + clone_node,
1514 org_dest);
1515 if (BE (ret < 0, 0))
1516 return REG_ESPACE;
1517 break;
1519 constraint |= dfa->nodes[org_node].opr.ctx_type;
1521 clone_dest = duplicate_node (dfa, org_dest, constraint);
1522 if (BE (clone_dest == -1, 0))
1523 return REG_ESPACE;
1524 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1525 if (BE (ret < 0, 0))
1526 return REG_ESPACE;
1528 else /* dfa->edests[org_node].nelem == 2 */
1530 /* In case of the node can epsilon-transit, and it has two
1531 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1532 org_dest = dfa->edests[org_node].elems[0];
1533 re_node_set_empty (dfa->edests + clone_node);
1534 /* Search for a duplicated node which satisfies the constraint. */
1535 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1536 if (clone_dest == -1)
1538 /* There are no such a duplicated node, create a new one. */
1539 reg_errcode_t err;
1540 clone_dest = duplicate_node (dfa, org_dest, constraint);
1541 if (BE (clone_dest == -1, 0))
1542 return REG_ESPACE;
1543 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1544 if (BE (ret < 0, 0))
1545 return REG_ESPACE;
1546 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1547 root_node, constraint);
1548 if (BE (err != REG_NOERROR, 0))
1549 return err;
1551 else
1553 /* There are a duplicated node which satisfy the constraint,
1554 use it to avoid infinite loop. */
1555 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1556 if (BE (ret < 0, 0))
1557 return REG_ESPACE;
1560 org_dest = dfa->edests[org_node].elems[1];
1561 clone_dest = duplicate_node (dfa, org_dest, constraint);
1562 if (BE (clone_dest == -1, 0))
1563 return REG_ESPACE;
1564 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1565 if (BE (ret < 0, 0))
1566 return REG_ESPACE;
1568 org_node = org_dest;
1569 clone_node = clone_dest;
1571 return REG_NOERROR;
1574 /* Search for a node which is duplicated from the node ORG_NODE, and
1575 satisfies the constraint CONSTRAINT. */
1577 static int
1578 search_duplicated_node (dfa, org_node, constraint)
1579 const re_dfa_t *dfa;
1580 int org_node;
1581 unsigned int constraint;
1583 int idx;
1584 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1586 if (org_node == dfa->org_indices[idx]
1587 && constraint == dfa->nodes[idx].constraint)
1588 return idx; /* Found. */
1590 return -1; /* Not found. */
1593 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1594 Return the index of the new node, or -1 if insufficient storage is
1595 available. */
1597 static int
1598 duplicate_node (dfa, org_idx, constraint)
1599 re_dfa_t *dfa;
1600 int org_idx;
1601 unsigned int constraint;
1603 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1604 if (BE (dup_idx != -1, 1))
1606 dfa->nodes[dup_idx].constraint = constraint;
1607 if (dfa->nodes[org_idx].type == ANCHOR)
1608 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1609 dfa->nodes[dup_idx].duplicated = 1;
1611 /* Store the index of the original node. */
1612 dfa->org_indices[dup_idx] = org_idx;
1614 return dup_idx;
1617 static reg_errcode_t
1618 calc_inveclosure (dfa)
1619 re_dfa_t *dfa;
1621 int src, idx, ret;
1622 for (idx = 0; idx < dfa->nodes_len; ++idx)
1623 re_node_set_init_empty (dfa->inveclosures + idx);
1625 for (src = 0; src < dfa->nodes_len; ++src)
1627 int *elems = dfa->eclosures[src].elems;
1628 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1630 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1631 if (BE (ret == -1, 0))
1632 return REG_ESPACE;
1636 return REG_NOERROR;
1639 /* Calculate "eclosure" for all the node in DFA. */
1641 static reg_errcode_t
1642 calc_eclosure (dfa)
1643 re_dfa_t *dfa;
1645 int node_idx, incomplete;
1646 #ifdef DEBUG
1647 assert (dfa->nodes_len > 0);
1648 #endif
1649 incomplete = 0;
1650 /* For each nodes, calculate epsilon closure. */
1651 for (node_idx = 0; ; ++node_idx)
1653 reg_errcode_t err;
1654 re_node_set eclosure_elem;
1655 if (node_idx == dfa->nodes_len)
1657 if (!incomplete)
1658 break;
1659 incomplete = 0;
1660 node_idx = 0;
1663 #ifdef DEBUG
1664 assert (dfa->eclosures[node_idx].nelem != -1);
1665 #endif
1667 /* If we have already calculated, skip it. */
1668 if (dfa->eclosures[node_idx].nelem != 0)
1669 continue;
1670 /* Calculate epsilon closure of `node_idx'. */
1671 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1672 if (BE (err != REG_NOERROR, 0))
1673 return err;
1675 if (dfa->eclosures[node_idx].nelem == 0)
1677 incomplete = 1;
1678 re_node_set_free (&eclosure_elem);
1681 return REG_NOERROR;
1684 /* Calculate epsilon closure of NODE. */
1686 static reg_errcode_t
1687 calc_eclosure_iter (new_set, dfa, node, root)
1688 re_node_set *new_set;
1689 re_dfa_t *dfa;
1690 int node, root;
1692 reg_errcode_t err;
1693 unsigned int constraint;
1694 int i, incomplete;
1695 re_node_set eclosure;
1696 incomplete = 0;
1697 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1698 if (BE (err != REG_NOERROR, 0))
1699 return err;
1701 /* This indicates that we are calculating this node now.
1702 We reference this value to avoid infinite loop. */
1703 dfa->eclosures[node].nelem = -1;
1705 constraint = ((dfa->nodes[node].type == ANCHOR)
1706 ? dfa->nodes[node].opr.ctx_type : 0);
1707 /* If the current node has constraints, duplicate all nodes.
1708 Since they must inherit the constraints. */
1709 if (constraint
1710 && dfa->edests[node].nelem
1711 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1713 int org_node, cur_node;
1714 org_node = cur_node = node;
1715 err = duplicate_node_closure (dfa, node, node, node, constraint);
1716 if (BE (err != REG_NOERROR, 0))
1717 return err;
1720 /* Expand each epsilon destination nodes. */
1721 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1722 for (i = 0; i < dfa->edests[node].nelem; ++i)
1724 re_node_set eclosure_elem;
1725 int edest = dfa->edests[node].elems[i];
1726 /* If calculating the epsilon closure of `edest' is in progress,
1727 return intermediate result. */
1728 if (dfa->eclosures[edest].nelem == -1)
1730 incomplete = 1;
1731 continue;
1733 /* If we haven't calculated the epsilon closure of `edest' yet,
1734 calculate now. Otherwise use calculated epsilon closure. */
1735 if (dfa->eclosures[edest].nelem == 0)
1737 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1738 if (BE (err != REG_NOERROR, 0))
1739 return err;
1741 else
1742 eclosure_elem = dfa->eclosures[edest];
1743 /* Merge the epsilon closure of `edest'. */
1744 re_node_set_merge (&eclosure, &eclosure_elem);
1745 /* If the epsilon closure of `edest' is incomplete,
1746 the epsilon closure of this node is also incomplete. */
1747 if (dfa->eclosures[edest].nelem == 0)
1749 incomplete = 1;
1750 re_node_set_free (&eclosure_elem);
1754 /* Epsilon closures include itself. */
1755 re_node_set_insert (&eclosure, node);
1756 if (incomplete && !root)
1757 dfa->eclosures[node].nelem = 0;
1758 else
1759 dfa->eclosures[node] = eclosure;
1760 *new_set = eclosure;
1761 return REG_NOERROR;
1764 /* Functions for token which are used in the parser. */
1766 /* Fetch a token from INPUT.
1767 We must not use this function inside bracket expressions. */
1769 static void
1770 fetch_token (result, input, syntax)
1771 re_token_t *result;
1772 re_string_t *input;
1773 reg_syntax_t syntax;
1775 re_string_skip_bytes (input, peek_token (result, input, syntax));
1778 /* Peek a token from INPUT, and return the length of the token.
1779 We must not use this function inside bracket expressions. */
1781 static int
1782 peek_token (token, input, syntax)
1783 re_token_t *token;
1784 re_string_t *input;
1785 reg_syntax_t syntax;
1787 unsigned char c;
1789 if (re_string_eoi (input))
1791 token->type = END_OF_RE;
1792 return 0;
1795 c = re_string_peek_byte (input, 0);
1796 token->opr.c = c;
1798 token->word_char = 0;
1799 #ifdef RE_ENABLE_I18N
1800 token->mb_partial = 0;
1801 if (input->mb_cur_max > 1 &&
1802 !re_string_first_byte (input, re_string_cur_idx (input)))
1804 token->type = CHARACTER;
1805 token->mb_partial = 1;
1806 return 1;
1808 #endif
1809 if (c == '\\')
1811 unsigned char c2;
1812 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1814 token->type = BACK_SLASH;
1815 return 1;
1818 c2 = re_string_peek_byte_case (input, 1);
1819 token->opr.c = c2;
1820 token->type = CHARACTER;
1821 #ifdef RE_ENABLE_I18N
1822 if (input->mb_cur_max > 1)
1824 wint_t wc = re_string_wchar_at (input,
1825 re_string_cur_idx (input) + 1);
1826 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1828 else
1829 #endif
1830 token->word_char = IS_WORD_CHAR (c2) != 0;
1832 switch (c2)
1834 case '|':
1835 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1836 token->type = OP_ALT;
1837 break;
1838 case '1': case '2': case '3': case '4': case '5':
1839 case '6': case '7': case '8': case '9':
1840 if (!(syntax & RE_NO_BK_REFS))
1842 token->type = OP_BACK_REF;
1843 token->opr.idx = c2 - '1';
1845 break;
1846 case '<':
1847 if (!(syntax & RE_NO_GNU_OPS))
1849 token->type = ANCHOR;
1850 token->opr.ctx_type = WORD_FIRST;
1852 break;
1853 case '>':
1854 if (!(syntax & RE_NO_GNU_OPS))
1856 token->type = ANCHOR;
1857 token->opr.ctx_type = WORD_LAST;
1859 break;
1860 case 'b':
1861 if (!(syntax & RE_NO_GNU_OPS))
1863 token->type = ANCHOR;
1864 token->opr.ctx_type = WORD_DELIM;
1866 break;
1867 case 'B':
1868 if (!(syntax & RE_NO_GNU_OPS))
1870 token->type = ANCHOR;
1871 token->opr.ctx_type = NOT_WORD_DELIM;
1873 break;
1874 case 'w':
1875 if (!(syntax & RE_NO_GNU_OPS))
1876 token->type = OP_WORD;
1877 break;
1878 case 'W':
1879 if (!(syntax & RE_NO_GNU_OPS))
1880 token->type = OP_NOTWORD;
1881 break;
1882 case 's':
1883 if (!(syntax & RE_NO_GNU_OPS))
1884 token->type = OP_SPACE;
1885 break;
1886 case 'S':
1887 if (!(syntax & RE_NO_GNU_OPS))
1888 token->type = OP_NOTSPACE;
1889 break;
1890 case '`':
1891 if (!(syntax & RE_NO_GNU_OPS))
1893 token->type = ANCHOR;
1894 token->opr.ctx_type = BUF_FIRST;
1896 break;
1897 case '\'':
1898 if (!(syntax & RE_NO_GNU_OPS))
1900 token->type = ANCHOR;
1901 token->opr.ctx_type = BUF_LAST;
1903 break;
1904 case '(':
1905 if (!(syntax & RE_NO_BK_PARENS))
1906 token->type = OP_OPEN_SUBEXP;
1907 break;
1908 case ')':
1909 if (!(syntax & RE_NO_BK_PARENS))
1910 token->type = OP_CLOSE_SUBEXP;
1911 break;
1912 case '+':
1913 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1914 token->type = OP_DUP_PLUS;
1915 break;
1916 case '?':
1917 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1918 token->type = OP_DUP_QUESTION;
1919 break;
1920 case '{':
1921 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1922 token->type = OP_OPEN_DUP_NUM;
1923 break;
1924 case '}':
1925 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1926 token->type = OP_CLOSE_DUP_NUM;
1927 break;
1928 default:
1929 break;
1931 return 2;
1934 token->type = CHARACTER;
1935 #ifdef RE_ENABLE_I18N
1936 if (input->mb_cur_max > 1)
1938 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1939 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1941 else
1942 #endif
1943 token->word_char = IS_WORD_CHAR (token->opr.c);
1945 switch (c)
1947 case '\n':
1948 if (syntax & RE_NEWLINE_ALT)
1949 token->type = OP_ALT;
1950 break;
1951 case '|':
1952 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1953 token->type = OP_ALT;
1954 break;
1955 case '*':
1956 token->type = OP_DUP_ASTERISK;
1957 break;
1958 case '+':
1959 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1960 token->type = OP_DUP_PLUS;
1961 break;
1962 case '?':
1963 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1964 token->type = OP_DUP_QUESTION;
1965 break;
1966 case '{':
1967 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1968 token->type = OP_OPEN_DUP_NUM;
1969 break;
1970 case '}':
1971 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1972 token->type = OP_CLOSE_DUP_NUM;
1973 break;
1974 case '(':
1975 if (syntax & RE_NO_BK_PARENS)
1976 token->type = OP_OPEN_SUBEXP;
1977 break;
1978 case ')':
1979 if (syntax & RE_NO_BK_PARENS)
1980 token->type = OP_CLOSE_SUBEXP;
1981 break;
1982 case '[':
1983 token->type = OP_OPEN_BRACKET;
1984 break;
1985 case '.':
1986 token->type = OP_PERIOD;
1987 break;
1988 case '^':
1989 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1990 re_string_cur_idx (input) != 0)
1992 char prev = re_string_peek_byte (input, -1);
1993 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1994 break;
1996 token->type = ANCHOR;
1997 token->opr.ctx_type = LINE_FIRST;
1998 break;
1999 case '$':
2000 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2001 re_string_cur_idx (input) + 1 != re_string_length (input))
2003 re_token_t next;
2004 re_string_skip_bytes (input, 1);
2005 peek_token (&next, input, syntax);
2006 re_string_skip_bytes (input, -1);
2007 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2008 break;
2010 token->type = ANCHOR;
2011 token->opr.ctx_type = LINE_LAST;
2012 break;
2013 default:
2014 break;
2016 return 1;
2019 /* Peek a token from INPUT, and return the length of the token.
2020 We must not use this function out of bracket expressions. */
2022 static int
2023 peek_token_bracket (token, input, syntax)
2024 re_token_t *token;
2025 re_string_t *input;
2026 reg_syntax_t syntax;
2028 unsigned char c;
2029 if (re_string_eoi (input))
2031 token->type = END_OF_RE;
2032 return 0;
2034 c = re_string_peek_byte (input, 0);
2035 token->opr.c = c;
2037 #ifdef RE_ENABLE_I18N
2038 if (input->mb_cur_max > 1 &&
2039 !re_string_first_byte (input, re_string_cur_idx (input)))
2041 token->type = CHARACTER;
2042 return 1;
2044 #endif /* RE_ENABLE_I18N */
2046 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2047 && re_string_cur_idx (input) + 1 < re_string_length (input))
2049 /* In this case, '\' escape a character. */
2050 unsigned char c2;
2051 re_string_skip_bytes (input, 1);
2052 c2 = re_string_peek_byte (input, 0);
2053 token->opr.c = c2;
2054 token->type = CHARACTER;
2055 return 1;
2057 if (c == '[') /* '[' is a special char in a bracket exps. */
2059 unsigned char c2;
2060 int token_len;
2061 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2062 c2 = re_string_peek_byte (input, 1);
2063 else
2064 c2 = 0;
2065 token->opr.c = c2;
2066 token_len = 2;
2067 switch (c2)
2069 case '.':
2070 token->type = OP_OPEN_COLL_ELEM;
2071 break;
2072 case '=':
2073 token->type = OP_OPEN_EQUIV_CLASS;
2074 break;
2075 case ':':
2076 if (syntax & RE_CHAR_CLASSES)
2078 token->type = OP_OPEN_CHAR_CLASS;
2079 break;
2081 /* else fall through. */
2082 default:
2083 token->type = CHARACTER;
2084 token->opr.c = c;
2085 token_len = 1;
2086 break;
2088 return token_len;
2090 switch (c)
2092 case '-':
2093 token->type = OP_CHARSET_RANGE;
2094 break;
2095 case ']':
2096 token->type = OP_CLOSE_BRACKET;
2097 break;
2098 case '^':
2099 token->type = OP_NON_MATCH_LIST;
2100 break;
2101 default:
2102 token->type = CHARACTER;
2104 return 1;
2107 /* Functions for parser. */
2109 /* Entry point of the parser.
2110 Parse the regular expression REGEXP and return the structure tree.
2111 If an error is occured, ERR is set by error code, and return NULL.
2112 This function build the following tree, from regular expression <reg_exp>:
2116 <reg_exp> EOR
2118 CAT means concatenation.
2119 EOR means end of regular expression. */
2121 static bin_tree_t *
2122 parse (regexp, preg, syntax, err)
2123 re_string_t *regexp;
2124 regex_t *preg;
2125 reg_syntax_t syntax;
2126 reg_errcode_t *err;
2128 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2129 bin_tree_t *tree, *eor, *root;
2130 re_token_t current_token;
2131 dfa->syntax = syntax;
2132 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2133 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2134 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2135 return NULL;
2136 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2137 if (tree != NULL)
2138 root = create_tree (dfa, tree, eor, CONCAT);
2139 else
2140 root = eor;
2141 if (BE (eor == NULL || root == NULL, 0))
2143 *err = REG_ESPACE;
2144 return NULL;
2146 return root;
2149 /* This function build the following tree, from regular expression
2150 <branch1>|<branch2>:
2154 <branch1> <branch2>
2156 ALT means alternative, which represents the operator `|'. */
2158 static bin_tree_t *
2159 parse_reg_exp (regexp, preg, token, syntax, nest, err)
2160 re_string_t *regexp;
2161 regex_t *preg;
2162 re_token_t *token;
2163 reg_syntax_t syntax;
2164 int nest;
2165 reg_errcode_t *err;
2167 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2168 bin_tree_t *tree, *branch = NULL;
2169 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2170 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2171 return NULL;
2173 while (token->type == OP_ALT)
2175 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2176 if (token->type != OP_ALT && token->type != END_OF_RE
2177 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2179 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2180 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2181 return NULL;
2183 else
2184 branch = NULL;
2185 tree = create_tree (dfa, tree, branch, OP_ALT);
2186 if (BE (tree == NULL, 0))
2188 *err = REG_ESPACE;
2189 return NULL;
2192 return tree;
2195 /* This function build the following tree, from regular expression
2196 <exp1><exp2>:
2200 <exp1> <exp2>
2202 CAT means concatenation. */
2204 static bin_tree_t *
2205 parse_branch (regexp, preg, token, syntax, nest, err)
2206 re_string_t *regexp;
2207 regex_t *preg;
2208 re_token_t *token;
2209 reg_syntax_t syntax;
2210 int nest;
2211 reg_errcode_t *err;
2213 bin_tree_t *tree, *exp;
2214 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2215 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2216 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2217 return NULL;
2219 while (token->type != OP_ALT && token->type != END_OF_RE
2220 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2222 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2223 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2225 return NULL;
2227 if (tree != NULL && exp != NULL)
2229 tree = create_tree (dfa, tree, exp, CONCAT);
2230 if (tree == NULL)
2232 *err = REG_ESPACE;
2233 return NULL;
2236 else if (tree == NULL)
2237 tree = exp;
2238 /* Otherwise exp == NULL, we don't need to create new tree. */
2240 return tree;
2243 /* This function build the following tree, from regular expression a*:
2249 static bin_tree_t *
2250 parse_expression (regexp, preg, token, syntax, nest, err)
2251 re_string_t *regexp;
2252 regex_t *preg;
2253 re_token_t *token;
2254 reg_syntax_t syntax;
2255 int nest;
2256 reg_errcode_t *err;
2258 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2259 bin_tree_t *tree;
2260 switch (token->type)
2262 case CHARACTER:
2263 tree = create_token_tree (dfa, NULL, NULL, token);
2264 if (BE (tree == NULL, 0))
2266 *err = REG_ESPACE;
2267 return NULL;
2269 #ifdef RE_ENABLE_I18N
2270 if (dfa->mb_cur_max > 1)
2272 while (!re_string_eoi (regexp)
2273 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2275 bin_tree_t *mbc_remain;
2276 fetch_token (token, regexp, syntax);
2277 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2278 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2279 if (BE (mbc_remain == NULL || tree == NULL, 0))
2281 *err = REG_ESPACE;
2282 return NULL;
2286 #endif
2287 break;
2288 case OP_OPEN_SUBEXP:
2289 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2290 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2291 return NULL;
2292 break;
2293 case OP_OPEN_BRACKET:
2294 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2295 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2296 return NULL;
2297 break;
2298 case OP_BACK_REF:
2299 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2301 *err = REG_ESUBREG;
2302 return NULL;
2304 dfa->used_bkref_map |= 1 << token->opr.idx;
2305 tree = create_token_tree (dfa, NULL, NULL, token);
2306 if (BE (tree == NULL, 0))
2308 *err = REG_ESPACE;
2309 return NULL;
2311 ++dfa->nbackref;
2312 dfa->has_mb_node = 1;
2313 break;
2314 case OP_OPEN_DUP_NUM:
2315 if (syntax & RE_CONTEXT_INVALID_DUP)
2317 *err = REG_BADRPT;
2318 return NULL;
2320 /* FALLTHROUGH */
2321 case OP_DUP_ASTERISK:
2322 case OP_DUP_PLUS:
2323 case OP_DUP_QUESTION:
2324 if (syntax & RE_CONTEXT_INVALID_OPS)
2326 *err = REG_BADRPT;
2327 return NULL;
2329 else if (syntax & RE_CONTEXT_INDEP_OPS)
2331 fetch_token (token, regexp, syntax);
2332 return parse_expression (regexp, preg, token, syntax, nest, err);
2334 /* else fall through */
2335 case OP_CLOSE_SUBEXP:
2336 if ((token->type == OP_CLOSE_SUBEXP) &&
2337 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2339 *err = REG_ERPAREN;
2340 return NULL;
2342 /* else fall through */
2343 case OP_CLOSE_DUP_NUM:
2344 /* We treat it as a normal character. */
2346 /* Then we can these characters as normal characters. */
2347 token->type = CHARACTER;
2348 /* mb_partial and word_char bits should be initialized already
2349 by peek_token. */
2350 tree = create_token_tree (dfa, NULL, NULL, token);
2351 if (BE (tree == NULL, 0))
2353 *err = REG_ESPACE;
2354 return NULL;
2356 break;
2357 case ANCHOR:
2358 if ((token->opr.ctx_type
2359 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2360 && dfa->word_ops_used == 0)
2361 init_word_char (dfa);
2362 if (token->opr.ctx_type == WORD_DELIM
2363 || token->opr.ctx_type == NOT_WORD_DELIM)
2365 bin_tree_t *tree_first, *tree_last;
2366 if (token->opr.ctx_type == WORD_DELIM)
2368 token->opr.ctx_type = WORD_FIRST;
2369 tree_first = create_token_tree (dfa, NULL, NULL, token);
2370 token->opr.ctx_type = WORD_LAST;
2372 else
2374 token->opr.ctx_type = INSIDE_WORD;
2375 tree_first = create_token_tree (dfa, NULL, NULL, token);
2376 token->opr.ctx_type = INSIDE_NOTWORD;
2378 tree_last = create_token_tree (dfa, NULL, NULL, token);
2379 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2380 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2382 *err = REG_ESPACE;
2383 return NULL;
2386 else
2388 tree = create_token_tree (dfa, NULL, NULL, token);
2389 if (BE (tree == NULL, 0))
2391 *err = REG_ESPACE;
2392 return NULL;
2395 /* We must return here, since ANCHORs can't be followed
2396 by repetition operators.
2397 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2398 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2399 fetch_token (token, regexp, syntax);
2400 return tree;
2401 case OP_PERIOD:
2402 tree = create_token_tree (dfa, NULL, NULL, token);
2403 if (BE (tree == NULL, 0))
2405 *err = REG_ESPACE;
2406 return NULL;
2408 if (dfa->mb_cur_max > 1)
2409 dfa->has_mb_node = 1;
2410 break;
2411 case OP_WORD:
2412 case OP_NOTWORD:
2413 tree = build_charclass_op (dfa, regexp->trans,
2414 (const unsigned char *) "alnum",
2415 (const unsigned char *) "_",
2416 token->type == OP_NOTWORD, err);
2417 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2418 return NULL;
2419 break;
2420 case OP_SPACE:
2421 case OP_NOTSPACE:
2422 tree = build_charclass_op (dfa, regexp->trans,
2423 (const unsigned char *) "space",
2424 (const unsigned char *) "",
2425 token->type == OP_NOTSPACE, err);
2426 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2427 return NULL;
2428 break;
2429 case OP_ALT:
2430 case END_OF_RE:
2431 return NULL;
2432 case BACK_SLASH:
2433 *err = REG_EESCAPE;
2434 return NULL;
2435 default:
2436 /* Must not happen? */
2437 #ifdef DEBUG
2438 assert (0);
2439 #endif
2440 return NULL;
2442 fetch_token (token, regexp, syntax);
2444 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2445 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2447 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2448 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2449 return NULL;
2450 /* In BRE consecutive duplications are not allowed. */
2451 if ((syntax & RE_CONTEXT_INVALID_DUP)
2452 && (token->type == OP_DUP_ASTERISK
2453 || token->type == OP_OPEN_DUP_NUM))
2455 *err = REG_BADRPT;
2456 return NULL;
2460 return tree;
2463 /* This function build the following tree, from regular expression
2464 (<reg_exp>):
2465 SUBEXP
2467 <reg_exp>
2470 static bin_tree_t *
2471 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2472 re_string_t *regexp;
2473 regex_t *preg;
2474 re_token_t *token;
2475 reg_syntax_t syntax;
2476 int nest;
2477 reg_errcode_t *err;
2479 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2480 bin_tree_t *tree;
2481 size_t cur_nsub;
2482 cur_nsub = preg->re_nsub++;
2484 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2486 /* The subexpression may be a null string. */
2487 if (token->type == OP_CLOSE_SUBEXP)
2488 tree = NULL;
2489 else
2491 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2492 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2493 *err = REG_EPAREN;
2494 if (BE (*err != REG_NOERROR, 0))
2495 return NULL;
2498 if (cur_nsub <= '9' - '1')
2499 dfa->completed_bkref_map |= 1 << cur_nsub;
2501 tree = create_tree (dfa, tree, NULL, SUBEXP);
2502 if (BE (tree == NULL, 0))
2504 *err = REG_ESPACE;
2505 return NULL;
2507 tree->token.opr.idx = cur_nsub;
2508 return tree;
2511 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2513 static bin_tree_t *
2514 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2515 bin_tree_t *elem;
2516 re_string_t *regexp;
2517 re_dfa_t *dfa;
2518 re_token_t *token;
2519 reg_syntax_t syntax;
2520 reg_errcode_t *err;
2522 bin_tree_t *tree = NULL, *old_tree = NULL;
2523 int i, start, end, start_idx = re_string_cur_idx (regexp);
2524 re_token_t start_token = *token;
2526 if (token->type == OP_OPEN_DUP_NUM)
2528 end = 0;
2529 start = fetch_number (regexp, token, syntax);
2530 if (start == -1)
2532 if (token->type == CHARACTER && token->opr.c == ',')
2533 start = 0; /* We treat "{,m}" as "{0,m}". */
2534 else
2536 *err = REG_BADBR; /* <re>{} is invalid. */
2537 return NULL;
2540 if (BE (start != -2, 1))
2542 /* We treat "{n}" as "{n,n}". */
2543 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2544 : ((token->type == CHARACTER && token->opr.c == ',')
2545 ? fetch_number (regexp, token, syntax) : -2));
2547 if (BE (start == -2 || end == -2, 0))
2549 /* Invalid sequence. */
2550 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2552 if (token->type == END_OF_RE)
2553 *err = REG_EBRACE;
2554 else
2555 *err = REG_BADBR;
2557 return NULL;
2560 /* If the syntax bit is set, rollback. */
2561 re_string_set_index (regexp, start_idx);
2562 *token = start_token;
2563 token->type = CHARACTER;
2564 /* mb_partial and word_char bits should be already initialized by
2565 peek_token. */
2566 return elem;
2569 if (BE (end != -1 && start > end, 0))
2571 /* First number greater than second. */
2572 *err = REG_BADBR;
2573 return NULL;
2576 else
2578 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2579 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2582 fetch_token (token, regexp, syntax);
2584 if (BE (elem == NULL, 0))
2585 return NULL;
2586 if (BE (start == 0 && end == 0, 0))
2588 postorder (elem, free_tree, NULL);
2589 return NULL;
2592 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2593 if (BE (start > 0, 0))
2595 tree = elem;
2596 for (i = 2; i <= start; ++i)
2598 elem = duplicate_tree (elem, dfa);
2599 tree = create_tree (dfa, tree, elem, CONCAT);
2600 if (BE (elem == NULL || tree == NULL, 0))
2601 goto parse_dup_op_espace;
2604 if (start == end)
2605 return tree;
2607 /* Duplicate ELEM before it is marked optional. */
2608 elem = duplicate_tree (elem, dfa);
2609 old_tree = tree;
2611 else
2612 old_tree = NULL;
2614 if (elem->token.type == SUBEXP)
2615 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2617 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2618 if (BE (tree == NULL, 0))
2619 goto parse_dup_op_espace;
2621 /* This loop is actually executed only when end != -1,
2622 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2623 already created the start+1-th copy. */
2624 for (i = start + 2; i <= end; ++i)
2626 elem = duplicate_tree (elem, dfa);
2627 tree = create_tree (dfa, tree, elem, CONCAT);
2628 if (BE (elem == NULL || tree == NULL, 0))
2629 goto parse_dup_op_espace;
2631 tree = create_tree (dfa, tree, NULL, OP_ALT);
2632 if (BE (tree == NULL, 0))
2633 goto parse_dup_op_espace;
2636 if (old_tree)
2637 tree = create_tree (dfa, old_tree, tree, CONCAT);
2639 return tree;
2641 parse_dup_op_espace:
2642 *err = REG_ESPACE;
2643 return NULL;
2646 /* Size of the names for collating symbol/equivalence_class/character_class.
2647 I'm not sure, but maybe enough. */
2648 #define BRACKET_NAME_BUF_SIZE 32
2650 #ifndef _LIBC
2651 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2652 Build the range expression which starts from START_ELEM, and ends
2653 at END_ELEM. The result are written to MBCSET and SBCSET.
2654 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2655 mbcset->range_ends, is a pointer argument sinse we may
2656 update it. */
2658 static reg_errcode_t
2659 # ifdef RE_ENABLE_I18N
2660 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2661 re_charset_t *mbcset;
2662 int *range_alloc;
2663 # else /* not RE_ENABLE_I18N */
2664 build_range_exp (sbcset, start_elem, end_elem)
2665 # endif /* not RE_ENABLE_I18N */
2666 bitset_t sbcset;
2667 bracket_elem_t *start_elem, *end_elem;
2669 unsigned int start_ch, end_ch;
2670 /* Equivalence Classes and Character Classes can't be a range start/end. */
2671 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2672 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2674 return REG_ERANGE;
2676 /* We can handle no multi character collating elements without libc
2677 support. */
2678 if (BE ((start_elem->type == COLL_SYM
2679 && strlen ((char *) start_elem->opr.name) > 1)
2680 || (end_elem->type == COLL_SYM
2681 && strlen ((char *) end_elem->opr.name) > 1), 0))
2682 return REG_ECOLLATE;
2684 # ifdef RE_ENABLE_I18N
2686 wchar_t wc;
2687 wint_t start_wc;
2688 wint_t end_wc;
2689 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2691 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2692 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2693 : 0));
2694 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2695 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2696 : 0));
2697 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2698 ? __btowc (start_ch) : start_elem->opr.wch);
2699 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2700 ? __btowc (end_ch) : end_elem->opr.wch);
2701 if (start_wc == WEOF || end_wc == WEOF)
2702 return REG_ECOLLATE;
2703 cmp_buf[0] = start_wc;
2704 cmp_buf[4] = end_wc;
2705 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2706 return REG_ERANGE;
2708 /* Got valid collation sequence values, add them as a new entry.
2709 However, for !_LIBC we have no collation elements: if the
2710 character set is single byte, the single byte character set
2711 that we build below suffices. parse_bracket_exp passes
2712 no MBCSET if dfa->mb_cur_max == 1. */
2713 if (mbcset)
2715 /* Check the space of the arrays. */
2716 if (BE (*range_alloc == mbcset->nranges, 0))
2718 /* There is not enough space, need realloc. */
2719 wchar_t *new_array_start, *new_array_end;
2720 int new_nranges;
2722 /* +1 in case of mbcset->nranges is 0. */
2723 new_nranges = 2 * mbcset->nranges + 1;
2724 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2725 are NULL if *range_alloc == 0. */
2726 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2727 new_nranges);
2728 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2729 new_nranges);
2731 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2732 return REG_ESPACE;
2734 mbcset->range_starts = new_array_start;
2735 mbcset->range_ends = new_array_end;
2736 *range_alloc = new_nranges;
2739 mbcset->range_starts[mbcset->nranges] = start_wc;
2740 mbcset->range_ends[mbcset->nranges++] = end_wc;
2743 /* Build the table for single byte characters. */
2744 for (wc = 0; wc < SBC_MAX; ++wc)
2746 cmp_buf[2] = wc;
2747 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2748 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2749 bitset_set (sbcset, wc);
2752 # else /* not RE_ENABLE_I18N */
2754 unsigned int ch;
2755 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2756 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2757 : 0));
2758 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2759 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2760 : 0));
2761 if (start_ch > end_ch)
2762 return REG_ERANGE;
2763 /* Build the table for single byte characters. */
2764 for (ch = 0; ch < SBC_MAX; ++ch)
2765 if (start_ch <= ch && ch <= end_ch)
2766 bitset_set (sbcset, ch);
2768 # endif /* not RE_ENABLE_I18N */
2769 return REG_NOERROR;
2771 #endif /* not _LIBC */
2773 #ifndef _LIBC
2774 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2775 Build the collating element which is represented by NAME.
2776 The result are written to MBCSET and SBCSET.
2777 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2778 pointer argument since we may update it. */
2780 static reg_errcode_t
2781 # ifdef RE_ENABLE_I18N
2782 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2783 re_charset_t *mbcset;
2784 int *coll_sym_alloc;
2785 # else /* not RE_ENABLE_I18N */
2786 build_collating_symbol (sbcset, name)
2787 # endif /* not RE_ENABLE_I18N */
2788 bitset_t sbcset;
2789 const unsigned char *name;
2791 size_t name_len = strlen ((const char *) name);
2792 if (BE (name_len != 1, 0))
2793 return REG_ECOLLATE;
2794 else
2796 bitset_set (sbcset, name[0]);
2797 return REG_NOERROR;
2800 #endif /* not _LIBC */
2802 /* This function parse bracket expression like "[abc]", "[a-c]",
2803 "[[.a-a.]]" etc. */
2805 static bin_tree_t *
2806 parse_bracket_exp (regexp, dfa, token, syntax, err)
2807 re_string_t *regexp;
2808 re_dfa_t *dfa;
2809 re_token_t *token;
2810 reg_syntax_t syntax;
2811 reg_errcode_t *err;
2813 #ifdef _LIBC
2814 const unsigned char *collseqmb;
2815 const char *collseqwc;
2816 uint32_t nrules;
2817 int32_t table_size;
2818 const int32_t *symb_table;
2819 const unsigned char *extra;
2821 /* Local function for parse_bracket_exp used in _LIBC environement.
2822 Seek the collating symbol entry correspondings to NAME.
2823 Return the index of the symbol in the SYMB_TABLE. */
2825 auto inline int32_t
2826 __attribute ((always_inline))
2827 seek_collating_symbol_entry (name, name_len)
2828 const unsigned char *name;
2829 size_t name_len;
2831 int32_t hash = elem_hash ((const char *) name, name_len);
2832 int32_t elem = hash % table_size;
2833 int32_t second = hash % (table_size - 2);
2834 while (symb_table[2 * elem] != 0)
2836 /* First compare the hashing value. */
2837 if (symb_table[2 * elem] == hash
2838 /* Compare the length of the name. */
2839 && name_len == extra[symb_table[2 * elem + 1]]
2840 /* Compare the name. */
2841 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2842 name_len) == 0)
2844 /* Yep, this is the entry. */
2845 break;
2848 /* Next entry. */
2849 elem += second;
2851 return elem;
2854 /* Local function for parse_bracket_exp used in _LIBC environement.
2855 Look up the collation sequence value of BR_ELEM.
2856 Return the value if succeeded, UINT_MAX otherwise. */
2858 auto inline unsigned int
2859 __attribute ((always_inline))
2860 lookup_collation_sequence_value (br_elem)
2861 bracket_elem_t *br_elem;
2863 if (br_elem->type == SB_CHAR)
2866 if (MB_CUR_MAX == 1)
2868 if (nrules == 0)
2869 return collseqmb[br_elem->opr.ch];
2870 else
2872 wint_t wc = __btowc (br_elem->opr.ch);
2873 return __collseq_table_lookup (collseqwc, wc);
2876 else if (br_elem->type == MB_CHAR)
2878 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2880 else if (br_elem->type == COLL_SYM)
2882 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2883 if (nrules != 0)
2885 int32_t elem, idx;
2886 elem = seek_collating_symbol_entry (br_elem->opr.name,
2887 sym_name_len);
2888 if (symb_table[2 * elem] != 0)
2890 /* We found the entry. */
2891 idx = symb_table[2 * elem + 1];
2892 /* Skip the name of collating element name. */
2893 idx += 1 + extra[idx];
2894 /* Skip the byte sequence of the collating element. */
2895 idx += 1 + extra[idx];
2896 /* Adjust for the alignment. */
2897 idx = (idx + 3) & ~3;
2898 /* Skip the multibyte collation sequence value. */
2899 idx += sizeof (unsigned int);
2900 /* Skip the wide char sequence of the collating element. */
2901 idx += sizeof (unsigned int) *
2902 (1 + *(unsigned int *) (extra + idx));
2903 /* Return the collation sequence value. */
2904 return *(unsigned int *) (extra + idx);
2906 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2908 /* No valid character. Match it as a single byte
2909 character. */
2910 return collseqmb[br_elem->opr.name[0]];
2913 else if (sym_name_len == 1)
2914 return collseqmb[br_elem->opr.name[0]];
2916 return UINT_MAX;
2919 /* Local function for parse_bracket_exp used in _LIBC environement.
2920 Build the range expression which starts from START_ELEM, and ends
2921 at END_ELEM. The result are written to MBCSET and SBCSET.
2922 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2923 mbcset->range_ends, is a pointer argument sinse we may
2924 update it. */
2926 auto inline reg_errcode_t
2927 __attribute ((always_inline))
2928 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2929 re_charset_t *mbcset;
2930 int *range_alloc;
2931 bitset_t sbcset;
2932 bracket_elem_t *start_elem, *end_elem;
2934 unsigned int ch;
2935 uint32_t start_collseq;
2936 uint32_t end_collseq;
2938 /* Equivalence Classes and Character Classes can't be a range
2939 start/end. */
2940 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2941 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2943 return REG_ERANGE;
2945 start_collseq = lookup_collation_sequence_value (start_elem);
2946 end_collseq = lookup_collation_sequence_value (end_elem);
2947 /* Check start/end collation sequence values. */
2948 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2949 return REG_ECOLLATE;
2950 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2951 return REG_ERANGE;
2953 /* Got valid collation sequence values, add them as a new entry.
2954 However, if we have no collation elements, and the character set
2955 is single byte, the single byte character set that we
2956 build below suffices. */
2957 if (nrules > 0 || dfa->mb_cur_max > 1)
2959 /* Check the space of the arrays. */
2960 if (BE (*range_alloc == mbcset->nranges, 0))
2962 /* There is not enough space, need realloc. */
2963 uint32_t *new_array_start;
2964 uint32_t *new_array_end;
2965 int new_nranges;
2967 /* +1 in case of mbcset->nranges is 0. */
2968 new_nranges = 2 * mbcset->nranges + 1;
2969 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2970 new_nranges);
2971 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2972 new_nranges);
2974 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2975 return REG_ESPACE;
2977 mbcset->range_starts = new_array_start;
2978 mbcset->range_ends = new_array_end;
2979 *range_alloc = new_nranges;
2982 mbcset->range_starts[mbcset->nranges] = start_collseq;
2983 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2986 /* Build the table for single byte characters. */
2987 for (ch = 0; ch < SBC_MAX; ch++)
2989 uint32_t ch_collseq;
2991 if (MB_CUR_MAX == 1)
2993 if (nrules == 0)
2994 ch_collseq = collseqmb[ch];
2995 else
2996 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2997 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2998 bitset_set (sbcset, ch);
3000 return REG_NOERROR;
3003 /* Local function for parse_bracket_exp used in _LIBC environement.
3004 Build the collating element which is represented by NAME.
3005 The result are written to MBCSET and SBCSET.
3006 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3007 pointer argument sinse we may update it. */
3009 auto inline reg_errcode_t
3010 __attribute ((always_inline))
3011 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3012 re_charset_t *mbcset;
3013 int *coll_sym_alloc;
3014 bitset_t sbcset;
3015 const unsigned char *name;
3017 int32_t elem, idx;
3018 size_t name_len = strlen ((const char *) name);
3019 if (nrules != 0)
3021 elem = seek_collating_symbol_entry (name, name_len);
3022 if (symb_table[2 * elem] != 0)
3024 /* We found the entry. */
3025 idx = symb_table[2 * elem + 1];
3026 /* Skip the name of collating element name. */
3027 idx += 1 + extra[idx];
3029 else if (symb_table[2 * elem] == 0 && name_len == 1)
3031 /* No valid character, treat it as a normal
3032 character. */
3033 bitset_set (sbcset, name[0]);
3034 return REG_NOERROR;
3036 else
3037 return REG_ECOLLATE;
3039 /* Got valid collation sequence, add it as a new entry. */
3040 /* Check the space of the arrays. */
3041 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3043 /* Not enough, realloc it. */
3044 /* +1 in case of mbcset->ncoll_syms is 0. */
3045 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3046 /* Use realloc since mbcset->coll_syms is NULL
3047 if *alloc == 0. */
3048 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3049 new_coll_sym_alloc);
3050 if (BE (new_coll_syms == NULL, 0))
3051 return REG_ESPACE;
3052 mbcset->coll_syms = new_coll_syms;
3053 *coll_sym_alloc = new_coll_sym_alloc;
3055 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3056 return REG_NOERROR;
3058 else
3060 if (BE (name_len != 1, 0))
3061 return REG_ECOLLATE;
3062 else
3064 bitset_set (sbcset, name[0]);
3065 return REG_NOERROR;
3069 #endif
3071 re_token_t br_token;
3072 re_bitset_ptr_t sbcset;
3073 #ifdef RE_ENABLE_I18N
3074 re_charset_t *mbcset;
3075 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3076 int equiv_class_alloc = 0, char_class_alloc = 0;
3077 #endif /* not RE_ENABLE_I18N */
3078 int non_match = 0;
3079 bin_tree_t *work_tree;
3080 int token_len;
3081 int first_round = 1;
3082 #ifdef _LIBC
3083 collseqmb = (const unsigned char *)
3084 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3085 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3086 if (nrules)
3089 if (MB_CUR_MAX > 1)
3091 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3092 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3093 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3094 _NL_COLLATE_SYMB_TABLEMB);
3095 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3096 _NL_COLLATE_SYMB_EXTRAMB);
3098 #endif
3099 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3100 #ifdef RE_ENABLE_I18N
3101 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3102 #endif /* RE_ENABLE_I18N */
3103 #ifdef RE_ENABLE_I18N
3104 if (BE (sbcset == NULL || mbcset == NULL, 0))
3105 #else
3106 if (BE (sbcset == NULL, 0))
3107 #endif /* RE_ENABLE_I18N */
3109 *err = REG_ESPACE;
3110 return NULL;
3113 token_len = peek_token_bracket (token, regexp, syntax);
3114 if (BE (token->type == END_OF_RE, 0))
3116 *err = REG_BADPAT;
3117 goto parse_bracket_exp_free_return;
3119 if (token->type == OP_NON_MATCH_LIST)
3121 #ifdef RE_ENABLE_I18N
3122 mbcset->non_match = 1;
3123 #endif /* not RE_ENABLE_I18N */
3124 non_match = 1;
3125 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3126 bitset_set (sbcset, '\0');
3127 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3128 token_len = peek_token_bracket (token, regexp, syntax);
3129 if (BE (token->type == END_OF_RE, 0))
3131 *err = REG_BADPAT;
3132 goto parse_bracket_exp_free_return;
3136 /* We treat the first ']' as a normal character. */
3137 if (token->type == OP_CLOSE_BRACKET)
3138 token->type = CHARACTER;
3140 while (1)
3142 bracket_elem_t start_elem, end_elem;
3143 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3144 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3145 reg_errcode_t ret;
3146 int token_len2 = 0, is_range_exp = 0;
3147 re_token_t token2;
3149 start_elem.opr.name = start_name_buf;
3150 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3151 syntax, first_round);
3152 if (BE (ret != REG_NOERROR, 0))
3154 *err = ret;
3155 goto parse_bracket_exp_free_return;
3157 first_round = 0;
3159 /* Get information about the next token. We need it in any case. */
3160 token_len = peek_token_bracket (token, regexp, syntax);
3162 /* Do not check for ranges if we know they are not allowed. */
3163 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3165 if (BE (token->type == END_OF_RE, 0))
3167 *err = REG_EBRACK;
3168 goto parse_bracket_exp_free_return;
3170 if (token->type == OP_CHARSET_RANGE)
3172 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3173 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3174 if (BE (token2.type == END_OF_RE, 0))
3176 *err = REG_EBRACK;
3177 goto parse_bracket_exp_free_return;
3179 if (token2.type == OP_CLOSE_BRACKET)
3181 /* We treat the last '-' as a normal character. */
3182 re_string_skip_bytes (regexp, -token_len);
3183 token->type = CHARACTER;
3185 else
3186 is_range_exp = 1;
3190 if (is_range_exp == 1)
3192 end_elem.opr.name = end_name_buf;
3193 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3194 dfa, syntax, 1);
3195 if (BE (ret != REG_NOERROR, 0))
3197 *err = ret;
3198 goto parse_bracket_exp_free_return;
3201 token_len = peek_token_bracket (token, regexp, syntax);
3203 #ifdef _LIBC
3204 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3205 &start_elem, &end_elem);
3206 #else
3207 # ifdef RE_ENABLE_I18N
3208 *err = build_range_exp (sbcset,
3209 dfa->mb_cur_max > 1 ? mbcset : NULL,
3210 &range_alloc, &start_elem, &end_elem);
3211 # else
3212 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3213 # endif
3214 #endif /* RE_ENABLE_I18N */
3215 if (BE (*err != REG_NOERROR, 0))
3216 goto parse_bracket_exp_free_return;
3218 else
3220 switch (start_elem.type)
3222 case SB_CHAR:
3223 bitset_set (sbcset, start_elem.opr.ch);
3224 break;
3225 #ifdef RE_ENABLE_I18N
3226 case MB_CHAR:
3227 /* Check whether the array has enough space. */
3228 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3230 wchar_t *new_mbchars;
3231 /* Not enough, realloc it. */
3232 /* +1 in case of mbcset->nmbchars is 0. */
3233 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3234 /* Use realloc since array is NULL if *alloc == 0. */
3235 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3236 mbchar_alloc);
3237 if (BE (new_mbchars == NULL, 0))
3238 goto parse_bracket_exp_espace;
3239 mbcset->mbchars = new_mbchars;
3241 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3242 break;
3243 #endif /* RE_ENABLE_I18N */
3244 case EQUIV_CLASS:
3245 *err = build_equiv_class (sbcset,
3246 #ifdef RE_ENABLE_I18N
3247 mbcset, &equiv_class_alloc,
3248 #endif /* RE_ENABLE_I18N */
3249 start_elem.opr.name);
3250 if (BE (*err != REG_NOERROR, 0))
3251 goto parse_bracket_exp_free_return;
3252 break;
3253 case COLL_SYM:
3254 *err = build_collating_symbol (sbcset,
3255 #ifdef RE_ENABLE_I18N
3256 mbcset, &coll_sym_alloc,
3257 #endif /* RE_ENABLE_I18N */
3258 start_elem.opr.name);
3259 if (BE (*err != REG_NOERROR, 0))
3260 goto parse_bracket_exp_free_return;
3261 break;
3262 case CHAR_CLASS:
3263 *err = build_charclass (regexp->trans, sbcset,
3264 #ifdef RE_ENABLE_I18N
3265 mbcset, &char_class_alloc,
3266 #endif /* RE_ENABLE_I18N */
3267 start_elem.opr.name, syntax);
3268 if (BE (*err != REG_NOERROR, 0))
3269 goto parse_bracket_exp_free_return;
3270 break;
3271 default:
3272 assert (0);
3273 break;
3276 if (BE (token->type == END_OF_RE, 0))
3278 *err = REG_EBRACK;
3279 goto parse_bracket_exp_free_return;
3281 if (token->type == OP_CLOSE_BRACKET)
3282 break;
3285 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3287 /* If it is non-matching list. */
3288 if (non_match)
3289 bitset_not (sbcset);
3291 #ifdef RE_ENABLE_I18N
3292 /* Ensure only single byte characters are set. */
3293 if (dfa->mb_cur_max > 1)
3294 bitset_mask (sbcset, dfa->sb_char);
3296 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3297 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3298 || mbcset->non_match)))
3300 bin_tree_t *mbc_tree;
3301 int sbc_idx;
3302 /* Build a tree for complex bracket. */
3303 dfa->has_mb_node = 1;
3304 br_token.type = COMPLEX_BRACKET;
3305 br_token.opr.mbcset = mbcset;
3306 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3307 if (BE (mbc_tree == NULL, 0))
3308 goto parse_bracket_exp_espace;
3309 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3310 if (sbcset[sbc_idx])
3311 break;
3312 /* If there are no bits set in sbcset, there is no point
3313 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3314 if (sbc_idx < BITSET_WORDS)
3316 /* Build a tree for simple bracket. */
3317 br_token.type = SIMPLE_BRACKET;
3318 br_token.opr.sbcset = sbcset;
3319 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3320 if (BE (work_tree == NULL, 0))
3321 goto parse_bracket_exp_espace;
3323 /* Then join them by ALT node. */
3324 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3325 if (BE (work_tree == NULL, 0))
3326 goto parse_bracket_exp_espace;
3328 else
3330 re_free (sbcset);
3331 work_tree = mbc_tree;
3334 else
3335 #endif /* not RE_ENABLE_I18N */
3337 #ifdef RE_ENABLE_I18N
3338 free_charset (mbcset);
3339 #endif
3340 /* Build a tree for simple bracket. */
3341 br_token.type = SIMPLE_BRACKET;
3342 br_token.opr.sbcset = sbcset;
3343 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3344 if (BE (work_tree == NULL, 0))
3345 goto parse_bracket_exp_espace;
3347 return work_tree;
3349 parse_bracket_exp_espace:
3350 *err = REG_ESPACE;
3351 parse_bracket_exp_free_return:
3352 re_free (sbcset);
3353 #ifdef RE_ENABLE_I18N
3354 free_charset (mbcset);
3355 #endif /* RE_ENABLE_I18N */
3356 return NULL;
3359 /* Parse an element in the bracket expression. */
3361 static reg_errcode_t
3362 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3363 accept_hyphen)
3364 bracket_elem_t *elem;
3365 re_string_t *regexp;
3366 re_token_t *token;
3367 int token_len;
3368 re_dfa_t *dfa;
3369 reg_syntax_t syntax;
3370 int accept_hyphen;
3372 #ifdef RE_ENABLE_I18N
3373 int cur_char_size;
3374 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3375 if (cur_char_size > 1)
3377 elem->type = MB_CHAR;
3378 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3379 re_string_skip_bytes (regexp, cur_char_size);
3380 return REG_NOERROR;
3382 #endif /* RE_ENABLE_I18N */
3383 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3384 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3385 || token->type == OP_OPEN_EQUIV_CLASS)
3386 return parse_bracket_symbol (elem, regexp, token);
3387 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3389 /* A '-' must only appear as anything but a range indicator before
3390 the closing bracket. Everything else is an error. */
3391 re_token_t token2;
3392 (void) peek_token_bracket (&token2, regexp, syntax);
3393 if (token2.type != OP_CLOSE_BRACKET)
3394 /* The actual error value is not standardized since this whole
3395 case is undefined. But ERANGE makes good sense. */
3396 return REG_ERANGE;
3398 elem->type = SB_CHAR;
3399 elem->opr.ch = token->opr.c;
3400 return REG_NOERROR;
3403 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3404 such as [:<character_class>:], [.<collating_element>.], and
3405 [=<equivalent_class>=]. */
3407 static reg_errcode_t
3408 parse_bracket_symbol (elem, regexp, token)
3409 bracket_elem_t *elem;
3410 re_string_t *regexp;
3411 re_token_t *token;
3413 unsigned char ch, delim = token->opr.c;
3414 int i = 0;
3415 if (re_string_eoi(regexp))
3416 return REG_EBRACK;
3417 for (;; ++i)
3419 if (i >= BRACKET_NAME_BUF_SIZE)
3420 return REG_EBRACK;
3421 if (token->type == OP_OPEN_CHAR_CLASS)
3422 ch = re_string_fetch_byte_case (regexp);
3423 else
3424 ch = re_string_fetch_byte (regexp);
3425 if (re_string_eoi(regexp))
3426 return REG_EBRACK;
3427 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3428 break;
3429 elem->opr.name[i] = ch;
3431 re_string_skip_bytes (regexp, 1);
3432 elem->opr.name[i] = '\0';
3433 switch (token->type)
3435 case OP_OPEN_COLL_ELEM:
3436 elem->type = COLL_SYM;
3437 break;
3438 case OP_OPEN_EQUIV_CLASS:
3439 elem->type = EQUIV_CLASS;
3440 break;
3441 case OP_OPEN_CHAR_CLASS:
3442 elem->type = CHAR_CLASS;
3443 break;
3444 default:
3445 break;
3447 return REG_NOERROR;
3450 /* Helper function for parse_bracket_exp.
3451 Build the equivalence class which is represented by NAME.
3452 The result are written to MBCSET and SBCSET.
3453 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3454 is a pointer argument sinse we may update it. */
3456 static reg_errcode_t
3457 #ifdef RE_ENABLE_I18N
3458 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3459 re_charset_t *mbcset;
3460 int *equiv_class_alloc;
3461 #else /* not RE_ENABLE_I18N */
3462 build_equiv_class (sbcset, name)
3463 #endif /* not RE_ENABLE_I18N */
3464 bitset_t sbcset;
3465 const unsigned char *name;
3467 #if defined _LIBC
3468 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3469 if (nrules != 0)
3471 const int32_t *table, *indirect;
3472 const unsigned char *weights, *extra, *cp;
3473 unsigned char char_buf[2];
3474 int32_t idx1, idx2;
3475 unsigned int ch;
3476 size_t len;
3477 /* This #include defines a local function! */
3478 # include <locale/weight.h>
3479 /* Calculate the index for equivalence class. */
3480 cp = name;
3481 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3482 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3483 _NL_COLLATE_WEIGHTMB);
3484 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3485 _NL_COLLATE_EXTRAMB);
3486 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3487 _NL_COLLATE_INDIRECTMB);
3488 idx1 = findidx (&cp);
3489 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3490 /* This isn't a valid character. */
3491 return REG_ECOLLATE;
3493 /* Build single byte matcing table for this equivalence class. */
3494 char_buf[1] = (unsigned char) '\0';
3495 len = weights[idx1];
3496 for (ch = 0; ch < SBC_MAX; ++ch)
3498 char_buf[0] = ch;
3499 cp = char_buf;
3500 idx2 = findidx (&cp);
3502 idx2 = table[ch];
3504 if (idx2 == 0)
3505 /* This isn't a valid character. */
3506 continue;
3507 if (len == weights[idx2])
3509 int cnt = 0;
3510 while (cnt <= len &&
3511 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3512 ++cnt;
3514 if (cnt > len)
3515 bitset_set (sbcset, ch);
3518 /* Check whether the array has enough space. */
3519 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3521 /* Not enough, realloc it. */
3522 /* +1 in case of mbcset->nequiv_classes is 0. */
3523 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3524 /* Use realloc since the array is NULL if *alloc == 0. */
3525 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3526 int32_t,
3527 new_equiv_class_alloc);
3528 if (BE (new_equiv_classes == NULL, 0))
3529 return REG_ESPACE;
3530 mbcset->equiv_classes = new_equiv_classes;
3531 *equiv_class_alloc = new_equiv_class_alloc;
3533 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3535 else
3536 #endif /* _LIBC */
3538 if (BE (strlen ((const char *) name) != 1, 0))
3539 return REG_ECOLLATE;
3540 bitset_set (sbcset, *name);
3542 return REG_NOERROR;
3545 /* Helper function for parse_bracket_exp.
3546 Build the character class which is represented by NAME.
3547 The result are written to MBCSET and SBCSET.
3548 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3549 is a pointer argument sinse we may update it. */
3551 static reg_errcode_t
3552 #ifdef RE_ENABLE_I18N
3553 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3554 re_charset_t *mbcset;
3555 int *char_class_alloc;
3556 #else /* not RE_ENABLE_I18N */
3557 build_charclass (trans, sbcset, class_name, syntax)
3558 #endif /* not RE_ENABLE_I18N */
3559 RE_TRANSLATE_TYPE trans;
3560 bitset_t sbcset;
3561 const unsigned char *class_name;
3562 reg_syntax_t syntax;
3564 int i;
3565 const char *name = (const char *) class_name;
3567 /* In case of REG_ICASE "upper" and "lower" match the both of
3568 upper and lower cases. */
3569 if ((syntax & RE_ICASE)
3570 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3571 name = "alpha";
3573 #ifdef RE_ENABLE_I18N
3574 /* Check the space of the arrays. */
3575 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3577 /* Not enough, realloc it. */
3578 /* +1 in case of mbcset->nchar_classes is 0. */
3579 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3580 /* Use realloc since array is NULL if *alloc == 0. */
3581 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3582 new_char_class_alloc);
3583 if (BE (new_char_classes == NULL, 0))
3584 return REG_ESPACE;
3585 mbcset->char_classes = new_char_classes;
3586 *char_class_alloc = new_char_class_alloc;
3588 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3589 #endif /* RE_ENABLE_I18N */
3591 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3592 for (i = 0; i < SBC_MAX; ++i) \
3594 if (ctype_func (i)) \
3596 int ch = trans ? trans[i] : i; \
3597 bitset_set (sbcset, ch); \
3601 if (strcmp (name, "alnum") == 0)
3602 BUILD_CHARCLASS_LOOP (isalnum)
3603 else if (strcmp (name, "cntrl") == 0)
3604 BUILD_CHARCLASS_LOOP (iscntrl)
3605 else if (strcmp (name, "lower") == 0)
3606 BUILD_CHARCLASS_LOOP (islower)
3607 else if (strcmp (name, "space") == 0)
3608 BUILD_CHARCLASS_LOOP (isspace)
3609 else if (strcmp (name, "alpha") == 0)
3610 BUILD_CHARCLASS_LOOP (isalpha)
3611 else if (strcmp (name, "digit") == 0)
3612 BUILD_CHARCLASS_LOOP (isdigit)
3613 else if (strcmp (name, "print") == 0)
3614 BUILD_CHARCLASS_LOOP (isprint)
3615 else if (strcmp (name, "upper") == 0)
3616 BUILD_CHARCLASS_LOOP (isupper)
3617 else if (strcmp (name, "blank") == 0)
3618 BUILD_CHARCLASS_LOOP (isblank)
3619 else if (strcmp (name, "graph") == 0)
3620 BUILD_CHARCLASS_LOOP (isgraph)
3621 else if (strcmp (name, "punct") == 0)
3622 BUILD_CHARCLASS_LOOP (ispunct)
3623 else if (strcmp (name, "xdigit") == 0)
3624 BUILD_CHARCLASS_LOOP (isxdigit)
3625 else
3626 return REG_ECTYPE;
3628 return REG_NOERROR;
3631 static bin_tree_t *
3632 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3633 re_dfa_t *dfa;
3634 RE_TRANSLATE_TYPE trans;
3635 const unsigned char *class_name;
3636 const unsigned char *extra;
3637 int non_match;
3638 reg_errcode_t *err;
3640 re_bitset_ptr_t sbcset;
3641 #ifdef RE_ENABLE_I18N
3642 re_charset_t *mbcset;
3643 int alloc = 0;
3644 #endif /* not RE_ENABLE_I18N */
3645 reg_errcode_t ret;
3646 re_token_t br_token;
3647 bin_tree_t *tree;
3649 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3650 #ifdef RE_ENABLE_I18N
3651 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3652 #endif /* RE_ENABLE_I18N */
3654 #ifdef RE_ENABLE_I18N
3655 if (BE (sbcset == NULL || mbcset == NULL, 0))
3656 #else /* not RE_ENABLE_I18N */
3657 if (BE (sbcset == NULL, 0))
3658 #endif /* not RE_ENABLE_I18N */
3660 *err = REG_ESPACE;
3661 return NULL;
3664 if (non_match)
3666 #ifdef RE_ENABLE_I18N
3668 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3669 bitset_set(cset->sbcset, '\0');
3671 mbcset->non_match = 1;
3672 #endif /* not RE_ENABLE_I18N */
3675 /* We don't care the syntax in this case. */
3676 ret = build_charclass (trans, sbcset,
3677 #ifdef RE_ENABLE_I18N
3678 mbcset, &alloc,
3679 #endif /* RE_ENABLE_I18N */
3680 class_name, 0);
3682 if (BE (ret != REG_NOERROR, 0))
3684 re_free (sbcset);
3685 #ifdef RE_ENABLE_I18N
3686 free_charset (mbcset);
3687 #endif /* RE_ENABLE_I18N */
3688 *err = ret;
3689 return NULL;
3691 /* \w match '_' also. */
3692 for (; *extra; extra++)
3693 bitset_set (sbcset, *extra);
3695 /* If it is non-matching list. */
3696 if (non_match)
3697 bitset_not (sbcset);
3699 #ifdef RE_ENABLE_I18N
3700 /* Ensure only single byte characters are set. */
3701 if (dfa->mb_cur_max > 1)
3702 bitset_mask (sbcset, dfa->sb_char);
3703 #endif
3705 /* Build a tree for simple bracket. */
3706 br_token.type = SIMPLE_BRACKET;
3707 br_token.opr.sbcset = sbcset;
3708 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3709 if (BE (tree == NULL, 0))
3710 goto build_word_op_espace;
3712 #ifdef RE_ENABLE_I18N
3713 if (dfa->mb_cur_max > 1)
3715 bin_tree_t *mbc_tree;
3716 /* Build a tree for complex bracket. */
3717 br_token.type = COMPLEX_BRACKET;
3718 br_token.opr.mbcset = mbcset;
3719 dfa->has_mb_node = 1;
3720 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3721 if (BE (mbc_tree == NULL, 0))
3722 goto build_word_op_espace;
3723 /* Then join them by ALT node. */
3724 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3725 if (BE (mbc_tree != NULL, 1))
3726 return tree;
3728 else
3730 free_charset (mbcset);
3731 return tree;
3733 #else /* not RE_ENABLE_I18N */
3734 return tree;
3735 #endif /* not RE_ENABLE_I18N */
3737 build_word_op_espace:
3738 re_free (sbcset);
3739 #ifdef RE_ENABLE_I18N
3740 free_charset (mbcset);
3741 #endif /* RE_ENABLE_I18N */
3742 *err = REG_ESPACE;
3743 return NULL;
3746 /* This is intended for the expressions like "a{1,3}".
3747 Fetch a number from `input', and return the number.
3748 Return -1, if the number field is empty like "{,1}".
3749 Return -2, If an error is occured. */
3751 static int
3752 fetch_number (input, token, syntax)
3753 re_string_t *input;
3754 re_token_t *token;
3755 reg_syntax_t syntax;
3757 int num = -1;
3758 unsigned char c;
3759 while (1)
3761 fetch_token (token, input, syntax);
3762 c = token->opr.c;
3763 if (BE (token->type == END_OF_RE, 0))
3764 return -2;
3765 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3766 break;
3767 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3768 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3769 num = (num > RE_DUP_MAX) ? -2 : num;
3771 return num;
3774 #ifdef RE_ENABLE_I18N
3775 static void
3776 free_charset (re_charset_t *cset)
3778 re_free (cset->mbchars);
3779 # ifdef _LIBC
3780 re_free (cset->coll_syms);
3781 re_free (cset->equiv_classes);
3782 re_free (cset->range_starts);
3783 re_free (cset->range_ends);
3784 # endif
3785 re_free (cset->char_classes);
3786 re_free (cset);
3788 #endif /* RE_ENABLE_I18N */
3790 /* Functions for binary tree operation. */
3792 /* Create a tree node. */
3794 static bin_tree_t *
3795 create_tree (dfa, left, right, type)
3796 re_dfa_t *dfa;
3797 bin_tree_t *left;
3798 bin_tree_t *right;
3799 re_token_type_t type;
3801 re_token_t t;
3802 t.type = type;
3803 return create_token_tree (dfa, left, right, &t);
3806 static bin_tree_t *
3807 create_token_tree (dfa, left, right, token)
3808 re_dfa_t *dfa;
3809 bin_tree_t *left;
3810 bin_tree_t *right;
3811 const re_token_t *token;
3813 bin_tree_t *tree;
3814 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3816 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3818 if (storage == NULL)
3819 return NULL;
3820 storage->next = dfa->str_tree_storage;
3821 dfa->str_tree_storage = storage;
3822 dfa->str_tree_storage_idx = 0;
3824 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3826 tree->parent = NULL;
3827 tree->left = left;
3828 tree->right = right;
3829 tree->token = *token;
3830 tree->token.duplicated = 0;
3831 tree->token.opt_subexp = 0;
3832 tree->first = NULL;
3833 tree->next = NULL;
3834 tree->node_idx = -1;
3836 if (left != NULL)
3837 left->parent = tree;
3838 if (right != NULL)
3839 right->parent = tree;
3840 return tree;
3843 /* Mark the tree SRC as an optional subexpression.
3844 To be called from preorder or postorder. */
3846 static reg_errcode_t
3847 mark_opt_subexp (extra, node)
3848 void *extra;
3849 bin_tree_t *node;
3851 int idx = (int) (long) extra;
3852 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3853 node->token.opt_subexp = 1;
3855 return REG_NOERROR;
3858 /* Free the allocated memory inside NODE. */
3860 static void
3861 free_token (re_token_t *node)
3863 #ifdef RE_ENABLE_I18N
3864 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3865 free_charset (node->opr.mbcset);
3866 else
3867 #endif /* RE_ENABLE_I18N */
3868 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3869 re_free (node->opr.sbcset);
3872 /* Worker function for tree walking. Free the allocated memory inside NODE
3873 and its children. */
3875 static reg_errcode_t
3876 free_tree (void *extra, bin_tree_t *node)
3878 free_token (&node->token);
3879 return REG_NOERROR;
3883 /* Duplicate the node SRC, and return new node. This is a preorder
3884 visit similar to the one implemented by the generic visitor, but
3885 we need more infrastructure to maintain two parallel trees --- so,
3886 it's easier to duplicate. */
3888 static bin_tree_t *
3889 duplicate_tree (root, dfa)
3890 const bin_tree_t *root;
3891 re_dfa_t *dfa;
3893 const bin_tree_t *node;
3894 bin_tree_t *dup_root;
3895 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3897 for (node = root; ; )
3899 /* Create a new tree and link it back to the current parent. */
3900 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3901 if (*p_new == NULL)
3902 return NULL;
3903 (*p_new)->parent = dup_node;
3904 (*p_new)->token.duplicated = 1;
3905 dup_node = *p_new;
3907 /* Go to the left node, or up and to the right. */
3908 if (node->left)
3910 node = node->left;
3911 p_new = &dup_node->left;
3913 else
3915 const bin_tree_t *prev = NULL;
3916 while (node->right == prev || node->right == NULL)
3918 prev = node;
3919 node = node->parent;
3920 dup_node = dup_node->parent;
3921 if (!node)
3922 return dup_root;
3924 node = node->right;
3925 p_new = &dup_node->right;