fix gcc warning with -Wmisleading-indentation
[uclibc-ng.git] / libc / misc / regex / regcomp.c
blobdc00109dcd7c0ad260798b1bafa6e28d70839c0d
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002,2003,2004,2005,2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 size_t length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
24 char *fastmap);
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26 #ifdef RE_ENABLE_I18N
27 static void free_charset (re_charset_t *cset);
28 #endif
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31 #ifdef RE_ENABLE_I18N
32 static void optimize_utf8 (re_dfa_t *dfa);
33 #endif
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
37 void *extra);
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 bin_tree_t *node);
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
49 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 int node, int root);
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static int fetch_number (re_string_t *input, re_token_t *token,
56 reg_syntax_t syntax);
57 static int peek_token (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax) internal_function;
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 int nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 int nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 int nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
78 reg_errcode_t *err);
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 re_string_t *regexp,
81 re_token_t *token, int token_len,
82 re_dfa_t *dfa,
83 reg_syntax_t syntax,
84 int accept_hyphen);
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86 re_string_t *regexp,
87 re_token_t *token);
88 #ifdef RE_ENABLE_I18N
89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
90 re_charset_t *mbcset,
91 int *equiv_class_alloc,
92 const unsigned char *name);
93 static reg_errcode_t build_charclass (__RE_TRANSLATE_TYPE trans,
94 bitset_t sbcset,
95 re_charset_t *mbcset,
96 int *char_class_alloc,
97 const unsigned char *class_name,
98 reg_syntax_t syntax);
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
101 const unsigned char *name);
102 static reg_errcode_t build_charclass (__RE_TRANSLATE_TYPE trans,
103 bitset_t sbcset,
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 __RE_TRANSLATE_TYPE trans,
109 const unsigned char *class_name,
110 const unsigned char *extra,
111 int non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
128 static const char __re_error_msgid[] =
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
132 "\0"
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
135 "\0"
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138 "\0"
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141 "\0"
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144 "\0"
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147 "\0"
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150 "\0"
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
153 "\0"
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156 "\0"
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159 "\0"
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162 "\0"
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
165 "\0"
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
168 "\0"
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171 "\0"
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
174 "\0"
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
177 "\0"
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 static const uint16_t __re_error_msgid_idx[] =
184 REG_NOERROR_IDX,
185 REG_NOMATCH_IDX,
186 REG_BADPAT_IDX,
187 REG_ECOLLATE_IDX,
188 REG_ECTYPE_IDX,
189 REG_EESCAPE_IDX,
190 REG_ESUBREG_IDX,
191 REG_EBRACK_IDX,
192 REG_EPAREN_IDX,
193 REG_EBRACE_IDX,
194 REG_BADBR_IDX,
195 REG_ERANGE_IDX,
196 REG_ESPACE_IDX,
197 REG_BADRPT_IDX,
198 REG_EEND_IDX,
199 REG_ESIZE_IDX,
200 REG_ERPAREN_IDX
203 /* Entry points for GNU code. */
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
209 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
210 are set in BUFP on entry. */
212 const char *
213 re_compile_pattern (const char *pattern,
214 size_t length,
215 struct re_pattern_buffer *bufp)
217 reg_errcode_t ret;
219 /* And GNU code determines whether or not to get register information
220 by passing null for the REGS argument to re_match, etc., not by
221 setting no_sub, unless RE_NO_SUB is set. */
222 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
224 /* Match anchors at newline. */
225 bufp->newline_anchor = 1;
227 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
229 if (!ret)
230 return NULL;
231 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
234 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
235 also be assigned to arbitrarily: each pattern buffer stores its own
236 syntax, so it can be changed between regex compilations. */
237 /* This has no initializer because initialized variables in Emacs
238 become read-only after dumping. */
239 reg_syntax_t re_syntax_options;
242 /* Specify the precise syntax of regexps for compilation. This provides
243 for compatibility for various utilities which historically have
244 different, incompatible syntaxes.
246 The argument SYNTAX is a bit mask comprised of the various bits
247 defined in regex.h. We return the old syntax. */
249 reg_syntax_t
250 re_set_syntax (reg_syntax_t syntax)
252 reg_syntax_t ret = re_syntax_options;
254 re_syntax_options = syntax;
255 return ret;
259 re_compile_fastmap (struct re_pattern_buffer *bufp)
261 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
262 char *fastmap = bufp->fastmap;
264 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
265 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
266 if (dfa->init_state != dfa->init_state_word)
267 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
268 if (dfa->init_state != dfa->init_state_nl)
269 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
270 if (dfa->init_state != dfa->init_state_begbuf)
271 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
272 bufp->fastmap_accurate = 1;
273 return 0;
275 libc_hidden_def(re_compile_fastmap)
277 static __inline__ void
278 __attribute ((always_inline))
279 re_set_fastmap (char *fastmap, int icase, int ch)
281 fastmap[ch] = 1;
282 if (icase)
283 fastmap[tolower (ch)] = 1;
286 /* Helper function for re_compile_fastmap.
287 Compile fastmap for the initial_state INIT_STATE. */
289 static void
290 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
291 char *fastmap)
293 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
294 int node_cnt;
295 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
296 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
298 int node = init_state->nodes.elems[node_cnt];
299 re_token_type_t type = dfa->nodes[node].type;
301 if (type == CHARACTER)
303 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
304 #ifdef RE_ENABLE_I18N
305 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
307 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
308 wchar_t wc;
309 mbstate_t state;
311 p = buf;
312 *p++ = dfa->nodes[node].opr.c;
313 while (++node < dfa->nodes_len
314 && dfa->nodes[node].type == CHARACTER
315 && dfa->nodes[node].mb_partial)
316 *p++ = dfa->nodes[node].opr.c;
317 memset (&state, '\0', sizeof (state));
318 if (mbrtowc (&wc, (const char *) buf, p - buf,
319 &state) == p - buf
320 && (__wcrtomb ((char *) buf, towlower (wc), &state)
321 != (size_t) -1))
322 re_set_fastmap (fastmap, 0, buf[0]);
324 #endif
326 else if (type == SIMPLE_BRACKET)
328 int i, ch;
329 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
331 int j;
332 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
333 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
334 if (w & ((bitset_word_t) 1 << j))
335 re_set_fastmap (fastmap, icase, ch);
338 #ifdef RE_ENABLE_I18N
339 else if (type == COMPLEX_BRACKET)
341 int i;
342 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
343 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
344 || cset->nranges || cset->nchar_classes)
346 # if 0
347 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
349 /* In this case we want to catch the bytes which are
350 the first byte of any collation elements.
351 e.g. In da_DK, we want to catch 'a' since "aa"
352 is a valid collation element, and don't catch
353 'b' since 'b' is the only collation element
354 which starts from 'b'. */
355 const int32_t *table = (const int32_t *)
356 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
357 for (i = 0; i < SBC_MAX; ++i)
358 if (table[i] < 0)
359 re_set_fastmap (fastmap, icase, i);
361 # else
362 if (dfa->mb_cur_max > 1)
363 for (i = 0; i < SBC_MAX; ++i)
364 if (__btowc (i) == WEOF)
365 re_set_fastmap (fastmap, icase, i);
366 # endif
368 for (i = 0; i < cset->nmbchars; ++i)
370 char buf[256];
371 mbstate_t state;
372 memset (&state, '\0', sizeof (state));
373 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
374 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
375 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
377 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
378 != (size_t) -1)
379 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
383 #endif /* RE_ENABLE_I18N */
384 else if (type == OP_PERIOD
385 #ifdef RE_ENABLE_I18N
386 || type == OP_UTF8_PERIOD
387 #endif
388 || type == END_OF_RE)
390 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
391 if (type == END_OF_RE)
392 bufp->can_be_null = 1;
393 return;
398 /* Entry point for POSIX code. */
399 /* regcomp takes a regular expression as a string and compiles it.
401 PREG is a regex_t *. We do not expect any fields to be initialized,
402 since POSIX says we shouldn't. Thus, we set
404 `buffer' to the compiled pattern;
405 `used' to the length of the compiled pattern;
406 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
407 REG_EXTENDED bit in CFLAGS is set; otherwise, to
408 RE_SYNTAX_POSIX_BASIC;
409 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
410 `fastmap' to an allocated space for the fastmap;
411 `fastmap_accurate' to zero;
412 `re_nsub' to the number of subexpressions in PATTERN.
414 PATTERN is the address of the pattern string.
416 CFLAGS is a series of bits which affect compilation.
418 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
419 use POSIX basic syntax.
421 If REG_NEWLINE is set, then . and [^...] don't match newline.
422 Also, regexec will try a match beginning after every newline.
424 If REG_ICASE is set, then we considers upper- and lowercase
425 versions of letters to be equivalent when matching.
427 If REG_NOSUB is set, then when PREG is passed to regexec, that
428 routine will report only success or failure, and nothing about the
429 registers.
431 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
432 the return codes and their meanings.) */
435 regcomp (regex_t *__restrict preg,
436 const char *__restrict pattern,
437 int cflags)
439 reg_errcode_t ret;
440 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
441 : RE_SYNTAX_POSIX_BASIC);
443 preg->buffer = NULL;
444 preg->allocated = 0;
445 preg->used = 0;
447 /* Try to allocate space for the fastmap. */
448 preg->fastmap = re_malloc (char, SBC_MAX);
449 if (BE (preg->fastmap == NULL, 0))
450 return REG_ESPACE;
452 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
454 /* If REG_NEWLINE is set, newlines are treated differently. */
455 if (cflags & REG_NEWLINE)
456 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
457 syntax &= ~RE_DOT_NEWLINE;
458 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
459 /* It also changes the matching behavior. */
460 preg->newline_anchor = 1;
462 else
463 preg->newline_anchor = 0;
464 preg->no_sub = !!(cflags & REG_NOSUB);
465 preg->translate = NULL;
467 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
469 /* POSIX doesn't distinguish between an unmatched open-group and an
470 unmatched close-group: both are REG_EPAREN. */
471 if (ret == REG_ERPAREN)
472 ret = REG_EPAREN;
474 /* We have already checked preg->fastmap != NULL. */
475 if (BE (ret == REG_NOERROR, 1))
476 /* Compute the fastmap now, since regexec cannot modify the pattern
477 buffer. This function never fails in this implementation. */
478 (void) re_compile_fastmap (preg);
479 else
481 /* Some error occurred while compiling the expression. */
482 re_free (preg->fastmap);
483 preg->fastmap = NULL;
486 return (int) ret;
489 /* Returns a message corresponding to an error code, ERRCODE, returned
490 from either regcomp or regexec. We don't use PREG here. */
492 size_t
493 regerror (int errcode,
494 const regex_t *__restrict preg,
495 char *__restrict errbuf,
496 size_t errbuf_size)
498 const char *msg;
499 size_t msg_size;
501 if (BE (errcode < 0
502 || errcode >= (int) (sizeof (__re_error_msgid_idx)
503 / sizeof (__re_error_msgid_idx[0])), 0))
504 /* Only error codes returned by the rest of the code should be passed
505 to this routine. If we are given anything else, or if other regex
506 code generates an invalid error code, then the program has a bug.
507 Dump core so we can fix it. */
508 abort ();
510 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
512 msg_size = strlen (msg) + 1; /* Includes the null. */
514 if (BE (errbuf_size != 0, 1))
516 if (BE (msg_size > errbuf_size, 0))
518 memcpy (errbuf, msg, errbuf_size - 1);
519 errbuf[errbuf_size - 1] = 0;
521 else
522 memcpy (errbuf, msg, msg_size);
525 return msg_size;
529 #ifdef RE_ENABLE_I18N
530 /* This static array is used for the map to single-byte characters when
531 UTF-8 is used. Otherwise we would allocate memory just to initialize
532 it the same all the time. UTF-8 is the preferred encoding so this is
533 a worthwhile optimization. */
534 static const bitset_t utf8_sb_map =
536 /* Set the first 128 bits. */
537 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
539 #endif
542 static void
543 free_dfa_content (re_dfa_t *dfa)
545 int i, j;
547 if (dfa->nodes)
548 for (i = 0; i < dfa->nodes_len; ++i)
549 free_token (dfa->nodes + i);
550 re_free (dfa->nexts);
551 for (i = 0; i < dfa->nodes_len; ++i)
553 if (dfa->eclosures != NULL)
554 re_node_set_free (dfa->eclosures + i);
555 if (dfa->inveclosures != NULL)
556 re_node_set_free (dfa->inveclosures + i);
557 if (dfa->edests != NULL)
558 re_node_set_free (dfa->edests + i);
560 re_free (dfa->edests);
561 re_free (dfa->eclosures);
562 re_free (dfa->inveclosures);
563 re_free (dfa->nodes);
565 if (dfa->state_table)
566 for (i = 0; i <= dfa->state_hash_mask; ++i)
568 struct re_state_table_entry *entry = dfa->state_table + i;
569 for (j = 0; j < entry->num; ++j)
571 re_dfastate_t *state = entry->array[j];
572 free_state (state);
574 re_free (entry->array);
576 re_free (dfa->state_table);
577 #ifdef RE_ENABLE_I18N
578 if (dfa->sb_char != utf8_sb_map)
579 re_free (dfa->sb_char);
580 #endif
581 re_free (dfa->subexp_map);
582 #ifdef DEBUG
583 re_free (dfa->re_str);
584 #endif
586 re_free (dfa);
590 /* Free dynamically allocated space used by PREG. */
592 void
593 regfree (regex_t *preg)
595 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
596 if (BE (dfa != NULL, 1))
597 free_dfa_content (dfa);
598 preg->buffer = NULL;
599 preg->allocated = 0;
601 re_free (preg->fastmap);
602 preg->fastmap = NULL;
604 re_free (preg->translate);
605 preg->translate = NULL;
607 libc_hidden_def(regfree)
609 /* Entry points compatible with 4.2 BSD regex library. We don't define
610 them unless specifically requested. */
612 #if defined _REGEX_RE_COMP || defined __UCLIBC__
614 /* BSD has one and only one pattern buffer. */
615 static struct re_pattern_buffer *re_comp_buf;
617 char *
618 /* Make BCD definitions weak in libc, so POSIX programs can redefine
619 these names if they don't use our functions, and still use
620 regcomp/regexec above without link errors. */
621 weak_function
622 re_comp (const char *s)
624 reg_errcode_t ret;
626 /* "If re_comp() is passed NULL or a null string, it returns
627 * without changing the currently compiled regular expression." */
628 if (!s || !s[0])
630 if (!re_comp_buf)
631 return gettext ("No previous regular expression");
632 return NULL;
635 if (!re_comp_buf)
637 re_comp_buf = calloc (1, sizeof(*re_comp_buf));
638 if (!re_comp_buf)
640 ret = REG_ESPACE;
641 goto err;
645 if (re_comp_buf->buffer)
647 regfree (re_comp_buf);
648 memset (re_comp_buf, '\0', sizeof(*re_comp_buf));
651 if (re_comp_buf->fastmap == NULL)
653 re_comp_buf->fastmap = malloc (SBC_MAX);
654 if (re_comp_buf->fastmap == NULL)
656 ret = REG_ESPACE;
657 goto err;
661 /* Since `re_exec' always passes NULL for the `regs' argument, we
662 don't need to initialize the pattern buffer fields which affect it. */
664 /* Match anchors at newlines. */
665 re_comp_buf->newline_anchor = 1;
667 ret = re_compile_internal (re_comp_buf, s, strlen (s), re_syntax_options);
669 if (!ret)
670 return NULL;
671 free (re_comp_buf);
672 re_comp_buf = NULL;
674 err:
675 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
676 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
679 #if 0
680 libc_freeres_fn (free_mem)
682 regfree (re_comp_buf);
683 free (re_comp_buf);
684 re_comp_buf = NULL;
686 #endif
688 #endif /* _REGEX_RE_COMP */
690 /* Internal entry point.
691 Compile the regular expression PATTERN, whose length is LENGTH.
692 SYNTAX indicate regular expression's syntax. */
694 static reg_errcode_t
695 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
696 reg_syntax_t syntax)
698 reg_errcode_t err = REG_NOERROR;
699 re_dfa_t *dfa;
700 re_string_t regexp;
702 /* Initialize the pattern buffer. */
703 preg->fastmap_accurate = 0;
704 preg->syntax = syntax;
705 preg->not_bol = preg->not_eol = 0;
706 preg->used = 0;
707 preg->re_nsub = 0;
708 preg->can_be_null = 0;
709 preg->regs_allocated = REGS_UNALLOCATED;
711 /* Initialize the dfa. */
712 dfa = (re_dfa_t *) preg->buffer;
713 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
715 /* If zero allocated, but buffer is non-null, try to realloc
716 enough space. This loses if buffer's address is bogus, but
717 that is the user's responsibility. If ->buffer is NULL this
718 is a simple allocation. */
719 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
720 if (dfa == NULL)
721 return REG_ESPACE;
722 preg->allocated = sizeof (re_dfa_t);
723 preg->buffer = (unsigned char *) dfa;
725 preg->used = sizeof (re_dfa_t);
727 err = init_dfa (dfa, length);
728 if (BE (err != REG_NOERROR, 0))
730 free_dfa_content (dfa);
731 preg->buffer = NULL;
732 preg->allocated = 0;
733 return err;
735 #ifdef DEBUG
736 /* Note: length+1 will not overflow since it is checked in init_dfa. */
737 dfa->re_str = re_malloc (char, length + 1);
738 strncpy (dfa->re_str, pattern, length + 1);
739 #endif
741 __libc_lock_init (dfa->lock);
743 err = re_string_construct (&regexp, pattern, length, preg->translate,
744 syntax & RE_ICASE, dfa);
745 if (BE (err != REG_NOERROR, 0))
747 re_compile_internal_free_return:
748 free_workarea_compile (preg);
749 re_string_destruct (&regexp);
750 free_dfa_content (dfa);
751 preg->buffer = NULL;
752 preg->allocated = 0;
753 return err;
756 /* Parse the regular expression, and build a structure tree. */
757 preg->re_nsub = 0;
758 dfa->str_tree = parse (&regexp, preg, syntax, &err);
759 if (BE (dfa->str_tree == NULL, 0))
760 goto re_compile_internal_free_return;
762 /* Analyze the tree and create the nfa. */
763 err = analyze (preg);
764 if (BE (err != REG_NOERROR, 0))
765 goto re_compile_internal_free_return;
767 #ifdef RE_ENABLE_I18N
768 /* If possible, do searching in single byte encoding to speed things up. */
769 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
770 optimize_utf8 (dfa);
771 #endif
773 /* Then create the initial state of the dfa. */
774 err = create_initial_state (dfa);
776 /* Release work areas. */
777 free_workarea_compile (preg);
778 re_string_destruct (&regexp);
780 if (BE (err != REG_NOERROR, 0))
782 free_dfa_content (dfa);
783 preg->buffer = NULL;
784 preg->allocated = 0;
787 return err;
790 /* Initialize DFA. We use the length of the regular expression PAT_LEN
791 as the initial length of some arrays. */
793 static reg_errcode_t
794 init_dfa (re_dfa_t *dfa, size_t pat_len)
796 unsigned int table_size;
797 #if 1
798 char *codeset_name;
799 #endif
801 memset (dfa, '\0', sizeof (re_dfa_t));
803 /* Force allocation of str_tree_storage the first time. */
804 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
806 /* Avoid overflows. */
807 if (pat_len == SIZE_MAX)
808 return REG_ESPACE;
810 dfa->nodes_alloc = pat_len + 1;
811 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
813 /* table_size = 2 ^ ceil(log pat_len) */
814 for (table_size = 1; ; table_size <<= 1)
815 if (table_size > pat_len)
816 break;
818 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
819 dfa->state_hash_mask = table_size - 1;
821 dfa->mb_cur_max = MB_CUR_MAX;
822 #if 0
823 if (dfa->mb_cur_max == 6
824 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
825 dfa->is_utf8 = 1;
826 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
827 != 0);
828 #else
829 # ifdef HAVE_LANGINFO_CODESET
830 codeset_name = nl_langinfo (CODESET);
831 # else
832 codeset_name = getenv ("LC_ALL");
833 if (codeset_name == NULL || codeset_name[0] == '\0')
834 codeset_name = getenv ("LC_CTYPE");
835 if (codeset_name == NULL || codeset_name[0] == '\0')
836 codeset_name = getenv ("LANG");
837 if (codeset_name == NULL)
838 codeset_name = "";
839 else if (strchr (codeset_name, '.') != NULL)
840 codeset_name = strchr (codeset_name, '.') + 1;
841 # endif
843 if (strcasecmp (codeset_name, "UTF-8") == 0
844 || strcasecmp (codeset_name, "UTF8") == 0)
845 dfa->is_utf8 = 1;
847 /* We check exhaustively in the loop below if this charset is a
848 superset of ASCII. */
849 dfa->map_notascii = 0;
850 #endif
852 #ifdef RE_ENABLE_I18N
853 if (dfa->mb_cur_max > 1)
855 if (dfa->is_utf8)
856 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
857 else
859 int i, j, ch;
861 dfa->sb_char = calloc (sizeof (bitset_t), 1);
862 if (BE (dfa->sb_char == NULL, 0))
863 return REG_ESPACE;
865 /* Set the bits corresponding to single byte chars. */
866 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
867 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
869 wint_t wch = __btowc (ch);
870 if (wch != WEOF)
871 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
872 # if 1
873 if (isascii (ch) && wch != ch)
874 dfa->map_notascii = 1;
875 # endif
879 #endif
881 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
882 return REG_ESPACE;
883 return REG_NOERROR;
886 /* Initialize WORD_CHAR table, which indicate which character is
887 "word". In this case "word" means that it is the word construction
888 character used by some operators like "\<", "\>", etc. */
890 static void
891 internal_function
892 init_word_char (re_dfa_t *dfa)
894 int i, j, ch;
895 dfa->word_ops_used = 1;
896 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
897 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
898 if (isalnum (ch) || ch == '_')
899 dfa->word_char[i] |= (bitset_word_t) 1 << j;
902 /* Free the work area which are only used while compiling. */
904 static void
905 free_workarea_compile (regex_t *preg)
907 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
908 bin_tree_storage_t *storage, *next;
909 for (storage = dfa->str_tree_storage; storage; storage = next)
911 next = storage->next;
912 re_free (storage);
914 dfa->str_tree_storage = NULL;
915 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
916 dfa->str_tree = NULL;
917 re_free (dfa->org_indices);
918 dfa->org_indices = NULL;
921 /* Create initial states for all contexts. */
923 static reg_errcode_t
924 create_initial_state (re_dfa_t *dfa)
926 int first, i;
927 reg_errcode_t err;
928 re_node_set init_nodes;
930 /* Initial states have the epsilon closure of the node which is
931 the first node of the regular expression. */
932 first = dfa->str_tree->first->node_idx;
933 dfa->init_node = first;
934 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
935 if (BE (err != REG_NOERROR, 0))
936 return err;
938 /* The back-references which are in initial states can epsilon transit,
939 since in this case all of the subexpressions can be null.
940 Then we add epsilon closures of the nodes which are the next nodes of
941 the back-references. */
942 if (dfa->nbackref > 0)
943 for (i = 0; i < init_nodes.nelem; ++i)
945 int node_idx = init_nodes.elems[i];
946 re_token_type_t type = dfa->nodes[node_idx].type;
948 int clexp_idx;
949 if (type != OP_BACK_REF)
950 continue;
951 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
953 re_token_t *clexp_node;
954 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
955 if (clexp_node->type == OP_CLOSE_SUBEXP
956 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
957 break;
959 if (clexp_idx == init_nodes.nelem)
960 continue;
962 if (type == OP_BACK_REF)
964 int dest_idx = dfa->edests[node_idx].elems[0];
965 if (!re_node_set_contains (&init_nodes, dest_idx))
967 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
968 i = 0;
973 /* It must be the first time to invoke acquire_state. */
974 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
975 /* We don't check ERR here, since the initial state must not be NULL. */
976 if (BE (dfa->init_state == NULL, 0))
977 return err;
978 if (dfa->init_state->has_constraint)
980 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
981 CONTEXT_WORD);
982 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
983 CONTEXT_NEWLINE);
984 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
985 &init_nodes,
986 CONTEXT_NEWLINE
987 | CONTEXT_BEGBUF);
988 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
989 || dfa->init_state_begbuf == NULL, 0))
990 return err;
992 else
993 dfa->init_state_word = dfa->init_state_nl
994 = dfa->init_state_begbuf = dfa->init_state;
996 re_node_set_free (&init_nodes);
997 return REG_NOERROR;
1000 #ifdef RE_ENABLE_I18N
1001 /* If it is possible to do searching in single byte encoding instead of UTF-8
1002 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1003 DFA nodes where needed. */
1005 static void
1006 optimize_utf8 (re_dfa_t *dfa)
1008 int node, i, mb_chars = 0, has_period = 0;
1010 for (node = 0; node < dfa->nodes_len; ++node)
1011 switch (dfa->nodes[node].type)
1013 case CHARACTER:
1014 if (dfa->nodes[node].opr.c >= 0x80)
1015 mb_chars = 1;
1016 break;
1017 case ANCHOR:
1018 switch (dfa->nodes[node].opr.idx)
1020 case LINE_FIRST:
1021 case LINE_LAST:
1022 case BUF_FIRST:
1023 case BUF_LAST:
1024 break;
1025 default:
1026 /* Word anchors etc. cannot be handled. */
1027 return;
1029 break;
1030 case OP_PERIOD:
1031 has_period = 1;
1032 break;
1033 case OP_BACK_REF:
1034 case OP_ALT:
1035 case END_OF_RE:
1036 case OP_DUP_ASTERISK:
1037 case OP_OPEN_SUBEXP:
1038 case OP_CLOSE_SUBEXP:
1039 break;
1040 case COMPLEX_BRACKET:
1041 return;
1042 case SIMPLE_BRACKET:
1043 /* Just double check. The non-ASCII range starts at 0x80. */
1044 assert (0x80 % BITSET_WORD_BITS == 0);
1045 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1046 if (dfa->nodes[node].opr.sbcset[i])
1047 return;
1048 break;
1049 default:
1050 abort ();
1053 if (mb_chars || has_period)
1054 for (node = 0; node < dfa->nodes_len; ++node)
1056 if (dfa->nodes[node].type == CHARACTER
1057 && dfa->nodes[node].opr.c >= 0x80)
1058 dfa->nodes[node].mb_partial = 0;
1059 else if (dfa->nodes[node].type == OP_PERIOD)
1060 dfa->nodes[node].type = OP_UTF8_PERIOD;
1063 /* The search can be in single byte locale. */
1064 dfa->mb_cur_max = 1;
1065 dfa->is_utf8 = 0;
1066 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1068 #endif
1070 /* Analyze the structure tree, and calculate "first", "next", "edest",
1071 "eclosure", and "inveclosure". */
1073 static reg_errcode_t
1074 analyze (regex_t *preg)
1076 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1077 reg_errcode_t ret;
1079 /* Allocate arrays. */
1080 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1081 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1082 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1083 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1084 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1085 || dfa->eclosures == NULL, 0))
1086 return REG_ESPACE;
1088 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1089 if (dfa->subexp_map != NULL)
1091 int i;
1092 for (i = 0; i < preg->re_nsub; i++)
1093 dfa->subexp_map[i] = i;
1094 preorder (dfa->str_tree, optimize_subexps, dfa);
1095 for (i = 0; i < preg->re_nsub; i++)
1096 if (dfa->subexp_map[i] != i)
1097 break;
1098 if (i == preg->re_nsub)
1100 free (dfa->subexp_map);
1101 dfa->subexp_map = NULL;
1105 ret = postorder (dfa->str_tree, lower_subexps, preg);
1106 if (BE (ret != REG_NOERROR, 0))
1107 return ret;
1108 ret = postorder (dfa->str_tree, calc_first, dfa);
1109 if (BE (ret != REG_NOERROR, 0))
1110 return ret;
1111 preorder (dfa->str_tree, calc_next, dfa);
1112 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1113 if (BE (ret != REG_NOERROR, 0))
1114 return ret;
1115 ret = calc_eclosure (dfa);
1116 if (BE (ret != REG_NOERROR, 0))
1117 return ret;
1119 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1120 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1121 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1122 || dfa->nbackref)
1124 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1125 if (BE (dfa->inveclosures == NULL, 0))
1126 return REG_ESPACE;
1127 ret = calc_inveclosure (dfa);
1130 return ret;
1133 /* Our parse trees are very unbalanced, so we cannot use a stack to
1134 implement parse tree visits. Instead, we use parent pointers and
1135 some hairy code in these two functions. */
1136 static reg_errcode_t
1137 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1138 void *extra)
1140 bin_tree_t *node, *prev;
1142 for (node = root; ; )
1144 /* Descend down the tree, preferably to the left (or to the right
1145 if that's the only child). */
1146 while (node->left || node->right)
1147 if (node->left)
1148 node = node->left;
1149 else
1150 node = node->right;
1154 reg_errcode_t err = fn (extra, node);
1155 if (BE (err != REG_NOERROR, 0))
1156 return err;
1157 if (node->parent == NULL)
1158 return REG_NOERROR;
1159 prev = node;
1160 node = node->parent;
1162 /* Go up while we have a node that is reached from the right. */
1163 while (node->right == prev || node->right == NULL);
1164 node = node->right;
1168 static reg_errcode_t
1169 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1170 void *extra)
1172 bin_tree_t *node;
1174 for (node = root; ; )
1176 reg_errcode_t err = fn (extra, node);
1177 if (BE (err != REG_NOERROR, 0))
1178 return err;
1180 /* Go to the left node, or up and to the right. */
1181 if (node->left)
1182 node = node->left;
1183 else
1185 bin_tree_t *prev = NULL;
1186 while (node->right == prev || node->right == NULL)
1188 prev = node;
1189 node = node->parent;
1190 if (!node)
1191 return REG_NOERROR;
1193 node = node->right;
1198 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1199 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1200 backreferences as well. Requires a preorder visit. */
1201 static reg_errcode_t
1202 optimize_subexps (void *extra, bin_tree_t *node)
1204 re_dfa_t *dfa = (re_dfa_t *) extra;
1206 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1208 int idx = node->token.opr.idx;
1209 node->token.opr.idx = dfa->subexp_map[idx];
1210 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1213 else if (node->token.type == SUBEXP
1214 && node->left && node->left->token.type == SUBEXP)
1216 int other_idx = node->left->token.opr.idx;
1218 node->left = node->left->left;
1219 if (node->left)
1220 node->left->parent = node;
1222 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1223 if (other_idx < BITSET_WORD_BITS)
1224 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1227 return REG_NOERROR;
1230 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1231 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1232 static reg_errcode_t
1233 lower_subexps (void *extra, bin_tree_t *node)
1235 regex_t *preg = (regex_t *) extra;
1236 reg_errcode_t err = REG_NOERROR;
1238 if (node->left && node->left->token.type == SUBEXP)
1240 node->left = lower_subexp (&err, preg, node->left);
1241 if (node->left)
1242 node->left->parent = node;
1244 if (node->right && node->right->token.type == SUBEXP)
1246 node->right = lower_subexp (&err, preg, node->right);
1247 if (node->right)
1248 node->right->parent = node;
1251 return err;
1254 static bin_tree_t *
1255 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1257 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1258 bin_tree_t *body = node->left;
1259 bin_tree_t *op, *cls, *tree1, *tree;
1261 if (preg->no_sub
1262 /* We do not optimize empty subexpressions, because otherwise we may
1263 have bad CONCAT nodes with NULL children. This is obviously not
1264 very common, so we do not lose much. An example that triggers
1265 this case is the sed "script" /\(\)/x. */
1266 && node->left != NULL
1267 && (node->token.opr.idx >= BITSET_WORD_BITS
1268 || !(dfa->used_bkref_map
1269 & ((bitset_word_t) 1 << node->token.opr.idx))))
1270 return node->left;
1272 /* Convert the SUBEXP node to the concatenation of an
1273 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1274 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1275 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1276 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1277 tree = create_tree (dfa, op, tree1, CONCAT);
1278 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1280 *err = REG_ESPACE;
1281 return NULL;
1284 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1285 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1286 return tree;
1289 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1290 nodes. Requires a postorder visit. */
1291 static reg_errcode_t
1292 calc_first (void *extra, bin_tree_t *node)
1294 re_dfa_t *dfa = (re_dfa_t *) extra;
1295 if (node->token.type == CONCAT)
1297 node->first = node->left->first;
1298 node->node_idx = node->left->node_idx;
1300 else
1302 node->first = node;
1303 node->node_idx = re_dfa_add_node (dfa, node->token);
1304 if (BE (node->node_idx == -1, 0))
1305 return REG_ESPACE;
1307 return REG_NOERROR;
1310 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1311 static reg_errcode_t
1312 calc_next (void *extra, bin_tree_t *node)
1314 switch (node->token.type)
1316 case OP_DUP_ASTERISK:
1317 node->left->next = node;
1318 break;
1319 case CONCAT:
1320 node->left->next = node->right->first;
1321 node->right->next = node->next;
1322 break;
1323 default:
1324 if (node->left)
1325 node->left->next = node->next;
1326 if (node->right)
1327 node->right->next = node->next;
1328 break;
1330 return REG_NOERROR;
1333 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1334 static reg_errcode_t
1335 link_nfa_nodes (void *extra, bin_tree_t *node)
1337 re_dfa_t *dfa = (re_dfa_t *) extra;
1338 int idx = node->node_idx;
1339 reg_errcode_t err = REG_NOERROR;
1341 switch (node->token.type)
1343 case CONCAT:
1344 break;
1346 case END_OF_RE:
1347 assert (node->next == NULL);
1348 break;
1350 case OP_DUP_ASTERISK:
1351 case OP_ALT:
1353 int left, right;
1354 dfa->has_plural_match = 1;
1355 if (node->left != NULL)
1356 left = node->left->first->node_idx;
1357 else
1358 left = node->next->node_idx;
1359 if (node->right != NULL)
1360 right = node->right->first->node_idx;
1361 else
1362 right = node->next->node_idx;
1363 assert (left > -1);
1364 assert (right > -1);
1365 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1367 break;
1369 case ANCHOR:
1370 case OP_OPEN_SUBEXP:
1371 case OP_CLOSE_SUBEXP:
1372 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1373 break;
1375 case OP_BACK_REF:
1376 dfa->nexts[idx] = node->next->node_idx;
1377 if (node->token.type == OP_BACK_REF)
1378 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1379 break;
1381 default:
1382 assert (!IS_EPSILON_NODE (node->token.type));
1383 dfa->nexts[idx] = node->next->node_idx;
1384 break;
1387 return err;
1390 /* Duplicate the epsilon closure of the node ROOT_NODE.
1391 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1392 to their own constraint. */
1394 static reg_errcode_t
1395 internal_function
1396 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1397 int root_node, unsigned int init_constraint)
1399 int org_node, clone_node, ret;
1400 unsigned int constraint = init_constraint;
1401 for (org_node = top_org_node, clone_node = top_clone_node;;)
1403 int org_dest, clone_dest;
1404 if (dfa->nodes[org_node].type == OP_BACK_REF)
1406 /* If the back reference epsilon-transit, its destination must
1407 also have the constraint. Then duplicate the epsilon closure
1408 of the destination of the back reference, and store it in
1409 edests of the back reference. */
1410 org_dest = dfa->nexts[org_node];
1411 re_node_set_empty (dfa->edests + clone_node);
1412 clone_dest = duplicate_node (dfa, org_dest, constraint);
1413 if (BE (clone_dest == -1, 0))
1414 return REG_ESPACE;
1415 dfa->nexts[clone_node] = dfa->nexts[org_node];
1416 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1417 if (BE (ret < 0, 0))
1418 return REG_ESPACE;
1420 else if (dfa->edests[org_node].nelem == 0)
1422 /* In case of the node can't epsilon-transit, don't duplicate the
1423 destination and store the original destination as the
1424 destination of the node. */
1425 dfa->nexts[clone_node] = dfa->nexts[org_node];
1426 break;
1428 else if (dfa->edests[org_node].nelem == 1)
1430 /* In case of the node can epsilon-transit, and it has only one
1431 destination. */
1432 org_dest = dfa->edests[org_node].elems[0];
1433 re_node_set_empty (dfa->edests + clone_node);
1434 if (dfa->nodes[org_node].type == ANCHOR)
1436 /* In case of the node has another constraint, append it. */
1437 if (org_node == root_node && clone_node != org_node)
1439 /* ...but if the node is root_node itself, it means the
1440 epsilon closure have a loop, then tie it to the
1441 destination of the root_node. */
1442 ret = re_node_set_insert (dfa->edests + clone_node,
1443 org_dest);
1444 if (BE (ret < 0, 0))
1445 return REG_ESPACE;
1446 break;
1448 constraint |= dfa->nodes[org_node].opr.ctx_type;
1450 clone_dest = duplicate_node (dfa, org_dest, constraint);
1451 if (BE (clone_dest == -1, 0))
1452 return REG_ESPACE;
1453 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1454 if (BE (ret < 0, 0))
1455 return REG_ESPACE;
1457 else /* dfa->edests[org_node].nelem == 2 */
1459 /* In case of the node can epsilon-transit, and it has two
1460 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1461 org_dest = dfa->edests[org_node].elems[0];
1462 re_node_set_empty (dfa->edests + clone_node);
1463 /* Search for a duplicated node which satisfies the constraint. */
1464 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1465 if (clone_dest == -1)
1467 /* There are no such a duplicated node, create a new one. */
1468 reg_errcode_t err;
1469 clone_dest = duplicate_node (dfa, org_dest, constraint);
1470 if (BE (clone_dest == -1, 0))
1471 return REG_ESPACE;
1472 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1473 if (BE (ret < 0, 0))
1474 return REG_ESPACE;
1475 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1476 root_node, constraint);
1477 if (BE (err != REG_NOERROR, 0))
1478 return err;
1480 else
1482 /* There are a duplicated node which satisfy the constraint,
1483 use it to avoid infinite loop. */
1484 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1485 if (BE (ret < 0, 0))
1486 return REG_ESPACE;
1489 org_dest = dfa->edests[org_node].elems[1];
1490 clone_dest = duplicate_node (dfa, org_dest, constraint);
1491 if (BE (clone_dest == -1, 0))
1492 return REG_ESPACE;
1493 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1494 if (BE (ret < 0, 0))
1495 return REG_ESPACE;
1497 org_node = org_dest;
1498 clone_node = clone_dest;
1500 return REG_NOERROR;
1503 /* Search for a node which is duplicated from the node ORG_NODE, and
1504 satisfies the constraint CONSTRAINT. */
1506 static int
1507 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1508 unsigned int constraint)
1510 int idx;
1511 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1513 if (org_node == dfa->org_indices[idx]
1514 && constraint == dfa->nodes[idx].constraint)
1515 return idx; /* Found. */
1517 return -1; /* Not found. */
1520 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1521 Return the index of the new node, or -1 if insufficient storage is
1522 available. */
1524 static int
1525 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1527 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1528 if (BE (dup_idx != -1, 1))
1530 dfa->nodes[dup_idx].constraint = constraint;
1531 if (dfa->nodes[org_idx].type == ANCHOR)
1532 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1533 dfa->nodes[dup_idx].duplicated = 1;
1535 /* Store the index of the original node. */
1536 dfa->org_indices[dup_idx] = org_idx;
1538 return dup_idx;
1541 static reg_errcode_t
1542 calc_inveclosure (re_dfa_t *dfa)
1544 int src, idx, ret;
1545 for (idx = 0; idx < dfa->nodes_len; ++idx)
1546 re_node_set_init_empty (dfa->inveclosures + idx);
1548 for (src = 0; src < dfa->nodes_len; ++src)
1550 int *elems = dfa->eclosures[src].elems;
1551 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1553 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1554 if (BE (ret == -1, 0))
1555 return REG_ESPACE;
1559 return REG_NOERROR;
1562 /* Calculate "eclosure" for all the node in DFA. */
1564 static reg_errcode_t
1565 calc_eclosure (re_dfa_t *dfa)
1567 int node_idx, incomplete;
1568 #ifdef DEBUG
1569 assert (dfa->nodes_len > 0);
1570 #endif
1571 incomplete = 0;
1572 /* For each nodes, calculate epsilon closure. */
1573 for (node_idx = 0; ; ++node_idx)
1575 reg_errcode_t err;
1576 re_node_set eclosure_elem;
1577 if (node_idx == dfa->nodes_len)
1579 if (!incomplete)
1580 break;
1581 incomplete = 0;
1582 node_idx = 0;
1585 #ifdef DEBUG
1586 assert (dfa->eclosures[node_idx].nelem != -1);
1587 #endif
1589 /* If we have already calculated, skip it. */
1590 if (dfa->eclosures[node_idx].nelem != 0)
1591 continue;
1592 /* Calculate epsilon closure of `node_idx'. */
1593 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1594 if (BE (err != REG_NOERROR, 0))
1595 return err;
1597 if (dfa->eclosures[node_idx].nelem == 0)
1599 incomplete = 1;
1600 re_node_set_free (&eclosure_elem);
1603 return REG_NOERROR;
1606 /* Calculate epsilon closure of NODE. */
1608 static reg_errcode_t
1609 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1611 reg_errcode_t err;
1612 unsigned int constraint;
1613 int i, incomplete;
1614 re_node_set eclosure;
1615 incomplete = 0;
1616 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1617 if (BE (err != REG_NOERROR, 0))
1618 return err;
1620 /* This indicates that we are calculating this node now.
1621 We reference this value to avoid infinite loop. */
1622 dfa->eclosures[node].nelem = -1;
1624 constraint = ((dfa->nodes[node].type == ANCHOR)
1625 ? dfa->nodes[node].opr.ctx_type : 0);
1626 /* If the current node has constraints, duplicate all nodes.
1627 Since they must inherit the constraints. */
1628 if (constraint
1629 && dfa->edests[node].nelem
1630 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1632 err = duplicate_node_closure (dfa, node, node, node, constraint);
1633 if (BE (err != REG_NOERROR, 0))
1634 return err;
1637 /* Expand each epsilon destination nodes. */
1638 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1639 for (i = 0; i < dfa->edests[node].nelem; ++i)
1641 re_node_set eclosure_elem;
1642 int edest = dfa->edests[node].elems[i];
1643 /* If calculating the epsilon closure of `edest' is in progress,
1644 return intermediate result. */
1645 if (dfa->eclosures[edest].nelem == -1)
1647 incomplete = 1;
1648 continue;
1650 /* If we haven't calculated the epsilon closure of `edest' yet,
1651 calculate now. Otherwise use calculated epsilon closure. */
1652 if (dfa->eclosures[edest].nelem == 0)
1654 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1655 if (BE (err != REG_NOERROR, 0))
1656 return err;
1658 else
1659 eclosure_elem = dfa->eclosures[edest];
1660 /* Merge the epsilon closure of `edest'. */
1661 re_node_set_merge (&eclosure, &eclosure_elem);
1662 /* If the epsilon closure of `edest' is incomplete,
1663 the epsilon closure of this node is also incomplete. */
1664 if (dfa->eclosures[edest].nelem == 0)
1666 incomplete = 1;
1667 re_node_set_free (&eclosure_elem);
1671 /* Epsilon closures include itself. */
1672 re_node_set_insert (&eclosure, node);
1673 if (incomplete && !root)
1674 dfa->eclosures[node].nelem = 0;
1675 else
1676 dfa->eclosures[node] = eclosure;
1677 *new_set = eclosure;
1678 return REG_NOERROR;
1681 /* Functions for token which are used in the parser. */
1683 /* Fetch a token from INPUT.
1684 We must not use this function inside bracket expressions. */
1686 static void
1687 internal_function
1688 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1690 re_string_skip_bytes (input, peek_token (result, input, syntax));
1693 /* Peek a token from INPUT, and return the length of the token.
1694 We must not use this function inside bracket expressions. */
1696 static int
1697 internal_function
1698 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1700 unsigned char c;
1702 if (re_string_eoi (input))
1704 token->type = END_OF_RE;
1705 return 0;
1708 c = re_string_peek_byte (input, 0);
1709 token->opr.c = c;
1711 token->word_char = 0;
1712 #ifdef RE_ENABLE_I18N
1713 token->mb_partial = 0;
1714 if (input->mb_cur_max > 1 &&
1715 !re_string_first_byte (input, re_string_cur_idx (input)))
1717 token->type = CHARACTER;
1718 token->mb_partial = 1;
1719 return 1;
1721 #endif
1722 if (c == '\\')
1724 unsigned char c2;
1725 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1727 token->type = BACK_SLASH;
1728 return 1;
1731 c2 = re_string_peek_byte_case (input, 1);
1732 token->opr.c = c2;
1733 token->type = CHARACTER;
1734 #ifdef RE_ENABLE_I18N
1735 if (input->mb_cur_max > 1)
1737 wint_t wc = re_string_wchar_at (input,
1738 re_string_cur_idx (input) + 1);
1739 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1741 else
1742 #endif
1743 token->word_char = IS_WORD_CHAR (c2) != 0;
1745 switch (c2)
1747 case '|':
1748 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1749 token->type = OP_ALT;
1750 break;
1751 case '1': case '2': case '3': case '4': case '5':
1752 case '6': case '7': case '8': case '9':
1753 if (!(syntax & RE_NO_BK_REFS))
1755 token->type = OP_BACK_REF;
1756 token->opr.idx = c2 - '1';
1758 break;
1759 case '<':
1760 if (!(syntax & RE_NO_GNU_OPS))
1762 token->type = ANCHOR;
1763 token->opr.ctx_type = WORD_FIRST;
1765 break;
1766 case '>':
1767 if (!(syntax & RE_NO_GNU_OPS))
1769 token->type = ANCHOR;
1770 token->opr.ctx_type = WORD_LAST;
1772 break;
1773 case 'b':
1774 if (!(syntax & RE_NO_GNU_OPS))
1776 token->type = ANCHOR;
1777 token->opr.ctx_type = WORD_DELIM;
1779 break;
1780 case 'B':
1781 if (!(syntax & RE_NO_GNU_OPS))
1783 token->type = ANCHOR;
1784 token->opr.ctx_type = NOT_WORD_DELIM;
1786 break;
1787 case 'w':
1788 if (!(syntax & RE_NO_GNU_OPS))
1789 token->type = OP_WORD;
1790 break;
1791 case 'W':
1792 if (!(syntax & RE_NO_GNU_OPS))
1793 token->type = OP_NOTWORD;
1794 break;
1795 case 's':
1796 if (!(syntax & RE_NO_GNU_OPS))
1797 token->type = OP_SPACE;
1798 break;
1799 case 'S':
1800 if (!(syntax & RE_NO_GNU_OPS))
1801 token->type = OP_NOTSPACE;
1802 break;
1803 case '`':
1804 if (!(syntax & RE_NO_GNU_OPS))
1806 token->type = ANCHOR;
1807 token->opr.ctx_type = BUF_FIRST;
1809 break;
1810 case '\'':
1811 if (!(syntax & RE_NO_GNU_OPS))
1813 token->type = ANCHOR;
1814 token->opr.ctx_type = BUF_LAST;
1816 break;
1817 case '(':
1818 if (!(syntax & RE_NO_BK_PARENS))
1819 token->type = OP_OPEN_SUBEXP;
1820 break;
1821 case ')':
1822 if (!(syntax & RE_NO_BK_PARENS))
1823 token->type = OP_CLOSE_SUBEXP;
1824 break;
1825 case '+':
1826 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1827 token->type = OP_DUP_PLUS;
1828 break;
1829 case '?':
1830 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1831 token->type = OP_DUP_QUESTION;
1832 break;
1833 case '{':
1834 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1835 token->type = OP_OPEN_DUP_NUM;
1836 break;
1837 case '}':
1838 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1839 token->type = OP_CLOSE_DUP_NUM;
1840 break;
1841 default:
1842 break;
1844 return 2;
1847 token->type = CHARACTER;
1848 #ifdef RE_ENABLE_I18N
1849 if (input->mb_cur_max > 1)
1851 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1852 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1854 else
1855 #endif
1856 token->word_char = IS_WORD_CHAR (token->opr.c);
1858 switch (c)
1860 case '\n':
1861 if (syntax & RE_NEWLINE_ALT)
1862 token->type = OP_ALT;
1863 break;
1864 case '|':
1865 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1866 token->type = OP_ALT;
1867 break;
1868 case '*':
1869 token->type = OP_DUP_ASTERISK;
1870 break;
1871 case '+':
1872 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1873 token->type = OP_DUP_PLUS;
1874 break;
1875 case '?':
1876 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1877 token->type = OP_DUP_QUESTION;
1878 break;
1879 case '{':
1880 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1881 token->type = OP_OPEN_DUP_NUM;
1882 break;
1883 case '}':
1884 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1885 token->type = OP_CLOSE_DUP_NUM;
1886 break;
1887 case '(':
1888 if (syntax & RE_NO_BK_PARENS)
1889 token->type = OP_OPEN_SUBEXP;
1890 break;
1891 case ')':
1892 if (syntax & RE_NO_BK_PARENS)
1893 token->type = OP_CLOSE_SUBEXP;
1894 break;
1895 case '[':
1896 token->type = OP_OPEN_BRACKET;
1897 break;
1898 case '.':
1899 token->type = OP_PERIOD;
1900 break;
1901 case '^':
1902 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1903 re_string_cur_idx (input) != 0)
1905 char prev = re_string_peek_byte (input, -1);
1906 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1907 break;
1909 token->type = ANCHOR;
1910 token->opr.ctx_type = LINE_FIRST;
1911 break;
1912 case '$':
1913 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1914 re_string_cur_idx (input) + 1 != re_string_length (input))
1916 re_token_t next;
1917 re_string_skip_bytes (input, 1);
1918 peek_token (&next, input, syntax);
1919 re_string_skip_bytes (input, -1);
1920 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1921 break;
1923 token->type = ANCHOR;
1924 token->opr.ctx_type = LINE_LAST;
1925 break;
1926 default:
1927 break;
1929 return 1;
1932 /* Peek a token from INPUT, and return the length of the token.
1933 We must not use this function out of bracket expressions. */
1935 static int
1936 internal_function
1937 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1939 unsigned char c;
1940 if (re_string_eoi (input))
1942 token->type = END_OF_RE;
1943 return 0;
1945 c = re_string_peek_byte (input, 0);
1946 token->opr.c = c;
1948 #ifdef RE_ENABLE_I18N
1949 if (input->mb_cur_max > 1 &&
1950 !re_string_first_byte (input, re_string_cur_idx (input)))
1952 token->type = CHARACTER;
1953 return 1;
1955 #endif
1957 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1958 && re_string_cur_idx (input) + 1 < re_string_length (input))
1960 /* In this case, '\' escape a character. */
1961 unsigned char c2;
1962 re_string_skip_bytes (input, 1);
1963 c2 = re_string_peek_byte (input, 0);
1964 token->opr.c = c2;
1965 token->type = CHARACTER;
1966 return 1;
1968 if (c == '[') /* '[' is a special char in a bracket exps. */
1970 unsigned char c2;
1971 int token_len;
1972 if (re_string_cur_idx (input) + 1 < re_string_length (input))
1973 c2 = re_string_peek_byte (input, 1);
1974 else
1975 c2 = 0;
1976 token->opr.c = c2;
1977 token_len = 2;
1978 switch (c2)
1980 case '.':
1981 token->type = OP_OPEN_COLL_ELEM;
1982 break;
1983 case '=':
1984 token->type = OP_OPEN_EQUIV_CLASS;
1985 break;
1986 case ':':
1987 if (syntax & RE_CHAR_CLASSES)
1989 token->type = OP_OPEN_CHAR_CLASS;
1990 break;
1992 /* else fall through. */
1993 default:
1994 token->type = CHARACTER;
1995 token->opr.c = c;
1996 token_len = 1;
1997 break;
1999 return token_len;
2001 switch (c)
2003 case '-':
2004 token->type = OP_CHARSET_RANGE;
2005 break;
2006 case ']':
2007 token->type = OP_CLOSE_BRACKET;
2008 break;
2009 case '^':
2010 token->type = OP_NON_MATCH_LIST;
2011 break;
2012 default:
2013 token->type = CHARACTER;
2015 return 1;
2018 /* Functions for parser. */
2020 /* Entry point of the parser.
2021 Parse the regular expression REGEXP and return the structure tree.
2022 If an error is occured, ERR is set by error code, and return NULL.
2023 This function build the following tree, from regular expression <reg_exp>:
2027 <reg_exp> EOR
2029 CAT means concatenation.
2030 EOR means end of regular expression. */
2032 static bin_tree_t *
2033 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2034 reg_errcode_t *err)
2036 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2037 bin_tree_t *tree, *eor, *root;
2038 re_token_t current_token;
2039 dfa->syntax = syntax;
2040 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2041 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2042 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2043 return NULL;
2044 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2045 if (tree != NULL)
2046 root = create_tree (dfa, tree, eor, CONCAT);
2047 else
2048 root = eor;
2049 if (BE (eor == NULL || root == NULL, 0))
2051 *err = REG_ESPACE;
2052 return NULL;
2054 return root;
2057 /* This function build the following tree, from regular expression
2058 <branch1>|<branch2>:
2062 <branch1> <branch2>
2064 ALT means alternative, which represents the operator `|'. */
2066 static bin_tree_t *
2067 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2068 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2070 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2071 bin_tree_t *tree, *branch = NULL;
2072 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2073 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2074 return NULL;
2076 while (token->type == OP_ALT)
2078 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2079 if (token->type != OP_ALT && token->type != END_OF_RE
2080 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2082 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2083 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2084 return NULL;
2086 else
2087 branch = NULL;
2088 tree = create_tree (dfa, tree, branch, OP_ALT);
2089 if (BE (tree == NULL, 0))
2091 *err = REG_ESPACE;
2092 return NULL;
2095 return tree;
2098 /* This function build the following tree, from regular expression
2099 <exp1><exp2>:
2103 <exp1> <exp2>
2105 CAT means concatenation. */
2107 static bin_tree_t *
2108 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2109 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2111 bin_tree_t *tree, *exp;
2112 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2113 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2114 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2115 return NULL;
2117 while (token->type != OP_ALT && token->type != END_OF_RE
2118 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2120 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2121 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2123 return NULL;
2125 if (tree != NULL && exp != NULL)
2127 tree = create_tree (dfa, tree, exp, CONCAT);
2128 if (tree == NULL)
2130 *err = REG_ESPACE;
2131 return NULL;
2134 else if (tree == NULL)
2135 tree = exp;
2136 /* Otherwise exp == NULL, we don't need to create new tree. */
2138 return tree;
2141 /* This function build the following tree, from regular expression a*:
2147 static bin_tree_t *
2148 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2149 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2151 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2152 bin_tree_t *tree;
2153 switch (token->type)
2155 case CHARACTER:
2156 tree = create_token_tree (dfa, NULL, NULL, token);
2157 if (BE (tree == NULL, 0))
2159 *err = REG_ESPACE;
2160 return NULL;
2162 #ifdef RE_ENABLE_I18N
2163 if (dfa->mb_cur_max > 1)
2165 while (!re_string_eoi (regexp)
2166 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2168 bin_tree_t *mbc_remain;
2169 fetch_token (token, regexp, syntax);
2170 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2171 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2172 if (BE (mbc_remain == NULL || tree == NULL, 0))
2174 *err = REG_ESPACE;
2175 return NULL;
2179 #endif
2180 break;
2181 case OP_OPEN_SUBEXP:
2182 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2183 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2184 return NULL;
2185 break;
2186 case OP_OPEN_BRACKET:
2187 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2188 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2189 return NULL;
2190 break;
2191 case OP_BACK_REF:
2192 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2194 *err = REG_ESUBREG;
2195 return NULL;
2197 dfa->used_bkref_map |= 1 << token->opr.idx;
2198 tree = create_token_tree (dfa, NULL, NULL, token);
2199 if (BE (tree == NULL, 0))
2201 *err = REG_ESPACE;
2202 return NULL;
2204 ++dfa->nbackref;
2205 dfa->has_mb_node = 1;
2206 break;
2207 case OP_OPEN_DUP_NUM:
2208 if (syntax & RE_CONTEXT_INVALID_DUP)
2210 *err = REG_BADRPT;
2211 return NULL;
2213 /* FALLTHROUGH */
2214 case OP_DUP_ASTERISK:
2215 case OP_DUP_PLUS:
2216 case OP_DUP_QUESTION:
2217 if (syntax & RE_CONTEXT_INVALID_OPS)
2219 *err = REG_BADRPT;
2220 return NULL;
2222 else if (syntax & RE_CONTEXT_INDEP_OPS)
2224 fetch_token (token, regexp, syntax);
2225 return parse_expression (regexp, preg, token, syntax, nest, err);
2227 /* else fall through */
2228 case OP_CLOSE_SUBEXP:
2229 if ((token->type == OP_CLOSE_SUBEXP) &&
2230 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2232 *err = REG_ERPAREN;
2233 return NULL;
2235 /* else fall through */
2236 case OP_CLOSE_DUP_NUM:
2237 /* We treat it as a normal character. */
2239 /* Then we can these characters as normal characters. */
2240 token->type = CHARACTER;
2241 /* mb_partial and word_char bits should be initialized already
2242 by peek_token. */
2243 tree = create_token_tree (dfa, NULL, NULL, token);
2244 if (BE (tree == NULL, 0))
2246 *err = REG_ESPACE;
2247 return NULL;
2249 break;
2250 case ANCHOR:
2251 if ((token->opr.ctx_type
2252 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2253 && dfa->word_ops_used == 0)
2254 init_word_char (dfa);
2255 if (token->opr.ctx_type == WORD_DELIM
2256 || token->opr.ctx_type == NOT_WORD_DELIM)
2258 bin_tree_t *tree_first, *tree_last;
2259 if (token->opr.ctx_type == WORD_DELIM)
2261 token->opr.ctx_type = WORD_FIRST;
2262 tree_first = create_token_tree (dfa, NULL, NULL, token);
2263 token->opr.ctx_type = WORD_LAST;
2265 else
2267 token->opr.ctx_type = INSIDE_WORD;
2268 tree_first = create_token_tree (dfa, NULL, NULL, token);
2269 token->opr.ctx_type = INSIDE_NOTWORD;
2271 tree_last = create_token_tree (dfa, NULL, NULL, token);
2272 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2273 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2275 *err = REG_ESPACE;
2276 return NULL;
2279 else
2281 tree = create_token_tree (dfa, NULL, NULL, token);
2282 if (BE (tree == NULL, 0))
2284 *err = REG_ESPACE;
2285 return NULL;
2288 /* We must return here, since ANCHORs can't be followed
2289 by repetition operators.
2290 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2291 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2292 fetch_token (token, regexp, syntax);
2293 return tree;
2294 case OP_PERIOD:
2295 tree = create_token_tree (dfa, NULL, NULL, token);
2296 if (BE (tree == NULL, 0))
2298 *err = REG_ESPACE;
2299 return NULL;
2301 if (dfa->mb_cur_max > 1)
2302 dfa->has_mb_node = 1;
2303 break;
2304 case OP_WORD:
2305 case OP_NOTWORD:
2306 tree = build_charclass_op (dfa, regexp->trans,
2307 (const unsigned char *) "alnum",
2308 (const unsigned char *) "_",
2309 token->type == OP_NOTWORD, err);
2310 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2311 return NULL;
2312 break;
2313 case OP_SPACE:
2314 case OP_NOTSPACE:
2315 tree = build_charclass_op (dfa, regexp->trans,
2316 (const unsigned char *) "space",
2317 (const unsigned char *) "",
2318 token->type == OP_NOTSPACE, err);
2319 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2320 return NULL;
2321 break;
2322 case OP_ALT:
2323 case END_OF_RE:
2324 return NULL;
2325 case BACK_SLASH:
2326 *err = REG_EESCAPE;
2327 return NULL;
2328 default:
2329 /* Must not happen? */
2330 #ifdef DEBUG
2331 assert (0);
2332 #endif
2333 return NULL;
2335 fetch_token (token, regexp, syntax);
2337 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2338 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2340 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2341 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2342 return NULL;
2343 /* In BRE consecutive duplications are not allowed. */
2344 if ((syntax & RE_CONTEXT_INVALID_DUP)
2345 && (token->type == OP_DUP_ASTERISK
2346 || token->type == OP_OPEN_DUP_NUM))
2348 *err = REG_BADRPT;
2349 return NULL;
2353 return tree;
2356 /* This function build the following tree, from regular expression
2357 (<reg_exp>):
2358 SUBEXP
2360 <reg_exp>
2363 static bin_tree_t *
2364 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2365 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2367 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2368 bin_tree_t *tree;
2369 size_t cur_nsub;
2370 cur_nsub = preg->re_nsub++;
2372 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2374 /* The subexpression may be a null string. */
2375 if (token->type == OP_CLOSE_SUBEXP)
2376 tree = NULL;
2377 else
2379 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2380 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2381 *err = REG_EPAREN;
2382 if (BE (*err != REG_NOERROR, 0))
2383 return NULL;
2386 if (cur_nsub <= '9' - '1')
2387 dfa->completed_bkref_map |= 1 << cur_nsub;
2389 tree = create_tree (dfa, tree, NULL, SUBEXP);
2390 if (BE (tree == NULL, 0))
2392 *err = REG_ESPACE;
2393 return NULL;
2395 tree->token.opr.idx = cur_nsub;
2396 return tree;
2399 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2401 static bin_tree_t *
2402 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2403 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2405 bin_tree_t *tree = NULL, *old_tree = NULL;
2406 int i, start, end, start_idx = re_string_cur_idx (regexp);
2407 re_token_t start_token = *token;
2409 if (token->type == OP_OPEN_DUP_NUM)
2411 end = 0;
2412 start = fetch_number (regexp, token, syntax);
2413 if (start == -1)
2415 if (token->type == CHARACTER && token->opr.c == ',')
2416 start = 0; /* We treat "{,m}" as "{0,m}". */
2417 else
2419 *err = REG_BADBR; /* <re>{} is invalid. */
2420 return NULL;
2423 if (BE (start != -2, 1))
2425 /* We treat "{n}" as "{n,n}". */
2426 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2427 : ((token->type == CHARACTER && token->opr.c == ',')
2428 ? fetch_number (regexp, token, syntax) : -2));
2430 if (BE (start == -2 || end == -2, 0))
2432 /* Invalid sequence. */
2433 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2435 if (token->type == END_OF_RE)
2436 *err = REG_EBRACE;
2437 else
2438 *err = REG_BADBR;
2440 return NULL;
2443 /* If the syntax bit is set, rollback. */
2444 re_string_set_index (regexp, start_idx);
2445 *token = start_token;
2446 token->type = CHARACTER;
2447 /* mb_partial and word_char bits should be already initialized by
2448 peek_token. */
2449 return elem;
2452 if (BE (end != -1 && start > end, 0))
2454 /* First number greater than second. */
2455 *err = REG_BADBR;
2456 return NULL;
2459 else
2461 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2462 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2465 fetch_token (token, regexp, syntax);
2467 if (BE (elem == NULL, 0))
2468 return NULL;
2469 if (BE (start == 0 && end == 0, 0))
2471 postorder (elem, free_tree, NULL);
2472 return NULL;
2475 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2476 if (BE (start > 0, 0))
2478 tree = elem;
2479 for (i = 2; i <= start; ++i)
2481 elem = duplicate_tree (elem, dfa);
2482 tree = create_tree (dfa, tree, elem, CONCAT);
2483 if (BE (elem == NULL || tree == NULL, 0))
2484 goto parse_dup_op_espace;
2487 if (start == end)
2488 return tree;
2490 /* Duplicate ELEM before it is marked optional. */
2491 elem = duplicate_tree (elem, dfa);
2492 old_tree = tree;
2494 else
2495 old_tree = NULL;
2497 if (elem->token.type == SUBEXP)
2498 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2500 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2501 if (BE (tree == NULL, 0))
2502 goto parse_dup_op_espace;
2504 /* This loop is actually executed only when end != -1,
2505 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2506 already created the start+1-th copy. */
2507 for (i = start + 2; i <= end; ++i)
2509 elem = duplicate_tree (elem, dfa);
2510 tree = create_tree (dfa, tree, elem, CONCAT);
2511 if (BE (elem == NULL || tree == NULL, 0))
2512 goto parse_dup_op_espace;
2514 tree = create_tree (dfa, tree, NULL, OP_ALT);
2515 if (BE (tree == NULL, 0))
2516 goto parse_dup_op_espace;
2519 if (old_tree)
2520 tree = create_tree (dfa, old_tree, tree, CONCAT);
2522 return tree;
2524 parse_dup_op_espace:
2525 *err = REG_ESPACE;
2526 return NULL;
2529 /* Size of the names for collating symbol/equivalence_class/character_class.
2530 I'm not sure, but maybe enough. */
2531 #define BRACKET_NAME_BUF_SIZE 32
2533 #if 1
2534 /* Local function for parse_bracket_exp only used in case of NOT glibc.
2535 Build the range expression which starts from START_ELEM, and ends
2536 at END_ELEM. The result are written to MBCSET and SBCSET.
2537 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2538 mbcset->range_ends, is a pointer argument sinse we may
2539 update it. */
2541 static reg_errcode_t
2542 internal_function
2543 # ifdef RE_ENABLE_I18N
2544 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2545 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2546 # else
2547 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2548 bracket_elem_t *end_elem)
2549 # endif
2551 unsigned int start_ch, end_ch;
2552 /* Equivalence Classes and Character Classes can't be a range start/end. */
2553 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2554 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2556 return REG_ERANGE;
2558 /* We can handle no multi character collating elements without libc
2559 support. */
2560 if (BE ((start_elem->type == COLL_SYM
2561 && strlen ((char *) start_elem->opr.name) > 1)
2562 || (end_elem->type == COLL_SYM
2563 && strlen ((char *) end_elem->opr.name) > 1), 0))
2564 return REG_ECOLLATE;
2566 # ifdef RE_ENABLE_I18N
2568 wchar_t wc;
2569 wint_t start_wc;
2570 wint_t end_wc;
2571 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2573 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2574 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2575 : 0));
2576 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2577 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2578 : 0));
2579 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2580 ? __btowc (start_ch) : start_elem->opr.wch);
2581 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2582 ? __btowc (end_ch) : end_elem->opr.wch);
2583 if (start_wc == WEOF || end_wc == WEOF)
2584 return REG_ECOLLATE;
2585 cmp_buf[0] = start_wc;
2586 cmp_buf[4] = end_wc;
2587 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2588 return REG_ERANGE;
2590 /* Got valid collation sequence values, add them as a new entry.
2591 However, for !glibc we have no collation elements: if the
2592 character set is single byte, the single byte character set
2593 that we build below suffices. parse_bracket_exp passes
2594 no MBCSET if dfa->mb_cur_max == 1. */
2595 if (mbcset)
2597 /* Check the space of the arrays. */
2598 if (BE (*range_alloc == mbcset->nranges, 0))
2600 /* There is not enough space, need realloc. */
2601 wchar_t *new_array_start, *new_array_end;
2602 int new_nranges;
2604 /* +1 in case of mbcset->nranges is 0. */
2605 new_nranges = 2 * mbcset->nranges + 1;
2606 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2607 are NULL if *range_alloc == 0. */
2608 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2609 new_nranges);
2610 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2611 new_nranges);
2613 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2614 return REG_ESPACE;
2616 mbcset->range_starts = new_array_start;
2617 mbcset->range_ends = new_array_end;
2618 *range_alloc = new_nranges;
2621 mbcset->range_starts[mbcset->nranges] = start_wc;
2622 mbcset->range_ends[mbcset->nranges++] = end_wc;
2625 /* Build the table for single byte characters. */
2626 for (wc = 0; wc < SBC_MAX; ++wc)
2628 cmp_buf[2] = wc;
2629 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2630 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2631 bitset_set (sbcset, wc);
2634 # else /* not RE_ENABLE_I18N */
2636 unsigned int ch;
2637 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2638 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2639 : 0));
2640 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2641 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2642 : 0));
2643 if (start_ch > end_ch)
2644 return REG_ERANGE;
2645 /* Build the table for single byte characters. */
2646 for (ch = 0; ch < SBC_MAX; ++ch)
2647 if (start_ch <= ch && ch <= end_ch)
2648 bitset_set (sbcset, ch);
2650 # endif /* not RE_ENABLE_I18N */
2651 return REG_NOERROR;
2653 #endif
2655 #if 1
2656 /* Helper function for parse_bracket_exp only used in case of NOT glibc.
2657 Build the collating element which is represented by NAME.
2658 The result are written to MBCSET and SBCSET.
2659 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2660 pointer argument since we may update it. */
2662 static reg_errcode_t
2663 internal_function
2664 # ifdef RE_ENABLE_I18N
2665 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2666 int *coll_sym_alloc, const unsigned char *name)
2667 # else
2668 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2669 # endif
2671 size_t name_len = strlen ((const char *) name);
2672 if (BE (name_len != 1, 0))
2673 return REG_ECOLLATE;
2674 bitset_set (sbcset, name[0]);
2675 return REG_NOERROR;
2677 #endif
2679 /* This function parse bracket expression like "[abc]", "[a-c]",
2680 "[[.a-a.]]" etc. */
2682 static bin_tree_t *
2683 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2684 reg_syntax_t syntax, reg_errcode_t *err)
2686 #if 0
2687 const unsigned char *collseqmb;
2688 const char *collseqwc;
2689 uint32_t nrules;
2690 int32_t table_size;
2691 const int32_t *symb_table;
2692 const unsigned char *extra;
2694 /* Local function for parse_bracket_exp used in glibc.
2695 Seek the collating symbol entry correspondings to NAME.
2696 Return the index of the symbol in the SYMB_TABLE. */
2698 auto __inline__ int32_t
2699 __attribute ((always_inline))
2700 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2702 int32_t hash = elem_hash ((const char *) name, name_len);
2703 int32_t elem = hash % table_size;
2704 if (symb_table[2 * elem] != 0)
2706 int32_t second = hash % (table_size - 2) + 1;
2710 /* First compare the hashing value. */
2711 if (symb_table[2 * elem] == hash
2712 /* Compare the length of the name. */
2713 && name_len == extra[symb_table[2 * elem + 1]]
2714 /* Compare the name. */
2715 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2716 name_len) == 0)
2718 /* Yep, this is the entry. */
2719 break;
2722 /* Next entry. */
2723 elem += second;
2725 while (symb_table[2 * elem] != 0);
2727 return elem;
2730 /* Local function for parse_bracket_exp used in glibc.
2731 Look up the collation sequence value of BR_ELEM.
2732 Return the value if succeeded, UINT_MAX otherwise. */
2734 auto __inline__ unsigned int
2735 __attribute ((always_inline))
2736 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2738 if (br_elem->type == SB_CHAR)
2741 if (MB_CUR_MAX == 1)
2743 if (nrules == 0)
2744 return collseqmb[br_elem->opr.ch];
2745 else
2747 wint_t wc = __btowc (br_elem->opr.ch);
2748 return __collseq_table_lookup (collseqwc, wc);
2751 else if (br_elem->type == MB_CHAR)
2753 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2755 else if (br_elem->type == COLL_SYM)
2757 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2758 if (nrules != 0)
2760 int32_t elem, idx;
2761 elem = seek_collating_symbol_entry (br_elem->opr.name,
2762 sym_name_len);
2763 if (symb_table[2 * elem] != 0)
2765 /* We found the entry. */
2766 idx = symb_table[2 * elem + 1];
2767 /* Skip the name of collating element name. */
2768 idx += 1 + extra[idx];
2769 /* Skip the byte sequence of the collating element. */
2770 idx += 1 + extra[idx];
2771 /* Adjust for the alignment. */
2772 idx = (idx + 3) & ~3;
2773 /* Skip the multibyte collation sequence value. */
2774 idx += sizeof (unsigned int);
2775 /* Skip the wide char sequence of the collating element. */
2776 idx += sizeof (unsigned int) *
2777 (1 + *(unsigned int *) (extra + idx));
2778 /* Return the collation sequence value. */
2779 return *(unsigned int *) (extra + idx);
2781 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2783 /* No valid character. Match it as a single byte
2784 character. */
2785 return collseqmb[br_elem->opr.name[0]];
2788 else if (sym_name_len == 1)
2789 return collseqmb[br_elem->opr.name[0]];
2791 return UINT_MAX;
2794 /* Local function for parse_bracket_exp used in glibc.
2795 Build the range expression which starts from START_ELEM, and ends
2796 at END_ELEM. The result are written to MBCSET and SBCSET.
2797 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2798 mbcset->range_ends, is a pointer argument sinse we may
2799 update it. */
2801 auto __inline__ reg_errcode_t
2802 __attribute ((always_inline))
2803 build_range_exp (re_charset_t *mbcset,
2804 int *range_alloc,
2805 bitset_t sbcset,
2806 bracket_elem_t *start_elem,
2807 bracket_elem_t *end_elem)
2809 unsigned int ch;
2810 uint32_t start_collseq;
2811 uint32_t end_collseq;
2813 /* Equivalence Classes and Character Classes can't be a range
2814 start/end. */
2815 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2816 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2818 return REG_ERANGE;
2820 start_collseq = lookup_collation_sequence_value (start_elem);
2821 end_collseq = lookup_collation_sequence_value (end_elem);
2822 /* Check start/end collation sequence values. */
2823 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2824 return REG_ECOLLATE;
2825 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2826 return REG_ERANGE;
2828 /* Got valid collation sequence values, add them as a new entry.
2829 However, if we have no collation elements, and the character set
2830 is single byte, the single byte character set that we
2831 build below suffices. */
2832 if (nrules > 0 || dfa->mb_cur_max > 1)
2834 /* Check the space of the arrays. */
2835 if (BE (*range_alloc == mbcset->nranges, 0))
2837 /* There is not enough space, need realloc. */
2838 uint32_t *new_array_start;
2839 uint32_t *new_array_end;
2840 int new_nranges;
2842 /* +1 in case of mbcset->nranges is 0. */
2843 new_nranges = 2 * mbcset->nranges + 1;
2844 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2845 new_nranges);
2846 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2847 new_nranges);
2849 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2850 return REG_ESPACE;
2852 mbcset->range_starts = new_array_start;
2853 mbcset->range_ends = new_array_end;
2854 *range_alloc = new_nranges;
2857 mbcset->range_starts[mbcset->nranges] = start_collseq;
2858 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2861 /* Build the table for single byte characters. */
2862 for (ch = 0; ch < SBC_MAX; ch++)
2864 uint32_t ch_collseq;
2866 if (MB_CUR_MAX == 1)
2868 if (nrules == 0)
2869 ch_collseq = collseqmb[ch];
2870 else
2871 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2872 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2873 bitset_set (sbcset, ch);
2875 return REG_NOERROR;
2878 /* Local function for parse_bracket_exp used in glibc.
2879 Build the collating element which is represented by NAME.
2880 The result are written to MBCSET and SBCSET.
2881 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2882 pointer argument sinse we may update it. */
2884 auto __inline__ reg_errcode_t
2885 __attribute ((always_inline))
2886 build_collating_symbol (re_charset_t *mbcset,
2887 int *coll_sym_alloc,
2888 bitset_t sbcset,
2889 const unsigned char *name)
2891 int32_t elem, idx;
2892 size_t name_len = strlen ((const char *) name);
2893 if (nrules != 0)
2895 elem = seek_collating_symbol_entry (name, name_len);
2896 if (symb_table[2 * elem] != 0)
2898 /* We found the entry. */
2899 idx = symb_table[2 * elem + 1];
2900 /* Skip the name of collating element name. */
2901 idx += 1 + extra[idx];
2903 else if (symb_table[2 * elem] == 0 && name_len == 1)
2905 /* No valid character, treat it as a normal
2906 character. */
2907 bitset_set (sbcset, name[0]);
2908 return REG_NOERROR;
2910 else
2911 return REG_ECOLLATE;
2913 /* Got valid collation sequence, add it as a new entry. */
2914 /* Check the space of the arrays. */
2915 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2917 /* Not enough, realloc it. */
2918 /* +1 in case of mbcset->ncoll_syms is 0. */
2919 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2920 /* Use realloc since mbcset->coll_syms is NULL
2921 if *alloc == 0. */
2922 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2923 new_coll_sym_alloc);
2924 if (BE (new_coll_syms == NULL, 0))
2925 return REG_ESPACE;
2926 mbcset->coll_syms = new_coll_syms;
2927 *coll_sym_alloc = new_coll_sym_alloc;
2929 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2930 return REG_NOERROR;
2932 else
2934 if (BE (name_len != 1, 0))
2935 return REG_ECOLLATE;
2936 else
2938 bitset_set (sbcset, name[0]);
2939 return REG_NOERROR;
2943 #endif
2945 re_token_t br_token;
2946 re_bitset_ptr_t sbcset;
2947 #ifdef RE_ENABLE_I18N
2948 re_charset_t *mbcset;
2949 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2950 int equiv_class_alloc = 0, char_class_alloc = 0;
2951 #endif
2952 int non_match = 0;
2953 bin_tree_t *work_tree;
2954 int token_len;
2955 int first_round = 1;
2956 #if 0
2957 collseqmb = (const unsigned char *)
2958 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2959 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2960 if (nrules)
2963 if (MB_CUR_MAX > 1)
2965 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2966 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2967 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2968 _NL_COLLATE_SYMB_TABLEMB);
2969 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2970 _NL_COLLATE_SYMB_EXTRAMB);
2972 #endif
2973 sbcset = calloc (sizeof (bitset_t), 1);
2974 #ifdef RE_ENABLE_I18N
2975 mbcset = calloc (sizeof (re_charset_t), 1);
2976 #endif
2977 #ifdef RE_ENABLE_I18N
2978 if (BE (sbcset == NULL || mbcset == NULL, 0))
2979 #else
2980 if (BE (sbcset == NULL, 0))
2981 #endif
2983 *err = REG_ESPACE;
2984 return NULL;
2987 token_len = peek_token_bracket (token, regexp, syntax);
2988 if (BE (token->type == END_OF_RE, 0))
2990 *err = REG_BADPAT;
2991 goto parse_bracket_exp_free_return;
2993 if (token->type == OP_NON_MATCH_LIST)
2995 #ifdef RE_ENABLE_I18N
2996 mbcset->non_match = 1;
2997 #endif
2998 non_match = 1;
2999 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3000 bitset_set (sbcset, '\0');
3001 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3002 token_len = peek_token_bracket (token, regexp, syntax);
3003 if (BE (token->type == END_OF_RE, 0))
3005 *err = REG_BADPAT;
3006 goto parse_bracket_exp_free_return;
3010 /* We treat the first ']' as a normal character. */
3011 if (token->type == OP_CLOSE_BRACKET)
3012 token->type = CHARACTER;
3014 while (1)
3016 bracket_elem_t start_elem, end_elem;
3017 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3018 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3019 reg_errcode_t ret;
3020 int token_len2 = 0, is_range_exp = 0;
3021 re_token_t token2;
3023 start_elem.opr.name = start_name_buf;
3024 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3025 syntax, first_round);
3026 if (BE (ret != REG_NOERROR, 0))
3028 *err = ret;
3029 goto parse_bracket_exp_free_return;
3031 first_round = 0;
3033 /* Get information about the next token. We need it in any case. */
3034 token_len = peek_token_bracket (token, regexp, syntax);
3036 /* Do not check for ranges if we know they are not allowed. */
3037 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3039 if (BE (token->type == END_OF_RE, 0))
3041 *err = REG_EBRACK;
3042 goto parse_bracket_exp_free_return;
3044 if (token->type == OP_CHARSET_RANGE)
3046 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3047 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3048 if (BE (token2.type == END_OF_RE, 0))
3050 *err = REG_EBRACK;
3051 goto parse_bracket_exp_free_return;
3053 if (token2.type == OP_CLOSE_BRACKET)
3055 /* We treat the last '-' as a normal character. */
3056 re_string_skip_bytes (regexp, -token_len);
3057 token->type = CHARACTER;
3059 else
3060 is_range_exp = 1;
3064 if (is_range_exp == 1)
3066 end_elem.opr.name = end_name_buf;
3067 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3068 dfa, syntax, 1);
3069 if (BE (ret != REG_NOERROR, 0))
3071 *err = ret;
3072 goto parse_bracket_exp_free_return;
3075 token_len = peek_token_bracket (token, regexp, syntax);
3077 #if 0
3078 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3079 &start_elem, &end_elem);
3080 #else
3081 # ifdef RE_ENABLE_I18N
3082 *err = build_range_exp (sbcset,
3083 dfa->mb_cur_max > 1 ? mbcset : NULL,
3084 &range_alloc, &start_elem, &end_elem);
3085 # else
3086 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3087 # endif
3088 #endif
3089 if (BE (*err != REG_NOERROR, 0))
3090 goto parse_bracket_exp_free_return;
3092 else
3094 switch (start_elem.type)
3096 case SB_CHAR:
3097 bitset_set (sbcset, start_elem.opr.ch);
3098 break;
3099 #ifdef RE_ENABLE_I18N
3100 case MB_CHAR:
3101 /* Check whether the array has enough space. */
3102 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3104 wchar_t *new_mbchars;
3105 /* Not enough, realloc it. */
3106 /* +1 in case of mbcset->nmbchars is 0. */
3107 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3108 /* Use realloc since array is NULL if *alloc == 0. */
3109 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3110 mbchar_alloc);
3111 if (BE (new_mbchars == NULL, 0))
3112 goto parse_bracket_exp_espace;
3113 mbcset->mbchars = new_mbchars;
3115 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3116 break;
3117 #endif /* RE_ENABLE_I18N */
3118 case EQUIV_CLASS:
3119 *err = build_equiv_class (sbcset,
3120 #ifdef RE_ENABLE_I18N
3121 mbcset, &equiv_class_alloc,
3122 #endif
3123 start_elem.opr.name);
3124 if (BE (*err != REG_NOERROR, 0))
3125 goto parse_bracket_exp_free_return;
3126 break;
3127 case COLL_SYM:
3128 *err = build_collating_symbol (sbcset,
3129 #ifdef RE_ENABLE_I18N
3130 mbcset, &coll_sym_alloc,
3131 #endif
3132 start_elem.opr.name);
3133 if (BE (*err != REG_NOERROR, 0))
3134 goto parse_bracket_exp_free_return;
3135 break;
3136 case CHAR_CLASS:
3137 *err = build_charclass (regexp->trans, sbcset,
3138 #ifdef RE_ENABLE_I18N
3139 mbcset, &char_class_alloc,
3140 #endif
3141 start_elem.opr.name, syntax);
3142 if (BE (*err != REG_NOERROR, 0))
3143 goto parse_bracket_exp_free_return;
3144 break;
3145 default:
3146 assert (0);
3147 break;
3150 if (BE (token->type == END_OF_RE, 0))
3152 *err = REG_EBRACK;
3153 goto parse_bracket_exp_free_return;
3155 if (token->type == OP_CLOSE_BRACKET)
3156 break;
3159 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3161 /* If it is non-matching list. */
3162 if (non_match)
3163 bitset_not (sbcset);
3165 #ifdef RE_ENABLE_I18N
3166 /* Ensure only single byte characters are set. */
3167 if (dfa->mb_cur_max > 1)
3168 bitset_mask (sbcset, dfa->sb_char);
3170 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3171 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3172 || mbcset->non_match)))
3174 bin_tree_t *mbc_tree;
3175 int sbc_idx;
3176 /* Build a tree for complex bracket. */
3177 dfa->has_mb_node = 1;
3178 br_token.type = COMPLEX_BRACKET;
3179 br_token.opr.mbcset = mbcset;
3180 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3181 if (BE (mbc_tree == NULL, 0))
3182 goto parse_bracket_exp_espace;
3183 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3184 if (sbcset[sbc_idx])
3185 break;
3186 /* If there are no bits set in sbcset, there is no point
3187 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3188 if (sbc_idx < BITSET_WORDS)
3190 /* Build a tree for simple bracket. */
3191 br_token.type = SIMPLE_BRACKET;
3192 br_token.opr.sbcset = sbcset;
3193 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3194 if (BE (work_tree == NULL, 0))
3195 goto parse_bracket_exp_espace;
3197 /* Then join them by ALT node. */
3198 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3199 if (BE (work_tree == NULL, 0))
3200 goto parse_bracket_exp_espace;
3202 else
3204 re_free (sbcset);
3205 work_tree = mbc_tree;
3208 else
3209 #endif /* not RE_ENABLE_I18N */
3211 #ifdef RE_ENABLE_I18N
3212 free_charset (mbcset);
3213 #endif
3214 /* Build a tree for simple bracket. */
3215 br_token.type = SIMPLE_BRACKET;
3216 br_token.opr.sbcset = sbcset;
3217 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3218 if (BE (work_tree == NULL, 0))
3219 goto parse_bracket_exp_espace;
3221 return work_tree;
3223 parse_bracket_exp_espace:
3224 *err = REG_ESPACE;
3225 parse_bracket_exp_free_return:
3226 re_free (sbcset);
3227 #ifdef RE_ENABLE_I18N
3228 free_charset (mbcset);
3229 #endif
3230 return NULL;
3233 /* Parse an element in the bracket expression. */
3235 static reg_errcode_t
3236 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3237 re_token_t *token, int token_len, re_dfa_t *dfa,
3238 reg_syntax_t syntax, int accept_hyphen)
3240 #ifdef RE_ENABLE_I18N
3241 int cur_char_size;
3242 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3243 if (cur_char_size > 1)
3245 elem->type = MB_CHAR;
3246 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3247 re_string_skip_bytes (regexp, cur_char_size);
3248 return REG_NOERROR;
3250 #endif /* RE_ENABLE_I18N */
3251 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3252 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3253 || token->type == OP_OPEN_EQUIV_CLASS)
3254 return parse_bracket_symbol (elem, regexp, token);
3255 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3257 /* A '-' must only appear as anything but a range indicator before
3258 the closing bracket. Everything else is an error. */
3259 re_token_t token2;
3260 (void) peek_token_bracket (&token2, regexp, syntax);
3261 if (token2.type != OP_CLOSE_BRACKET)
3262 /* The actual error value is not standardized since this whole
3263 case is undefined. But ERANGE makes good sense. */
3264 return REG_ERANGE;
3266 elem->type = SB_CHAR;
3267 elem->opr.ch = token->opr.c;
3268 return REG_NOERROR;
3271 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3272 such as [:<character_class>:], [.<collating_element>.], and
3273 [=<equivalent_class>=]. */
3275 static reg_errcode_t
3276 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3277 re_token_t *token)
3279 unsigned char ch, delim = token->opr.c;
3280 int i = 0;
3281 if (re_string_eoi(regexp))
3282 return REG_EBRACK;
3283 for (;; ++i)
3285 if (i >= BRACKET_NAME_BUF_SIZE)
3286 return REG_EBRACK;
3287 if (token->type == OP_OPEN_CHAR_CLASS)
3288 ch = re_string_fetch_byte_case (regexp);
3289 else
3290 ch = re_string_fetch_byte (regexp);
3291 if (re_string_eoi(regexp))
3292 return REG_EBRACK;
3293 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3294 break;
3295 elem->opr.name[i] = ch;
3297 re_string_skip_bytes (regexp, 1);
3298 elem->opr.name[i] = '\0';
3299 switch (token->type)
3301 case OP_OPEN_COLL_ELEM:
3302 elem->type = COLL_SYM;
3303 break;
3304 case OP_OPEN_EQUIV_CLASS:
3305 elem->type = EQUIV_CLASS;
3306 break;
3307 case OP_OPEN_CHAR_CLASS:
3308 elem->type = CHAR_CLASS;
3309 break;
3310 default:
3311 break;
3313 return REG_NOERROR;
3316 /* Helper function for parse_bracket_exp.
3317 Build the equivalence class which is represented by NAME.
3318 The result are written to MBCSET and SBCSET.
3319 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3320 is a pointer argument sinse we may update it. */
3322 static reg_errcode_t
3323 #ifdef RE_ENABLE_I18N
3324 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3325 int *equiv_class_alloc, const unsigned char *name)
3326 #else
3327 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3328 #endif
3330 #if 0
3331 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3332 if (nrules != 0)
3334 const int32_t *table, *indirect;
3335 const unsigned char *weights, *extra, *cp;
3336 unsigned char char_buf[2];
3337 int32_t idx1, idx2;
3338 unsigned int ch;
3339 size_t len;
3340 /* This #include defines a local function! */
3341 # include <locale/weight.h>
3342 /* Calculate the index for equivalence class. */
3343 cp = name;
3344 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3345 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3346 _NL_COLLATE_WEIGHTMB);
3347 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3348 _NL_COLLATE_EXTRAMB);
3349 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3350 _NL_COLLATE_INDIRECTMB);
3351 idx1 = findidx (&cp);
3352 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3353 /* This isn't a valid character. */
3354 return REG_ECOLLATE;
3356 /* Build single byte matcing table for this equivalence class. */
3357 char_buf[1] = (unsigned char) '\0';
3358 len = weights[idx1];
3359 for (ch = 0; ch < SBC_MAX; ++ch)
3361 char_buf[0] = ch;
3362 cp = char_buf;
3363 idx2 = findidx (&cp);
3365 idx2 = table[ch];
3367 if (idx2 == 0)
3368 /* This isn't a valid character. */
3369 continue;
3370 if (len == weights[idx2])
3372 int cnt = 0;
3373 while (cnt <= len &&
3374 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3375 ++cnt;
3377 if (cnt > len)
3378 bitset_set (sbcset, ch);
3381 /* Check whether the array has enough space. */
3382 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3384 /* Not enough, realloc it. */
3385 /* +1 in case of mbcset->nequiv_classes is 0. */
3386 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3387 /* Use realloc since the array is NULL if *alloc == 0. */
3388 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3389 int32_t,
3390 new_equiv_class_alloc);
3391 if (BE (new_equiv_classes == NULL, 0))
3392 return REG_ESPACE;
3393 mbcset->equiv_classes = new_equiv_classes;
3394 *equiv_class_alloc = new_equiv_class_alloc;
3396 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3398 else
3399 #endif
3401 if (BE (strlen ((const char *) name) != 1, 0))
3402 return REG_ECOLLATE;
3403 bitset_set (sbcset, *name);
3405 return REG_NOERROR;
3408 /* Helper function for parse_bracket_exp.
3409 Build the character class which is represented by NAME.
3410 The result are written to MBCSET and SBCSET.
3411 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3412 is a pointer argument sinse we may update it. */
3414 static reg_errcode_t
3415 #ifdef RE_ENABLE_I18N
3416 build_charclass (__RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3417 re_charset_t *mbcset, int *char_class_alloc,
3418 const unsigned char *class_name, reg_syntax_t syntax)
3419 #else
3420 build_charclass (__RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3421 const unsigned char *class_name, reg_syntax_t syntax)
3422 #endif
3424 int i;
3425 const char *name = (const char *) class_name;
3427 /* In case of REG_ICASE "upper" and "lower" match the both of
3428 upper and lower cases. */
3429 if ((syntax & RE_ICASE)
3430 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3431 name = "alpha";
3433 #ifdef RE_ENABLE_I18N
3434 /* Check the space of the arrays. */
3435 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3437 /* Not enough, realloc it. */
3438 /* +1 in case of mbcset->nchar_classes is 0. */
3439 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3440 /* Use realloc since array is NULL if *alloc == 0. */
3441 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3442 new_char_class_alloc);
3443 if (BE (new_char_classes == NULL, 0))
3444 return REG_ESPACE;
3445 mbcset->char_classes = new_char_classes;
3446 *char_class_alloc = new_char_class_alloc;
3448 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3449 #endif /* RE_ENABLE_I18N */
3451 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3452 do { \
3453 if (BE (trans != NULL, 0)) \
3455 for (i = 0; i < SBC_MAX; ++i) \
3456 if (ctype_func (i)) \
3457 bitset_set (sbcset, trans[i]); \
3459 else \
3461 for (i = 0; i < SBC_MAX; ++i) \
3462 if (ctype_func (i)) \
3463 bitset_set (sbcset, i); \
3465 } while (0)
3467 if (strcmp (name, "alnum") == 0)
3468 BUILD_CHARCLASS_LOOP (isalnum);
3469 else if (strcmp (name, "cntrl") == 0)
3470 BUILD_CHARCLASS_LOOP (iscntrl);
3471 else if (strcmp (name, "lower") == 0)
3472 BUILD_CHARCLASS_LOOP (islower);
3473 else if (strcmp (name, "space") == 0)
3474 BUILD_CHARCLASS_LOOP (isspace);
3475 else if (strcmp (name, "alpha") == 0)
3476 BUILD_CHARCLASS_LOOP (isalpha);
3477 else if (strcmp (name, "digit") == 0)
3478 BUILD_CHARCLASS_LOOP (isdigit);
3479 else if (strcmp (name, "print") == 0)
3480 BUILD_CHARCLASS_LOOP (isprint);
3481 else if (strcmp (name, "upper") == 0)
3482 BUILD_CHARCLASS_LOOP (isupper);
3483 else if (strcmp (name, "blank") == 0)
3484 BUILD_CHARCLASS_LOOP (isblank);
3485 else if (strcmp (name, "graph") == 0)
3486 BUILD_CHARCLASS_LOOP (isgraph);
3487 else if (strcmp (name, "punct") == 0)
3488 BUILD_CHARCLASS_LOOP (ispunct);
3489 else if (strcmp (name, "xdigit") == 0)
3490 BUILD_CHARCLASS_LOOP (isxdigit);
3491 else
3492 return REG_ECTYPE;
3494 return REG_NOERROR;
3497 static bin_tree_t *
3498 build_charclass_op (re_dfa_t *dfa, __RE_TRANSLATE_TYPE trans,
3499 const unsigned char *class_name,
3500 const unsigned char *extra, int non_match,
3501 reg_errcode_t *err)
3503 re_bitset_ptr_t sbcset;
3504 #ifdef RE_ENABLE_I18N
3505 re_charset_t *mbcset;
3506 int alloc = 0;
3507 #endif
3508 reg_errcode_t ret;
3509 re_token_t br_token;
3510 bin_tree_t *tree;
3512 sbcset = calloc (sizeof (bitset_t), 1);
3513 #ifdef RE_ENABLE_I18N
3514 mbcset = calloc (sizeof (re_charset_t), 1);
3515 #endif
3517 #ifdef RE_ENABLE_I18N
3518 if (BE (sbcset == NULL || mbcset == NULL, 0))
3519 #else
3520 if (BE (sbcset == NULL, 0))
3521 #endif
3523 *err = REG_ESPACE;
3524 return NULL;
3527 if (non_match)
3529 #ifdef RE_ENABLE_I18N
3531 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3532 bitset_set(cset->sbcset, '\0');
3534 mbcset->non_match = 1;
3535 #endif
3538 /* We don't care the syntax in this case. */
3539 ret = build_charclass (trans, sbcset,
3540 #ifdef RE_ENABLE_I18N
3541 mbcset, &alloc,
3542 #endif
3543 class_name, 0);
3545 if (BE (ret != REG_NOERROR, 0))
3547 re_free (sbcset);
3548 #ifdef RE_ENABLE_I18N
3549 free_charset (mbcset);
3550 #endif
3551 *err = ret;
3552 return NULL;
3554 /* \w match '_' also. */
3555 for (; *extra; extra++)
3556 bitset_set (sbcset, *extra);
3558 /* If it is non-matching list. */
3559 if (non_match)
3560 bitset_not (sbcset);
3562 #ifdef RE_ENABLE_I18N
3563 /* Ensure only single byte characters are set. */
3564 if (dfa->mb_cur_max > 1)
3565 bitset_mask (sbcset, dfa->sb_char);
3566 #endif
3568 /* Build a tree for simple bracket. */
3569 br_token.type = SIMPLE_BRACKET;
3570 br_token.opr.sbcset = sbcset;
3571 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3572 if (BE (tree == NULL, 0))
3573 goto build_word_op_espace;
3575 #ifdef RE_ENABLE_I18N
3576 if (dfa->mb_cur_max > 1)
3578 bin_tree_t *mbc_tree;
3579 /* Build a tree for complex bracket. */
3580 br_token.type = COMPLEX_BRACKET;
3581 br_token.opr.mbcset = mbcset;
3582 dfa->has_mb_node = 1;
3583 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3584 if (BE (mbc_tree == NULL, 0))
3585 goto build_word_op_espace;
3586 /* Then join them by ALT node. */
3587 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3588 if (BE (mbc_tree != NULL, 1))
3589 return tree;
3591 else
3593 free_charset (mbcset);
3594 return tree;
3596 #else /* not RE_ENABLE_I18N */
3597 return tree;
3598 #endif
3600 build_word_op_espace:
3601 re_free (sbcset);
3602 #ifdef RE_ENABLE_I18N
3603 free_charset (mbcset);
3604 #endif
3605 *err = REG_ESPACE;
3606 return NULL;
3609 /* This is intended for the expressions like "a{1,3}".
3610 Fetch a number from `input', and return the number.
3611 Return -1, if the number field is empty like "{,1}".
3612 Return -2, If an error is occured. */
3614 static int
3615 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3617 int num = -1;
3618 unsigned char c;
3619 while (1)
3621 fetch_token (token, input, syntax);
3622 c = token->opr.c;
3623 if (BE (token->type == END_OF_RE, 0))
3624 return -2;
3625 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3626 break;
3627 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3628 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3629 num = (num > RE_DUP_MAX) ? -2 : num;
3631 return num;
3634 #ifdef RE_ENABLE_I18N
3635 static void
3636 free_charset (re_charset_t *cset)
3638 re_free (cset->mbchars);
3639 # if 0
3640 re_free (cset->coll_syms);
3641 re_free (cset->equiv_classes);
3642 re_free (cset->range_starts);
3643 re_free (cset->range_ends);
3644 # endif
3645 re_free (cset->char_classes);
3646 re_free (cset);
3648 #endif /* RE_ENABLE_I18N */
3650 /* Functions for binary tree operation. */
3652 /* Create a tree node. */
3654 static bin_tree_t *
3655 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3656 re_token_type_t type)
3658 re_token_t t;
3659 t.type = type;
3660 return create_token_tree (dfa, left, right, &t);
3663 static bin_tree_t *
3664 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3665 const re_token_t *token)
3667 bin_tree_t *tree;
3668 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3670 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3672 if (storage == NULL)
3673 return NULL;
3674 storage->next = dfa->str_tree_storage;
3675 dfa->str_tree_storage = storage;
3676 dfa->str_tree_storage_idx = 0;
3678 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3680 tree->parent = NULL;
3681 tree->left = left;
3682 tree->right = right;
3683 tree->token = *token;
3684 tree->token.duplicated = 0;
3685 tree->token.opt_subexp = 0;
3686 tree->first = NULL;
3687 tree->next = NULL;
3688 tree->node_idx = -1;
3690 if (left != NULL)
3691 left->parent = tree;
3692 if (right != NULL)
3693 right->parent = tree;
3694 return tree;
3697 /* Mark the tree SRC as an optional subexpression.
3698 To be called from preorder or postorder. */
3700 static reg_errcode_t
3701 mark_opt_subexp (void *extra, bin_tree_t *node)
3703 int idx = (int) (long) extra;
3704 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3705 node->token.opt_subexp = 1;
3707 return REG_NOERROR;
3710 /* Free the allocated memory inside NODE. */
3712 static void
3713 free_token (re_token_t *node)
3715 #ifdef RE_ENABLE_I18N
3716 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3717 free_charset (node->opr.mbcset);
3718 else
3719 #endif
3720 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3721 re_free (node->opr.sbcset);
3724 /* Worker function for tree walking. Free the allocated memory inside NODE
3725 and its children. */
3727 static reg_errcode_t
3728 free_tree (void *extra, bin_tree_t *node)
3730 free_token (&node->token);
3731 return REG_NOERROR;
3735 /* Duplicate the node SRC, and return new node. This is a preorder
3736 visit similar to the one implemented by the generic visitor, but
3737 we need more infrastructure to maintain two parallel trees --- so,
3738 it's easier to duplicate. */
3740 static bin_tree_t *
3741 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3743 const bin_tree_t *node;
3744 bin_tree_t *dup_root;
3745 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3747 for (node = root; ; )
3749 /* Create a new tree and link it back to the current parent. */
3750 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3751 if (*p_new == NULL)
3752 return NULL;
3753 (*p_new)->parent = dup_node;
3754 (*p_new)->token.duplicated = 1;
3755 dup_node = *p_new;
3757 /* Go to the left node, or up and to the right. */
3758 if (node->left)
3760 node = node->left;
3761 p_new = &dup_node->left;
3763 else
3765 const bin_tree_t *prev = NULL;
3766 while (node->right == prev || node->right == NULL)
3768 prev = node;
3769 node = node->parent;
3770 dup_node = dup_node->parent;
3771 if (!node)
3772 return dup_root;
3774 node = node->right;
3775 p_new = &dup_node->right;