Update.
[glibc.git] / posix / regcomp.c
blobbc9e56bd02663908910f463019ea98bb3a5ec69a
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 int length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27 static void init_word_char (re_dfa_t *dfa);
28 #ifdef RE_ENABLE_I18N
29 static void free_charset (re_charset_t *cset);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 #ifdef RE_ENABLE_I18N
34 static void optimize_utf8 (re_dfa_t *dfa);
35 #endif
36 static reg_errcode_t analyze (re_dfa_t *dfa);
37 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
38 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
39 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
40 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
41 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
42 int top_clone_node, int root_node,
43 unsigned int constraint);
44 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
45 unsigned int constraint);
46 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
47 unsigned int constraint);
48 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
49 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
50 int node, int root);
51 static void calc_inveclosure (re_dfa_t *dfa);
52 static int fetch_number (re_string_t *input, re_token_t *token,
53 reg_syntax_t syntax);
54 static void fetch_token (re_token_t *result, re_string_t *input,
55 reg_syntax_t syntax);
56 static int peek_token (re_token_t *token, re_string_t *input,
57 reg_syntax_t syntax);
58 static int peek_token_bracket (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax);
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
84 reg_syntax_t syntax,
85 int accept_hyphen);
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 re_string_t *regexp,
88 re_token_t *token);
89 #ifndef _LIBC
90 # ifdef RE_ENABLE_I18N
91 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
92 re_charset_t *mbcset, int *range_alloc,
93 bracket_elem_t *start_elem,
94 bracket_elem_t *end_elem);
95 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
96 re_charset_t *mbcset,
97 int *coll_sym_alloc,
98 const unsigned char *name);
99 # else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
101 bracket_elem_t *start_elem,
102 bracket_elem_t *end_elem);
103 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
104 const unsigned char *name);
105 # endif /* not RE_ENABLE_I18N */
106 #endif /* not _LIBC */
107 #ifdef RE_ENABLE_I18N
108 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
109 re_charset_t *mbcset,
110 int *equiv_class_alloc,
111 const unsigned char *name);
112 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
113 re_bitset_ptr_t sbcset,
114 re_charset_t *mbcset,
115 int *char_class_alloc,
116 const unsigned char *class_name,
117 reg_syntax_t syntax);
118 #else /* not RE_ENABLE_I18N */
119 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
120 const unsigned char *name);
121 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
122 re_bitset_ptr_t sbcset,
123 const unsigned char *class_name,
124 reg_syntax_t syntax);
125 #endif /* not RE_ENABLE_I18N */
126 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
127 unsigned RE_TRANSLATE_TYPE trans,
128 const unsigned char *class_name,
129 const unsigned char *extra,
130 int non_match, reg_errcode_t *err);
131 static bin_tree_t *create_tree (re_dfa_t *dfa,
132 bin_tree_t *left, bin_tree_t *right,
133 re_token_type_t type, int index);
134 static bin_tree_t *re_dfa_add_tree_node (re_dfa_t *dfa,
135 bin_tree_t *left, bin_tree_t *right,
136 const re_token_t *token)
137 __attribute ((noinline));
138 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
139 static void mark_opt_subexp (const bin_tree_t *src, re_dfa_t *dfa);
140 static void mark_opt_subexp_iter (const bin_tree_t *src, re_dfa_t *dfa, int idx);
142 /* This table gives an error message for each of the error codes listed
143 in regex.h. Obviously the order here has to be same as there.
144 POSIX doesn't require that we do anything for REG_NOERROR,
145 but why not be nice? */
147 const char __re_error_msgid[] attribute_hidden =
149 #define REG_NOERROR_IDX 0
150 gettext_noop ("Success") /* REG_NOERROR */
151 "\0"
152 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
153 gettext_noop ("No match") /* REG_NOMATCH */
154 "\0"
155 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
156 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
157 "\0"
158 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
159 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
160 "\0"
161 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
162 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
163 "\0"
164 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
165 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
166 "\0"
167 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
168 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
169 "\0"
170 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
171 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
172 "\0"
173 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
174 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
175 "\0"
176 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
177 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
178 "\0"
179 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
180 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
181 "\0"
182 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
183 gettext_noop ("Invalid range end") /* REG_ERANGE */
184 "\0"
185 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
186 gettext_noop ("Memory exhausted") /* REG_ESPACE */
187 "\0"
188 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
189 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
190 "\0"
191 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
192 gettext_noop ("Premature end of regular expression") /* REG_EEND */
193 "\0"
194 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
195 gettext_noop ("Regular expression too big") /* REG_ESIZE */
196 "\0"
197 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
198 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
201 const size_t __re_error_msgid_idx[] attribute_hidden =
203 REG_NOERROR_IDX,
204 REG_NOMATCH_IDX,
205 REG_BADPAT_IDX,
206 REG_ECOLLATE_IDX,
207 REG_ECTYPE_IDX,
208 REG_EESCAPE_IDX,
209 REG_ESUBREG_IDX,
210 REG_EBRACK_IDX,
211 REG_EPAREN_IDX,
212 REG_EBRACE_IDX,
213 REG_BADBR_IDX,
214 REG_ERANGE_IDX,
215 REG_ESPACE_IDX,
216 REG_BADRPT_IDX,
217 REG_EEND_IDX,
218 REG_ESIZE_IDX,
219 REG_ERPAREN_IDX
222 /* Entry points for GNU code. */
224 /* re_compile_pattern is the GNU regular expression compiler: it
225 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
226 Returns 0 if the pattern was valid, otherwise an error string.
228 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
229 are set in BUFP on entry. */
231 const char *
232 re_compile_pattern (pattern, length, bufp)
233 const char *pattern;
234 size_t length;
235 struct re_pattern_buffer *bufp;
237 reg_errcode_t ret;
239 /* And GNU code determines whether or not to get register information
240 by passing null for the REGS argument to re_match, etc., not by
241 setting no_sub. */
242 bufp->no_sub = 0;
244 /* Match anchors at newline. */
245 bufp->newline_anchor = 1;
247 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
249 if (!ret)
250 return NULL;
251 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
253 #ifdef _LIBC
254 weak_alias (__re_compile_pattern, re_compile_pattern)
255 #endif
257 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
258 also be assigned to arbitrarily: each pattern buffer stores its own
259 syntax, so it can be changed between regex compilations. */
260 /* This has no initializer because initialized variables in Emacs
261 become read-only after dumping. */
262 reg_syntax_t re_syntax_options;
265 /* Specify the precise syntax of regexps for compilation. This provides
266 for compatibility for various utilities which historically have
267 different, incompatible syntaxes.
269 The argument SYNTAX is a bit mask comprised of the various bits
270 defined in regex.h. We return the old syntax. */
272 reg_syntax_t
273 re_set_syntax (syntax)
274 reg_syntax_t syntax;
276 reg_syntax_t ret = re_syntax_options;
278 re_syntax_options = syntax;
279 return ret;
281 #ifdef _LIBC
282 weak_alias (__re_set_syntax, re_set_syntax)
283 #endif
286 re_compile_fastmap (bufp)
287 struct re_pattern_buffer *bufp;
289 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
290 char *fastmap = bufp->fastmap;
292 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
293 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
294 if (dfa->init_state != dfa->init_state_word)
295 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
296 if (dfa->init_state != dfa->init_state_nl)
297 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
298 if (dfa->init_state != dfa->init_state_begbuf)
299 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
300 bufp->fastmap_accurate = 1;
301 return 0;
303 #ifdef _LIBC
304 weak_alias (__re_compile_fastmap, re_compile_fastmap)
305 #endif
307 static inline void
308 __attribute ((always_inline))
309 re_set_fastmap (char *fastmap, int icase, int ch)
311 fastmap[ch] = 1;
312 if (icase)
313 fastmap[tolower (ch)] = 1;
316 /* Helper function for re_compile_fastmap.
317 Compile fastmap for the initial_state INIT_STATE. */
319 static void
320 re_compile_fastmap_iter (bufp, init_state, fastmap)
321 regex_t *bufp;
322 const re_dfastate_t *init_state;
323 char *fastmap;
325 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
326 int node_cnt;
327 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
328 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
330 int node = init_state->nodes.elems[node_cnt];
331 re_token_type_t type = dfa->nodes[node].type;
333 if (type == CHARACTER)
335 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
336 #ifdef RE_ENABLE_I18N
337 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
339 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
340 wchar_t wc;
341 mbstate_t state;
343 p = buf;
344 *p++ = dfa->nodes[node].opr.c;
345 while (++node < dfa->nodes_len
346 && dfa->nodes[node].type == CHARACTER
347 && dfa->nodes[node].mb_partial)
348 *p++ = dfa->nodes[node].opr.c;
349 memset (&state, 0, sizeof (state));
350 if (mbrtowc (&wc, (const char *) buf, p - buf,
351 &state) == p - buf
352 && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
353 re_set_fastmap (fastmap, 0, buf[0]);
355 #endif
357 else if (type == SIMPLE_BRACKET)
359 int i, j, ch;
360 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
361 for (j = 0; j < UINT_BITS; ++j, ++ch)
362 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
363 re_set_fastmap (fastmap, icase, ch);
365 #ifdef RE_ENABLE_I18N
366 else if (type == COMPLEX_BRACKET)
368 int i;
369 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
370 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
371 || cset->nranges || cset->nchar_classes)
373 # ifdef _LIBC
374 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
376 /* In this case we want to catch the bytes which are
377 the first byte of any collation elements.
378 e.g. In da_DK, we want to catch 'a' since "aa"
379 is a valid collation element, and don't catch
380 'b' since 'b' is the only collation element
381 which starts from 'b'. */
382 int j, ch;
383 const int32_t *table = (const int32_t *)
384 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
385 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
386 for (j = 0; j < UINT_BITS; ++j, ++ch)
387 if (table[ch] < 0)
388 re_set_fastmap (fastmap, icase, ch);
390 # else
391 if (dfa->mb_cur_max > 1)
392 for (i = 0; i < SBC_MAX; ++i)
393 if (__btowc (i) == WEOF)
394 re_set_fastmap (fastmap, icase, i);
395 # endif /* not _LIBC */
397 for (i = 0; i < cset->nmbchars; ++i)
399 char buf[256];
400 mbstate_t state;
401 memset (&state, '\0', sizeof (state));
402 __wcrtomb (buf, cset->mbchars[i], &state);
403 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
404 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
406 __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
407 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
411 #endif /* RE_ENABLE_I18N */
412 else if (type == OP_PERIOD
413 #ifdef RE_ENABLE_I18N
414 || type == OP_UTF8_PERIOD
415 #endif /* RE_ENABLE_I18N */
416 || type == END_OF_RE)
418 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
419 if (type == END_OF_RE)
420 bufp->can_be_null = 1;
421 return;
426 /* Entry point for POSIX code. */
427 /* regcomp takes a regular expression as a string and compiles it.
429 PREG is a regex_t *. We do not expect any fields to be initialized,
430 since POSIX says we shouldn't. Thus, we set
432 `buffer' to the compiled pattern;
433 `used' to the length of the compiled pattern;
434 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
435 REG_EXTENDED bit in CFLAGS is set; otherwise, to
436 RE_SYNTAX_POSIX_BASIC;
437 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
438 `fastmap' to an allocated space for the fastmap;
439 `fastmap_accurate' to zero;
440 `re_nsub' to the number of subexpressions in PATTERN.
442 PATTERN is the address of the pattern string.
444 CFLAGS is a series of bits which affect compilation.
446 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
447 use POSIX basic syntax.
449 If REG_NEWLINE is set, then . and [^...] don't match newline.
450 Also, regexec will try a match beginning after every newline.
452 If REG_ICASE is set, then we considers upper- and lowercase
453 versions of letters to be equivalent when matching.
455 If REG_NOSUB is set, then when PREG is passed to regexec, that
456 routine will report only success or failure, and nothing about the
457 registers.
459 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
460 the return codes and their meanings.) */
463 regcomp (preg, pattern, cflags)
464 regex_t *__restrict preg;
465 const char *__restrict pattern;
466 int cflags;
468 reg_errcode_t ret;
469 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
470 : RE_SYNTAX_POSIX_BASIC);
472 preg->buffer = NULL;
473 preg->allocated = 0;
474 preg->used = 0;
476 /* Try to allocate space for the fastmap. */
477 preg->fastmap = re_malloc (char, SBC_MAX);
478 if (BE (preg->fastmap == NULL, 0))
479 return REG_ESPACE;
481 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
483 /* If REG_NEWLINE is set, newlines are treated differently. */
484 if (cflags & REG_NEWLINE)
485 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
486 syntax &= ~RE_DOT_NEWLINE;
487 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
488 /* It also changes the matching behavior. */
489 preg->newline_anchor = 1;
491 else
492 preg->newline_anchor = 0;
493 preg->no_sub = !!(cflags & REG_NOSUB);
494 preg->translate = NULL;
496 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
498 /* POSIX doesn't distinguish between an unmatched open-group and an
499 unmatched close-group: both are REG_EPAREN. */
500 if (ret == REG_ERPAREN)
501 ret = REG_EPAREN;
503 /* We have already checked preg->fastmap != NULL. */
504 if (BE (ret == REG_NOERROR, 1))
505 /* Compute the fastmap now, since regexec cannot modify the pattern
506 buffer. This function never fails in this implementation. */
507 (void) re_compile_fastmap (preg);
508 else
510 /* Some error occurred while compiling the expression. */
511 re_free (preg->fastmap);
512 preg->fastmap = NULL;
515 return (int) ret;
517 #ifdef _LIBC
518 weak_alias (__regcomp, regcomp)
519 #endif
521 /* Returns a message corresponding to an error code, ERRCODE, returned
522 from either regcomp or regexec. We don't use PREG here. */
524 size_t
525 regerror (errcode, preg, errbuf, errbuf_size)
526 int errcode;
527 const regex_t *preg;
528 char *errbuf;
529 size_t errbuf_size;
531 const char *msg;
532 size_t msg_size;
534 if (BE (errcode < 0
535 || errcode >= (int) (sizeof (__re_error_msgid_idx)
536 / sizeof (__re_error_msgid_idx[0])), 0))
537 /* Only error codes returned by the rest of the code should be passed
538 to this routine. If we are given anything else, or if other regex
539 code generates an invalid error code, then the program has a bug.
540 Dump core so we can fix it. */
541 abort ();
543 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
545 msg_size = strlen (msg) + 1; /* Includes the null. */
547 if (BE (errbuf_size != 0, 1))
549 if (BE (msg_size > errbuf_size, 0))
551 #if defined HAVE_MEMPCPY || defined _LIBC
552 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
553 #else
554 memcpy (errbuf, msg, errbuf_size - 1);
555 errbuf[errbuf_size - 1] = 0;
556 #endif
558 else
559 memcpy (errbuf, msg, msg_size);
562 return msg_size;
564 #ifdef _LIBC
565 weak_alias (__regerror, regerror)
566 #endif
569 static void
570 free_dfa_content (re_dfa_t *dfa)
572 int i, j;
574 re_free (dfa->subexps);
576 if (dfa->nodes)
577 for (i = 0; i < dfa->nodes_len; ++i)
579 re_token_t *node = dfa->nodes + i;
580 #ifdef RE_ENABLE_I18N
581 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
582 free_charset (node->opr.mbcset);
583 else
584 #endif /* RE_ENABLE_I18N */
585 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
586 re_free (node->opr.sbcset);
588 re_free (dfa->nexts);
589 for (i = 0; i < dfa->nodes_len; ++i)
591 if (dfa->eclosures != NULL)
592 re_node_set_free (dfa->eclosures + i);
593 if (dfa->inveclosures != NULL)
594 re_node_set_free (dfa->inveclosures + i);
595 if (dfa->edests != NULL)
596 re_node_set_free (dfa->edests + i);
598 re_free (dfa->edests);
599 re_free (dfa->eclosures);
600 re_free (dfa->inveclosures);
601 re_free (dfa->nodes);
603 if (dfa->state_table)
604 for (i = 0; i <= dfa->state_hash_mask; ++i)
606 struct re_state_table_entry *entry = dfa->state_table + i;
607 for (j = 0; j < entry->num; ++j)
609 re_dfastate_t *state = entry->array[j];
610 free_state (state);
612 re_free (entry->array);
614 re_free (dfa->state_table);
615 #ifdef RE_ENABLE_I18N
616 re_free (dfa->sb_char);
617 #endif
618 #ifdef DEBUG
619 re_free (dfa->re_str);
620 #endif
622 re_free (dfa);
626 /* Free dynamically allocated space used by PREG. */
628 void
629 regfree (preg)
630 regex_t *preg;
632 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
633 if (BE (dfa != NULL, 1))
634 free_dfa_content (dfa);
635 preg->buffer = NULL;
636 preg->allocated = 0;
638 re_free (preg->fastmap);
639 preg->fastmap = NULL;
641 re_free (preg->translate);
642 preg->translate = NULL;
644 #ifdef _LIBC
645 weak_alias (__regfree, regfree)
646 #endif
648 /* Entry points compatible with 4.2 BSD regex library. We don't define
649 them unless specifically requested. */
651 #if defined _REGEX_RE_COMP || defined _LIBC
653 /* BSD has one and only one pattern buffer. */
654 static struct re_pattern_buffer re_comp_buf;
656 char *
657 # ifdef _LIBC
658 /* Make these definitions weak in libc, so POSIX programs can redefine
659 these names if they don't use our functions, and still use
660 regcomp/regexec above without link errors. */
661 weak_function
662 # endif
663 re_comp (s)
664 const char *s;
666 reg_errcode_t ret;
667 char *fastmap;
669 if (!s)
671 if (!re_comp_buf.buffer)
672 return gettext ("No previous regular expression");
673 return 0;
676 if (re_comp_buf.buffer)
678 fastmap = re_comp_buf.fastmap;
679 re_comp_buf.fastmap = NULL;
680 __regfree (&re_comp_buf);
681 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
682 re_comp_buf.fastmap = fastmap;
685 if (re_comp_buf.fastmap == NULL)
687 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
688 if (re_comp_buf.fastmap == NULL)
689 return (char *) gettext (__re_error_msgid
690 + __re_error_msgid_idx[(int) REG_ESPACE]);
693 /* Since `re_exec' always passes NULL for the `regs' argument, we
694 don't need to initialize the pattern buffer fields which affect it. */
696 /* Match anchors at newlines. */
697 re_comp_buf.newline_anchor = 1;
699 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
701 if (!ret)
702 return NULL;
704 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
705 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
708 #ifdef _LIBC
709 libc_freeres_fn (free_mem)
711 __regfree (&re_comp_buf);
713 #endif
715 #endif /* _REGEX_RE_COMP */
717 /* Internal entry point.
718 Compile the regular expression PATTERN, whose length is LENGTH.
719 SYNTAX indicate regular expression's syntax. */
721 static reg_errcode_t
722 re_compile_internal (preg, pattern, length, syntax)
723 regex_t *preg;
724 const char * pattern;
725 int length;
726 reg_syntax_t syntax;
728 reg_errcode_t err = REG_NOERROR;
729 re_dfa_t *dfa;
730 re_string_t regexp;
732 /* Initialize the pattern buffer. */
733 preg->fastmap_accurate = 0;
734 preg->syntax = syntax;
735 preg->not_bol = preg->not_eol = 0;
736 preg->used = 0;
737 preg->re_nsub = 0;
738 preg->can_be_null = 0;
739 preg->regs_allocated = REGS_UNALLOCATED;
741 /* Initialize the dfa. */
742 dfa = (re_dfa_t *) preg->buffer;
743 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
745 /* If zero allocated, but buffer is non-null, try to realloc
746 enough space. This loses if buffer's address is bogus, but
747 that is the user's responsibility. If ->buffer is NULL this
748 is a simple allocation. */
749 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
750 if (dfa == NULL)
751 return REG_ESPACE;
752 preg->allocated = sizeof (re_dfa_t);
753 preg->buffer = (unsigned char *) dfa;
755 preg->used = sizeof (re_dfa_t);
757 err = init_dfa (dfa, length);
758 if (BE (err != REG_NOERROR, 0))
760 free_dfa_content (dfa);
761 preg->buffer = NULL;
762 preg->allocated = 0;
763 return err;
765 #ifdef DEBUG
766 dfa->re_str = re_malloc (char, length + 1);
767 strncpy (dfa->re_str, pattern, length + 1);
768 #endif
770 err = re_string_construct (&regexp, pattern, length, preg->translate,
771 syntax & RE_ICASE, dfa);
772 if (BE (err != REG_NOERROR, 0))
774 re_compile_internal_free_return:
775 free_workarea_compile (preg);
776 re_string_destruct (&regexp);
777 free_dfa_content (dfa);
778 preg->buffer = NULL;
779 preg->allocated = 0;
780 return err;
783 /* Parse the regular expression, and build a structure tree. */
784 preg->re_nsub = 0;
785 dfa->str_tree = parse (&regexp, preg, syntax, &err);
786 if (BE (dfa->str_tree == NULL, 0))
787 goto re_compile_internal_free_return;
789 #ifdef RE_ENABLE_I18N
790 /* If possible, do searching in single byte encoding to speed things up. */
791 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
792 optimize_utf8 (dfa);
793 #endif
795 /* Analyze the tree and collect information which is necessary to
796 create the dfa. */
797 err = analyze (dfa);
798 if (BE (err != REG_NOERROR, 0))
799 goto re_compile_internal_free_return;
801 /* Then create the initial state of the dfa. */
802 err = create_initial_state (dfa);
804 /* Release work areas. */
805 free_workarea_compile (preg);
806 re_string_destruct (&regexp);
808 if (BE (err != REG_NOERROR, 0))
810 free_dfa_content (dfa);
811 preg->buffer = NULL;
812 preg->allocated = 0;
815 return err;
818 /* Initialize DFA. We use the length of the regular expression PAT_LEN
819 as the initial length of some arrays. */
821 static reg_errcode_t
822 init_dfa (dfa, pat_len)
823 re_dfa_t *dfa;
824 int pat_len;
826 int table_size;
828 memset (dfa, '\0', sizeof (re_dfa_t));
830 /* Force allocation of str_tree_storage the first time. */
831 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
833 dfa->nodes_alloc = pat_len + 1;
834 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
836 dfa->states_alloc = pat_len + 1;
838 /* table_size = 2 ^ ceil(log pat_len) */
839 for (table_size = 1; table_size > 0; table_size <<= 1)
840 if (table_size > pat_len)
841 break;
843 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
844 dfa->state_hash_mask = table_size - 1;
846 dfa->subexps_alloc = 1;
847 dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
849 dfa->mb_cur_max = MB_CUR_MAX;
850 #ifdef _LIBC
851 if (dfa->mb_cur_max == 6
852 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
853 dfa->is_utf8 = 1;
854 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
855 != 0);
856 #endif
857 #ifdef RE_ENABLE_I18N
858 if (dfa->mb_cur_max > 1)
860 int i, j, ch;
862 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
863 if (BE (dfa->sb_char == NULL, 0))
864 return REG_ESPACE;
865 if (dfa->is_utf8)
866 memset (dfa->sb_char, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
867 else
868 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
869 for (j = 0; j < UINT_BITS; ++j, ++ch)
870 if (__btowc (ch) != WEOF)
871 dfa->sb_char[i] |= 1 << j;
873 #endif
875 if (BE (dfa->nodes == NULL || dfa->state_table == NULL
876 || dfa->subexps == NULL, 0))
877 return REG_ESPACE;
878 return REG_NOERROR;
881 /* Initialize WORD_CHAR table, which indicate which character is
882 "word". In this case "word" means that it is the word construction
883 character used by some operators like "\<", "\>", etc. */
885 static void
886 init_word_char (dfa)
887 re_dfa_t *dfa;
889 int i, j, ch;
890 dfa->word_ops_used = 1;
891 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
892 for (j = 0; j < UINT_BITS; ++j, ++ch)
893 if (isalnum (ch) || ch == '_')
894 dfa->word_char[i] |= 1 << j;
897 /* Free the work area which are only used while compiling. */
899 static void
900 free_workarea_compile (preg)
901 regex_t *preg;
903 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
904 bin_tree_storage_t *storage, *next;
905 for (storage = dfa->str_tree_storage; storage; storage = next)
907 next = storage->next;
908 re_free (storage);
910 dfa->str_tree_storage = NULL;
911 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
912 dfa->str_tree = NULL;
913 re_free (dfa->org_indices);
914 dfa->org_indices = NULL;
917 /* Create initial states for all contexts. */
919 static reg_errcode_t
920 create_initial_state (dfa)
921 re_dfa_t *dfa;
923 int first, i;
924 reg_errcode_t err;
925 re_node_set init_nodes;
927 /* Initial states have the epsilon closure of the node which is
928 the first node of the regular expression. */
929 first = dfa->str_tree->first;
930 dfa->init_node = first;
931 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
932 if (BE (err != REG_NOERROR, 0))
933 return err;
935 /* The back-references which are in initial states can epsilon transit,
936 since in this case all of the subexpressions can be null.
937 Then we add epsilon closures of the nodes which are the next nodes of
938 the back-references. */
939 if (dfa->nbackref > 0)
940 for (i = 0; i < init_nodes.nelem; ++i)
942 int node_idx = init_nodes.elems[i];
943 re_token_type_t type = dfa->nodes[node_idx].type;
945 int clexp_idx;
946 if (type != OP_BACK_REF)
947 continue;
948 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
950 re_token_t *clexp_node;
951 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
952 if (clexp_node->type == OP_CLOSE_SUBEXP
953 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
954 break;
956 if (clexp_idx == init_nodes.nelem)
957 continue;
959 if (type == OP_BACK_REF)
961 int dest_idx = dfa->edests[node_idx].elems[0];
962 if (!re_node_set_contains (&init_nodes, dest_idx))
964 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
965 i = 0;
970 /* It must be the first time to invoke acquire_state. */
971 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
972 /* We don't check ERR here, since the initial state must not be NULL. */
973 if (BE (dfa->init_state == NULL, 0))
974 return err;
975 if (dfa->init_state->has_constraint)
977 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
978 CONTEXT_WORD);
979 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
980 CONTEXT_NEWLINE);
981 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
982 &init_nodes,
983 CONTEXT_NEWLINE
984 | CONTEXT_BEGBUF);
985 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
986 || dfa->init_state_begbuf == NULL, 0))
987 return err;
989 else
990 dfa->init_state_word = dfa->init_state_nl
991 = dfa->init_state_begbuf = dfa->init_state;
993 re_node_set_free (&init_nodes);
994 return REG_NOERROR;
997 #ifdef RE_ENABLE_I18N
998 /* If it is possible to do searching in single byte encoding instead of UTF-8
999 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1000 DFA nodes where needed. */
1002 static void
1003 optimize_utf8 (dfa)
1004 re_dfa_t *dfa;
1006 int node, i, mb_chars = 0, has_period = 0;
1008 for (node = 0; node < dfa->nodes_len; ++node)
1009 switch (dfa->nodes[node].type)
1011 case CHARACTER:
1012 if (dfa->nodes[node].opr.c >= 0x80)
1013 mb_chars = 1;
1014 break;
1015 case ANCHOR:
1016 switch (dfa->nodes[node].opr.idx)
1018 case LINE_FIRST:
1019 case LINE_LAST:
1020 case BUF_FIRST:
1021 case BUF_LAST:
1022 break;
1023 default:
1024 /* Word anchors etc. cannot be handled. */
1025 return;
1027 break;
1028 case OP_PERIOD:
1029 has_period = 1;
1030 break;
1031 case OP_BACK_REF:
1032 case OP_ALT:
1033 case END_OF_RE:
1034 case OP_DUP_ASTERISK:
1035 case OP_DUP_QUESTION:
1036 case OP_OPEN_SUBEXP:
1037 case OP_CLOSE_SUBEXP:
1038 break;
1039 case SIMPLE_BRACKET:
1040 /* Just double check. */
1041 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1042 if (dfa->nodes[node].opr.sbcset[i])
1043 return;
1044 break;
1045 default:
1046 return;
1049 if (mb_chars || has_period)
1050 for (node = 0; node < dfa->nodes_len; ++node)
1052 if (dfa->nodes[node].type == CHARACTER
1053 && dfa->nodes[node].opr.c >= 0x80)
1054 dfa->nodes[node].mb_partial = 0;
1055 else if (dfa->nodes[node].type == OP_PERIOD)
1056 dfa->nodes[node].type = OP_UTF8_PERIOD;
1059 /* The search can be in single byte locale. */
1060 dfa->mb_cur_max = 1;
1061 dfa->is_utf8 = 0;
1062 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1064 #endif
1066 /* Analyze the structure tree, and calculate "first", "next", "edest",
1067 "eclosure", and "inveclosure". */
1069 static reg_errcode_t
1070 analyze (dfa)
1071 re_dfa_t *dfa;
1073 int i;
1074 reg_errcode_t ret;
1076 /* Allocate arrays. */
1077 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1078 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1079 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1080 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1081 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1082 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1083 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
1084 return REG_ESPACE;
1085 /* Initialize them. */
1086 for (i = 0; i < dfa->nodes_len; ++i)
1088 dfa->nexts[i] = -1;
1089 re_node_set_init_empty (dfa->edests + i);
1090 re_node_set_init_empty (dfa->eclosures + i);
1091 re_node_set_init_empty (dfa->inveclosures + i);
1094 ret = analyze_tree (dfa, dfa->str_tree);
1095 if (BE (ret == REG_NOERROR, 1))
1097 ret = calc_eclosure (dfa);
1098 if (ret == REG_NOERROR)
1099 calc_inveclosure (dfa);
1101 return ret;
1104 /* Helper functions for analyze.
1105 This function calculate "first", "next", and "edest" for the subtree
1106 whose root is NODE. */
1108 static reg_errcode_t
1109 analyze_tree (dfa, node)
1110 re_dfa_t *dfa;
1111 bin_tree_t *node;
1113 reg_errcode_t ret;
1114 if (node->first == -1)
1115 calc_first (dfa, node);
1116 if (node->next == -1)
1117 calc_next (dfa, node);
1118 if (node->eclosure.nelem == 0)
1119 calc_epsdest (dfa, node);
1120 /* Calculate "first" etc. for the left child. */
1121 if (node->left != NULL)
1123 ret = analyze_tree (dfa, node->left);
1124 if (BE (ret != REG_NOERROR, 0))
1125 return ret;
1127 /* Calculate "first" etc. for the right child. */
1128 if (node->right != NULL)
1130 ret = analyze_tree (dfa, node->right);
1131 if (BE (ret != REG_NOERROR, 0))
1132 return ret;
1134 return REG_NOERROR;
1137 /* Calculate "first" for the node NODE. */
1138 static void
1139 calc_first (dfa, node)
1140 re_dfa_t *dfa;
1141 bin_tree_t *node;
1143 int idx, type;
1144 idx = node->node_idx;
1145 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1147 switch (type)
1149 #ifdef DEBUG
1150 case OP_OPEN_BRACKET:
1151 case OP_CLOSE_BRACKET:
1152 case OP_OPEN_DUP_NUM:
1153 case OP_CLOSE_DUP_NUM:
1154 case OP_DUP_PLUS:
1155 case OP_NON_MATCH_LIST:
1156 case OP_OPEN_COLL_ELEM:
1157 case OP_CLOSE_COLL_ELEM:
1158 case OP_OPEN_EQUIV_CLASS:
1159 case OP_CLOSE_EQUIV_CLASS:
1160 case OP_OPEN_CHAR_CLASS:
1161 case OP_CLOSE_CHAR_CLASS:
1162 /* These must not appear here. */
1163 assert (0);
1164 #endif
1165 case END_OF_RE:
1166 case CHARACTER:
1167 case OP_PERIOD:
1168 case OP_DUP_ASTERISK:
1169 case OP_DUP_QUESTION:
1170 #ifdef RE_ENABLE_I18N
1171 case OP_UTF8_PERIOD:
1172 case COMPLEX_BRACKET:
1173 #endif /* RE_ENABLE_I18N */
1174 case SIMPLE_BRACKET:
1175 case OP_BACK_REF:
1176 case ANCHOR:
1177 case OP_OPEN_SUBEXP:
1178 case OP_CLOSE_SUBEXP:
1179 node->first = idx;
1180 break;
1181 case OP_ALT:
1182 node->first = idx;
1183 break;
1184 /* else fall through */
1185 default:
1186 #ifdef DEBUG
1187 assert (node->left != NULL);
1188 #endif
1189 if (node->left->first == -1)
1190 calc_first (dfa, node->left);
1191 node->first = node->left->first;
1192 break;
1196 /* Calculate "next" for the node NODE. */
1198 static void
1199 calc_next (dfa, node)
1200 re_dfa_t *dfa;
1201 bin_tree_t *node;
1203 int idx, type;
1204 bin_tree_t *parent = node->parent;
1205 if (parent == NULL)
1207 node->next = -1;
1208 idx = node->node_idx;
1209 if (node->type == 0)
1210 dfa->nexts[idx] = node->next;
1211 return;
1214 idx = parent->node_idx;
1215 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1217 switch (type)
1219 case OP_DUP_ASTERISK:
1220 node->next = idx;
1221 break;
1222 case CONCAT:
1223 if (parent->left == node)
1225 if (parent->right->first == -1)
1226 calc_first (dfa, parent->right);
1227 node->next = parent->right->first;
1228 break;
1230 /* else fall through */
1231 default:
1232 if (parent->next == -1)
1233 calc_next (dfa, parent);
1234 node->next = parent->next;
1235 break;
1237 idx = node->node_idx;
1238 if (node->type == 0)
1239 dfa->nexts[idx] = node->next;
1242 /* Calculate "edest" for the node NODE. */
1244 static void
1245 calc_epsdest (dfa, node)
1246 re_dfa_t *dfa;
1247 bin_tree_t *node;
1249 int idx;
1250 idx = node->node_idx;
1251 if (node->type == 0)
1253 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1254 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1256 if (node->left->first == -1)
1257 calc_first (dfa, node->left);
1258 if (node->next == -1)
1259 calc_next (dfa, node);
1260 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1261 node->next);
1263 else if (dfa->nodes[idx].type == OP_ALT)
1265 int left, right;
1266 if (node->left != NULL)
1268 if (node->left->first == -1)
1269 calc_first (dfa, node->left);
1270 left = node->left->first;
1272 else
1274 if (node->next == -1)
1275 calc_next (dfa, node);
1276 left = node->next;
1278 if (node->right != NULL)
1280 if (node->right->first == -1)
1281 calc_first (dfa, node->right);
1282 right = node->right->first;
1284 else
1286 if (node->next == -1)
1287 calc_next (dfa, node);
1288 right = node->next;
1290 re_node_set_init_2 (dfa->edests + idx, left, right);
1292 else if (dfa->nodes[idx].type == ANCHOR
1293 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1294 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1295 || dfa->nodes[idx].type == OP_BACK_REF)
1296 re_node_set_init_1 (dfa->edests + idx, node->next);
1297 else
1298 assert (!IS_EPSILON_NODE (dfa->nodes[idx].type));
1302 /* Duplicate the epsilon closure of the node ROOT_NODE.
1303 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1304 to their own constraint. */
1306 static reg_errcode_t
1307 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1308 init_constraint)
1309 re_dfa_t *dfa;
1310 int top_org_node, top_clone_node, root_node;
1311 unsigned int init_constraint;
1313 reg_errcode_t err;
1314 int org_node, clone_node, ret;
1315 unsigned int constraint = init_constraint;
1316 for (org_node = top_org_node, clone_node = top_clone_node;;)
1318 int org_dest, clone_dest;
1319 if (dfa->nodes[org_node].type == OP_BACK_REF)
1321 /* If the back reference epsilon-transit, its destination must
1322 also have the constraint. Then duplicate the epsilon closure
1323 of the destination of the back reference, and store it in
1324 edests of the back reference. */
1325 org_dest = dfa->nexts[org_node];
1326 re_node_set_empty (dfa->edests + clone_node);
1327 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1328 if (BE (err != REG_NOERROR, 0))
1329 return err;
1330 dfa->nexts[clone_node] = dfa->nexts[org_node];
1331 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1332 if (BE (ret < 0, 0))
1333 return REG_ESPACE;
1335 else if (dfa->edests[org_node].nelem == 0)
1337 /* In case of the node can't epsilon-transit, don't duplicate the
1338 destination and store the original destination as the
1339 destination of the node. */
1340 dfa->nexts[clone_node] = dfa->nexts[org_node];
1341 break;
1343 else if (dfa->edests[org_node].nelem == 1)
1345 /* In case of the node can epsilon-transit, and it has only one
1346 destination. */
1347 org_dest = dfa->edests[org_node].elems[0];
1348 re_node_set_empty (dfa->edests + clone_node);
1349 if (dfa->nodes[org_node].type == ANCHOR)
1351 /* In case of the node has another constraint, append it. */
1352 if (org_node == root_node && clone_node != org_node)
1354 /* ...but if the node is root_node itself, it means the
1355 epsilon closure have a loop, then tie it to the
1356 destination of the root_node. */
1357 ret = re_node_set_insert (dfa->edests + clone_node,
1358 org_dest);
1359 if (BE (ret < 0, 0))
1360 return REG_ESPACE;
1361 break;
1363 constraint |= dfa->nodes[org_node].opr.ctx_type;
1365 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1366 if (BE (err != REG_NOERROR, 0))
1367 return err;
1368 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1369 if (BE (ret < 0, 0))
1370 return REG_ESPACE;
1372 else /* dfa->edests[org_node].nelem == 2 */
1374 /* In case of the node can epsilon-transit, and it has two
1375 destinations. E.g. '|', '*', '+', '?'. */
1376 org_dest = dfa->edests[org_node].elems[0];
1377 re_node_set_empty (dfa->edests + clone_node);
1378 /* Search for a duplicated node which satisfies the constraint. */
1379 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1380 if (clone_dest == -1)
1382 /* There are no such a duplicated node, create a new one. */
1383 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1384 if (BE (err != REG_NOERROR, 0))
1385 return err;
1386 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1387 if (BE (ret < 0, 0))
1388 return REG_ESPACE;
1389 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1390 root_node, constraint);
1391 if (BE (err != REG_NOERROR, 0))
1392 return err;
1394 else
1396 /* There are a duplicated node which satisfy the constraint,
1397 use it to avoid infinite loop. */
1398 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1399 if (BE (ret < 0, 0))
1400 return REG_ESPACE;
1403 org_dest = dfa->edests[org_node].elems[1];
1404 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1405 if (BE (err != REG_NOERROR, 0))
1406 return err;
1407 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1408 if (BE (ret < 0, 0))
1409 return REG_ESPACE;
1411 org_node = org_dest;
1412 clone_node = clone_dest;
1414 return REG_NOERROR;
1417 /* Search for a node which is duplicated from the node ORG_NODE, and
1418 satisfies the constraint CONSTRAINT. */
1420 static int
1421 search_duplicated_node (dfa, org_node, constraint)
1422 re_dfa_t *dfa;
1423 int org_node;
1424 unsigned int constraint;
1426 int idx;
1427 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1429 if (org_node == dfa->org_indices[idx]
1430 && constraint == dfa->nodes[idx].constraint)
1431 return idx; /* Found. */
1433 return -1; /* Not found. */
1436 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1437 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1438 otherwise return the error code. */
1440 static reg_errcode_t
1441 duplicate_node (new_idx, dfa, org_idx, constraint)
1442 re_dfa_t *dfa;
1443 int *new_idx, org_idx;
1444 unsigned int constraint;
1446 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx], 1);
1447 if (BE (dup_idx == -1, 0))
1448 return REG_ESPACE;
1449 dfa->nodes[dup_idx].constraint = constraint;
1450 if (dfa->nodes[org_idx].type == ANCHOR)
1451 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1452 dfa->nodes[dup_idx].duplicated = 1;
1453 re_node_set_init_empty (dfa->edests + dup_idx);
1454 re_node_set_init_empty (dfa->eclosures + dup_idx);
1455 re_node_set_init_empty (dfa->inveclosures + dup_idx);
1457 /* Store the index of the original node. */
1458 dfa->org_indices[dup_idx] = org_idx;
1459 *new_idx = dup_idx;
1460 return REG_NOERROR;
1463 static void
1464 calc_inveclosure (dfa)
1465 re_dfa_t *dfa;
1467 int src, idx, dest;
1468 for (src = 0; src < dfa->nodes_len; ++src)
1470 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1472 dest = dfa->eclosures[src].elems[idx];
1473 re_node_set_insert (dfa->inveclosures + dest, src);
1478 /* Calculate "eclosure" for all the node in DFA. */
1480 static reg_errcode_t
1481 calc_eclosure (dfa)
1482 re_dfa_t *dfa;
1484 int node_idx, incomplete;
1485 #ifdef DEBUG
1486 assert (dfa->nodes_len > 0);
1487 #endif
1488 incomplete = 0;
1489 /* For each nodes, calculate epsilon closure. */
1490 for (node_idx = 0; ; ++node_idx)
1492 reg_errcode_t err;
1493 re_node_set eclosure_elem;
1494 if (node_idx == dfa->nodes_len)
1496 if (!incomplete)
1497 break;
1498 incomplete = 0;
1499 node_idx = 0;
1502 #ifdef DEBUG
1503 assert (dfa->eclosures[node_idx].nelem != -1);
1504 #endif
1505 /* If we have already calculated, skip it. */
1506 if (dfa->eclosures[node_idx].nelem != 0)
1507 continue;
1508 /* Calculate epsilon closure of `node_idx'. */
1509 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1510 if (BE (err != REG_NOERROR, 0))
1511 return err;
1513 if (dfa->eclosures[node_idx].nelem == 0)
1515 incomplete = 1;
1516 re_node_set_free (&eclosure_elem);
1519 return REG_NOERROR;
1522 /* Calculate epsilon closure of NODE. */
1524 static reg_errcode_t
1525 calc_eclosure_iter (new_set, dfa, node, root)
1526 re_node_set *new_set;
1527 re_dfa_t *dfa;
1528 int node, root;
1530 reg_errcode_t err;
1531 unsigned int constraint;
1532 int i, incomplete;
1533 re_node_set eclosure;
1534 incomplete = 0;
1535 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1536 if (BE (err != REG_NOERROR, 0))
1537 return err;
1539 /* This indicates that we are calculating this node now.
1540 We reference this value to avoid infinite loop. */
1541 dfa->eclosures[node].nelem = -1;
1543 constraint = ((dfa->nodes[node].type == ANCHOR)
1544 ? dfa->nodes[node].opr.ctx_type : 0);
1545 /* If the current node has constraints, duplicate all nodes.
1546 Since they must inherit the constraints. */
1547 if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1549 int org_node, cur_node;
1550 org_node = cur_node = node;
1551 err = duplicate_node_closure (dfa, node, node, node, constraint);
1552 if (BE (err != REG_NOERROR, 0))
1553 return err;
1556 /* Expand each epsilon destination nodes. */
1557 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1558 for (i = 0; i < dfa->edests[node].nelem; ++i)
1560 re_node_set eclosure_elem;
1561 int edest = dfa->edests[node].elems[i];
1562 /* If calculating the epsilon closure of `edest' is in progress,
1563 return intermediate result. */
1564 if (dfa->eclosures[edest].nelem == -1)
1566 incomplete = 1;
1567 continue;
1569 /* If we haven't calculated the epsilon closure of `edest' yet,
1570 calculate now. Otherwise use calculated epsilon closure. */
1571 if (dfa->eclosures[edest].nelem == 0)
1573 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1574 if (BE (err != REG_NOERROR, 0))
1575 return err;
1577 else
1578 eclosure_elem = dfa->eclosures[edest];
1579 /* Merge the epsilon closure of `edest'. */
1580 re_node_set_merge (&eclosure, &eclosure_elem);
1581 /* If the epsilon closure of `edest' is incomplete,
1582 the epsilon closure of this node is also incomplete. */
1583 if (dfa->eclosures[edest].nelem == 0)
1585 incomplete = 1;
1586 re_node_set_free (&eclosure_elem);
1590 /* Epsilon closures include itself. */
1591 re_node_set_insert (&eclosure, node);
1592 if (incomplete && !root)
1593 dfa->eclosures[node].nelem = 0;
1594 else
1595 dfa->eclosures[node] = eclosure;
1596 *new_set = eclosure;
1597 return REG_NOERROR;
1600 /* Functions for token which are used in the parser. */
1602 /* Fetch a token from INPUT.
1603 We must not use this function inside bracket expressions. */
1605 static void
1606 fetch_token (result, input, syntax)
1607 re_token_t *result;
1608 re_string_t *input;
1609 reg_syntax_t syntax;
1611 re_string_skip_bytes (input, peek_token (result, input, syntax));
1614 /* Peek a token from INPUT, and return the length of the token.
1615 We must not use this function inside bracket expressions. */
1617 static int
1618 peek_token (token, input, syntax)
1619 re_token_t *token;
1620 re_string_t *input;
1621 reg_syntax_t syntax;
1623 unsigned char c;
1625 if (re_string_eoi (input))
1627 token->type = END_OF_RE;
1628 return 0;
1631 c = re_string_peek_byte (input, 0);
1632 token->opr.c = c;
1634 token->word_char = 0;
1635 #ifdef RE_ENABLE_I18N
1636 token->mb_partial = 0;
1637 if (input->mb_cur_max > 1 &&
1638 !re_string_first_byte (input, re_string_cur_idx (input)))
1640 token->type = CHARACTER;
1641 token->mb_partial = 1;
1642 return 1;
1644 #endif
1645 if (c == '\\')
1647 unsigned char c2;
1648 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1650 token->type = BACK_SLASH;
1651 return 1;
1654 c2 = re_string_peek_byte_case (input, 1);
1655 token->opr.c = c2;
1656 token->type = CHARACTER;
1657 #ifdef RE_ENABLE_I18N
1658 if (input->mb_cur_max > 1)
1660 wint_t wc = re_string_wchar_at (input,
1661 re_string_cur_idx (input) + 1);
1662 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1664 else
1665 #endif
1666 token->word_char = IS_WORD_CHAR (c2) != 0;
1668 switch (c2)
1670 case '|':
1671 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1672 token->type = OP_ALT;
1673 break;
1674 case '1': case '2': case '3': case '4': case '5':
1675 case '6': case '7': case '8': case '9':
1676 if (!(syntax & RE_NO_BK_REFS))
1678 token->type = OP_BACK_REF;
1679 token->opr.idx = c2 - '0';
1681 break;
1682 case '<':
1683 if (!(syntax & RE_NO_GNU_OPS))
1685 token->type = ANCHOR;
1686 token->opr.ctx_type = WORD_FIRST;
1688 break;
1689 case '>':
1690 if (!(syntax & RE_NO_GNU_OPS))
1692 token->type = ANCHOR;
1693 token->opr.ctx_type = WORD_LAST;
1695 break;
1696 case 'b':
1697 if (!(syntax & RE_NO_GNU_OPS))
1699 token->type = ANCHOR;
1700 token->opr.ctx_type = WORD_DELIM;
1702 break;
1703 case 'B':
1704 if (!(syntax & RE_NO_GNU_OPS))
1706 token->type = ANCHOR;
1707 token->opr.ctx_type = INSIDE_WORD;
1709 break;
1710 case 'w':
1711 if (!(syntax & RE_NO_GNU_OPS))
1712 token->type = OP_WORD;
1713 break;
1714 case 'W':
1715 if (!(syntax & RE_NO_GNU_OPS))
1716 token->type = OP_NOTWORD;
1717 break;
1718 case 's':
1719 if (!(syntax & RE_NO_GNU_OPS))
1720 token->type = OP_SPACE;
1721 break;
1722 case 'S':
1723 if (!(syntax & RE_NO_GNU_OPS))
1724 token->type = OP_NOTSPACE;
1725 break;
1726 case '`':
1727 if (!(syntax & RE_NO_GNU_OPS))
1729 token->type = ANCHOR;
1730 token->opr.ctx_type = BUF_FIRST;
1732 break;
1733 case '\'':
1734 if (!(syntax & RE_NO_GNU_OPS))
1736 token->type = ANCHOR;
1737 token->opr.ctx_type = BUF_LAST;
1739 break;
1740 case '(':
1741 if (!(syntax & RE_NO_BK_PARENS))
1742 token->type = OP_OPEN_SUBEXP;
1743 break;
1744 case ')':
1745 if (!(syntax & RE_NO_BK_PARENS))
1746 token->type = OP_CLOSE_SUBEXP;
1747 break;
1748 case '+':
1749 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1750 token->type = OP_DUP_PLUS;
1751 break;
1752 case '?':
1753 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1754 token->type = OP_DUP_QUESTION;
1755 break;
1756 case '{':
1757 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1758 token->type = OP_OPEN_DUP_NUM;
1759 break;
1760 case '}':
1761 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1762 token->type = OP_CLOSE_DUP_NUM;
1763 break;
1764 default:
1765 break;
1767 return 2;
1770 token->type = CHARACTER;
1771 #ifdef RE_ENABLE_I18N
1772 if (input->mb_cur_max > 1)
1774 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1775 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1777 else
1778 #endif
1779 token->word_char = IS_WORD_CHAR (token->opr.c);
1781 switch (c)
1783 case '\n':
1784 if (syntax & RE_NEWLINE_ALT)
1785 token->type = OP_ALT;
1786 break;
1787 case '|':
1788 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1789 token->type = OP_ALT;
1790 break;
1791 case '*':
1792 token->type = OP_DUP_ASTERISK;
1793 break;
1794 case '+':
1795 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1796 token->type = OP_DUP_PLUS;
1797 break;
1798 case '?':
1799 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1800 token->type = OP_DUP_QUESTION;
1801 break;
1802 case '{':
1803 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1804 token->type = OP_OPEN_DUP_NUM;
1805 break;
1806 case '}':
1807 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1808 token->type = OP_CLOSE_DUP_NUM;
1809 break;
1810 case '(':
1811 if (syntax & RE_NO_BK_PARENS)
1812 token->type = OP_OPEN_SUBEXP;
1813 break;
1814 case ')':
1815 if (syntax & RE_NO_BK_PARENS)
1816 token->type = OP_CLOSE_SUBEXP;
1817 break;
1818 case '[':
1819 token->type = OP_OPEN_BRACKET;
1820 break;
1821 case '.':
1822 token->type = OP_PERIOD;
1823 break;
1824 case '^':
1825 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1826 re_string_cur_idx (input) != 0)
1828 char prev = re_string_peek_byte (input, -1);
1829 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1830 break;
1832 token->type = ANCHOR;
1833 token->opr.ctx_type = LINE_FIRST;
1834 break;
1835 case '$':
1836 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1837 re_string_cur_idx (input) + 1 != re_string_length (input))
1839 re_token_t next;
1840 re_string_skip_bytes (input, 1);
1841 peek_token (&next, input, syntax);
1842 re_string_skip_bytes (input, -1);
1843 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1844 break;
1846 token->type = ANCHOR;
1847 token->opr.ctx_type = LINE_LAST;
1848 break;
1849 default:
1850 break;
1852 return 1;
1855 /* Peek a token from INPUT, and return the length of the token.
1856 We must not use this function out of bracket expressions. */
1858 static int
1859 peek_token_bracket (token, input, syntax)
1860 re_token_t *token;
1861 re_string_t *input;
1862 reg_syntax_t syntax;
1864 unsigned char c;
1865 if (re_string_eoi (input))
1867 token->type = END_OF_RE;
1868 return 0;
1870 c = re_string_peek_byte (input, 0);
1871 token->opr.c = c;
1873 #ifdef RE_ENABLE_I18N
1874 if (input->mb_cur_max > 1 &&
1875 !re_string_first_byte (input, re_string_cur_idx (input)))
1877 token->type = CHARACTER;
1878 return 1;
1880 #endif /* RE_ENABLE_I18N */
1882 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1883 && re_string_cur_idx (input) + 1 < re_string_length (input))
1885 /* In this case, '\' escape a character. */
1886 unsigned char c2;
1887 re_string_skip_bytes (input, 1);
1888 c2 = re_string_peek_byte (input, 0);
1889 token->opr.c = c2;
1890 token->type = CHARACTER;
1891 return 1;
1893 if (c == '[') /* '[' is a special char in a bracket exps. */
1895 unsigned char c2;
1896 int token_len;
1897 if (re_string_cur_idx (input) + 1 < re_string_length (input))
1898 c2 = re_string_peek_byte (input, 1);
1899 else
1900 c2 = 0;
1901 token->opr.c = c2;
1902 token_len = 2;
1903 switch (c2)
1905 case '.':
1906 token->type = OP_OPEN_COLL_ELEM;
1907 break;
1908 case '=':
1909 token->type = OP_OPEN_EQUIV_CLASS;
1910 break;
1911 case ':':
1912 if (syntax & RE_CHAR_CLASSES)
1914 token->type = OP_OPEN_CHAR_CLASS;
1915 break;
1917 /* else fall through. */
1918 default:
1919 token->type = CHARACTER;
1920 token->opr.c = c;
1921 token_len = 1;
1922 break;
1924 return token_len;
1926 switch (c)
1928 case '-':
1929 token->type = OP_CHARSET_RANGE;
1930 break;
1931 case ']':
1932 token->type = OP_CLOSE_BRACKET;
1933 break;
1934 case '^':
1935 token->type = OP_NON_MATCH_LIST;
1936 break;
1937 default:
1938 token->type = CHARACTER;
1940 return 1;
1943 /* Functions for parser. */
1945 /* Entry point of the parser.
1946 Parse the regular expression REGEXP and return the structure tree.
1947 If an error is occured, ERR is set by error code, and return NULL.
1948 This function build the following tree, from regular expression <reg_exp>:
1952 <reg_exp> EOR
1954 CAT means concatenation.
1955 EOR means end of regular expression. */
1957 static bin_tree_t *
1958 parse (regexp, preg, syntax, err)
1959 re_string_t *regexp;
1960 regex_t *preg;
1961 reg_syntax_t syntax;
1962 reg_errcode_t *err;
1964 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1965 bin_tree_t *tree, *eor, *root;
1966 re_token_t current_token;
1967 dfa->syntax = syntax;
1968 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
1969 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
1970 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1971 return NULL;
1972 eor = re_dfa_add_tree_node (dfa, NULL, NULL, &current_token);
1973 if (tree != NULL)
1974 root = create_tree (dfa, tree, eor, CONCAT, 0);
1975 else
1976 root = eor;
1977 if (BE (eor == NULL || root == NULL, 0))
1979 *err = REG_ESPACE;
1980 return NULL;
1982 return root;
1985 /* This function build the following tree, from regular expression
1986 <branch1>|<branch2>:
1990 <branch1> <branch2>
1992 ALT means alternative, which represents the operator `|'. */
1994 static bin_tree_t *
1995 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1996 re_string_t *regexp;
1997 regex_t *preg;
1998 re_token_t *token;
1999 reg_syntax_t syntax;
2000 int nest;
2001 reg_errcode_t *err;
2003 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2004 bin_tree_t *tree, *branch = NULL;
2005 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2006 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2007 return NULL;
2009 while (token->type == OP_ALT)
2011 re_token_t alt_token = *token;
2012 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2013 if (token->type != OP_ALT && token->type != END_OF_RE
2014 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2016 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2017 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2018 return NULL;
2020 else
2021 branch = NULL;
2022 tree = re_dfa_add_tree_node (dfa, tree, branch, &alt_token);
2023 if (BE (tree == NULL, 0))
2025 *err = REG_ESPACE;
2026 return NULL;
2028 dfa->has_plural_match = 1;
2030 return tree;
2033 /* This function build the following tree, from regular expression
2034 <exp1><exp2>:
2038 <exp1> <exp2>
2040 CAT means concatenation. */
2042 static bin_tree_t *
2043 parse_branch (regexp, preg, token, syntax, nest, err)
2044 re_string_t *regexp;
2045 regex_t *preg;
2046 re_token_t *token;
2047 reg_syntax_t syntax;
2048 int nest;
2049 reg_errcode_t *err;
2051 bin_tree_t *tree, *exp;
2052 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2053 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2054 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2055 return NULL;
2057 while (token->type != OP_ALT && token->type != END_OF_RE
2058 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2060 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2061 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2063 return NULL;
2065 if (tree != NULL && exp != NULL)
2067 tree = create_tree (dfa, tree, exp, CONCAT, 0);
2068 if (tree == NULL)
2070 *err = REG_ESPACE;
2071 return NULL;
2074 else if (tree == NULL)
2075 tree = exp;
2076 /* Otherwise exp == NULL, we don't need to create new tree. */
2078 return tree;
2081 /* This function build the following tree, from regular expression a*:
2087 static bin_tree_t *
2088 parse_expression (regexp, preg, token, syntax, nest, err)
2089 re_string_t *regexp;
2090 regex_t *preg;
2091 re_token_t *token;
2092 reg_syntax_t syntax;
2093 int nest;
2094 reg_errcode_t *err;
2096 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2097 bin_tree_t *tree;
2098 switch (token->type)
2100 case CHARACTER:
2101 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2102 if (BE (tree == NULL, 0))
2104 *err = REG_ESPACE;
2105 return NULL;
2107 #ifdef RE_ENABLE_I18N
2108 if (dfa->mb_cur_max > 1)
2110 while (!re_string_eoi (regexp)
2111 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2113 bin_tree_t *mbc_remain;
2114 fetch_token (token, regexp, syntax);
2115 mbc_remain = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2116 tree = create_tree (dfa, tree, mbc_remain, CONCAT, 0);
2117 if (BE (mbc_remain == NULL || tree == NULL, 0))
2119 *err = REG_ESPACE;
2120 return NULL;
2124 #endif
2125 break;
2126 case OP_OPEN_SUBEXP:
2127 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2128 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2129 return NULL;
2130 break;
2131 case OP_OPEN_BRACKET:
2132 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2133 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2134 return NULL;
2135 break;
2136 case OP_BACK_REF:
2137 if (BE (preg->re_nsub < token->opr.idx
2138 || dfa->subexps[token->opr.idx - 1].end == -1, 0))
2140 *err = REG_ESUBREG;
2141 return NULL;
2143 dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
2144 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2145 if (BE (tree == NULL, 0))
2147 *err = REG_ESPACE;
2148 return NULL;
2150 ++dfa->nbackref;
2151 dfa->has_mb_node = 1;
2152 break;
2153 case OP_OPEN_DUP_NUM:
2154 if (syntax & RE_CONTEXT_INVALID_DUP)
2156 *err = REG_BADRPT;
2157 return NULL;
2159 /* FALLTHROUGH */
2160 case OP_DUP_ASTERISK:
2161 case OP_DUP_PLUS:
2162 case OP_DUP_QUESTION:
2163 if (syntax & RE_CONTEXT_INVALID_OPS)
2165 *err = REG_BADRPT;
2166 return NULL;
2168 else if (syntax & RE_CONTEXT_INDEP_OPS)
2170 fetch_token (token, regexp, syntax);
2171 return parse_expression (regexp, preg, token, syntax, nest, err);
2173 /* else fall through */
2174 case OP_CLOSE_SUBEXP:
2175 if ((token->type == OP_CLOSE_SUBEXP) &&
2176 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2178 *err = REG_ERPAREN;
2179 return NULL;
2181 /* else fall through */
2182 case OP_CLOSE_DUP_NUM:
2183 /* We treat it as a normal character. */
2185 /* Then we can these characters as normal characters. */
2186 token->type = CHARACTER;
2187 /* mb_partial and word_char bits should be initialized already
2188 by peek_token. */
2189 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2190 if (BE (tree == NULL, 0))
2192 *err = REG_ESPACE;
2193 return NULL;
2195 break;
2196 case ANCHOR:
2197 if ((token->opr.ctx_type
2198 & (WORD_DELIM | INSIDE_WORD | WORD_FIRST | WORD_LAST))
2199 && dfa->word_ops_used == 0)
2200 init_word_char (dfa);
2201 if (token->opr.ctx_type == WORD_DELIM)
2203 bin_tree_t *tree_first, *tree_last;
2204 token->opr.ctx_type = WORD_FIRST;
2205 tree_first = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2206 token->opr.ctx_type = WORD_LAST;
2207 tree_last = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2208 token->type = OP_ALT;
2209 tree = re_dfa_add_tree_node (dfa, tree_first, tree_last, token);
2210 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2212 *err = REG_ESPACE;
2213 return NULL;
2216 else
2218 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2219 if (BE (tree == NULL, 0))
2221 *err = REG_ESPACE;
2222 return NULL;
2225 /* We must return here, since ANCHORs can't be followed
2226 by repetition operators.
2227 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2228 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2229 fetch_token (token, regexp, syntax);
2230 return tree;
2231 case OP_PERIOD:
2232 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2233 if (BE (tree == NULL, 0))
2235 *err = REG_ESPACE;
2236 return NULL;
2238 if (dfa->mb_cur_max > 1)
2239 dfa->has_mb_node = 1;
2240 break;
2241 case OP_WORD:
2242 case OP_NOTWORD:
2243 tree = build_charclass_op (dfa, regexp->trans,
2244 (const unsigned char *) "alnum",
2245 (const unsigned char *) "_",
2246 token->type == OP_NOTWORD, err);
2247 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2248 return NULL;
2249 break;
2250 case OP_SPACE:
2251 case OP_NOTSPACE:
2252 tree = build_charclass_op (dfa, regexp->trans,
2253 (const unsigned char *) "space",
2254 (const unsigned char *) "",
2255 token->type == OP_NOTSPACE, err);
2256 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2257 return NULL;
2258 break;
2259 case OP_ALT:
2260 case END_OF_RE:
2261 return NULL;
2262 case BACK_SLASH:
2263 *err = REG_EESCAPE;
2264 return NULL;
2265 default:
2266 /* Must not happen? */
2267 #ifdef DEBUG
2268 assert (0);
2269 #endif
2270 return NULL;
2272 fetch_token (token, regexp, syntax);
2274 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2275 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2277 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2278 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2279 return NULL;
2280 /* In BRE consecutive duplications are not allowed. */
2281 if ((syntax & RE_CONTEXT_INVALID_DUP)
2282 && (token->type == OP_DUP_ASTERISK
2283 || token->type == OP_OPEN_DUP_NUM))
2285 *err = REG_BADRPT;
2286 return NULL;
2288 dfa->has_plural_match = 1;
2291 return tree;
2294 /* This function build the following tree, from regular expression
2295 (<reg_exp>):
2296 SUBEXP
2298 <reg_exp>
2301 static bin_tree_t *
2302 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2303 re_string_t *regexp;
2304 regex_t *preg;
2305 re_token_t *token;
2306 reg_syntax_t syntax;
2307 int nest;
2308 reg_errcode_t *err;
2310 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2311 bin_tree_t *tree, *left_par, *right_par;
2312 size_t cur_nsub;
2313 cur_nsub = preg->re_nsub++;
2314 if (BE (dfa->subexps_alloc < preg->re_nsub, 0))
2316 re_subexp_t *new_array;
2317 dfa->subexps_alloc *= 2;
2318 new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2319 if (BE (new_array == NULL, 0))
2321 dfa->subexps_alloc /= 2;
2322 *err = REG_ESPACE;
2323 return NULL;
2325 dfa->subexps = new_array;
2327 dfa->subexps[cur_nsub].start = dfa->nodes_len;
2328 dfa->subexps[cur_nsub].end = -1;
2330 left_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2331 if (BE (left_par == NULL, 0))
2333 *err = REG_ESPACE;
2334 return NULL;
2336 dfa->nodes[left_par->node_idx].opr.idx = cur_nsub;
2337 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2339 /* The subexpression may be a null string. */
2340 if (token->type == OP_CLOSE_SUBEXP)
2341 tree = NULL;
2342 else
2344 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2345 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2346 return NULL;
2348 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2350 *err = REG_EPAREN;
2351 return NULL;
2353 right_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2354 dfa->subexps[cur_nsub].end = dfa->nodes_len;
2355 tree = ((tree == NULL) ? right_par
2356 : create_tree (dfa, tree, right_par, CONCAT, 0));
2357 tree = create_tree (dfa, left_par, tree, CONCAT, 0);
2358 if (BE (right_par == NULL || tree == NULL, 0))
2360 *err = REG_ESPACE;
2361 return NULL;
2363 dfa->nodes[right_par->node_idx].opr.idx = cur_nsub;
2365 return tree;
2368 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2370 static bin_tree_t *
2371 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2372 bin_tree_t *elem;
2373 re_string_t *regexp;
2374 re_dfa_t *dfa;
2375 re_token_t *token;
2376 reg_syntax_t syntax;
2377 reg_errcode_t *err;
2379 re_token_t dup_token;
2380 bin_tree_t *tree = NULL;
2381 int i, start, end, start_idx = re_string_cur_idx (regexp);
2382 re_token_t start_token = *token;
2384 if (token->type == OP_OPEN_DUP_NUM)
2386 end = 0;
2387 start = fetch_number (regexp, token, syntax);
2388 if (start == -1)
2390 if (token->type == CHARACTER && token->opr.c == ',')
2391 start = 0; /* We treat "{,m}" as "{0,m}". */
2392 else
2394 *err = REG_BADBR; /* <re>{} is invalid. */
2395 return NULL;
2398 if (BE (start != -2, 1))
2400 /* We treat "{n}" as "{n,n}". */
2401 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2402 : ((token->type == CHARACTER && token->opr.c == ',')
2403 ? fetch_number (regexp, token, syntax) : -2));
2405 if (BE (start == -2 || end == -2, 0))
2407 /* Invalid sequence. */
2408 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2410 if (token->type == END_OF_RE)
2411 *err = REG_EBRACE;
2412 else
2413 *err = REG_BADBR;
2415 return NULL;
2418 /* If the syntax bit is set, rollback. */
2419 re_string_set_index (regexp, start_idx);
2420 *token = start_token;
2421 token->type = CHARACTER;
2422 /* mb_partial and word_char bits should be already initialized by
2423 peek_token. */
2424 return elem;
2427 if (BE (end != -1 && start > end, 0))
2429 /* First number greater than second. */
2430 *err = REG_BADBR;
2431 return NULL;
2434 else
2436 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2437 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2440 /* Treat "<re>{0}*" etc. as "<re>{0}". */
2441 if (BE (elem == NULL, 0))
2442 start = end = 0;
2444 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2445 else if (BE (start > 0, 0))
2447 tree = elem;
2448 for (i = 2; i <= start; ++i)
2450 elem = duplicate_tree (elem, dfa);
2451 tree = create_tree (dfa, tree, elem, CONCAT, 0);
2452 if (BE (elem == NULL || tree == NULL, 0))
2453 goto parse_dup_op_espace;
2457 if (BE (end != start, 1))
2459 dup_token.type = (end == -1 ? OP_DUP_ASTERISK : OP_DUP_QUESTION);
2460 if (BE (start > 0, 0))
2462 elem = duplicate_tree (elem, dfa);
2463 if (BE (elem == NULL, 0))
2464 goto parse_dup_op_espace;
2466 /* This subexpression will be marked as optional, so that
2467 empty matches do not touch the registers. */
2468 mark_opt_subexp (elem, dfa);
2470 /* Prepare the tree with the modifier. */
2471 elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
2472 tree = create_tree (dfa, tree, elem, CONCAT, 0);
2474 else
2476 /* We do not need to duplicate the tree because we have not
2477 created it yet. */
2478 mark_opt_subexp (elem, dfa);
2479 tree = elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
2482 if (BE (elem == NULL || tree == NULL, 0))
2483 goto parse_dup_op_espace;
2485 /* This loop is actually executed only when end != -1,
2486 to rewrite <re>{0,n} as <re>?<re>?<re>?... We have
2487 already created the start+1-th copy. */
2488 for (i = start + 2; i <= end; ++i)
2490 elem = duplicate_tree (elem, dfa);
2491 tree = create_tree (dfa, tree, elem, CONCAT, 0);
2492 if (BE (elem == NULL || tree == NULL, 0))
2494 *err = REG_ESPACE;
2495 return NULL;
2500 fetch_token (token, regexp, syntax);
2501 return tree;
2503 parse_dup_op_espace:
2504 *err = REG_ESPACE;
2505 return NULL;
2508 /* Size of the names for collating symbol/equivalence_class/character_class.
2509 I'm not sure, but maybe enough. */
2510 #define BRACKET_NAME_BUF_SIZE 32
2512 #ifndef _LIBC
2513 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2514 Build the range expression which starts from START_ELEM, and ends
2515 at END_ELEM. The result are written to MBCSET and SBCSET.
2516 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2517 mbcset->range_ends, is a pointer argument sinse we may
2518 update it. */
2520 static reg_errcode_t
2521 # ifdef RE_ENABLE_I18N
2522 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2523 re_charset_t *mbcset;
2524 int *range_alloc;
2525 # else /* not RE_ENABLE_I18N */
2526 build_range_exp (sbcset, start_elem, end_elem)
2527 # endif /* not RE_ENABLE_I18N */
2528 re_bitset_ptr_t sbcset;
2529 bracket_elem_t *start_elem, *end_elem;
2531 unsigned int start_ch, end_ch;
2532 /* Equivalence Classes and Character Classes can't be a range start/end. */
2533 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2534 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2536 return REG_ERANGE;
2538 /* We can handle no multi character collating elements without libc
2539 support. */
2540 if (BE ((start_elem->type == COLL_SYM
2541 && strlen ((char *) start_elem->opr.name) > 1)
2542 || (end_elem->type == COLL_SYM
2543 && strlen ((char *) end_elem->opr.name) > 1), 0))
2544 return REG_ECOLLATE;
2546 # ifdef RE_ENABLE_I18N
2548 wchar_t wc, start_wc, end_wc;
2549 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2551 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2552 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2553 : 0));
2554 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2555 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2556 : 0));
2557 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2558 ? __btowc (start_ch) : start_elem->opr.wch);
2559 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2560 ? __btowc (end_ch) : end_elem->opr.wch);
2561 if (start_wc == WEOF || end_wc == WEOF)
2562 return REG_ECOLLATE;
2563 cmp_buf[0] = start_wc;
2564 cmp_buf[4] = end_wc;
2565 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2566 return REG_ERANGE;
2568 /* Got valid collation sequence values, add them as a new entry.
2569 However, for !_LIBC we have no collation elements: if the
2570 character set is single byte, the single byte character set
2571 that we build below suffices. parse_bracket_exp passes
2572 no MBCSET if dfa->mb_cur_max == 1. */
2573 if (mbcset)
2575 /* Check the space of the arrays. */
2576 if (BE (*range_alloc == mbcset->nranges, 0))
2578 /* There is not enough space, need realloc. */
2579 wchar_t *new_array_start, *new_array_end;
2580 int new_nranges;
2582 /* +1 in case of mbcset->nranges is 0. */
2583 new_nranges = 2 * mbcset->nranges + 1;
2584 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2585 are NULL if *range_alloc == 0. */
2586 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2587 new_nranges);
2588 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2589 new_nranges);
2591 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2592 return REG_ESPACE;
2594 mbcset->range_starts = new_array_start;
2595 mbcset->range_ends = new_array_end;
2596 *range_alloc = new_nranges;
2599 mbcset->range_starts[mbcset->nranges] = start_wc;
2600 mbcset->range_ends[mbcset->nranges++] = end_wc;
2603 /* Build the table for single byte characters. */
2604 for (wc = 0; wc < SBC_MAX; ++wc)
2606 cmp_buf[2] = wc;
2607 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2608 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2609 bitset_set (sbcset, wc);
2612 # else /* not RE_ENABLE_I18N */
2614 unsigned int ch;
2615 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2616 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2617 : 0));
2618 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2619 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2620 : 0));
2621 if (start_ch > end_ch)
2622 return REG_ERANGE;
2623 /* Build the table for single byte characters. */
2624 for (ch = 0; ch < SBC_MAX; ++ch)
2625 if (start_ch <= ch && ch <= end_ch)
2626 bitset_set (sbcset, ch);
2628 # endif /* not RE_ENABLE_I18N */
2629 return REG_NOERROR;
2631 #endif /* not _LIBC */
2633 #ifndef _LIBC
2634 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2635 Build the collating element which is represented by NAME.
2636 The result are written to MBCSET and SBCSET.
2637 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2638 pointer argument since we may update it. */
2640 static reg_errcode_t
2641 # ifdef RE_ENABLE_I18N
2642 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2643 re_charset_t *mbcset;
2644 int *coll_sym_alloc;
2645 # else /* not RE_ENABLE_I18N */
2646 build_collating_symbol (sbcset, name)
2647 # endif /* not RE_ENABLE_I18N */
2648 re_bitset_ptr_t sbcset;
2649 const unsigned char *name;
2651 size_t name_len = strlen ((const char *) name);
2652 if (BE (name_len != 1, 0))
2653 return REG_ECOLLATE;
2654 else
2656 bitset_set (sbcset, name[0]);
2657 return REG_NOERROR;
2660 #endif /* not _LIBC */
2662 /* This function parse bracket expression like "[abc]", "[a-c]",
2663 "[[.a-a.]]" etc. */
2665 static bin_tree_t *
2666 parse_bracket_exp (regexp, dfa, token, syntax, err)
2667 re_string_t *regexp;
2668 re_dfa_t *dfa;
2669 re_token_t *token;
2670 reg_syntax_t syntax;
2671 reg_errcode_t *err;
2673 #ifdef _LIBC
2674 const unsigned char *collseqmb;
2675 const char *collseqwc;
2676 uint32_t nrules;
2677 int32_t table_size;
2678 const int32_t *symb_table;
2679 const unsigned char *extra;
2681 /* Local function for parse_bracket_exp used in _LIBC environement.
2682 Seek the collating symbol entry correspondings to NAME.
2683 Return the index of the symbol in the SYMB_TABLE. */
2685 static inline int32_t
2686 __attribute ((always_inline))
2687 seek_collating_symbol_entry (name, name_len)
2688 const unsigned char *name;
2689 size_t name_len;
2691 int32_t hash = elem_hash ((const char *) name, name_len);
2692 int32_t elem = hash % table_size;
2693 int32_t second = hash % (table_size - 2);
2694 while (symb_table[2 * elem] != 0)
2696 /* First compare the hashing value. */
2697 if (symb_table[2 * elem] == hash
2698 /* Compare the length of the name. */
2699 && name_len == extra[symb_table[2 * elem + 1]]
2700 /* Compare the name. */
2701 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2702 name_len) == 0)
2704 /* Yep, this is the entry. */
2705 break;
2708 /* Next entry. */
2709 elem += second;
2711 return elem;
2714 /* Local function for parse_bracket_exp used in _LIBC environement.
2715 Look up the collation sequence value of BR_ELEM.
2716 Return the value if succeeded, UINT_MAX otherwise. */
2718 static inline unsigned int
2719 __attribute ((always_inline))
2720 lookup_collation_sequence_value (br_elem)
2721 bracket_elem_t *br_elem;
2723 if (br_elem->type == SB_CHAR)
2726 if (MB_CUR_MAX == 1)
2728 if (nrules == 0)
2729 return collseqmb[br_elem->opr.ch];
2730 else
2732 wint_t wc = __btowc (br_elem->opr.ch);
2733 return __collseq_table_lookup (collseqwc, wc);
2736 else if (br_elem->type == MB_CHAR)
2738 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2740 else if (br_elem->type == COLL_SYM)
2742 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2743 if (nrules != 0)
2745 int32_t elem, idx;
2746 elem = seek_collating_symbol_entry (br_elem->opr.name,
2747 sym_name_len);
2748 if (symb_table[2 * elem] != 0)
2750 /* We found the entry. */
2751 idx = symb_table[2 * elem + 1];
2752 /* Skip the name of collating element name. */
2753 idx += 1 + extra[idx];
2754 /* Skip the byte sequence of the collating element. */
2755 idx += 1 + extra[idx];
2756 /* Adjust for the alignment. */
2757 idx = (idx + 3) & ~3;
2758 /* Skip the multibyte collation sequence value. */
2759 idx += sizeof (unsigned int);
2760 /* Skip the wide char sequence of the collating element. */
2761 idx += sizeof (unsigned int) *
2762 (1 + *(unsigned int *) (extra + idx));
2763 /* Return the collation sequence value. */
2764 return *(unsigned int *) (extra + idx);
2766 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2768 /* No valid character. Match it as a single byte
2769 character. */
2770 return collseqmb[br_elem->opr.name[0]];
2773 else if (sym_name_len == 1)
2774 return collseqmb[br_elem->opr.name[0]];
2776 return UINT_MAX;
2779 /* Local function for parse_bracket_exp used in _LIBC environement.
2780 Build the range expression which starts from START_ELEM, and ends
2781 at END_ELEM. The result are written to MBCSET and SBCSET.
2782 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2783 mbcset->range_ends, is a pointer argument sinse we may
2784 update it. */
2786 static inline reg_errcode_t
2787 __attribute ((always_inline))
2788 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2789 re_charset_t *mbcset;
2790 int *range_alloc;
2791 re_bitset_ptr_t sbcset;
2792 bracket_elem_t *start_elem, *end_elem;
2794 unsigned int ch;
2795 uint32_t start_collseq;
2796 uint32_t end_collseq;
2798 /* Equivalence Classes and Character Classes can't be a range
2799 start/end. */
2800 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2801 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2803 return REG_ERANGE;
2805 start_collseq = lookup_collation_sequence_value (start_elem);
2806 end_collseq = lookup_collation_sequence_value (end_elem);
2807 /* Check start/end collation sequence values. */
2808 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2809 return REG_ECOLLATE;
2810 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2811 return REG_ERANGE;
2813 /* Got valid collation sequence values, add them as a new entry.
2814 However, if we have no collation elements, and the character set
2815 is single byte, the single byte character set that we
2816 build below suffices. */
2817 if (nrules > 0 || dfa->mb_cur_max > 1)
2819 /* Check the space of the arrays. */
2820 if (BE (*range_alloc == mbcset->nranges, 0))
2822 /* There is not enough space, need realloc. */
2823 uint32_t *new_array_start;
2824 uint32_t *new_array_end;
2825 int new_nranges;
2827 /* +1 in case of mbcset->nranges is 0. */
2828 new_nranges = 2 * mbcset->nranges + 1;
2829 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2830 new_nranges);
2831 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2832 new_nranges);
2834 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2835 return REG_ESPACE;
2837 mbcset->range_starts = new_array_start;
2838 mbcset->range_ends = new_array_end;
2839 *range_alloc = new_nranges;
2842 mbcset->range_starts[mbcset->nranges] = start_collseq;
2843 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2846 /* Build the table for single byte characters. */
2847 for (ch = 0; ch < SBC_MAX; ch++)
2849 uint32_t ch_collseq;
2851 if (MB_CUR_MAX == 1)
2853 if (nrules == 0)
2854 ch_collseq = collseqmb[ch];
2855 else
2856 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2857 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2858 bitset_set (sbcset, ch);
2860 return REG_NOERROR;
2863 /* Local function for parse_bracket_exp used in _LIBC environement.
2864 Build the collating element which is represented by NAME.
2865 The result are written to MBCSET and SBCSET.
2866 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2867 pointer argument sinse we may update it. */
2869 static inline reg_errcode_t
2870 __attribute ((always_inline))
2871 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2872 re_charset_t *mbcset;
2873 int *coll_sym_alloc;
2874 re_bitset_ptr_t sbcset;
2875 const unsigned char *name;
2877 int32_t elem, idx;
2878 size_t name_len = strlen ((const char *) name);
2879 if (nrules != 0)
2881 elem = seek_collating_symbol_entry (name, name_len);
2882 if (symb_table[2 * elem] != 0)
2884 /* We found the entry. */
2885 idx = symb_table[2 * elem + 1];
2886 /* Skip the name of collating element name. */
2887 idx += 1 + extra[idx];
2889 else if (symb_table[2 * elem] == 0 && name_len == 1)
2891 /* No valid character, treat it as a normal
2892 character. */
2893 bitset_set (sbcset, name[0]);
2894 return REG_NOERROR;
2896 else
2897 return REG_ECOLLATE;
2899 /* Got valid collation sequence, add it as a new entry. */
2900 /* Check the space of the arrays. */
2901 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2903 /* Not enough, realloc it. */
2904 /* +1 in case of mbcset->ncoll_syms is 0. */
2905 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2906 /* Use realloc since mbcset->coll_syms is NULL
2907 if *alloc == 0. */
2908 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2909 new_coll_sym_alloc);
2910 if (BE (new_coll_syms == NULL, 0))
2911 return REG_ESPACE;
2912 mbcset->coll_syms = new_coll_syms;
2913 *coll_sym_alloc = new_coll_sym_alloc;
2915 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2916 return REG_NOERROR;
2918 else
2920 if (BE (name_len != 1, 0))
2921 return REG_ECOLLATE;
2922 else
2924 bitset_set (sbcset, name[0]);
2925 return REG_NOERROR;
2929 #endif
2931 re_token_t br_token;
2932 re_bitset_ptr_t sbcset;
2933 #ifdef RE_ENABLE_I18N
2934 re_charset_t *mbcset;
2935 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2936 int equiv_class_alloc = 0, char_class_alloc = 0;
2937 #endif /* not RE_ENABLE_I18N */
2938 int non_match = 0;
2939 bin_tree_t *work_tree;
2940 int token_len;
2941 int first_round = 1;
2942 #ifdef _LIBC
2943 collseqmb = (const unsigned char *)
2944 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2945 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2946 if (nrules)
2949 if (MB_CUR_MAX > 1)
2951 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2952 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2953 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2954 _NL_COLLATE_SYMB_TABLEMB);
2955 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2956 _NL_COLLATE_SYMB_EXTRAMB);
2958 #endif
2959 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2960 #ifdef RE_ENABLE_I18N
2961 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2962 #endif /* RE_ENABLE_I18N */
2963 #ifdef RE_ENABLE_I18N
2964 if (BE (sbcset == NULL || mbcset == NULL, 0))
2965 #else
2966 if (BE (sbcset == NULL, 0))
2967 #endif /* RE_ENABLE_I18N */
2969 *err = REG_ESPACE;
2970 return NULL;
2973 token_len = peek_token_bracket (token, regexp, syntax);
2974 if (BE (token->type == END_OF_RE, 0))
2976 *err = REG_BADPAT;
2977 goto parse_bracket_exp_free_return;
2979 if (token->type == OP_NON_MATCH_LIST)
2981 #ifdef RE_ENABLE_I18N
2982 mbcset->non_match = 1;
2983 #endif /* not RE_ENABLE_I18N */
2984 non_match = 1;
2985 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2986 bitset_set (sbcset, '\0');
2987 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2988 token_len = peek_token_bracket (token, regexp, syntax);
2989 if (BE (token->type == END_OF_RE, 0))
2991 *err = REG_BADPAT;
2992 goto parse_bracket_exp_free_return;
2996 /* We treat the first ']' as a normal character. */
2997 if (token->type == OP_CLOSE_BRACKET)
2998 token->type = CHARACTER;
3000 while (1)
3002 bracket_elem_t start_elem, end_elem;
3003 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3004 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3005 reg_errcode_t ret;
3006 int token_len2 = 0, is_range_exp = 0;
3007 re_token_t token2;
3009 start_elem.opr.name = start_name_buf;
3010 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3011 syntax, first_round);
3012 if (BE (ret != REG_NOERROR, 0))
3014 *err = ret;
3015 goto parse_bracket_exp_free_return;
3017 first_round = 0;
3019 /* Get information about the next token. We need it in any case. */
3020 token_len = peek_token_bracket (token, regexp, syntax);
3022 /* Do not check for ranges if we know they are not allowed. */
3023 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3025 if (BE (token->type == END_OF_RE, 0))
3027 *err = REG_EBRACK;
3028 goto parse_bracket_exp_free_return;
3030 if (token->type == OP_CHARSET_RANGE)
3032 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3033 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3034 if (BE (token2.type == END_OF_RE, 0))
3036 *err = REG_EBRACK;
3037 goto parse_bracket_exp_free_return;
3039 if (token2.type == OP_CLOSE_BRACKET)
3041 /* We treat the last '-' as a normal character. */
3042 re_string_skip_bytes (regexp, -token_len);
3043 token->type = CHARACTER;
3045 else
3046 is_range_exp = 1;
3050 if (is_range_exp == 1)
3052 end_elem.opr.name = end_name_buf;
3053 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3054 dfa, syntax, 1);
3055 if (BE (ret != REG_NOERROR, 0))
3057 *err = ret;
3058 goto parse_bracket_exp_free_return;
3061 token_len = peek_token_bracket (token, regexp, syntax);
3063 #ifdef _LIBC
3064 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3065 &start_elem, &end_elem);
3066 #else
3067 # ifdef RE_ENABLE_I18N
3068 *err = build_range_exp (sbcset,
3069 dfa->mb_cur_max > 1 ? mbcset : NULL,
3070 &range_alloc, &start_elem, &end_elem);
3071 # else
3072 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3073 # endif
3074 #endif /* RE_ENABLE_I18N */
3075 if (BE (*err != REG_NOERROR, 0))
3076 goto parse_bracket_exp_free_return;
3078 else
3080 switch (start_elem.type)
3082 case SB_CHAR:
3083 bitset_set (sbcset, start_elem.opr.ch);
3084 break;
3085 #ifdef RE_ENABLE_I18N
3086 case MB_CHAR:
3087 /* Check whether the array has enough space. */
3088 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3090 wchar_t *new_mbchars;
3091 /* Not enough, realloc it. */
3092 /* +1 in case of mbcset->nmbchars is 0. */
3093 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3094 /* Use realloc since array is NULL if *alloc == 0. */
3095 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3096 mbchar_alloc);
3097 if (BE (new_mbchars == NULL, 0))
3098 goto parse_bracket_exp_espace;
3099 mbcset->mbchars = new_mbchars;
3101 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3102 break;
3103 #endif /* RE_ENABLE_I18N */
3104 case EQUIV_CLASS:
3105 *err = build_equiv_class (sbcset,
3106 #ifdef RE_ENABLE_I18N
3107 mbcset, &equiv_class_alloc,
3108 #endif /* RE_ENABLE_I18N */
3109 start_elem.opr.name);
3110 if (BE (*err != REG_NOERROR, 0))
3111 goto parse_bracket_exp_free_return;
3112 break;
3113 case COLL_SYM:
3114 *err = build_collating_symbol (sbcset,
3115 #ifdef RE_ENABLE_I18N
3116 mbcset, &coll_sym_alloc,
3117 #endif /* RE_ENABLE_I18N */
3118 start_elem.opr.name);
3119 if (BE (*err != REG_NOERROR, 0))
3120 goto parse_bracket_exp_free_return;
3121 break;
3122 case CHAR_CLASS:
3123 *err = build_charclass (regexp->trans, sbcset,
3124 #ifdef RE_ENABLE_I18N
3125 mbcset, &char_class_alloc,
3126 #endif /* RE_ENABLE_I18N */
3127 start_elem.opr.name, syntax);
3128 if (BE (*err != REG_NOERROR, 0))
3129 goto parse_bracket_exp_free_return;
3130 break;
3131 default:
3132 assert (0);
3133 break;
3136 if (BE (token->type == END_OF_RE, 0))
3138 *err = REG_EBRACK;
3139 goto parse_bracket_exp_free_return;
3141 if (token->type == OP_CLOSE_BRACKET)
3142 break;
3145 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3147 /* If it is non-matching list. */
3148 if (non_match)
3149 bitset_not (sbcset);
3151 #ifdef RE_ENABLE_I18N
3152 /* Ensure only single byte characters are set. */
3153 if (dfa->mb_cur_max > 1)
3154 bitset_mask (sbcset, dfa->sb_char);
3155 #endif /* RE_ENABLE_I18N */
3157 /* Build a tree for simple bracket. */
3158 br_token.type = SIMPLE_BRACKET;
3159 br_token.opr.sbcset = sbcset;
3160 work_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3161 if (BE (work_tree == NULL, 0))
3162 goto parse_bracket_exp_espace;
3164 #ifdef RE_ENABLE_I18N
3165 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3166 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3167 || mbcset->non_match)))
3169 re_token_t alt_token;
3170 bin_tree_t *mbc_tree;
3171 int sbc_idx;
3172 /* Build a tree for complex bracket. */
3173 dfa->has_mb_node = 1;
3174 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3175 if (sbcset[sbc_idx])
3176 break;
3177 /* If there are no bits set in sbcset, there is no point
3178 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3179 if (sbc_idx == BITSET_UINTS)
3181 re_free (sbcset);
3182 dfa->nodes[work_tree->node_idx].type = COMPLEX_BRACKET;
3183 dfa->nodes[work_tree->node_idx].opr.mbcset = mbcset;
3184 return work_tree;
3186 br_token.type = COMPLEX_BRACKET;
3187 br_token.opr.mbcset = mbcset;
3188 mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3189 if (BE (mbc_tree == NULL, 0))
3190 goto parse_bracket_exp_espace;
3191 /* Then join them by ALT node. */
3192 alt_token.type = OP_ALT;
3193 dfa->has_plural_match = 1;
3194 work_tree = re_dfa_add_tree_node (dfa, work_tree, mbc_tree, &alt_token);
3195 if (BE (mbc_tree != NULL, 1))
3196 return work_tree;
3198 else
3200 free_charset (mbcset);
3201 return work_tree;
3203 #else /* not RE_ENABLE_I18N */
3204 return work_tree;
3205 #endif /* not RE_ENABLE_I18N */
3207 parse_bracket_exp_espace:
3208 *err = REG_ESPACE;
3209 parse_bracket_exp_free_return:
3210 re_free (sbcset);
3211 #ifdef RE_ENABLE_I18N
3212 free_charset (mbcset);
3213 #endif /* RE_ENABLE_I18N */
3214 return NULL;
3217 /* Parse an element in the bracket expression. */
3219 static reg_errcode_t
3220 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3221 accept_hyphen)
3222 bracket_elem_t *elem;
3223 re_string_t *regexp;
3224 re_token_t *token;
3225 int token_len;
3226 re_dfa_t *dfa;
3227 reg_syntax_t syntax;
3228 int accept_hyphen;
3230 #ifdef RE_ENABLE_I18N
3231 int cur_char_size;
3232 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3233 if (cur_char_size > 1)
3235 elem->type = MB_CHAR;
3236 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3237 re_string_skip_bytes (regexp, cur_char_size);
3238 return REG_NOERROR;
3240 #endif /* RE_ENABLE_I18N */
3241 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3242 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3243 || token->type == OP_OPEN_EQUIV_CLASS)
3244 return parse_bracket_symbol (elem, regexp, token);
3245 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3247 /* A '-' must only appear as anything but a range indicator before
3248 the closing bracket. Everything else is an error. */
3249 re_token_t token2;
3250 (void) peek_token_bracket (&token2, regexp, syntax);
3251 if (token2.type != OP_CLOSE_BRACKET)
3252 /* The actual error value is not standardized since this whole
3253 case is undefined. But ERANGE makes good sense. */
3254 return REG_ERANGE;
3256 elem->type = SB_CHAR;
3257 elem->opr.ch = token->opr.c;
3258 return REG_NOERROR;
3261 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3262 such as [:<character_class>:], [.<collating_element>.], and
3263 [=<equivalent_class>=]. */
3265 static reg_errcode_t
3266 parse_bracket_symbol (elem, regexp, token)
3267 bracket_elem_t *elem;
3268 re_string_t *regexp;
3269 re_token_t *token;
3271 unsigned char ch, delim = token->opr.c;
3272 int i = 0;
3273 if (re_string_eoi(regexp))
3274 return REG_EBRACK;
3275 for (;; ++i)
3277 if (i >= BRACKET_NAME_BUF_SIZE)
3278 return REG_EBRACK;
3279 if (token->type == OP_OPEN_CHAR_CLASS)
3280 ch = re_string_fetch_byte_case (regexp);
3281 else
3282 ch = re_string_fetch_byte (regexp);
3283 if (re_string_eoi(regexp))
3284 return REG_EBRACK;
3285 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3286 break;
3287 elem->opr.name[i] = ch;
3289 re_string_skip_bytes (regexp, 1);
3290 elem->opr.name[i] = '\0';
3291 switch (token->type)
3293 case OP_OPEN_COLL_ELEM:
3294 elem->type = COLL_SYM;
3295 break;
3296 case OP_OPEN_EQUIV_CLASS:
3297 elem->type = EQUIV_CLASS;
3298 break;
3299 case OP_OPEN_CHAR_CLASS:
3300 elem->type = CHAR_CLASS;
3301 break;
3302 default:
3303 break;
3305 return REG_NOERROR;
3308 /* Helper function for parse_bracket_exp.
3309 Build the equivalence class which is represented by NAME.
3310 The result are written to MBCSET and SBCSET.
3311 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3312 is a pointer argument sinse we may update it. */
3314 static reg_errcode_t
3315 #ifdef RE_ENABLE_I18N
3316 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3317 re_charset_t *mbcset;
3318 int *equiv_class_alloc;
3319 #else /* not RE_ENABLE_I18N */
3320 build_equiv_class (sbcset, name)
3321 #endif /* not RE_ENABLE_I18N */
3322 re_bitset_ptr_t sbcset;
3323 const unsigned char *name;
3325 #if defined _LIBC
3326 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3327 if (nrules != 0)
3329 const int32_t *table, *indirect;
3330 const unsigned char *weights, *extra, *cp;
3331 unsigned char char_buf[2];
3332 int32_t idx1, idx2;
3333 unsigned int ch;
3334 size_t len;
3335 /* This #include defines a local function! */
3336 # include <locale/weight.h>
3337 /* Calculate the index for equivalence class. */
3338 cp = name;
3339 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3340 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3341 _NL_COLLATE_WEIGHTMB);
3342 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3343 _NL_COLLATE_EXTRAMB);
3344 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3345 _NL_COLLATE_INDIRECTMB);
3346 idx1 = findidx (&cp);
3347 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3348 /* This isn't a valid character. */
3349 return REG_ECOLLATE;
3351 /* Build single byte matcing table for this equivalence class. */
3352 char_buf[1] = (unsigned char) '\0';
3353 len = weights[idx1];
3354 for (ch = 0; ch < SBC_MAX; ++ch)
3356 char_buf[0] = ch;
3357 cp = char_buf;
3358 idx2 = findidx (&cp);
3360 idx2 = table[ch];
3362 if (idx2 == 0)
3363 /* This isn't a valid character. */
3364 continue;
3365 if (len == weights[idx2])
3367 int cnt = 0;
3368 while (cnt <= len &&
3369 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3370 ++cnt;
3372 if (cnt > len)
3373 bitset_set (sbcset, ch);
3376 /* Check whether the array has enough space. */
3377 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3379 /* Not enough, realloc it. */
3380 /* +1 in case of mbcset->nequiv_classes is 0. */
3381 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3382 /* Use realloc since the array is NULL if *alloc == 0. */
3383 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3384 int32_t,
3385 new_equiv_class_alloc);
3386 if (BE (new_equiv_classes == NULL, 0))
3387 return REG_ESPACE;
3388 mbcset->equiv_classes = new_equiv_classes;
3389 *equiv_class_alloc = new_equiv_class_alloc;
3391 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3393 else
3394 #endif /* _LIBC */
3396 if (BE (strlen ((const char *) name) != 1, 0))
3397 return REG_ECOLLATE;
3398 bitset_set (sbcset, *name);
3400 return REG_NOERROR;
3403 /* Helper function for parse_bracket_exp.
3404 Build the character class which is represented by NAME.
3405 The result are written to MBCSET and SBCSET.
3406 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3407 is a pointer argument sinse we may update it. */
3409 static reg_errcode_t
3410 #ifdef RE_ENABLE_I18N
3411 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3412 re_charset_t *mbcset;
3413 int *char_class_alloc;
3414 #else /* not RE_ENABLE_I18N */
3415 build_charclass (trans, sbcset, class_name, syntax)
3416 #endif /* not RE_ENABLE_I18N */
3417 unsigned RE_TRANSLATE_TYPE trans;
3418 re_bitset_ptr_t sbcset;
3419 const unsigned char *class_name;
3420 reg_syntax_t syntax;
3422 int i;
3423 const char *name = (const char *) class_name;
3425 /* In case of REG_ICASE "upper" and "lower" match the both of
3426 upper and lower cases. */
3427 if ((syntax & RE_ICASE)
3428 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3429 name = "alpha";
3431 #ifdef RE_ENABLE_I18N
3432 /* Check the space of the arrays. */
3433 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3435 /* Not enough, realloc it. */
3436 /* +1 in case of mbcset->nchar_classes is 0. */
3437 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3438 /* Use realloc since array is NULL if *alloc == 0. */
3439 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3440 new_char_class_alloc);
3441 if (BE (new_char_classes == NULL, 0))
3442 return REG_ESPACE;
3443 mbcset->char_classes = new_char_classes;
3444 *char_class_alloc = new_char_class_alloc;
3446 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3447 #endif /* RE_ENABLE_I18N */
3449 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3450 for (i = 0; i < SBC_MAX; ++i) \
3452 if (ctype_func (i)) \
3454 int ch = trans ? trans[i] : i; \
3455 bitset_set (sbcset, ch); \
3459 if (strcmp (name, "alnum") == 0)
3460 BUILD_CHARCLASS_LOOP (isalnum)
3461 else if (strcmp (name, "cntrl") == 0)
3462 BUILD_CHARCLASS_LOOP (iscntrl)
3463 else if (strcmp (name, "lower") == 0)
3464 BUILD_CHARCLASS_LOOP (islower)
3465 else if (strcmp (name, "space") == 0)
3466 BUILD_CHARCLASS_LOOP (isspace)
3467 else if (strcmp (name, "alpha") == 0)
3468 BUILD_CHARCLASS_LOOP (isalpha)
3469 else if (strcmp (name, "digit") == 0)
3470 BUILD_CHARCLASS_LOOP (isdigit)
3471 else if (strcmp (name, "print") == 0)
3472 BUILD_CHARCLASS_LOOP (isprint)
3473 else if (strcmp (name, "upper") == 0)
3474 BUILD_CHARCLASS_LOOP (isupper)
3475 else if (strcmp (name, "blank") == 0)
3476 BUILD_CHARCLASS_LOOP (isblank)
3477 else if (strcmp (name, "graph") == 0)
3478 BUILD_CHARCLASS_LOOP (isgraph)
3479 else if (strcmp (name, "punct") == 0)
3480 BUILD_CHARCLASS_LOOP (ispunct)
3481 else if (strcmp (name, "xdigit") == 0)
3482 BUILD_CHARCLASS_LOOP (isxdigit)
3483 else
3484 return REG_ECTYPE;
3486 return REG_NOERROR;
3489 static bin_tree_t *
3490 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3491 re_dfa_t *dfa;
3492 unsigned RE_TRANSLATE_TYPE trans;
3493 const unsigned char *class_name;
3494 const unsigned char *extra;
3495 int non_match;
3496 reg_errcode_t *err;
3498 re_bitset_ptr_t sbcset;
3499 #ifdef RE_ENABLE_I18N
3500 re_charset_t *mbcset;
3501 int alloc = 0;
3502 #endif /* not RE_ENABLE_I18N */
3503 reg_errcode_t ret;
3504 re_token_t br_token;
3505 bin_tree_t *tree;
3507 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3508 #ifdef RE_ENABLE_I18N
3509 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3510 #endif /* RE_ENABLE_I18N */
3512 #ifdef RE_ENABLE_I18N
3513 if (BE (sbcset == NULL || mbcset == NULL, 0))
3514 #else /* not RE_ENABLE_I18N */
3515 if (BE (sbcset == NULL, 0))
3516 #endif /* not RE_ENABLE_I18N */
3518 *err = REG_ESPACE;
3519 return NULL;
3522 if (non_match)
3524 #ifdef RE_ENABLE_I18N
3526 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3527 bitset_set(cset->sbcset, '\0');
3529 mbcset->non_match = 1;
3530 #endif /* not RE_ENABLE_I18N */
3533 /* We don't care the syntax in this case. */
3534 ret = build_charclass (trans, sbcset,
3535 #ifdef RE_ENABLE_I18N
3536 mbcset, &alloc,
3537 #endif /* RE_ENABLE_I18N */
3538 class_name, 0);
3540 if (BE (ret != REG_NOERROR, 0))
3542 re_free (sbcset);
3543 #ifdef RE_ENABLE_I18N
3544 free_charset (mbcset);
3545 #endif /* RE_ENABLE_I18N */
3546 *err = ret;
3547 return NULL;
3549 /* \w match '_' also. */
3550 for (; *extra; extra++)
3551 bitset_set (sbcset, *extra);
3553 /* If it is non-matching list. */
3554 if (non_match)
3555 bitset_not (sbcset);
3557 #ifdef RE_ENABLE_I18N
3558 /* Ensure only single byte characters are set. */
3559 if (dfa->mb_cur_max > 1)
3560 bitset_mask (sbcset, dfa->sb_char);
3561 #endif
3563 /* Build a tree for simple bracket. */
3564 br_token.type = SIMPLE_BRACKET;
3565 br_token.opr.sbcset = sbcset;
3566 tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3567 if (BE (tree == NULL, 0))
3568 goto build_word_op_espace;
3570 #ifdef RE_ENABLE_I18N
3571 if (dfa->mb_cur_max > 1)
3573 re_token_t alt_token;
3574 bin_tree_t *mbc_tree;
3575 /* Build a tree for complex bracket. */
3576 br_token.type = COMPLEX_BRACKET;
3577 br_token.opr.mbcset = mbcset;
3578 dfa->has_mb_node = 1;
3579 mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3580 if (BE (mbc_tree == NULL, 0))
3581 goto build_word_op_espace;
3582 /* Then join them by ALT node. */
3583 alt_token.type = OP_ALT;
3584 dfa->has_plural_match = 1;
3585 tree = re_dfa_add_tree_node (dfa, tree, mbc_tree, &alt_token);
3586 if (BE (mbc_tree != NULL, 1))
3587 return tree;
3589 else
3591 free_charset (mbcset);
3592 return tree;
3594 #else /* not RE_ENABLE_I18N */
3595 return tree;
3596 #endif /* not RE_ENABLE_I18N */
3598 build_word_op_espace:
3599 re_free (sbcset);
3600 #ifdef RE_ENABLE_I18N
3601 free_charset (mbcset);
3602 #endif /* RE_ENABLE_I18N */
3603 *err = REG_ESPACE;
3604 return NULL;
3607 /* This is intended for the expressions like "a{1,3}".
3608 Fetch a number from `input', and return the number.
3609 Return -1, if the number field is empty like "{,1}".
3610 Return -2, If an error is occured. */
3612 static int
3613 fetch_number (input, token, syntax)
3614 re_string_t *input;
3615 re_token_t *token;
3616 reg_syntax_t syntax;
3618 int num = -1;
3619 unsigned char c;
3620 while (1)
3622 fetch_token (token, input, syntax);
3623 c = token->opr.c;
3624 if (BE (token->type == END_OF_RE, 0))
3625 return -2;
3626 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3627 break;
3628 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3629 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3630 num = (num > RE_DUP_MAX) ? -2 : num;
3632 return num;
3635 #ifdef RE_ENABLE_I18N
3636 static void
3637 free_charset (re_charset_t *cset)
3639 re_free (cset->mbchars);
3640 # ifdef _LIBC
3641 re_free (cset->coll_syms);
3642 re_free (cset->equiv_classes);
3643 re_free (cset->range_starts);
3644 re_free (cset->range_ends);
3645 # endif
3646 re_free (cset->char_classes);
3647 re_free (cset);
3649 #endif /* RE_ENABLE_I18N */
3651 /* Functions for binary tree operation. */
3653 /* Create a tree node. */
3655 static bin_tree_t *
3656 create_tree (dfa, left, right, type, index)
3657 re_dfa_t *dfa;
3658 bin_tree_t *left;
3659 bin_tree_t *right;
3660 re_token_type_t type;
3661 int index;
3663 bin_tree_t *tree;
3664 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3666 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3668 if (storage == NULL)
3669 return NULL;
3670 storage->next = dfa->str_tree_storage;
3671 dfa->str_tree_storage = storage;
3672 dfa->str_tree_storage_idx = 0;
3674 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3676 tree->parent = NULL;
3677 tree->left = left;
3678 tree->right = right;
3679 tree->type = type;
3680 tree->node_idx = index;
3681 tree->first = -1;
3682 tree->next = -1;
3683 re_node_set_init_empty (&tree->eclosure);
3685 if (left != NULL)
3686 left->parent = tree;
3687 if (right != NULL)
3688 right->parent = tree;
3689 return tree;
3692 /* Create both a DFA node and a tree for it. */
3694 static bin_tree_t *
3695 re_dfa_add_tree_node (dfa, left, right, token)
3696 re_dfa_t *dfa;
3697 bin_tree_t *left;
3698 bin_tree_t *right;
3699 const re_token_t *token;
3701 int new_idx = re_dfa_add_node (dfa, *token, 0);
3703 if (new_idx == -1)
3704 return NULL;
3706 return create_tree (dfa, left, right, 0, new_idx);
3709 /* Mark the tree SRC as an optional subexpression. */
3711 static void
3712 mark_opt_subexp (src, dfa)
3713 const bin_tree_t *src;
3714 re_dfa_t *dfa;
3716 /* Pass an OPT_SUBEXP_IDX which is != 1 if the duplicated tree is
3717 a subexpression. */
3718 if (src->type == CONCAT
3719 && src->left->type == NON_TYPE
3720 && dfa->nodes[src->left->node_idx].type == OP_OPEN_SUBEXP)
3721 mark_opt_subexp_iter (src, dfa, dfa->nodes[src->left->node_idx].opr.idx);
3725 /* Recursive tree walker for mark_opt_subexp. */
3727 static void
3728 mark_opt_subexp_iter (src, dfa, idx)
3729 const bin_tree_t *src;
3730 re_dfa_t *dfa;
3731 int idx;
3733 int node_idx;
3735 if (src->type == NON_TYPE)
3737 node_idx = src->node_idx;
3738 if ((dfa->nodes[node_idx].type == OP_OPEN_SUBEXP
3739 || dfa->nodes[node_idx].type == OP_CLOSE_SUBEXP)
3740 && dfa->nodes[node_idx].opr.idx == idx)
3741 dfa->nodes[node_idx].opt_subexp = 1;
3744 if (src->left != NULL)
3745 mark_opt_subexp_iter (src->left, dfa, idx);
3747 if (src->right != NULL)
3748 mark_opt_subexp_iter (src->right, dfa, idx);
3752 /* Duplicate the node SRC, and return new node. */
3754 static bin_tree_t *
3755 duplicate_tree (src, dfa)
3756 const bin_tree_t *src;
3757 re_dfa_t *dfa;
3759 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3760 int new_node_idx;
3761 /* Since node indies must be according to Post-order of the tree,
3762 we must duplicate the left at first. */
3763 if (src->left != NULL)
3765 left = duplicate_tree (src->left, dfa);
3766 if (left == NULL)
3767 return NULL;
3770 /* Secondaly, duplicate the right. */
3771 if (src->right != NULL)
3773 right = duplicate_tree (src->right, dfa);
3774 if (right == NULL)
3775 return NULL;
3778 /* At last, duplicate itself. */
3779 if (src->type == NON_TYPE)
3781 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3782 dfa->nodes[new_node_idx].duplicated = 1;
3783 if (BE (new_node_idx == -1, 0))
3784 return NULL;
3786 else
3787 new_node_idx = src->type;
3789 new_tree = create_tree (dfa, left, right, src->type, new_node_idx);
3790 return new_tree;