Support inline syscall with six arguments.
[glibc/pb-stable.git] / posix / regcomp.c
blobc9c0d9eb37bf4d1a3dcbc904ebf5183c3d4df95c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 #include <assert.h>
22 #include <ctype.h>
23 #include <limits.h>
24 #include <locale.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
29 #if defined HAVE_WCHAR_H || defined _LIBC
30 # include <wchar.h>
31 #endif /* HAVE_WCHAR_H || _LIBC */
32 #if defined HAVE_WCTYPE_H || defined _LIBC
33 # include <wctype.h>
34 #endif /* HAVE_WCTYPE_H || _LIBC */
36 /* In case that the system doesn't have isblank(). */
37 #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
38 # define isblank(ch) ((ch) == ' ' || (ch) == '\t')
39 #endif
41 #ifdef _LIBC
42 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
43 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
44 # include <locale/localeinfo.h>
45 # include <locale/elem-hash.h>
46 # include <locale/coll-lookup.h>
47 # endif
48 #endif
50 /* This is for other GNU distributions with internationalized messages. */
51 #if HAVE_LIBINTL_H || defined _LIBC
52 # include <libintl.h>
53 # ifdef _LIBC
54 # undef gettext
55 # define gettext(msgid) \
56 INTUSE(__dcgettext) (INTUSE(_libc_intl_domainname), msgid, LC_MESSAGES)
57 # endif
58 #else
59 # define gettext(msgid) (msgid)
60 #endif
62 #ifndef gettext_noop
63 /* This define is so xgettext can find the internationalizable
64 strings. */
65 # define gettext_noop(String) String
66 #endif
68 #include <regex.h>
69 #include "regex_internal.h"
71 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
72 int length, reg_syntax_t syntax);
73 static void re_compile_fastmap_iter (regex_t *bufp,
74 const re_dfastate_t *init_state,
75 char *fastmap);
76 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
77 static reg_errcode_t init_word_char (re_dfa_t *dfa);
78 #ifdef RE_ENABLE_I18N
79 static void free_charset (re_charset_t *cset);
80 #endif /* RE_ENABLE_I18N */
81 static void free_workarea_compile (regex_t *preg);
82 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
83 static reg_errcode_t analyze (re_dfa_t *dfa);
84 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
85 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
86 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
87 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
88 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
89 int top_clone_node, int root_node,
90 unsigned int constraint);
91 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
92 unsigned int constraint);
93 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
94 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
95 int node, int root);
96 static void calc_inveclosure (re_dfa_t *dfa);
97 static int fetch_number (re_string_t *input, re_token_t *token,
98 reg_syntax_t syntax);
99 static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
100 static int peek_token (re_token_t *token, re_string_t *input,
101 reg_syntax_t syntax);
102 static int peek_token_bracket (re_token_t *token, re_string_t *input,
103 reg_syntax_t syntax);
104 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
105 reg_syntax_t syntax, reg_errcode_t *err);
106 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
107 re_token_t *token, reg_syntax_t syntax,
108 int nest, reg_errcode_t *err);
109 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
110 re_token_t *token, reg_syntax_t syntax,
111 int nest, reg_errcode_t *err);
112 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
113 re_token_t *token, reg_syntax_t syntax,
114 int nest, reg_errcode_t *err);
115 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
116 re_token_t *token, reg_syntax_t syntax,
117 int nest, reg_errcode_t *err);
118 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
119 re_dfa_t *dfa, re_token_t *token,
120 reg_syntax_t syntax, reg_errcode_t *err);
121 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
122 re_token_t *token, reg_syntax_t syntax,
123 reg_errcode_t *err);
124 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
125 re_string_t *regexp,
126 re_token_t *token, int token_len,
127 re_dfa_t *dfa,
128 reg_syntax_t syntax);
129 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
130 re_string_t *regexp,
131 re_token_t *token);
132 #ifndef _LIBC
133 # ifdef RE_ENABLE_I18N
134 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
135 re_charset_t *mbcset, int *range_alloc,
136 bracket_elem_t *start_elem,
137 bracket_elem_t *end_elem);
138 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
139 re_charset_t *mbcset,
140 int *coll_sym_alloc,
141 const unsigned char *name);
142 # else /* not RE_ENABLE_I18N */
143 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
144 bracket_elem_t *start_elem,
145 bracket_elem_t *end_elem);
146 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
147 const unsigned char *name);
148 # endif /* not RE_ENABLE_I18N */
149 #endif /* not _LIBC */
150 #ifdef RE_ENABLE_I18N
151 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
152 re_charset_t *mbcset,
153 int *equiv_class_alloc,
154 const unsigned char *name);
155 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
156 re_charset_t *mbcset,
157 int *char_class_alloc,
158 const unsigned char *class_name,
159 reg_syntax_t syntax);
160 #else /* not RE_ENABLE_I18N */
161 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
162 const unsigned char *name);
163 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
164 const unsigned char *class_name,
165 reg_syntax_t syntax);
166 #endif /* not RE_ENABLE_I18N */
167 static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err);
168 static void free_bin_tree (bin_tree_t *tree);
169 static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
170 re_token_type_t type, int index);
171 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
173 /* This table gives an error message for each of the error codes listed
174 in regex.h. Obviously the order here has to be same as there.
175 POSIX doesn't require that we do anything for REG_NOERROR,
176 but why not be nice? */
178 const char __re_error_msgid[] attribute_hidden =
180 #define REG_NOERROR_IDX 0
181 gettext_noop ("Success") /* REG_NOERROR */
182 "\0"
183 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
184 gettext_noop ("No match") /* REG_NOMATCH */
185 "\0"
186 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
187 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
188 "\0"
189 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
190 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
191 "\0"
192 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
193 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
194 "\0"
195 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
196 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
197 "\0"
198 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
199 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
200 "\0"
201 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
202 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
203 "\0"
204 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
205 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
206 "\0"
207 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
208 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
209 "\0"
210 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
211 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
212 "\0"
213 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
214 gettext_noop ("Invalid range end") /* REG_ERANGE */
215 "\0"
216 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
217 gettext_noop ("Memory exhausted") /* REG_ESPACE */
218 "\0"
219 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
220 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
221 "\0"
222 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
223 gettext_noop ("Premature end of regular expression") /* REG_EEND */
224 "\0"
225 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
226 gettext_noop ("Regular expression too big") /* REG_ESIZE */
227 "\0"
228 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
229 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
232 const size_t __re_error_msgid_idx[] attribute_hidden =
234 REG_NOERROR_IDX,
235 REG_NOMATCH_IDX,
236 REG_BADPAT_IDX,
237 REG_ECOLLATE_IDX,
238 REG_ECTYPE_IDX,
239 REG_EESCAPE_IDX,
240 REG_ESUBREG_IDX,
241 REG_EBRACK_IDX,
242 REG_EPAREN_IDX,
243 REG_EBRACE_IDX,
244 REG_BADBR_IDX,
245 REG_ERANGE_IDX,
246 REG_ESPACE_IDX,
247 REG_BADRPT_IDX,
248 REG_EEND_IDX,
249 REG_ESIZE_IDX,
250 REG_ERPAREN_IDX
253 /* Entry points for GNU code. */
255 /* re_compile_pattern is the GNU regular expression compiler: it
256 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
257 Returns 0 if the pattern was valid, otherwise an error string.
259 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
260 are set in BUFP on entry. */
262 const char *
263 re_compile_pattern (pattern, length, bufp)
264 const char *pattern;
265 size_t length;
266 struct re_pattern_buffer *bufp;
268 reg_errcode_t ret;
270 /* And GNU code determines whether or not to get register information
271 by passing null for the REGS argument to re_match, etc., not by
272 setting no_sub. */
273 bufp->no_sub = 0;
275 /* Match anchors at newline. */
276 bufp->newline_anchor = 1;
278 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
280 if (!ret)
281 return NULL;
282 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
284 #ifdef _LIBC
285 weak_alias (__re_compile_pattern, re_compile_pattern)
286 #endif
288 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
289 also be assigned to arbitrarily: each pattern buffer stores its own
290 syntax, so it can be changed between regex compilations. */
291 /* This has no initializer because initialized variables in Emacs
292 become read-only after dumping. */
293 reg_syntax_t re_syntax_options;
296 /* Specify the precise syntax of regexps for compilation. This provides
297 for compatibility for various utilities which historically have
298 different, incompatible syntaxes.
300 The argument SYNTAX is a bit mask comprised of the various bits
301 defined in regex.h. We return the old syntax. */
303 reg_syntax_t
304 re_set_syntax (syntax)
305 reg_syntax_t syntax;
307 reg_syntax_t ret = re_syntax_options;
309 re_syntax_options = syntax;
310 return ret;
312 #ifdef _LIBC
313 weak_alias (__re_set_syntax, re_set_syntax)
314 #endif
317 re_compile_fastmap (bufp)
318 struct re_pattern_buffer *bufp;
320 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
321 char *fastmap = bufp->fastmap;
323 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
324 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
325 if (dfa->init_state != dfa->init_state_word)
326 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
327 if (dfa->init_state != dfa->init_state_nl)
328 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
329 if (dfa->init_state != dfa->init_state_begbuf)
330 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
331 bufp->fastmap_accurate = 1;
332 return 0;
334 #ifdef _LIBC
335 weak_alias (__re_compile_fastmap, re_compile_fastmap)
336 #endif
338 static inline void
339 re_set_fastmap (char *fastmap, int icase, int ch)
341 fastmap[ch] = 1;
342 if (icase)
343 fastmap[tolower (ch)] = 1;
346 /* Helper function for re_compile_fastmap.
347 Compile fastmap for the initial_state INIT_STATE. */
349 static void
350 re_compile_fastmap_iter (bufp, init_state, fastmap)
351 regex_t *bufp;
352 const re_dfastate_t *init_state;
353 char *fastmap;
355 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
356 int node_cnt;
357 int icase = (MB_CUR_MAX == 1 && (bufp->syntax & RE_ICASE));
358 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
360 int node = init_state->nodes.elems[node_cnt];
361 re_token_type_t type = dfa->nodes[node].type;
363 if (type == CHARACTER)
364 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
365 else if (type == SIMPLE_BRACKET)
367 int i, j, ch;
368 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
369 for (j = 0; j < UINT_BITS; ++j, ++ch)
370 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
371 re_set_fastmap (fastmap, icase, ch);
373 #ifdef RE_ENABLE_I18N
374 else if (type == COMPLEX_BRACKET)
376 int i;
377 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
378 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
379 || cset->nranges || cset->nchar_classes)
381 # ifdef _LIBC
382 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
384 /* In this case we want to catch the bytes which are
385 the first byte of any collation elements.
386 e.g. In da_DK, we want to catch 'a' since "aa"
387 is a valid collation element, and don't catch
388 'b' since 'b' is the only collation element
389 which starts from 'b'. */
390 int j, ch;
391 const int32_t *table = (const int32_t *)
392 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
393 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
394 for (j = 0; j < UINT_BITS; ++j, ++ch)
395 if (table[ch] < 0)
396 re_set_fastmap (fastmap, icase, ch);
398 # else
399 if (MB_CUR_MAX > 1)
400 for (i = 0; i < SBC_MAX; ++i)
401 if (__btowc (i) == WEOF)
402 re_set_fastmap (fastmap, icase, i);
403 # endif /* not _LIBC */
405 for (i = 0; i < cset->nmbchars; ++i)
407 char buf[256];
408 mbstate_t state;
409 memset (&state, '\0', sizeof (state));
410 __wcrtomb (buf, cset->mbchars[i], &state);
411 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
414 #endif /* RE_ENABLE_I18N */
415 else if (type == END_OF_RE || type == OP_PERIOD)
417 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
418 if (type == END_OF_RE)
419 bufp->can_be_null = 1;
420 return;
425 /* Entry point for POSIX code. */
426 /* regcomp takes a regular expression as a string and compiles it.
428 PREG is a regex_t *. We do not expect any fields to be initialized,
429 since POSIX says we shouldn't. Thus, we set
431 `buffer' to the compiled pattern;
432 `used' to the length of the compiled pattern;
433 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
434 REG_EXTENDED bit in CFLAGS is set; otherwise, to
435 RE_SYNTAX_POSIX_BASIC;
436 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
437 `fastmap' to an allocated space for the fastmap;
438 `fastmap_accurate' to zero;
439 `re_nsub' to the number of subexpressions in PATTERN.
441 PATTERN is the address of the pattern string.
443 CFLAGS is a series of bits which affect compilation.
445 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
446 use POSIX basic syntax.
448 If REG_NEWLINE is set, then . and [^...] don't match newline.
449 Also, regexec will try a match beginning after every newline.
451 If REG_ICASE is set, then we considers upper- and lowercase
452 versions of letters to be equivalent when matching.
454 If REG_NOSUB is set, then when PREG is passed to regexec, that
455 routine will report only success or failure, and nothing about the
456 registers.
458 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
459 the return codes and their meanings.) */
462 regcomp (preg, pattern, cflags)
463 regex_t *__restrict preg;
464 const char *__restrict pattern;
465 int cflags;
467 reg_errcode_t ret;
468 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
469 : RE_SYNTAX_POSIX_BASIC);
471 preg->buffer = NULL;
472 preg->allocated = 0;
473 preg->used = 0;
475 /* Try to allocate space for the fastmap. */
476 preg->fastmap = re_malloc (char, SBC_MAX);
477 if (BE (preg->fastmap == NULL, 0))
478 return REG_ESPACE;
480 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
482 /* If REG_NEWLINE is set, newlines are treated differently. */
483 if (cflags & REG_NEWLINE)
484 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
485 syntax &= ~RE_DOT_NEWLINE;
486 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
487 /* It also changes the matching behavior. */
488 preg->newline_anchor = 1;
490 else
491 preg->newline_anchor = 0;
492 preg->no_sub = !!(cflags & REG_NOSUB);
493 preg->translate = NULL;
495 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
497 /* POSIX doesn't distinguish between an unmatched open-group and an
498 unmatched close-group: both are REG_EPAREN. */
499 if (ret == REG_ERPAREN)
500 ret = REG_EPAREN;
502 /* We have already checked preg->fastmap != NULL. */
503 if (BE (ret == REG_NOERROR, 1))
504 /* Compute the fastmap now, since regexec cannot modify the pattern
505 buffer. This function nevers fails in this implementation. */
506 (void) __re_compile_fastmap (preg);
507 else
509 /* Some error occurred while compiling the expression. */
510 re_free (preg->fastmap);
511 preg->fastmap = NULL;
514 return (int) ret;
516 #ifdef _LIBC
517 weak_alias (__regcomp, regcomp)
518 #endif
520 /* Returns a message corresponding to an error code, ERRCODE, returned
521 from either regcomp or regexec. We don't use PREG here. */
523 size_t
524 regerror (errcode, preg, errbuf, errbuf_size)
525 int errcode;
526 const regex_t *preg;
527 char *errbuf;
528 size_t errbuf_size;
530 const char *msg;
531 size_t msg_size;
533 if (BE (errcode < 0
534 || errcode >= (int) (sizeof (__re_error_msgid_idx)
535 / sizeof (__re_error_msgid_idx[0])), 0))
536 /* Only error codes returned by the rest of the code should be passed
537 to this routine. If we are given anything else, or if other regex
538 code generates an invalid error code, then the program has a bug.
539 Dump core so we can fix it. */
540 abort ();
542 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
544 msg_size = strlen (msg) + 1; /* Includes the null. */
546 if (BE (errbuf_size != 0, 1))
548 if (BE (msg_size > errbuf_size, 0))
550 #if defined HAVE_MEMPCPY || defined _LIBC
551 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
552 #else
553 memcpy (errbuf, msg, errbuf_size - 1);
554 errbuf[errbuf_size - 1] = 0;
555 #endif
557 else
558 memcpy (errbuf, msg, msg_size);
561 return msg_size;
563 #ifdef _LIBC
564 weak_alias (__regerror, regerror)
565 #endif
568 static void
569 free_dfa_content (re_dfa_t *dfa)
571 int i, j;
573 re_free (dfa->subexps);
575 for (i = 0; i < dfa->nodes_len; ++i)
577 re_token_t *node = dfa->nodes + i;
578 #ifdef RE_ENABLE_I18N
579 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
580 free_charset (node->opr.mbcset);
581 else
582 #endif /* RE_ENABLE_I18N */
583 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
584 re_free (node->opr.sbcset);
586 re_free (dfa->nexts);
587 for (i = 0; i < dfa->nodes_len; ++i)
589 if (dfa->eclosures != NULL)
590 re_node_set_free (dfa->eclosures + i);
591 if (dfa->inveclosures != NULL)
592 re_node_set_free (dfa->inveclosures + i);
593 if (dfa->edests != NULL)
594 re_node_set_free (dfa->edests + i);
596 re_free (dfa->edests);
597 re_free (dfa->eclosures);
598 re_free (dfa->inveclosures);
599 re_free (dfa->nodes);
601 for (i = 0; i <= dfa->state_hash_mask; ++i)
603 struct re_state_table_entry *entry = dfa->state_table + i;
604 for (j = 0; j < entry->num; ++j)
606 re_dfastate_t *state = entry->array[j];
607 free_state (state);
609 re_free (entry->array);
611 re_free (dfa->state_table);
613 if (dfa->word_char != NULL)
614 re_free (dfa->word_char);
615 #ifdef DEBUG
616 re_free (dfa->re_str);
617 #endif
619 re_free (dfa);
623 /* Free dynamically allocated space used by PREG. */
625 void
626 regfree (preg)
627 regex_t *preg;
629 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
630 if (BE (dfa != NULL, 1))
631 free_dfa_content (dfa);
633 re_free (preg->fastmap);
635 #ifdef _LIBC
636 weak_alias (__regfree, regfree)
637 #endif
639 /* Entry points compatible with 4.2 BSD regex library. We don't define
640 them unless specifically requested. */
642 #if defined _REGEX_RE_COMP || defined _LIBC
644 /* BSD has one and only one pattern buffer. */
645 static struct re_pattern_buffer re_comp_buf;
647 char *
648 # ifdef _LIBC
649 /* Make these definitions weak in libc, so POSIX programs can redefine
650 these names if they don't use our functions, and still use
651 regcomp/regexec above without link errors. */
652 weak_function
653 # endif
654 re_comp (s)
655 const char *s;
657 reg_errcode_t ret;
658 char *fastmap;
660 if (!s)
662 if (!re_comp_buf.buffer)
663 return gettext ("No previous regular expression");
664 return 0;
667 if (re_comp_buf.buffer)
669 fastmap = re_comp_buf.fastmap;
670 re_comp_buf.fastmap = NULL;
671 __regfree (&re_comp_buf);
672 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
673 re_comp_buf.fastmap = fastmap;
676 if (re_comp_buf.fastmap == NULL)
678 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
679 if (re_comp_buf.fastmap == NULL)
680 return (char *) gettext (__re_error_msgid
681 + __re_error_msgid_idx[(int) REG_ESPACE]);
684 /* Since `re_exec' always passes NULL for the `regs' argument, we
685 don't need to initialize the pattern buffer fields which affect it. */
687 /* Match anchors at newlines. */
688 re_comp_buf.newline_anchor = 1;
690 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
692 if (!ret)
693 return NULL;
695 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
696 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
699 #ifdef _LIBC
700 libc_freeres_fn (free_mem)
702 __regfree (&re_comp_buf);
704 #endif
706 #endif /* _REGEX_RE_COMP */
708 /* Internal entry point.
709 Compile the regular expression PATTERN, whose length is LENGTH.
710 SYNTAX indicate regular expression's syntax. */
712 static reg_errcode_t
713 re_compile_internal (preg, pattern, length, syntax)
714 regex_t *preg;
715 const char * pattern;
716 int length;
717 reg_syntax_t syntax;
719 reg_errcode_t err = REG_NOERROR;
720 re_dfa_t *dfa;
721 re_string_t regexp;
723 /* Initialize the pattern buffer. */
724 preg->fastmap_accurate = 0;
725 preg->syntax = syntax;
726 preg->not_bol = preg->not_eol = 0;
727 preg->used = 0;
728 preg->re_nsub = 0;
729 preg->can_be_null = 0;
730 preg->regs_allocated = REGS_UNALLOCATED;
732 /* Initialize the dfa. */
733 dfa = (re_dfa_t *) preg->buffer;
734 if (preg->allocated < sizeof (re_dfa_t))
736 /* If zero allocated, but buffer is non-null, try to realloc
737 enough space. This loses if buffer's address is bogus, but
738 that is the user's responsibility. If ->buffer is NULL this
739 is a simple allocation. */
740 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
741 if (dfa == NULL)
742 return REG_ESPACE;
743 preg->allocated = sizeof (re_dfa_t);
745 preg->buffer = (unsigned char *) dfa;
746 preg->used = sizeof (re_dfa_t);
748 err = init_dfa (dfa, length);
749 if (BE (err != REG_NOERROR, 0))
751 re_free (dfa);
752 preg->buffer = NULL;
753 return err;
755 #ifdef DEBUG
756 dfa->re_str = re_malloc (char, length + 1);
757 strncpy (dfa->re_str, pattern, length + 1);
758 #endif
760 err = re_string_construct (&regexp, pattern, length, preg->translate,
761 syntax & RE_ICASE);
762 if (BE (err != REG_NOERROR, 0))
764 re_free (dfa);
765 preg->buffer = NULL;
766 return err;
769 /* Parse the regular expression, and build a structure tree. */
770 preg->re_nsub = 0;
771 dfa->str_tree = parse (&regexp, preg, syntax, &err);
772 if (BE (dfa->str_tree == NULL, 0))
773 goto re_compile_internal_free_return;
775 /* Analyze the tree and collect information which is necessary to
776 create the dfa. */
777 err = analyze (dfa);
778 if (BE (err != REG_NOERROR, 0))
779 goto re_compile_internal_free_return;
781 /* Then create the initial state of the dfa. */
782 err = create_initial_state (dfa);
784 /* Release work areas. */
785 free_workarea_compile (preg);
786 re_string_destruct (&regexp);
788 if (BE (err != REG_NOERROR, 0))
790 re_compile_internal_free_return:
791 free_dfa_content (dfa);
792 preg->buffer = NULL;
795 return err;
798 /* Initialize DFA. We use the length of the regular expression PAT_LEN
799 as the initial length of some arrays. */
801 static reg_errcode_t
802 init_dfa (dfa, pat_len)
803 re_dfa_t *dfa;
804 int pat_len;
806 int table_size;
808 memset (dfa, '\0', sizeof (re_dfa_t));
810 dfa->nodes_alloc = pat_len + 1;
811 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
813 dfa->states_alloc = pat_len + 1;
815 /* table_size = 2 ^ ceil(log pat_len) */
816 for (table_size = 1; table_size > 0; table_size <<= 1)
817 if (table_size > pat_len)
818 break;
820 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
821 dfa->state_hash_mask = table_size - 1;
823 dfa->subexps_alloc = 1;
824 dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
825 dfa->word_char = NULL;
827 if (BE (dfa->nodes == NULL || dfa->state_table == NULL
828 || dfa->subexps == NULL, 0))
830 /* We don't bother to free anything which was allocated. Very
831 soon the process will go down anyway. */
832 dfa->subexps = NULL;
833 dfa->state_table = NULL;
834 dfa->nodes = NULL;
835 return REG_ESPACE;
837 return REG_NOERROR;
840 /* Initialize WORD_CHAR table, which indicate which character is
841 "word". In this case "word" means that it is the word construction
842 character used by some operators like "\<", "\>", etc. */
844 static reg_errcode_t
845 init_word_char (dfa)
846 re_dfa_t *dfa;
848 int i, j, ch;
849 dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
850 if (BE (dfa->word_char == NULL, 0))
851 return REG_ESPACE;
852 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
853 for (j = 0; j < UINT_BITS; ++j, ++ch)
854 if (isalnum (ch) || ch == '_')
855 dfa->word_char[i] |= 1 << j;
856 return REG_NOERROR;
859 /* Free the work area which are only used while compiling. */
861 static void
862 free_workarea_compile (preg)
863 regex_t *preg;
865 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
866 free_bin_tree (dfa->str_tree);
867 dfa->str_tree = NULL;
870 /* Create initial states for all contexts. */
872 static reg_errcode_t
873 create_initial_state (dfa)
874 re_dfa_t *dfa;
876 int first, i;
877 reg_errcode_t err;
878 re_node_set init_nodes;
880 /* Initial states have the epsilon closure of the node which is
881 the first node of the regular expression. */
882 first = dfa->str_tree->first;
883 dfa->init_node = first;
884 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
885 if (BE (err != REG_NOERROR, 0))
886 return err;
888 /* The back-references which are in initial states can epsilon transit,
889 since in this case all of the subexpressions can be null.
890 Then we add epsilon closures of the nodes which are the next nodes of
891 the back-references. */
892 if (dfa->nbackref > 0)
893 for (i = 0; i < init_nodes.nelem; ++i)
895 int node_idx = init_nodes.elems[i];
896 re_token_type_t type = dfa->nodes[node_idx].type;
898 int clexp_idx;
899 if (type != OP_BACK_REF)
900 continue;
901 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
903 re_token_t *clexp_node;
904 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
905 if (clexp_node->type == OP_CLOSE_SUBEXP
906 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
907 break;
909 if (clexp_idx == init_nodes.nelem)
910 continue;
912 if (type == OP_BACK_REF)
914 int dest_idx = dfa->edests[node_idx].elems[0];
915 if (!re_node_set_contains (&init_nodes, dest_idx))
917 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
918 i = 0;
923 /* It must be the first time to invoke acquire_state. */
924 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
925 /* We don't check ERR here, since the initial state must not be NULL. */
926 if (BE (dfa->init_state == NULL, 0))
927 return err;
928 if (dfa->init_state->has_constraint)
930 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
931 CONTEXT_WORD);
932 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
933 CONTEXT_NEWLINE);
934 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
935 &init_nodes,
936 CONTEXT_NEWLINE
937 | CONTEXT_BEGBUF);
938 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
939 || dfa->init_state_begbuf == NULL, 0))
940 return err;
942 else
943 dfa->init_state_word = dfa->init_state_nl
944 = dfa->init_state_begbuf = dfa->init_state;
946 re_node_set_free (&init_nodes);
947 return REG_NOERROR;
950 /* Analyze the structure tree, and calculate "first", "next", "edest",
951 "eclosure", and "inveclosure". */
953 static reg_errcode_t
954 analyze (dfa)
955 re_dfa_t *dfa;
957 int i;
958 reg_errcode_t ret;
960 /* Allocate arrays. */
961 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
962 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
963 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
964 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
965 if (BE (dfa->nexts == NULL || dfa->edests == NULL
966 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
967 return REG_ESPACE;
968 /* Initialize them. */
969 for (i = 0; i < dfa->nodes_len; ++i)
971 dfa->nexts[i] = -1;
972 re_node_set_init_empty (dfa->edests + i);
973 re_node_set_init_empty (dfa->eclosures + i);
974 re_node_set_init_empty (dfa->inveclosures + i);
977 ret = analyze_tree (dfa, dfa->str_tree);
978 if (BE (ret == REG_NOERROR, 1))
980 ret = calc_eclosure (dfa);
981 if (ret == REG_NOERROR)
982 calc_inveclosure (dfa);
984 return ret;
987 /* Helper functions for analyze.
988 This function calculate "first", "next", and "edest" for the subtree
989 whose root is NODE. */
991 static reg_errcode_t
992 analyze_tree (dfa, node)
993 re_dfa_t *dfa;
994 bin_tree_t *node;
996 reg_errcode_t ret;
997 if (node->first == -1)
998 calc_first (dfa, node);
999 if (node->next == -1)
1000 calc_next (dfa, node);
1001 if (node->eclosure.nelem == 0)
1002 calc_epsdest (dfa, node);
1003 /* Calculate "first" etc. for the left child. */
1004 if (node->left != NULL)
1006 ret = analyze_tree (dfa, node->left);
1007 if (BE (ret != REG_NOERROR, 0))
1008 return ret;
1010 /* Calculate "first" etc. for the right child. */
1011 if (node->right != NULL)
1013 ret = analyze_tree (dfa, node->right);
1014 if (BE (ret != REG_NOERROR, 0))
1015 return ret;
1017 return REG_NOERROR;
1020 /* Calculate "first" for the node NODE. */
1021 static void
1022 calc_first (dfa, node)
1023 re_dfa_t *dfa;
1024 bin_tree_t *node;
1026 int idx, type;
1027 idx = node->node_idx;
1028 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1030 switch (type)
1032 #ifdef DEBUG
1033 case OP_OPEN_BRACKET:
1034 case OP_CLOSE_BRACKET:
1035 case OP_OPEN_DUP_NUM:
1036 case OP_CLOSE_DUP_NUM:
1037 case OP_NON_MATCH_LIST:
1038 case OP_OPEN_COLL_ELEM:
1039 case OP_CLOSE_COLL_ELEM:
1040 case OP_OPEN_EQUIV_CLASS:
1041 case OP_CLOSE_EQUIV_CLASS:
1042 case OP_OPEN_CHAR_CLASS:
1043 case OP_CLOSE_CHAR_CLASS:
1044 /* These must not be appeared here. */
1045 assert (0);
1046 #endif
1047 case END_OF_RE:
1048 case CHARACTER:
1049 case OP_PERIOD:
1050 case OP_DUP_ASTERISK:
1051 case OP_DUP_QUESTION:
1052 #ifdef RE_ENABLE_I18N
1053 case COMPLEX_BRACKET:
1054 #endif /* RE_ENABLE_I18N */
1055 case SIMPLE_BRACKET:
1056 case OP_BACK_REF:
1057 case ANCHOR:
1058 case OP_OPEN_SUBEXP:
1059 case OP_CLOSE_SUBEXP:
1060 node->first = idx;
1061 break;
1062 case OP_DUP_PLUS:
1063 #ifdef DEBUG
1064 assert (node->left != NULL);
1065 #endif
1066 if (node->left->first == -1)
1067 calc_first (dfa, node->left);
1068 node->first = node->left->first;
1069 break;
1070 case OP_ALT:
1071 node->first = idx;
1072 break;
1073 /* else fall through */
1074 default:
1075 #ifdef DEBUG
1076 assert (node->left != NULL);
1077 #endif
1078 if (node->left->first == -1)
1079 calc_first (dfa, node->left);
1080 node->first = node->left->first;
1081 break;
1085 /* Calculate "next" for the node NODE. */
1087 static void
1088 calc_next (dfa, node)
1089 re_dfa_t *dfa;
1090 bin_tree_t *node;
1092 int idx, type;
1093 bin_tree_t *parent = node->parent;
1094 if (parent == NULL)
1096 node->next = -1;
1097 idx = node->node_idx;
1098 if (node->type == 0)
1099 dfa->nexts[idx] = node->next;
1100 return;
1103 idx = parent->node_idx;
1104 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1106 switch (type)
1108 case OP_DUP_ASTERISK:
1109 case OP_DUP_PLUS:
1110 node->next = idx;
1111 break;
1112 case CONCAT:
1113 if (parent->left == node)
1115 if (parent->right->first == -1)
1116 calc_first (dfa, parent->right);
1117 node->next = parent->right->first;
1118 break;
1120 /* else fall through */
1121 default:
1122 if (parent->next == -1)
1123 calc_next (dfa, parent);
1124 node->next = parent->next;
1125 break;
1127 idx = node->node_idx;
1128 if (node->type == 0)
1129 dfa->nexts[idx] = node->next;
1132 /* Calculate "edest" for the node NODE. */
1134 static void
1135 calc_epsdest (dfa, node)
1136 re_dfa_t *dfa;
1137 bin_tree_t *node;
1139 int idx;
1140 idx = node->node_idx;
1141 if (node->type == 0)
1143 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1144 || dfa->nodes[idx].type == OP_DUP_PLUS
1145 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1147 if (node->left->first == -1)
1148 calc_first (dfa, node->left);
1149 if (node->next == -1)
1150 calc_next (dfa, node);
1151 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1152 node->next);
1154 else if (dfa->nodes[idx].type == OP_ALT)
1156 int left, right;
1157 if (node->left != NULL)
1159 if (node->left->first == -1)
1160 calc_first (dfa, node->left);
1161 left = node->left->first;
1163 else
1165 if (node->next == -1)
1166 calc_next (dfa, node);
1167 left = node->next;
1169 if (node->right != NULL)
1171 if (node->right->first == -1)
1172 calc_first (dfa, node->right);
1173 right = node->right->first;
1175 else
1177 if (node->next == -1)
1178 calc_next (dfa, node);
1179 right = node->next;
1181 re_node_set_init_2 (dfa->edests + idx, left, right);
1183 else if (dfa->nodes[idx].type == ANCHOR
1184 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1185 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1186 || dfa->nodes[idx].type == OP_BACK_REF)
1187 re_node_set_init_1 (dfa->edests + idx, node->next);
1191 /* Duplicate the epsilon closure of the node ROOT_NODE.
1192 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1193 to their own constraint. */
1195 static reg_errcode_t
1196 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1197 init_constraint)
1198 re_dfa_t *dfa;
1199 int top_org_node, top_clone_node, root_node;
1200 unsigned int init_constraint;
1202 reg_errcode_t err;
1203 int org_node, clone_node, ret;
1204 unsigned int constraint = init_constraint;
1205 for (org_node = top_org_node, clone_node = top_clone_node;;)
1207 int org_dest, clone_dest;
1208 if (dfa->nodes[org_node].type == OP_BACK_REF)
1210 /* If the back reference epsilon-transit, its destination must
1211 also have the constraint. Then duplicate the epsilon closure
1212 of the destination of the back reference, and store it in
1213 edests of the back reference. */
1214 org_dest = dfa->nexts[org_node];
1215 re_node_set_empty (dfa->edests + clone_node);
1216 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1217 if (BE (err != REG_NOERROR, 0))
1218 return err;
1219 dfa->nexts[clone_node] = dfa->nexts[org_node];
1220 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1221 if (BE (ret < 0, 0))
1222 return REG_ESPACE;
1224 else if (dfa->edests[org_node].nelem == 0)
1226 /* In case of the node can't epsilon-transit, don't duplicate the
1227 destination and store the original destination as the
1228 destination of the node. */
1229 dfa->nexts[clone_node] = dfa->nexts[org_node];
1230 break;
1232 else if (dfa->edests[org_node].nelem == 1)
1234 /* In case of the node can epsilon-transit, and it has only one
1235 destination. */
1236 org_dest = dfa->edests[org_node].elems[0];
1237 re_node_set_empty (dfa->edests + clone_node);
1238 if (dfa->nodes[org_node].type == ANCHOR)
1240 /* In case of the node has another constraint, append it. */
1241 if (org_node == root_node && clone_node != org_node)
1243 /* ...but if the node is root_node itself, it means the
1244 epsilon closure have a loop, then tie it to the
1245 destination of the root_node. */
1246 ret = re_node_set_insert (dfa->edests + clone_node,
1247 org_dest);
1248 if (BE (ret < 0, 0))
1249 return REG_ESPACE;
1250 break;
1252 constraint |= dfa->nodes[org_node].opr.ctx_type;
1254 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1255 if (BE (err != REG_NOERROR, 0))
1256 return err;
1257 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1258 if (BE (ret < 0, 0))
1259 return REG_ESPACE;
1261 else /* dfa->edests[org_node].nelem == 2 */
1263 /* In case of the node can epsilon-transit, and it has two
1264 destinations. */
1265 org_dest = dfa->edests[org_node].elems[0];
1266 re_node_set_empty (dfa->edests + clone_node);
1267 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1268 if (BE (err != REG_NOERROR, 0))
1269 return err;
1270 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1271 if (BE (ret < 0, 0))
1272 return REG_ESPACE;
1274 err = duplicate_node_closure (dfa, org_dest, clone_dest, root_node,
1275 constraint);
1276 if (BE (err != REG_NOERROR, 0))
1277 return err;
1279 org_dest = dfa->edests[org_node].elems[1];
1280 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1281 if (BE (err != REG_NOERROR, 0))
1282 return err;
1283 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1284 if (BE (ret < 0, 0))
1285 return REG_ESPACE;
1287 org_node = org_dest;
1288 clone_node = clone_dest;
1290 return REG_NOERROR;
1293 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1294 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1295 otherwise return the error code. */
1297 static reg_errcode_t
1298 duplicate_node (new_idx, dfa, org_idx, constraint)
1299 re_dfa_t *dfa;
1300 int *new_idx, org_idx;
1301 unsigned int constraint;
1303 re_token_t dup;
1304 int dup_idx;
1306 dup = dfa->nodes[org_idx];
1307 dup_idx = re_dfa_add_node (dfa, dup, 1);
1308 if (BE (dup_idx == -1, 0))
1309 return REG_ESPACE;
1310 dfa->nodes[dup_idx].constraint = constraint;
1311 if (dfa->nodes[org_idx].type == ANCHOR)
1312 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1313 dfa->nodes[dup_idx].duplicated = 1;
1314 re_node_set_init_empty (dfa->edests + dup_idx);
1315 re_node_set_init_empty (dfa->eclosures + dup_idx);
1316 re_node_set_init_empty (dfa->inveclosures + dup_idx);
1318 *new_idx = dup_idx;
1319 return REG_NOERROR;
1322 static void
1323 calc_inveclosure (dfa)
1324 re_dfa_t *dfa;
1326 int src, idx, dest;
1327 for (src = 0; src < dfa->nodes_len; ++src)
1329 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1331 dest = dfa->eclosures[src].elems[idx];
1332 re_node_set_insert (dfa->inveclosures + dest, src);
1337 /* Calculate "eclosure" for all the node in DFA. */
1339 static reg_errcode_t
1340 calc_eclosure (dfa)
1341 re_dfa_t *dfa;
1343 int node_idx, incomplete;
1344 #ifdef DEBUG
1345 assert (dfa->nodes_len > 0);
1346 #endif
1347 incomplete = 0;
1348 /* For each nodes, calculate epsilon closure. */
1349 for (node_idx = 0; ; ++node_idx)
1351 reg_errcode_t err;
1352 re_node_set eclosure_elem;
1353 if (node_idx == dfa->nodes_len)
1355 if (!incomplete)
1356 break;
1357 incomplete = 0;
1358 node_idx = 0;
1361 #ifdef DEBUG
1362 assert (dfa->eclosures[node_idx].nelem != -1);
1363 #endif
1364 /* If we have already calculated, skip it. */
1365 if (dfa->eclosures[node_idx].nelem != 0)
1366 continue;
1367 /* Calculate epsilon closure of `node_idx'. */
1368 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1369 if (BE (err != REG_NOERROR, 0))
1370 return err;
1372 if (dfa->eclosures[node_idx].nelem == 0)
1374 incomplete = 1;
1375 re_node_set_free (&eclosure_elem);
1378 return REG_NOERROR;
1381 /* Calculate epsilon closure of NODE. */
1383 static reg_errcode_t
1384 calc_eclosure_iter (new_set, dfa, node, root)
1385 re_node_set *new_set;
1386 re_dfa_t *dfa;
1387 int node, root;
1389 reg_errcode_t err;
1390 unsigned int constraint;
1391 int i, incomplete;
1392 re_node_set eclosure;
1393 incomplete = 0;
1394 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1395 if (BE (err != REG_NOERROR, 0))
1396 return err;
1398 /* This indicates that we are calculating this node now.
1399 We reference this value to avoid infinite loop. */
1400 dfa->eclosures[node].nelem = -1;
1402 constraint = ((dfa->nodes[node].type == ANCHOR)
1403 ? dfa->nodes[node].opr.ctx_type : 0);
1404 /* If the current node has constraints, duplicate all nodes.
1405 Since they must inherit the constraints. */
1406 if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1408 int org_node, cur_node;
1409 org_node = cur_node = node;
1410 err = duplicate_node_closure (dfa, node, node, node, constraint);
1411 if (BE (err != REG_NOERROR, 0))
1412 return err;
1415 /* Expand each epsilon destination nodes. */
1416 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1417 for (i = 0; i < dfa->edests[node].nelem; ++i)
1419 re_node_set eclosure_elem;
1420 int edest = dfa->edests[node].elems[i];
1421 /* If calculating the epsilon closure of `edest' is in progress,
1422 return intermediate result. */
1423 if (dfa->eclosures[edest].nelem == -1)
1425 incomplete = 1;
1426 continue;
1428 /* If we haven't calculated the epsilon closure of `edest' yet,
1429 calculate now. Otherwise use calculated epsilon closure. */
1430 if (dfa->eclosures[edest].nelem == 0)
1432 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1433 if (BE (err != REG_NOERROR, 0))
1434 return err;
1436 else
1437 eclosure_elem = dfa->eclosures[edest];
1438 /* Merge the epsilon closure of `edest'. */
1439 re_node_set_merge (&eclosure, &eclosure_elem);
1440 /* If the epsilon closure of `edest' is incomplete,
1441 the epsilon closure of this node is also incomplete. */
1442 if (dfa->eclosures[edest].nelem == 0)
1444 incomplete = 1;
1445 re_node_set_free (&eclosure_elem);
1449 /* Epsilon closures include itself. */
1450 re_node_set_insert (&eclosure, node);
1451 if (incomplete && !root)
1452 dfa->eclosures[node].nelem = 0;
1453 else
1454 dfa->eclosures[node] = eclosure;
1455 *new_set = eclosure;
1456 return REG_NOERROR;
1459 /* Functions for token which are used in the parser. */
1461 /* Fetch a token from INPUT.
1462 We must not use this function inside bracket expressions. */
1464 static re_token_t
1465 fetch_token (input, syntax)
1466 re_string_t *input;
1467 reg_syntax_t syntax;
1469 re_token_t token;
1470 int consumed_byte;
1471 consumed_byte = peek_token (&token, input, syntax);
1472 re_string_skip_bytes (input, consumed_byte);
1473 return token;
1476 /* Peek a token from INPUT, and return the length of the token.
1477 We must not use this function inside bracket expressions. */
1479 static int
1480 peek_token (token, input, syntax)
1481 re_token_t *token;
1482 re_string_t *input;
1483 reg_syntax_t syntax;
1485 unsigned char c;
1487 if (re_string_eoi (input))
1489 token->type = END_OF_RE;
1490 return 0;
1493 c = re_string_peek_byte (input, 0);
1494 token->opr.c = c;
1496 #ifdef RE_ENABLE_I18N
1497 token->mb_partial = 0;
1498 if (MB_CUR_MAX > 1 &&
1499 !re_string_first_byte (input, re_string_cur_idx (input)))
1501 token->type = CHARACTER;
1502 token->mb_partial = 1;
1503 return 1;
1505 #endif
1506 if (c == '\\')
1508 unsigned char c2;
1509 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1511 token->type = BACK_SLASH;
1512 return 1;
1515 c2 = re_string_peek_byte_case (input, 1);
1516 token->opr.c = c2;
1517 token->type = CHARACTER;
1518 switch (c2)
1520 case '|':
1521 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1522 token->type = OP_ALT;
1523 break;
1524 case '1': case '2': case '3': case '4': case '5':
1525 case '6': case '7': case '8': case '9':
1526 if (!(syntax & RE_NO_BK_REFS))
1528 token->type = OP_BACK_REF;
1529 token->opr.idx = c2 - '0';
1531 break;
1532 case '<':
1533 if (!(syntax & RE_NO_GNU_OPS))
1535 token->type = ANCHOR;
1536 token->opr.idx = WORD_FIRST;
1538 break;
1539 case '>':
1540 if (!(syntax & RE_NO_GNU_OPS))
1542 token->type = ANCHOR;
1543 token->opr.idx = WORD_LAST;
1545 break;
1546 case 'b':
1547 if (!(syntax & RE_NO_GNU_OPS))
1549 token->type = ANCHOR;
1550 token->opr.idx = WORD_DELIM;
1552 break;
1553 case 'B':
1554 if (!(syntax & RE_NO_GNU_OPS))
1556 token->type = ANCHOR;
1557 token->opr.idx = INSIDE_WORD;
1559 break;
1560 case 'w':
1561 if (!(syntax & RE_NO_GNU_OPS))
1562 token->type = OP_WORD;
1563 break;
1564 case 'W':
1565 if (!(syntax & RE_NO_GNU_OPS))
1566 token->type = OP_NOTWORD;
1567 break;
1568 case '`':
1569 if (!(syntax & RE_NO_GNU_OPS))
1571 token->type = ANCHOR;
1572 token->opr.idx = BUF_FIRST;
1574 break;
1575 case '\'':
1576 if (!(syntax & RE_NO_GNU_OPS))
1578 token->type = ANCHOR;
1579 token->opr.idx = BUF_LAST;
1581 break;
1582 case '(':
1583 if (!(syntax & RE_NO_BK_PARENS))
1584 token->type = OP_OPEN_SUBEXP;
1585 break;
1586 case ')':
1587 if (!(syntax & RE_NO_BK_PARENS))
1588 token->type = OP_CLOSE_SUBEXP;
1589 break;
1590 case '+':
1591 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1592 token->type = OP_DUP_PLUS;
1593 break;
1594 case '?':
1595 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1596 token->type = OP_DUP_QUESTION;
1597 break;
1598 case '{':
1599 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1600 token->type = OP_OPEN_DUP_NUM;
1601 break;
1602 case '}':
1603 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1604 token->type = OP_CLOSE_DUP_NUM;
1605 break;
1606 default:
1607 break;
1609 return 2;
1612 token->type = CHARACTER;
1613 switch (c)
1615 case '\n':
1616 if (syntax & RE_NEWLINE_ALT)
1617 token->type = OP_ALT;
1618 break;
1619 case '|':
1620 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1621 token->type = OP_ALT;
1622 break;
1623 case '*':
1624 token->type = OP_DUP_ASTERISK;
1625 break;
1626 case '+':
1627 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1628 token->type = OP_DUP_PLUS;
1629 break;
1630 case '?':
1631 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1632 token->type = OP_DUP_QUESTION;
1633 break;
1634 case '{':
1635 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1636 token->type = OP_OPEN_DUP_NUM;
1637 break;
1638 case '}':
1639 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1640 token->type = OP_CLOSE_DUP_NUM;
1641 break;
1642 case '(':
1643 if (syntax & RE_NO_BK_PARENS)
1644 token->type = OP_OPEN_SUBEXP;
1645 break;
1646 case ')':
1647 if (syntax & RE_NO_BK_PARENS)
1648 token->type = OP_CLOSE_SUBEXP;
1649 break;
1650 case '[':
1651 token->type = OP_OPEN_BRACKET;
1652 break;
1653 case '.':
1654 token->type = OP_PERIOD;
1655 break;
1656 case '^':
1657 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1658 re_string_cur_idx (input) != 0)
1660 char prev = re_string_peek_byte (input, -1);
1661 if (prev != '|' && prev != '(' &&
1662 (!(syntax & RE_NEWLINE_ALT) || prev != '\n'))
1663 break;
1665 token->type = ANCHOR;
1666 token->opr.idx = LINE_FIRST;
1667 break;
1668 case '$':
1669 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1670 re_string_cur_idx (input) + 1 != re_string_length (input))
1672 re_token_t next;
1673 re_string_skip_bytes (input, 1);
1674 peek_token (&next, input, syntax);
1675 re_string_skip_bytes (input, -1);
1676 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1677 break;
1679 token->type = ANCHOR;
1680 token->opr.idx = LINE_LAST;
1681 break;
1682 default:
1683 break;
1685 return 1;
1688 /* Peek a token from INPUT, and return the length of the token.
1689 We must not use this function out of bracket expressions. */
1691 static int
1692 peek_token_bracket (token, input, syntax)
1693 re_token_t *token;
1694 re_string_t *input;
1695 reg_syntax_t syntax;
1697 unsigned char c;
1698 if (re_string_eoi (input))
1700 token->type = END_OF_RE;
1701 return 0;
1703 c = re_string_peek_byte (input, 0);
1704 token->opr.c = c;
1706 #ifdef RE_ENABLE_I18N
1707 if (MB_CUR_MAX > 1 &&
1708 !re_string_first_byte (input, re_string_cur_idx (input)))
1710 token->type = CHARACTER;
1711 return 1;
1713 #endif /* RE_ENABLE_I18N */
1715 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1717 /* In this case, '\' escape a character. */
1718 unsigned char c2;
1719 re_string_skip_bytes (input, 1);
1720 c2 = re_string_peek_byte (input, 0);
1721 token->opr.c = c2;
1722 token->type = CHARACTER;
1723 return 1;
1725 if (c == '[') /* '[' is a special char in a bracket exps. */
1727 unsigned char c2;
1728 int token_len;
1729 c2 = re_string_peek_byte (input, 1);
1730 token->opr.c = c2;
1731 token_len = 2;
1732 switch (c2)
1734 case '.':
1735 token->type = OP_OPEN_COLL_ELEM;
1736 break;
1737 case '=':
1738 token->type = OP_OPEN_EQUIV_CLASS;
1739 break;
1740 case ':':
1741 if (syntax & RE_CHAR_CLASSES)
1743 token->type = OP_OPEN_CHAR_CLASS;
1744 break;
1746 /* else fall through. */
1747 default:
1748 token->type = CHARACTER;
1749 token->opr.c = c;
1750 token_len = 1;
1751 break;
1753 return token_len;
1755 switch (c)
1757 case '-':
1758 token->type = OP_CHARSET_RANGE;
1759 break;
1760 case ']':
1761 token->type = OP_CLOSE_BRACKET;
1762 break;
1763 case '^':
1764 token->type = OP_NON_MATCH_LIST;
1765 break;
1766 default:
1767 token->type = CHARACTER;
1769 return 1;
1772 /* Functions for parser. */
1774 /* Entry point of the parser.
1775 Parse the regular expression REGEXP and return the structure tree.
1776 If an error is occured, ERR is set by error code, and return NULL.
1777 This function build the following tree, from regular expression <reg_exp>:
1781 <reg_exp> EOR
1783 CAT means concatenation.
1784 EOR means end of regular expression. */
1786 static bin_tree_t *
1787 parse (regexp, preg, syntax, err)
1788 re_string_t *regexp;
1789 regex_t *preg;
1790 reg_syntax_t syntax;
1791 reg_errcode_t *err;
1793 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1794 bin_tree_t *tree, *eor, *root;
1795 re_token_t current_token;
1796 int new_idx;
1797 current_token = fetch_token (regexp, syntax);
1798 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
1799 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1800 return NULL;
1801 new_idx = re_dfa_add_node (dfa, current_token, 0);
1802 eor = create_tree (NULL, NULL, 0, new_idx);
1803 if (tree != NULL)
1804 root = create_tree (tree, eor, CONCAT, 0);
1805 else
1806 root = eor;
1807 if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1809 *err = REG_ESPACE;
1810 return NULL;
1812 return root;
1815 /* This function build the following tree, from regular expression
1816 <branch1>|<branch2>:
1820 <branch1> <branch2>
1822 ALT means alternative, which represents the operator `|'. */
1824 static bin_tree_t *
1825 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1826 re_string_t *regexp;
1827 regex_t *preg;
1828 re_token_t *token;
1829 reg_syntax_t syntax;
1830 int nest;
1831 reg_errcode_t *err;
1833 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1834 bin_tree_t *tree, *branch = NULL;
1835 int new_idx;
1836 tree = parse_branch (regexp, preg, token, syntax, nest, err);
1837 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1838 return NULL;
1840 while (token->type == OP_ALT)
1842 re_token_t alt_token = *token;
1843 new_idx = re_dfa_add_node (dfa, alt_token, 0);
1844 *token = fetch_token (regexp, syntax);
1845 if (token->type != OP_ALT && token->type != END_OF_RE
1846 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1848 branch = parse_branch (regexp, preg, token, syntax, nest, err);
1849 if (BE (*err != REG_NOERROR && branch == NULL, 0))
1851 free_bin_tree (tree);
1852 return NULL;
1855 else
1856 branch = NULL;
1857 tree = create_tree (tree, branch, 0, new_idx);
1858 if (BE (new_idx == -1 || tree == NULL, 0))
1860 *err = REG_ESPACE;
1861 return NULL;
1863 dfa->has_plural_match = 1;
1865 return tree;
1868 /* This function build the following tree, from regular expression
1869 <exp1><exp2>:
1873 <exp1> <exp2>
1875 CAT means concatenation. */
1877 static bin_tree_t *
1878 parse_branch (regexp, preg, token, syntax, nest, err)
1879 re_string_t *regexp;
1880 regex_t *preg;
1881 re_token_t *token;
1882 reg_syntax_t syntax;
1883 int nest;
1884 reg_errcode_t *err;
1886 bin_tree_t *tree, *exp;
1887 tree = parse_expression (regexp, preg, token, syntax, nest, err);
1888 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1889 return NULL;
1891 while (token->type != OP_ALT && token->type != END_OF_RE
1892 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1894 exp = parse_expression (regexp, preg, token, syntax, nest, err);
1895 if (BE (*err != REG_NOERROR && exp == NULL, 0))
1897 free_bin_tree (tree);
1898 return NULL;
1900 if (tree != NULL && exp != NULL)
1902 tree = create_tree (tree, exp, CONCAT, 0);
1903 if (tree == NULL)
1905 *err = REG_ESPACE;
1906 return NULL;
1909 else if (tree == NULL)
1910 tree = exp;
1911 /* Otherwise exp == NULL, we don't need to create new tree. */
1913 return tree;
1916 /* This function build the following tree, from regular expression a*:
1922 static bin_tree_t *
1923 parse_expression (regexp, preg, token, syntax, nest, err)
1924 re_string_t *regexp;
1925 regex_t *preg;
1926 re_token_t *token;
1927 reg_syntax_t syntax;
1928 int nest;
1929 reg_errcode_t *err;
1931 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1932 bin_tree_t *tree;
1933 int new_idx;
1934 switch (token->type)
1936 case CHARACTER:
1937 new_idx = re_dfa_add_node (dfa, *token, 0);
1938 tree = create_tree (NULL, NULL, 0, new_idx);
1939 if (BE (new_idx == -1 || tree == NULL, 0))
1941 *err = REG_ESPACE;
1942 return NULL;
1944 #ifdef RE_ENABLE_I18N
1945 if (MB_CUR_MAX > 1)
1947 while (!re_string_eoi (regexp)
1948 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
1950 bin_tree_t *mbc_remain;
1951 *token = fetch_token (regexp, syntax);
1952 new_idx = re_dfa_add_node (dfa, *token, 0);
1953 mbc_remain = create_tree (NULL, NULL, 0, new_idx);
1954 tree = create_tree (tree, mbc_remain, CONCAT, 0);
1955 if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
1956 return *err = REG_ESPACE, NULL;
1959 #endif
1960 break;
1961 case OP_OPEN_SUBEXP:
1962 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
1963 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1964 return NULL;
1965 break;
1966 case OP_OPEN_BRACKET:
1967 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
1968 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1969 return NULL;
1970 break;
1971 case OP_BACK_REF:
1972 if (BE (preg->re_nsub < token->opr.idx
1973 || dfa->subexps[token->opr.idx - 1].end == -1, 0))
1975 *err = REG_ESUBREG;
1976 return NULL;
1978 new_idx = re_dfa_add_node (dfa, *token, 0);
1979 tree = create_tree (NULL, NULL, 0, new_idx);
1980 if (BE (new_idx == -1 || tree == NULL, 0))
1982 *err = REG_ESPACE;
1983 return NULL;
1985 ++dfa->nbackref;
1986 dfa->has_mb_node = 1;
1987 break;
1988 case OP_DUP_ASTERISK:
1989 case OP_DUP_PLUS:
1990 case OP_DUP_QUESTION:
1991 case OP_OPEN_DUP_NUM:
1992 if (syntax & RE_CONTEXT_INVALID_OPS)
1994 *err = REG_BADRPT;
1995 return NULL;
1997 else if (syntax & RE_CONTEXT_INDEP_OPS)
1999 *token = fetch_token (regexp, syntax);
2000 return parse_expression (regexp, preg, token, syntax, nest, err);
2002 /* else fall through */
2003 case OP_CLOSE_SUBEXP:
2004 if ((token->type == OP_CLOSE_SUBEXP) &&
2005 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2007 *err = REG_ERPAREN;
2008 return NULL;
2010 /* else fall through */
2011 case OP_CLOSE_DUP_NUM:
2012 /* We treat it as a normal character. */
2014 /* Then we can these characters as normal characters. */
2015 token->type = CHARACTER;
2016 new_idx = re_dfa_add_node (dfa, *token, 0);
2017 tree = create_tree (NULL, NULL, 0, new_idx);
2018 if (BE (new_idx == -1 || tree == NULL, 0))
2020 *err = REG_ESPACE;
2021 return NULL;
2023 break;
2024 case ANCHOR:
2025 if (dfa->word_char == NULL)
2027 *err = init_word_char (dfa);
2028 if (BE (*err != REG_NOERROR, 0))
2029 return NULL;
2031 if (token->opr.ctx_type == WORD_DELIM)
2033 bin_tree_t *tree_first, *tree_last;
2034 int idx_first, idx_last;
2035 token->opr.ctx_type = WORD_FIRST;
2036 idx_first = re_dfa_add_node (dfa, *token, 0);
2037 tree_first = create_tree (NULL, NULL, 0, idx_first);
2038 token->opr.ctx_type = WORD_LAST;
2039 idx_last = re_dfa_add_node (dfa, *token, 0);
2040 tree_last = create_tree (NULL, NULL, 0, idx_last);
2041 token->type = OP_ALT;
2042 new_idx = re_dfa_add_node (dfa, *token, 0);
2043 tree = create_tree (tree_first, tree_last, 0, new_idx);
2044 if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2045 || tree_first == NULL || tree_last == NULL
2046 || tree == NULL, 0))
2048 *err = REG_ESPACE;
2049 return NULL;
2052 else
2054 new_idx = re_dfa_add_node (dfa, *token, 0);
2055 tree = create_tree (NULL, NULL, 0, new_idx);
2056 if (BE (new_idx == -1 || tree == NULL, 0))
2057 return *err = REG_ESPACE, NULL;
2059 /* We must return here, since ANCHORs can't be followed
2060 by repetition operators.
2061 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2062 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2063 *token = fetch_token (regexp, syntax);
2064 return tree;
2065 case OP_PERIOD:
2066 new_idx = re_dfa_add_node (dfa, *token, 0);
2067 tree = create_tree (NULL, NULL, 0, new_idx);
2068 if (BE (new_idx == -1 || tree == NULL, 0))
2070 *err = REG_ESPACE;
2071 return NULL;
2073 if (MB_CUR_MAX > 1)
2074 dfa->has_mb_node = 1;
2075 break;
2076 case OP_WORD:
2077 tree = build_word_op (dfa, 0, err);
2078 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2079 return NULL;
2080 break;
2081 case OP_NOTWORD:
2082 tree = build_word_op (dfa, 1, err);
2083 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2084 return NULL;
2085 break;
2086 case OP_ALT:
2087 case END_OF_RE:
2088 return NULL;
2089 case BACK_SLASH:
2090 *err = REG_EESCAPE;
2091 return NULL;
2092 default:
2093 /* Must not happen? */
2094 #ifdef DEBUG
2095 assert (0);
2096 #endif
2097 return NULL;
2099 *token = fetch_token (regexp, syntax);
2101 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2102 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2104 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2105 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2106 return NULL;
2107 dfa->has_plural_match = 1;
2110 return tree;
2113 /* This function build the following tree, from regular expression
2114 (<reg_exp>):
2115 SUBEXP
2117 <reg_exp>
2120 static bin_tree_t *
2121 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2122 re_string_t *regexp;
2123 regex_t *preg;
2124 re_token_t *token;
2125 reg_syntax_t syntax;
2126 int nest;
2127 reg_errcode_t *err;
2129 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2130 bin_tree_t *tree, *left_par, *right_par;
2131 size_t cur_nsub;
2132 int new_idx;
2133 cur_nsub = preg->re_nsub++;
2134 if (dfa->subexps_alloc < preg->re_nsub)
2136 re_subexp_t *new_array;
2137 dfa->subexps_alloc *= 2;
2138 new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2139 if (BE (new_array == NULL, 0))
2141 dfa->subexps_alloc /= 2;
2142 *err = REG_ESPACE;
2143 return NULL;
2145 dfa->subexps = new_array;
2147 dfa->subexps[cur_nsub].start = dfa->nodes_len;
2148 dfa->subexps[cur_nsub].end = -1;
2150 new_idx = re_dfa_add_node (dfa, *token, 0);
2151 left_par = create_tree (NULL, NULL, 0, new_idx);
2152 if (BE (new_idx == -1 || left_par == NULL, 0))
2154 *err = REG_ESPACE;
2155 return NULL;
2157 dfa->nodes[new_idx].opr.idx = cur_nsub;
2158 *token = fetch_token (regexp, syntax);
2160 /* The subexpression may be a null string. */
2161 if (token->type == OP_CLOSE_SUBEXP)
2162 tree = NULL;
2163 else
2165 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2166 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2167 return NULL;
2169 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2171 free_bin_tree (tree);
2172 *err = REG_BADPAT;
2173 return NULL;
2175 new_idx = re_dfa_add_node (dfa, *token, 0);
2176 dfa->subexps[cur_nsub].end = dfa->nodes_len;
2177 right_par = create_tree (NULL, NULL, 0, new_idx);
2178 tree = ((tree == NULL) ? right_par
2179 : create_tree (tree, right_par, CONCAT, 0));
2180 tree = create_tree (left_par, tree, CONCAT, 0);
2181 if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2183 *err = REG_ESPACE;
2184 return NULL;
2186 dfa->nodes[new_idx].opr.idx = cur_nsub;
2188 return tree;
2191 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2193 static bin_tree_t *
2194 parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2195 bin_tree_t *dup_elem;
2196 re_string_t *regexp;
2197 re_dfa_t *dfa;
2198 re_token_t *token;
2199 reg_syntax_t syntax;
2200 reg_errcode_t *err;
2202 re_token_t dup_token;
2203 bin_tree_t *tree = dup_elem, *work_tree;
2204 int new_idx, start_idx = re_string_cur_idx (regexp);
2205 re_token_t start_token = *token;
2206 if (token->type == OP_OPEN_DUP_NUM)
2208 int i;
2209 int end = 0;
2210 int start = fetch_number (regexp, token, syntax);
2211 bin_tree_t *elem;
2212 if (start == -1)
2214 if (token->type == CHARACTER && token->opr.c == ',')
2215 start = 0; /* We treat "{,m}" as "{0,m}". */
2216 else
2218 *err = REG_BADBR; /* <re>{} is invalid. */
2219 return NULL;
2222 if (BE (start != -2, 1))
2224 /* We treat "{n}" as "{n,n}". */
2225 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2226 : ((token->type == CHARACTER && token->opr.c == ',')
2227 ? fetch_number (regexp, token, syntax) : -2));
2229 if (BE (start == -2 || end == -2, 0))
2231 /* Invalid sequence. */
2232 if (token->type == OP_CLOSE_DUP_NUM)
2233 goto parse_dup_op_invalid_interval;
2234 else
2235 goto parse_dup_op_ebrace;
2237 if (BE (start == 0 && end == 0, 0))
2239 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2240 *token = fetch_token (regexp, syntax);
2241 free_bin_tree (dup_elem);
2242 return NULL;
2245 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2246 elem = tree;
2247 for (i = 0; i < start; ++i)
2248 if (i != 0)
2250 work_tree = duplicate_tree (elem, dfa);
2251 tree = create_tree (tree, work_tree, CONCAT, 0);
2252 if (BE (work_tree == NULL || tree == NULL, 0))
2253 goto parse_dup_op_espace;
2256 if (end == -1)
2258 /* We treat "<re>{0,}" as "<re>*". */
2259 dup_token.type = OP_DUP_ASTERISK;
2260 if (start > 0)
2262 elem = duplicate_tree (elem, dfa);
2263 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2264 work_tree = create_tree (elem, NULL, 0, new_idx);
2265 tree = create_tree (tree, work_tree, CONCAT, 0);
2266 if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2267 || tree == NULL, 0))
2268 goto parse_dup_op_espace;
2270 else
2272 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2273 tree = create_tree (elem, NULL, 0, new_idx);
2274 if (BE (new_idx == -1 || tree == NULL, 0))
2275 goto parse_dup_op_espace;
2278 else if (end - start > 0)
2280 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2281 dup_token.type = OP_DUP_QUESTION;
2282 if (start > 0)
2284 elem = duplicate_tree (elem, dfa);
2285 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2286 elem = create_tree (elem, NULL, 0, new_idx);
2287 tree = create_tree (tree, elem, CONCAT, 0);
2288 if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2289 goto parse_dup_op_espace;
2291 else
2293 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2294 tree = elem = create_tree (elem, NULL, 0, new_idx);
2295 if (BE (new_idx == -1 || tree == NULL, 0))
2296 goto parse_dup_op_espace;
2298 for (i = 1; i < end - start; ++i)
2300 work_tree = duplicate_tree (elem, dfa);
2301 tree = create_tree (tree, work_tree, CONCAT, 0);
2302 if (BE (work_tree == NULL || tree == NULL, 0))
2304 *err = REG_ESPACE;
2305 return NULL;
2310 else
2312 new_idx = re_dfa_add_node (dfa, *token, 0);
2313 tree = create_tree (tree, NULL, 0, new_idx);
2314 if (BE (new_idx == -1 || tree == NULL, 0))
2316 *err = REG_ESPACE;
2317 return NULL;
2320 *token = fetch_token (regexp, syntax);
2321 return tree;
2323 parse_dup_op_espace:
2324 free_bin_tree (tree);
2325 *err = REG_ESPACE;
2326 return NULL;
2328 parse_dup_op_ebrace:
2329 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2331 *err = REG_EBRACE;
2332 return NULL;
2334 goto parse_dup_op_rollback;
2335 parse_dup_op_invalid_interval:
2336 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2338 *err = REG_BADBR;
2339 return NULL;
2341 parse_dup_op_rollback:
2342 re_string_set_index (regexp, start_idx);
2343 *token = start_token;
2344 token->type = CHARACTER;
2345 return dup_elem;
2348 /* Size of the names for collating symbol/equivalence_class/character_class.
2349 I'm not sure, but maybe enough. */
2350 #define BRACKET_NAME_BUF_SIZE 32
2352 #ifndef _LIBC
2353 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2354 Build the range expression which starts from START_ELEM, and ends
2355 at END_ELEM. The result are written to MBCSET and SBCSET.
2356 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2357 mbcset->range_ends, is a pointer argument sinse we may
2358 update it. */
2360 static reg_errcode_t
2361 # ifdef RE_ENABLE_I18N
2362 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2363 re_charset_t *mbcset;
2364 int *range_alloc;
2365 # else /* not RE_ENABLE_I18N */
2366 build_range_exp (sbcset, start_elem, end_elem)
2367 # endif /* not RE_ENABLE_I18N */
2368 re_bitset_ptr_t sbcset;
2369 bracket_elem_t *start_elem, *end_elem;
2371 unsigned int start_ch, end_ch;
2372 /* Equivalence Classes and Character Classes can't be a range start/end. */
2373 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2374 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2376 return REG_ERANGE;
2378 /* We can handle no multi character collating elements without libc
2379 support. */
2380 if (BE ((start_elem->type == COLL_SYM
2381 && strlen ((char *) start_elem->opr.name) > 1)
2382 || (end_elem->type == COLL_SYM
2383 && strlen ((char *) end_elem->opr.name) > 1), 0))
2384 return REG_ECOLLATE;
2386 # ifdef RE_ENABLE_I18N
2388 wchar_t wc, start_wc, end_wc;
2389 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2391 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2392 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2393 : 0));
2394 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2395 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2396 : 0));
2397 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2398 ? __btowc (start_ch) : start_elem->opr.wch);
2399 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2400 ? __btowc (end_ch) : end_elem->opr.wch);
2401 cmp_buf[0] = start_wc;
2402 cmp_buf[4] = end_wc;
2403 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2404 return REG_ERANGE;
2406 /* Check the space of the arrays. */
2407 if (*range_alloc == mbcset->nranges)
2409 /* There are not enough space, need realloc. */
2410 wchar_t *new_array_start, *new_array_end;
2411 int new_nranges;
2413 /* +1 in case of mbcset->nranges is 0. */
2414 new_nranges = 2 * mbcset->nranges + 1;
2415 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2416 are NULL if *range_alloc == 0. */
2417 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2418 new_nranges);
2419 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2420 new_nranges);
2422 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2423 return REG_ESPACE;
2425 mbcset->range_starts = new_array_start;
2426 mbcset->range_ends = new_array_end;
2427 *range_alloc = new_nranges;
2430 mbcset->range_starts[mbcset->nranges] = start_wc;
2431 mbcset->range_ends[mbcset->nranges++] = end_wc;
2433 /* Build the table for single byte characters. */
2434 for (wc = 0; wc <= SBC_MAX; ++wc)
2436 cmp_buf[2] = wc;
2437 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2438 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2439 bitset_set (sbcset, wc);
2442 # else /* not RE_ENABLE_I18N */
2444 unsigned int ch;
2445 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2446 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2447 : 0));
2448 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2449 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2450 : 0));
2451 if (start_ch > end_ch)
2452 return REG_ERANGE;
2453 /* Build the table for single byte characters. */
2454 for (ch = 0; ch <= SBC_MAX; ++ch)
2455 if (start_ch <= ch && ch <= end_ch)
2456 bitset_set (sbcset, ch);
2458 # endif /* not RE_ENABLE_I18N */
2459 return REG_NOERROR;
2461 #endif /* not _LIBC */
2463 #ifndef _LIBC
2464 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2465 Build the collating element which is represented by NAME.
2466 The result are written to MBCSET and SBCSET.
2467 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2468 pointer argument since we may update it. */
2470 static reg_errcode_t
2471 # ifdef RE_ENABLE_I18N
2472 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2473 re_charset_t *mbcset;
2474 int *coll_sym_alloc;
2475 # else /* not RE_ENABLE_I18N */
2476 build_collating_symbol (sbcset, name)
2477 # endif /* not RE_ENABLE_I18N */
2478 re_bitset_ptr_t sbcset;
2479 const unsigned char *name;
2481 size_t name_len = strlen ((const char *) name);
2482 if (BE (name_len != 1, 0))
2483 return REG_ECOLLATE;
2484 else
2486 bitset_set (sbcset, name[0]);
2487 return REG_NOERROR;
2490 #endif /* not _LIBC */
2492 /* This function parse bracket expression like "[abc]", "[a-c]",
2493 "[[.a-a.]]" etc. */
2495 static bin_tree_t *
2496 parse_bracket_exp (regexp, dfa, token, syntax, err)
2497 re_string_t *regexp;
2498 re_dfa_t *dfa;
2499 re_token_t *token;
2500 reg_syntax_t syntax;
2501 reg_errcode_t *err;
2503 #ifdef _LIBC
2504 const unsigned char *collseqmb;
2505 const char *collseqwc;
2506 uint32_t nrules;
2507 int32_t table_size;
2508 const int32_t *symb_table;
2509 const unsigned char *extra;
2511 /* Local function for parse_bracket_exp used in _LIBC environement.
2512 Seek the collating symbol entry correspondings to NAME.
2513 Return the index of the symbol in the SYMB_TABLE. */
2515 static inline int32_t
2516 seek_collating_symbol_entry (name, name_len)
2517 const unsigned char *name;
2518 size_t name_len;
2520 int32_t hash = elem_hash ((const char *) name, name_len);
2521 int32_t elem = hash % table_size;
2522 int32_t second = hash % (table_size - 2);
2523 while (symb_table[2 * elem] != 0)
2525 /* First compare the hashing value. */
2526 if (symb_table[2 * elem] == hash
2527 /* Compare the length of the name. */
2528 && name_len == extra[symb_table[2 * elem + 1]]
2529 /* Compare the name. */
2530 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2531 name_len) == 0)
2533 /* Yep, this is the entry. */
2534 break;
2537 /* Next entry. */
2538 elem += second;
2540 return elem;
2543 /* Local function for parse_bracket_exp used in _LIBC environement.
2544 Look up the collation sequence value of BR_ELEM.
2545 Return the value if succeeded, UINT_MAX otherwise. */
2547 static inline unsigned int
2548 lookup_collation_sequence_value (br_elem)
2549 bracket_elem_t *br_elem;
2551 if (br_elem->type == SB_CHAR)
2554 if (MB_CUR_MAX == 1)
2556 if (nrules == 0)
2557 return collseqmb[br_elem->opr.ch];
2558 else
2560 wint_t wc = __btowc (br_elem->opr.ch);
2561 return collseq_table_lookup (collseqwc, wc);
2564 else if (br_elem->type == MB_CHAR)
2566 return collseq_table_lookup (collseqwc, br_elem->opr.wch);
2568 else if (br_elem->type == COLL_SYM)
2570 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2571 if (nrules != 0)
2573 int32_t elem, idx;
2574 elem = seek_collating_symbol_entry (br_elem->opr.name,
2575 sym_name_len);
2576 if (symb_table[2 * elem] != 0)
2578 /* We found the entry. */
2579 idx = symb_table[2 * elem + 1];
2580 /* Skip the name of collating element name. */
2581 idx += 1 + extra[idx];
2582 /* Skip the byte sequence of the collating element. */
2583 idx += 1 + extra[idx];
2584 /* Adjust for the alignment. */
2585 idx = (idx + 3) & ~3;
2586 /* Skip the multibyte collation sequence value. */
2587 idx += sizeof (unsigned int);
2588 /* Skip the wide char sequence of the collating element. */
2589 idx += sizeof (unsigned int) *
2590 (1 + *(unsigned int *) (extra + idx));
2591 /* Return the collation sequence value. */
2592 return *(unsigned int *) (extra + idx);
2594 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2596 /* No valid character. Match it as a single byte
2597 character. */
2598 return collseqmb[br_elem->opr.name[0]];
2601 else if (sym_name_len == 1)
2602 return collseqmb[br_elem->opr.name[0]];
2604 return UINT_MAX;
2607 /* Local function for parse_bracket_exp used in _LIBC environement.
2608 Build the range expression which starts from START_ELEM, and ends
2609 at END_ELEM. The result are written to MBCSET and SBCSET.
2610 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2611 mbcset->range_ends, is a pointer argument sinse we may
2612 update it. */
2614 static inline reg_errcode_t
2615 # ifdef RE_ENABLE_I18N
2616 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2617 re_charset_t *mbcset;
2618 int *range_alloc;
2619 # else /* not RE_ENABLE_I18N */
2620 build_range_exp (sbcset, start_elem, end_elem)
2621 # endif /* not RE_ENABLE_I18N */
2622 re_bitset_ptr_t sbcset;
2623 bracket_elem_t *start_elem, *end_elem;
2625 unsigned int ch;
2626 uint32_t start_collseq;
2627 uint32_t end_collseq;
2629 # ifdef RE_ENABLE_I18N
2630 /* Check the space of the arrays. */
2631 if (*range_alloc == mbcset->nranges)
2633 /* There are not enough space, need realloc. */
2634 uint32_t *new_array_start;
2635 uint32_t *new_array_end;
2636 int new_nranges;
2638 /* +1 in case of mbcset->nranges is 0. */
2639 new_nranges = 2 * mbcset->nranges + 1;
2640 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2641 are NULL if *range_alloc == 0. */
2642 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2643 new_nranges);
2644 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2645 new_nranges);
2647 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2648 return REG_ESPACE;
2650 mbcset->range_starts = new_array_start;
2651 mbcset->range_ends = new_array_end;
2652 *range_alloc = new_nranges;
2654 # endif /* RE_ENABLE_I18N */
2656 /* Equivalence Classes and Character Classes can't be a range
2657 start/end. */
2658 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2659 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2661 return REG_ERANGE;
2663 start_collseq = lookup_collation_sequence_value (start_elem);
2664 end_collseq = lookup_collation_sequence_value (end_elem);
2665 /* Check start/end collation sequence values. */
2666 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2667 return REG_ECOLLATE;
2668 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2669 return REG_ERANGE;
2671 # ifdef RE_ENABLE_I18N
2672 /* Got valid collation sequence values, add them as a new entry. */
2673 mbcset->range_starts[mbcset->nranges] = start_collseq;
2674 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2675 # endif /* RE_ENABLE_I18N */
2677 /* Build the table for single byte characters. */
2678 for (ch = 0; ch <= SBC_MAX; ch++)
2680 uint32_t ch_collseq;
2682 if (MB_CUR_MAX == 1)
2684 if (nrules == 0)
2685 ch_collseq = collseqmb[ch];
2686 else
2687 ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch));
2688 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2689 bitset_set (sbcset, ch);
2691 return REG_NOERROR;
2694 /* Local function for parse_bracket_exp used in _LIBC environement.
2695 Build the collating element which is represented by NAME.
2696 The result are written to MBCSET and SBCSET.
2697 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2698 pointer argument sinse we may update it. */
2700 static inline reg_errcode_t
2701 # ifdef RE_ENABLE_I18N
2702 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2703 re_charset_t *mbcset;
2704 int *coll_sym_alloc;
2705 # else /* not RE_ENABLE_I18N */
2706 build_collating_symbol (sbcset, name)
2707 # endif /* not RE_ENABLE_I18N */
2708 re_bitset_ptr_t sbcset;
2709 const unsigned char *name;
2711 int32_t elem, idx;
2712 size_t name_len = strlen ((const char *) name);
2713 if (nrules != 0)
2715 elem = seek_collating_symbol_entry (name, name_len);
2716 if (symb_table[2 * elem] != 0)
2718 /* We found the entry. */
2719 idx = symb_table[2 * elem + 1];
2720 /* Skip the name of collating element name. */
2721 idx += 1 + extra[idx];
2723 else if (symb_table[2 * elem] == 0 && name_len == 1)
2725 /* No valid character, treat it as a normal
2726 character. */
2727 bitset_set (sbcset, name[0]);
2728 return REG_NOERROR;
2730 else
2731 return REG_ECOLLATE;
2733 # ifdef RE_ENABLE_I18N
2734 /* Got valid collation sequence, add it as a new entry. */
2735 /* Check the space of the arrays. */
2736 if (*coll_sym_alloc == mbcset->ncoll_syms)
2738 /* Not enough, realloc it. */
2739 /* +1 in case of mbcset->ncoll_syms is 0. */
2740 *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2741 /* Use realloc since mbcset->coll_syms is NULL
2742 if *alloc == 0. */
2743 mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2744 *coll_sym_alloc);
2745 if (BE (mbcset->coll_syms == NULL, 0))
2746 return REG_ESPACE;
2748 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2749 # endif /* RE_ENABLE_I18N */
2750 return REG_NOERROR;
2752 else
2754 if (BE (name_len != 1, 0))
2755 return REG_ECOLLATE;
2756 else
2758 bitset_set (sbcset, name[0]);
2759 return REG_NOERROR;
2763 #endif
2765 re_token_t br_token;
2766 re_bitset_ptr_t sbcset;
2767 #ifdef RE_ENABLE_I18N
2768 re_charset_t *mbcset;
2769 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2770 int equiv_class_alloc = 0, char_class_alloc = 0;
2771 #else /* not RE_ENABLE_I18N */
2772 int non_match = 0;
2773 #endif /* not RE_ENABLE_I18N */
2774 bin_tree_t *work_tree;
2775 int token_len, new_idx;
2776 #ifdef _LIBC
2777 collseqmb = (const unsigned char *)
2778 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2779 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2780 if (nrules)
2783 if (MB_CUR_MAX > 1)
2785 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2786 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2787 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2788 _NL_COLLATE_SYMB_TABLEMB);
2789 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2790 _NL_COLLATE_SYMB_EXTRAMB);
2792 #endif
2793 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2794 #ifdef RE_ENABLE_I18N
2795 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2796 #endif /* RE_ENABLE_I18N */
2797 #ifdef RE_ENABLE_I18N
2798 if (BE (sbcset == NULL || mbcset == NULL, 0))
2799 #else
2800 if (BE (sbcset == NULL, 0))
2801 #endif /* RE_ENABLE_I18N */
2803 *err = REG_ESPACE;
2804 return NULL;
2807 token_len = peek_token_bracket (token, regexp, syntax);
2808 if (BE (token->type == END_OF_RE, 0))
2810 *err = REG_BADPAT;
2811 goto parse_bracket_exp_free_return;
2813 if (token->type == OP_NON_MATCH_LIST)
2815 #ifdef RE_ENABLE_I18N
2816 int i;
2817 mbcset->non_match = 1;
2818 #else /* not RE_ENABLE_I18N */
2819 non_match = 1;
2820 #endif /* not RE_ENABLE_I18N */
2821 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2822 bitset_set (sbcset, '\0');
2823 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2824 token_len = peek_token_bracket (token, regexp, syntax);
2825 if (BE (token->type == END_OF_RE, 0))
2827 *err = REG_BADPAT;
2828 goto parse_bracket_exp_free_return;
2830 #ifdef RE_ENABLE_I18N
2831 if (MB_CUR_MAX > 1)
2832 for (i = 0; i < SBC_MAX; ++i)
2833 if (__btowc (i) == WEOF)
2834 bitset_set (sbcset, i);
2835 #endif /* RE_ENABLE_I18N */
2838 /* We treat the first ']' as a normal character. */
2839 if (token->type == OP_CLOSE_BRACKET)
2840 token->type = CHARACTER;
2842 while (1)
2844 bracket_elem_t start_elem, end_elem;
2845 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2846 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
2847 reg_errcode_t ret;
2848 int token_len2 = 0, is_range_exp = 0;
2849 re_token_t token2;
2851 start_elem.opr.name = start_name_buf;
2852 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
2853 syntax);
2854 if (BE (ret != REG_NOERROR, 0))
2856 *err = ret;
2857 goto parse_bracket_exp_free_return;
2860 token_len = peek_token_bracket (token, regexp, syntax);
2861 if (BE (token->type == END_OF_RE, 0))
2863 *err = REG_BADPAT;
2864 goto parse_bracket_exp_free_return;
2866 if (token->type == OP_CHARSET_RANGE)
2868 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
2869 token_len2 = peek_token_bracket (&token2, regexp, syntax);
2870 if (BE (token->type == END_OF_RE, 0))
2872 *err = REG_BADPAT;
2873 goto parse_bracket_exp_free_return;
2875 if (token2.type == OP_CLOSE_BRACKET)
2877 /* We treat the last '-' as a normal character. */
2878 re_string_skip_bytes (regexp, -token_len);
2879 token->type = CHARACTER;
2881 else
2882 is_range_exp = 1;
2885 if (is_range_exp == 1)
2887 end_elem.opr.name = end_name_buf;
2888 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
2889 dfa, syntax);
2890 if (BE (ret != REG_NOERROR, 0))
2892 *err = ret;
2893 goto parse_bracket_exp_free_return;
2896 token_len = peek_token_bracket (token, regexp, syntax);
2897 if (BE (token->type == END_OF_RE, 0))
2899 *err = REG_BADPAT;
2900 goto parse_bracket_exp_free_return;
2902 *err = build_range_exp (sbcset,
2903 #ifdef RE_ENABLE_I18N
2904 mbcset, &range_alloc,
2905 #endif /* RE_ENABLE_I18N */
2906 &start_elem, &end_elem);
2907 if (BE (*err != REG_NOERROR, 0))
2908 goto parse_bracket_exp_free_return;
2910 else
2912 switch (start_elem.type)
2914 case SB_CHAR:
2915 bitset_set (sbcset, start_elem.opr.ch);
2916 break;
2917 #ifdef RE_ENABLE_I18N
2918 case MB_CHAR:
2919 /* Check whether the array has enough space. */
2920 if (mbchar_alloc == mbcset->nmbchars)
2922 /* Not enough, realloc it. */
2923 /* +1 in case of mbcset->nmbchars is 0. */
2924 mbchar_alloc = 2 * mbcset->nmbchars + 1;
2925 /* Use realloc since array is NULL if *alloc == 0. */
2926 mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
2927 mbchar_alloc);
2928 if (BE (mbcset->mbchars == NULL, 0))
2929 goto parse_bracket_exp_espace;
2931 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
2932 break;
2933 #endif /* RE_ENABLE_I18N */
2934 case EQUIV_CLASS:
2935 *err = build_equiv_class (sbcset,
2936 #ifdef RE_ENABLE_I18N
2937 mbcset, &equiv_class_alloc,
2938 #endif /* RE_ENABLE_I18N */
2939 start_elem.opr.name);
2940 if (BE (*err != REG_NOERROR, 0))
2941 goto parse_bracket_exp_free_return;
2942 break;
2943 case COLL_SYM:
2944 *err = build_collating_symbol (sbcset,
2945 #ifdef RE_ENABLE_I18N
2946 mbcset, &coll_sym_alloc,
2947 #endif /* RE_ENABLE_I18N */
2948 start_elem.opr.name);
2949 if (BE (*err != REG_NOERROR, 0))
2950 goto parse_bracket_exp_free_return;
2951 break;
2952 case CHAR_CLASS:
2953 ret = build_charclass (sbcset,
2954 #ifdef RE_ENABLE_I18N
2955 mbcset, &char_class_alloc,
2956 #endif /* RE_ENABLE_I18N */
2957 start_elem.opr.name, syntax);
2958 if (BE (ret != REG_NOERROR, 0))
2959 goto parse_bracket_exp_espace;
2960 break;
2961 default:
2962 assert (0);
2963 break;
2966 if (token->type == OP_CLOSE_BRACKET)
2967 break;
2970 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2972 /* If it is non-matching list. */
2973 #ifdef RE_ENABLE_I18N
2974 if (mbcset->non_match)
2975 #else /* not RE_ENABLE_I18N */
2976 if (non_match)
2977 #endif /* not RE_ENABLE_I18N */
2978 bitset_not (sbcset);
2980 /* Build a tree for simple bracket. */
2981 br_token.type = SIMPLE_BRACKET;
2982 br_token.opr.sbcset = sbcset;
2983 new_idx = re_dfa_add_node (dfa, br_token, 0);
2984 work_tree = create_tree (NULL, NULL, 0, new_idx);
2985 if (BE (new_idx == -1 || work_tree == NULL, 0))
2986 goto parse_bracket_exp_espace;
2988 #ifdef RE_ENABLE_I18N
2989 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
2990 || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
2991 || mbcset->non_match)))
2993 re_token_t alt_token;
2994 bin_tree_t *mbc_tree;
2995 /* Build a tree for complex bracket. */
2996 br_token.type = COMPLEX_BRACKET;
2997 br_token.opr.mbcset = mbcset;
2998 dfa->has_mb_node = 1;
2999 new_idx = re_dfa_add_node (dfa, br_token, 0);
3000 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3001 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3002 goto parse_bracket_exp_espace;
3003 /* Then join them by ALT node. */
3004 dfa->has_plural_match = 1;
3005 alt_token.type = OP_ALT;
3006 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3007 work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
3008 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3009 return work_tree;
3011 else
3013 free_charset (mbcset);
3014 return work_tree;
3016 #else /* not RE_ENABLE_I18N */
3017 return work_tree;
3018 #endif /* not RE_ENABLE_I18N */
3020 parse_bracket_exp_espace:
3021 *err = REG_ESPACE;
3022 parse_bracket_exp_free_return:
3023 re_free (sbcset);
3024 #ifdef RE_ENABLE_I18N
3025 free_charset (mbcset);
3026 #endif /* RE_ENABLE_I18N */
3027 return NULL;
3030 /* Parse an element in the bracket expression. */
3032 static reg_errcode_t
3033 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
3034 bracket_elem_t *elem;
3035 re_string_t *regexp;
3036 re_token_t *token;
3037 int token_len;
3038 re_dfa_t *dfa;
3039 reg_syntax_t syntax;
3041 #ifdef RE_ENABLE_I18N
3042 int cur_char_size;
3043 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3044 if (cur_char_size > 1)
3046 elem->type = MB_CHAR;
3047 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3048 re_string_skip_bytes (regexp, cur_char_size);
3049 return REG_NOERROR;
3051 #endif /* RE_ENABLE_I18N */
3052 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3053 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3054 || token->type == OP_OPEN_EQUIV_CLASS)
3055 return parse_bracket_symbol (elem, regexp, token);
3056 elem->type = SB_CHAR;
3057 elem->opr.ch = token->opr.c;
3058 return REG_NOERROR;
3061 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3062 such as [:<character_class>:], [.<collating_element>.], and
3063 [=<equivalent_class>=]. */
3065 static reg_errcode_t
3066 parse_bracket_symbol (elem, regexp, token)
3067 bracket_elem_t *elem;
3068 re_string_t *regexp;
3069 re_token_t *token;
3071 unsigned char ch, delim = token->opr.c;
3072 int i = 0;
3073 for (;; ++i)
3075 if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
3076 return REG_EBRACK;
3077 if (token->type == OP_OPEN_CHAR_CLASS)
3078 ch = re_string_fetch_byte_case (regexp);
3079 else
3080 ch = re_string_fetch_byte (regexp);
3081 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3082 break;
3083 elem->opr.name[i] = ch;
3085 re_string_skip_bytes (regexp, 1);
3086 elem->opr.name[i] = '\0';
3087 switch (token->type)
3089 case OP_OPEN_COLL_ELEM:
3090 elem->type = COLL_SYM;
3091 break;
3092 case OP_OPEN_EQUIV_CLASS:
3093 elem->type = EQUIV_CLASS;
3094 break;
3095 case OP_OPEN_CHAR_CLASS:
3096 elem->type = CHAR_CLASS;
3097 break;
3098 default:
3099 break;
3101 return REG_NOERROR;
3104 /* Helper function for parse_bracket_exp.
3105 Build the equivalence class which is represented by NAME.
3106 The result are written to MBCSET and SBCSET.
3107 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3108 is a pointer argument sinse we may update it. */
3110 static reg_errcode_t
3111 #ifdef RE_ENABLE_I18N
3112 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3113 re_charset_t *mbcset;
3114 int *equiv_class_alloc;
3115 #else /* not RE_ENABLE_I18N */
3116 build_equiv_class (sbcset, name)
3117 #endif /* not RE_ENABLE_I18N */
3118 re_bitset_ptr_t sbcset;
3119 const unsigned char *name;
3121 #if defined _LIBC && defined RE_ENABLE_I18N
3122 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3123 if (nrules != 0)
3125 const int32_t *table, *indirect;
3126 const unsigned char *weights, *extra, *cp;
3127 unsigned char char_buf[2];
3128 int32_t idx1, idx2;
3129 unsigned int ch;
3130 size_t len;
3131 /* This #include defines a local function! */
3132 # include <locale/weight.h>
3133 /* Calculate the index for equivalence class. */
3134 cp = name;
3135 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3136 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3137 _NL_COLLATE_WEIGHTMB);
3138 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3139 _NL_COLLATE_EXTRAMB);
3140 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3141 _NL_COLLATE_INDIRECTMB);
3142 idx1 = findidx (&cp);
3143 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3144 /* This isn't a valid character. */
3145 return REG_ECOLLATE;
3147 /* Build single byte matcing table for this equivalence class. */
3148 char_buf[1] = (unsigned char) '\0';
3149 len = weights[idx1];
3150 for (ch = 0; ch < SBC_MAX; ++ch)
3152 char_buf[0] = ch;
3153 cp = char_buf;
3154 idx2 = findidx (&cp);
3156 idx2 = table[ch];
3158 if (idx2 == 0)
3159 /* This isn't a valid character. */
3160 continue;
3161 if (len == weights[idx2])
3163 int cnt = 0;
3164 while (cnt <= len &&
3165 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3166 ++cnt;
3168 if (cnt > len)
3169 bitset_set (sbcset, ch);
3172 /* Check whether the array has enough space. */
3173 if (*equiv_class_alloc == mbcset->nequiv_classes)
3175 /* Not enough, realloc it. */
3176 /* +1 in case of mbcset->nequiv_classes is 0. */
3177 *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3178 /* Use realloc since the array is NULL if *alloc == 0. */
3179 mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3180 *equiv_class_alloc);
3181 if (BE (mbcset->equiv_classes == NULL, 0))
3182 return REG_ESPACE;
3184 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3186 else
3187 #endif /* _LIBC && RE_ENABLE_I18N */
3189 if (BE (strlen ((const char *) name) != 1, 0))
3190 return REG_ECOLLATE;
3191 bitset_set (sbcset, *name);
3193 return REG_NOERROR;
3196 /* Helper function for parse_bracket_exp.
3197 Build the character class which is represented by NAME.
3198 The result are written to MBCSET and SBCSET.
3199 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3200 is a pointer argument sinse we may update it. */
3202 static reg_errcode_t
3203 #ifdef RE_ENABLE_I18N
3204 build_charclass (sbcset, mbcset, char_class_alloc, class_name, syntax)
3205 re_charset_t *mbcset;
3206 int *char_class_alloc;
3207 #else /* not RE_ENABLE_I18N */
3208 build_charclass (sbcset, class_name, syntax)
3209 #endif /* not RE_ENABLE_I18N */
3210 re_bitset_ptr_t sbcset;
3211 const unsigned char *class_name;
3212 reg_syntax_t syntax;
3214 int i;
3215 const char *name = (const char *) class_name;
3217 /* In case of REG_ICASE "upper" and "lower" match the both of
3218 upper and lower cases. */
3219 if ((syntax & RE_ICASE)
3220 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3221 name = "alpha";
3223 #ifdef RE_ENABLE_I18N
3224 /* Check the space of the arrays. */
3225 if (*char_class_alloc == mbcset->nchar_classes)
3227 /* Not enough, realloc it. */
3228 /* +1 in case of mbcset->nchar_classes is 0. */
3229 *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3230 /* Use realloc since array is NULL if *alloc == 0. */
3231 mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3232 *char_class_alloc);
3233 if (BE (mbcset->char_classes == NULL, 0))
3234 return REG_ESPACE;
3236 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3237 #endif /* RE_ENABLE_I18N */
3239 #define BUILD_CHARCLASS_LOOP(ctype_func)\
3240 for (i = 0; i < SBC_MAX; ++i) \
3242 if (ctype_func (i)) \
3243 bitset_set (sbcset, i); \
3246 if (strcmp (name, "alnum") == 0)
3247 BUILD_CHARCLASS_LOOP (isalnum)
3248 else if (strcmp (name, "cntrl") == 0)
3249 BUILD_CHARCLASS_LOOP (iscntrl)
3250 else if (strcmp (name, "lower") == 0)
3251 BUILD_CHARCLASS_LOOP (islower)
3252 else if (strcmp (name, "space") == 0)
3253 BUILD_CHARCLASS_LOOP (isspace)
3254 else if (strcmp (name, "alpha") == 0)
3255 BUILD_CHARCLASS_LOOP (isalpha)
3256 else if (strcmp (name, "digit") == 0)
3257 BUILD_CHARCLASS_LOOP (isdigit)
3258 else if (strcmp (name, "print") == 0)
3259 BUILD_CHARCLASS_LOOP (isprint)
3260 else if (strcmp (name, "upper") == 0)
3261 BUILD_CHARCLASS_LOOP (isupper)
3262 else if (strcmp (name, "blank") == 0)
3263 BUILD_CHARCLASS_LOOP (isblank)
3264 else if (strcmp (name, "graph") == 0)
3265 BUILD_CHARCLASS_LOOP (isgraph)
3266 else if (strcmp (name, "punct") == 0)
3267 BUILD_CHARCLASS_LOOP (ispunct)
3268 else if (strcmp (name, "xdigit") == 0)
3269 BUILD_CHARCLASS_LOOP (isxdigit)
3270 else
3271 return REG_ECTYPE;
3273 return REG_NOERROR;
3276 static bin_tree_t *
3277 build_word_op (dfa, not, err)
3278 re_dfa_t *dfa;
3279 int not;
3280 reg_errcode_t *err;
3282 re_bitset_ptr_t sbcset;
3283 #ifdef RE_ENABLE_I18N
3284 re_charset_t *mbcset;
3285 int alloc = 0;
3286 #else /* not RE_ENABLE_I18N */
3287 int non_match = 0;
3288 #endif /* not RE_ENABLE_I18N */
3289 reg_errcode_t ret;
3290 re_token_t br_token;
3291 bin_tree_t *tree;
3292 int new_idx;
3294 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3295 #ifdef RE_ENABLE_I18N
3296 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3297 #endif /* RE_ENABLE_I18N */
3299 #ifdef RE_ENABLE_I18N
3300 if (BE (sbcset == NULL || mbcset == NULL, 0))
3301 #else /* not RE_ENABLE_I18N */
3302 if (BE (sbcset == NULL, 0))
3303 #endif /* not RE_ENABLE_I18N */
3305 *err = REG_ESPACE;
3306 return NULL;
3309 if (not)
3311 #ifdef RE_ENABLE_I18N
3312 int i;
3314 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3315 bitset_set(cset->sbcset, '\0');
3317 mbcset->non_match = 1;
3318 if (MB_CUR_MAX > 1)
3319 for (i = 0; i < SBC_MAX; ++i)
3320 if (__btowc (i) == WEOF)
3321 bitset_set (sbcset, i);
3322 #else /* not RE_ENABLE_I18N */
3323 non_match = 1;
3324 #endif /* not RE_ENABLE_I18N */
3327 /* We don't care the syntax in this case. */
3328 ret = build_charclass (sbcset,
3329 #ifdef RE_ENABLE_I18N
3330 mbcset, &alloc,
3331 #endif /* RE_ENABLE_I18N */
3332 (const unsigned char *) "alpha", 0);
3334 if (BE (ret != REG_NOERROR, 0))
3336 re_free (sbcset);
3337 #ifdef RE_ENABLE_I18N
3338 free_charset (mbcset);
3339 #endif /* RE_ENABLE_I18N */
3340 *err = REG_ESPACE;
3341 return NULL;
3343 /* \w match '_' also. */
3344 bitset_set (sbcset, '_');
3346 /* If it is non-matching list. */
3347 #ifdef RE_ENABLE_I18N
3348 if (mbcset->non_match)
3349 #else /* not RE_ENABLE_I18N */
3350 if (non_match)
3351 #endif /* not RE_ENABLE_I18N */
3352 bitset_not (sbcset);
3354 /* Build a tree for simple bracket. */
3355 br_token.type = SIMPLE_BRACKET;
3356 br_token.opr.sbcset = sbcset;
3357 new_idx = re_dfa_add_node (dfa, br_token, 0);
3358 tree = create_tree (NULL, NULL, 0, new_idx);
3359 if (BE (new_idx == -1 || tree == NULL, 0))
3360 goto build_word_op_espace;
3362 #ifdef RE_ENABLE_I18N
3363 if (MB_CUR_MAX > 1)
3365 re_token_t alt_token;
3366 bin_tree_t *mbc_tree;
3367 /* Build a tree for complex bracket. */
3368 br_token.type = COMPLEX_BRACKET;
3369 br_token.opr.mbcset = mbcset;
3370 dfa->has_mb_node = 1;
3371 new_idx = re_dfa_add_node (dfa, br_token, 0);
3372 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3373 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3374 goto build_word_op_espace;
3375 /* Then join them by ALT node. */
3376 alt_token.type = OP_ALT;
3377 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3378 tree = create_tree (tree, mbc_tree, 0, new_idx);
3379 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3380 return tree;
3382 else
3384 free_charset (mbcset);
3385 return tree;
3387 #else /* not RE_ENABLE_I18N */
3388 return tree;
3389 #endif /* not RE_ENABLE_I18N */
3391 build_word_op_espace:
3392 re_free (sbcset);
3393 #ifdef RE_ENABLE_I18N
3394 free_charset (mbcset);
3395 #endif /* RE_ENABLE_I18N */
3396 *err = REG_ESPACE;
3397 return NULL;
3400 /* This is intended for the expressions like "a{1,3}".
3401 Fetch a number from `input', and return the number.
3402 Return -1, if the number field is empty like "{,1}".
3403 Return -2, If an error is occured. */
3405 static int
3406 fetch_number (input, token, syntax)
3407 re_string_t *input;
3408 re_token_t *token;
3409 reg_syntax_t syntax;
3411 int num = -1;
3412 unsigned char c;
3413 while (1)
3415 *token = fetch_token (input, syntax);
3416 c = token->opr.c;
3417 if (BE (token->type == END_OF_RE, 0))
3418 return -2;
3419 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3420 break;
3421 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3422 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3423 num = (num > RE_DUP_MAX) ? -2 : num;
3425 return num;
3428 #ifdef RE_ENABLE_I18N
3429 static void
3430 free_charset (re_charset_t *cset)
3432 re_free (cset->mbchars);
3433 # ifdef _LIBC
3434 re_free (cset->coll_syms);
3435 re_free (cset->equiv_classes);
3436 re_free (cset->range_starts);
3437 re_free (cset->range_ends);
3438 # endif
3439 re_free (cset->char_classes);
3440 re_free (cset);
3442 #endif /* RE_ENABLE_I18N */
3444 /* Functions for binary tree operation. */
3446 /* Create a node of tree.
3447 Note: This function automatically free left and right if malloc fails. */
3449 static bin_tree_t *
3450 create_tree (left, right, type, index)
3451 bin_tree_t *left;
3452 bin_tree_t *right;
3453 re_token_type_t type;
3454 int index;
3456 bin_tree_t *tree;
3457 tree = re_malloc (bin_tree_t, 1);
3458 if (BE (tree == NULL, 0))
3460 free_bin_tree (left);
3461 free_bin_tree (right);
3462 return NULL;
3464 tree->parent = NULL;
3465 tree->left = left;
3466 tree->right = right;
3467 tree->type = type;
3468 tree->node_idx = index;
3469 tree->first = -1;
3470 tree->next = -1;
3471 re_node_set_init_empty (&tree->eclosure);
3473 if (left != NULL)
3474 left->parent = tree;
3475 if (right != NULL)
3476 right->parent = tree;
3477 return tree;
3480 /* Free the sub tree pointed by TREE. */
3482 static void
3483 free_bin_tree (tree)
3484 bin_tree_t *tree;
3486 if (tree == NULL)
3487 return;
3488 /*re_node_set_free (&tree->eclosure);*/
3489 free_bin_tree (tree->left);
3490 free_bin_tree (tree->right);
3491 re_free (tree);
3494 /* Duplicate the node SRC, and return new node. */
3496 static bin_tree_t *
3497 duplicate_tree (src, dfa)
3498 const bin_tree_t *src;
3499 re_dfa_t *dfa;
3501 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3502 int new_node_idx;
3503 /* Since node indies must be according to Post-order of the tree,
3504 we must duplicate the left at first. */
3505 if (src->left != NULL)
3507 left = duplicate_tree (src->left, dfa);
3508 if (left == NULL)
3509 return NULL;
3512 /* Secondaly, duplicate the right. */
3513 if (src->right != NULL)
3515 right = duplicate_tree (src->right, dfa);
3516 if (right == NULL)
3518 free_bin_tree (left);
3519 return NULL;
3523 /* At last, duplicate itself. */
3524 if (src->type == NON_TYPE)
3526 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3527 dfa->nodes[new_node_idx].duplicated = 1;
3528 if (BE (new_node_idx == -1, 0))
3530 free_bin_tree (left);
3531 free_bin_tree (right);
3532 return NULL;
3535 else
3536 new_node_idx = src->type;
3538 new_tree = create_tree (left, right, src->type, new_node_idx);
3539 if (BE (new_tree == NULL, 0))
3541 free_bin_tree (left);
3542 free_bin_tree (right);
3544 return new_tree;