* sysdeps/generic/allocrtsig.c: Include <testrtsig.h>, not
[glibc.git] / posix / regcomp.c
blob9bb06aa7bf99750fa2d80b62e4b0a592a93b5f6e
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 #include <assert.h>
22 #include <ctype.h>
23 #include <limits.h>
24 #include <locale.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
29 #if defined HAVE_WCHAR_H || defined _LIBC
30 # include <wchar.h>
31 #endif /* HAVE_WCHAR_H || _LIBC */
32 #if defined HAVE_WCTYPE_H || defined _LIBC
33 # include <wctype.h>
34 #endif /* HAVE_WCTYPE_H || _LIBC */
36 /* In case that the system doesn't have isblank(). */
37 #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
38 # define isblank(ch) ((ch) == ' ' || (ch) == '\t')
39 #endif
41 #ifdef _LIBC
42 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
43 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
44 # include <locale/localeinfo.h>
45 # include <locale/elem-hash.h>
46 # include <locale/coll-lookup.h>
47 # endif
48 #endif
50 /* This is for other GNU distributions with internationalized messages. */
51 #if HAVE_LIBINTL_H || defined _LIBC
52 # include <libintl.h>
53 # ifdef _LIBC
54 # undef gettext
55 # define gettext(msgid) \
56 INTUSE(__dcgettext) (INTUSE(_libc_intl_domainname), msgid, LC_MESSAGES)
57 # endif
58 #else
59 # define gettext(msgid) (msgid)
60 #endif
62 #ifndef gettext_noop
63 /* This define is so xgettext can find the internationalizable
64 strings. */
65 # define gettext_noop(String) String
66 #endif
68 #include <regex.h>
69 #include "regex_internal.h"
71 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
72 int length, reg_syntax_t syntax);
73 static void re_compile_fastmap_iter (regex_t *bufp,
74 const re_dfastate_t *init_state,
75 char *fastmap);
76 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
77 static reg_errcode_t init_word_char (re_dfa_t *dfa);
78 #ifdef RE_ENABLE_I18N
79 static void free_charset (re_charset_t *cset);
80 #endif /* RE_ENABLE_I18N */
81 static void free_workarea_compile (regex_t *preg);
82 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
83 static reg_errcode_t analyze (re_dfa_t *dfa);
84 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
85 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
86 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
87 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
88 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
89 int top_clone_node, int root_node,
90 unsigned int constraint);
91 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
92 unsigned int constraint);
93 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
94 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
95 int node, int root);
96 static void calc_inveclosure (re_dfa_t *dfa);
97 static int fetch_number (re_string_t *input, re_token_t *token,
98 reg_syntax_t syntax);
99 static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
100 static int peek_token (re_token_t *token, re_string_t *input,
101 reg_syntax_t syntax);
102 static int peek_token_bracket (re_token_t *token, re_string_t *input,
103 reg_syntax_t syntax);
104 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
105 reg_syntax_t syntax, reg_errcode_t *err);
106 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
107 re_token_t *token, reg_syntax_t syntax,
108 int nest, reg_errcode_t *err);
109 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
110 re_token_t *token, reg_syntax_t syntax,
111 int nest, reg_errcode_t *err);
112 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
113 re_token_t *token, reg_syntax_t syntax,
114 int nest, reg_errcode_t *err);
115 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
116 re_token_t *token, reg_syntax_t syntax,
117 int nest, reg_errcode_t *err);
118 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
119 re_dfa_t *dfa, re_token_t *token,
120 reg_syntax_t syntax, reg_errcode_t *err);
121 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
122 re_token_t *token, reg_syntax_t syntax,
123 reg_errcode_t *err);
124 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
125 re_string_t *regexp,
126 re_token_t *token, int token_len,
127 re_dfa_t *dfa,
128 reg_syntax_t syntax);
129 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
130 re_string_t *regexp,
131 re_token_t *token);
132 #ifndef _LIBC
133 # ifdef RE_ENABLE_I18N
134 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
135 re_charset_t *mbcset, int *range_alloc,
136 bracket_elem_t *start_elem,
137 bracket_elem_t *end_elem);
138 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
139 re_charset_t *mbcset,
140 int *coll_sym_alloc,
141 const unsigned char *name);
142 # else /* not RE_ENABLE_I18N */
143 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
144 bracket_elem_t *start_elem,
145 bracket_elem_t *end_elem);
146 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
147 const unsigned char *name);
148 # endif /* not RE_ENABLE_I18N */
149 #endif /* not _LIBC */
150 #ifdef RE_ENABLE_I18N
151 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
152 re_charset_t *mbcset,
153 int *equiv_class_alloc,
154 const unsigned char *name);
155 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
156 re_charset_t *mbcset,
157 int *char_class_alloc,
158 const unsigned char *class_name,
159 reg_syntax_t syntax);
160 #else /* not RE_ENABLE_I18N */
161 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
162 const unsigned char *name);
163 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
164 const unsigned char *class_name,
165 reg_syntax_t syntax);
166 #endif /* not RE_ENABLE_I18N */
167 static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err);
168 static void free_bin_tree (bin_tree_t *tree);
169 static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
170 re_token_type_t type, int index);
171 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
173 /* This table gives an error message for each of the error codes listed
174 in regex.h. Obviously the order here has to be same as there.
175 POSIX doesn't require that we do anything for REG_NOERROR,
176 but why not be nice? */
178 const char __re_error_msgid[] attribute_hidden =
180 #define REG_NOERROR_IDX 0
181 gettext_noop ("Success") /* REG_NOERROR */
182 "\0"
183 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
184 gettext_noop ("No match") /* REG_NOMATCH */
185 "\0"
186 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
187 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
188 "\0"
189 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
190 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
191 "\0"
192 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
193 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
194 "\0"
195 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
196 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
197 "\0"
198 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
199 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
200 "\0"
201 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
202 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
203 "\0"
204 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
205 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
206 "\0"
207 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
208 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
209 "\0"
210 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
211 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
212 "\0"
213 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
214 gettext_noop ("Invalid range end") /* REG_ERANGE */
215 "\0"
216 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
217 gettext_noop ("Memory exhausted") /* REG_ESPACE */
218 "\0"
219 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
220 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
221 "\0"
222 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
223 gettext_noop ("Premature end of regular expression") /* REG_EEND */
224 "\0"
225 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
226 gettext_noop ("Regular expression too big") /* REG_ESIZE */
227 "\0"
228 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
229 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
232 const size_t __re_error_msgid_idx[] attribute_hidden =
234 REG_NOERROR_IDX,
235 REG_NOMATCH_IDX,
236 REG_BADPAT_IDX,
237 REG_ECOLLATE_IDX,
238 REG_ECTYPE_IDX,
239 REG_EESCAPE_IDX,
240 REG_ESUBREG_IDX,
241 REG_EBRACK_IDX,
242 REG_EPAREN_IDX,
243 REG_EBRACE_IDX,
244 REG_BADBR_IDX,
245 REG_ERANGE_IDX,
246 REG_ESPACE_IDX,
247 REG_BADRPT_IDX,
248 REG_EEND_IDX,
249 REG_ESIZE_IDX,
250 REG_ERPAREN_IDX
253 /* Entry points for GNU code. */
255 /* re_compile_pattern is the GNU regular expression compiler: it
256 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
257 Returns 0 if the pattern was valid, otherwise an error string.
259 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
260 are set in BUFP on entry. */
262 const char *
263 re_compile_pattern (pattern, length, bufp)
264 const char *pattern;
265 size_t length;
266 struct re_pattern_buffer *bufp;
268 reg_errcode_t ret;
270 /* GNU code is written to assume at least RE_NREGS registers will be set
271 (and at least one extra will be -1). */
272 bufp->regs_allocated = REGS_UNALLOCATED;
274 /* And GNU code determines whether or not to get register information
275 by passing null for the REGS argument to re_match, etc., not by
276 setting no_sub. */
277 bufp->no_sub = 0;
279 /* Match anchors at newline. */
280 bufp->newline_anchor = 1;
282 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
284 if (!ret)
285 return NULL;
286 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
288 #ifdef _LIBC
289 weak_alias (__re_compile_pattern, re_compile_pattern)
290 #endif
292 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
293 also be assigned to arbitrarily: each pattern buffer stores its own
294 syntax, so it can be changed between regex compilations. */
295 /* This has no initializer because initialized variables in Emacs
296 become read-only after dumping. */
297 reg_syntax_t re_syntax_options;
300 /* Specify the precise syntax of regexps for compilation. This provides
301 for compatibility for various utilities which historically have
302 different, incompatible syntaxes.
304 The argument SYNTAX is a bit mask comprised of the various bits
305 defined in regex.h. We return the old syntax. */
307 reg_syntax_t
308 re_set_syntax (syntax)
309 reg_syntax_t syntax;
311 reg_syntax_t ret = re_syntax_options;
313 re_syntax_options = syntax;
314 return ret;
316 #ifdef _LIBC
317 weak_alias (__re_set_syntax, re_set_syntax)
318 #endif
321 re_compile_fastmap (bufp)
322 struct re_pattern_buffer *bufp;
324 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
325 char *fastmap = bufp->fastmap;
327 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
328 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
329 if (dfa->init_state != dfa->init_state_word)
330 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
331 if (dfa->init_state != dfa->init_state_nl)
332 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
333 if (dfa->init_state != dfa->init_state_begbuf)
334 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
335 bufp->fastmap_accurate = 1;
336 return 0;
338 #ifdef _LIBC
339 weak_alias (__re_compile_fastmap, re_compile_fastmap)
340 #endif
342 /* Helper function for re_compile_fastmap.
343 Compile fastmap for the initial_state INIT_STATE. */
345 static void
346 re_compile_fastmap_iter (bufp, init_state, fastmap)
347 regex_t *bufp;
348 const re_dfastate_t *init_state;
349 char *fastmap;
351 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
352 int node_cnt;
353 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
355 int node = init_state->nodes.elems[node_cnt];
356 re_token_type_t type = dfa->nodes[node].type;
358 if (type == CHARACTER)
359 fastmap[dfa->nodes[node].opr.c] = 1;
360 else if (type == SIMPLE_BRACKET)
362 int i, j, ch;
363 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
364 for (j = 0; j < UINT_BITS; ++j, ++ch)
365 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
366 fastmap[ch] = 1;
368 #ifdef RE_ENABLE_I18N
369 else if (type == COMPLEX_BRACKET)
371 int i;
372 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
373 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
374 || cset->nranges || cset->nchar_classes)
376 # ifdef _LIBC
377 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
379 /* In this case we want to catch the bytes which are
380 the first byte of any collation elements.
381 e.g. In da_DK, we want to catch 'a' since "aa"
382 is a valid collation element, and don't catch
383 'b' since 'b' is the only collation element
384 which starts from 'b'. */
385 int j, ch;
386 const int32_t *table = (const int32_t *)
387 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
388 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
389 for (j = 0; j < UINT_BITS; ++j, ++ch)
390 if (table[ch] < 0)
391 fastmap[ch] = 1;
393 # else
394 if (MB_CUR_MAX > 1)
395 for (i = 0; i < SBC_MAX; ++i)
396 if (__btowc (i) == WEOF)
397 fastmap[i] = 1;
398 # endif /* not _LIBC */
400 for (i = 0; i < cset->nmbchars; ++i)
402 char buf[256];
403 wctomb (buf, cset->mbchars[i]);
404 fastmap[*(unsigned char *) buf] = 1;
407 #endif /* RE_ENABLE_I18N */
408 else if (type == END_OF_RE || type == OP_PERIOD
409 #ifdef RE_ENABLE_I18N
410 || type == COMPLEX_BRACKET
411 #endif /* RE_ENABLE_I18N */
414 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
415 if (type == END_OF_RE)
416 bufp->can_be_null = 1;
417 return;
422 /* Entry point for POSIX code. */
423 /* regcomp takes a regular expression as a string and compiles it.
425 PREG is a regex_t *. We do not expect any fields to be initialized,
426 since POSIX says we shouldn't. Thus, we set
428 `buffer' to the compiled pattern;
429 `used' to the length of the compiled pattern;
430 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
431 REG_EXTENDED bit in CFLAGS is set; otherwise, to
432 RE_SYNTAX_POSIX_BASIC;
433 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
434 `fastmap' to an allocated space for the fastmap;
435 `fastmap_accurate' to zero;
436 `re_nsub' to the number of subexpressions in PATTERN.
438 PATTERN is the address of the pattern string.
440 CFLAGS is a series of bits which affect compilation.
442 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
443 use POSIX basic syntax.
445 If REG_NEWLINE is set, then . and [^...] don't match newline.
446 Also, regexec will try a match beginning after every newline.
448 If REG_ICASE is set, then we considers upper- and lowercase
449 versions of letters to be equivalent when matching.
451 If REG_NOSUB is set, then when PREG is passed to regexec, that
452 routine will report only success or failure, and nothing about the
453 registers.
455 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
456 the return codes and their meanings.) */
459 regcomp (preg, pattern, cflags)
460 regex_t *__restrict preg;
461 const char *__restrict pattern;
462 int cflags;
464 reg_errcode_t ret;
465 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
466 : RE_SYNTAX_POSIX_BASIC);
468 preg->buffer = NULL;
469 preg->allocated = 0;
470 preg->used = 0;
472 /* Try to allocate space for the fastmap. */
473 preg->fastmap = re_malloc (char, SBC_MAX);
474 if (BE (preg->fastmap == NULL, 0))
475 return REG_ESPACE;
477 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
479 /* If REG_NEWLINE is set, newlines are treated differently. */
480 if (cflags & REG_NEWLINE)
481 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
482 syntax &= ~RE_DOT_NEWLINE;
483 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
484 /* It also changes the matching behavior. */
485 preg->newline_anchor = 1;
487 else
488 preg->newline_anchor = 0;
489 preg->no_sub = !!(cflags & REG_NOSUB);
490 preg->translate = NULL;
492 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
494 /* POSIX doesn't distinguish between an unmatched open-group and an
495 unmatched close-group: both are REG_EPAREN. */
496 if (ret == REG_ERPAREN)
497 ret = REG_EPAREN;
499 /* We have already checked preg->fastmap != NULL. */
500 if (BE (ret == REG_NOERROR, 1))
502 /* Compute the fastmap now, since regexec cannot modify the pattern
503 buffer. */
504 if (BE (re_compile_fastmap (preg) == -2, 0))
506 /* Some error occurred while computing the fastmap, just forget
507 about it. */
508 re_free (preg->fastmap);
509 preg->fastmap = NULL;
513 return (int) ret;
515 #ifdef _LIBC
516 weak_alias (__regcomp, regcomp)
517 #endif
519 /* Returns a message corresponding to an error code, ERRCODE, returned
520 from either regcomp or regexec. We don't use PREG here. */
522 size_t
523 regerror (errcode, preg, errbuf, errbuf_size)
524 int errcode;
525 const regex_t *preg;
526 char *errbuf;
527 size_t errbuf_size;
529 const char *msg;
530 size_t msg_size;
532 if (BE (errcode < 0
533 || errcode >= (int) (sizeof (__re_error_msgid_idx)
534 / sizeof (__re_error_msgid_idx[0])), 0))
535 /* Only error codes returned by the rest of the code should be passed
536 to this routine. If we are given anything else, or if other regex
537 code generates an invalid error code, then the program has a bug.
538 Dump core so we can fix it. */
539 abort ();
541 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
543 msg_size = strlen (msg) + 1; /* Includes the null. */
545 if (BE (errbuf_size != 0, 1))
547 if (BE (msg_size > errbuf_size, 0))
549 #if defined HAVE_MEMPCPY || defined _LIBC
550 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
551 #else
552 memcpy (errbuf, msg, errbuf_size - 1);
553 errbuf[errbuf_size - 1] = 0;
554 #endif
556 else
557 memcpy (errbuf, msg, msg_size);
560 return msg_size;
562 #ifdef _LIBC
563 weak_alias (__regerror, regerror)
564 #endif
566 /* Free dynamically allocated space used by PREG. */
568 void
569 regfree (preg)
570 regex_t *preg;
572 int i, j;
573 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
574 if (BE (dfa != NULL, 1))
576 re_free (dfa->subexps);
578 for (i = 0; i < dfa->nodes_len; ++i)
580 re_token_t *node = dfa->nodes + i;
581 #ifdef RE_ENABLE_I18N
582 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
583 free_charset (node->opr.mbcset);
584 else
585 #endif /* RE_ENABLE_I18N */
586 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
587 re_free (node->opr.sbcset);
589 re_free (dfa->nexts);
590 for (i = 0; i < dfa->nodes_len; ++i)
592 if (dfa->eclosures != NULL)
593 re_node_set_free (dfa->eclosures + i);
594 if (dfa->inveclosures != NULL)
595 re_node_set_free (dfa->inveclosures + i);
596 if (dfa->edests != NULL)
597 re_node_set_free (dfa->edests + i);
599 re_free (dfa->edests);
600 re_free (dfa->eclosures);
601 re_free (dfa->inveclosures);
602 re_free (dfa->nodes);
604 for (i = 0; i <= dfa->state_hash_mask; ++i)
606 struct re_state_table_entry *entry = dfa->state_table + i;
607 for (j = 0; j < entry->num; ++j)
609 re_dfastate_t *state = entry->array[j];
610 if (state->entrance_nodes != &state->nodes)
612 re_node_set_free (state->entrance_nodes);
613 re_free (state->entrance_nodes);
615 re_node_set_free (&state->nodes);
616 re_free (state->trtable);
617 re_free (state->trtable_search);
618 re_free (state);
620 re_free (entry->array);
622 re_free (dfa->state_table);
624 if (dfa->word_char != NULL)
625 re_free (dfa->word_char);
626 #ifdef DEBUG
627 re_free (dfa->re_str);
628 #endif
629 re_free (dfa);
631 re_free (preg->fastmap);
633 #ifdef _LIBC
634 weak_alias (__regfree, regfree)
635 #endif
637 /* Entry points compatible with 4.2 BSD regex library. We don't define
638 them unless specifically requested. */
640 #if defined _REGEX_RE_COMP || defined _LIBC
642 /* BSD has one and only one pattern buffer. */
643 static struct re_pattern_buffer re_comp_buf;
645 char *
646 # ifdef _LIBC
647 /* Make these definitions weak in libc, so POSIX programs can redefine
648 these names if they don't use our functions, and still use
649 regcomp/regexec above without link errors. */
650 weak_function
651 # endif
652 re_comp (s)
653 const char *s;
655 reg_errcode_t ret;
656 char *fastmap;
658 if (!s)
660 if (!re_comp_buf.buffer)
661 return gettext ("No previous regular expression");
662 return 0;
665 if (re_comp_buf.buffer)
667 fastmap = re_comp_buf.fastmap;
668 re_comp_buf.fastmap = NULL;
669 __regfree (&re_comp_buf);
670 re_comp_buf.buffer = NULL;
671 re_comp_buf.allocated = 0;
672 re_comp_buf.fastmap = fastmap;
675 if (re_comp_buf.fastmap == NULL)
677 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
678 if (re_comp_buf.fastmap == NULL)
679 return (char *) gettext (__re_error_msgid
680 + __re_error_msgid_idx[(int) REG_ESPACE]);
683 /* Since `re_exec' always passes NULL for the `regs' argument, we
684 don't need to initialize the pattern buffer fields which affect it. */
686 /* Match anchors at newlines. */
687 re_comp_buf.newline_anchor = 1;
689 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
691 if (!ret)
692 return NULL;
694 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
695 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
698 #ifdef _LIBC
699 static void __attribute__ ((unused))
700 free_mem (void)
702 __regfree (&re_comp_buf);
704 text_set_element (__libc_subfreeres, free_mem);
705 #endif
707 #endif /* _REGEX_RE_COMP */
709 /* Internal entry point.
710 Compile the regular expression PATTERN, whose length is LENGTH.
711 SYNTAX indicate regular expression's syntax. */
713 static reg_errcode_t
714 re_compile_internal (preg, pattern, length, syntax)
715 regex_t *preg;
716 const char * pattern;
717 int length;
718 reg_syntax_t syntax;
720 reg_errcode_t err = REG_NOERROR;
721 re_dfa_t *dfa;
722 re_string_t regexp;
724 /* Initialize the pattern buffer. */
725 preg->fastmap_accurate = 0;
726 preg->syntax = syntax;
727 preg->not_bol = preg->not_eol = 0;
728 preg->used = 0;
729 preg->re_nsub = 0;
731 /* Initialize the dfa. */
732 dfa = (re_dfa_t *) preg->buffer;
733 if (preg->allocated < sizeof (re_dfa_t))
735 /* If zero allocated, but buffer is non-null, try to realloc
736 enough space. This loses if buffer's address is bogus, but
737 that is the user's responsibility. If ->buffer is NULL this
738 is a simple allocation. */
739 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
740 if (dfa == NULL)
741 return REG_ESPACE;
742 preg->allocated = sizeof (re_dfa_t);
744 preg->buffer = (unsigned char *) dfa;
745 preg->used = sizeof (re_dfa_t);
747 err = init_dfa (dfa, length);
748 if (BE (err != REG_NOERROR, 0))
750 re_free (dfa);
751 preg->buffer = NULL;
752 return err;
754 #ifdef DEBUG
755 dfa->re_str = re_malloc (char, length + 1);
756 strncpy (dfa->re_str, pattern, length + 1);
757 #endif
759 err = re_string_construct (&regexp, pattern, length, preg->translate,
760 syntax & RE_ICASE);
761 if (BE (err != REG_NOERROR, 0))
763 re_free (dfa);
764 preg->buffer = NULL;
765 return err;
768 /* Parse the regular expression, and build a structure tree. */
769 preg->re_nsub = 0;
770 dfa->str_tree = parse (&regexp, preg, syntax, &err);
771 if (BE (dfa->str_tree == NULL, 0))
772 goto re_compile_internal_free_return;
774 /* Analyze the tree and collect information which is necessary to
775 create the dfa. */
776 err = analyze (dfa);
777 if (BE (err != REG_NOERROR, 0))
778 goto re_compile_internal_free_return;
780 /* Then create the initial state of the dfa. */
781 err = create_initial_state (dfa);
782 if (BE (err != REG_NOERROR, 0))
783 goto re_compile_internal_free_return;
785 re_compile_internal_free_return:
786 /* Release work areas. */
787 free_workarea_compile (preg);
788 re_string_destruct (&regexp);
790 return err;
793 /* Initialize DFA. We use the length of the regular expression PAT_LEN
794 as the initial length of some arrays. */
796 static reg_errcode_t
797 init_dfa (dfa, pat_len)
798 re_dfa_t *dfa;
799 int pat_len;
801 int table_size;
803 memset (dfa, '\0', sizeof (re_dfa_t));
805 dfa->nodes_alloc = pat_len + 1;
806 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
808 dfa->states_alloc = pat_len + 1;
810 /* table_size = 2 ^ ceil(log pat_len) */
811 for (table_size = 1; table_size > 0; table_size <<= 1)
812 if (table_size > pat_len)
813 break;
815 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
816 dfa->state_hash_mask = table_size - 1;
818 dfa->subexps_alloc = 1;
819 dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
820 dfa->word_char = NULL;
822 if (BE (dfa->nodes == NULL || dfa->state_table == NULL
823 || dfa->subexps == NULL, 0))
825 /* We don't bother to free anything which was allocated. Very
826 soon the process will go down anyway. */
827 dfa->subexps = NULL;
828 dfa->state_table = NULL;
829 dfa->nodes = NULL;
830 return REG_ESPACE;
832 return REG_NOERROR;
835 /* Initialize WORD_CHAR table, which indicate which character is
836 "word". In this case "word" means that it is the word construction
837 character used by some operators like "\<", "\>", etc. */
839 static reg_errcode_t
840 init_word_char (dfa)
841 re_dfa_t *dfa;
843 int i, j, ch;
844 dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
845 if (BE (dfa->word_char == NULL, 0))
846 return REG_ESPACE;
847 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
848 for (j = 0; j < UINT_BITS; ++j, ++ch)
849 if (isalnum (ch) || ch == '_')
850 dfa->word_char[i] |= 1 << j;
851 return REG_NOERROR;
854 /* Free the work area which are only used while compiling. */
856 static void
857 free_workarea_compile (preg)
858 regex_t *preg;
860 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
861 free_bin_tree (dfa->str_tree);
862 dfa->str_tree = NULL;
865 /* Create initial states for all contexts. */
867 static reg_errcode_t
868 create_initial_state (dfa)
869 re_dfa_t *dfa;
871 int first, i;
872 reg_errcode_t err;
873 re_node_set init_nodes;
875 /* Initial states have the epsilon closure of the node which is
876 the first node of the regular expression. */
877 first = dfa->str_tree->first;
878 dfa->init_node = first;
879 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
880 if (BE (err != REG_NOERROR, 0))
881 return err;
883 /* The back-references which are in initial states can epsilon transit,
884 since in this case all of the subexpressions can be null.
885 Then we add epsilon closures of the nodes which are the next nodes of
886 the back-references. */
887 if (dfa->nbackref > 0)
888 for (i = 0; i < init_nodes.nelem; ++i)
890 int node_idx = init_nodes.elems[i];
891 re_token_type_t type = dfa->nodes[node_idx].type;
893 int clexp_idx;
894 if (type != OP_BACK_REF)
895 continue;
896 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
898 re_token_t *clexp_node;
899 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
900 if (clexp_node->type == OP_CLOSE_SUBEXP
901 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
902 break;
904 if (clexp_idx == init_nodes.nelem)
905 continue;
907 if (type == OP_BACK_REF)
909 int dest_idx = dfa->edests[node_idx].elems[0];
910 if (!re_node_set_contains (&init_nodes, dest_idx))
912 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
913 i = 0;
918 /* It must be the first time to invoke acquire_state. */
919 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
920 /* We don't check ERR here, since the initial state must not be NULL. */
921 if (BE (dfa->init_state == NULL, 0))
922 return err;
923 if (dfa->init_state->has_constraint)
925 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
926 CONTEXT_WORD);
927 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
928 CONTEXT_NEWLINE);
929 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
930 &init_nodes,
931 CONTEXT_NEWLINE
932 | CONTEXT_BEGBUF);
933 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
934 || dfa->init_state_begbuf == NULL, 0))
935 return err;
937 else
938 dfa->init_state_word = dfa->init_state_nl
939 = dfa->init_state_begbuf = dfa->init_state;
941 re_node_set_free (&init_nodes);
942 return REG_NOERROR;
945 /* Analyze the structure tree, and calculate "first", "next", "edest",
946 "eclosure", and "inveclosure". */
948 static reg_errcode_t
949 analyze (dfa)
950 re_dfa_t *dfa;
952 int i;
953 reg_errcode_t ret;
955 /* Allocate arrays. */
956 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
957 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
958 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
959 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
960 if (BE (dfa->nexts == NULL || dfa->edests == NULL
961 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
962 return REG_ESPACE;
963 /* Initialize them. */
964 for (i = 0; i < dfa->nodes_len; ++i)
966 dfa->nexts[i] = -1;
967 re_node_set_init_empty (dfa->edests + i);
968 re_node_set_init_empty (dfa->eclosures + i);
969 re_node_set_init_empty (dfa->inveclosures + i);
972 ret = analyze_tree (dfa, dfa->str_tree);
973 if (BE (ret == REG_NOERROR, 1))
975 ret = calc_eclosure (dfa);
976 if (ret == REG_NOERROR)
977 calc_inveclosure (dfa);
979 return ret;
982 /* Helper functions for analyze.
983 This function calculate "first", "next", and "edest" for the subtree
984 whose root is NODE. */
986 static reg_errcode_t
987 analyze_tree (dfa, node)
988 re_dfa_t *dfa;
989 bin_tree_t *node;
991 reg_errcode_t ret;
992 if (node->first == -1)
993 calc_first (dfa, node);
994 if (node->next == -1)
995 calc_next (dfa, node);
996 if (node->eclosure.nelem == 0)
997 calc_epsdest (dfa, node);
998 /* Calculate "first" etc. for the left child. */
999 if (node->left != NULL)
1001 ret = analyze_tree (dfa, node->left);
1002 if (BE (ret != REG_NOERROR, 0))
1003 return ret;
1005 /* Calculate "first" etc. for the right child. */
1006 if (node->right != NULL)
1008 ret = analyze_tree (dfa, node->right);
1009 if (BE (ret != REG_NOERROR, 0))
1010 return ret;
1012 return REG_NOERROR;
1015 /* Calculate "first" for the node NODE. */
1016 static void
1017 calc_first (dfa, node)
1018 re_dfa_t *dfa;
1019 bin_tree_t *node;
1021 int idx, type;
1022 idx = node->node_idx;
1023 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1025 switch (type)
1027 #ifdef DEBUG
1028 case OP_OPEN_BRACKET:
1029 case OP_CLOSE_BRACKET:
1030 case OP_OPEN_DUP_NUM:
1031 case OP_CLOSE_DUP_NUM:
1032 case OP_NON_MATCH_LIST:
1033 case OP_OPEN_COLL_ELEM:
1034 case OP_CLOSE_COLL_ELEM:
1035 case OP_OPEN_EQUIV_CLASS:
1036 case OP_CLOSE_EQUIV_CLASS:
1037 case OP_OPEN_CHAR_CLASS:
1038 case OP_CLOSE_CHAR_CLASS:
1039 /* These must not be appeared here. */
1040 assert (0);
1041 #endif
1042 case END_OF_RE:
1043 case CHARACTER:
1044 case OP_PERIOD:
1045 case OP_DUP_ASTERISK:
1046 case OP_DUP_QUESTION:
1047 #ifdef RE_ENABLE_I18N
1048 case COMPLEX_BRACKET:
1049 #endif /* RE_ENABLE_I18N */
1050 case SIMPLE_BRACKET:
1051 case OP_BACK_REF:
1052 case ANCHOR:
1053 case OP_OPEN_SUBEXP:
1054 case OP_CLOSE_SUBEXP:
1055 node->first = idx;
1056 break;
1057 case OP_DUP_PLUS:
1058 #ifdef DEBUG
1059 assert (node->left != NULL);
1060 #endif
1061 if (node->left->first == -1)
1062 calc_first (dfa, node->left);
1063 node->first = node->left->first;
1064 break;
1065 case OP_ALT:
1066 node->first = idx;
1067 break;
1068 /* else fall through */
1069 default:
1070 #ifdef DEBUG
1071 assert (node->left != NULL);
1072 #endif
1073 if (node->left->first == -1)
1074 calc_first (dfa, node->left);
1075 node->first = node->left->first;
1076 break;
1080 /* Calculate "next" for the node NODE. */
1082 static void
1083 calc_next (dfa, node)
1084 re_dfa_t *dfa;
1085 bin_tree_t *node;
1087 int idx, type;
1088 bin_tree_t *parent = node->parent;
1089 if (parent == NULL)
1091 node->next = -1;
1092 idx = node->node_idx;
1093 if (node->type == 0)
1094 dfa->nexts[idx] = node->next;
1095 return;
1098 idx = parent->node_idx;
1099 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1101 switch (type)
1103 case OP_DUP_ASTERISK:
1104 case OP_DUP_PLUS:
1105 node->next = idx;
1106 break;
1107 case CONCAT:
1108 if (parent->left == node)
1110 if (parent->right->first == -1)
1111 calc_first (dfa, parent->right);
1112 node->next = parent->right->first;
1113 break;
1115 /* else fall through */
1116 default:
1117 if (parent->next == -1)
1118 calc_next (dfa, parent);
1119 node->next = parent->next;
1120 break;
1122 idx = node->node_idx;
1123 if (node->type == 0)
1124 dfa->nexts[idx] = node->next;
1127 /* Calculate "edest" for the node NODE. */
1129 static void
1130 calc_epsdest (dfa, node)
1131 re_dfa_t *dfa;
1132 bin_tree_t *node;
1134 int idx;
1135 idx = node->node_idx;
1136 if (node->type == 0)
1138 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1139 || dfa->nodes[idx].type == OP_DUP_PLUS
1140 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1142 if (node->left->first == -1)
1143 calc_first (dfa, node->left);
1144 if (node->next == -1)
1145 calc_next (dfa, node);
1146 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1147 node->next);
1149 else if (dfa->nodes[idx].type == OP_ALT)
1151 int left, right;
1152 if (node->left != NULL)
1154 if (node->left->first == -1)
1155 calc_first (dfa, node->left);
1156 left = node->left->first;
1158 else
1160 if (node->next == -1)
1161 calc_next (dfa, node);
1162 left = node->next;
1164 if (node->right != NULL)
1166 if (node->right->first == -1)
1167 calc_first (dfa, node->right);
1168 right = node->right->first;
1170 else
1172 if (node->next == -1)
1173 calc_next (dfa, node);
1174 right = node->next;
1176 re_node_set_init_2 (dfa->edests + idx, left, right);
1178 else if (dfa->nodes[idx].type == ANCHOR
1179 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1180 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1181 || dfa->nodes[idx].type == OP_BACK_REF)
1182 re_node_set_init_1 (dfa->edests + idx, node->next);
1186 /* Duplicate the epsilon closure of the node ROOT_NODE.
1187 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1188 to their own constraint. */
1190 static reg_errcode_t
1191 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1192 init_constraint)
1193 re_dfa_t *dfa;
1194 int top_org_node, top_clone_node, root_node;
1195 unsigned int init_constraint;
1197 reg_errcode_t err;
1198 int org_node, clone_node, ret;
1199 unsigned int constraint = init_constraint;
1200 for (org_node = top_org_node, clone_node = top_clone_node;;)
1202 int org_dest, clone_dest;
1203 if (dfa->nodes[org_node].type == OP_BACK_REF)
1205 /* If the back reference epsilon-transit, its destination must
1206 also have the constraint. Then duplicate the epsilon closure
1207 of the destination of the back reference, and store it in
1208 edests of the back reference. */
1209 org_dest = dfa->nexts[org_node];
1210 re_node_set_empty (dfa->edests + clone_node);
1211 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1212 if (BE (err != REG_NOERROR, 0))
1213 return err;
1214 dfa->nexts[clone_node] = dfa->nexts[org_node];
1215 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1216 if (BE (ret < 0, 0))
1217 return REG_ESPACE;
1219 else if (dfa->edests[org_node].nelem == 0)
1221 /* In case of the node can't epsilon-transit, don't duplicate the
1222 destination and store the original destination as the
1223 destination of the node. */
1224 dfa->nexts[clone_node] = dfa->nexts[org_node];
1225 break;
1227 else if (dfa->edests[org_node].nelem == 1)
1229 /* In case of the node can epsilon-transit, and it has only one
1230 destination. */
1231 org_dest = dfa->edests[org_node].elems[0];
1232 re_node_set_empty (dfa->edests + clone_node);
1233 if (dfa->nodes[org_node].type == ANCHOR)
1235 /* In case of the node has another constraint, append it. */
1236 if (org_node == root_node && clone_node != org_node)
1238 /* ...but if the node is root_node itself, it means the
1239 epsilon closure have a loop, then tie it to the
1240 destination of the root_node. */
1241 ret = re_node_set_insert (dfa->edests + clone_node,
1242 org_dest);
1243 if (BE (ret < 0, 0))
1244 return REG_ESPACE;
1245 break;
1247 constraint |= dfa->nodes[org_node].opr.ctx_type;
1249 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1250 if (BE (err != REG_NOERROR, 0))
1251 return err;
1252 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1253 if (BE (ret < 0, 0))
1254 return REG_ESPACE;
1256 else /* dfa->edests[org_node].nelem == 2 */
1258 /* In case of the node can epsilon-transit, and it has two
1259 destinations. */
1260 org_dest = dfa->edests[org_node].elems[0];
1261 re_node_set_empty (dfa->edests + clone_node);
1262 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1263 if (BE (err != REG_NOERROR, 0))
1264 return err;
1265 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1266 if (BE (ret < 0, 0))
1267 return REG_ESPACE;
1269 err = duplicate_node_closure (dfa, org_dest, clone_dest, root_node,
1270 constraint);
1271 if (BE (err != REG_NOERROR, 0))
1272 return err;
1274 org_dest = dfa->edests[org_node].elems[1];
1275 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1276 if (BE (err != REG_NOERROR, 0))
1277 return err;
1278 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1279 if (BE (ret < 0, 0))
1280 return REG_ESPACE;
1282 org_node = org_dest;
1283 clone_node = clone_dest;
1285 return REG_NOERROR;
1288 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1289 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1290 otherwise return the error code. */
1292 static reg_errcode_t
1293 duplicate_node (new_idx, dfa, org_idx, constraint)
1294 re_dfa_t *dfa;
1295 int *new_idx, org_idx;
1296 unsigned int constraint;
1298 re_token_t dup;
1299 int dup_idx;
1301 dup = dfa->nodes[org_idx];
1302 dup_idx = re_dfa_add_node (dfa, dup, 1);
1303 if (BE (dup_idx == -1, 0))
1304 return REG_ESPACE;
1305 dfa->nodes[dup_idx].constraint = constraint;
1306 if (dfa->nodes[org_idx].type == ANCHOR)
1307 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1308 dfa->nodes[dup_idx].duplicated = 1;
1309 re_node_set_init_empty (dfa->edests + dup_idx);
1310 re_node_set_init_empty (dfa->eclosures + dup_idx);
1311 re_node_set_init_empty (dfa->inveclosures + dup_idx);
1313 *new_idx = dup_idx;
1314 return REG_NOERROR;
1317 static void
1318 calc_inveclosure (dfa)
1319 re_dfa_t *dfa;
1321 int src, idx, dest;
1322 for (src = 0; src < dfa->nodes_len; ++src)
1324 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1326 dest = dfa->eclosures[src].elems[idx];
1327 re_node_set_insert (dfa->inveclosures + dest, src);
1332 /* Calculate "eclosure" for all the node in DFA. */
1334 static reg_errcode_t
1335 calc_eclosure (dfa)
1336 re_dfa_t *dfa;
1338 int node_idx, incomplete;
1339 #ifdef DEBUG
1340 assert (dfa->nodes_len > 0);
1341 #endif
1342 incomplete = 0;
1343 /* For each nodes, calculate epsilon closure. */
1344 for (node_idx = 0; ; ++node_idx)
1346 reg_errcode_t err;
1347 re_node_set eclosure_elem;
1348 if (node_idx == dfa->nodes_len)
1350 if (!incomplete)
1351 break;
1352 incomplete = 0;
1353 node_idx = 0;
1356 #ifdef DEBUG
1357 assert (dfa->eclosures[node_idx].nelem != -1);
1358 #endif
1359 /* If we have already calculated, skip it. */
1360 if (dfa->eclosures[node_idx].nelem != 0)
1361 continue;
1362 /* Calculate epsilon closure of `node_idx'. */
1363 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1364 if (BE (err != REG_NOERROR, 0))
1365 return err;
1367 if (dfa->eclosures[node_idx].nelem == 0)
1369 incomplete = 1;
1370 re_node_set_free (&eclosure_elem);
1373 return REG_NOERROR;
1376 /* Calculate epsilon closure of NODE. */
1378 static reg_errcode_t
1379 calc_eclosure_iter (new_set, dfa, node, root)
1380 re_node_set *new_set;
1381 re_dfa_t *dfa;
1382 int node, root;
1384 reg_errcode_t err;
1385 unsigned int constraint;
1386 int i, incomplete;
1387 re_node_set eclosure;
1388 incomplete = 0;
1389 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1390 if (BE (err != REG_NOERROR, 0))
1391 return err;
1393 /* This indicates that we are calculating this node now.
1394 We reference this value to avoid infinite loop. */
1395 dfa->eclosures[node].nelem = -1;
1397 constraint = ((dfa->nodes[node].type == ANCHOR)
1398 ? dfa->nodes[node].opr.ctx_type : 0);
1399 /* If the current node has constraints, duplicate all nodes.
1400 Since they must inherit the constraints. */
1401 if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1403 int org_node, cur_node;
1404 org_node = cur_node = node;
1405 err = duplicate_node_closure (dfa, node, node, node, constraint);
1406 if (BE (err != REG_NOERROR, 0))
1407 return err;
1410 /* Expand each epsilon destination nodes. */
1411 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1412 for (i = 0; i < dfa->edests[node].nelem; ++i)
1414 re_node_set eclosure_elem;
1415 int edest = dfa->edests[node].elems[i];
1416 /* If calculating the epsilon closure of `edest' is in progress,
1417 return intermediate result. */
1418 if (dfa->eclosures[edest].nelem == -1)
1420 incomplete = 1;
1421 continue;
1423 /* If we haven't calculated the epsilon closure of `edest' yet,
1424 calculate now. Otherwise use calculated epsilon closure. */
1425 if (dfa->eclosures[edest].nelem == 0)
1427 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1428 if (BE (err != REG_NOERROR, 0))
1429 return err;
1431 else
1432 eclosure_elem = dfa->eclosures[edest];
1433 /* Merge the epsilon closure of `edest'. */
1434 re_node_set_merge (&eclosure, &eclosure_elem);
1435 /* If the epsilon closure of `edest' is incomplete,
1436 the epsilon closure of this node is also incomplete. */
1437 if (dfa->eclosures[edest].nelem == 0)
1439 incomplete = 1;
1440 re_node_set_free (&eclosure_elem);
1444 /* Epsilon closures include itself. */
1445 re_node_set_insert (&eclosure, node);
1446 if (incomplete && !root)
1447 dfa->eclosures[node].nelem = 0;
1448 else
1449 dfa->eclosures[node] = eclosure;
1450 *new_set = eclosure;
1451 return REG_NOERROR;
1454 /* Functions for token which are used in the parser. */
1456 /* Fetch a token from INPUT.
1457 We must not use this function inside bracket expressions. */
1459 static re_token_t
1460 fetch_token (input, syntax)
1461 re_string_t *input;
1462 reg_syntax_t syntax;
1464 re_token_t token;
1465 int consumed_byte;
1466 consumed_byte = peek_token (&token, input, syntax);
1467 re_string_skip_bytes (input, consumed_byte);
1468 return token;
1471 /* Peek a token from INPUT, and return the length of the token.
1472 We must not use this function inside bracket expressions. */
1474 static int
1475 peek_token (token, input, syntax)
1476 re_token_t *token;
1477 re_string_t *input;
1478 reg_syntax_t syntax;
1480 unsigned char c;
1482 if (re_string_eoi (input))
1484 token->type = END_OF_RE;
1485 return 0;
1488 c = re_string_peek_byte (input, 0);
1489 token->opr.c = c;
1491 #ifdef RE_ENABLE_I18N
1492 token->mb_partial = 0;
1493 if (MB_CUR_MAX > 1 &&
1494 !re_string_first_byte (input, re_string_cur_idx (input)))
1496 token->type = CHARACTER;
1497 token->mb_partial = 1;
1498 return 1;
1500 #endif
1501 if (c == '\\')
1503 unsigned char c2;
1504 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1506 token->type = BACK_SLASH;
1507 return 1;
1510 c2 = re_string_peek_byte_case (input, 1);
1511 token->opr.c = c2;
1512 token->type = CHARACTER;
1513 switch (c2)
1515 case '|':
1516 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1517 token->type = OP_ALT;
1518 break;
1519 case '1': case '2': case '3': case '4': case '5':
1520 case '6': case '7': case '8': case '9':
1521 if (!(syntax & RE_NO_BK_REFS))
1523 token->type = OP_BACK_REF;
1524 token->opr.idx = c2 - '0';
1526 break;
1527 case '<':
1528 if (!(syntax & RE_NO_GNU_OPS))
1530 token->type = ANCHOR;
1531 token->opr.idx = WORD_FIRST;
1533 break;
1534 case '>':
1535 if (!(syntax & RE_NO_GNU_OPS))
1537 token->type = ANCHOR;
1538 token->opr.idx = WORD_LAST;
1540 break;
1541 case 'b':
1542 if (!(syntax & RE_NO_GNU_OPS))
1544 token->type = ANCHOR;
1545 token->opr.idx = WORD_DELIM;
1547 break;
1548 case 'B':
1549 if (!(syntax & RE_NO_GNU_OPS))
1551 token->type = ANCHOR;
1552 token->opr.idx = INSIDE_WORD;
1554 break;
1555 case 'w':
1556 if (!(syntax & RE_NO_GNU_OPS))
1557 token->type = OP_WORD;
1558 break;
1559 case 'W':
1560 if (!(syntax & RE_NO_GNU_OPS))
1561 token->type = OP_NOTWORD;
1562 break;
1563 case '`':
1564 if (!(syntax & RE_NO_GNU_OPS))
1566 token->type = ANCHOR;
1567 token->opr.idx = BUF_FIRST;
1569 break;
1570 case '\'':
1571 if (!(syntax & RE_NO_GNU_OPS))
1573 token->type = ANCHOR;
1574 token->opr.idx = BUF_LAST;
1576 break;
1577 case '(':
1578 if (!(syntax & RE_NO_BK_PARENS))
1579 token->type = OP_OPEN_SUBEXP;
1580 break;
1581 case ')':
1582 if (!(syntax & RE_NO_BK_PARENS))
1583 token->type = OP_CLOSE_SUBEXP;
1584 break;
1585 case '+':
1586 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1587 token->type = OP_DUP_PLUS;
1588 break;
1589 case '?':
1590 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1591 token->type = OP_DUP_QUESTION;
1592 break;
1593 case '{':
1594 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1595 token->type = OP_OPEN_DUP_NUM;
1596 break;
1597 case '}':
1598 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1599 token->type = OP_CLOSE_DUP_NUM;
1600 break;
1601 default:
1602 break;
1604 return 2;
1607 token->type = CHARACTER;
1608 switch (c)
1610 case '\n':
1611 if (syntax & RE_NEWLINE_ALT)
1612 token->type = OP_ALT;
1613 break;
1614 case '|':
1615 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1616 token->type = OP_ALT;
1617 break;
1618 case '*':
1619 token->type = OP_DUP_ASTERISK;
1620 break;
1621 case '+':
1622 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1623 token->type = OP_DUP_PLUS;
1624 break;
1625 case '?':
1626 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1627 token->type = OP_DUP_QUESTION;
1628 break;
1629 case '{':
1630 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1631 token->type = OP_OPEN_DUP_NUM;
1632 break;
1633 case '}':
1634 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1635 token->type = OP_CLOSE_DUP_NUM;
1636 break;
1637 case '(':
1638 if (syntax & RE_NO_BK_PARENS)
1639 token->type = OP_OPEN_SUBEXP;
1640 break;
1641 case ')':
1642 if (syntax & RE_NO_BK_PARENS)
1643 token->type = OP_CLOSE_SUBEXP;
1644 break;
1645 case '[':
1646 token->type = OP_OPEN_BRACKET;
1647 break;
1648 case '.':
1649 token->type = OP_PERIOD;
1650 break;
1651 case '^':
1652 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1653 re_string_cur_idx (input) != 0)
1655 char prev = re_string_peek_byte (input, -1);
1656 if (prev != '|' && prev != '(' &&
1657 (!(syntax & RE_NEWLINE_ALT) || prev != '\n'))
1658 break;
1660 token->type = ANCHOR;
1661 token->opr.idx = LINE_FIRST;
1662 break;
1663 case '$':
1664 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1665 re_string_cur_idx (input) + 1 != re_string_length (input))
1667 re_token_t next;
1668 re_string_skip_bytes (input, 1);
1669 peek_token (&next, input, syntax);
1670 re_string_skip_bytes (input, -1);
1671 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1672 break;
1674 token->type = ANCHOR;
1675 token->opr.idx = LINE_LAST;
1676 break;
1677 default:
1678 break;
1680 return 1;
1683 /* Peek a token from INPUT, and return the length of the token.
1684 We must not use this function out of bracket expressions. */
1686 static int
1687 peek_token_bracket (token, input, syntax)
1688 re_token_t *token;
1689 re_string_t *input;
1690 reg_syntax_t syntax;
1692 unsigned char c;
1693 if (re_string_eoi (input))
1695 token->type = END_OF_RE;
1696 return 0;
1698 c = re_string_peek_byte (input, 0);
1699 token->opr.c = c;
1701 #ifdef RE_ENABLE_I18N
1702 if (MB_CUR_MAX > 1 &&
1703 !re_string_first_byte (input, re_string_cur_idx (input)))
1705 token->type = CHARACTER;
1706 return 1;
1708 #endif /* RE_ENABLE_I18N */
1710 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1712 /* In this case, '\' escape a character. */
1713 unsigned char c2;
1714 re_string_skip_bytes (input, 1);
1715 c2 = re_string_peek_byte (input, 0);
1716 token->opr.c = c2;
1717 token->type = CHARACTER;
1718 return 1;
1720 if (c == '[') /* '[' is a special char in a bracket exps. */
1722 unsigned char c2;
1723 int token_len;
1724 c2 = re_string_peek_byte (input, 1);
1725 token->opr.c = c2;
1726 token_len = 2;
1727 switch (c2)
1729 case '.':
1730 token->type = OP_OPEN_COLL_ELEM;
1731 break;
1732 case '=':
1733 token->type = OP_OPEN_EQUIV_CLASS;
1734 break;
1735 case ':':
1736 if (syntax & RE_CHAR_CLASSES)
1738 token->type = OP_OPEN_CHAR_CLASS;
1739 break;
1741 /* else fall through. */
1742 default:
1743 token->type = CHARACTER;
1744 token->opr.c = c;
1745 token_len = 1;
1746 break;
1748 return token_len;
1750 switch (c)
1752 case '-':
1753 token->type = OP_CHARSET_RANGE;
1754 break;
1755 case ']':
1756 token->type = OP_CLOSE_BRACKET;
1757 break;
1758 case '^':
1759 token->type = OP_NON_MATCH_LIST;
1760 break;
1761 default:
1762 token->type = CHARACTER;
1764 return 1;
1767 /* Functions for parser. */
1769 /* Entry point of the parser.
1770 Parse the regular expression REGEXP and return the structure tree.
1771 If an error is occured, ERR is set by error code, and return NULL.
1772 This function build the following tree, from regular expression <reg_exp>:
1776 <reg_exp> EOR
1778 CAT means concatenation.
1779 EOR means end of regular expression. */
1781 static bin_tree_t *
1782 parse (regexp, preg, syntax, err)
1783 re_string_t *regexp;
1784 regex_t *preg;
1785 reg_syntax_t syntax;
1786 reg_errcode_t *err;
1788 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1789 bin_tree_t *tree, *eor, *root;
1790 re_token_t current_token;
1791 int new_idx;
1792 current_token = fetch_token (regexp, syntax);
1793 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
1794 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1795 return NULL;
1796 new_idx = re_dfa_add_node (dfa, current_token, 0);
1797 eor = create_tree (NULL, NULL, 0, new_idx);
1798 if (tree != NULL)
1799 root = create_tree (tree, eor, CONCAT, 0);
1800 else
1801 root = eor;
1802 if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1804 *err = REG_ESPACE;
1805 return NULL;
1807 return root;
1810 /* This function build the following tree, from regular expression
1811 <branch1>|<branch2>:
1815 <branch1> <branch2>
1817 ALT means alternative, which represents the operator `|'. */
1819 static bin_tree_t *
1820 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1821 re_string_t *regexp;
1822 regex_t *preg;
1823 re_token_t *token;
1824 reg_syntax_t syntax;
1825 int nest;
1826 reg_errcode_t *err;
1828 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1829 bin_tree_t *tree, *branch = NULL;
1830 int new_idx;
1831 tree = parse_branch (regexp, preg, token, syntax, nest, err);
1832 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1833 return NULL;
1835 while (token->type == OP_ALT)
1837 re_token_t alt_token = *token;
1838 new_idx = re_dfa_add_node (dfa, alt_token, 0);
1839 *token = fetch_token (regexp, syntax);
1840 if (token->type != OP_ALT && token->type != END_OF_RE
1841 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1843 branch = parse_branch (regexp, preg, token, syntax, nest, err);
1844 if (BE (*err != REG_NOERROR && branch == NULL, 0))
1846 free_bin_tree (tree);
1847 return NULL;
1850 else
1851 branch = NULL;
1852 tree = create_tree (tree, branch, 0, new_idx);
1853 if (BE (new_idx == -1 || tree == NULL, 0))
1855 *err = REG_ESPACE;
1856 return NULL;
1858 dfa->has_plural_match = 1;
1860 return tree;
1863 /* This function build the following tree, from regular expression
1864 <exp1><exp2>:
1868 <exp1> <exp2>
1870 CAT means concatenation. */
1872 static bin_tree_t *
1873 parse_branch (regexp, preg, token, syntax, nest, err)
1874 re_string_t *regexp;
1875 regex_t *preg;
1876 re_token_t *token;
1877 reg_syntax_t syntax;
1878 int nest;
1879 reg_errcode_t *err;
1881 bin_tree_t *tree, *exp;
1882 tree = parse_expression (regexp, preg, token, syntax, nest, err);
1883 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1884 return NULL;
1886 while (token->type != OP_ALT && token->type != END_OF_RE
1887 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1889 exp = parse_expression (regexp, preg, token, syntax, nest, err);
1890 if (BE (*err != REG_NOERROR && exp == NULL, 0))
1892 free_bin_tree (tree);
1893 return NULL;
1895 if (tree != NULL && exp != NULL)
1897 tree = create_tree (tree, exp, CONCAT, 0);
1898 if (tree == NULL)
1900 *err = REG_ESPACE;
1901 return NULL;
1904 else if (tree == NULL)
1905 tree = exp;
1906 /* Otherwise exp == NULL, we don't need to create new tree. */
1908 return tree;
1911 /* This function build the following tree, from regular expression a*:
1917 static bin_tree_t *
1918 parse_expression (regexp, preg, token, syntax, nest, err)
1919 re_string_t *regexp;
1920 regex_t *preg;
1921 re_token_t *token;
1922 reg_syntax_t syntax;
1923 int nest;
1924 reg_errcode_t *err;
1926 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1927 bin_tree_t *tree;
1928 int new_idx;
1929 switch (token->type)
1931 case CHARACTER:
1932 new_idx = re_dfa_add_node (dfa, *token, 0);
1933 tree = create_tree (NULL, NULL, 0, new_idx);
1934 if (BE (new_idx == -1 || tree == NULL, 0))
1936 *err = REG_ESPACE;
1937 return NULL;
1939 #ifdef RE_ENABLE_I18N
1940 if (MB_CUR_MAX > 1)
1942 while (!re_string_eoi (regexp)
1943 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
1945 bin_tree_t *mbc_remain;
1946 *token = fetch_token (regexp, syntax);
1947 new_idx = re_dfa_add_node (dfa, *token, 0);
1948 mbc_remain = create_tree (NULL, NULL, 0, new_idx);
1949 tree = create_tree (tree, mbc_remain, CONCAT, 0);
1950 if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
1951 return *err = REG_ESPACE, NULL;
1954 #endif
1955 break;
1956 case OP_OPEN_SUBEXP:
1957 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
1958 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1959 return NULL;
1960 break;
1961 case OP_OPEN_BRACKET:
1962 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
1963 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1964 return NULL;
1965 break;
1966 case OP_BACK_REF:
1967 if (BE (preg->re_nsub < token->opr.idx
1968 || dfa->subexps[token->opr.idx - 1].end == -1, 0))
1970 *err = REG_ESUBREG;
1971 return NULL;
1973 new_idx = re_dfa_add_node (dfa, *token, 0);
1974 tree = create_tree (NULL, NULL, 0, new_idx);
1975 if (BE (new_idx == -1 || tree == NULL, 0))
1977 *err = REG_ESPACE;
1978 return NULL;
1980 ++dfa->nbackref;
1981 dfa->has_mb_node = 1;
1982 break;
1983 case OP_DUP_ASTERISK:
1984 case OP_DUP_PLUS:
1985 case OP_DUP_QUESTION:
1986 case OP_OPEN_DUP_NUM:
1987 if (syntax & RE_CONTEXT_INVALID_OPS)
1989 *err = REG_BADRPT;
1990 return NULL;
1992 else if (syntax & RE_CONTEXT_INDEP_OPS)
1994 *token = fetch_token (regexp, syntax);
1995 return parse_expression (regexp, preg, token, syntax, nest, err);
1997 /* else fall through */
1998 case OP_CLOSE_SUBEXP:
1999 if ((token->type == OP_CLOSE_SUBEXP) &&
2000 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2002 *err = REG_ERPAREN;
2003 return NULL;
2005 /* else fall through */
2006 case OP_CLOSE_DUP_NUM:
2007 /* We treat it as a normal character. */
2009 /* Then we can these characters as normal characters. */
2010 token->type = CHARACTER;
2011 new_idx = re_dfa_add_node (dfa, *token, 0);
2012 tree = create_tree (NULL, NULL, 0, new_idx);
2013 if (BE (new_idx == -1 || tree == NULL, 0))
2015 *err = REG_ESPACE;
2016 return NULL;
2018 break;
2019 case ANCHOR:
2020 if (dfa->word_char == NULL)
2022 *err = init_word_char (dfa);
2023 if (BE (*err != REG_NOERROR, 0))
2024 return NULL;
2026 if (token->opr.ctx_type == WORD_DELIM)
2028 bin_tree_t *tree_first, *tree_last;
2029 int idx_first, idx_last;
2030 token->opr.ctx_type = WORD_FIRST;
2031 idx_first = re_dfa_add_node (dfa, *token, 0);
2032 tree_first = create_tree (NULL, NULL, 0, idx_first);
2033 token->opr.ctx_type = WORD_LAST;
2034 idx_last = re_dfa_add_node (dfa, *token, 0);
2035 tree_last = create_tree (NULL, NULL, 0, idx_last);
2036 token->type = OP_ALT;
2037 new_idx = re_dfa_add_node (dfa, *token, 0);
2038 tree = create_tree (tree_first, tree_last, 0, new_idx);
2039 if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2040 || tree_first == NULL || tree_last == NULL
2041 || tree == NULL, 0))
2043 *err = REG_ESPACE;
2044 return NULL;
2047 else
2049 new_idx = re_dfa_add_node (dfa, *token, 0);
2050 tree = create_tree (NULL, NULL, 0, new_idx);
2051 if (BE (new_idx == -1 || tree == NULL, 0))
2052 return *err = REG_ESPACE, NULL;
2054 /* We must return here, since ANCHORs can't be followed
2055 by repetition operators.
2056 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2057 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2058 *token = fetch_token (regexp, syntax);
2059 return tree;
2060 case OP_PERIOD:
2061 new_idx = re_dfa_add_node (dfa, *token, 0);
2062 tree = create_tree (NULL, NULL, 0, new_idx);
2063 if (BE (new_idx == -1 || tree == NULL, 0))
2065 *err = REG_ESPACE;
2066 return NULL;
2068 if (MB_CUR_MAX > 1)
2069 dfa->has_mb_node = 1;
2070 break;
2071 case OP_WORD:
2072 tree = build_word_op (dfa, 0, err);
2073 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2074 return NULL;
2075 break;
2076 case OP_NOTWORD:
2077 tree = build_word_op (dfa, 1, err);
2078 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2079 return NULL;
2080 break;
2081 case OP_ALT:
2082 case END_OF_RE:
2083 return NULL;
2084 case BACK_SLASH:
2085 *err = REG_EESCAPE;
2086 return NULL;
2087 default:
2088 /* Must not happen? */
2089 #ifdef DEBUG
2090 assert (0);
2091 #endif
2092 return NULL;
2094 *token = fetch_token (regexp, syntax);
2096 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2097 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2099 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2100 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2101 return NULL;
2102 dfa->has_plural_match = 1;
2105 return tree;
2108 /* This function build the following tree, from regular expression
2109 (<reg_exp>):
2110 SUBEXP
2112 <reg_exp>
2115 static bin_tree_t *
2116 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2117 re_string_t *regexp;
2118 regex_t *preg;
2119 re_token_t *token;
2120 reg_syntax_t syntax;
2121 int nest;
2122 reg_errcode_t *err;
2124 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2125 bin_tree_t *tree, *left_par, *right_par;
2126 size_t cur_nsub;
2127 int new_idx;
2128 cur_nsub = preg->re_nsub++;
2129 if (dfa->subexps_alloc < preg->re_nsub)
2131 re_subexp_t *new_array;
2132 dfa->subexps_alloc *= 2;
2133 new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2134 if (BE (new_array == NULL, 0))
2136 dfa->subexps_alloc /= 2;
2137 *err = REG_ESPACE;
2138 return NULL;
2140 dfa->subexps = new_array;
2142 dfa->subexps[cur_nsub].start = dfa->nodes_len;
2143 dfa->subexps[cur_nsub].end = -1;
2145 new_idx = re_dfa_add_node (dfa, *token, 0);
2146 left_par = create_tree (NULL, NULL, 0, new_idx);
2147 if (BE (new_idx == -1 || left_par == NULL, 0))
2149 *err = REG_ESPACE;
2150 return NULL;
2152 dfa->nodes[new_idx].opr.idx = cur_nsub;
2153 *token = fetch_token (regexp, syntax);
2155 /* The subexpression may be a null string. */
2156 if (token->type == OP_CLOSE_SUBEXP)
2157 tree = NULL;
2158 else
2160 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2161 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2162 return NULL;
2164 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2166 free_bin_tree (tree);
2167 *err = REG_BADPAT;
2168 return NULL;
2170 new_idx = re_dfa_add_node (dfa, *token, 0);
2171 dfa->subexps[cur_nsub].end = dfa->nodes_len;
2172 right_par = create_tree (NULL, NULL, 0, new_idx);
2173 tree = ((tree == NULL) ? right_par
2174 : create_tree (tree, right_par, CONCAT, 0));
2175 tree = create_tree (left_par, tree, CONCAT, 0);
2176 if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2178 *err = REG_ESPACE;
2179 return NULL;
2181 dfa->nodes[new_idx].opr.idx = cur_nsub;
2183 return tree;
2186 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2188 static bin_tree_t *
2189 parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2190 bin_tree_t *dup_elem;
2191 re_string_t *regexp;
2192 re_dfa_t *dfa;
2193 re_token_t *token;
2194 reg_syntax_t syntax;
2195 reg_errcode_t *err;
2197 re_token_t dup_token;
2198 bin_tree_t *tree = dup_elem, *work_tree;
2199 int new_idx, start_idx = re_string_cur_idx (regexp);
2200 re_token_t start_token = *token;
2201 if (token->type == OP_OPEN_DUP_NUM)
2203 int i;
2204 int end = 0;
2205 int start = fetch_number (regexp, token, syntax);
2206 bin_tree_t *elem;
2207 if (start == -1)
2209 if (token->type == CHARACTER && token->opr.c == ',')
2210 start = 0; /* We treat "{,m}" as "{0,m}". */
2211 else
2213 *err = REG_BADBR; /* <re>{} is invalid. */
2214 return NULL;
2217 if (BE (start != -2, 1))
2219 /* We treat "{n}" as "{n,n}". */
2220 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2221 : ((token->type == CHARACTER && token->opr.c == ',')
2222 ? fetch_number (regexp, token, syntax) : -2));
2224 if (BE (start == -2 || end == -2, 0))
2226 /* Invalid sequence. */
2227 if (token->type == OP_CLOSE_DUP_NUM)
2228 goto parse_dup_op_invalid_interval;
2229 else
2230 goto parse_dup_op_ebrace;
2232 if (BE (start == 0 && end == 0, 0))
2234 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2235 *token = fetch_token (regexp, syntax);
2236 free_bin_tree (dup_elem);
2237 return NULL;
2240 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2241 elem = tree;
2242 for (i = 0; i < start; ++i)
2243 if (i != 0)
2245 work_tree = duplicate_tree (elem, dfa);
2246 tree = create_tree (tree, work_tree, CONCAT, 0);
2247 if (BE (work_tree == NULL || tree == NULL, 0))
2248 goto parse_dup_op_espace;
2251 if (end == -1)
2253 /* We treat "<re>{0,}" as "<re>*". */
2254 dup_token.type = OP_DUP_ASTERISK;
2255 if (start > 0)
2257 elem = duplicate_tree (elem, dfa);
2258 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2259 work_tree = create_tree (elem, NULL, 0, new_idx);
2260 tree = create_tree (tree, work_tree, CONCAT, 0);
2261 if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2262 || tree == NULL, 0))
2263 goto parse_dup_op_espace;
2265 else
2267 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2268 tree = create_tree (elem, NULL, 0, new_idx);
2269 if (BE (new_idx == -1 || tree == NULL, 0))
2270 goto parse_dup_op_espace;
2273 else if (end - start > 0)
2275 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2276 dup_token.type = OP_DUP_QUESTION;
2277 if (start > 0)
2279 elem = duplicate_tree (elem, dfa);
2280 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2281 elem = create_tree (elem, NULL, 0, new_idx);
2282 tree = create_tree (tree, elem, CONCAT, 0);
2283 if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2284 goto parse_dup_op_espace;
2286 else
2288 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2289 tree = elem = create_tree (elem, NULL, 0, new_idx);
2290 if (BE (new_idx == -1 || tree == NULL, 0))
2291 goto parse_dup_op_espace;
2293 for (i = 1; i < end - start; ++i)
2295 work_tree = duplicate_tree (elem, dfa);
2296 tree = create_tree (tree, work_tree, CONCAT, 0);
2297 if (BE (work_tree == NULL || tree == NULL, 0))
2299 *err = REG_ESPACE;
2300 return NULL;
2305 else
2307 new_idx = re_dfa_add_node (dfa, *token, 0);
2308 tree = create_tree (tree, NULL, 0, new_idx);
2309 if (BE (new_idx == -1 || tree == NULL, 0))
2311 *err = REG_ESPACE;
2312 return NULL;
2315 *token = fetch_token (regexp, syntax);
2316 return tree;
2318 parse_dup_op_espace:
2319 free_bin_tree (tree);
2320 *err = REG_ESPACE;
2321 return NULL;
2323 parse_dup_op_ebrace:
2324 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2326 *err = REG_EBRACE;
2327 return NULL;
2329 goto parse_dup_op_rollback;
2330 parse_dup_op_invalid_interval:
2331 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2333 *err = REG_BADBR;
2334 return NULL;
2336 parse_dup_op_rollback:
2337 re_string_set_index (regexp, start_idx);
2338 *token = start_token;
2339 token->type = CHARACTER;
2340 return dup_elem;
2343 /* Size of the names for collating symbol/equivalence_class/character_class.
2344 I'm not sure, but maybe enough. */
2345 #define BRACKET_NAME_BUF_SIZE 32
2347 #ifndef _LIBC
2348 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2349 Build the range expression which starts from START_ELEM, and ends
2350 at END_ELEM. The result are written to MBCSET and SBCSET.
2351 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2352 mbcset->range_ends, is a pointer argument sinse we may
2353 update it. */
2355 static reg_errcode_t
2356 # ifdef RE_ENABLE_I18N
2357 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2358 re_charset_t *mbcset;
2359 int *range_alloc;
2360 # else /* not RE_ENABLE_I18N */
2361 build_range_exp (sbcset, start_elem, end_elem)
2362 # endif /* not RE_ENABLE_I18N */
2363 re_bitset_ptr_t sbcset;
2364 bracket_elem_t *start_elem, *end_elem;
2366 unsigned int start_ch, end_ch;
2367 /* Equivalence Classes and Character Classes can't be a range start/end. */
2368 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2369 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2371 return REG_ERANGE;
2373 /* We can handle no multi character collating elements without libc
2374 support. */
2375 if (BE ((start_elem->type == COLL_SYM
2376 && strlen ((char *) start_elem->opr.name) > 1)
2377 || (end_elem->type == COLL_SYM
2378 && strlen ((char *) end_elem->opr.name) > 1), 0))
2379 return REG_ECOLLATE;
2381 # ifdef RE_ENABLE_I18N
2383 wchar_t wc, start_wc, end_wc;
2384 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2386 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2387 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2388 : 0));
2389 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2390 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2391 : 0));
2392 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2393 ? __btowc (start_ch) : start_elem->opr.wch);
2394 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2395 ? __btowc (end_ch) : end_elem->opr.wch);
2396 cmp_buf[0] = start_wc;
2397 cmp_buf[4] = end_wc;
2398 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2399 return REG_ERANGE;
2401 /* Check the space of the arrays. */
2402 if (*range_alloc == mbcset->nranges)
2404 /* There are not enough space, need realloc. */
2405 wchar_t *new_array_start, *new_array_end;
2406 int new_nranges;
2408 /* +1 in case of mbcset->nranges is 0. */
2409 new_nranges = 2 * mbcset->nranges + 1;
2410 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2411 are NULL if *range_alloc == 0. */
2412 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2413 new_nranges);
2414 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2415 new_nranges);
2417 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2418 return REG_ESPACE;
2420 mbcset->range_starts = new_array_start;
2421 mbcset->range_ends = new_array_end;
2422 *range_alloc = new_nranges;
2425 mbcset->range_starts[mbcset->nranges] = start_wc;
2426 mbcset->range_ends[mbcset->nranges++] = end_wc;
2428 /* Build the table for single byte characters. */
2429 for (wc = 0; wc <= SBC_MAX; ++wc)
2431 cmp_buf[2] = wc;
2432 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2433 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2434 bitset_set (sbcset, wc);
2437 # else /* not RE_ENABLE_I18N */
2439 unsigned int ch;
2440 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2441 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2442 : 0));
2443 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2444 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2445 : 0));
2446 if (start_ch > end_ch)
2447 return REG_ERANGE;
2448 /* Build the table for single byte characters. */
2449 for (ch = 0; ch <= SBC_MAX; ++ch)
2450 if (start_ch <= ch && ch <= end_ch)
2451 bitset_set (sbcset, ch);
2453 # endif /* not RE_ENABLE_I18N */
2454 return REG_NOERROR;
2456 #endif /* not _LIBC */
2458 #ifndef _LIBC
2459 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2460 Build the collating element which is represented by NAME.
2461 The result are written to MBCSET and SBCSET.
2462 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2463 pointer argument since we may update it. */
2465 static reg_errcode_t
2466 # ifdef RE_ENABLE_I18N
2467 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2468 re_charset_t *mbcset;
2469 int *coll_sym_alloc;
2470 # else /* not RE_ENABLE_I18N */
2471 build_collating_symbol (sbcset, name)
2472 # endif /* not RE_ENABLE_I18N */
2473 re_bitset_ptr_t sbcset;
2474 const unsigned char *name;
2476 size_t name_len = strlen ((const char *) name);
2477 if (BE (name_len != 1, 0))
2478 return REG_ECOLLATE;
2479 else
2481 bitset_set (sbcset, name[0]);
2482 return REG_NOERROR;
2485 #endif /* not _LIBC */
2487 /* This function parse bracket expression like "[abc]", "[a-c]",
2488 "[[.a-a.]]" etc. */
2490 static bin_tree_t *
2491 parse_bracket_exp (regexp, dfa, token, syntax, err)
2492 re_string_t *regexp;
2493 re_dfa_t *dfa;
2494 re_token_t *token;
2495 reg_syntax_t syntax;
2496 reg_errcode_t *err;
2498 #ifdef _LIBC
2499 const unsigned char *collseqmb;
2500 const char *collseqwc;
2501 uint32_t nrules;
2502 int32_t table_size;
2503 const int32_t *symb_table;
2504 const unsigned char *extra;
2506 /* Local function for parse_bracket_exp used in _LIBC environement.
2507 Seek the collating symbol entry correspondings to NAME.
2508 Return the index of the symbol in the SYMB_TABLE. */
2510 static inline int32_t
2511 seek_collating_symbol_entry (name, name_len)
2512 const unsigned char *name;
2513 size_t name_len;
2515 int32_t hash = elem_hash ((const char *) name, name_len);
2516 int32_t elem = hash % table_size;
2517 int32_t second = hash % (table_size - 2);
2518 while (symb_table[2 * elem] != 0)
2520 /* First compare the hashing value. */
2521 if (symb_table[2 * elem] == hash
2522 /* Compare the length of the name. */
2523 && name_len == extra[symb_table[2 * elem + 1]]
2524 /* Compare the name. */
2525 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2526 name_len) == 0)
2528 /* Yep, this is the entry. */
2529 break;
2532 /* Next entry. */
2533 elem += second;
2535 return elem;
2538 /* Local function for parse_bracket_exp used in _LIBC environement.
2539 Look up the collation sequence value of BR_ELEM.
2540 Return the value if succeeded, UINT_MAX otherwise. */
2542 static inline unsigned int
2543 lookup_collation_sequence_value (br_elem)
2544 bracket_elem_t *br_elem;
2546 if (br_elem->type == SB_CHAR)
2549 if (MB_CUR_MAX == 1)
2551 if (nrules == 0)
2552 return collseqmb[br_elem->opr.ch];
2553 else
2555 wint_t wc = __btowc (br_elem->opr.ch);
2556 return collseq_table_lookup (collseqwc, wc);
2559 else if (br_elem->type == MB_CHAR)
2561 return collseq_table_lookup (collseqwc, br_elem->opr.wch);
2563 else if (br_elem->type == COLL_SYM)
2565 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2566 if (nrules != 0)
2568 int32_t elem, idx;
2569 elem = seek_collating_symbol_entry (br_elem->opr.name,
2570 sym_name_len);
2571 if (symb_table[2 * elem] != 0)
2573 /* We found the entry. */
2574 idx = symb_table[2 * elem + 1];
2575 /* Skip the name of collating element name. */
2576 idx += 1 + extra[idx];
2577 /* Skip the byte sequence of the collating element. */
2578 idx += 1 + extra[idx];
2579 /* Adjust for the alignment. */
2580 idx = (idx + 3) & ~3;
2581 /* Skip the multibyte collation sequence value. */
2582 idx += sizeof (unsigned int);
2583 /* Skip the wide char sequence of the collating element. */
2584 idx += sizeof (unsigned int) *
2585 (1 + *(unsigned int *) (extra + idx));
2586 /* Return the collation sequence value. */
2587 return *(unsigned int *) (extra + idx);
2589 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2591 /* No valid character. Match it as a single byte
2592 character. */
2593 return collseqmb[br_elem->opr.name[0]];
2596 else if (sym_name_len == 1)
2597 return collseqmb[br_elem->opr.name[0]];
2599 return UINT_MAX;
2602 /* Local function for parse_bracket_exp used in _LIBC environement.
2603 Build the range expression which starts from START_ELEM, and ends
2604 at END_ELEM. The result are written to MBCSET and SBCSET.
2605 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2606 mbcset->range_ends, is a pointer argument sinse we may
2607 update it. */
2609 static inline reg_errcode_t
2610 # ifdef RE_ENABLE_I18N
2611 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2612 re_charset_t *mbcset;
2613 int *range_alloc;
2614 # else /* not RE_ENABLE_I18N */
2615 build_range_exp (sbcset, start_elem, end_elem)
2616 # endif /* not RE_ENABLE_I18N */
2617 re_bitset_ptr_t sbcset;
2618 bracket_elem_t *start_elem, *end_elem;
2620 unsigned int ch;
2621 uint32_t start_collseq;
2622 uint32_t end_collseq;
2624 # ifdef RE_ENABLE_I18N
2625 /* Check the space of the arrays. */
2626 if (*range_alloc == mbcset->nranges)
2628 /* There are not enough space, need realloc. */
2629 uint32_t *new_array_start;
2630 uint32_t *new_array_end;
2631 int new_nranges;
2633 /* +1 in case of mbcset->nranges is 0. */
2634 new_nranges = 2 * mbcset->nranges + 1;
2635 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2636 are NULL if *range_alloc == 0. */
2637 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2638 new_nranges);
2639 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2640 new_nranges);
2642 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2643 return REG_ESPACE;
2645 mbcset->range_starts = new_array_start;
2646 mbcset->range_ends = new_array_end;
2647 *range_alloc = new_nranges;
2649 # endif /* RE_ENABLE_I18N */
2651 /* Equivalence Classes and Character Classes can't be a range
2652 start/end. */
2653 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2654 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2656 return REG_ERANGE;
2658 start_collseq = lookup_collation_sequence_value (start_elem);
2659 end_collseq = lookup_collation_sequence_value (end_elem);
2660 /* Check start/end collation sequence values. */
2661 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2662 return REG_ECOLLATE;
2663 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2664 return REG_ERANGE;
2666 # ifdef RE_ENABLE_I18N
2667 /* Got valid collation sequence values, add them as a new entry. */
2668 mbcset->range_starts[mbcset->nranges] = start_collseq;
2669 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2670 # endif /* RE_ENABLE_I18N */
2672 /* Build the table for single byte characters. */
2673 for (ch = 0; ch <= SBC_MAX; ch++)
2675 uint32_t ch_collseq;
2677 if (MB_CUR_MAX == 1)
2679 if (nrules == 0)
2680 ch_collseq = collseqmb[ch];
2681 else
2682 ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch));
2683 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2684 bitset_set (sbcset, ch);
2686 return REG_NOERROR;
2689 /* Local function for parse_bracket_exp used in _LIBC environement.
2690 Build the collating element which is represented by NAME.
2691 The result are written to MBCSET and SBCSET.
2692 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2693 pointer argument sinse we may update it. */
2695 static inline reg_errcode_t
2696 # ifdef RE_ENABLE_I18N
2697 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2698 re_charset_t *mbcset;
2699 int *coll_sym_alloc;
2700 # else /* not RE_ENABLE_I18N */
2701 build_collating_symbol (sbcset, name)
2702 # endif /* not RE_ENABLE_I18N */
2703 re_bitset_ptr_t sbcset;
2704 const unsigned char *name;
2706 int32_t elem, idx;
2707 size_t name_len = strlen ((const char *) name);
2708 if (nrules != 0)
2710 elem = seek_collating_symbol_entry (name, name_len);
2711 if (symb_table[2 * elem] != 0)
2713 /* We found the entry. */
2714 idx = symb_table[2 * elem + 1];
2715 /* Skip the name of collating element name. */
2716 idx += 1 + extra[idx];
2718 else if (symb_table[2 * elem] == 0 && name_len == 1)
2720 /* No valid character, treat it as a normal
2721 character. */
2722 bitset_set (sbcset, name[0]);
2723 return REG_NOERROR;
2725 else
2726 return REG_ECOLLATE;
2728 # ifdef RE_ENABLE_I18N
2729 /* Got valid collation sequence, add it as a new entry. */
2730 /* Check the space of the arrays. */
2731 if (*coll_sym_alloc == mbcset->ncoll_syms)
2733 /* Not enough, realloc it. */
2734 /* +1 in case of mbcset->ncoll_syms is 0. */
2735 *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2736 /* Use realloc since mbcset->coll_syms is NULL
2737 if *alloc == 0. */
2738 mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2739 *coll_sym_alloc);
2740 if (BE (mbcset->coll_syms == NULL, 0))
2741 return REG_ESPACE;
2743 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2744 # endif /* RE_ENABLE_I18N */
2745 return REG_NOERROR;
2747 else
2749 if (BE (name_len != 1, 0))
2750 return REG_ECOLLATE;
2751 else
2753 bitset_set (sbcset, name[0]);
2754 return REG_NOERROR;
2758 #endif
2760 re_token_t br_token;
2761 re_bitset_ptr_t sbcset;
2762 #ifdef RE_ENABLE_I18N
2763 re_charset_t *mbcset;
2764 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2765 int equiv_class_alloc = 0, char_class_alloc = 0;
2766 #else /* not RE_ENABLE_I18N */
2767 int non_match = 0;
2768 #endif /* not RE_ENABLE_I18N */
2769 bin_tree_t *work_tree;
2770 int token_len, new_idx;
2771 #ifdef _LIBC
2772 collseqmb = (const unsigned char *)
2773 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2774 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2775 if (nrules)
2778 if (MB_CUR_MAX > 1)
2780 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2781 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2782 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2783 _NL_COLLATE_SYMB_TABLEMB);
2784 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2785 _NL_COLLATE_SYMB_EXTRAMB);
2787 #endif
2788 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2789 #ifdef RE_ENABLE_I18N
2790 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2791 #endif /* RE_ENABLE_I18N */
2792 #ifdef RE_ENABLE_I18N
2793 if (BE (sbcset == NULL || mbcset == NULL, 0))
2794 #else
2795 if (BE (sbcset == NULL, 0))
2796 #endif /* RE_ENABLE_I18N */
2798 *err = REG_ESPACE;
2799 return NULL;
2802 token_len = peek_token_bracket (token, regexp, syntax);
2803 if (BE (token->type == END_OF_RE, 0))
2805 *err = REG_BADPAT;
2806 goto parse_bracket_exp_free_return;
2808 if (token->type == OP_NON_MATCH_LIST)
2810 #ifdef RE_ENABLE_I18N
2811 int i;
2812 mbcset->non_match = 1;
2813 #else /* not RE_ENABLE_I18N */
2814 non_match = 1;
2815 #endif /* not RE_ENABLE_I18N */
2816 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2817 bitset_set (sbcset, '\0');
2818 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2819 token_len = peek_token_bracket (token, regexp, syntax);
2820 if (BE (token->type == END_OF_RE, 0))
2822 *err = REG_BADPAT;
2823 goto parse_bracket_exp_free_return;
2825 #ifdef RE_ENABLE_I18N
2826 if (MB_CUR_MAX > 1)
2827 for (i = 0; i < SBC_MAX; ++i)
2828 if (__btowc (i) == WEOF)
2829 bitset_set (sbcset, i);
2830 #endif /* RE_ENABLE_I18N */
2833 /* We treat the first ']' as a normal character. */
2834 if (token->type == OP_CLOSE_BRACKET)
2835 token->type = CHARACTER;
2837 while (1)
2839 bracket_elem_t start_elem, end_elem;
2840 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2841 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
2842 reg_errcode_t ret;
2843 int token_len2 = 0, is_range_exp = 0;
2844 re_token_t token2;
2846 start_elem.opr.name = start_name_buf;
2847 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
2848 syntax);
2849 if (BE (ret != REG_NOERROR, 0))
2851 *err = ret;
2852 goto parse_bracket_exp_free_return;
2855 token_len = peek_token_bracket (token, regexp, syntax);
2856 if (BE (token->type == END_OF_RE, 0))
2858 *err = REG_BADPAT;
2859 goto parse_bracket_exp_free_return;
2861 if (token->type == OP_CHARSET_RANGE)
2863 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
2864 token_len2 = peek_token_bracket (&token2, regexp, syntax);
2865 if (BE (token->type == END_OF_RE, 0))
2867 *err = REG_BADPAT;
2868 goto parse_bracket_exp_free_return;
2870 if (token2.type == OP_CLOSE_BRACKET)
2872 /* We treat the last '-' as a normal character. */
2873 re_string_skip_bytes (regexp, -token_len);
2874 token->type = CHARACTER;
2876 else
2877 is_range_exp = 1;
2880 if (is_range_exp == 1)
2882 end_elem.opr.name = end_name_buf;
2883 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
2884 dfa, syntax);
2885 if (BE (ret != REG_NOERROR, 0))
2887 *err = ret;
2888 goto parse_bracket_exp_free_return;
2891 token_len = peek_token_bracket (token, regexp, syntax);
2892 if (BE (token->type == END_OF_RE, 0))
2894 *err = REG_BADPAT;
2895 goto parse_bracket_exp_free_return;
2897 *err = build_range_exp (sbcset,
2898 #ifdef RE_ENABLE_I18N
2899 mbcset, &range_alloc,
2900 #endif /* RE_ENABLE_I18N */
2901 &start_elem, &end_elem);
2902 if (BE (*err != REG_NOERROR, 0))
2903 goto parse_bracket_exp_free_return;
2905 else
2907 switch (start_elem.type)
2909 case SB_CHAR:
2910 bitset_set (sbcset, start_elem.opr.ch);
2911 break;
2912 #ifdef RE_ENABLE_I18N
2913 case MB_CHAR:
2914 /* Check whether the array has enough space. */
2915 if (mbchar_alloc == mbcset->nmbchars)
2917 /* Not enough, realloc it. */
2918 /* +1 in case of mbcset->nmbchars is 0. */
2919 mbchar_alloc = 2 * mbcset->nmbchars + 1;
2920 /* Use realloc since array is NULL if *alloc == 0. */
2921 mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
2922 mbchar_alloc);
2923 if (BE (mbcset->mbchars == NULL, 0))
2924 goto parse_bracket_exp_espace;
2926 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
2927 break;
2928 #endif /* RE_ENABLE_I18N */
2929 case EQUIV_CLASS:
2930 *err = build_equiv_class (sbcset,
2931 #ifdef RE_ENABLE_I18N
2932 mbcset, &equiv_class_alloc,
2933 #endif /* RE_ENABLE_I18N */
2934 start_elem.opr.name);
2935 if (BE (*err != REG_NOERROR, 0))
2936 goto parse_bracket_exp_free_return;
2937 break;
2938 case COLL_SYM:
2939 *err = build_collating_symbol (sbcset,
2940 #ifdef RE_ENABLE_I18N
2941 mbcset, &coll_sym_alloc,
2942 #endif /* RE_ENABLE_I18N */
2943 start_elem.opr.name);
2944 if (BE (*err != REG_NOERROR, 0))
2945 goto parse_bracket_exp_free_return;
2946 break;
2947 case CHAR_CLASS:
2948 ret = build_charclass (sbcset,
2949 #ifdef RE_ENABLE_I18N
2950 mbcset, &char_class_alloc,
2951 #endif /* RE_ENABLE_I18N */
2952 start_elem.opr.name, syntax);
2953 if (BE (ret != REG_NOERROR, 0))
2954 goto parse_bracket_exp_espace;
2955 break;
2956 default:
2957 assert (0);
2958 break;
2961 if (token->type == OP_CLOSE_BRACKET)
2962 break;
2965 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2967 /* If it is non-matching list. */
2968 #ifdef RE_ENABLE_I18N
2969 if (mbcset->non_match)
2970 #else /* not RE_ENABLE_I18N */
2971 if (non_match)
2972 #endif /* not RE_ENABLE_I18N */
2973 bitset_not (sbcset);
2975 /* Build a tree for simple bracket. */
2976 br_token.type = SIMPLE_BRACKET;
2977 br_token.opr.sbcset = sbcset;
2978 new_idx = re_dfa_add_node (dfa, br_token, 0);
2979 work_tree = create_tree (NULL, NULL, 0, new_idx);
2980 if (BE (new_idx == -1 || work_tree == NULL, 0))
2981 goto parse_bracket_exp_espace;
2983 #ifdef RE_ENABLE_I18N
2984 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
2985 || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
2986 || mbcset->non_match)))
2988 re_token_t alt_token;
2989 bin_tree_t *mbc_tree;
2990 /* Build a tree for complex bracket. */
2991 br_token.type = COMPLEX_BRACKET;
2992 br_token.opr.mbcset = mbcset;
2993 dfa->has_mb_node = 1;
2994 new_idx = re_dfa_add_node (dfa, br_token, 0);
2995 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
2996 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
2997 goto parse_bracket_exp_espace;
2998 /* Then join them by ALT node. */
2999 dfa->has_plural_match = 1;
3000 alt_token.type = OP_ALT;
3001 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3002 work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
3003 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3004 return work_tree;
3006 else
3008 free_charset (mbcset);
3009 return work_tree;
3011 #else /* not RE_ENABLE_I18N */
3012 return work_tree;
3013 #endif /* not RE_ENABLE_I18N */
3015 parse_bracket_exp_espace:
3016 *err = REG_ESPACE;
3017 parse_bracket_exp_free_return:
3018 re_free (sbcset);
3019 #ifdef RE_ENABLE_I18N
3020 free_charset (mbcset);
3021 #endif /* RE_ENABLE_I18N */
3022 return NULL;
3025 /* Parse an element in the bracket expression. */
3027 static reg_errcode_t
3028 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
3029 bracket_elem_t *elem;
3030 re_string_t *regexp;
3031 re_token_t *token;
3032 int token_len;
3033 re_dfa_t *dfa;
3034 reg_syntax_t syntax;
3036 #ifdef RE_ENABLE_I18N
3037 int cur_char_size;
3038 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3039 if (cur_char_size > 1)
3041 elem->type = MB_CHAR;
3042 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3043 re_string_skip_bytes (regexp, cur_char_size);
3044 return REG_NOERROR;
3046 #endif /* RE_ENABLE_I18N */
3047 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3048 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3049 || token->type == OP_OPEN_EQUIV_CLASS)
3050 return parse_bracket_symbol (elem, regexp, token);
3051 elem->type = SB_CHAR;
3052 elem->opr.ch = token->opr.c;
3053 return REG_NOERROR;
3056 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3057 such as [:<character_class>:], [.<collating_element>.], and
3058 [=<equivalent_class>=]. */
3060 static reg_errcode_t
3061 parse_bracket_symbol (elem, regexp, token)
3062 bracket_elem_t *elem;
3063 re_string_t *regexp;
3064 re_token_t *token;
3066 unsigned char ch, delim = token->opr.c;
3067 int i = 0;
3068 for (;; ++i)
3070 if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
3071 return REG_EBRACK;
3072 if (token->type == OP_OPEN_CHAR_CLASS)
3073 ch = re_string_fetch_byte_case (regexp);
3074 else
3075 ch = re_string_fetch_byte (regexp);
3076 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3077 break;
3078 elem->opr.name[i] = ch;
3080 re_string_skip_bytes (regexp, 1);
3081 elem->opr.name[i] = '\0';
3082 switch (token->type)
3084 case OP_OPEN_COLL_ELEM:
3085 elem->type = COLL_SYM;
3086 break;
3087 case OP_OPEN_EQUIV_CLASS:
3088 elem->type = EQUIV_CLASS;
3089 break;
3090 case OP_OPEN_CHAR_CLASS:
3091 elem->type = CHAR_CLASS;
3092 break;
3093 default:
3094 break;
3096 return REG_NOERROR;
3099 /* Helper function for parse_bracket_exp.
3100 Build the equivalence class which is represented by NAME.
3101 The result are written to MBCSET and SBCSET.
3102 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3103 is a pointer argument sinse we may update it. */
3105 static reg_errcode_t
3106 #ifdef RE_ENABLE_I18N
3107 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3108 re_charset_t *mbcset;
3109 int *equiv_class_alloc;
3110 #else /* not RE_ENABLE_I18N */
3111 build_equiv_class (sbcset, name)
3112 #endif /* not RE_ENABLE_I18N */
3113 re_bitset_ptr_t sbcset;
3114 const unsigned char *name;
3116 #if defined _LIBC && defined RE_ENABLE_I18N
3117 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3118 if (nrules != 0)
3120 const int32_t *table, *indirect;
3121 const unsigned char *weights, *extra, *cp;
3122 unsigned char char_buf[2];
3123 int32_t idx1, idx2;
3124 unsigned int ch;
3125 size_t len;
3126 /* This #include defines a local function! */
3127 # include <locale/weight.h>
3128 /* Calculate the index for equivalence class. */
3129 cp = name;
3130 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3131 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3132 _NL_COLLATE_WEIGHTMB);
3133 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3134 _NL_COLLATE_EXTRAMB);
3135 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3136 _NL_COLLATE_INDIRECTMB);
3137 idx1 = findidx (&cp);
3138 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3139 /* This isn't a valid character. */
3140 return REG_ECOLLATE;
3142 /* Build single byte matcing table for this equivalence class. */
3143 char_buf[1] = (unsigned char) '\0';
3144 len = weights[idx1];
3145 for (ch = 0; ch < SBC_MAX; ++ch)
3147 char_buf[0] = ch;
3148 cp = char_buf;
3149 idx2 = findidx (&cp);
3151 idx2 = table[ch];
3153 if (idx2 == 0)
3154 /* This isn't a valid character. */
3155 continue;
3156 if (len == weights[idx2])
3158 int cnt = 0;
3159 while (cnt <= len &&
3160 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3161 ++cnt;
3163 if (cnt > len)
3164 bitset_set (sbcset, ch);
3167 /* Check whether the array has enough space. */
3168 if (*equiv_class_alloc == mbcset->nequiv_classes)
3170 /* Not enough, realloc it. */
3171 /* +1 in case of mbcset->nequiv_classes is 0. */
3172 *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3173 /* Use realloc since the array is NULL if *alloc == 0. */
3174 mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3175 *equiv_class_alloc);
3176 if (BE (mbcset->equiv_classes == NULL, 0))
3177 return REG_ESPACE;
3179 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3181 else
3182 #endif /* _LIBC && RE_ENABLE_I18N */
3184 if (BE (strlen ((const char *) name) != 1, 0))
3185 return REG_ECOLLATE;
3186 bitset_set (sbcset, *name);
3188 return REG_NOERROR;
3191 /* Helper function for parse_bracket_exp.
3192 Build the character class which is represented by NAME.
3193 The result are written to MBCSET and SBCSET.
3194 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3195 is a pointer argument sinse we may update it. */
3197 static reg_errcode_t
3198 #ifdef RE_ENABLE_I18N
3199 build_charclass (sbcset, mbcset, char_class_alloc, class_name, syntax)
3200 re_charset_t *mbcset;
3201 int *char_class_alloc;
3202 #else /* not RE_ENABLE_I18N */
3203 build_charclass (sbcset, class_name, syntax)
3204 #endif /* not RE_ENABLE_I18N */
3205 re_bitset_ptr_t sbcset;
3206 const unsigned char *class_name;
3207 reg_syntax_t syntax;
3209 int i;
3210 const char *name = (const char *) class_name;
3212 /* In case of REG_ICASE "upper" and "lower" match the both of
3213 upper and lower cases. */
3214 if ((syntax & RE_ICASE)
3215 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3216 name = "alpha";
3218 #ifdef RE_ENABLE_I18N
3219 /* Check the space of the arrays. */
3220 if (*char_class_alloc == mbcset->nchar_classes)
3222 /* Not enough, realloc it. */
3223 /* +1 in case of mbcset->nchar_classes is 0. */
3224 *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3225 /* Use realloc since array is NULL if *alloc == 0. */
3226 mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3227 *char_class_alloc);
3228 if (BE (mbcset->char_classes == NULL, 0))
3229 return REG_ESPACE;
3231 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3232 #endif /* RE_ENABLE_I18N */
3234 #define BUILD_CHARCLASS_LOOP(ctype_func)\
3235 for (i = 0; i < SBC_MAX; ++i) \
3237 if (ctype_func (i)) \
3238 bitset_set (sbcset, i); \
3241 if (strcmp (name, "alnum") == 0)
3242 BUILD_CHARCLASS_LOOP (isalnum)
3243 else if (strcmp (name, "cntrl") == 0)
3244 BUILD_CHARCLASS_LOOP (iscntrl)
3245 else if (strcmp (name, "lower") == 0)
3246 BUILD_CHARCLASS_LOOP (islower)
3247 else if (strcmp (name, "space") == 0)
3248 BUILD_CHARCLASS_LOOP (isspace)
3249 else if (strcmp (name, "alpha") == 0)
3250 BUILD_CHARCLASS_LOOP (isalpha)
3251 else if (strcmp (name, "digit") == 0)
3252 BUILD_CHARCLASS_LOOP (isdigit)
3253 else if (strcmp (name, "print") == 0)
3254 BUILD_CHARCLASS_LOOP (isprint)
3255 else if (strcmp (name, "upper") == 0)
3256 BUILD_CHARCLASS_LOOP (isupper)
3257 else if (strcmp (name, "blank") == 0)
3258 BUILD_CHARCLASS_LOOP (isblank)
3259 else if (strcmp (name, "graph") == 0)
3260 BUILD_CHARCLASS_LOOP (isgraph)
3261 else if (strcmp (name, "punct") == 0)
3262 BUILD_CHARCLASS_LOOP (ispunct)
3263 else if (strcmp (name, "xdigit") == 0)
3264 BUILD_CHARCLASS_LOOP (isxdigit)
3265 else
3266 return REG_ECTYPE;
3268 return REG_NOERROR;
3271 static bin_tree_t *
3272 build_word_op (dfa, not, err)
3273 re_dfa_t *dfa;
3274 int not;
3275 reg_errcode_t *err;
3277 re_bitset_ptr_t sbcset;
3278 #ifdef RE_ENABLE_I18N
3279 re_charset_t *mbcset;
3280 int alloc = 0;
3281 #else /* not RE_ENABLE_I18N */
3282 int non_match = 0;
3283 #endif /* not RE_ENABLE_I18N */
3284 reg_errcode_t ret;
3285 re_token_t br_token;
3286 bin_tree_t *tree;
3287 int new_idx;
3289 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3290 #ifdef RE_ENABLE_I18N
3291 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3292 #endif /* RE_ENABLE_I18N */
3294 #ifdef RE_ENABLE_I18N
3295 if (BE (sbcset == NULL || mbcset == NULL, 0))
3296 #else /* not RE_ENABLE_I18N */
3297 if (BE (sbcset == NULL, 0))
3298 #endif /* not RE_ENABLE_I18N */
3300 *err = REG_ESPACE;
3301 return NULL;
3304 if (not)
3306 #ifdef RE_ENABLE_I18N
3307 int i;
3309 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3310 bitset_set(cset->sbcset, '\0');
3312 mbcset->non_match = 1;
3313 if (MB_CUR_MAX > 1)
3314 for (i = 0; i < SBC_MAX; ++i)
3315 if (__btowc (i) == WEOF)
3316 bitset_set (sbcset, i);
3317 #else /* not RE_ENABLE_I18N */
3318 non_match = 1;
3319 #endif /* not RE_ENABLE_I18N */
3322 /* We don't care the syntax in this case. */
3323 ret = build_charclass (sbcset,
3324 #ifdef RE_ENABLE_I18N
3325 mbcset, &alloc,
3326 #endif /* RE_ENABLE_I18N */
3327 (const unsigned char *) "alpha", 0);
3329 if (BE (ret != REG_NOERROR, 0))
3331 re_free (sbcset);
3332 #ifdef RE_ENABLE_I18N
3333 free_charset (mbcset);
3334 #endif /* RE_ENABLE_I18N */
3335 *err = REG_ESPACE;
3336 return NULL;
3338 /* \w match '_' also. */
3339 bitset_set (sbcset, '_');
3341 /* If it is non-matching list. */
3342 #ifdef RE_ENABLE_I18N
3343 if (mbcset->non_match)
3344 #else /* not RE_ENABLE_I18N */
3345 if (non_match)
3346 #endif /* not RE_ENABLE_I18N */
3347 bitset_not (sbcset);
3349 /* Build a tree for simple bracket. */
3350 br_token.type = SIMPLE_BRACKET;
3351 br_token.opr.sbcset = sbcset;
3352 new_idx = re_dfa_add_node (dfa, br_token, 0);
3353 tree = create_tree (NULL, NULL, 0, new_idx);
3354 if (BE (new_idx == -1 || tree == NULL, 0))
3355 goto build_word_op_espace;
3357 #ifdef RE_ENABLE_I18N
3358 if (MB_CUR_MAX > 1)
3360 re_token_t alt_token;
3361 bin_tree_t *mbc_tree;
3362 /* Build a tree for complex bracket. */
3363 br_token.type = COMPLEX_BRACKET;
3364 br_token.opr.mbcset = mbcset;
3365 dfa->has_mb_node = 1;
3366 new_idx = re_dfa_add_node (dfa, br_token, 0);
3367 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3368 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3369 goto build_word_op_espace;
3370 /* Then join them by ALT node. */
3371 alt_token.type = OP_ALT;
3372 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3373 tree = create_tree (tree, mbc_tree, 0, new_idx);
3374 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3375 return tree;
3377 else
3379 free_charset (mbcset);
3380 return tree;
3382 #else /* not RE_ENABLE_I18N */
3383 return tree;
3384 #endif /* not RE_ENABLE_I18N */
3386 build_word_op_espace:
3387 re_free (sbcset);
3388 #ifdef RE_ENABLE_I18N
3389 free_charset (mbcset);
3390 #endif /* RE_ENABLE_I18N */
3391 *err = REG_ESPACE;
3392 return NULL;
3395 /* This is intended for the expressions like "a{1,3}".
3396 Fetch a number from `input', and return the number.
3397 Return -1, if the number field is empty like "{,1}".
3398 Return -2, If an error is occured. */
3400 static int
3401 fetch_number (input, token, syntax)
3402 re_string_t *input;
3403 re_token_t *token;
3404 reg_syntax_t syntax;
3406 int num = -1;
3407 unsigned char c;
3408 while (1)
3410 *token = fetch_token (input, syntax);
3411 c = token->opr.c;
3412 if (BE (token->type == END_OF_RE, 0))
3413 return -2;
3414 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3415 break;
3416 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3417 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3418 num = (num > RE_DUP_MAX) ? -2 : num;
3420 return num;
3423 #ifdef RE_ENABLE_I18N
3424 static void
3425 free_charset (re_charset_t *cset)
3427 re_free (cset->mbchars);
3428 # ifdef _LIBC
3429 re_free (cset->coll_syms);
3430 re_free (cset->equiv_classes);
3431 re_free (cset->range_starts);
3432 re_free (cset->range_ends);
3433 # endif
3434 re_free (cset->char_classes);
3435 re_free (cset);
3437 #endif /* RE_ENABLE_I18N */
3439 /* Functions for binary tree operation. */
3441 /* Create a node of tree.
3442 Note: This function automatically free left and right if malloc fails. */
3444 static bin_tree_t *
3445 create_tree (left, right, type, index)
3446 bin_tree_t *left;
3447 bin_tree_t *right;
3448 re_token_type_t type;
3449 int index;
3451 bin_tree_t *tree;
3452 tree = re_malloc (bin_tree_t, 1);
3453 if (BE (tree == NULL, 0))
3455 free_bin_tree (left);
3456 free_bin_tree (right);
3457 return NULL;
3459 tree->parent = NULL;
3460 tree->left = left;
3461 tree->right = right;
3462 tree->type = type;
3463 tree->node_idx = index;
3464 tree->first = -1;
3465 tree->next = -1;
3466 re_node_set_init_empty (&tree->eclosure);
3468 if (left != NULL)
3469 left->parent = tree;
3470 if (right != NULL)
3471 right->parent = tree;
3472 return tree;
3475 /* Free the sub tree pointed by TREE. */
3477 static void
3478 free_bin_tree (tree)
3479 bin_tree_t *tree;
3481 if (tree == NULL)
3482 return;
3483 /*re_node_set_free (&tree->eclosure);*/
3484 free_bin_tree (tree->left);
3485 free_bin_tree (tree->right);
3486 re_free (tree);
3489 /* Duplicate the node SRC, and return new node. */
3491 static bin_tree_t *
3492 duplicate_tree (src, dfa)
3493 const bin_tree_t *src;
3494 re_dfa_t *dfa;
3496 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3497 int new_node_idx;
3498 /* Since node indies must be according to Post-order of the tree,
3499 we must duplicate the left at first. */
3500 if (src->left != NULL)
3502 left = duplicate_tree (src->left, dfa);
3503 if (left == NULL)
3504 return NULL;
3507 /* Secondaly, duplicate the right. */
3508 if (src->right != NULL)
3510 right = duplicate_tree (src->right, dfa);
3511 if (right == NULL)
3513 free_bin_tree (left);
3514 return NULL;
3518 /* At last, duplicate itself. */
3519 if (src->type == NON_TYPE)
3521 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3522 dfa->nodes[new_node_idx].duplicated = 1;
3523 if (BE (new_node_idx == -1, 0))
3525 free_bin_tree (left);
3526 free_bin_tree (right);
3527 return NULL;
3530 else
3531 new_node_idx = src->type;
3533 new_tree = create_tree (left, right, src->type, new_node_idx);
3534 if (BE (new_tree == NULL, 0))
3536 free_bin_tree (left);
3537 free_bin_tree (right);
3539 return new_tree;