Update.
[glibc.git] / posix / regcomp.c
blob7917c647274ad7a3a56245a1a7cc24bb5d798569
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 #include <assert.h>
22 #include <ctype.h>
23 #include <limits.h>
24 #include <locale.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
29 #if defined HAVE_WCHAR_H || defined _LIBC
30 # include <wchar.h>
31 #endif /* HAVE_WCHAR_H || _LIBC */
32 #if defined HAVE_WCTYPE_H || defined _LIBC
33 # include <wctype.h>
34 #endif /* HAVE_WCTYPE_H || _LIBC */
36 /* In case that the system doesn't have isblank(). */
37 #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
38 # define isblank(ch) ((ch) == ' ' || (ch) == '\t')
39 #endif
41 #ifdef _LIBC
42 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
43 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
44 # include <locale/localeinfo.h>
45 # include <locale/elem-hash.h>
46 # include <locale/coll-lookup.h>
47 # endif
48 #endif
50 /* This is for other GNU distributions with internationalized messages. */
51 #if HAVE_LIBINTL_H || defined _LIBC
52 # include <libintl.h>
53 # ifdef _LIBC
54 # undef gettext
55 # define gettext(msgid) \
56 INTUSE(__dcgettext) (INTUSE(_libc_intl_domainname), msgid, LC_MESSAGES)
57 # endif
58 #else
59 # define gettext(msgid) (msgid)
60 #endif
62 #ifndef gettext_noop
63 /* This define is so xgettext can find the internationalizable
64 strings. */
65 # define gettext_noop(String) String
66 #endif
68 #include <regex.h>
69 #include "regex_internal.h"
71 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
72 int length, reg_syntax_t syntax);
73 static void re_compile_fastmap_iter (regex_t *bufp,
74 const re_dfastate_t *init_state,
75 char *fastmap);
76 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
77 static reg_errcode_t init_word_char (re_dfa_t *dfa);
78 #ifdef RE_ENABLE_I18N
79 static void free_charset (re_charset_t *cset);
80 #endif /* RE_ENABLE_I18N */
81 static void free_workarea_compile (regex_t *preg);
82 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
83 static reg_errcode_t analyze (re_dfa_t *dfa);
84 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
85 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
86 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
87 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
88 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
89 unsigned int constraint);
90 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
91 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
92 int node, int root);
93 static void calc_inveclosure (re_dfa_t *dfa);
94 static int fetch_number (re_string_t *input, re_token_t *token,
95 reg_syntax_t syntax);
96 static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
97 static int peek_token (re_token_t *token, re_string_t *input,
98 reg_syntax_t syntax);
99 static int peek_token_bracket (re_token_t *token, re_string_t *input,
100 reg_syntax_t syntax);
101 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
102 reg_syntax_t syntax, reg_errcode_t *err);
103 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
104 re_token_t *token, reg_syntax_t syntax,
105 int nest, reg_errcode_t *err);
106 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
107 re_token_t *token, reg_syntax_t syntax,
108 int nest, reg_errcode_t *err);
109 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
110 re_token_t *token, reg_syntax_t syntax,
111 int nest, reg_errcode_t *err);
112 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
113 re_token_t *token, reg_syntax_t syntax,
114 int nest, reg_errcode_t *err);
115 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
116 re_dfa_t *dfa, re_token_t *token,
117 reg_syntax_t syntax, reg_errcode_t *err);
118 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
119 re_token_t *token, reg_syntax_t syntax,
120 reg_errcode_t *err);
121 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
122 re_string_t *regexp,
123 re_token_t *token, int token_len,
124 re_dfa_t *dfa,
125 reg_syntax_t syntax);
126 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
127 re_string_t *regexp,
128 re_token_t *token);
129 #ifndef _LIBC
130 # ifdef RE_ENABLE_I18N
131 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
132 re_charset_t *mbcset, int *range_alloc,
133 bracket_elem_t *start_elem,
134 bracket_elem_t *end_elem);
135 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
136 re_charset_t *mbcset,
137 int *coll_sym_alloc,
138 const unsigned char *name);
139 # else /* not RE_ENABLE_I18N */
140 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
141 bracket_elem_t *start_elem,
142 bracket_elem_t *end_elem);
143 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
144 const unsigned char *name);
145 # endif /* not RE_ENABLE_I18N */
146 #endif /* not _LIBC */
147 #ifdef RE_ENABLE_I18N
148 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
149 re_charset_t *mbcset,
150 int *equiv_class_alloc,
151 const unsigned char *name);
152 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
153 re_charset_t *mbcset,
154 int *char_class_alloc,
155 const unsigned char *class_name,
156 reg_syntax_t syntax);
157 #else /* not RE_ENABLE_I18N */
158 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
159 const unsigned char *name);
160 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
161 const unsigned char *class_name,
162 reg_syntax_t syntax);
163 #endif /* not RE_ENABLE_I18N */
164 static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err);
165 static void free_bin_tree (bin_tree_t *tree);
166 static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
167 re_token_type_t type, int index);
168 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
170 /* This table gives an error message for each of the error codes listed
171 in regex.h. Obviously the order here has to be same as there.
172 POSIX doesn't require that we do anything for REG_NOERROR,
173 but why not be nice? */
175 const char __re_error_msgid[] attribute_hidden =
177 #define REG_NOERROR_IDX 0
178 gettext_noop ("Success") /* REG_NOERROR */
179 "\0"
180 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
181 gettext_noop ("No match") /* REG_NOMATCH */
182 "\0"
183 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
184 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
185 "\0"
186 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
187 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
188 "\0"
189 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
190 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
191 "\0"
192 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
193 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
194 "\0"
195 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
196 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
197 "\0"
198 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
199 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
200 "\0"
201 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
202 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
203 "\0"
204 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
205 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
206 "\0"
207 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
208 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
209 "\0"
210 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
211 gettext_noop ("Invalid range end") /* REG_ERANGE */
212 "\0"
213 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
214 gettext_noop ("Memory exhausted") /* REG_ESPACE */
215 "\0"
216 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
217 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
218 "\0"
219 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
220 gettext_noop ("Premature end of regular expression") /* REG_EEND */
221 "\0"
222 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
223 gettext_noop ("Regular expression too big") /* REG_ESIZE */
224 "\0"
225 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
226 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
229 const size_t __re_error_msgid_idx[] attribute_hidden =
231 REG_NOERROR_IDX,
232 REG_NOMATCH_IDX,
233 REG_BADPAT_IDX,
234 REG_ECOLLATE_IDX,
235 REG_ECTYPE_IDX,
236 REG_EESCAPE_IDX,
237 REG_ESUBREG_IDX,
238 REG_EBRACK_IDX,
239 REG_EPAREN_IDX,
240 REG_EBRACE_IDX,
241 REG_BADBR_IDX,
242 REG_ERANGE_IDX,
243 REG_ESPACE_IDX,
244 REG_BADRPT_IDX,
245 REG_EEND_IDX,
246 REG_ESIZE_IDX,
247 REG_ERPAREN_IDX
250 /* Entry points for GNU code. */
252 /* re_compile_pattern is the GNU regular expression compiler: it
253 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
254 Returns 0 if the pattern was valid, otherwise an error string.
256 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
257 are set in BUFP on entry. */
259 const char *
260 re_compile_pattern (pattern, length, bufp)
261 const char *pattern;
262 size_t length;
263 struct re_pattern_buffer *bufp;
265 reg_errcode_t ret;
267 /* GNU code is written to assume at least RE_NREGS registers will be set
268 (and at least one extra will be -1). */
269 bufp->regs_allocated = REGS_UNALLOCATED;
271 /* And GNU code determines whether or not to get register information
272 by passing null for the REGS argument to re_match, etc., not by
273 setting no_sub. */
274 bufp->no_sub = 0;
276 /* Match anchors at newline. */
277 bufp->newline_anchor = 1;
279 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
281 if (!ret)
282 return NULL;
283 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
285 #ifdef _LIBC
286 weak_alias (__re_compile_pattern, re_compile_pattern)
287 #endif
289 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
290 also be assigned to arbitrarily: each pattern buffer stores its own
291 syntax, so it can be changed between regex compilations. */
292 /* This has no initializer because initialized variables in Emacs
293 become read-only after dumping. */
294 reg_syntax_t re_syntax_options;
297 /* Specify the precise syntax of regexps for compilation. This provides
298 for compatibility for various utilities which historically have
299 different, incompatible syntaxes.
301 The argument SYNTAX is a bit mask comprised of the various bits
302 defined in regex.h. We return the old syntax. */
304 reg_syntax_t
305 re_set_syntax (syntax)
306 reg_syntax_t syntax;
308 reg_syntax_t ret = re_syntax_options;
310 re_syntax_options = syntax;
311 return ret;
313 #ifdef _LIBC
314 weak_alias (__re_set_syntax, re_set_syntax)
315 #endif
318 re_compile_fastmap (bufp)
319 struct re_pattern_buffer *bufp;
321 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
322 char *fastmap = bufp->fastmap;
324 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
325 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
326 if (dfa->init_state != dfa->init_state_word)
327 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
328 if (dfa->init_state != dfa->init_state_nl)
329 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
330 if (dfa->init_state != dfa->init_state_begbuf)
331 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
332 bufp->fastmap_accurate = 1;
333 return 0;
335 #ifdef _LIBC
336 weak_alias (__re_compile_fastmap, re_compile_fastmap)
337 #endif
339 /* Helper function for re_compile_fastmap.
340 Compile fastmap for the initial_state INIT_STATE. */
342 static void
343 re_compile_fastmap_iter (bufp, init_state, fastmap)
344 regex_t *bufp;
345 const re_dfastate_t *init_state;
346 char *fastmap;
348 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
349 int node_cnt;
350 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
352 int node = init_state->nodes.elems[node_cnt];
353 re_token_type_t type = dfa->nodes[node].type;
354 if (type == OP_CONTEXT_NODE)
356 node = dfa->nodes[node].opr.ctx_info->entity;
357 type = dfa->nodes[node].type;
360 if (type == CHARACTER)
361 fastmap[dfa->nodes[node].opr.c] = 1;
362 else if (type == SIMPLE_BRACKET)
364 int i, j, ch;
365 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
366 for (j = 0; j < UINT_BITS; ++j, ++ch)
367 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
368 fastmap[ch] = 1;
370 #ifdef RE_ENABLE_I18N
371 else if (type == COMPLEX_BRACKET)
373 int i;
374 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
375 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
376 || cset->nranges || cset->nchar_classes)
378 # ifdef _LIBC
379 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
381 /* In this case we want to catch the bytes which are
382 the first byte of any collation elements.
383 e.g. In da_DK, we want to catch 'a' since "aa"
384 is a valid collation element, and don't catch
385 'b' since 'b' is the only collation element
386 which starts from 'b'. */
387 int j, ch;
388 const int32_t *table = (const int32_t *)
389 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
390 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
391 for (j = 0; j < UINT_BITS; ++j, ++ch)
392 if (table[ch] < 0)
393 fastmap[ch] = 1;
395 # else
396 if (MB_CUR_MAX > 1)
397 for (i = 0; i < SBC_MAX; ++i)
398 if (__btowc (i) == WEOF)
399 fastmap[i] = 1;
400 # endif /* not _LIBC */
402 for (i = 0; i < cset->nmbchars; ++i)
404 char buf[256];
405 wctomb (buf, cset->mbchars[i]);
406 fastmap[*(unsigned char *) buf] = 1;
409 #endif /* RE_ENABLE_I18N */
410 else if (type == END_OF_RE || type == OP_PERIOD
411 #ifdef RE_ENABLE_I18N
412 || type == COMPLEX_BRACKET
413 #endif /* RE_ENABLE_I18N */
416 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
417 if (type == END_OF_RE)
418 bufp->can_be_null = 1;
419 return;
424 /* Entry point for POSIX code. */
425 /* regcomp takes a regular expression as a string and compiles it.
427 PREG is a regex_t *. We do not expect any fields to be initialized,
428 since POSIX says we shouldn't. Thus, we set
430 `buffer' to the compiled pattern;
431 `used' to the length of the compiled pattern;
432 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
433 REG_EXTENDED bit in CFLAGS is set; otherwise, to
434 RE_SYNTAX_POSIX_BASIC;
435 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
436 `fastmap' to an allocated space for the fastmap;
437 `fastmap_accurate' to zero;
438 `re_nsub' to the number of subexpressions in PATTERN.
440 PATTERN is the address of the pattern string.
442 CFLAGS is a series of bits which affect compilation.
444 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
445 use POSIX basic syntax.
447 If REG_NEWLINE is set, then . and [^...] don't match newline.
448 Also, regexec will try a match beginning after every newline.
450 If REG_ICASE is set, then we considers upper- and lowercase
451 versions of letters to be equivalent when matching.
453 If REG_NOSUB is set, then when PREG is passed to regexec, that
454 routine will report only success or failure, and nothing about the
455 registers.
457 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
458 the return codes and their meanings.) */
461 regcomp (preg, pattern, cflags)
462 regex_t *__restrict preg;
463 const char *__restrict pattern;
464 int cflags;
466 reg_errcode_t ret;
467 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
468 : RE_SYNTAX_POSIX_BASIC);
470 preg->buffer = NULL;
471 preg->allocated = 0;
472 preg->used = 0;
474 /* Try to allocate space for the fastmap. */
475 preg->fastmap = re_malloc (char, SBC_MAX);
476 if (BE (preg->fastmap == NULL, 0))
477 return REG_ESPACE;
479 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
481 /* If REG_NEWLINE is set, newlines are treated differently. */
482 if (cflags & REG_NEWLINE)
483 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
484 syntax &= ~RE_DOT_NEWLINE;
485 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
486 /* It also changes the matching behavior. */
487 preg->newline_anchor = 1;
489 else
490 preg->newline_anchor = 0;
491 preg->no_sub = !!(cflags & REG_NOSUB);
492 preg->translate = NULL;
494 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
496 /* POSIX doesn't distinguish between an unmatched open-group and an
497 unmatched close-group: both are REG_EPAREN. */
498 if (ret == REG_ERPAREN)
499 ret = REG_EPAREN;
501 /* We have already checked preg->fastmap != NULL. */
502 if (BE (ret == REG_NOERROR, 1))
504 /* Compute the fastmap now, since regexec cannot modify the pattern
505 buffer. */
506 if (BE (re_compile_fastmap (preg) == -2, 0))
508 /* Some error occurred while computing the fastmap, just forget
509 about it. */
510 re_free (preg->fastmap);
511 preg->fastmap = NULL;
515 return (int) ret;
517 #ifdef _LIBC
518 weak_alias (__regcomp, regcomp)
519 #endif
521 /* Returns a message corresponding to an error code, ERRCODE, returned
522 from either regcomp or regexec. We don't use PREG here. */
524 size_t
525 regerror (errcode, preg, errbuf, errbuf_size)
526 int errcode;
527 const regex_t *preg;
528 char *errbuf;
529 size_t errbuf_size;
531 const char *msg;
532 size_t msg_size;
534 if (BE (errcode < 0
535 || errcode >= (int) (sizeof (__re_error_msgid_idx)
536 / sizeof (__re_error_msgid_idx[0])), 0))
537 /* Only error codes returned by the rest of the code should be passed
538 to this routine. If we are given anything else, or if other regex
539 code generates an invalid error code, then the program has a bug.
540 Dump core so we can fix it. */
541 abort ();
543 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
545 msg_size = strlen (msg) + 1; /* Includes the null. */
547 if (BE (errbuf_size != 0, 1))
549 if (BE (msg_size > errbuf_size, 0))
551 #if defined HAVE_MEMPCPY || defined _LIBC
552 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
553 #else
554 memcpy (errbuf, msg, errbuf_size - 1);
555 errbuf[errbuf_size - 1] = 0;
556 #endif
558 else
559 memcpy (errbuf, msg, msg_size);
562 return msg_size;
564 #ifdef _LIBC
565 weak_alias (__regerror, regerror)
566 #endif
568 /* Free dynamically allocated space used by PREG. */
570 void
571 regfree (preg)
572 regex_t *preg;
574 int i, j;
575 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
576 if (BE (dfa != NULL, 1))
578 re_free (dfa->subexps);
580 for (i = 0; i < dfa->nodes_len; ++i)
582 re_token_t *node = dfa->nodes + i;
583 #ifdef RE_ENABLE_I18N
584 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
585 free_charset (node->opr.mbcset);
586 else
587 #endif /* RE_ENABLE_I18N */
588 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
589 re_free (node->opr.sbcset);
590 else if (node->type == OP_CONTEXT_NODE)
592 if (dfa->nodes[node->opr.ctx_info->entity].type == OP_BACK_REF)
594 if (node->opr.ctx_info->bkref_eclosure != NULL)
595 re_node_set_free (node->opr.ctx_info->bkref_eclosure);
596 re_free (node->opr.ctx_info->bkref_eclosure);
598 re_free (node->opr.ctx_info);
601 re_free (dfa->firsts);
602 re_free (dfa->nexts);
603 for (i = 0; i < dfa->nodes_len; ++i)
605 if (dfa->eclosures != NULL)
606 re_node_set_free (dfa->eclosures + i);
607 if (dfa->inveclosures != NULL)
608 re_node_set_free (dfa->inveclosures + i);
609 if (dfa->edests != NULL)
610 re_node_set_free (dfa->edests + i);
612 re_free (dfa->edests);
613 re_free (dfa->eclosures);
614 re_free (dfa->inveclosures);
615 re_free (dfa->nodes);
617 for (i = 0; i <= dfa->state_hash_mask; ++i)
619 struct re_state_table_entry *entry = dfa->state_table + i;
620 for (j = 0; j < entry->num; ++j)
622 re_dfastate_t *state = entry->array[j];
623 if (state->entrance_nodes != &state->nodes)
625 re_node_set_free (state->entrance_nodes);
626 re_free (state->entrance_nodes);
628 re_node_set_free (&state->nodes);
629 re_free (state->trtable);
630 re_free (state->trtable_search);
631 re_free (state);
633 re_free (entry->array);
635 re_free (dfa->state_table);
637 if (dfa->word_char != NULL)
638 re_free (dfa->word_char);
639 #ifdef DEBUG
640 re_free (dfa->re_str);
641 #endif
642 re_free (dfa);
644 re_free (preg->fastmap);
646 #ifdef _LIBC
647 weak_alias (__regfree, regfree)
648 #endif
650 /* Entry points compatible with 4.2 BSD regex library. We don't define
651 them unless specifically requested. */
653 #if defined _REGEX_RE_COMP || defined _LIBC
655 /* BSD has one and only one pattern buffer. */
656 static struct re_pattern_buffer re_comp_buf;
658 char *
659 # ifdef _LIBC
660 /* Make these definitions weak in libc, so POSIX programs can redefine
661 these names if they don't use our functions, and still use
662 regcomp/regexec above without link errors. */
663 weak_function
664 # endif
665 re_comp (s)
666 const char *s;
668 reg_errcode_t ret;
670 if (!s)
672 if (!re_comp_buf.buffer)
673 return gettext ("No previous regular expression");
674 return 0;
677 if (!re_comp_buf.buffer)
679 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
680 if (re_comp_buf.fastmap == NULL)
681 return (char *) gettext (__re_error_msgid
682 + __re_error_msgid_idx[(int) REG_ESPACE]);
685 /* Since `re_exec' always passes NULL for the `regs' argument, we
686 don't need to initialize the pattern buffer fields which affect it. */
688 /* Match anchors at newlines. */
689 re_comp_buf.newline_anchor = 1;
691 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
693 if (!ret)
694 return NULL;
696 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
697 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
699 #endif /* _REGEX_RE_COMP */
701 /* Internal entry point.
702 Compile the regular expression PATTERN, whose length is LENGTH.
703 SYNTAX indicate regular expression's syntax. */
705 static reg_errcode_t
706 re_compile_internal (preg, pattern, length, syntax)
707 regex_t *preg;
708 const char * pattern;
709 int length;
710 reg_syntax_t syntax;
712 reg_errcode_t err = REG_NOERROR;
713 re_dfa_t *dfa;
714 re_string_t regexp;
716 /* Initialize the pattern buffer. */
717 preg->fastmap_accurate = 0;
718 preg->syntax = syntax;
719 preg->not_bol = preg->not_eol = 0;
720 preg->used = 0;
721 preg->re_nsub = 0;
723 /* Initialize the dfa. */
724 dfa = (re_dfa_t *) preg->buffer;
725 if (preg->allocated < sizeof (re_dfa_t))
727 /* If zero allocated, but buffer is non-null, try to realloc
728 enough space. This loses if buffer's address is bogus, but
729 that is the user's responsibility. If ->buffer is NULL this
730 is a simple allocation. */
731 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
732 if (dfa == NULL)
733 return REG_ESPACE;
734 preg->allocated = sizeof (re_dfa_t);
736 preg->buffer = (unsigned char *) dfa;
737 preg->used = sizeof (re_dfa_t);
739 err = init_dfa (dfa, length);
740 if (BE (err != REG_NOERROR, 0))
742 re_free (dfa);
743 preg->buffer = NULL;
744 return err;
746 #ifdef DEBUG
747 dfa->re_str = re_malloc (char, length + 1);
748 strncpy (dfa->re_str, pattern, length + 1);
749 #endif
751 err = re_string_construct (&regexp, pattern, length, preg->translate,
752 syntax & RE_ICASE);
753 if (BE (err != REG_NOERROR, 0))
755 re_free (dfa);
756 preg->buffer = NULL;
757 return err;
760 /* Parse the regular expression, and build a structure tree. */
761 preg->re_nsub = 0;
762 dfa->str_tree = parse (&regexp, preg, syntax, &err);
763 if (BE (dfa->str_tree == NULL, 0))
764 goto re_compile_internal_free_return;
766 /* Analyze the tree and collect information which is necessary to
767 create the dfa. */
768 err = analyze (dfa);
769 if (BE (err != REG_NOERROR, 0))
770 goto re_compile_internal_free_return;
772 /* Then create the initial state of the dfa. */
773 err = create_initial_state (dfa);
774 if (BE (err != REG_NOERROR, 0))
775 goto re_compile_internal_free_return;
777 re_compile_internal_free_return:
778 /* Release work areas. */
779 free_workarea_compile (preg);
780 re_string_destruct (&regexp);
782 return err;
785 /* Initialize DFA. We use the length of the regular expression PAT_LEN
786 as the initial length of some arrays. */
788 static reg_errcode_t
789 init_dfa (dfa, pat_len)
790 re_dfa_t *dfa;
791 int pat_len;
793 int table_size;
795 memset (dfa, '\0', sizeof (re_dfa_t));
797 dfa->nodes_alloc = pat_len + 1;
798 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
800 dfa->states_alloc = pat_len + 1;
802 /* table_size = 2 ^ ceil(log pat_len) */
803 for (table_size = 1; table_size > 0; table_size <<= 1)
804 if (table_size > pat_len)
805 break;
807 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
808 dfa->state_hash_mask = table_size - 1;
810 dfa->subexps_alloc = 1;
811 dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
812 dfa->word_char = NULL;
814 if (BE (dfa->nodes == NULL || dfa->state_table == NULL
815 || dfa->subexps == NULL, 0))
817 /* We don't bother to free anything which was allocated. Very
818 soon the process will go down anyway. */
819 dfa->subexps = NULL;
820 dfa->state_table = NULL;
821 dfa->nodes = NULL;
822 return REG_ESPACE;
824 return REG_NOERROR;
827 /* Initialize WORD_CHAR table, which indicate which character is
828 "word". In this case "word" means that it is the word construction
829 character used by some operators like "\<", "\>", etc. */
831 static reg_errcode_t
832 init_word_char (dfa)
833 re_dfa_t *dfa;
835 int i, j, ch;
836 dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
837 if (BE (dfa->word_char == NULL, 0))
838 return REG_ESPACE;
839 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
840 for (j = 0; j < UINT_BITS; ++j, ++ch)
841 if (isalnum (ch) || ch == '_')
842 dfa->word_char[i] |= 1 << j;
843 return REG_NOERROR;
846 /* Free the work area which are only used while compiling. */
848 static void
849 free_workarea_compile (preg)
850 regex_t *preg;
852 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
853 free_bin_tree (dfa->str_tree);
854 dfa->str_tree = NULL;
857 /* Create initial states for all contexts. */
859 static reg_errcode_t
860 create_initial_state (dfa)
861 re_dfa_t *dfa;
863 int first, i;
864 reg_errcode_t err;
865 re_node_set init_nodes;
867 /* Initial states have the epsilon closure of the node which is
868 the first node of the regular expression. */
869 first = dfa->str_tree->first;
870 dfa->init_node = first;
871 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
872 if (BE (err != REG_NOERROR, 0))
873 return err;
875 /* The back-references which are in initial states can epsilon transit,
876 since in this case all of the subexpressions can be null.
877 Then we add epsilon closures of the nodes which are the next nodes of
878 the back-references. */
879 if (dfa->nbackref > 0)
880 for (i = 0; i < init_nodes.nelem; ++i)
882 int node_idx = init_nodes.elems[i];
883 re_token_type_t type = dfa->nodes[node_idx].type;
885 int clexp_idx;
886 int entity = (type != OP_CONTEXT_NODE ? node_idx
887 : dfa->nodes[node_idx].opr.ctx_info->entity);
888 if ((type != OP_CONTEXT_NODE
889 || (dfa->nodes[entity].type != OP_BACK_REF))
890 && (type != OP_BACK_REF))
891 continue;
892 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
894 re_token_t *clexp_node;
895 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
896 if (clexp_node->type == OP_CLOSE_SUBEXP
897 && clexp_node->opr.idx + 1 == dfa->nodes[entity].opr.idx)
898 break;
900 if (clexp_idx == init_nodes.nelem)
901 continue;
903 if (type == OP_CONTEXT_NODE
904 && (dfa->nodes[dfa->nodes[node_idx].opr.ctx_info->entity].type
905 == OP_BACK_REF))
907 int prev_nelem = init_nodes.nelem;
908 re_node_set_merge (&init_nodes,
909 dfa->nodes[node_idx].opr.ctx_info->bkref_eclosure);
910 if (prev_nelem < init_nodes.nelem)
911 i = 0;
913 else if (type == OP_BACK_REF)
915 int next_idx = dfa->nexts[node_idx];
916 if (!re_node_set_contains (&init_nodes, next_idx))
918 re_node_set_merge (&init_nodes, dfa->eclosures + next_idx);
919 i = 0;
924 /* It must be the first time to invoke acquire_state. */
925 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
926 /* We don't check ERR here, since the initial state must not be NULL. */
927 if (BE (dfa->init_state == NULL, 0))
928 return err;
929 if (dfa->init_state->has_constraint)
931 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
932 CONTEXT_WORD);
933 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
934 CONTEXT_NEWLINE);
935 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
936 &init_nodes,
937 CONTEXT_NEWLINE
938 | CONTEXT_BEGBUF);
939 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
940 || dfa->init_state_begbuf == NULL, 0))
941 return err;
943 else
944 dfa->init_state_word = dfa->init_state_nl
945 = dfa->init_state_begbuf = dfa->init_state;
947 re_node_set_free (&init_nodes);
948 return REG_NOERROR;
951 /* Analyze the structure tree, and calculate "first", "next", "edest",
952 "eclosure", and "inveclosure". */
954 static reg_errcode_t
955 analyze (dfa)
956 re_dfa_t *dfa;
958 int i;
959 reg_errcode_t ret;
961 /* Allocate arrays. */
962 dfa->firsts = re_malloc (int, dfa->nodes_alloc);
963 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
964 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
965 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
966 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
967 if (BE (dfa->firsts == NULL || dfa->nexts == NULL || dfa->edests == NULL
968 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
969 return REG_ESPACE;
970 /* Initialize them. */
971 for (i = 0; i < dfa->nodes_len; ++i)
973 dfa->firsts[i] = -1;
974 dfa->nexts[i] = -1;
975 re_node_set_init_empty (dfa->edests + i);
976 re_node_set_init_empty (dfa->eclosures + i);
977 re_node_set_init_empty (dfa->inveclosures + i);
980 ret = analyze_tree (dfa, dfa->str_tree);
981 if (BE (ret == REG_NOERROR, 1))
983 ret = calc_eclosure (dfa);
984 if (ret == REG_NOERROR)
985 calc_inveclosure (dfa);
987 return ret;
990 /* Helper functions for analyze.
991 This function calculate "first", "next", and "edest" for the subtree
992 whose root is NODE. */
994 static reg_errcode_t
995 analyze_tree (dfa, node)
996 re_dfa_t *dfa;
997 bin_tree_t *node;
999 reg_errcode_t ret;
1000 if (node->first == -1)
1001 calc_first (dfa, node);
1002 if (node->next == -1)
1003 calc_next (dfa, node);
1004 if (node->eclosure.nelem == 0)
1005 calc_epsdest (dfa, node);
1006 /* Calculate "first" etc. for the left child. */
1007 if (node->left != NULL)
1009 ret = analyze_tree (dfa, node->left);
1010 if (BE (ret != REG_NOERROR, 0))
1011 return ret;
1013 /* Calculate "first" etc. for the right child. */
1014 if (node->right != NULL)
1016 ret = analyze_tree (dfa, node->right);
1017 if (BE (ret != REG_NOERROR, 0))
1018 return ret;
1020 return REG_NOERROR;
1023 /* Calculate "first" for the node NODE. */
1024 static void
1025 calc_first (dfa, node)
1026 re_dfa_t *dfa;
1027 bin_tree_t *node;
1029 int idx, type;
1030 idx = node->node_idx;
1031 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1033 switch (type)
1035 #ifdef DEBUG
1036 case OP_OPEN_BRACKET:
1037 case OP_CLOSE_BRACKET:
1038 case OP_OPEN_DUP_NUM:
1039 case OP_CLOSE_DUP_NUM:
1040 case OP_NON_MATCH_LIST:
1041 case OP_OPEN_COLL_ELEM:
1042 case OP_CLOSE_COLL_ELEM:
1043 case OP_OPEN_EQUIV_CLASS:
1044 case OP_CLOSE_EQUIV_CLASS:
1045 case OP_OPEN_CHAR_CLASS:
1046 case OP_CLOSE_CHAR_CLASS:
1047 /* These must not be appeared here. */
1048 assert (0);
1049 #endif
1050 case END_OF_RE:
1051 case CHARACTER:
1052 case OP_PERIOD:
1053 case OP_DUP_ASTERISK:
1054 case OP_DUP_QUESTION:
1055 #ifdef RE_ENABLE_I18N
1056 case COMPLEX_BRACKET:
1057 #endif /* RE_ENABLE_I18N */
1058 case SIMPLE_BRACKET:
1059 case OP_BACK_REF:
1060 case ANCHOR:
1061 case OP_OPEN_SUBEXP:
1062 case OP_CLOSE_SUBEXP:
1063 node->first = idx;
1064 break;
1065 case OP_DUP_PLUS:
1066 #ifdef DEBUG
1067 assert (node->left != NULL);
1068 #endif
1069 if (node->left->first == -1)
1070 calc_first (dfa, node->left);
1071 node->first = node->left->first;
1072 break;
1073 case OP_ALT:
1074 node->first = idx;
1075 break;
1076 /* else fall through */
1077 default:
1078 #ifdef DEBUG
1079 assert (node->left != NULL);
1080 #endif
1081 if (node->left->first == -1)
1082 calc_first (dfa, node->left);
1083 node->first = node->left->first;
1084 break;
1086 if (node->type == 0)
1087 dfa->firsts[idx] = node->first;
1090 /* Calculate "next" for the node NODE. */
1092 static void
1093 calc_next (dfa, node)
1094 re_dfa_t *dfa;
1095 bin_tree_t *node;
1097 int idx, type;
1098 bin_tree_t *parent = node->parent;
1099 if (parent == NULL)
1101 node->next = -1;
1102 idx = node->node_idx;
1103 if (node->type == 0)
1104 dfa->nexts[idx] = node->next;
1105 return;
1108 idx = parent->node_idx;
1109 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1111 switch (type)
1113 case OP_DUP_ASTERISK:
1114 case OP_DUP_PLUS:
1115 node->next = idx;
1116 break;
1117 case CONCAT:
1118 if (parent->left == node)
1120 if (parent->right->first == -1)
1121 calc_first (dfa, parent->right);
1122 node->next = parent->right->first;
1123 break;
1125 /* else fall through */
1126 default:
1127 if (parent->next == -1)
1128 calc_next (dfa, parent);
1129 node->next = parent->next;
1130 break;
1132 idx = node->node_idx;
1133 if (node->type == 0)
1134 dfa->nexts[idx] = node->next;
1137 /* Calculate "edest" for the node NODE. */
1139 static void
1140 calc_epsdest (dfa, node)
1141 re_dfa_t *dfa;
1142 bin_tree_t *node;
1144 int idx;
1145 idx = node->node_idx;
1146 if (node->type == 0)
1148 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1149 || dfa->nodes[idx].type == OP_DUP_PLUS
1150 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1152 if (node->left->first == -1)
1153 calc_first (dfa, node->left);
1154 if (node->next == -1)
1155 calc_next (dfa, node);
1156 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1157 node->next);
1159 else if (dfa->nodes[idx].type == OP_ALT)
1161 int left, right;
1162 if (node->left != NULL)
1164 if (node->left->first == -1)
1165 calc_first (dfa, node->left);
1166 left = node->left->first;
1168 else
1170 if (node->next == -1)
1171 calc_next (dfa, node);
1172 left = node->next;
1174 if (node->right != NULL)
1176 if (node->right->first == -1)
1177 calc_first (dfa, node->right);
1178 right = node->right->first;
1180 else
1182 if (node->next == -1)
1183 calc_next (dfa, node);
1184 right = node->next;
1186 re_node_set_init_2 (dfa->edests + idx, left, right);
1188 else if (dfa->nodes[idx].type == ANCHOR
1189 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1190 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP)
1191 re_node_set_init_1 (dfa->edests + idx, node->next);
1195 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1196 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1197 otherwise return the error code. */
1199 static reg_errcode_t
1200 duplicate_node (new_idx, dfa, org_idx, constraint)
1201 re_dfa_t *dfa;
1202 int *new_idx, org_idx;
1203 unsigned int constraint;
1205 re_token_t dup;
1206 int dup_idx;
1207 reg_errcode_t err;
1209 dup.type = OP_CONTEXT_NODE;
1210 if (dfa->nodes[org_idx].type == OP_CONTEXT_NODE)
1212 /* If the node whose index is ORG_IDX is the same as the intended
1213 node, use it. */
1214 if (dfa->nodes[org_idx].constraint == constraint)
1216 *new_idx = org_idx;
1217 return REG_NOERROR;
1219 dup.constraint = constraint |
1220 dfa->nodes[org_idx].constraint;
1222 else
1223 dup.constraint = constraint;
1225 /* In case that `entity' points OP_CONTEXT_NODE,
1226 we correct `entity' to real entity in calc_inveclosures(). */
1227 dup.opr.ctx_info = malloc (sizeof (*dup.opr.ctx_info));
1228 dup_idx = re_dfa_add_node (dfa, dup, 1);
1229 if (BE (dup.opr.ctx_info == NULL || dup_idx == -1, 0))
1230 return REG_ESPACE;
1231 dup.opr.ctx_info->entity = org_idx;
1232 dup.opr.ctx_info->bkref_eclosure = NULL;
1234 dfa->nodes[dup_idx].duplicated = 1;
1235 dfa->firsts[dup_idx] = dfa->firsts[org_idx];
1236 dfa->nexts[dup_idx] = dfa->nexts[org_idx];
1237 err = re_node_set_init_copy (dfa->edests + dup_idx, dfa->edests + org_idx);
1238 if (BE (err != REG_NOERROR, 0))
1239 return err;
1240 /* Since we don't duplicate epsilon nodes, epsilon closure have
1241 only itself. */
1242 err = re_node_set_init_1 (dfa->eclosures + dup_idx, dup_idx);
1243 if (BE (err != REG_NOERROR, 0))
1244 return err;
1245 err = re_node_set_init_1 (dfa->inveclosures + dup_idx, dup_idx);
1246 if (BE (err != REG_NOERROR, 0))
1247 return err;
1248 /* Then we must update inveclosure for this node.
1249 We process them at last part of calc_eclosure(),
1250 since we don't complete to calculate them here. */
1252 *new_idx = dup_idx;
1253 return REG_NOERROR;
1256 static void
1257 calc_inveclosure (dfa)
1258 re_dfa_t *dfa;
1260 int src, idx, dest, entity;
1261 for (src = 0; src < dfa->nodes_len; ++src)
1263 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1265 dest = dfa->eclosures[src].elems[idx];
1266 re_node_set_insert (dfa->inveclosures + dest, src);
1269 entity = src;
1270 while (dfa->nodes[entity].type == OP_CONTEXT_NODE)
1272 entity = dfa->nodes[entity].opr.ctx_info->entity;
1273 re_node_set_merge (dfa->inveclosures + src,
1274 dfa->inveclosures + entity);
1275 dfa->nodes[src].opr.ctx_info->entity = entity;
1280 /* Calculate "eclosure" for all the node in DFA. */
1282 static reg_errcode_t
1283 calc_eclosure (dfa)
1284 re_dfa_t *dfa;
1286 int idx, node_idx, max, incomplete = 0;
1287 #ifdef DEBUG
1288 assert (dfa->nodes_len > 0);
1289 #endif
1290 /* For each nodes, calculate epsilon closure. */
1291 for (node_idx = 0, max = dfa->nodes_len; ; ++node_idx)
1293 reg_errcode_t err;
1294 re_node_set eclosure_elem;
1295 if (node_idx == max)
1297 if (!incomplete)
1298 break;
1299 incomplete = 0;
1300 node_idx = 0;
1303 #ifdef DEBUG
1304 assert (dfa->nodes[node_idx].type != OP_CONTEXT_NODE);
1305 assert (dfa->eclosures[node_idx].nelem != -1);
1306 #endif
1307 /* If we have already calculated, skip it. */
1308 if (dfa->eclosures[node_idx].nelem != 0)
1309 continue;
1310 /* Calculate epsilon closure of `node_idx'. */
1311 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1312 if (BE (err != REG_NOERROR, 0))
1313 return err;
1315 if (dfa->eclosures[node_idx].nelem == 0)
1317 incomplete = 1;
1318 re_node_set_free (&eclosure_elem);
1322 /* for duplicated nodes. */
1323 for (idx = max; idx < dfa->nodes_len; ++idx)
1325 int entity, i, constraint;
1326 re_node_set *bkref_eclosure;
1327 entity = dfa->nodes[idx].opr.ctx_info->entity;
1328 re_node_set_merge (dfa->inveclosures + idx, dfa->inveclosures + entity);
1329 if (dfa->nodes[entity].type != OP_BACK_REF)
1330 continue;
1332 /* If the node is backreference, duplicate the epsilon closure of
1333 the next node. Since it may epsilon transit. */
1334 /* Note: duplicate_node() may realloc dfa->eclosures, etc. */
1335 bkref_eclosure = re_malloc (re_node_set, 1);
1336 if (BE (bkref_eclosure == NULL, 0))
1337 return REG_ESPACE;
1338 re_node_set_init_empty (bkref_eclosure);
1339 constraint = dfa->nodes[idx].constraint;
1340 for (i = 0; i < dfa->eclosures[dfa->nexts[idx]].nelem; ++i)
1342 int dest_node_idx = dfa->eclosures[dfa->nexts[idx]].elems[i];
1343 if (!IS_EPSILON_NODE (dfa->nodes[dest_node_idx].type))
1345 reg_errcode_t err;
1346 err = duplicate_node (&dest_node_idx, dfa, dest_node_idx,
1347 constraint);
1348 if (BE (err != REG_NOERROR, 0))
1349 return err;
1351 re_node_set_insert (bkref_eclosure, dest_node_idx);
1353 dfa->nodes[idx].opr.ctx_info->bkref_eclosure = bkref_eclosure;
1356 return REG_NOERROR;
1359 /* Calculate epsilon closure of NODE. */
1361 static reg_errcode_t
1362 calc_eclosure_iter (new_set, dfa, node, root)
1363 re_node_set *new_set;
1364 re_dfa_t *dfa;
1365 int node, root;
1367 reg_errcode_t err;
1368 unsigned int constraint;
1369 int i, max, incomplete = 0;
1370 re_node_set eclosure;
1371 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1372 if (BE (err != REG_NOERROR, 0))
1373 return err;
1375 /* This indicates that we are calculating this node now.
1376 We reference this value to avoid infinite loop. */
1377 dfa->eclosures[node].nelem = -1;
1379 constraint = ((dfa->nodes[node].type == ANCHOR)
1380 ? dfa->nodes[node].opr.ctx_type : 0);
1382 /* Expand each epsilon destination nodes. */
1383 if (dfa->edests[node].nelem != 0)
1384 for (i = 0; i < dfa->edests[node].nelem; ++i)
1386 re_node_set eclosure_elem;
1387 int edest = dfa->edests[node].elems[i];
1388 /* If calculating the epsilon closure of `edest' is in progress,
1389 return intermediate result. */
1390 if (dfa->eclosures[edest].nelem == -1)
1392 incomplete = 1;
1393 continue;
1395 /* If we haven't calculated the epsilon closure of `edest' yet,
1396 calculate now. Otherwise use calculated epsilon closure. */
1397 if (dfa->eclosures[edest].nelem == 0)
1399 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1400 if (BE (err != REG_NOERROR, 0))
1401 return err;
1403 else
1404 eclosure_elem = dfa->eclosures[edest];
1405 /* Merge the epsilon closure of `edest'. */
1406 re_node_set_merge (&eclosure, &eclosure_elem);
1407 /* If the epsilon closure of `edest' is incomplete,
1408 the epsilon closure of this node is also incomplete. */
1409 if (dfa->eclosures[edest].nelem == 0)
1411 incomplete = 1;
1412 re_node_set_free (&eclosure_elem);
1416 /* If the current node has constraints, duplicate all non-epsilon nodes.
1417 Since they must inherit the constraints. */
1418 if (constraint)
1419 for (i = 0, max = eclosure.nelem; i < max; ++i)
1421 int dest = eclosure.elems[i];
1422 if (!IS_EPSILON_NODE (dfa->nodes[dest].type))
1424 int dup_dest;
1425 reg_errcode_t err;
1426 err = duplicate_node (&dup_dest, dfa, dest, constraint);
1427 if (BE (err != REG_NOERROR, 0))
1428 return err;
1429 if (dest != dup_dest)
1431 re_node_set_remove_at (&eclosure, i--);
1432 re_node_set_insert (&eclosure, dup_dest);
1433 --max;
1438 /* Epsilon closures include itself. */
1439 re_node_set_insert (&eclosure, node);
1440 if (incomplete && !root)
1441 dfa->eclosures[node].nelem = 0;
1442 else
1443 dfa->eclosures[node] = eclosure;
1444 *new_set = eclosure;
1445 return REG_NOERROR;
1448 /* Functions for token which are used in the parser. */
1450 /* Fetch a token from INPUT.
1451 We must not use this function inside bracket expressions. */
1453 static re_token_t
1454 fetch_token (input, syntax)
1455 re_string_t *input;
1456 reg_syntax_t syntax;
1458 re_token_t token;
1459 int consumed_byte;
1460 consumed_byte = peek_token (&token, input, syntax);
1461 re_string_skip_bytes (input, consumed_byte);
1462 return token;
1465 /* Peek a token from INPUT, and return the length of the token.
1466 We must not use this function inside bracket expressions. */
1468 static int
1469 peek_token (token, input, syntax)
1470 re_token_t *token;
1471 re_string_t *input;
1472 reg_syntax_t syntax;
1474 unsigned char c;
1476 if (re_string_eoi (input))
1478 token->type = END_OF_RE;
1479 return 0;
1482 c = re_string_peek_byte (input, 0);
1483 token->opr.c = c;
1485 #ifdef RE_ENABLE_I18N
1486 token->mb_partial = 0;
1487 if (MB_CUR_MAX > 1 &&
1488 !re_string_first_byte (input, re_string_cur_idx (input)))
1490 token->type = CHARACTER;
1491 token->mb_partial = 1;
1492 return 1;
1494 #endif
1495 if (c == '\\')
1497 unsigned char c2;
1498 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1500 token->type = BACK_SLASH;
1501 return 1;
1504 c2 = re_string_peek_byte_case (input, 1);
1505 token->opr.c = c2;
1506 token->type = CHARACTER;
1507 switch (c2)
1509 case '|':
1510 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1511 token->type = OP_ALT;
1512 break;
1513 case '1': case '2': case '3': case '4': case '5':
1514 case '6': case '7': case '8': case '9':
1515 if (!(syntax & RE_NO_BK_REFS))
1517 token->type = OP_BACK_REF;
1518 token->opr.idx = c2 - '0';
1520 break;
1521 case '<':
1522 if (!(syntax & RE_NO_GNU_OPS))
1524 token->type = ANCHOR;
1525 token->opr.idx = WORD_FIRST;
1527 break;
1528 case '>':
1529 if (!(syntax & RE_NO_GNU_OPS))
1531 token->type = ANCHOR;
1532 token->opr.idx = WORD_LAST;
1534 break;
1535 case 'b':
1536 if (!(syntax & RE_NO_GNU_OPS))
1538 token->type = ANCHOR;
1539 token->opr.idx = WORD_DELIM;
1541 break;
1542 case 'B':
1543 if (!(syntax & RE_NO_GNU_OPS))
1545 token->type = ANCHOR;
1546 token->opr.idx = INSIDE_WORD;
1548 break;
1549 case 'w':
1550 if (!(syntax & RE_NO_GNU_OPS))
1551 token->type = OP_WORD;
1552 break;
1553 case 'W':
1554 if (!(syntax & RE_NO_GNU_OPS))
1555 token->type = OP_NOTWORD;
1556 break;
1557 case '`':
1558 if (!(syntax & RE_NO_GNU_OPS))
1560 token->type = ANCHOR;
1561 token->opr.idx = BUF_FIRST;
1563 break;
1564 case '\'':
1565 if (!(syntax & RE_NO_GNU_OPS))
1567 token->type = ANCHOR;
1568 token->opr.idx = BUF_LAST;
1570 break;
1571 case '(':
1572 if (!(syntax & RE_NO_BK_PARENS))
1573 token->type = OP_OPEN_SUBEXP;
1574 break;
1575 case ')':
1576 if (!(syntax & RE_NO_BK_PARENS))
1577 token->type = OP_CLOSE_SUBEXP;
1578 break;
1579 case '+':
1580 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1581 token->type = OP_DUP_PLUS;
1582 break;
1583 case '?':
1584 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1585 token->type = OP_DUP_QUESTION;
1586 break;
1587 case '{':
1588 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1589 token->type = OP_OPEN_DUP_NUM;
1590 break;
1591 case '}':
1592 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1593 token->type = OP_CLOSE_DUP_NUM;
1594 break;
1595 default:
1596 break;
1598 return 2;
1601 token->type = CHARACTER;
1602 switch (c)
1604 case '\n':
1605 if (syntax & RE_NEWLINE_ALT)
1606 token->type = OP_ALT;
1607 break;
1608 case '|':
1609 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1610 token->type = OP_ALT;
1611 break;
1612 case '*':
1613 token->type = OP_DUP_ASTERISK;
1614 break;
1615 case '+':
1616 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1617 token->type = OP_DUP_PLUS;
1618 break;
1619 case '?':
1620 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1621 token->type = OP_DUP_QUESTION;
1622 break;
1623 case '{':
1624 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1625 token->type = OP_OPEN_DUP_NUM;
1626 break;
1627 case '}':
1628 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1629 token->type = OP_CLOSE_DUP_NUM;
1630 break;
1631 case '(':
1632 if (syntax & RE_NO_BK_PARENS)
1633 token->type = OP_OPEN_SUBEXP;
1634 break;
1635 case ')':
1636 if (syntax & RE_NO_BK_PARENS)
1637 token->type = OP_CLOSE_SUBEXP;
1638 break;
1639 case '[':
1640 token->type = OP_OPEN_BRACKET;
1641 break;
1642 case '.':
1643 token->type = OP_PERIOD;
1644 break;
1645 case '^':
1646 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1647 re_string_cur_idx (input) != 0)
1649 char prev = re_string_peek_byte (input, -1);
1650 if (prev != '|' && prev != '(' &&
1651 (!(syntax & RE_NEWLINE_ALT) || prev != '\n'))
1652 break;
1654 token->type = ANCHOR;
1655 token->opr.idx = LINE_FIRST;
1656 break;
1657 case '$':
1658 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1659 re_string_cur_idx (input) + 1 != re_string_length (input))
1661 re_token_t next;
1662 re_string_skip_bytes (input, 1);
1663 peek_token (&next, input, syntax);
1664 re_string_skip_bytes (input, -1);
1665 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1666 break;
1668 token->type = ANCHOR;
1669 token->opr.idx = LINE_LAST;
1670 break;
1671 default:
1672 break;
1674 return 1;
1677 /* Peek a token from INPUT, and return the length of the token.
1678 We must not use this function out of bracket expressions. */
1680 static int
1681 peek_token_bracket (token, input, syntax)
1682 re_token_t *token;
1683 re_string_t *input;
1684 reg_syntax_t syntax;
1686 unsigned char c;
1687 if (re_string_eoi (input))
1689 token->type = END_OF_RE;
1690 return 0;
1692 c = re_string_peek_byte (input, 0);
1693 token->opr.c = c;
1695 #ifdef RE_ENABLE_I18N
1696 if (MB_CUR_MAX > 1 &&
1697 !re_string_first_byte (input, re_string_cur_idx (input)))
1699 token->type = CHARACTER;
1700 return 1;
1702 #endif /* RE_ENABLE_I18N */
1704 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1706 /* In this case, '\' escape a character. */
1707 unsigned char c2;
1708 c2 = re_string_peek_byte (input, 1);
1709 token->opr.c = c2;
1710 token->type = CHARACTER;
1711 return 1;
1713 if (c == '[') /* '[' is a special char in a bracket exps. */
1715 unsigned char c2;
1716 int token_len;
1717 c2 = re_string_peek_byte (input, 1);
1718 token->opr.c = c2;
1719 token_len = 2;
1720 switch (c2)
1722 case '.':
1723 token->type = OP_OPEN_COLL_ELEM;
1724 break;
1725 case '=':
1726 token->type = OP_OPEN_EQUIV_CLASS;
1727 break;
1728 case ':':
1729 if (syntax & RE_CHAR_CLASSES)
1731 token->type = OP_OPEN_CHAR_CLASS;
1732 break;
1734 /* else fall through. */
1735 default:
1736 token->type = CHARACTER;
1737 token->opr.c = c;
1738 token_len = 1;
1739 break;
1741 return token_len;
1743 switch (c)
1745 case '-':
1746 token->type = OP_CHARSET_RANGE;
1747 break;
1748 case ']':
1749 token->type = OP_CLOSE_BRACKET;
1750 break;
1751 case '^':
1752 token->type = OP_NON_MATCH_LIST;
1753 break;
1754 default:
1755 token->type = CHARACTER;
1757 return 1;
1760 /* Functions for parser. */
1762 /* Entry point of the parser.
1763 Parse the regular expression REGEXP and return the structure tree.
1764 If an error is occured, ERR is set by error code, and return NULL.
1765 This function build the following tree, from regular expression <reg_exp>:
1769 <reg_exp> EOR
1771 CAT means concatenation.
1772 EOR means end of regular expression. */
1774 static bin_tree_t *
1775 parse (regexp, preg, syntax, err)
1776 re_string_t *regexp;
1777 regex_t *preg;
1778 reg_syntax_t syntax;
1779 reg_errcode_t *err;
1781 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1782 bin_tree_t *tree, *eor, *root;
1783 re_token_t current_token;
1784 int new_idx;
1785 current_token = fetch_token (regexp, syntax);
1786 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
1787 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1788 return NULL;
1789 new_idx = re_dfa_add_node (dfa, current_token, 0);
1790 eor = create_tree (NULL, NULL, 0, new_idx);
1791 if (tree != NULL)
1792 root = create_tree (tree, eor, CONCAT, 0);
1793 else
1794 root = eor;
1795 if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1796 return *err = REG_ESPACE, NULL;
1797 return root;
1800 /* This function build the following tree, from regular expression
1801 <branch1>|<branch2>:
1805 <branch1> <branch2>
1807 ALT means alternative, which represents the operator `|'. */
1809 static bin_tree_t *
1810 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1811 re_string_t *regexp;
1812 regex_t *preg;
1813 re_token_t *token;
1814 reg_syntax_t syntax;
1815 int nest;
1816 reg_errcode_t *err;
1818 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1819 bin_tree_t *tree, *branch = NULL;
1820 int new_idx;
1821 tree = parse_branch (regexp, preg, token, syntax, nest, err);
1822 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1823 return NULL;
1825 while (token->type == OP_ALT)
1827 re_token_t alt_token = *token;
1828 new_idx = re_dfa_add_node (dfa, alt_token, 0);
1829 *token = fetch_token (regexp, syntax);
1830 if (token->type != OP_ALT && token->type != END_OF_RE
1831 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1833 branch = parse_branch (regexp, preg, token, syntax, nest, err);
1834 if (BE (*err != REG_NOERROR && branch == NULL, 0))
1836 free_bin_tree (tree);
1837 return NULL;
1840 else
1841 branch = NULL;
1842 tree = create_tree (tree, branch, 0, new_idx);
1843 if (BE (new_idx == -1 || tree == NULL, 0))
1844 return *err = REG_ESPACE, NULL;
1845 dfa->has_plural_match = 1;
1847 return tree;
1850 /* This function build the following tree, from regular expression
1851 <exp1><exp2>:
1855 <exp1> <exp2>
1857 CAT means concatenation. */
1859 static bin_tree_t *
1860 parse_branch (regexp, preg, token, syntax, nest, err)
1861 re_string_t *regexp;
1862 regex_t *preg;
1863 re_token_t *token;
1864 reg_syntax_t syntax;
1865 int nest;
1866 reg_errcode_t *err;
1868 bin_tree_t *tree, *exp;
1869 tree = parse_expression (regexp, preg, token, syntax, nest, err);
1870 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1871 return NULL;
1873 while (token->type != OP_ALT && token->type != END_OF_RE
1874 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1876 exp = parse_expression (regexp, preg, token, syntax, nest, err);
1877 if (BE (*err != REG_NOERROR && exp == NULL, 0))
1879 free_bin_tree (tree);
1880 return NULL;
1882 if (tree != NULL && exp != NULL)
1884 tree = create_tree (tree, exp, CONCAT, 0);
1885 if (tree == NULL)
1886 return *err = REG_ESPACE, NULL;
1888 else if (tree == NULL)
1889 tree = exp;
1890 /* Otherwise exp == NULL, we don't need to create new tree. */
1892 return tree;
1895 /* This function build the following tree, from regular expression a*:
1901 static bin_tree_t *
1902 parse_expression (regexp, preg, token, syntax, nest, err)
1903 re_string_t *regexp;
1904 regex_t *preg;
1905 re_token_t *token;
1906 reg_syntax_t syntax;
1907 int nest;
1908 reg_errcode_t *err;
1910 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1911 bin_tree_t *tree;
1912 int new_idx;
1913 switch (token->type)
1915 case CHARACTER:
1916 new_idx = re_dfa_add_node (dfa, *token, 0);
1917 tree = create_tree (NULL, NULL, 0, new_idx);
1918 if (BE (new_idx == -1 || tree == NULL, 0))
1919 return *err = REG_ESPACE, NULL;
1920 #ifdef RE_ENABLE_I18N
1921 if (MB_CUR_MAX > 1)
1923 while (!re_string_eoi (regexp)
1924 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
1926 bin_tree_t *mbc_remain;
1927 *token = fetch_token (regexp, syntax);
1928 new_idx = re_dfa_add_node (dfa, *token, 0);
1929 mbc_remain = create_tree (NULL, NULL, 0, new_idx);
1930 tree = create_tree (tree, mbc_remain, CONCAT, 0);
1931 if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
1932 return *err = REG_ESPACE, NULL;
1935 #endif
1936 break;
1937 case OP_OPEN_SUBEXP:
1938 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
1939 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1940 return NULL;
1941 break;
1942 case OP_OPEN_BRACKET:
1943 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
1944 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1945 return NULL;
1946 break;
1947 case OP_BACK_REF:
1948 if (BE (preg->re_nsub < token->opr.idx
1949 || dfa->subexps[token->opr.idx - 1].end == -1, 0))
1951 *err = REG_ESUBREG;
1952 return NULL;
1954 new_idx = re_dfa_add_node (dfa, *token, 0);
1955 tree = create_tree (NULL, NULL, 0, new_idx);
1956 if (BE (new_idx == -1 || tree == NULL, 0))
1957 return *err = REG_ESPACE, NULL;
1958 ++dfa->nbackref;
1959 dfa->has_mb_node = 1;
1960 break;
1961 case OP_DUP_ASTERISK:
1962 case OP_DUP_PLUS:
1963 case OP_DUP_QUESTION:
1964 case OP_OPEN_DUP_NUM:
1965 if (syntax & RE_CONTEXT_INVALID_OPS)
1966 return *err = REG_BADRPT, NULL;
1967 else if (syntax & RE_CONTEXT_INDEP_OPS)
1969 *token = fetch_token (regexp, syntax);
1970 return parse_expression (regexp, preg, token, syntax, nest, err);
1972 /* else fall through */
1973 case OP_CLOSE_SUBEXP:
1974 if ((token->type == OP_CLOSE_SUBEXP) &&
1975 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
1976 return *err = REG_ERPAREN, NULL;
1977 /* else fall through */
1978 case OP_CLOSE_DUP_NUM:
1979 /* We treat it as a normal character. */
1981 /* Then we can these characters as normal characters. */
1982 token->type = CHARACTER;
1983 new_idx = re_dfa_add_node (dfa, *token, 0);
1984 tree = create_tree (NULL, NULL, 0, new_idx);
1985 if (BE (new_idx == -1 || tree == NULL, 0))
1986 return *err = REG_ESPACE, NULL;
1987 break;
1988 case ANCHOR:
1989 if (dfa->word_char == NULL)
1991 *err = init_word_char (dfa);
1992 if (BE (*err != REG_NOERROR, 0))
1993 return NULL;
1995 if (token->opr.ctx_type == WORD_DELIM)
1997 bin_tree_t *tree_first, *tree_last;
1998 int idx_first, idx_last;
1999 token->opr.ctx_type = WORD_FIRST;
2000 idx_first = re_dfa_add_node (dfa, *token, 0);
2001 tree_first = create_tree (NULL, NULL, 0, idx_first);
2002 token->opr.ctx_type = WORD_LAST;
2003 idx_last = re_dfa_add_node (dfa, *token, 0);
2004 tree_last = create_tree (NULL, NULL, 0, idx_last);
2005 token->type = OP_ALT;
2006 new_idx = re_dfa_add_node (dfa, *token, 0);
2007 tree = create_tree (tree_first, tree_last, 0, new_idx);
2008 if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2009 || tree_first == NULL || tree_last == NULL
2010 || tree == NULL, 0))
2011 return *err = REG_ESPACE, NULL;
2013 else
2015 new_idx = re_dfa_add_node (dfa, *token, 0);
2016 tree = create_tree (NULL, NULL, 0, new_idx);
2017 if (BE (new_idx == -1 || tree == NULL, 0))
2018 return *err = REG_ESPACE, NULL;
2020 /* We must return here, since ANCHORs can't be followed
2021 by repetition operators.
2022 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2023 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2024 *token = fetch_token (regexp, syntax);
2025 return tree;
2026 case OP_PERIOD:
2027 new_idx = re_dfa_add_node (dfa, *token, 0);
2028 tree = create_tree (NULL, NULL, 0, new_idx);
2029 if (BE (new_idx == -1 || tree == NULL, 0))
2030 return *err = REG_ESPACE, NULL;
2031 if (MB_CUR_MAX > 1)
2032 dfa->has_mb_node = 1;
2033 break;
2034 case OP_WORD:
2035 tree = build_word_op (dfa, 0, err);
2036 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2037 return NULL;
2038 break;
2039 case OP_NOTWORD:
2040 tree = build_word_op (dfa, 1, err);
2041 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2042 return NULL;
2043 break;
2044 case OP_ALT:
2045 case END_OF_RE:
2046 return NULL;
2047 case BACK_SLASH:
2048 *err = REG_EESCAPE;
2049 return NULL;
2050 default:
2051 /* Must not happen? */
2052 #ifdef DEBUG
2053 assert (0);
2054 #endif
2055 return NULL;
2057 *token = fetch_token (regexp, syntax);
2059 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2060 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2062 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2063 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2064 return NULL;
2065 dfa->has_plural_match = 1;
2068 return tree;
2071 /* This function build the following tree, from regular expression
2072 (<reg_exp>):
2073 SUBEXP
2075 <reg_exp>
2078 static bin_tree_t *
2079 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2080 re_string_t *regexp;
2081 regex_t *preg;
2082 re_token_t *token;
2083 reg_syntax_t syntax;
2084 int nest;
2085 reg_errcode_t *err;
2087 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2088 bin_tree_t *tree, *left_par, *right_par;
2089 size_t cur_nsub;
2090 int new_idx;
2091 cur_nsub = preg->re_nsub++;
2092 if (dfa->subexps_alloc < preg->re_nsub)
2094 re_subexp_t *new_array;
2095 dfa->subexps_alloc *= 2;
2096 new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2097 if (BE (new_array == NULL, 0))
2099 dfa->subexps_alloc /= 2;
2100 *err = REG_ESPACE;
2101 return NULL;
2103 dfa->subexps = new_array;
2105 dfa->subexps[cur_nsub].start = dfa->nodes_len;
2106 dfa->subexps[cur_nsub].end = -1;
2108 new_idx = re_dfa_add_node (dfa, *token, 0);
2109 left_par = create_tree (NULL, NULL, 0, new_idx);
2110 if (BE (new_idx == -1 || left_par == NULL, 0))
2111 return *err = REG_ESPACE, NULL;
2112 dfa->nodes[new_idx].opr.idx = cur_nsub;
2113 *token = fetch_token (regexp, syntax);
2115 /* The subexpression may be a null string. */
2116 if (token->type == OP_CLOSE_SUBEXP)
2117 tree = NULL;
2118 else
2120 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2121 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2122 return NULL;
2124 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2126 free_bin_tree (tree);
2127 *err = REG_BADPAT;
2128 return NULL;
2130 new_idx = re_dfa_add_node (dfa, *token, 0);
2131 dfa->subexps[cur_nsub].end = dfa->nodes_len;
2132 right_par = create_tree (NULL, NULL, 0, new_idx);
2133 tree = ((tree == NULL) ? right_par
2134 : create_tree (tree, right_par, CONCAT, 0));
2135 tree = create_tree (left_par, tree, CONCAT, 0);
2136 if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2137 return *err = REG_ESPACE, NULL;
2138 dfa->nodes[new_idx].opr.idx = cur_nsub;
2140 return tree;
2143 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2145 static bin_tree_t *
2146 parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2147 bin_tree_t *dup_elem;
2148 re_string_t *regexp;
2149 re_dfa_t *dfa;
2150 re_token_t *token;
2151 reg_syntax_t syntax;
2152 reg_errcode_t *err;
2154 re_token_t dup_token;
2155 bin_tree_t *tree = dup_elem, *work_tree;
2156 int new_idx, start_idx = re_string_cur_idx (regexp);
2157 re_token_t start_token = *token;
2158 if (token->type == OP_OPEN_DUP_NUM)
2160 int i;
2161 int end = 0;
2162 int start = fetch_number (regexp, token, syntax);
2163 bin_tree_t *elem;
2164 if (start == -1)
2166 if (token->type == CHARACTER && token->opr.c == ',')
2167 start = 0; /* We treat "{,m}" as "{0,m}". */
2168 else
2170 *err = REG_BADBR; /* <re>{} is invalid. */
2171 return NULL;
2174 if (BE (start != -2, 1))
2176 /* We treat "{n}" as "{n,n}". */
2177 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2178 : ((token->type == CHARACTER && token->opr.c == ',')
2179 ? fetch_number (regexp, token, syntax) : -2));
2181 if (BE (start == -2 || end == -2, 0))
2183 /* Invalid sequence. */
2184 if (token->type == OP_CLOSE_DUP_NUM)
2185 goto parse_dup_op_invalid_interval;
2186 else
2187 goto parse_dup_op_ebrace;
2189 if (BE (start == 0 && end == 0, 0))
2191 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2192 *token = fetch_token (regexp, syntax);
2193 free_bin_tree (dup_elem);
2194 return NULL;
2197 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2198 elem = tree;
2199 for (i = 0; i < start; ++i)
2200 if (i != 0)
2202 work_tree = duplicate_tree (elem, dfa);
2203 tree = create_tree (tree, work_tree, CONCAT, 0);
2204 if (BE (work_tree == NULL || tree == NULL, 0))
2205 goto parse_dup_op_espace;
2208 if (end == -1)
2210 /* We treat "<re>{0,}" as "<re>*". */
2211 dup_token.type = OP_DUP_ASTERISK;
2212 if (start > 0)
2214 elem = duplicate_tree (elem, dfa);
2215 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2216 work_tree = create_tree (elem, NULL, 0, new_idx);
2217 tree = create_tree (tree, work_tree, CONCAT, 0);
2218 if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2219 || tree == NULL, 0))
2220 goto parse_dup_op_espace;
2222 else
2224 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2225 tree = create_tree (elem, NULL, 0, new_idx);
2226 if (BE (new_idx == -1 || tree == NULL, 0))
2227 goto parse_dup_op_espace;
2230 else if (end - start > 0)
2232 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2233 dup_token.type = OP_DUP_QUESTION;
2234 if (start > 0)
2236 elem = duplicate_tree (elem, dfa);
2237 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2238 elem = create_tree (elem, NULL, 0, new_idx);
2239 tree = create_tree (tree, elem, CONCAT, 0);
2240 if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2241 goto parse_dup_op_espace;
2243 else
2245 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2246 tree = elem = create_tree (elem, NULL, 0, new_idx);
2247 if (BE (new_idx == -1 || tree == NULL, 0))
2248 goto parse_dup_op_espace;
2250 for (i = 1; i < end - start; ++i)
2252 work_tree = duplicate_tree (elem, dfa);
2253 tree = create_tree (tree, work_tree, CONCAT, 0);
2254 if (BE (work_tree == NULL || tree == NULL, 0))
2255 return *err = REG_ESPACE, NULL;
2259 else
2261 new_idx = re_dfa_add_node (dfa, *token, 0);
2262 tree = create_tree (tree, NULL, 0, new_idx);
2263 if (BE (new_idx == -1 || tree == NULL, 0))
2264 return *err = REG_ESPACE, NULL;
2266 *token = fetch_token (regexp, syntax);
2267 return tree;
2269 parse_dup_op_espace:
2270 free_bin_tree (tree);
2271 *err = REG_ESPACE;
2272 return NULL;
2274 parse_dup_op_ebrace:
2275 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2277 *err = REG_EBRACE;
2278 return NULL;
2280 goto parse_dup_op_rollback;
2281 parse_dup_op_invalid_interval:
2282 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2284 *err = REG_BADBR;
2285 return NULL;
2287 parse_dup_op_rollback:
2288 re_string_set_index (regexp, start_idx);
2289 *token = start_token;
2290 token->type = CHARACTER;
2291 return dup_elem;
2294 /* Size of the names for collating symbol/equivalence_class/character_class.
2295 I'm not sure, but maybe enough. */
2296 #define BRACKET_NAME_BUF_SIZE 32
2298 #ifndef _LIBC
2299 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2300 Build the range expression which starts from START_ELEM, and ends
2301 at END_ELEM. The result are written to MBCSET and SBCSET.
2302 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2303 mbcset->range_ends, is a pointer argument sinse we may
2304 update it. */
2306 static reg_errcode_t
2307 # ifdef RE_ENABLE_I18N
2308 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2309 re_charset_t *mbcset;
2310 int *range_alloc;
2311 # else /* not RE_ENABLE_I18N */
2312 build_range_exp (sbcset, start_elem, end_elem)
2313 # endif /* not RE_ENABLE_I18N */
2314 re_bitset_ptr_t sbcset;
2315 bracket_elem_t *start_elem, *end_elem;
2317 unsigned int start_ch, end_ch;
2318 /* Equivalence Classes and Character Classes can't be a range start/end. */
2319 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2320 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2322 return REG_ERANGE;
2324 /* We can handle no multi character collating elements without libc
2325 support. */
2326 if (BE ((start_elem->type == COLL_SYM
2327 && strlen ((char *) start_elem->opr.name) > 1)
2328 || (end_elem->type == COLL_SYM
2329 && strlen ((char *) end_elem->opr.name) > 1), 0))
2330 return REG_ECOLLATE;
2332 # ifdef RE_ENABLE_I18N
2334 wchar_t wc, start_wc, end_wc;
2335 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2337 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2338 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2339 : 0));
2340 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2341 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2342 : 0));
2343 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2344 ? __btowc (start_ch) : start_elem->opr.wch);
2345 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2346 ? __btowc (end_ch) : end_elem->opr.wch);
2347 cmp_buf[0] = start_wc;
2348 cmp_buf[4] = end_wc;
2349 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2350 return REG_ERANGE;
2352 /* Check the space of the arrays. */
2353 if (*range_alloc == mbcset->nranges)
2355 /* There are not enough space, need realloc. */
2356 wchar_t *new_array_start, *new_array_end;
2357 int new_nranges;
2359 /* +1 in case of mbcset->nranges is 0. */
2360 new_nranges = 2 * mbcset->nranges + 1;
2361 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2362 are NULL if *range_alloc == 0. */
2363 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2364 new_nranges);
2365 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2366 new_nranges);
2368 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2369 return REG_ESPACE;
2371 mbcset->range_starts = new_array_start;
2372 mbcset->range_ends = new_array_end;
2373 *range_alloc = new_nranges;
2376 mbcset->range_starts[mbcset->nranges] = start_wc;
2377 mbcset->range_ends[mbcset->nranges++] = end_wc;
2379 /* Build the table for single byte characters. */
2380 for (wc = 0; wc <= SBC_MAX; ++wc)
2382 cmp_buf[2] = wc;
2383 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2384 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2385 bitset_set (sbcset, wc);
2388 # else /* not RE_ENABLE_I18N */
2390 unsigned int ch;
2391 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2392 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2393 : 0));
2394 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2395 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2396 : 0));
2397 if (start_ch > end_ch)
2398 return REG_ERANGE;
2399 /* Build the table for single byte characters. */
2400 for (ch = 0; ch <= SBC_MAX; ++ch)
2401 if (start_ch <= ch && ch <= end_ch)
2402 bitset_set (sbcset, ch);
2404 # endif /* not RE_ENABLE_I18N */
2405 return REG_NOERROR;
2407 #endif /* not _LIBC */
2409 #ifndef _LIBC
2410 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2411 Build the collating element which is represented by NAME.
2412 The result are written to MBCSET and SBCSET.
2413 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2414 pointer argument since we may update it. */
2416 static reg_errcode_t
2417 # ifdef RE_ENABLE_I18N
2418 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2419 re_charset_t *mbcset;
2420 int *coll_sym_alloc;
2421 # else /* not RE_ENABLE_I18N */
2422 build_collating_symbol (sbcset, name)
2423 # endif /* not RE_ENABLE_I18N */
2424 re_bitset_ptr_t sbcset;
2425 const unsigned char *name;
2427 size_t name_len = strlen ((const char *) name);
2428 if (BE (name_len != 1, 0))
2429 return REG_ECOLLATE;
2430 else
2432 bitset_set (sbcset, name[0]);
2433 return REG_NOERROR;
2436 #endif /* not _LIBC */
2438 /* This function parse bracket expression like "[abc]", "[a-c]",
2439 "[[.a-a.]]" etc. */
2441 static bin_tree_t *
2442 parse_bracket_exp (regexp, dfa, token, syntax, err)
2443 re_string_t *regexp;
2444 re_dfa_t *dfa;
2445 re_token_t *token;
2446 reg_syntax_t syntax;
2447 reg_errcode_t *err;
2449 #ifdef _LIBC
2450 const unsigned char *collseqmb;
2451 const char *collseqwc;
2452 uint32_t nrules;
2453 int32_t table_size;
2454 const int32_t *symb_table;
2455 const unsigned char *extra;
2457 /* Local function for parse_bracket_exp used in _LIBC environement.
2458 Seek the collating symbol entry correspondings to NAME.
2459 Return the index of the symbol in the SYMB_TABLE. */
2461 static inline int32_t
2462 seek_collating_symbol_entry (name, name_len)
2463 const unsigned char *name;
2464 size_t name_len;
2466 int32_t hash = elem_hash ((const char *) name, name_len);
2467 int32_t elem = hash % table_size;
2468 int32_t second = hash % (table_size - 2);
2469 while (symb_table[2 * elem] != 0)
2471 /* First compare the hashing value. */
2472 if (symb_table[2 * elem] == hash
2473 /* Compare the length of the name. */
2474 && name_len == extra[symb_table[2 * elem + 1]]
2475 /* Compare the name. */
2476 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2477 name_len) == 0)
2479 /* Yep, this is the entry. */
2480 break;
2483 /* Next entry. */
2484 elem += second;
2486 return elem;
2489 /* Local function for parse_bracket_exp used in _LIBC environement.
2490 Look up the collation sequence value of BR_ELEM.
2491 Return the value if succeeded, UINT_MAX otherwise. */
2493 static inline unsigned int
2494 lookup_collation_sequence_value (br_elem)
2495 bracket_elem_t *br_elem;
2497 if (br_elem->type == SB_CHAR)
2500 if (MB_CUR_MAX == 1)
2502 if (nrules == 0)
2503 return collseqmb[br_elem->opr.ch];
2504 else
2506 wint_t wc = __btowc (br_elem->opr.ch);
2507 return collseq_table_lookup (collseqwc, wc);
2510 else if (br_elem->type == MB_CHAR)
2512 return collseq_table_lookup (collseqwc, br_elem->opr.wch);
2514 else if (br_elem->type == COLL_SYM)
2516 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2517 if (nrules != 0)
2519 int32_t elem, idx;
2520 elem = seek_collating_symbol_entry (br_elem->opr.name,
2521 sym_name_len);
2522 if (symb_table[2 * elem] != 0)
2524 /* We found the entry. */
2525 idx = symb_table[2 * elem + 1];
2526 /* Skip the name of collating element name. */
2527 idx += 1 + extra[idx];
2528 /* Skip the byte sequence of the collating element. */
2529 idx += 1 + extra[idx];
2530 /* Adjust for the alignment. */
2531 idx = (idx + 3) & ~3;
2532 /* Skip the multibyte collation sequence value. */
2533 idx += sizeof (unsigned int);
2534 /* Skip the wide char sequence of the collating element. */
2535 idx += sizeof (unsigned int) *
2536 (1 + *(unsigned int *) (extra + idx));
2537 /* Return the collation sequence value. */
2538 return *(unsigned int *) (extra + idx);
2540 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2542 /* No valid character. Match it as a single byte
2543 character. */
2544 return collseqmb[br_elem->opr.name[0]];
2547 else if (sym_name_len == 1)
2548 return collseqmb[br_elem->opr.name[0]];
2550 return UINT_MAX;
2553 /* Local function for parse_bracket_exp used in _LIBC environement.
2554 Build the range expression which starts from START_ELEM, and ends
2555 at END_ELEM. The result are written to MBCSET and SBCSET.
2556 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2557 mbcset->range_ends, is a pointer argument sinse we may
2558 update it. */
2560 static inline reg_errcode_t
2561 # ifdef RE_ENABLE_I18N
2562 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2563 re_charset_t *mbcset;
2564 int *range_alloc;
2565 # else /* not RE_ENABLE_I18N */
2566 build_range_exp (sbcset, start_elem, end_elem)
2567 # endif /* not RE_ENABLE_I18N */
2568 re_bitset_ptr_t sbcset;
2569 bracket_elem_t *start_elem, *end_elem;
2571 unsigned int ch;
2572 uint32_t start_collseq;
2573 uint32_t end_collseq;
2575 # ifdef RE_ENABLE_I18N
2576 /* Check the space of the arrays. */
2577 if (*range_alloc == mbcset->nranges)
2579 /* There are not enough space, need realloc. */
2580 uint32_t *new_array_start;
2581 uint32_t *new_array_end;
2582 int new_nranges;
2584 /* +1 in case of mbcset->nranges is 0. */
2585 new_nranges = 2 * mbcset->nranges + 1;
2586 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2587 are NULL if *range_alloc == 0. */
2588 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2589 new_nranges);
2590 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2591 new_nranges);
2593 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2594 return REG_ESPACE;
2596 mbcset->range_starts = new_array_start;
2597 mbcset->range_ends = new_array_end;
2598 *range_alloc = new_nranges;
2600 # endif /* RE_ENABLE_I18N */
2602 /* Equivalence Classes and Character Classes can't be a range
2603 start/end. */
2604 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2605 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2607 return REG_ERANGE;
2609 start_collseq = lookup_collation_sequence_value (start_elem);
2610 end_collseq = lookup_collation_sequence_value (end_elem);
2611 /* Check start/end collation sequence values. */
2612 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2613 return REG_ECOLLATE;
2614 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2615 return REG_ERANGE;
2617 # ifdef RE_ENABLE_I18N
2618 /* Got valid collation sequence values, add them as a new entry. */
2619 mbcset->range_starts[mbcset->nranges] = start_collseq;
2620 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2621 # endif /* RE_ENABLE_I18N */
2623 /* Build the table for single byte characters. */
2624 for (ch = 0; ch <= SBC_MAX; ch++)
2626 uint32_t ch_collseq;
2628 if (MB_CUR_MAX == 1)
2630 if (nrules == 0)
2631 ch_collseq = collseqmb[ch];
2632 else
2633 ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch));
2634 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2635 bitset_set (sbcset, ch);
2637 return REG_NOERROR;
2640 /* Local function for parse_bracket_exp used in _LIBC environement.
2641 Build the collating element which is represented by NAME.
2642 The result are written to MBCSET and SBCSET.
2643 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2644 pointer argument sinse we may update it. */
2646 static inline reg_errcode_t
2647 # ifdef RE_ENABLE_I18N
2648 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2649 re_charset_t *mbcset;
2650 int *coll_sym_alloc;
2651 # else /* not RE_ENABLE_I18N */
2652 build_collating_symbol (sbcset, name)
2653 # endif /* not RE_ENABLE_I18N */
2654 re_bitset_ptr_t sbcset;
2655 const unsigned char *name;
2657 int32_t elem, idx;
2658 size_t name_len = strlen ((const char *) name);
2659 if (nrules != 0)
2661 elem = seek_collating_symbol_entry (name, name_len);
2662 if (symb_table[2 * elem] != 0)
2664 /* We found the entry. */
2665 idx = symb_table[2 * elem + 1];
2666 /* Skip the name of collating element name. */
2667 idx += 1 + extra[idx];
2669 else if (symb_table[2 * elem] == 0 && name_len == 1)
2671 /* No valid character, treat it as a normal
2672 character. */
2673 bitset_set (sbcset, name[0]);
2674 return REG_NOERROR;
2676 else
2677 return REG_ECOLLATE;
2679 # ifdef RE_ENABLE_I18N
2680 /* Got valid collation sequence, add it as a new entry. */
2681 /* Check the space of the arrays. */
2682 if (*coll_sym_alloc == mbcset->ncoll_syms)
2684 /* Not enough, realloc it. */
2685 /* +1 in case of mbcset->ncoll_syms is 0. */
2686 *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2687 /* Use realloc since mbcset->coll_syms is NULL
2688 if *alloc == 0. */
2689 mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2690 *coll_sym_alloc);
2691 if (BE (mbcset->coll_syms == NULL, 0))
2692 return REG_ESPACE;
2694 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2695 # endif /* RE_ENABLE_I18N */
2696 return REG_NOERROR;
2698 else
2700 if (BE (name_len != 1, 0))
2701 return REG_ECOLLATE;
2702 else
2704 bitset_set (sbcset, name[0]);
2705 return REG_NOERROR;
2709 #endif
2711 re_token_t br_token;
2712 re_bitset_ptr_t sbcset;
2713 #ifdef RE_ENABLE_I18N
2714 re_charset_t *mbcset;
2715 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2716 int equiv_class_alloc = 0, char_class_alloc = 0;
2717 #else /* not RE_ENABLE_I18N */
2718 int non_match = 0;
2719 #endif /* not RE_ENABLE_I18N */
2720 bin_tree_t *work_tree;
2721 int token_len, new_idx;
2722 #ifdef _LIBC
2723 collseqmb = (const unsigned char *)
2724 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2725 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2726 if (nrules)
2729 if (MB_CUR_MAX > 1)
2731 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2732 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2733 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2734 _NL_COLLATE_SYMB_TABLEMB);
2735 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2736 _NL_COLLATE_SYMB_EXTRAMB);
2738 #endif
2739 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2740 #ifdef RE_ENABLE_I18N
2741 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2742 #endif /* RE_ENABLE_I18N */
2743 #ifdef RE_ENABLE_I18N
2744 if (BE (sbcset == NULL || mbcset == NULL, 0))
2745 #else
2746 if (BE (sbcset == NULL, 0))
2747 #endif /* RE_ENABLE_I18N */
2749 *err = REG_ESPACE;
2750 return NULL;
2753 token_len = peek_token_bracket (token, regexp, syntax);
2754 if (BE (token->type == END_OF_RE, 0))
2756 *err = REG_BADPAT;
2757 goto parse_bracket_exp_free_return;
2759 if (token->type == OP_NON_MATCH_LIST)
2761 #ifdef RE_ENABLE_I18N
2762 int i;
2763 mbcset->non_match = 1;
2764 #else /* not RE_ENABLE_I18N */
2765 non_match = 1;
2766 #endif /* not RE_ENABLE_I18N */
2767 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2768 bitset_set (sbcset, '\0');
2769 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2770 token_len = peek_token_bracket (token, regexp, syntax);
2771 if (BE (token->type == END_OF_RE, 0))
2773 *err = REG_BADPAT;
2774 goto parse_bracket_exp_free_return;
2776 #ifdef RE_ENABLE_I18N
2777 if (MB_CUR_MAX > 1)
2778 for (i = 0; i < SBC_MAX; ++i)
2779 if (__btowc (i) == WEOF)
2780 bitset_set (sbcset, i);
2781 #endif /* RE_ENABLE_I18N */
2784 /* We treat the first ']' as a normal character. */
2785 if (token->type == OP_CLOSE_BRACKET)
2786 token->type = CHARACTER;
2788 while (1)
2790 bracket_elem_t start_elem, end_elem;
2791 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2792 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
2793 reg_errcode_t ret;
2794 int token_len2 = 0, is_range_exp = 0;
2795 re_token_t token2;
2797 start_elem.opr.name = start_name_buf;
2798 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
2799 syntax);
2800 if (BE (ret != REG_NOERROR, 0))
2802 *err = ret;
2803 goto parse_bracket_exp_free_return;
2806 token_len = peek_token_bracket (token, regexp, syntax);
2807 if (BE (token->type == END_OF_RE, 0))
2809 *err = REG_BADPAT;
2810 goto parse_bracket_exp_free_return;
2812 if (token->type == OP_CHARSET_RANGE)
2814 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
2815 token_len2 = peek_token_bracket (&token2, regexp, syntax);
2816 if (BE (token->type == END_OF_RE, 0))
2818 *err = REG_BADPAT;
2819 goto parse_bracket_exp_free_return;
2821 if (token2.type == OP_CLOSE_BRACKET)
2823 /* We treat the last '-' as a normal character. */
2824 re_string_skip_bytes (regexp, -token_len);
2825 token->type = CHARACTER;
2827 else
2828 is_range_exp = 1;
2831 if (is_range_exp == 1)
2833 end_elem.opr.name = end_name_buf;
2834 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
2835 dfa, syntax);
2836 if (BE (ret != REG_NOERROR, 0))
2838 *err = ret;
2839 goto parse_bracket_exp_free_return;
2842 token_len = peek_token_bracket (token, regexp, syntax);
2843 if (BE (token->type == END_OF_RE, 0))
2845 *err = REG_BADPAT;
2846 goto parse_bracket_exp_free_return;
2848 *err = build_range_exp (sbcset,
2849 #ifdef RE_ENABLE_I18N
2850 mbcset, &range_alloc,
2851 #endif /* RE_ENABLE_I18N */
2852 &start_elem, &end_elem);
2853 if (BE (*err != REG_NOERROR, 0))
2854 goto parse_bracket_exp_free_return;
2856 else
2858 switch (start_elem.type)
2860 case SB_CHAR:
2861 bitset_set (sbcset, start_elem.opr.ch);
2862 break;
2863 #ifdef RE_ENABLE_I18N
2864 case MB_CHAR:
2865 /* Check whether the array has enough space. */
2866 if (mbchar_alloc == mbcset->nmbchars)
2868 /* Not enough, realloc it. */
2869 /* +1 in case of mbcset->nmbchars is 0. */
2870 mbchar_alloc = 2 * mbcset->nmbchars + 1;
2871 /* Use realloc since array is NULL if *alloc == 0. */
2872 mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
2873 mbchar_alloc);
2874 if (BE (mbcset->mbchars == NULL, 0))
2875 goto parse_bracket_exp_espace;
2877 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
2878 break;
2879 #endif /* RE_ENABLE_I18N */
2880 case EQUIV_CLASS:
2881 *err = build_equiv_class (sbcset,
2882 #ifdef RE_ENABLE_I18N
2883 mbcset, &equiv_class_alloc,
2884 #endif /* RE_ENABLE_I18N */
2885 start_elem.opr.name);
2886 if (BE (*err != REG_NOERROR, 0))
2887 goto parse_bracket_exp_free_return;
2888 break;
2889 case COLL_SYM:
2890 *err = build_collating_symbol (sbcset,
2891 #ifdef RE_ENABLE_I18N
2892 mbcset, &coll_sym_alloc,
2893 #endif /* RE_ENABLE_I18N */
2894 start_elem.opr.name);
2895 if (BE (*err != REG_NOERROR, 0))
2896 goto parse_bracket_exp_free_return;
2897 break;
2898 case CHAR_CLASS:
2899 ret = build_charclass (sbcset,
2900 #ifdef RE_ENABLE_I18N
2901 mbcset, &char_class_alloc,
2902 #endif /* RE_ENABLE_I18N */
2903 start_elem.opr.name, syntax);
2904 if (BE (ret != REG_NOERROR, 0))
2905 goto parse_bracket_exp_espace;
2906 break;
2907 default:
2908 assert (0);
2909 break;
2912 if (token->type == OP_CLOSE_BRACKET)
2913 break;
2916 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2918 /* If it is non-matching list. */
2919 #ifdef RE_ENABLE_I18N
2920 if (mbcset->non_match)
2921 #else /* not RE_ENABLE_I18N */
2922 if (non_match)
2923 #endif /* not RE_ENABLE_I18N */
2924 bitset_not (sbcset);
2926 /* Build a tree for simple bracket. */
2927 br_token.type = SIMPLE_BRACKET;
2928 br_token.opr.sbcset = sbcset;
2929 new_idx = re_dfa_add_node (dfa, br_token, 0);
2930 work_tree = create_tree (NULL, NULL, 0, new_idx);
2931 if (BE (new_idx == -1 || work_tree == NULL, 0))
2932 goto parse_bracket_exp_espace;
2934 #ifdef RE_ENABLE_I18N
2935 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
2936 || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
2937 || mbcset->non_match)))
2939 re_token_t alt_token;
2940 bin_tree_t *mbc_tree;
2941 /* Build a tree for complex bracket. */
2942 br_token.type = COMPLEX_BRACKET;
2943 br_token.opr.mbcset = mbcset;
2944 dfa->has_mb_node = 1;
2945 new_idx = re_dfa_add_node (dfa, br_token, 0);
2946 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
2947 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
2948 goto parse_bracket_exp_espace;
2949 /* Then join them by ALT node. */
2950 dfa->has_plural_match = 1;
2951 alt_token.type = OP_ALT;
2952 new_idx = re_dfa_add_node (dfa, alt_token, 0);
2953 work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
2954 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
2955 return work_tree;
2957 else
2959 free_charset (mbcset);
2960 return work_tree;
2962 #else /* not RE_ENABLE_I18N */
2963 return work_tree;
2964 #endif /* not RE_ENABLE_I18N */
2966 parse_bracket_exp_espace:
2967 *err = REG_ESPACE;
2968 parse_bracket_exp_free_return:
2969 re_free (sbcset);
2970 #ifdef RE_ENABLE_I18N
2971 free_charset (mbcset);
2972 #endif /* RE_ENABLE_I18N */
2973 return NULL;
2976 /* Parse an element in the bracket expression. */
2978 static reg_errcode_t
2979 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
2980 bracket_elem_t *elem;
2981 re_string_t *regexp;
2982 re_token_t *token;
2983 int token_len;
2984 re_dfa_t *dfa;
2985 reg_syntax_t syntax;
2987 #ifdef RE_ENABLE_I18N
2988 int cur_char_size;
2989 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
2990 if (cur_char_size > 1)
2992 elem->type = MB_CHAR;
2993 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
2994 re_string_skip_bytes (regexp, cur_char_size);
2995 return REG_NOERROR;
2997 #endif /* RE_ENABLE_I18N */
2998 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2999 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3000 || token->type == OP_OPEN_EQUIV_CLASS)
3001 return parse_bracket_symbol (elem, regexp, token);
3002 elem->type = SB_CHAR;
3003 elem->opr.ch = token->opr.c;
3004 return REG_NOERROR;
3007 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3008 such as [:<character_class>:], [.<collating_element>.], and
3009 [=<equivalent_class>=]. */
3011 static reg_errcode_t
3012 parse_bracket_symbol (elem, regexp, token)
3013 bracket_elem_t *elem;
3014 re_string_t *regexp;
3015 re_token_t *token;
3017 unsigned char ch, delim = token->opr.c;
3018 int i = 0;
3019 for (;; ++i)
3021 if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
3022 return REG_EBRACK;
3023 if (token->type == OP_OPEN_CHAR_CLASS)
3024 ch = re_string_fetch_byte_case (regexp);
3025 else
3026 ch = re_string_fetch_byte (regexp);
3027 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3028 break;
3029 elem->opr.name[i] = ch;
3031 re_string_skip_bytes (regexp, 1);
3032 elem->opr.name[i] = '\0';
3033 switch (token->type)
3035 case OP_OPEN_COLL_ELEM:
3036 elem->type = COLL_SYM;
3037 break;
3038 case OP_OPEN_EQUIV_CLASS:
3039 elem->type = EQUIV_CLASS;
3040 break;
3041 case OP_OPEN_CHAR_CLASS:
3042 elem->type = CHAR_CLASS;
3043 break;
3044 default:
3045 break;
3047 return REG_NOERROR;
3050 /* Helper function for parse_bracket_exp.
3051 Build the equivalence class which is represented by NAME.
3052 The result are written to MBCSET and SBCSET.
3053 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3054 is a pointer argument sinse we may update it. */
3056 static reg_errcode_t
3057 #ifdef RE_ENABLE_I18N
3058 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3059 re_charset_t *mbcset;
3060 int *equiv_class_alloc;
3061 #else /* not RE_ENABLE_I18N */
3062 build_equiv_class (sbcset, name)
3063 #endif /* not RE_ENABLE_I18N */
3064 re_bitset_ptr_t sbcset;
3065 const unsigned char *name;
3067 #if defined _LIBC && defined RE_ENABLE_I18N
3068 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3069 if (nrules != 0)
3071 const int32_t *table, *indirect;
3072 const unsigned char *weights, *extra, *cp;
3073 unsigned char char_buf[2];
3074 int32_t idx1, idx2;
3075 unsigned int ch;
3076 size_t len;
3077 /* This #include defines a local function! */
3078 # include <locale/weight.h>
3079 /* Calculate the index for equivalence class. */
3080 cp = name;
3081 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3082 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3083 _NL_COLLATE_WEIGHTMB);
3084 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3085 _NL_COLLATE_EXTRAMB);
3086 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3087 _NL_COLLATE_INDIRECTMB);
3088 idx1 = findidx (&cp);
3089 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3090 /* This isn't a valid character. */
3091 return REG_ECOLLATE;
3093 /* Build single byte matcing table for this equivalence class. */
3094 char_buf[1] = (unsigned char) '\0';
3095 len = weights[idx1];
3096 for (ch = 0; ch < SBC_MAX; ++ch)
3098 char_buf[0] = ch;
3099 cp = char_buf;
3100 idx2 = findidx (&cp);
3102 idx2 = table[ch];
3104 if (idx2 == 0)
3105 /* This isn't a valid character. */
3106 continue;
3107 if (len == weights[idx2])
3109 int cnt = 0;
3110 while (cnt <= len &&
3111 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3112 ++cnt;
3114 if (cnt > len)
3115 bitset_set (sbcset, ch);
3118 /* Check whether the array has enough space. */
3119 if (*equiv_class_alloc == mbcset->nequiv_classes)
3121 /* Not enough, realloc it. */
3122 /* +1 in case of mbcset->nequiv_classes is 0. */
3123 *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3124 /* Use realloc since the array is NULL if *alloc == 0. */
3125 mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3126 *equiv_class_alloc);
3127 if (BE (mbcset->equiv_classes == NULL, 0))
3128 return REG_ESPACE;
3130 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3132 else
3133 #endif /* _LIBC && RE_ENABLE_I18N */
3135 if (BE (strlen ((const char *) name) != 1, 0))
3136 return REG_ECOLLATE;
3137 bitset_set (sbcset, *name);
3139 return REG_NOERROR;
3142 /* Helper function for parse_bracket_exp.
3143 Build the character class which is represented by NAME.
3144 The result are written to MBCSET and SBCSET.
3145 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3146 is a pointer argument sinse we may update it. */
3148 static reg_errcode_t
3149 #ifdef RE_ENABLE_I18N
3150 build_charclass (sbcset, mbcset, char_class_alloc, class_name, syntax)
3151 re_charset_t *mbcset;
3152 int *char_class_alloc;
3153 #else /* not RE_ENABLE_I18N */
3154 build_charclass (sbcset, class_name, syntax)
3155 #endif /* not RE_ENABLE_I18N */
3156 re_bitset_ptr_t sbcset;
3157 const unsigned char *class_name;
3158 reg_syntax_t syntax;
3160 int i;
3161 const char *name = (const char *) class_name;
3163 /* In case of REG_ICASE "upper" and "lower" match the both of
3164 upper and lower cases. */
3165 if ((syntax & RE_ICASE)
3166 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3167 name = "alpha";
3169 #ifdef RE_ENABLE_I18N
3170 /* Check the space of the arrays. */
3171 if (*char_class_alloc == mbcset->nchar_classes)
3173 /* Not enough, realloc it. */
3174 /* +1 in case of mbcset->nchar_classes is 0. */
3175 *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3176 /* Use realloc since array is NULL if *alloc == 0. */
3177 mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3178 *char_class_alloc);
3179 if (BE (mbcset->char_classes == NULL, 0))
3180 return REG_ESPACE;
3182 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3183 #endif /* RE_ENABLE_I18N */
3185 #define BUILD_CHARCLASS_LOOP(ctype_func)\
3186 for (i = 0; i < SBC_MAX; ++i) \
3188 if (ctype_func (i)) \
3189 bitset_set (sbcset, i); \
3192 if (strcmp (name, "alnum") == 0)
3193 BUILD_CHARCLASS_LOOP (isalnum)
3194 else if (strcmp (name, "cntrl") == 0)
3195 BUILD_CHARCLASS_LOOP (iscntrl)
3196 else if (strcmp (name, "lower") == 0)
3197 BUILD_CHARCLASS_LOOP (islower)
3198 else if (strcmp (name, "space") == 0)
3199 BUILD_CHARCLASS_LOOP (isspace)
3200 else if (strcmp (name, "alpha") == 0)
3201 BUILD_CHARCLASS_LOOP (isalpha)
3202 else if (strcmp (name, "digit") == 0)
3203 BUILD_CHARCLASS_LOOP (isdigit)
3204 else if (strcmp (name, "print") == 0)
3205 BUILD_CHARCLASS_LOOP (isprint)
3206 else if (strcmp (name, "upper") == 0)
3207 BUILD_CHARCLASS_LOOP (isupper)
3208 else if (strcmp (name, "blank") == 0)
3209 BUILD_CHARCLASS_LOOP (isblank)
3210 else if (strcmp (name, "graph") == 0)
3211 BUILD_CHARCLASS_LOOP (isgraph)
3212 else if (strcmp (name, "punct") == 0)
3213 BUILD_CHARCLASS_LOOP (ispunct)
3214 else if (strcmp (name, "xdigit") == 0)
3215 BUILD_CHARCLASS_LOOP (isxdigit)
3216 else
3217 return REG_ECTYPE;
3219 return REG_NOERROR;
3222 static bin_tree_t *
3223 build_word_op (dfa, not, err)
3224 re_dfa_t *dfa;
3225 int not;
3226 reg_errcode_t *err;
3228 re_bitset_ptr_t sbcset;
3229 #ifdef RE_ENABLE_I18N
3230 re_charset_t *mbcset;
3231 int alloc = 0;
3232 #else /* not RE_ENABLE_I18N */
3233 int non_match = 0;
3234 #endif /* not RE_ENABLE_I18N */
3235 reg_errcode_t ret;
3236 re_token_t br_token;
3237 bin_tree_t *tree;
3238 int new_idx;
3240 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3241 #ifdef RE_ENABLE_I18N
3242 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3243 #endif /* RE_ENABLE_I18N */
3245 #ifdef RE_ENABLE_I18N
3246 if (BE (sbcset == NULL || mbcset == NULL, 0))
3247 #else /* not RE_ENABLE_I18N */
3248 if (BE (sbcset == NULL, 0))
3249 #endif /* not RE_ENABLE_I18N */
3251 *err = REG_ESPACE;
3252 return NULL;
3255 if (not)
3257 #ifdef RE_ENABLE_I18N
3258 int i;
3260 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3261 bitset_set(cset->sbcset, '\0');
3263 mbcset->non_match = 1;
3264 if (MB_CUR_MAX > 1)
3265 for (i = 0; i < SBC_MAX; ++i)
3266 if (__btowc (i) == WEOF)
3267 bitset_set (sbcset, i);
3268 #else /* not RE_ENABLE_I18N */
3269 non_match = 1;
3270 #endif /* not RE_ENABLE_I18N */
3273 /* We don't care the syntax in this case. */
3274 ret = build_charclass (sbcset,
3275 #ifdef RE_ENABLE_I18N
3276 mbcset, &alloc,
3277 #endif /* RE_ENABLE_I18N */
3278 (const unsigned char *) "alpha", 0);
3280 if (BE (ret != REG_NOERROR, 0))
3282 re_free (sbcset);
3283 #ifdef RE_ENABLE_I18N
3284 free_charset (mbcset);
3285 #endif /* RE_ENABLE_I18N */
3286 *err = REG_ESPACE;
3287 return NULL;
3289 /* \w match '_' also. */
3290 bitset_set (sbcset, '_');
3292 /* If it is non-matching list. */
3293 #ifdef RE_ENABLE_I18N
3294 if (mbcset->non_match)
3295 #else /* not RE_ENABLE_I18N */
3296 if (non_match)
3297 #endif /* not RE_ENABLE_I18N */
3298 bitset_not (sbcset);
3300 /* Build a tree for simple bracket. */
3301 br_token.type = SIMPLE_BRACKET;
3302 br_token.opr.sbcset = sbcset;
3303 new_idx = re_dfa_add_node (dfa, br_token, 0);
3304 tree = create_tree (NULL, NULL, 0, new_idx);
3305 if (BE (new_idx == -1 || tree == NULL, 0))
3306 goto build_word_op_espace;
3308 #ifdef RE_ENABLE_I18N
3309 if (MB_CUR_MAX > 1)
3311 re_token_t alt_token;
3312 bin_tree_t *mbc_tree;
3313 /* Build a tree for complex bracket. */
3314 br_token.type = COMPLEX_BRACKET;
3315 br_token.opr.mbcset = mbcset;
3316 dfa->has_mb_node = 1;
3317 new_idx = re_dfa_add_node (dfa, br_token, 0);
3318 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3319 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3320 goto build_word_op_espace;
3321 /* Then join them by ALT node. */
3322 alt_token.type = OP_ALT;
3323 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3324 tree = create_tree (tree, mbc_tree, 0, new_idx);
3325 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3326 return tree;
3328 else
3330 free_charset (mbcset);
3331 return tree;
3333 #else /* not RE_ENABLE_I18N */
3334 return tree;
3335 #endif /* not RE_ENABLE_I18N */
3337 build_word_op_espace:
3338 re_free (sbcset);
3339 #ifdef RE_ENABLE_I18N
3340 free_charset (mbcset);
3341 #endif /* RE_ENABLE_I18N */
3342 *err = REG_ESPACE;
3343 return NULL;
3346 /* This is intended for the expressions like "a{1,3}".
3347 Fetch a number from `input', and return the number.
3348 Return -1, if the number field is empty like "{,1}".
3349 Return -2, If an error is occured. */
3351 static int
3352 fetch_number (input, token, syntax)
3353 re_string_t *input;
3354 re_token_t *token;
3355 reg_syntax_t syntax;
3357 int num = -1;
3358 unsigned char c;
3359 while (1)
3361 *token = fetch_token (input, syntax);
3362 c = token->opr.c;
3363 if (BE (token->type == END_OF_RE, 0))
3364 return -2;
3365 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3366 break;
3367 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3368 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3369 num = (num > RE_DUP_MAX) ? -2 : num;
3371 return num;
3374 #ifdef RE_ENABLE_I18N
3375 static void
3376 free_charset (re_charset_t *cset)
3378 re_free (cset->mbchars);
3379 # ifdef _LIBC
3380 re_free (cset->coll_syms);
3381 re_free (cset->equiv_classes);
3382 re_free (cset->range_starts);
3383 re_free (cset->range_ends);
3384 # endif
3385 re_free (cset->char_classes);
3386 re_free (cset);
3388 #endif /* RE_ENABLE_I18N */
3390 /* Functions for binary tree operation. */
3392 /* Create a node of tree.
3393 Note: This function automatically free left and right if malloc fails. */
3395 static bin_tree_t *
3396 create_tree (left, right, type, index)
3397 bin_tree_t *left;
3398 bin_tree_t *right;
3399 re_token_type_t type;
3400 int index;
3402 bin_tree_t *tree;
3403 tree = re_malloc (bin_tree_t, 1);
3404 if (BE (tree == NULL, 0))
3406 free_bin_tree (left);
3407 free_bin_tree (right);
3408 return NULL;
3410 tree->parent = NULL;
3411 tree->left = left;
3412 tree->right = right;
3413 tree->type = type;
3414 tree->node_idx = index;
3415 tree->first = -1;
3416 tree->next = -1;
3417 re_node_set_init_empty (&tree->eclosure);
3419 if (left != NULL)
3420 left->parent = tree;
3421 if (right != NULL)
3422 right->parent = tree;
3423 return tree;
3426 /* Free the sub tree pointed by TREE. */
3428 static void
3429 free_bin_tree (tree)
3430 bin_tree_t *tree;
3432 if (tree == NULL)
3433 return;
3434 /*re_node_set_free (&tree->eclosure);*/
3435 free_bin_tree (tree->left);
3436 free_bin_tree (tree->right);
3437 re_free (tree);
3440 /* Duplicate the node SRC, and return new node. */
3442 static bin_tree_t *
3443 duplicate_tree (src, dfa)
3444 const bin_tree_t *src;
3445 re_dfa_t *dfa;
3447 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3448 int new_node_idx;
3449 /* Since node indies must be according to Post-order of the tree,
3450 we must duplicate the left at first. */
3451 if (src->left != NULL)
3453 left = duplicate_tree (src->left, dfa);
3454 if (left == NULL)
3455 return NULL;
3458 /* Secondaly, duplicate the right. */
3459 if (src->right != NULL)
3461 right = duplicate_tree (src->right, dfa);
3462 if (right == NULL)
3464 free_bin_tree (left);
3465 return NULL;
3469 /* At last, duplicate itself. */
3470 if (src->type == NON_TYPE)
3472 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3473 dfa->nodes[new_node_idx].duplicated = 1;
3474 if (BE (new_node_idx == -1, 0))
3476 free_bin_tree (left);
3477 free_bin_tree (right);
3478 return NULL;
3481 else
3482 new_node_idx = src->type;
3484 new_tree = create_tree (left, right, src->type, new_node_idx);
3485 if (BE (new_tree == NULL, 0))
3487 free_bin_tree (left);
3488 free_bin_tree (right);
3490 return new_tree;