* sysdeps/unix/sysv/linux/x86_64/sysdep.h
[glibc.git] / posix / regcomp.c
blob258e255b34b69edf40a80d2777724e6c04e7a48b
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 #include <assert.h>
22 #include <ctype.h>
23 #include <limits.h>
24 #include <locale.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
29 #if defined HAVE_WCHAR_H || defined _LIBC
30 # include <wchar.h>
31 #endif /* HAVE_WCHAR_H || _LIBC */
32 #if defined HAVE_WCTYPE_H || defined _LIBC
33 # include <wctype.h>
34 #endif /* HAVE_WCTYPE_H || _LIBC */
36 /* In case that the system doesn't have isblank(). */
37 #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
38 # define isblank(ch) ((ch) == ' ' || (ch) == '\t')
39 #endif
41 #ifdef _LIBC
42 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
43 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
44 # include <locale/localeinfo.h>
45 # include <locale/elem-hash.h>
46 # include <locale/coll-lookup.h>
47 # endif
48 #endif
50 /* This is for other GNU distributions with internationalized messages. */
51 #if HAVE_LIBINTL_H || defined _LIBC
52 # include <libintl.h>
53 # ifdef _LIBC
54 # undef gettext
55 # define gettext(msgid) \
56 INTUSE(__dcgettext) (INTUSE(_libc_intl_domainname), msgid, LC_MESSAGES)
57 # endif
58 #else
59 # define gettext(msgid) (msgid)
60 #endif
62 #ifndef gettext_noop
63 /* This define is so xgettext can find the internationalizable
64 strings. */
65 # define gettext_noop(String) String
66 #endif
68 #include <regex.h>
69 #include "regex_internal.h"
71 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
72 int length, reg_syntax_t syntax);
73 static void re_compile_fastmap_iter (regex_t *bufp,
74 const re_dfastate_t *init_state,
75 char *fastmap);
76 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
77 static reg_errcode_t init_word_char (re_dfa_t *dfa);
78 #ifdef RE_ENABLE_I18N
79 static void free_charset (re_charset_t *cset);
80 #endif /* RE_ENABLE_I18N */
81 static void free_workarea_compile (regex_t *preg);
82 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
83 static reg_errcode_t analyze (re_dfa_t *dfa);
84 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
85 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
86 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
87 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
88 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
89 unsigned int constraint);
90 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
91 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
92 int node, int root);
93 static void calc_inveclosure (re_dfa_t *dfa);
94 static int fetch_number (re_string_t *input, re_token_t *token,
95 reg_syntax_t syntax);
96 static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
97 static int peek_token (re_token_t *token, re_string_t *input,
98 reg_syntax_t syntax);
99 static int peek_token_bracket (re_token_t *token, re_string_t *input,
100 reg_syntax_t syntax);
101 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
102 reg_syntax_t syntax, reg_errcode_t *err);
103 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
104 re_token_t *token, reg_syntax_t syntax,
105 int nest, reg_errcode_t *err);
106 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
107 re_token_t *token, reg_syntax_t syntax,
108 int nest, reg_errcode_t *err);
109 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
110 re_token_t *token, reg_syntax_t syntax,
111 int nest, reg_errcode_t *err);
112 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
113 re_token_t *token, reg_syntax_t syntax,
114 int nest, reg_errcode_t *err);
115 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
116 re_dfa_t *dfa, re_token_t *token,
117 reg_syntax_t syntax, reg_errcode_t *err);
118 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
119 re_token_t *token, reg_syntax_t syntax,
120 reg_errcode_t *err);
121 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
122 re_string_t *regexp,
123 re_token_t *token, int token_len,
124 re_dfa_t *dfa,
125 reg_syntax_t syntax);
126 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
127 re_string_t *regexp,
128 re_token_t *token);
129 #ifndef _LIBC
130 # ifdef RE_ENABLE_I18N
131 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
132 re_charset_t *mbcset, int *range_alloc,
133 bracket_elem_t *start_elem,
134 bracket_elem_t *end_elem);
135 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
136 re_charset_t *mbcset,
137 int *coll_sym_alloc,
138 const unsigned char *name);
139 # else /* not RE_ENABLE_I18N */
140 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
141 bracket_elem_t *start_elem,
142 bracket_elem_t *end_elem);
143 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
144 const unsigned char *name);
145 # endif /* not RE_ENABLE_I18N */
146 #endif /* not _LIBC */
147 #ifdef RE_ENABLE_I18N
148 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
149 re_charset_t *mbcset,
150 int *equiv_class_alloc,
151 const unsigned char *name);
152 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
153 re_charset_t *mbcset,
154 int *char_class_alloc,
155 const unsigned char *class_name,
156 reg_syntax_t syntax);
157 #else /* not RE_ENABLE_I18N */
158 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
159 const unsigned char *name);
160 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
161 const unsigned char *class_name,
162 reg_syntax_t syntax);
163 #endif /* not RE_ENABLE_I18N */
164 static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err);
165 static void free_bin_tree (bin_tree_t *tree);
166 static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
167 re_token_type_t type, int index);
168 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
170 /* This table gives an error message for each of the error codes listed
171 in regex.h. Obviously the order here has to be same as there.
172 POSIX doesn't require that we do anything for REG_NOERROR,
173 but why not be nice? */
175 const char __re_error_msgid[] attribute_hidden =
177 #define REG_NOERROR_IDX 0
178 gettext_noop ("Success") /* REG_NOERROR */
179 "\0"
180 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
181 gettext_noop ("No match") /* REG_NOMATCH */
182 "\0"
183 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
184 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
185 "\0"
186 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
187 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
188 "\0"
189 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
190 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
191 "\0"
192 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
193 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
194 "\0"
195 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
196 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
197 "\0"
198 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
199 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
200 "\0"
201 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
202 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
203 "\0"
204 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
205 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
206 "\0"
207 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
208 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
209 "\0"
210 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
211 gettext_noop ("Invalid range end") /* REG_ERANGE */
212 "\0"
213 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
214 gettext_noop ("Memory exhausted") /* REG_ESPACE */
215 "\0"
216 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
217 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
218 "\0"
219 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
220 gettext_noop ("Premature end of regular expression") /* REG_EEND */
221 "\0"
222 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
223 gettext_noop ("Regular expression too big") /* REG_ESIZE */
224 "\0"
225 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
226 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
229 const size_t __re_error_msgid_idx[] attribute_hidden =
231 REG_NOERROR_IDX,
232 REG_NOMATCH_IDX,
233 REG_BADPAT_IDX,
234 REG_ECOLLATE_IDX,
235 REG_ECTYPE_IDX,
236 REG_EESCAPE_IDX,
237 REG_ESUBREG_IDX,
238 REG_EBRACK_IDX,
239 REG_EPAREN_IDX,
240 REG_EBRACE_IDX,
241 REG_BADBR_IDX,
242 REG_ERANGE_IDX,
243 REG_ESPACE_IDX,
244 REG_BADRPT_IDX,
245 REG_EEND_IDX,
246 REG_ESIZE_IDX,
247 REG_ERPAREN_IDX
250 /* Entry points for GNU code. */
252 /* re_compile_pattern is the GNU regular expression compiler: it
253 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
254 Returns 0 if the pattern was valid, otherwise an error string.
256 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
257 are set in BUFP on entry. */
259 const char *
260 re_compile_pattern (pattern, length, bufp)
261 const char *pattern;
262 size_t length;
263 struct re_pattern_buffer *bufp;
265 reg_errcode_t ret;
267 /* GNU code is written to assume at least RE_NREGS registers will be set
268 (and at least one extra will be -1). */
269 bufp->regs_allocated = REGS_UNALLOCATED;
271 /* And GNU code determines whether or not to get register information
272 by passing null for the REGS argument to re_match, etc., not by
273 setting no_sub. */
274 bufp->no_sub = 0;
276 /* Match anchors at newline. */
277 bufp->newline_anchor = 1;
279 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
281 if (!ret)
282 return NULL;
283 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
285 #ifdef _LIBC
286 weak_alias (__re_compile_pattern, re_compile_pattern)
287 #endif
289 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
290 also be assigned to arbitrarily: each pattern buffer stores its own
291 syntax, so it can be changed between regex compilations. */
292 /* This has no initializer because initialized variables in Emacs
293 become read-only after dumping. */
294 reg_syntax_t re_syntax_options;
297 /* Specify the precise syntax of regexps for compilation. This provides
298 for compatibility for various utilities which historically have
299 different, incompatible syntaxes.
301 The argument SYNTAX is a bit mask comprised of the various bits
302 defined in regex.h. We return the old syntax. */
304 reg_syntax_t
305 re_set_syntax (syntax)
306 reg_syntax_t syntax;
308 reg_syntax_t ret = re_syntax_options;
310 re_syntax_options = syntax;
311 return ret;
313 #ifdef _LIBC
314 weak_alias (__re_set_syntax, re_set_syntax)
315 #endif
318 re_compile_fastmap (bufp)
319 struct re_pattern_buffer *bufp;
321 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
322 char *fastmap = bufp->fastmap;
324 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
325 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
326 if (dfa->init_state != dfa->init_state_word)
327 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
328 if (dfa->init_state != dfa->init_state_nl)
329 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
330 if (dfa->init_state != dfa->init_state_begbuf)
331 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
332 bufp->fastmap_accurate = 1;
333 return 0;
335 #ifdef _LIBC
336 weak_alias (__re_compile_fastmap, re_compile_fastmap)
337 #endif
339 /* Helper function for re_compile_fastmap.
340 Compile fastmap for the initial_state INIT_STATE. */
342 static void
343 re_compile_fastmap_iter (bufp, init_state, fastmap)
344 regex_t *bufp;
345 const re_dfastate_t *init_state;
346 char *fastmap;
348 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
349 int node_cnt;
350 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
352 int node = init_state->nodes.elems[node_cnt];
353 re_token_type_t type = dfa->nodes[node].type;
354 if (type == OP_CONTEXT_NODE)
356 node = dfa->nodes[node].opr.ctx_info->entity;
357 type = dfa->nodes[node].type;
360 if (type == CHARACTER)
361 fastmap[dfa->nodes[node].opr.c] = 1;
362 else if (type == SIMPLE_BRACKET)
364 int i, j, ch;
365 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
366 for (j = 0; j < UINT_BITS; ++j, ++ch)
367 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
368 fastmap[ch] = 1;
370 #ifdef RE_ENABLE_I18N
371 else if (type == COMPLEX_BRACKET)
373 int i;
374 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
375 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
376 || cset->nranges || cset->nchar_classes)
378 # ifdef _LIBC
379 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
381 /* In this case we want to catch the bytes which are
382 the first byte of any collation elements.
383 e.g. In da_DK, we want to catch 'a' since "aa"
384 is a valid collation element, and don't catch
385 'b' since 'b' is the only collation element
386 which starts from 'b'. */
387 int j, ch;
388 const int32_t *table = (const int32_t *)
389 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
390 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
391 for (j = 0; j < UINT_BITS; ++j, ++ch)
392 if (table[ch] < 0)
393 fastmap[ch] = 1;
395 # else
396 if (MB_CUR_MAX > 1)
397 for (i = 0; i < SBC_MAX; ++i)
398 if (__btowc (i) == WEOF)
399 fastmap[i] = 1;
400 # endif /* not _LIBC */
402 for (i = 0; i < cset->nmbchars; ++i)
404 char buf[256];
405 wctomb (buf, cset->mbchars[i]);
406 fastmap[*(unsigned char *) buf] = 1;
409 #endif /* RE_ENABLE_I18N */
410 else if (type == END_OF_RE || type == OP_PERIOD
411 #ifdef RE_ENABLE_I18N
412 || type == COMPLEX_BRACKET
413 #endif /* RE_ENABLE_I18N */
416 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
417 if (type == END_OF_RE)
418 bufp->can_be_null = 1;
419 return;
424 /* Entry point for POSIX code. */
425 /* regcomp takes a regular expression as a string and compiles it.
427 PREG is a regex_t *. We do not expect any fields to be initialized,
428 since POSIX says we shouldn't. Thus, we set
430 `buffer' to the compiled pattern;
431 `used' to the length of the compiled pattern;
432 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
433 REG_EXTENDED bit in CFLAGS is set; otherwise, to
434 RE_SYNTAX_POSIX_BASIC;
435 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
436 `fastmap' to an allocated space for the fastmap;
437 `fastmap_accurate' to zero;
438 `re_nsub' to the number of subexpressions in PATTERN.
440 PATTERN is the address of the pattern string.
442 CFLAGS is a series of bits which affect compilation.
444 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
445 use POSIX basic syntax.
447 If REG_NEWLINE is set, then . and [^...] don't match newline.
448 Also, regexec will try a match beginning after every newline.
450 If REG_ICASE is set, then we considers upper- and lowercase
451 versions of letters to be equivalent when matching.
453 If REG_NOSUB is set, then when PREG is passed to regexec, that
454 routine will report only success or failure, and nothing about the
455 registers.
457 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
458 the return codes and their meanings.) */
461 regcomp (preg, pattern, cflags)
462 regex_t *__restrict preg;
463 const char *__restrict pattern;
464 int cflags;
466 reg_errcode_t ret;
467 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
468 : RE_SYNTAX_POSIX_BASIC);
470 preg->buffer = NULL;
471 preg->allocated = 0;
472 preg->used = 0;
474 /* Try to allocate space for the fastmap. */
475 preg->fastmap = re_malloc (char, SBC_MAX);
476 if (BE (preg->fastmap == NULL, 0))
477 return REG_ESPACE;
479 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
481 /* If REG_NEWLINE is set, newlines are treated differently. */
482 if (cflags & REG_NEWLINE)
483 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
484 syntax &= ~RE_DOT_NEWLINE;
485 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
486 /* It also changes the matching behavior. */
487 preg->newline_anchor = 1;
489 else
490 preg->newline_anchor = 0;
491 preg->no_sub = !!(cflags & REG_NOSUB);
492 preg->translate = NULL;
494 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
496 /* POSIX doesn't distinguish between an unmatched open-group and an
497 unmatched close-group: both are REG_EPAREN. */
498 if (ret == REG_ERPAREN)
499 ret = REG_EPAREN;
501 /* We have already checked preg->fastmap != NULL. */
502 if (BE (ret == REG_NOERROR, 1))
504 /* Compute the fastmap now, since regexec cannot modify the pattern
505 buffer. */
506 if (BE (re_compile_fastmap (preg) == -2, 0))
508 /* Some error occurred while computing the fastmap, just forget
509 about it. */
510 re_free (preg->fastmap);
511 preg->fastmap = NULL;
515 return (int) ret;
517 #ifdef _LIBC
518 weak_alias (__regcomp, regcomp)
519 #endif
521 /* Returns a message corresponding to an error code, ERRCODE, returned
522 from either regcomp or regexec. We don't use PREG here. */
524 size_t
525 regerror (errcode, preg, errbuf, errbuf_size)
526 int errcode;
527 const regex_t *preg;
528 char *errbuf;
529 size_t errbuf_size;
531 const char *msg;
532 size_t msg_size;
534 if (BE (errcode < 0
535 || errcode >= (int) (sizeof (__re_error_msgid_idx)
536 / sizeof (__re_error_msgid_idx[0])), 0))
537 /* Only error codes returned by the rest of the code should be passed
538 to this routine. If we are given anything else, or if other regex
539 code generates an invalid error code, then the program has a bug.
540 Dump core so we can fix it. */
541 abort ();
543 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
545 msg_size = strlen (msg) + 1; /* Includes the null. */
547 if (BE (errbuf_size != 0, 1))
549 if (BE (msg_size > errbuf_size, 0))
551 #if defined HAVE_MEMPCPY || defined _LIBC
552 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
553 #else
554 memcpy (errbuf, msg, errbuf_size - 1);
555 errbuf[errbuf_size - 1] = 0;
556 #endif
558 else
559 memcpy (errbuf, msg, msg_size);
562 return msg_size;
564 #ifdef _LIBC
565 weak_alias (__regerror, regerror)
566 #endif
568 /* Free dynamically allocated space used by PREG. */
570 void
571 regfree (preg)
572 regex_t *preg;
574 int i, j;
575 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
576 if (BE (dfa != NULL, 1))
578 re_free (dfa->subexps);
580 for (i = 0; i < dfa->nodes_len; ++i)
582 re_token_t *node = dfa->nodes + i;
583 #ifdef RE_ENABLE_I18N
584 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
585 free_charset (node->opr.mbcset);
586 else
587 #endif /* RE_ENABLE_I18N */
588 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
589 re_free (node->opr.sbcset);
590 else if (node->type == OP_CONTEXT_NODE)
592 if (dfa->nodes[node->opr.ctx_info->entity].type == OP_BACK_REF)
594 if (node->opr.ctx_info->bkref_eclosure != NULL)
595 re_node_set_free (node->opr.ctx_info->bkref_eclosure);
596 re_free (node->opr.ctx_info->bkref_eclosure);
598 re_free (node->opr.ctx_info);
601 re_free (dfa->firsts);
602 re_free (dfa->nexts);
603 for (i = 0; i < dfa->nodes_len; ++i)
605 if (dfa->eclosures != NULL)
606 re_node_set_free (dfa->eclosures + i);
607 if (dfa->inveclosures != NULL)
608 re_node_set_free (dfa->inveclosures + i);
609 if (dfa->edests != NULL)
610 re_node_set_free (dfa->edests + i);
612 re_free (dfa->edests);
613 re_free (dfa->eclosures);
614 re_free (dfa->inveclosures);
615 re_free (dfa->nodes);
617 for (i = 0; i <= dfa->state_hash_mask; ++i)
619 struct re_state_table_entry *entry = dfa->state_table + i;
620 for (j = 0; j < entry->num; ++j)
622 re_dfastate_t *state = entry->array[j];
623 if (state->entrance_nodes != &state->nodes)
625 re_node_set_free (state->entrance_nodes);
626 re_free (state->entrance_nodes);
628 re_node_set_free (&state->nodes);
629 re_free (state->trtable);
630 re_free (state->trtable_search);
631 re_free (state);
633 re_free (entry->array);
635 re_free (dfa->state_table);
637 if (dfa->word_char != NULL)
638 re_free (dfa->word_char);
639 re_free (dfa);
641 re_free (preg->fastmap);
643 #ifdef _LIBC
644 weak_alias (__regfree, regfree)
645 #endif
647 /* Entry points compatible with 4.2 BSD regex library. We don't define
648 them unless specifically requested. */
650 #if defined _REGEX_RE_COMP || defined _LIBC
652 /* BSD has one and only one pattern buffer. */
653 static struct re_pattern_buffer re_comp_buf;
655 char *
656 # ifdef _LIBC
657 /* Make these definitions weak in libc, so POSIX programs can redefine
658 these names if they don't use our functions, and still use
659 regcomp/regexec above without link errors. */
660 weak_function
661 # endif
662 re_comp (s)
663 const char *s;
665 reg_errcode_t ret;
667 if (!s)
669 if (!re_comp_buf.buffer)
670 return gettext ("No previous regular expression");
671 return 0;
674 if (!re_comp_buf.buffer)
676 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
677 if (re_comp_buf.fastmap == NULL)
678 return (char *) gettext (__re_error_msgid
679 + __re_error_msgid_idx[(int) REG_ESPACE]);
682 /* Since `re_exec' always passes NULL for the `regs' argument, we
683 don't need to initialize the pattern buffer fields which affect it. */
685 /* Match anchors at newlines. */
686 re_comp_buf.newline_anchor = 1;
688 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
690 if (!ret)
691 return NULL;
693 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
694 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
696 #endif /* _REGEX_RE_COMP */
698 /* Internal entry point.
699 Compile the regular expression PATTERN, whose length is LENGTH.
700 SYNTAX indicate regular expression's syntax. */
702 static reg_errcode_t
703 re_compile_internal (preg, pattern, length, syntax)
704 regex_t *preg;
705 const char * pattern;
706 int length;
707 reg_syntax_t syntax;
709 reg_errcode_t err = REG_NOERROR;
710 re_dfa_t *dfa;
711 re_string_t regexp;
713 /* Initialize the pattern buffer. */
714 preg->fastmap_accurate = 0;
715 preg->syntax = syntax;
716 preg->not_bol = preg->not_eol = 0;
717 preg->used = 0;
718 preg->re_nsub = 0;
720 /* Initialize the dfa. */
721 dfa = (re_dfa_t *) preg->buffer;
722 if (preg->allocated < sizeof (re_dfa_t))
724 /* If zero allocated, but buffer is non-null, try to realloc
725 enough space. This loses if buffer's address is bogus, but
726 that is the user's responsibility. If ->buffer is NULL this
727 is a simple allocation. */
728 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
729 if (dfa == NULL)
730 return REG_ESPACE;
731 preg->allocated = sizeof (re_dfa_t);
733 preg->buffer = (unsigned char *) dfa;
734 preg->used = sizeof (re_dfa_t);
736 err = init_dfa (dfa, length);
737 if (BE (err != REG_NOERROR, 0))
739 re_free (dfa);
740 preg->buffer = NULL;
741 return err;
744 err = re_string_construct (&regexp, pattern, length, preg->translate,
745 syntax & RE_ICASE);
746 if (BE (err != REG_NOERROR, 0))
748 re_free (dfa);
749 preg->buffer = NULL;
750 return err;
753 /* Parse the regular expression, and build a structure tree. */
754 preg->re_nsub = 0;
755 dfa->str_tree = parse (&regexp, preg, syntax, &err);
756 if (BE (dfa->str_tree == NULL, 0))
757 goto re_compile_internal_free_return;
759 /* Analyze the tree and collect information which is necessary to
760 create the dfa. */
761 err = analyze (dfa);
762 if (BE (err != REG_NOERROR, 0))
763 goto re_compile_internal_free_return;
765 /* Then create the initial state of the dfa. */
766 err = create_initial_state (dfa);
767 if (BE (err != REG_NOERROR, 0))
768 goto re_compile_internal_free_return;
770 re_compile_internal_free_return:
771 /* Release work areas. */
772 free_workarea_compile (preg);
773 re_string_destruct (&regexp);
775 return err;
778 /* Initialize DFA. We use the length of the regular expression PAT_LEN
779 as the initial length of some arrays. */
781 static reg_errcode_t
782 init_dfa (dfa, pat_len)
783 re_dfa_t *dfa;
784 int pat_len;
786 int table_size;
788 memset (dfa, '\0', sizeof (re_dfa_t));
790 dfa->nodes_alloc = pat_len + 1;
791 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
793 dfa->states_alloc = pat_len + 1;
795 /* table_size = 2 ^ ceil(log pat_len) */
796 for (table_size = 1; table_size > 0; table_size <<= 1)
797 if (table_size > pat_len)
798 break;
800 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
801 dfa->state_hash_mask = table_size - 1;
803 dfa->subexps_alloc = 1;
804 dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
805 dfa->word_char = NULL;
807 if (BE (dfa->nodes == NULL || dfa->state_table == NULL
808 || dfa->subexps == NULL, 0))
810 /* We don't bother to free anything which was allocated. Very
811 soon the process will go down anyway. */
812 dfa->subexps = NULL;
813 dfa->state_table = NULL;
814 dfa->nodes = NULL;
815 return REG_ESPACE;
817 return REG_NOERROR;
820 /* Initialize WORD_CHAR table, which indicate which character is
821 "word". In this case "word" means that it is the word construction
822 character used by some operators like "\<", "\>", etc. */
824 static reg_errcode_t
825 init_word_char (dfa)
826 re_dfa_t *dfa;
828 int i, j, ch;
829 dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
830 if (BE (dfa->word_char == NULL, 0))
831 return REG_ESPACE;
832 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
833 for (j = 0; j < UINT_BITS; ++j, ++ch)
834 if (isalnum (ch) || ch == '_')
835 dfa->word_char[i] |= 1 << j;
836 return REG_NOERROR;
839 /* Free the work area which are only used while compiling. */
841 static void
842 free_workarea_compile (preg)
843 regex_t *preg;
845 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
846 free_bin_tree (dfa->str_tree);
847 dfa->str_tree = NULL;
850 /* Create initial states for all contexts. */
852 static reg_errcode_t
853 create_initial_state (dfa)
854 re_dfa_t *dfa;
856 int first, i;
857 reg_errcode_t err;
858 re_node_set init_nodes;
860 /* Initial states have the epsilon closure of the node which is
861 the first node of the regular expression. */
862 first = dfa->str_tree->first;
863 dfa->init_node = first;
864 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
865 if (BE (err != REG_NOERROR, 0))
866 return err;
868 /* The back-references which are in initial states can epsilon transit,
869 since in this case all of the subexpressions can be null.
870 Then we add epsilon closures of the nodes which are the next nodes of
871 the back-references. */
872 if (dfa->nbackref > 0)
873 for (i = 0; i < init_nodes.nelem; ++i)
875 int node_idx = init_nodes.elems[i];
876 re_token_type_t type = dfa->nodes[node_idx].type;
877 if (type == OP_CONTEXT_NODE
878 && (dfa->nodes[dfa->nodes[node_idx].opr.ctx_info->entity].type
879 == OP_BACK_REF))
881 int prev_nelem = init_nodes.nelem;
882 re_node_set_merge (&init_nodes,
883 dfa->nodes[node_idx].opr.ctx_info->bkref_eclosure);
884 if (prev_nelem < init_nodes.nelem)
885 i = 0;
887 else if (type == OP_BACK_REF)
889 int next_idx = dfa->nexts[node_idx];
890 if (!re_node_set_contains (&init_nodes, next_idx))
892 re_node_set_merge (&init_nodes, dfa->eclosures + next_idx);
893 i = 0;
898 /* It must be the first time to invoke acquire_state. */
899 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
900 /* We don't check ERR here, since the initial state must not be NULL. */
901 if (BE (dfa->init_state == NULL, 0))
902 return err;
903 if (dfa->init_state->has_constraint)
905 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
906 CONTEXT_WORD);
907 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
908 CONTEXT_NEWLINE);
909 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
910 &init_nodes,
911 CONTEXT_NEWLINE
912 | CONTEXT_BEGBUF);
913 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
914 || dfa->init_state_begbuf == NULL, 0))
915 return err;
917 else
918 dfa->init_state_word = dfa->init_state_nl
919 = dfa->init_state_begbuf = dfa->init_state;
921 re_node_set_free (&init_nodes);
922 return REG_NOERROR;
925 /* Analyze the structure tree, and calculate "first", "next", "edest",
926 "eclosure", and "inveclosure". */
928 static reg_errcode_t
929 analyze (dfa)
930 re_dfa_t *dfa;
932 int i;
933 reg_errcode_t ret;
935 /* Allocate arrays. */
936 dfa->firsts = re_malloc (int, dfa->nodes_alloc);
937 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
938 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
939 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
940 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
941 if (BE (dfa->firsts == NULL || dfa->nexts == NULL || dfa->edests == NULL
942 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
943 return REG_ESPACE;
944 /* Initialize them. */
945 for (i = 0; i < dfa->nodes_len; ++i)
947 dfa->firsts[i] = -1;
948 dfa->nexts[i] = -1;
949 re_node_set_init_empty (dfa->edests + i);
950 re_node_set_init_empty (dfa->eclosures + i);
951 re_node_set_init_empty (dfa->inveclosures + i);
954 ret = analyze_tree (dfa, dfa->str_tree);
955 if (BE (ret == REG_NOERROR, 1))
957 ret = calc_eclosure (dfa);
958 if (ret == REG_NOERROR)
959 calc_inveclosure (dfa);
961 return ret;
964 /* Helper functions for analyze.
965 This function calculate "first", "next", and "edest" for the subtree
966 whose root is NODE. */
968 static reg_errcode_t
969 analyze_tree (dfa, node)
970 re_dfa_t *dfa;
971 bin_tree_t *node;
973 reg_errcode_t ret;
974 if (node->first == -1)
975 calc_first (dfa, node);
976 if (node->next == -1)
977 calc_next (dfa, node);
978 if (node->eclosure.nelem == 0)
979 calc_epsdest (dfa, node);
980 /* Calculate "first" etc. for the left child. */
981 if (node->left != NULL)
983 ret = analyze_tree (dfa, node->left);
984 if (BE (ret != REG_NOERROR, 0))
985 return ret;
987 /* Calculate "first" etc. for the right child. */
988 if (node->right != NULL)
990 ret = analyze_tree (dfa, node->right);
991 if (BE (ret != REG_NOERROR, 0))
992 return ret;
994 return REG_NOERROR;
997 /* Calculate "first" for the node NODE. */
998 static void
999 calc_first (dfa, node)
1000 re_dfa_t *dfa;
1001 bin_tree_t *node;
1003 int idx, type;
1004 idx = node->node_idx;
1005 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1007 switch (type)
1009 #ifdef DEBUG
1010 case OP_OPEN_BRACKET:
1011 case OP_CLOSE_BRACKET:
1012 case OP_OPEN_DUP_NUM:
1013 case OP_CLOSE_DUP_NUM:
1014 case OP_NON_MATCH_LIST:
1015 case OP_OPEN_COLL_ELEM:
1016 case OP_CLOSE_COLL_ELEM:
1017 case OP_OPEN_EQUIV_CLASS:
1018 case OP_CLOSE_EQUIV_CLASS:
1019 case OP_OPEN_CHAR_CLASS:
1020 case OP_CLOSE_CHAR_CLASS:
1021 /* These must not be appeared here. */
1022 assert (0);
1023 #endif
1024 case END_OF_RE:
1025 case CHARACTER:
1026 case OP_PERIOD:
1027 case OP_DUP_ASTERISK:
1028 case OP_DUP_QUESTION:
1029 #ifdef RE_ENABLE_I18N
1030 case COMPLEX_BRACKET:
1031 #endif /* RE_ENABLE_I18N */
1032 case SIMPLE_BRACKET:
1033 case OP_BACK_REF:
1034 case ANCHOR:
1035 case OP_OPEN_SUBEXP:
1036 case OP_CLOSE_SUBEXP:
1037 node->first = idx;
1038 break;
1039 case OP_DUP_PLUS:
1040 #ifdef DEBUG
1041 assert (node->left != NULL);
1042 #endif
1043 if (node->left->first == -1)
1044 calc_first (dfa, node->left);
1045 node->first = node->left->first;
1046 break;
1047 case OP_ALT:
1048 node->first = idx;
1049 break;
1050 /* else fall through */
1051 default:
1052 #ifdef DEBUG
1053 assert (node->left != NULL);
1054 #endif
1055 if (node->left->first == -1)
1056 calc_first (dfa, node->left);
1057 node->first = node->left->first;
1058 break;
1060 if (node->type == 0)
1061 dfa->firsts[idx] = node->first;
1064 /* Calculate "next" for the node NODE. */
1066 static void
1067 calc_next (dfa, node)
1068 re_dfa_t *dfa;
1069 bin_tree_t *node;
1071 int idx, type;
1072 bin_tree_t *parent = node->parent;
1073 if (parent == NULL)
1075 node->next = -1;
1076 idx = node->node_idx;
1077 if (node->type == 0)
1078 dfa->nexts[idx] = node->next;
1079 return;
1082 idx = parent->node_idx;
1083 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1085 switch (type)
1087 case OP_DUP_ASTERISK:
1088 case OP_DUP_PLUS:
1089 node->next = idx;
1090 break;
1091 case CONCAT:
1092 if (parent->left == node)
1094 if (parent->right->first == -1)
1095 calc_first (dfa, parent->right);
1096 node->next = parent->right->first;
1097 break;
1099 /* else fall through */
1100 default:
1101 if (parent->next == -1)
1102 calc_next (dfa, parent);
1103 node->next = parent->next;
1104 break;
1106 idx = node->node_idx;
1107 if (node->type == 0)
1108 dfa->nexts[idx] = node->next;
1111 /* Calculate "edest" for the node NODE. */
1113 static void
1114 calc_epsdest (dfa, node)
1115 re_dfa_t *dfa;
1116 bin_tree_t *node;
1118 int idx;
1119 idx = node->node_idx;
1120 if (node->type == 0)
1122 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1123 || dfa->nodes[idx].type == OP_DUP_PLUS
1124 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1126 if (node->left->first == -1)
1127 calc_first (dfa, node->left);
1128 if (node->next == -1)
1129 calc_next (dfa, node);
1130 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1131 node->next);
1133 else if (dfa->nodes[idx].type == OP_ALT)
1135 int left, right;
1136 if (node->left != NULL)
1138 if (node->left->first == -1)
1139 calc_first (dfa, node->left);
1140 left = node->left->first;
1142 else
1144 if (node->next == -1)
1145 calc_next (dfa, node);
1146 left = node->next;
1148 if (node->right != NULL)
1150 if (node->right->first == -1)
1151 calc_first (dfa, node->right);
1152 right = node->right->first;
1154 else
1156 if (node->next == -1)
1157 calc_next (dfa, node);
1158 right = node->next;
1160 re_node_set_init_2 (dfa->edests + idx, left, right);
1162 else if (dfa->nodes[idx].type == ANCHOR
1163 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1164 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP)
1165 re_node_set_init_1 (dfa->edests + idx, node->next);
1169 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1170 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1171 otherwise return the error code. */
1173 static reg_errcode_t
1174 duplicate_node (new_idx, dfa, org_idx, constraint)
1175 re_dfa_t *dfa;
1176 int *new_idx, org_idx;
1177 unsigned int constraint;
1179 re_token_t dup;
1180 int dup_idx;
1181 reg_errcode_t err;
1183 dup.type = OP_CONTEXT_NODE;
1184 if (dfa->nodes[org_idx].type == OP_CONTEXT_NODE)
1186 /* If the node whose index is ORG_IDX is the same as the intended
1187 node, use it. */
1188 if (dfa->nodes[org_idx].constraint == constraint)
1190 *new_idx = org_idx;
1191 return REG_NOERROR;
1193 dup.constraint = constraint |
1194 dfa->nodes[org_idx].constraint;
1196 else
1197 dup.constraint = constraint;
1199 /* In case that `entity' points OP_CONTEXT_NODE,
1200 we correct `entity' to real entity in calc_inveclosures(). */
1201 dup.opr.ctx_info = malloc (sizeof (*dup.opr.ctx_info));
1202 dup_idx = re_dfa_add_node (dfa, dup, 1);
1203 if (BE (dup.opr.ctx_info == NULL || dup_idx == -1, 0))
1204 return REG_ESPACE;
1205 dup.opr.ctx_info->entity = org_idx;
1206 dup.opr.ctx_info->bkref_eclosure = NULL;
1208 dfa->nodes[dup_idx].duplicated = 1;
1209 dfa->firsts[dup_idx] = dfa->firsts[org_idx];
1210 dfa->nexts[dup_idx] = dfa->nexts[org_idx];
1211 err = re_node_set_init_copy (dfa->edests + dup_idx, dfa->edests + org_idx);
1212 if (BE (err != REG_NOERROR, 0))
1213 return err;
1214 /* Since we don't duplicate epsilon nodes, epsilon closure have
1215 only itself. */
1216 err = re_node_set_init_1 (dfa->eclosures + dup_idx, dup_idx);
1217 if (BE (err != REG_NOERROR, 0))
1218 return err;
1219 err = re_node_set_init_1 (dfa->inveclosures + dup_idx, dup_idx);
1220 if (BE (err != REG_NOERROR, 0))
1221 return err;
1222 /* Then we must update inveclosure for this node.
1223 We process them at last part of calc_eclosure(),
1224 since we don't complete to calculate them here. */
1226 *new_idx = dup_idx;
1227 return REG_NOERROR;
1230 static void
1231 calc_inveclosure (dfa)
1232 re_dfa_t *dfa;
1234 int src, idx, dest, entity;
1235 for (src = 0; src < dfa->nodes_len; ++src)
1237 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1239 dest = dfa->eclosures[src].elems[idx];
1240 re_node_set_insert (dfa->inveclosures + dest, src);
1243 entity = src;
1244 while (dfa->nodes[entity].type == OP_CONTEXT_NODE)
1246 entity = dfa->nodes[entity].opr.ctx_info->entity;
1247 re_node_set_merge (dfa->inveclosures + src,
1248 dfa->inveclosures + entity);
1249 dfa->nodes[src].opr.ctx_info->entity = entity;
1254 /* Calculate "eclosure" for all the node in DFA. */
1256 static reg_errcode_t
1257 calc_eclosure (dfa)
1258 re_dfa_t *dfa;
1260 int idx, node_idx, max, incomplete = 0;
1261 #ifdef DEBUG
1262 assert (dfa->nodes_len > 0);
1263 #endif
1264 /* For each nodes, calculate epsilon closure. */
1265 for (node_idx = 0, max = dfa->nodes_len; ; ++node_idx)
1267 reg_errcode_t err;
1268 re_node_set eclosure_elem;
1269 if (node_idx == max)
1271 if (!incomplete)
1272 break;
1273 incomplete = 0;
1274 node_idx = 0;
1277 #ifdef DEBUG
1278 assert (dfa->nodes[node_idx].type != OP_CONTEXT_NODE);
1279 assert (dfa->eclosures[node_idx].nelem != -1);
1280 #endif
1281 /* If we have already calculated, skip it. */
1282 if (dfa->eclosures[node_idx].nelem != 0)
1283 continue;
1284 /* Calculate epsilon closure of `node_idx'. */
1285 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1286 if (BE (err != REG_NOERROR, 0))
1287 return err;
1289 if (dfa->eclosures[node_idx].nelem == 0)
1291 incomplete = 1;
1292 re_node_set_free (&eclosure_elem);
1296 /* for duplicated nodes. */
1297 for (idx = max; idx < dfa->nodes_len; ++idx)
1299 int entity, i, constraint;
1300 re_node_set *bkref_eclosure;
1301 entity = dfa->nodes[idx].opr.ctx_info->entity;
1302 re_node_set_merge (dfa->inveclosures + idx, dfa->inveclosures + entity);
1303 if (dfa->nodes[entity].type != OP_BACK_REF)
1304 continue;
1306 /* If the node is backreference, duplicate the epsilon closure of
1307 the next node. Since it may epsilon transit. */
1308 /* Note: duplicate_node() may realloc dfa->eclosures, etc. */
1309 bkref_eclosure = re_malloc (re_node_set, 1);
1310 if (BE (bkref_eclosure == NULL, 0))
1311 return REG_ESPACE;
1312 re_node_set_init_empty (bkref_eclosure);
1313 constraint = dfa->nodes[idx].constraint;
1314 for (i = 0; i < dfa->eclosures[dfa->nexts[idx]].nelem; ++i)
1316 int dest_node_idx = dfa->eclosures[dfa->nexts[idx]].elems[i];
1317 if (!IS_EPSILON_NODE (dfa->nodes[dest_node_idx].type))
1319 reg_errcode_t err;
1320 err = duplicate_node (&dest_node_idx, dfa, dest_node_idx,
1321 constraint);
1322 if (BE (err != REG_NOERROR, 0))
1323 return err;
1325 re_node_set_insert (bkref_eclosure, dest_node_idx);
1327 dfa->nodes[idx].opr.ctx_info->bkref_eclosure = bkref_eclosure;
1330 return REG_NOERROR;
1333 /* Calculate epsilon closure of NODE. */
1335 static reg_errcode_t
1336 calc_eclosure_iter (new_set, dfa, node, root)
1337 re_node_set *new_set;
1338 re_dfa_t *dfa;
1339 int node, root;
1341 reg_errcode_t err;
1342 unsigned int constraint;
1343 int i, max, incomplete = 0;
1344 re_node_set eclosure;
1345 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1346 if (BE (err != REG_NOERROR, 0))
1347 return err;
1349 /* This indicates that we are calculating this node now.
1350 We reference this value to avoid infinite loop. */
1351 dfa->eclosures[node].nelem = -1;
1353 constraint = ((dfa->nodes[node].type == ANCHOR)
1354 ? dfa->nodes[node].opr.ctx_type : 0);
1356 /* Expand each epsilon destination nodes. */
1357 if (dfa->edests[node].nelem != 0)
1358 for (i = 0; i < dfa->edests[node].nelem; ++i)
1360 re_node_set eclosure_elem;
1361 int edest = dfa->edests[node].elems[i];
1362 /* If calculating the epsilon closure of `edest' is in progress,
1363 return intermediate result. */
1364 if (dfa->eclosures[edest].nelem == -1)
1366 incomplete = 1;
1367 continue;
1369 /* If we haven't calculated the epsilon closure of `edest' yet,
1370 calculate now. Otherwise use calculated epsilon closure. */
1371 if (dfa->eclosures[edest].nelem == 0)
1373 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1374 if (BE (err != REG_NOERROR, 0))
1375 return err;
1377 else
1378 eclosure_elem = dfa->eclosures[edest];
1379 /* Merge the epsilon closure of `edest'. */
1380 re_node_set_merge (&eclosure, &eclosure_elem);
1381 /* If the epsilon closure of `edest' is incomplete,
1382 the epsilon closure of this node is also incomplete. */
1383 if (dfa->eclosures[edest].nelem == 0)
1385 incomplete = 1;
1386 re_node_set_free (&eclosure_elem);
1390 /* If the current node has constraints, duplicate all non-epsilon nodes.
1391 Since they must inherit the constraints. */
1392 if (constraint)
1393 for (i = 0, max = eclosure.nelem; i < max; ++i)
1395 int dest = eclosure.elems[i];
1396 if (!IS_EPSILON_NODE (dfa->nodes[dest].type))
1398 int dup_dest;
1399 reg_errcode_t err;
1400 err = duplicate_node (&dup_dest, dfa, dest, constraint);
1401 if (BE (err != REG_NOERROR, 0))
1402 return err;
1403 if (dest != dup_dest)
1405 re_node_set_remove_at (&eclosure, i--);
1406 re_node_set_insert (&eclosure, dup_dest);
1407 --max;
1412 /* Epsilon closures include itself. */
1413 re_node_set_insert (&eclosure, node);
1414 if (incomplete && !root)
1415 dfa->eclosures[node].nelem = 0;
1416 else
1417 dfa->eclosures[node] = eclosure;
1418 *new_set = eclosure;
1419 return REG_NOERROR;
1422 /* Functions for token which are used in the parser. */
1424 /* Fetch a token from INPUT.
1425 We must not use this function inside bracket expressions. */
1427 static re_token_t
1428 fetch_token (input, syntax)
1429 re_string_t *input;
1430 reg_syntax_t syntax;
1432 re_token_t token;
1433 int consumed_byte;
1434 consumed_byte = peek_token (&token, input, syntax);
1435 re_string_skip_bytes (input, consumed_byte);
1436 return token;
1439 /* Peek a token from INPUT, and return the length of the token.
1440 We must not use this function inside bracket expressions. */
1442 static int
1443 peek_token (token, input, syntax)
1444 re_token_t *token;
1445 re_string_t *input;
1446 reg_syntax_t syntax;
1448 unsigned char c;
1450 if (re_string_eoi (input))
1452 token->type = END_OF_RE;
1453 return 0;
1456 c = re_string_peek_byte (input, 0);
1457 token->opr.c = c;
1459 #ifdef RE_ENABLE_I18N
1460 token->mb_partial = 0;
1461 if (MB_CUR_MAX > 1 &&
1462 !re_string_first_byte (input, re_string_cur_idx (input)))
1464 token->type = CHARACTER;
1465 token->mb_partial = 1;
1466 return 1;
1468 #endif
1469 if (c == '\\')
1471 unsigned char c2;
1472 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1474 token->type = BACK_SLASH;
1475 return 1;
1478 c2 = re_string_peek_byte_case (input, 1);
1479 token->opr.c = c2;
1480 token->type = CHARACTER;
1481 switch (c2)
1483 case '|':
1484 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1485 token->type = OP_ALT;
1486 break;
1487 case '1': case '2': case '3': case '4': case '5':
1488 case '6': case '7': case '8': case '9':
1489 if (!(syntax & RE_NO_BK_REFS))
1491 token->type = OP_BACK_REF;
1492 token->opr.idx = c2 - '0';
1494 break;
1495 case '<':
1496 if (!(syntax & RE_NO_GNU_OPS))
1498 token->type = ANCHOR;
1499 token->opr.idx = WORD_FIRST;
1501 break;
1502 case '>':
1503 if (!(syntax & RE_NO_GNU_OPS))
1505 token->type = ANCHOR;
1506 token->opr.idx = WORD_LAST;
1508 break;
1509 case 'b':
1510 if (!(syntax & RE_NO_GNU_OPS))
1512 token->type = ANCHOR;
1513 token->opr.idx = WORD_DELIM;
1515 break;
1516 case 'B':
1517 if (!(syntax & RE_NO_GNU_OPS))
1519 token->type = ANCHOR;
1520 token->opr.idx = INSIDE_WORD;
1522 break;
1523 case 'w':
1524 if (!(syntax & RE_NO_GNU_OPS))
1525 token->type = OP_WORD;
1526 break;
1527 case 'W':
1528 if (!(syntax & RE_NO_GNU_OPS))
1529 token->type = OP_NOTWORD;
1530 break;
1531 case '`':
1532 if (!(syntax & RE_NO_GNU_OPS))
1534 token->type = ANCHOR;
1535 token->opr.idx = BUF_FIRST;
1537 break;
1538 case '\'':
1539 if (!(syntax & RE_NO_GNU_OPS))
1541 token->type = ANCHOR;
1542 token->opr.idx = BUF_LAST;
1544 break;
1545 case '(':
1546 if (!(syntax & RE_NO_BK_PARENS))
1547 token->type = OP_OPEN_SUBEXP;
1548 break;
1549 case ')':
1550 if (!(syntax & RE_NO_BK_PARENS))
1551 token->type = OP_CLOSE_SUBEXP;
1552 break;
1553 case '+':
1554 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1555 token->type = OP_DUP_PLUS;
1556 break;
1557 case '?':
1558 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1559 token->type = OP_DUP_QUESTION;
1560 break;
1561 case '{':
1562 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1563 token->type = OP_OPEN_DUP_NUM;
1564 break;
1565 case '}':
1566 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1567 token->type = OP_CLOSE_DUP_NUM;
1568 break;
1569 default:
1570 break;
1572 return 2;
1575 token->type = CHARACTER;
1576 switch (c)
1578 case '\n':
1579 if (syntax & RE_NEWLINE_ALT)
1580 token->type = OP_ALT;
1581 break;
1582 case '|':
1583 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1584 token->type = OP_ALT;
1585 break;
1586 case '*':
1587 token->type = OP_DUP_ASTERISK;
1588 break;
1589 case '+':
1590 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1591 token->type = OP_DUP_PLUS;
1592 break;
1593 case '?':
1594 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1595 token->type = OP_DUP_QUESTION;
1596 break;
1597 case '{':
1598 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1599 token->type = OP_OPEN_DUP_NUM;
1600 break;
1601 case '}':
1602 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1603 token->type = OP_CLOSE_DUP_NUM;
1604 break;
1605 case '(':
1606 if (syntax & RE_NO_BK_PARENS)
1607 token->type = OP_OPEN_SUBEXP;
1608 break;
1609 case ')':
1610 if (syntax & RE_NO_BK_PARENS)
1611 token->type = OP_CLOSE_SUBEXP;
1612 break;
1613 case '[':
1614 token->type = OP_OPEN_BRACKET;
1615 break;
1616 case '.':
1617 token->type = OP_PERIOD;
1618 break;
1619 case '^':
1620 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1621 re_string_cur_idx (input) != 0)
1623 char prev = re_string_peek_byte (input, -1);
1624 if (prev != '|' && prev != '(' &&
1625 (!(syntax & RE_NEWLINE_ALT) || prev != '\n'))
1626 break;
1628 token->type = ANCHOR;
1629 token->opr.idx = LINE_FIRST;
1630 break;
1631 case '$':
1632 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1633 re_string_cur_idx (input) + 1 != re_string_length (input))
1635 re_token_t next;
1636 re_string_skip_bytes (input, 1);
1637 peek_token (&next, input, syntax);
1638 re_string_skip_bytes (input, -1);
1639 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1640 break;
1642 token->type = ANCHOR;
1643 token->opr.idx = LINE_LAST;
1644 break;
1645 default:
1646 break;
1648 return 1;
1651 /* Peek a token from INPUT, and return the length of the token.
1652 We must not use this function out of bracket expressions. */
1654 static int
1655 peek_token_bracket (token, input, syntax)
1656 re_token_t *token;
1657 re_string_t *input;
1658 reg_syntax_t syntax;
1660 unsigned char c;
1661 if (re_string_eoi (input))
1663 token->type = END_OF_RE;
1664 return 0;
1666 c = re_string_peek_byte (input, 0);
1667 token->opr.c = c;
1669 #ifdef RE_ENABLE_I18N
1670 if (MB_CUR_MAX > 1 &&
1671 !re_string_first_byte (input, re_string_cur_idx (input)))
1673 token->type = CHARACTER;
1674 return 1;
1676 #endif /* RE_ENABLE_I18N */
1678 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1680 /* In this case, '\' escape a character. */
1681 unsigned char c2;
1682 c2 = re_string_peek_byte (input, 1);
1683 token->opr.c = c2;
1684 token->type = CHARACTER;
1685 return 1;
1687 if (c == '[') /* '[' is a special char in a bracket exps. */
1689 unsigned char c2;
1690 int token_len;
1691 c2 = re_string_peek_byte (input, 1);
1692 token->opr.c = c2;
1693 token_len = 2;
1694 switch (c2)
1696 case '.':
1697 token->type = OP_OPEN_COLL_ELEM;
1698 break;
1699 case '=':
1700 token->type = OP_OPEN_EQUIV_CLASS;
1701 break;
1702 case ':':
1703 if (syntax & RE_CHAR_CLASSES)
1705 token->type = OP_OPEN_CHAR_CLASS;
1706 break;
1708 /* else fall through. */
1709 default:
1710 token->type = CHARACTER;
1711 token->opr.c = c;
1712 token_len = 1;
1713 break;
1715 return token_len;
1717 switch (c)
1719 case '-':
1720 token->type = OP_CHARSET_RANGE;
1721 break;
1722 case ']':
1723 token->type = OP_CLOSE_BRACKET;
1724 break;
1725 case '^':
1726 token->type = OP_NON_MATCH_LIST;
1727 break;
1728 default:
1729 token->type = CHARACTER;
1731 return 1;
1734 /* Functions for parser. */
1736 /* Entry point of the parser.
1737 Parse the regular expression REGEXP and return the structure tree.
1738 If an error is occured, ERR is set by error code, and return NULL.
1739 This function build the following tree, from regular expression <reg_exp>:
1743 <reg_exp> EOR
1745 CAT means concatenation.
1746 EOR means end of regular expression. */
1748 static bin_tree_t *
1749 parse (regexp, preg, syntax, err)
1750 re_string_t *regexp;
1751 regex_t *preg;
1752 reg_syntax_t syntax;
1753 reg_errcode_t *err;
1755 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1756 bin_tree_t *tree, *eor, *root;
1757 re_token_t current_token;
1758 int new_idx;
1759 current_token = fetch_token (regexp, syntax);
1760 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
1761 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1762 return NULL;
1763 new_idx = re_dfa_add_node (dfa, current_token, 0);
1764 eor = create_tree (NULL, NULL, 0, new_idx);
1765 if (tree != NULL)
1766 root = create_tree (tree, eor, CONCAT, 0);
1767 else
1768 root = eor;
1769 if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1770 return *err = REG_ESPACE, NULL;
1771 return root;
1774 /* This function build the following tree, from regular expression
1775 <branch1>|<branch2>:
1779 <branch1> <branch2>
1781 ALT means alternative, which represents the operator `|'. */
1783 static bin_tree_t *
1784 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1785 re_string_t *regexp;
1786 regex_t *preg;
1787 re_token_t *token;
1788 reg_syntax_t syntax;
1789 int nest;
1790 reg_errcode_t *err;
1792 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1793 bin_tree_t *tree, *branch = NULL;
1794 int new_idx;
1795 tree = parse_branch (regexp, preg, token, syntax, nest, err);
1796 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1797 return NULL;
1799 while (token->type == OP_ALT)
1801 re_token_t alt_token = *token;
1802 new_idx = re_dfa_add_node (dfa, alt_token, 0);
1803 *token = fetch_token (regexp, syntax);
1804 if (token->type != OP_ALT && token->type != END_OF_RE
1805 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1807 branch = parse_branch (regexp, preg, token, syntax, nest, err);
1808 if (BE (*err != REG_NOERROR && branch == NULL, 0))
1810 free_bin_tree (tree);
1811 return NULL;
1814 else
1815 branch = NULL;
1816 tree = create_tree (tree, branch, 0, new_idx);
1817 if (BE (new_idx == -1 || tree == NULL, 0))
1818 return *err = REG_ESPACE, NULL;
1820 return tree;
1823 /* This function build the following tree, from regular expression
1824 <exp1><exp2>:
1828 <exp1> <exp2>
1830 CAT means concatenation. */
1832 static bin_tree_t *
1833 parse_branch (regexp, preg, token, syntax, nest, err)
1834 re_string_t *regexp;
1835 regex_t *preg;
1836 re_token_t *token;
1837 reg_syntax_t syntax;
1838 int nest;
1839 reg_errcode_t *err;
1841 bin_tree_t *tree, *exp;
1842 tree = parse_expression (regexp, preg, token, syntax, nest, err);
1843 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1844 return NULL;
1846 while (token->type != OP_ALT && token->type != END_OF_RE
1847 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1849 exp = parse_expression (regexp, preg, token, syntax, nest, err);
1850 if (BE (*err != REG_NOERROR && exp == NULL, 0))
1852 free_bin_tree (tree);
1853 return NULL;
1855 if (tree != NULL && exp != NULL)
1857 tree = create_tree (tree, exp, CONCAT, 0);
1858 if (tree == NULL)
1859 return *err = REG_ESPACE, NULL;
1861 else if (tree == NULL)
1862 tree = exp;
1863 /* Otherwise exp == NULL, we don't need to create new tree. */
1865 return tree;
1868 /* This function build the following tree, from regular expression a*:
1874 static bin_tree_t *
1875 parse_expression (regexp, preg, token, syntax, nest, err)
1876 re_string_t *regexp;
1877 regex_t *preg;
1878 re_token_t *token;
1879 reg_syntax_t syntax;
1880 int nest;
1881 reg_errcode_t *err;
1883 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1884 bin_tree_t *tree;
1885 int new_idx;
1886 switch (token->type)
1888 case CHARACTER:
1889 new_idx = re_dfa_add_node (dfa, *token, 0);
1890 tree = create_tree (NULL, NULL, 0, new_idx);
1891 if (BE (new_idx == -1 || tree == NULL, 0))
1892 return *err = REG_ESPACE, NULL;
1893 #ifdef RE_ENABLE_I18N
1894 if (MB_CUR_MAX > 1)
1896 while (!re_string_eoi (regexp)
1897 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
1899 bin_tree_t *mbc_remain;
1900 *token = fetch_token (regexp, syntax);
1901 new_idx = re_dfa_add_node (dfa, *token, 0);
1902 mbc_remain = create_tree (NULL, NULL, 0, new_idx);
1903 tree = create_tree (tree, mbc_remain, CONCAT, 0);
1904 if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
1905 return *err = REG_ESPACE, NULL;
1908 #endif
1909 break;
1910 case OP_OPEN_SUBEXP:
1911 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
1912 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1913 return NULL;
1914 break;
1915 case OP_OPEN_BRACKET:
1916 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
1917 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1918 return NULL;
1919 break;
1920 case OP_BACK_REF:
1921 if (BE (preg->re_nsub < token->opr.idx
1922 || dfa->subexps[token->opr.idx - 1].end == -1, 0))
1924 *err = REG_ESUBREG;
1925 return NULL;
1927 new_idx = re_dfa_add_node (dfa, *token, 0);
1928 tree = create_tree (NULL, NULL, 0, new_idx);
1929 if (BE (new_idx == -1 || tree == NULL, 0))
1930 return *err = REG_ESPACE, NULL;
1931 ++dfa->nbackref;
1932 dfa->has_mb_node = 1;
1933 break;
1934 case OP_DUP_ASTERISK:
1935 case OP_DUP_PLUS:
1936 case OP_DUP_QUESTION:
1937 case OP_OPEN_DUP_NUM:
1938 if (syntax & RE_CONTEXT_INVALID_OPS)
1939 return *err = REG_BADRPT, NULL;
1940 else if (syntax & RE_CONTEXT_INDEP_OPS)
1942 *token = fetch_token (regexp, syntax);
1943 return parse_expression (regexp, preg, token, syntax, nest, err);
1945 /* else fall through */
1946 case OP_CLOSE_SUBEXP:
1947 if ((token->type == OP_CLOSE_SUBEXP) &&
1948 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
1949 return *err = REG_ERPAREN, NULL;
1950 /* else fall through */
1951 case OP_CLOSE_DUP_NUM:
1952 /* We treat it as a normal character. */
1954 /* Then we can these characters as normal characters. */
1955 token->type = CHARACTER;
1956 new_idx = re_dfa_add_node (dfa, *token, 0);
1957 tree = create_tree (NULL, NULL, 0, new_idx);
1958 if (BE (new_idx == -1 || tree == NULL, 0))
1959 return *err = REG_ESPACE, NULL;
1960 break;
1961 case ANCHOR:
1962 if (dfa->word_char == NULL)
1964 *err = init_word_char (dfa);
1965 if (BE (*err != REG_NOERROR, 0))
1966 return NULL;
1968 if (token->opr.ctx_type == WORD_DELIM)
1970 bin_tree_t *tree_first, *tree_last;
1971 int idx_first, idx_last;
1972 token->opr.ctx_type = WORD_FIRST;
1973 idx_first = re_dfa_add_node (dfa, *token, 0);
1974 tree_first = create_tree (NULL, NULL, 0, idx_first);
1975 token->opr.ctx_type = WORD_LAST;
1976 idx_last = re_dfa_add_node (dfa, *token, 0);
1977 tree_last = create_tree (NULL, NULL, 0, idx_last);
1978 token->type = OP_ALT;
1979 new_idx = re_dfa_add_node (dfa, *token, 0);
1980 tree = create_tree (tree_first, tree_last, 0, new_idx);
1981 if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
1982 || tree_first == NULL || tree_last == NULL
1983 || tree == NULL, 0))
1984 return *err = REG_ESPACE, NULL;
1986 else
1988 new_idx = re_dfa_add_node (dfa, *token, 0);
1989 tree = create_tree (NULL, NULL, 0, new_idx);
1990 if (BE (new_idx == -1 || tree == NULL, 0))
1991 return *err = REG_ESPACE, NULL;
1993 /* We must return here, since ANCHORs can't be followed
1994 by repetition operators.
1995 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
1996 it must not be "<ANCHOR(^)><REPEAT(*)>". */
1997 *token = fetch_token (regexp, syntax);
1998 return tree;
1999 case OP_PERIOD:
2000 new_idx = re_dfa_add_node (dfa, *token, 0);
2001 tree = create_tree (NULL, NULL, 0, new_idx);
2002 if (BE (new_idx == -1 || tree == NULL, 0))
2003 return *err = REG_ESPACE, NULL;
2004 if (MB_CUR_MAX > 1)
2005 dfa->has_mb_node = 1;
2006 break;
2007 case OP_WORD:
2008 tree = build_word_op (dfa, 0, err);
2009 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2010 return NULL;
2011 break;
2012 case OP_NOTWORD:
2013 tree = build_word_op (dfa, 1, err);
2014 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2015 return NULL;
2016 break;
2017 case OP_ALT:
2018 case END_OF_RE:
2019 return NULL;
2020 case BACK_SLASH:
2021 *err = REG_EESCAPE;
2022 return NULL;
2023 default:
2024 /* Must not happen? */
2025 #ifdef DEBUG
2026 assert (0);
2027 #endif
2028 return NULL;
2030 *token = fetch_token (regexp, syntax);
2032 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2033 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2035 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2036 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2037 return NULL;
2040 return tree;
2043 /* This function build the following tree, from regular expression
2044 (<reg_exp>):
2045 SUBEXP
2047 <reg_exp>
2050 static bin_tree_t *
2051 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2052 re_string_t *regexp;
2053 regex_t *preg;
2054 re_token_t *token;
2055 reg_syntax_t syntax;
2056 int nest;
2057 reg_errcode_t *err;
2059 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2060 bin_tree_t *tree, *left_par, *right_par;
2061 size_t cur_nsub;
2062 int new_idx;
2063 cur_nsub = preg->re_nsub++;
2064 if (dfa->subexps_alloc < preg->re_nsub)
2066 re_subexp_t *new_array;
2067 dfa->subexps_alloc *= 2;
2068 new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2069 if (BE (new_array == NULL, 0))
2071 dfa->subexps_alloc /= 2;
2072 *err = REG_ESPACE;
2073 return NULL;
2075 dfa->subexps = new_array;
2077 dfa->subexps[cur_nsub].start = dfa->nodes_len;
2078 dfa->subexps[cur_nsub].end = -1;
2080 new_idx = re_dfa_add_node (dfa, *token, 0);
2081 left_par = create_tree (NULL, NULL, 0, new_idx);
2082 if (BE (new_idx == -1 || left_par == NULL, 0))
2083 return *err = REG_ESPACE, NULL;
2084 dfa->nodes[new_idx].opr.idx = cur_nsub;
2085 *token = fetch_token (regexp, syntax);
2087 /* The subexpression may be a null string. */
2088 if (token->type == OP_CLOSE_SUBEXP)
2089 tree = NULL;
2090 else
2092 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2093 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2094 return NULL;
2096 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2098 free_bin_tree (tree);
2099 *err = REG_BADPAT;
2100 return NULL;
2102 new_idx = re_dfa_add_node (dfa, *token, 0);
2103 dfa->subexps[cur_nsub].end = dfa->nodes_len;
2104 right_par = create_tree (NULL, NULL, 0, new_idx);
2105 tree = ((tree == NULL) ? right_par
2106 : create_tree (tree, right_par, CONCAT, 0));
2107 tree = create_tree (left_par, tree, CONCAT, 0);
2108 if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2109 return *err = REG_ESPACE, NULL;
2110 dfa->nodes[new_idx].opr.idx = cur_nsub;
2112 return tree;
2115 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2117 static bin_tree_t *
2118 parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2119 bin_tree_t *dup_elem;
2120 re_string_t *regexp;
2121 re_dfa_t *dfa;
2122 re_token_t *token;
2123 reg_syntax_t syntax;
2124 reg_errcode_t *err;
2126 re_token_t dup_token;
2127 bin_tree_t *tree = dup_elem, *work_tree;
2128 int new_idx, start_idx = re_string_cur_idx (regexp);
2129 re_token_t start_token = *token;
2130 if (token->type == OP_OPEN_DUP_NUM)
2132 int i;
2133 int end = 0;
2134 int start = fetch_number (regexp, token, syntax);
2135 bin_tree_t *elem;
2136 if (start == -1)
2138 if (token->type == CHARACTER && token->opr.c == ',')
2139 start = 0; /* We treat "{,m}" as "{0,m}". */
2140 else
2142 *err = REG_BADBR; /* <re>{} is invalid. */
2143 return NULL;
2146 if (BE (start != -2, 1))
2148 /* We treat "{n}" as "{n,n}". */
2149 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2150 : ((token->type == CHARACTER && token->opr.c == ',')
2151 ? fetch_number (regexp, token, syntax) : -2));
2153 if (BE (start == -2 || end == -2, 0))
2155 /* Invalid sequence. */
2156 if (token->type == OP_CLOSE_DUP_NUM)
2157 goto parse_dup_op_invalid_interval;
2158 else
2159 goto parse_dup_op_ebrace;
2161 if (BE (start == 0 && end == 0, 0))
2163 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2164 *token = fetch_token (regexp, syntax);
2165 free_bin_tree (dup_elem);
2166 return NULL;
2169 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2170 elem = tree;
2171 for (i = 0; i < start; ++i)
2172 if (i != 0)
2174 work_tree = duplicate_tree (elem, dfa);
2175 tree = create_tree (tree, work_tree, CONCAT, 0);
2176 if (BE (work_tree == NULL || tree == NULL, 0))
2177 goto parse_dup_op_espace;
2180 if (end == -1)
2182 /* We treat "<re>{0,}" as "<re>*". */
2183 dup_token.type = OP_DUP_ASTERISK;
2184 if (start > 0)
2186 elem = duplicate_tree (elem, dfa);
2187 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2188 work_tree = create_tree (elem, NULL, 0, new_idx);
2189 tree = create_tree (tree, work_tree, CONCAT, 0);
2190 if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2191 || tree == NULL, 0))
2192 goto parse_dup_op_espace;
2194 else
2196 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2197 tree = create_tree (elem, NULL, 0, new_idx);
2198 if (BE (new_idx == -1 || tree == NULL, 0))
2199 goto parse_dup_op_espace;
2202 else if (end - start > 0)
2204 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2205 dup_token.type = OP_DUP_QUESTION;
2206 if (start > 0)
2208 elem = duplicate_tree (elem, dfa);
2209 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2210 elem = create_tree (elem, NULL, 0, new_idx);
2211 tree = create_tree (tree, elem, CONCAT, 0);
2212 if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2213 goto parse_dup_op_espace;
2215 else
2217 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2218 tree = elem = create_tree (elem, NULL, 0, new_idx);
2219 if (BE (new_idx == -1 || tree == NULL, 0))
2220 goto parse_dup_op_espace;
2222 for (i = 1; i < end - start; ++i)
2224 work_tree = duplicate_tree (elem, dfa);
2225 tree = create_tree (tree, work_tree, CONCAT, 0);
2226 if (BE (work_tree == NULL || tree == NULL, 0))
2227 return *err = REG_ESPACE, NULL;
2231 else
2233 new_idx = re_dfa_add_node (dfa, *token, 0);
2234 tree = create_tree (tree, NULL, 0, new_idx);
2235 if (BE (new_idx == -1 || tree == NULL, 0))
2236 return *err = REG_ESPACE, NULL;
2238 *token = fetch_token (regexp, syntax);
2239 return tree;
2241 parse_dup_op_espace:
2242 free_bin_tree (tree);
2243 *err = REG_ESPACE;
2244 return NULL;
2246 parse_dup_op_ebrace:
2247 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2249 *err = REG_EBRACE;
2250 return NULL;
2252 goto parse_dup_op_rollback;
2253 parse_dup_op_invalid_interval:
2254 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2256 *err = REG_BADBR;
2257 return NULL;
2259 parse_dup_op_rollback:
2260 re_string_set_index (regexp, start_idx);
2261 *token = start_token;
2262 token->type = CHARACTER;
2263 return dup_elem;
2266 /* Size of the names for collating symbol/equivalence_class/character_class.
2267 I'm not sure, but maybe enough. */
2268 #define BRACKET_NAME_BUF_SIZE 32
2270 #ifndef _LIBC
2271 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2272 Build the range expression which starts from START_ELEM, and ends
2273 at END_ELEM. The result are written to MBCSET and SBCSET.
2274 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2275 mbcset->range_ends, is a pointer argument sinse we may
2276 update it. */
2278 static reg_errcode_t
2279 # ifdef RE_ENABLE_I18N
2280 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2281 re_charset_t *mbcset;
2282 int *range_alloc;
2283 # else /* not RE_ENABLE_I18N */
2284 build_range_exp (sbcset, start_elem, end_elem)
2285 # endif /* not RE_ENABLE_I18N */
2286 re_bitset_ptr_t sbcset;
2287 bracket_elem_t *start_elem, *end_elem;
2289 unsigned int start_ch, end_ch;
2290 /* Equivalence Classes and Character Classes can't be a range start/end. */
2291 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2292 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2294 return REG_ERANGE;
2296 /* We can handle no multi character collating elements without libc
2297 support. */
2298 if (BE ((start_elem->type == COLL_SYM
2299 && strlen ((char *) start_elem->opr.name) > 1)
2300 || (end_elem->type == COLL_SYM
2301 && strlen ((char *) end_elem->opr.name) > 1), 0))
2302 return REG_ECOLLATE;
2304 # ifdef RE_ENABLE_I18N
2306 wchar_t wc, start_wc, end_wc;
2307 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2309 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2310 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2311 : 0));
2312 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2313 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2314 : 0));
2315 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2316 ? __btowc (start_ch) : start_elem->opr.wch);
2317 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2318 ? __btowc (end_ch) : end_elem->opr.wch);
2319 cmp_buf[0] = start_wc;
2320 cmp_buf[4] = end_wc;
2321 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2322 return REG_ERANGE;
2324 /* Check the space of the arrays. */
2325 if (*range_alloc == mbcset->nranges)
2327 /* There are not enough space, need realloc. */
2328 wchar_t *new_array_start, *new_array_end;
2329 int new_nranges;
2331 /* +1 in case of mbcset->nranges is 0. */
2332 new_nranges = 2 * mbcset->nranges + 1;
2333 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2334 are NULL if *range_alloc == 0. */
2335 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2336 new_nranges);
2337 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2338 new_nranges);
2340 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2341 return REG_ESPACE;
2343 mbcset->range_starts = new_array_start;
2344 mbcset->range_ends = new_array_end;
2345 *range_alloc = new_nranges;
2348 mbcset->range_starts[mbcset->nranges] = start_wc;
2349 mbcset->range_ends[mbcset->nranges++] = end_wc;
2351 /* Build the table for single byte characters. */
2352 for (wc = 0; wc <= SBC_MAX; ++wc)
2354 cmp_buf[2] = wc;
2355 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2356 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2357 bitset_set (sbcset, wc);
2360 # else /* not RE_ENABLE_I18N */
2362 unsigned int ch;
2363 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2364 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2365 : 0));
2366 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2367 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2368 : 0));
2369 if (start_ch > end_ch)
2370 return REG_ERANGE;
2371 /* Build the table for single byte characters. */
2372 for (ch = 0; ch <= SBC_MAX; ++ch)
2373 if (start_ch <= ch && ch <= end_ch)
2374 bitset_set (sbcset, ch);
2376 # endif /* not RE_ENABLE_I18N */
2377 return REG_NOERROR;
2379 #endif /* not _LIBC */
2381 #ifndef _LIBC
2382 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2383 Build the collating element which is represented by NAME.
2384 The result are written to MBCSET and SBCSET.
2385 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2386 pointer argument since we may update it. */
2388 static reg_errcode_t
2389 # ifdef RE_ENABLE_I18N
2390 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2391 re_charset_t *mbcset;
2392 int *coll_sym_alloc;
2393 # else /* not RE_ENABLE_I18N */
2394 build_collating_symbol (sbcset, name)
2395 # endif /* not RE_ENABLE_I18N */
2396 re_bitset_ptr_t sbcset;
2397 const unsigned char *name;
2399 size_t name_len = strlen ((const char *) name);
2400 if (BE (name_len != 1, 0))
2401 return REG_ECOLLATE;
2402 else
2404 bitset_set (sbcset, name[0]);
2405 return REG_NOERROR;
2408 #endif /* not _LIBC */
2410 /* This function parse bracket expression like "[abc]", "[a-c]",
2411 "[[.a-a.]]" etc. */
2413 static bin_tree_t *
2414 parse_bracket_exp (regexp, dfa, token, syntax, err)
2415 re_string_t *regexp;
2416 re_dfa_t *dfa;
2417 re_token_t *token;
2418 reg_syntax_t syntax;
2419 reg_errcode_t *err;
2421 #ifdef _LIBC
2422 const unsigned char *collseqmb;
2423 const char *collseqwc;
2424 uint32_t nrules;
2425 int32_t table_size;
2426 const int32_t *symb_table;
2427 const unsigned char *extra;
2429 /* Local function for parse_bracket_exp used in _LIBC environement.
2430 Seek the collating symbol entry correspondings to NAME.
2431 Return the index of the symbol in the SYMB_TABLE. */
2433 static inline int32_t
2434 seek_collating_symbol_entry (name, name_len)
2435 const unsigned char *name;
2436 size_t name_len;
2438 int32_t hash = elem_hash ((const char *) name, name_len);
2439 int32_t elem = hash % table_size;
2440 int32_t second = hash % (table_size - 2);
2441 while (symb_table[2 * elem] != 0)
2443 /* First compare the hashing value. */
2444 if (symb_table[2 * elem] == hash
2445 /* Compare the length of the name. */
2446 && name_len == extra[symb_table[2 * elem + 1]]
2447 /* Compare the name. */
2448 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2449 name_len) == 0)
2451 /* Yep, this is the entry. */
2452 break;
2455 /* Next entry. */
2456 elem += second;
2458 return elem;
2461 /* Local function for parse_bracket_exp used in _LIBC environement.
2462 Look up the collation sequence value of BR_ELEM.
2463 Return the value if succeeded, UINT_MAX otherwise. */
2465 static inline unsigned int
2466 lookup_collation_sequence_value (br_elem)
2467 bracket_elem_t *br_elem;
2469 if (br_elem->type == SB_CHAR)
2472 if (MB_CUR_MAX == 1)
2474 if (nrules == 0)
2475 return collseqmb[br_elem->opr.ch];
2476 else
2478 wint_t wc = __btowc (br_elem->opr.ch);
2479 return collseq_table_lookup (collseqwc, wc);
2482 else if (br_elem->type == MB_CHAR)
2484 return collseq_table_lookup (collseqwc, br_elem->opr.wch);
2486 else if (br_elem->type == COLL_SYM)
2488 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2489 if (nrules != 0)
2491 int32_t elem, idx;
2492 elem = seek_collating_symbol_entry (br_elem->opr.name,
2493 sym_name_len);
2494 if (symb_table[2 * elem] != 0)
2496 /* We found the entry. */
2497 idx = symb_table[2 * elem + 1];
2498 /* Skip the name of collating element name. */
2499 idx += 1 + extra[idx];
2500 /* Skip the byte sequence of the collating element. */
2501 idx += 1 + extra[idx];
2502 /* Adjust for the alignment. */
2503 idx = (idx + 3) & ~3;
2504 /* Skip the multibyte collation sequence value. */
2505 idx += sizeof (unsigned int);
2506 /* Skip the wide char sequence of the collating element. */
2507 idx += sizeof (unsigned int) *
2508 (1 + *(unsigned int *) (extra + idx));
2509 /* Return the collation sequence value. */
2510 return *(unsigned int *) (extra + idx);
2512 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2514 /* No valid character. Match it as a single byte
2515 character. */
2516 return collseqmb[br_elem->opr.name[0]];
2519 else if (sym_name_len == 1)
2520 return collseqmb[br_elem->opr.name[0]];
2522 return UINT_MAX;
2525 /* Local function for parse_bracket_exp used in _LIBC environement.
2526 Build the range expression which starts from START_ELEM, and ends
2527 at END_ELEM. The result are written to MBCSET and SBCSET.
2528 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2529 mbcset->range_ends, is a pointer argument sinse we may
2530 update it. */
2532 static inline reg_errcode_t
2533 # ifdef RE_ENABLE_I18N
2534 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2535 re_charset_t *mbcset;
2536 int *range_alloc;
2537 # else /* not RE_ENABLE_I18N */
2538 build_range_exp (sbcset, start_elem, end_elem)
2539 # endif /* not RE_ENABLE_I18N */
2540 re_bitset_ptr_t sbcset;
2541 bracket_elem_t *start_elem, *end_elem;
2543 unsigned int ch;
2544 uint32_t start_collseq;
2545 uint32_t end_collseq;
2547 # ifdef RE_ENABLE_I18N
2548 /* Check the space of the arrays. */
2549 if (*range_alloc == mbcset->nranges)
2551 /* There are not enough space, need realloc. */
2552 uint32_t *new_array_start;
2553 uint32_t *new_array_end;
2554 int new_nranges;
2556 /* +1 in case of mbcset->nranges is 0. */
2557 new_nranges = 2 * mbcset->nranges + 1;
2558 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2559 are NULL if *range_alloc == 0. */
2560 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2561 new_nranges);
2562 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2563 new_nranges);
2565 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2566 return REG_ESPACE;
2568 mbcset->range_starts = new_array_start;
2569 mbcset->range_ends = new_array_end;
2570 *range_alloc = new_nranges;
2572 # endif /* RE_ENABLE_I18N */
2574 /* Equivalence Classes and Character Classes can't be a range
2575 start/end. */
2576 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2577 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2579 return REG_ERANGE;
2581 start_collseq = lookup_collation_sequence_value (start_elem);
2582 end_collseq = lookup_collation_sequence_value (end_elem);
2583 /* Check start/end collation sequence values. */
2584 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2585 return REG_ECOLLATE;
2586 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2587 return REG_ERANGE;
2589 # ifdef RE_ENABLE_I18N
2590 /* Got valid collation sequence values, add them as a new entry. */
2591 mbcset->range_starts[mbcset->nranges] = start_collseq;
2592 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2593 # endif /* RE_ENABLE_I18N */
2595 /* Build the table for single byte characters. */
2596 for (ch = 0; ch <= SBC_MAX; ch++)
2598 uint32_t ch_collseq;
2600 if (MB_CUR_MAX == 1)
2602 if (nrules == 0)
2603 ch_collseq = collseqmb[ch];
2604 else
2605 ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch));
2606 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2607 bitset_set (sbcset, ch);
2609 return REG_NOERROR;
2612 /* Local function for parse_bracket_exp used in _LIBC environement.
2613 Build the collating element which is represented by NAME.
2614 The result are written to MBCSET and SBCSET.
2615 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2616 pointer argument sinse we may update it. */
2618 static inline reg_errcode_t
2619 # ifdef RE_ENABLE_I18N
2620 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2621 re_charset_t *mbcset;
2622 int *coll_sym_alloc;
2623 # else /* not RE_ENABLE_I18N */
2624 build_collating_symbol (sbcset, name)
2625 # endif /* not RE_ENABLE_I18N */
2626 re_bitset_ptr_t sbcset;
2627 const unsigned char *name;
2629 int32_t elem, idx;
2630 size_t name_len = strlen ((const char *) name);
2631 if (nrules != 0)
2633 elem = seek_collating_symbol_entry (name, name_len);
2634 if (symb_table[2 * elem] != 0)
2636 /* We found the entry. */
2637 idx = symb_table[2 * elem + 1];
2638 /* Skip the name of collating element name. */
2639 idx += 1 + extra[idx];
2641 else if (symb_table[2 * elem] == 0 && name_len == 1)
2643 /* No valid character, treat it as a normal
2644 character. */
2645 bitset_set (sbcset, name[0]);
2646 return REG_NOERROR;
2648 else
2649 return REG_ECOLLATE;
2651 # ifdef RE_ENABLE_I18N
2652 /* Got valid collation sequence, add it as a new entry. */
2653 /* Check the space of the arrays. */
2654 if (*coll_sym_alloc == mbcset->ncoll_syms)
2656 /* Not enough, realloc it. */
2657 /* +1 in case of mbcset->ncoll_syms is 0. */
2658 *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2659 /* Use realloc since mbcset->coll_syms is NULL
2660 if *alloc == 0. */
2661 mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2662 *coll_sym_alloc);
2663 if (BE (mbcset->coll_syms == NULL, 0))
2664 return REG_ESPACE;
2666 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2667 # endif /* RE_ENABLE_I18N */
2668 return REG_NOERROR;
2670 else
2672 if (BE (name_len != 1, 0))
2673 return REG_ECOLLATE;
2674 else
2676 bitset_set (sbcset, name[0]);
2677 return REG_NOERROR;
2681 #endif
2683 re_token_t br_token;
2684 re_bitset_ptr_t sbcset;
2685 #ifdef RE_ENABLE_I18N
2686 re_charset_t *mbcset;
2687 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2688 int equiv_class_alloc = 0, char_class_alloc = 0;
2689 #else /* not RE_ENABLE_I18N */
2690 int non_match = 0;
2691 #endif /* not RE_ENABLE_I18N */
2692 bin_tree_t *work_tree;
2693 int token_len, new_idx;
2694 #ifdef _LIBC
2695 collseqmb = (const unsigned char *)
2696 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2697 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2698 if (nrules)
2701 if (MB_CUR_MAX > 1)
2703 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2704 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2705 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2706 _NL_COLLATE_SYMB_TABLEMB);
2707 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2708 _NL_COLLATE_SYMB_EXTRAMB);
2710 #endif
2711 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2712 #ifdef RE_ENABLE_I18N
2713 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2714 #endif /* RE_ENABLE_I18N */
2715 #ifdef RE_ENABLE_I18N
2716 if (BE (sbcset == NULL || mbcset == NULL, 0))
2717 #else
2718 if (BE (sbcset == NULL, 0))
2719 #endif /* RE_ENABLE_I18N */
2721 *err = REG_ESPACE;
2722 return NULL;
2725 token_len = peek_token_bracket (token, regexp, syntax);
2726 if (BE (token->type == END_OF_RE, 0))
2728 *err = REG_BADPAT;
2729 goto parse_bracket_exp_free_return;
2731 if (token->type == OP_NON_MATCH_LIST)
2733 #ifdef RE_ENABLE_I18N
2734 int i;
2735 mbcset->non_match = 1;
2736 #else /* not RE_ENABLE_I18N */
2737 non_match = 1;
2738 #endif /* not RE_ENABLE_I18N */
2739 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2740 bitset_set (sbcset, '\0');
2741 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2742 token_len = peek_token_bracket (token, regexp, syntax);
2743 if (BE (token->type == END_OF_RE, 0))
2745 *err = REG_BADPAT;
2746 goto parse_bracket_exp_free_return;
2748 #ifdef RE_ENABLE_I18N
2749 if (MB_CUR_MAX > 1)
2750 for (i = 0; i < SBC_MAX; ++i)
2751 if (__btowc (i) == WEOF)
2752 bitset_set (sbcset, i);
2753 #endif /* RE_ENABLE_I18N */
2756 /* We treat the first ']' as a normal character. */
2757 if (token->type == OP_CLOSE_BRACKET)
2758 token->type = CHARACTER;
2760 while (1)
2762 bracket_elem_t start_elem, end_elem;
2763 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2764 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
2765 reg_errcode_t ret;
2766 int token_len2 = 0, is_range_exp = 0;
2767 re_token_t token2;
2769 start_elem.opr.name = start_name_buf;
2770 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
2771 syntax);
2772 if (BE (ret != REG_NOERROR, 0))
2774 *err = ret;
2775 goto parse_bracket_exp_free_return;
2778 token_len = peek_token_bracket (token, regexp, syntax);
2779 if (BE (token->type == END_OF_RE, 0))
2781 *err = REG_BADPAT;
2782 goto parse_bracket_exp_free_return;
2784 if (token->type == OP_CHARSET_RANGE)
2786 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
2787 token_len2 = peek_token_bracket (&token2, regexp, syntax);
2788 if (BE (token->type == END_OF_RE, 0))
2790 *err = REG_BADPAT;
2791 goto parse_bracket_exp_free_return;
2793 if (token2.type == OP_CLOSE_BRACKET)
2795 /* We treat the last '-' as a normal character. */
2796 re_string_skip_bytes (regexp, -token_len);
2797 token->type = CHARACTER;
2799 else
2800 is_range_exp = 1;
2803 if (is_range_exp == 1)
2805 end_elem.opr.name = end_name_buf;
2806 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
2807 dfa, syntax);
2808 if (BE (ret != REG_NOERROR, 0))
2810 *err = ret;
2811 goto parse_bracket_exp_free_return;
2814 token_len = peek_token_bracket (token, regexp, syntax);
2815 if (BE (token->type == END_OF_RE, 0))
2817 *err = REG_BADPAT;
2818 goto parse_bracket_exp_free_return;
2820 *err = build_range_exp (sbcset,
2821 #ifdef RE_ENABLE_I18N
2822 mbcset, &range_alloc,
2823 #endif /* RE_ENABLE_I18N */
2824 &start_elem, &end_elem);
2825 if (BE (*err != REG_NOERROR, 0))
2826 goto parse_bracket_exp_free_return;
2828 else
2830 switch (start_elem.type)
2832 case SB_CHAR:
2833 bitset_set (sbcset, start_elem.opr.ch);
2834 break;
2835 #ifdef RE_ENABLE_I18N
2836 case MB_CHAR:
2837 /* Check whether the array has enough space. */
2838 if (mbchar_alloc == mbcset->nmbchars)
2840 /* Not enough, realloc it. */
2841 /* +1 in case of mbcset->nmbchars is 0. */
2842 mbchar_alloc = 2 * mbcset->nmbchars + 1;
2843 /* Use realloc since array is NULL if *alloc == 0. */
2844 mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
2845 mbchar_alloc);
2846 if (BE (mbcset->mbchars == NULL, 0))
2847 goto parse_bracket_exp_espace;
2849 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
2850 break;
2851 #endif /* RE_ENABLE_I18N */
2852 case EQUIV_CLASS:
2853 *err = build_equiv_class (sbcset,
2854 #ifdef RE_ENABLE_I18N
2855 mbcset, &equiv_class_alloc,
2856 #endif /* RE_ENABLE_I18N */
2857 start_elem.opr.name);
2858 if (BE (*err != REG_NOERROR, 0))
2859 goto parse_bracket_exp_free_return;
2860 break;
2861 case COLL_SYM:
2862 *err = build_collating_symbol (sbcset,
2863 #ifdef RE_ENABLE_I18N
2864 mbcset, &coll_sym_alloc,
2865 #endif /* RE_ENABLE_I18N */
2866 start_elem.opr.name);
2867 if (BE (*err != REG_NOERROR, 0))
2868 goto parse_bracket_exp_free_return;
2869 break;
2870 case CHAR_CLASS:
2871 ret = build_charclass (sbcset,
2872 #ifdef RE_ENABLE_I18N
2873 mbcset, &char_class_alloc,
2874 #endif /* RE_ENABLE_I18N */
2875 start_elem.opr.name, syntax);
2876 if (BE (ret != REG_NOERROR, 0))
2877 goto parse_bracket_exp_espace;
2878 break;
2879 default:
2880 assert (0);
2881 break;
2884 if (token->type == OP_CLOSE_BRACKET)
2885 break;
2888 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2890 /* If it is non-matching list. */
2891 #ifdef RE_ENABLE_I18N
2892 if (mbcset->non_match)
2893 #else /* not RE_ENABLE_I18N */
2894 if (non_match)
2895 #endif /* not RE_ENABLE_I18N */
2896 bitset_not (sbcset);
2898 /* Build a tree for simple bracket. */
2899 br_token.type = SIMPLE_BRACKET;
2900 br_token.opr.sbcset = sbcset;
2901 new_idx = re_dfa_add_node (dfa, br_token, 0);
2902 work_tree = create_tree (NULL, NULL, 0, new_idx);
2903 if (BE (new_idx == -1 || work_tree == NULL, 0))
2904 goto parse_bracket_exp_espace;
2906 #ifdef RE_ENABLE_I18N
2907 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
2908 || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
2909 || mbcset->non_match)))
2911 re_token_t alt_token;
2912 bin_tree_t *mbc_tree;
2913 /* Build a tree for complex bracket. */
2914 br_token.type = COMPLEX_BRACKET;
2915 br_token.opr.mbcset = mbcset;
2916 dfa->has_mb_node = 1;
2917 new_idx = re_dfa_add_node (dfa, br_token, 0);
2918 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
2919 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
2920 goto parse_bracket_exp_espace;
2921 /* Then join them by ALT node. */
2922 alt_token.type = OP_ALT;
2923 new_idx = re_dfa_add_node (dfa, alt_token, 0);
2924 work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
2925 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
2926 return work_tree;
2928 else
2930 free_charset (mbcset);
2931 return work_tree;
2933 #else /* not RE_ENABLE_I18N */
2934 return work_tree;
2935 #endif /* not RE_ENABLE_I18N */
2937 parse_bracket_exp_espace:
2938 *err = REG_ESPACE;
2939 parse_bracket_exp_free_return:
2940 re_free (sbcset);
2941 #ifdef RE_ENABLE_I18N
2942 free_charset (mbcset);
2943 #endif /* RE_ENABLE_I18N */
2944 return NULL;
2947 /* Parse an element in the bracket expression. */
2949 static reg_errcode_t
2950 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
2951 bracket_elem_t *elem;
2952 re_string_t *regexp;
2953 re_token_t *token;
2954 int token_len;
2955 re_dfa_t *dfa;
2956 reg_syntax_t syntax;
2958 #ifdef RE_ENABLE_I18N
2959 int cur_char_size;
2960 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
2961 if (cur_char_size > 1)
2963 elem->type = MB_CHAR;
2964 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
2965 re_string_skip_bytes (regexp, cur_char_size);
2966 return REG_NOERROR;
2968 #endif /* RE_ENABLE_I18N */
2969 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2970 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
2971 || token->type == OP_OPEN_EQUIV_CLASS)
2972 return parse_bracket_symbol (elem, regexp, token);
2973 elem->type = SB_CHAR;
2974 elem->opr.ch = token->opr.c;
2975 return REG_NOERROR;
2978 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
2979 such as [:<character_class>:], [.<collating_element>.], and
2980 [=<equivalent_class>=]. */
2982 static reg_errcode_t
2983 parse_bracket_symbol (elem, regexp, token)
2984 bracket_elem_t *elem;
2985 re_string_t *regexp;
2986 re_token_t *token;
2988 unsigned char ch, delim = token->opr.c;
2989 int i = 0;
2990 for (;; ++i)
2992 if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
2993 return REG_EBRACK;
2994 if (token->type == OP_OPEN_CHAR_CLASS)
2995 ch = re_string_fetch_byte_case (regexp);
2996 else
2997 ch = re_string_fetch_byte (regexp);
2998 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
2999 break;
3000 elem->opr.name[i] = ch;
3002 re_string_skip_bytes (regexp, 1);
3003 elem->opr.name[i] = '\0';
3004 switch (token->type)
3006 case OP_OPEN_COLL_ELEM:
3007 elem->type = COLL_SYM;
3008 break;
3009 case OP_OPEN_EQUIV_CLASS:
3010 elem->type = EQUIV_CLASS;
3011 break;
3012 case OP_OPEN_CHAR_CLASS:
3013 elem->type = CHAR_CLASS;
3014 break;
3015 default:
3016 break;
3018 return REG_NOERROR;
3021 /* Helper function for parse_bracket_exp.
3022 Build the equivalence class which is represented by NAME.
3023 The result are written to MBCSET and SBCSET.
3024 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3025 is a pointer argument sinse we may update it. */
3027 static reg_errcode_t
3028 #ifdef RE_ENABLE_I18N
3029 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3030 re_charset_t *mbcset;
3031 int *equiv_class_alloc;
3032 #else /* not RE_ENABLE_I18N */
3033 build_equiv_class (sbcset, name)
3034 #endif /* not RE_ENABLE_I18N */
3035 re_bitset_ptr_t sbcset;
3036 const unsigned char *name;
3038 #if defined _LIBC && defined RE_ENABLE_I18N
3039 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3040 if (nrules != 0)
3042 const int32_t *table, *indirect;
3043 const unsigned char *weights, *extra, *cp;
3044 unsigned char char_buf[2];
3045 int32_t idx1, idx2;
3046 unsigned int ch;
3047 size_t len;
3048 /* This #include defines a local function! */
3049 # include <locale/weight.h>
3050 /* Calculate the index for equivalence class. */
3051 cp = name;
3052 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3053 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3054 _NL_COLLATE_WEIGHTMB);
3055 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3056 _NL_COLLATE_EXTRAMB);
3057 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3058 _NL_COLLATE_INDIRECTMB);
3059 idx1 = findidx (&cp);
3060 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3061 /* This isn't a valid character. */
3062 return REG_ECOLLATE;
3064 /* Build single byte matcing table for this equivalence class. */
3065 char_buf[1] = (unsigned char) '\0';
3066 len = weights[idx1];
3067 for (ch = 0; ch < SBC_MAX; ++ch)
3069 char_buf[0] = ch;
3070 cp = char_buf;
3071 idx2 = findidx (&cp);
3073 idx2 = table[ch];
3075 if (idx2 == 0)
3076 /* This isn't a valid character. */
3077 continue;
3078 if (len == weights[idx2])
3080 int cnt = 0;
3081 while (cnt <= len &&
3082 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3083 ++cnt;
3085 if (cnt > len)
3086 bitset_set (sbcset, ch);
3089 /* Check whether the array has enough space. */
3090 if (*equiv_class_alloc == mbcset->nequiv_classes)
3092 /* Not enough, realloc it. */
3093 /* +1 in case of mbcset->nequiv_classes is 0. */
3094 *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3095 /* Use realloc since the array is NULL if *alloc == 0. */
3096 mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3097 *equiv_class_alloc);
3098 if (BE (mbcset->equiv_classes == NULL, 0))
3099 return REG_ESPACE;
3101 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3103 else
3104 #endif /* _LIBC && RE_ENABLE_I18N */
3106 if (BE (strlen ((const char *) name) != 1, 0))
3107 return REG_ECOLLATE;
3108 bitset_set (sbcset, *name);
3110 return REG_NOERROR;
3113 /* Helper function for parse_bracket_exp.
3114 Build the character class which is represented by NAME.
3115 The result are written to MBCSET and SBCSET.
3116 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3117 is a pointer argument sinse we may update it. */
3119 static reg_errcode_t
3120 #ifdef RE_ENABLE_I18N
3121 build_charclass (sbcset, mbcset, char_class_alloc, class_name, syntax)
3122 re_charset_t *mbcset;
3123 int *char_class_alloc;
3124 #else /* not RE_ENABLE_I18N */
3125 build_charclass (sbcset, class_name, syntax)
3126 #endif /* not RE_ENABLE_I18N */
3127 re_bitset_ptr_t sbcset;
3128 const unsigned char *class_name;
3129 reg_syntax_t syntax;
3131 int i;
3132 const char *name = (const char *) class_name;
3134 /* In case of REG_ICASE "upper" and "lower" match the both of
3135 upper and lower cases. */
3136 if ((syntax & RE_ICASE)
3137 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3138 name = "alpha";
3140 #ifdef RE_ENABLE_I18N
3141 /* Check the space of the arrays. */
3142 if (*char_class_alloc == mbcset->nchar_classes)
3144 /* Not enough, realloc it. */
3145 /* +1 in case of mbcset->nchar_classes is 0. */
3146 *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3147 /* Use realloc since array is NULL if *alloc == 0. */
3148 mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3149 *char_class_alloc);
3150 if (BE (mbcset->char_classes == NULL, 0))
3151 return REG_ESPACE;
3153 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3154 #endif /* RE_ENABLE_I18N */
3156 #define BUILD_CHARCLASS_LOOP(ctype_func)\
3157 for (i = 0; i < SBC_MAX; ++i) \
3159 if (ctype_func (i)) \
3160 bitset_set (sbcset, i); \
3163 if (strcmp (name, "alnum") == 0)
3164 BUILD_CHARCLASS_LOOP (isalnum)
3165 else if (strcmp (name, "cntrl") == 0)
3166 BUILD_CHARCLASS_LOOP (iscntrl)
3167 else if (strcmp (name, "lower") == 0)
3168 BUILD_CHARCLASS_LOOP (islower)
3169 else if (strcmp (name, "space") == 0)
3170 BUILD_CHARCLASS_LOOP (isspace)
3171 else if (strcmp (name, "alpha") == 0)
3172 BUILD_CHARCLASS_LOOP (isalpha)
3173 else if (strcmp (name, "digit") == 0)
3174 BUILD_CHARCLASS_LOOP (isdigit)
3175 else if (strcmp (name, "print") == 0)
3176 BUILD_CHARCLASS_LOOP (isprint)
3177 else if (strcmp (name, "upper") == 0)
3178 BUILD_CHARCLASS_LOOP (isupper)
3179 else if (strcmp (name, "blank") == 0)
3180 BUILD_CHARCLASS_LOOP (isblank)
3181 else if (strcmp (name, "graph") == 0)
3182 BUILD_CHARCLASS_LOOP (isgraph)
3183 else if (strcmp (name, "punct") == 0)
3184 BUILD_CHARCLASS_LOOP (ispunct)
3185 else if (strcmp (name, "xdigit") == 0)
3186 BUILD_CHARCLASS_LOOP (isxdigit)
3187 else
3188 return REG_ECTYPE;
3190 return REG_NOERROR;
3193 static bin_tree_t *
3194 build_word_op (dfa, not, err)
3195 re_dfa_t *dfa;
3196 int not;
3197 reg_errcode_t *err;
3199 re_bitset_ptr_t sbcset;
3200 #ifdef RE_ENABLE_I18N
3201 re_charset_t *mbcset;
3202 int alloc = 0;
3203 #else /* not RE_ENABLE_I18N */
3204 int non_match = 0;
3205 #endif /* not RE_ENABLE_I18N */
3206 reg_errcode_t ret;
3207 re_token_t br_token;
3208 bin_tree_t *tree;
3209 int new_idx;
3211 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3212 #ifdef RE_ENABLE_I18N
3213 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3214 #endif /* RE_ENABLE_I18N */
3216 #ifdef RE_ENABLE_I18N
3217 if (BE (sbcset == NULL || mbcset == NULL, 0))
3218 #else /* not RE_ENABLE_I18N */
3219 if (BE (sbcset == NULL, 0))
3220 #endif /* not RE_ENABLE_I18N */
3222 *err = REG_ESPACE;
3223 return NULL;
3226 if (not)
3228 #ifdef RE_ENABLE_I18N
3229 int i;
3231 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3232 bitset_set(cset->sbcset, '\0');
3234 mbcset->non_match = 1;
3235 if (MB_CUR_MAX > 1)
3236 for (i = 0; i < SBC_MAX; ++i)
3237 if (__btowc (i) == WEOF)
3238 bitset_set (sbcset, i);
3239 #else /* not RE_ENABLE_I18N */
3240 non_match = 1;
3241 #endif /* not RE_ENABLE_I18N */
3244 /* We don't care the syntax in this case. */
3245 ret = build_charclass (sbcset,
3246 #ifdef RE_ENABLE_I18N
3247 mbcset, &alloc,
3248 #endif /* RE_ENABLE_I18N */
3249 (const unsigned char *) "alpha", 0);
3251 if (BE (ret != REG_NOERROR, 0))
3253 re_free (sbcset);
3254 #ifdef RE_ENABLE_I18N
3255 free_charset (mbcset);
3256 #endif /* RE_ENABLE_I18N */
3257 *err = REG_ESPACE;
3258 return NULL;
3260 /* \w match '_' also. */
3261 bitset_set (sbcset, '_');
3263 /* If it is non-matching list. */
3264 #ifdef RE_ENABLE_I18N
3265 if (mbcset->non_match)
3266 #else /* not RE_ENABLE_I18N */
3267 if (non_match)
3268 #endif /* not RE_ENABLE_I18N */
3269 bitset_not (sbcset);
3271 /* Build a tree for simple bracket. */
3272 br_token.type = SIMPLE_BRACKET;
3273 br_token.opr.sbcset = sbcset;
3274 new_idx = re_dfa_add_node (dfa, br_token, 0);
3275 tree = create_tree (NULL, NULL, 0, new_idx);
3276 if (BE (new_idx == -1 || tree == NULL, 0))
3277 goto build_word_op_espace;
3279 #ifdef RE_ENABLE_I18N
3280 if (MB_CUR_MAX > 1)
3282 re_token_t alt_token;
3283 bin_tree_t *mbc_tree;
3284 /* Build a tree for complex bracket. */
3285 br_token.type = COMPLEX_BRACKET;
3286 br_token.opr.mbcset = mbcset;
3287 dfa->has_mb_node = 1;
3288 new_idx = re_dfa_add_node (dfa, br_token, 0);
3289 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3290 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3291 goto build_word_op_espace;
3292 /* Then join them by ALT node. */
3293 alt_token.type = OP_ALT;
3294 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3295 tree = create_tree (tree, mbc_tree, 0, new_idx);
3296 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3297 return tree;
3299 else
3301 free_charset (mbcset);
3302 return tree;
3304 #else /* not RE_ENABLE_I18N */
3305 return tree;
3306 #endif /* not RE_ENABLE_I18N */
3308 build_word_op_espace:
3309 re_free (sbcset);
3310 #ifdef RE_ENABLE_I18N
3311 free_charset (mbcset);
3312 #endif /* RE_ENABLE_I18N */
3313 *err = REG_ESPACE;
3314 return NULL;
3317 /* This is intended for the expressions like "a{1,3}".
3318 Fetch a number from `input', and return the number.
3319 Return -1, if the number field is empty like "{,1}".
3320 Return -2, If an error is occured. */
3322 static int
3323 fetch_number (input, token, syntax)
3324 re_string_t *input;
3325 re_token_t *token;
3326 reg_syntax_t syntax;
3328 int num = -1;
3329 unsigned char c;
3330 while (1)
3332 *token = fetch_token (input, syntax);
3333 c = token->opr.c;
3334 if (BE (token->type == END_OF_RE, 0))
3335 return -2;
3336 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3337 break;
3338 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3339 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3340 num = (num > RE_DUP_MAX) ? -2 : num;
3342 return num;
3345 #ifdef RE_ENABLE_I18N
3346 static void
3347 free_charset (re_charset_t *cset)
3349 re_free (cset->mbchars);
3350 # ifdef _LIBC
3351 re_free (cset->coll_syms);
3352 re_free (cset->equiv_classes);
3353 re_free (cset->range_starts);
3354 re_free (cset->range_ends);
3355 # endif
3356 re_free (cset->char_classes);
3357 re_free (cset);
3359 #endif /* RE_ENABLE_I18N */
3361 /* Functions for binary tree operation. */
3363 /* Create a node of tree.
3364 Note: This function automatically free left and right if malloc fails. */
3366 static bin_tree_t *
3367 create_tree (left, right, type, index)
3368 bin_tree_t *left;
3369 bin_tree_t *right;
3370 re_token_type_t type;
3371 int index;
3373 bin_tree_t *tree;
3374 tree = re_malloc (bin_tree_t, 1);
3375 if (BE (tree == NULL, 0))
3377 free_bin_tree (left);
3378 free_bin_tree (right);
3379 return NULL;
3381 tree->parent = NULL;
3382 tree->left = left;
3383 tree->right = right;
3384 tree->type = type;
3385 tree->node_idx = index;
3386 tree->first = -1;
3387 tree->next = -1;
3388 re_node_set_init_empty (&tree->eclosure);
3390 if (left != NULL)
3391 left->parent = tree;
3392 if (right != NULL)
3393 right->parent = tree;
3394 return tree;
3397 /* Free the sub tree pointed by TREE. */
3399 static void
3400 free_bin_tree (tree)
3401 bin_tree_t *tree;
3403 if (tree == NULL)
3404 return;
3405 /*re_node_set_free (&tree->eclosure);*/
3406 free_bin_tree (tree->left);
3407 free_bin_tree (tree->right);
3408 re_free (tree);
3411 /* Duplicate the node SRC, and return new node. */
3413 static bin_tree_t *
3414 duplicate_tree (src, dfa)
3415 const bin_tree_t *src;
3416 re_dfa_t *dfa;
3418 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3419 int new_node_idx;
3420 /* Since node indies must be according to Post-order of the tree,
3421 we must duplicate the left at first. */
3422 if (src->left != NULL)
3424 left = duplicate_tree (src->left, dfa);
3425 if (left == NULL)
3426 return NULL;
3429 /* Secondaly, duplicate the right. */
3430 if (src->right != NULL)
3432 right = duplicate_tree (src->right, dfa);
3433 if (right == NULL)
3435 free_bin_tree (left);
3436 return NULL;
3440 /* At last, duplicate itself. */
3441 if (src->type == NON_TYPE)
3443 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3444 dfa->nodes[new_node_idx].duplicated = 1;
3445 if (BE (new_node_idx == -1, 0))
3447 free_bin_tree (left);
3448 free_bin_tree (right);
3449 return NULL;
3452 else
3453 new_node_idx = src->type;
3455 new_tree = create_tree (left, right, src->type, new_node_idx);
3456 if (BE (new_tree == NULL, 0))
3458 free_bin_tree (left);
3459 free_bin_tree (right);
3461 return new_tree;