* sysdeps/m68k/Makefile (CFLAGS-.oS): Append -fPIC.
[glibc.git] / posix / regcomp.c
blob97ec6e32f0df953025e477a3156b6a69115b6a70
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 #include <assert.h>
22 #include <ctype.h>
23 #include <limits.h>
24 #include <locale.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
29 #if defined HAVE_WCHAR_H || defined _LIBC
30 # include <wchar.h>
31 #endif /* HAVE_WCHAR_H || _LIBC */
32 #if defined HAVE_WCTYPE_H || defined _LIBC
33 # include <wctype.h>
34 #endif /* HAVE_WCTYPE_H || _LIBC */
36 /* In case that the system doesn't have isblank(). */
37 #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
38 # define isblank(ch) ((ch) == ' ' || (ch) == '\t')
39 #endif
41 #ifdef _LIBC
42 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
43 # define _RE_DEFINE_LOCALE_FUNCTIONS 1
44 # include <locale/localeinfo.h>
45 # include <locale/elem-hash.h>
46 # include <locale/coll-lookup.h>
47 # endif
48 #endif
50 /* This is for other GNU distributions with internationalized messages. */
51 #if HAVE_LIBINTL_H || defined _LIBC
52 # include <libintl.h>
53 # ifdef _LIBC
54 # undef gettext
55 # define gettext(msgid) \
56 INTUSE(__dcgettext) (INTUSE(_libc_intl_domainname), msgid, LC_MESSAGES)
57 # endif
58 #else
59 # define gettext(msgid) (msgid)
60 #endif
62 #ifndef gettext_noop
63 /* This define is so xgettext can find the internationalizable
64 strings. */
65 # define gettext_noop(String) String
66 #endif
68 #include <regex.h>
69 #include "regex_internal.h"
71 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
72 int length, reg_syntax_t syntax);
73 static void re_compile_fastmap_iter (regex_t *bufp,
74 const re_dfastate_t *init_state,
75 char *fastmap);
76 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
77 static reg_errcode_t init_word_char (re_dfa_t *dfa);
78 #ifdef RE_ENABLE_I18N
79 static void free_charset (re_charset_t *cset);
80 #endif /* RE_ENABLE_I18N */
81 static void free_workarea_compile (regex_t *preg);
82 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
83 static reg_errcode_t analyze (re_dfa_t *dfa);
84 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
85 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
86 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
87 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
88 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
89 int top_clone_node, int root_node,
90 unsigned int constraint);
91 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
92 unsigned int constraint);
93 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
94 unsigned int constraint);
95 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
96 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
97 int node, int root);
98 static void calc_inveclosure (re_dfa_t *dfa);
99 static int fetch_number (re_string_t *input, re_token_t *token,
100 reg_syntax_t syntax);
101 static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
102 static int peek_token (re_token_t *token, re_string_t *input,
103 reg_syntax_t syntax);
104 static int peek_token_bracket (re_token_t *token, re_string_t *input,
105 reg_syntax_t syntax);
106 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
107 reg_syntax_t syntax, reg_errcode_t *err);
108 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
109 re_token_t *token, reg_syntax_t syntax,
110 int nest, reg_errcode_t *err);
111 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
112 re_token_t *token, reg_syntax_t syntax,
113 int nest, reg_errcode_t *err);
114 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
115 re_token_t *token, reg_syntax_t syntax,
116 int nest, reg_errcode_t *err);
117 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
118 re_token_t *token, reg_syntax_t syntax,
119 int nest, reg_errcode_t *err);
120 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
121 re_dfa_t *dfa, re_token_t *token,
122 reg_syntax_t syntax, reg_errcode_t *err);
123 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
124 re_token_t *token, reg_syntax_t syntax,
125 reg_errcode_t *err);
126 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
127 re_string_t *regexp,
128 re_token_t *token, int token_len,
129 re_dfa_t *dfa,
130 reg_syntax_t syntax);
131 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
132 re_string_t *regexp,
133 re_token_t *token);
134 #ifndef _LIBC
135 # ifdef RE_ENABLE_I18N
136 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
137 re_charset_t *mbcset, int *range_alloc,
138 bracket_elem_t *start_elem,
139 bracket_elem_t *end_elem);
140 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
141 re_charset_t *mbcset,
142 int *coll_sym_alloc,
143 const unsigned char *name);
144 # else /* not RE_ENABLE_I18N */
145 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
146 bracket_elem_t *start_elem,
147 bracket_elem_t *end_elem);
148 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
149 const unsigned char *name);
150 # endif /* not RE_ENABLE_I18N */
151 #endif /* not _LIBC */
152 #ifdef RE_ENABLE_I18N
153 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
154 re_charset_t *mbcset,
155 int *equiv_class_alloc,
156 const unsigned char *name);
157 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
158 re_charset_t *mbcset,
159 int *char_class_alloc,
160 const unsigned char *class_name,
161 reg_syntax_t syntax);
162 #else /* not RE_ENABLE_I18N */
163 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
164 const unsigned char *name);
165 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
166 const unsigned char *class_name,
167 reg_syntax_t syntax);
168 #endif /* not RE_ENABLE_I18N */
169 static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err);
170 static void free_bin_tree (bin_tree_t *tree);
171 static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
172 re_token_type_t type, int index);
173 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
175 /* This table gives an error message for each of the error codes listed
176 in regex.h. Obviously the order here has to be same as there.
177 POSIX doesn't require that we do anything for REG_NOERROR,
178 but why not be nice? */
180 const char __re_error_msgid[] attribute_hidden =
182 #define REG_NOERROR_IDX 0
183 gettext_noop ("Success") /* REG_NOERROR */
184 "\0"
185 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
186 gettext_noop ("No match") /* REG_NOMATCH */
187 "\0"
188 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
189 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
190 "\0"
191 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
192 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
193 "\0"
194 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
195 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
196 "\0"
197 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
198 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
199 "\0"
200 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
201 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
202 "\0"
203 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
204 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
205 "\0"
206 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
207 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
208 "\0"
209 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
210 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
211 "\0"
212 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
213 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
214 "\0"
215 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
216 gettext_noop ("Invalid range end") /* REG_ERANGE */
217 "\0"
218 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
219 gettext_noop ("Memory exhausted") /* REG_ESPACE */
220 "\0"
221 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
222 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
223 "\0"
224 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
225 gettext_noop ("Premature end of regular expression") /* REG_EEND */
226 "\0"
227 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
228 gettext_noop ("Regular expression too big") /* REG_ESIZE */
229 "\0"
230 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
231 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
234 const size_t __re_error_msgid_idx[] attribute_hidden =
236 REG_NOERROR_IDX,
237 REG_NOMATCH_IDX,
238 REG_BADPAT_IDX,
239 REG_ECOLLATE_IDX,
240 REG_ECTYPE_IDX,
241 REG_EESCAPE_IDX,
242 REG_ESUBREG_IDX,
243 REG_EBRACK_IDX,
244 REG_EPAREN_IDX,
245 REG_EBRACE_IDX,
246 REG_BADBR_IDX,
247 REG_ERANGE_IDX,
248 REG_ESPACE_IDX,
249 REG_BADRPT_IDX,
250 REG_EEND_IDX,
251 REG_ESIZE_IDX,
252 REG_ERPAREN_IDX
255 /* Entry points for GNU code. */
257 /* re_compile_pattern is the GNU regular expression compiler: it
258 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
259 Returns 0 if the pattern was valid, otherwise an error string.
261 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
262 are set in BUFP on entry. */
264 const char *
265 re_compile_pattern (pattern, length, bufp)
266 const char *pattern;
267 size_t length;
268 struct re_pattern_buffer *bufp;
270 reg_errcode_t ret;
272 /* And GNU code determines whether or not to get register information
273 by passing null for the REGS argument to re_match, etc., not by
274 setting no_sub. */
275 bufp->no_sub = 0;
277 /* Match anchors at newline. */
278 bufp->newline_anchor = 1;
280 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
282 if (!ret)
283 return NULL;
284 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
286 #ifdef _LIBC
287 weak_alias (__re_compile_pattern, re_compile_pattern)
288 #endif
290 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
291 also be assigned to arbitrarily: each pattern buffer stores its own
292 syntax, so it can be changed between regex compilations. */
293 /* This has no initializer because initialized variables in Emacs
294 become read-only after dumping. */
295 reg_syntax_t re_syntax_options;
298 /* Specify the precise syntax of regexps for compilation. This provides
299 for compatibility for various utilities which historically have
300 different, incompatible syntaxes.
302 The argument SYNTAX is a bit mask comprised of the various bits
303 defined in regex.h. We return the old syntax. */
305 reg_syntax_t
306 re_set_syntax (syntax)
307 reg_syntax_t syntax;
309 reg_syntax_t ret = re_syntax_options;
311 re_syntax_options = syntax;
312 return ret;
314 #ifdef _LIBC
315 weak_alias (__re_set_syntax, re_set_syntax)
316 #endif
319 re_compile_fastmap (bufp)
320 struct re_pattern_buffer *bufp;
322 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
323 char *fastmap = bufp->fastmap;
325 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
326 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
327 if (dfa->init_state != dfa->init_state_word)
328 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
329 if (dfa->init_state != dfa->init_state_nl)
330 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
331 if (dfa->init_state != dfa->init_state_begbuf)
332 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
333 bufp->fastmap_accurate = 1;
334 return 0;
336 #ifdef _LIBC
337 weak_alias (__re_compile_fastmap, re_compile_fastmap)
338 #endif
340 static inline void
341 re_set_fastmap (char *fastmap, int icase, int ch)
343 fastmap[ch] = 1;
344 if (icase)
345 fastmap[tolower (ch)] = 1;
348 /* Helper function for re_compile_fastmap.
349 Compile fastmap for the initial_state INIT_STATE. */
351 static void
352 re_compile_fastmap_iter (bufp, init_state, fastmap)
353 regex_t *bufp;
354 const re_dfastate_t *init_state;
355 char *fastmap;
357 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
358 int node_cnt;
359 int icase = (MB_CUR_MAX == 1 && (bufp->syntax & RE_ICASE));
360 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
362 int node = init_state->nodes.elems[node_cnt];
363 re_token_type_t type = dfa->nodes[node].type;
365 if (type == CHARACTER)
366 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
367 else if (type == SIMPLE_BRACKET)
369 int i, j, ch;
370 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
371 for (j = 0; j < UINT_BITS; ++j, ++ch)
372 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
373 re_set_fastmap (fastmap, icase, ch);
375 #ifdef RE_ENABLE_I18N
376 else if (type == COMPLEX_BRACKET)
378 int i;
379 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
380 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
381 || cset->nranges || cset->nchar_classes)
383 # ifdef _LIBC
384 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
386 /* In this case we want to catch the bytes which are
387 the first byte of any collation elements.
388 e.g. In da_DK, we want to catch 'a' since "aa"
389 is a valid collation element, and don't catch
390 'b' since 'b' is the only collation element
391 which starts from 'b'. */
392 int j, ch;
393 const int32_t *table = (const int32_t *)
394 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
395 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
396 for (j = 0; j < UINT_BITS; ++j, ++ch)
397 if (table[ch] < 0)
398 re_set_fastmap (fastmap, icase, ch);
400 # else
401 if (MB_CUR_MAX > 1)
402 for (i = 0; i < SBC_MAX; ++i)
403 if (__btowc (i) == WEOF)
404 re_set_fastmap (fastmap, icase, i);
405 # endif /* not _LIBC */
407 for (i = 0; i < cset->nmbchars; ++i)
409 char buf[256];
410 mbstate_t state;
411 memset (&state, '\0', sizeof (state));
412 __wcrtomb (buf, cset->mbchars[i], &state);
413 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
416 #endif /* RE_ENABLE_I18N */
417 else if (type == END_OF_RE || type == OP_PERIOD)
419 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
420 if (type == END_OF_RE)
421 bufp->can_be_null = 1;
422 return;
427 /* Entry point for POSIX code. */
428 /* regcomp takes a regular expression as a string and compiles it.
430 PREG is a regex_t *. We do not expect any fields to be initialized,
431 since POSIX says we shouldn't. Thus, we set
433 `buffer' to the compiled pattern;
434 `used' to the length of the compiled pattern;
435 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
436 REG_EXTENDED bit in CFLAGS is set; otherwise, to
437 RE_SYNTAX_POSIX_BASIC;
438 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
439 `fastmap' to an allocated space for the fastmap;
440 `fastmap_accurate' to zero;
441 `re_nsub' to the number of subexpressions in PATTERN.
443 PATTERN is the address of the pattern string.
445 CFLAGS is a series of bits which affect compilation.
447 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
448 use POSIX basic syntax.
450 If REG_NEWLINE is set, then . and [^...] don't match newline.
451 Also, regexec will try a match beginning after every newline.
453 If REG_ICASE is set, then we considers upper- and lowercase
454 versions of letters to be equivalent when matching.
456 If REG_NOSUB is set, then when PREG is passed to regexec, that
457 routine will report only success or failure, and nothing about the
458 registers.
460 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
461 the return codes and their meanings.) */
464 regcomp (preg, pattern, cflags)
465 regex_t *__restrict preg;
466 const char *__restrict pattern;
467 int cflags;
469 reg_errcode_t ret;
470 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
471 : RE_SYNTAX_POSIX_BASIC);
473 preg->buffer = NULL;
474 preg->allocated = 0;
475 preg->used = 0;
477 /* Try to allocate space for the fastmap. */
478 preg->fastmap = re_malloc (char, SBC_MAX);
479 if (BE (preg->fastmap == NULL, 0))
480 return REG_ESPACE;
482 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
484 /* If REG_NEWLINE is set, newlines are treated differently. */
485 if (cflags & REG_NEWLINE)
486 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
487 syntax &= ~RE_DOT_NEWLINE;
488 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
489 /* It also changes the matching behavior. */
490 preg->newline_anchor = 1;
492 else
493 preg->newline_anchor = 0;
494 preg->no_sub = !!(cflags & REG_NOSUB);
495 preg->translate = NULL;
497 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
499 /* POSIX doesn't distinguish between an unmatched open-group and an
500 unmatched close-group: both are REG_EPAREN. */
501 if (ret == REG_ERPAREN)
502 ret = REG_EPAREN;
504 /* We have already checked preg->fastmap != NULL. */
505 if (BE (ret == REG_NOERROR, 1))
506 /* Compute the fastmap now, since regexec cannot modify the pattern
507 buffer. This function nevers fails in this implementation. */
508 (void) re_compile_fastmap (preg);
509 else
511 /* Some error occurred while compiling the expression. */
512 re_free (preg->fastmap);
513 preg->fastmap = NULL;
516 return (int) ret;
518 #ifdef _LIBC
519 weak_alias (__regcomp, regcomp)
520 #endif
522 /* Returns a message corresponding to an error code, ERRCODE, returned
523 from either regcomp or regexec. We don't use PREG here. */
525 size_t
526 regerror (errcode, preg, errbuf, errbuf_size)
527 int errcode;
528 const regex_t *preg;
529 char *errbuf;
530 size_t errbuf_size;
532 const char *msg;
533 size_t msg_size;
535 if (BE (errcode < 0
536 || errcode >= (int) (sizeof (__re_error_msgid_idx)
537 / sizeof (__re_error_msgid_idx[0])), 0))
538 /* Only error codes returned by the rest of the code should be passed
539 to this routine. If we are given anything else, or if other regex
540 code generates an invalid error code, then the program has a bug.
541 Dump core so we can fix it. */
542 abort ();
544 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
546 msg_size = strlen (msg) + 1; /* Includes the null. */
548 if (BE (errbuf_size != 0, 1))
550 if (BE (msg_size > errbuf_size, 0))
552 #if defined HAVE_MEMPCPY || defined _LIBC
553 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
554 #else
555 memcpy (errbuf, msg, errbuf_size - 1);
556 errbuf[errbuf_size - 1] = 0;
557 #endif
559 else
560 memcpy (errbuf, msg, msg_size);
563 return msg_size;
565 #ifdef _LIBC
566 weak_alias (__regerror, regerror)
567 #endif
570 static void
571 free_dfa_content (re_dfa_t *dfa)
573 int i, j;
575 re_free (dfa->subexps);
577 for (i = 0; i < dfa->nodes_len; ++i)
579 re_token_t *node = dfa->nodes + i;
580 #ifdef RE_ENABLE_I18N
581 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
582 free_charset (node->opr.mbcset);
583 else
584 #endif /* RE_ENABLE_I18N */
585 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
586 re_free (node->opr.sbcset);
588 re_free (dfa->nexts);
589 for (i = 0; i < dfa->nodes_len; ++i)
591 if (dfa->eclosures != NULL)
592 re_node_set_free (dfa->eclosures + i);
593 if (dfa->inveclosures != NULL)
594 re_node_set_free (dfa->inveclosures + i);
595 if (dfa->edests != NULL)
596 re_node_set_free (dfa->edests + i);
598 re_free (dfa->edests);
599 re_free (dfa->eclosures);
600 re_free (dfa->inveclosures);
601 re_free (dfa->nodes);
603 for (i = 0; i <= dfa->state_hash_mask; ++i)
605 struct re_state_table_entry *entry = dfa->state_table + i;
606 for (j = 0; j < entry->num; ++j)
608 re_dfastate_t *state = entry->array[j];
609 free_state (state);
611 re_free (entry->array);
613 re_free (dfa->state_table);
615 if (dfa->word_char != NULL)
616 re_free (dfa->word_char);
617 #ifdef DEBUG
618 re_free (dfa->re_str);
619 #endif
621 re_free (dfa);
625 /* Free dynamically allocated space used by PREG. */
627 void
628 regfree (preg)
629 regex_t *preg;
631 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
632 if (BE (dfa != NULL, 1))
633 free_dfa_content (dfa);
635 re_free (preg->fastmap);
637 #ifdef _LIBC
638 weak_alias (__regfree, regfree)
639 #endif
641 /* Entry points compatible with 4.2 BSD regex library. We don't define
642 them unless specifically requested. */
644 #if defined _REGEX_RE_COMP || defined _LIBC
646 /* BSD has one and only one pattern buffer. */
647 static struct re_pattern_buffer re_comp_buf;
649 char *
650 # ifdef _LIBC
651 /* Make these definitions weak in libc, so POSIX programs can redefine
652 these names if they don't use our functions, and still use
653 regcomp/regexec above without link errors. */
654 weak_function
655 # endif
656 re_comp (s)
657 const char *s;
659 reg_errcode_t ret;
660 char *fastmap;
662 if (!s)
664 if (!re_comp_buf.buffer)
665 return gettext ("No previous regular expression");
666 return 0;
669 if (re_comp_buf.buffer)
671 fastmap = re_comp_buf.fastmap;
672 re_comp_buf.fastmap = NULL;
673 __regfree (&re_comp_buf);
674 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
675 re_comp_buf.fastmap = fastmap;
678 if (re_comp_buf.fastmap == NULL)
680 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
681 if (re_comp_buf.fastmap == NULL)
682 return (char *) gettext (__re_error_msgid
683 + __re_error_msgid_idx[(int) REG_ESPACE]);
686 /* Since `re_exec' always passes NULL for the `regs' argument, we
687 don't need to initialize the pattern buffer fields which affect it. */
689 /* Match anchors at newlines. */
690 re_comp_buf.newline_anchor = 1;
692 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
694 if (!ret)
695 return NULL;
697 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
698 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
701 #ifdef _LIBC
702 libc_freeres_fn (free_mem)
704 __regfree (&re_comp_buf);
706 #endif
708 #endif /* _REGEX_RE_COMP */
710 /* Internal entry point.
711 Compile the regular expression PATTERN, whose length is LENGTH.
712 SYNTAX indicate regular expression's syntax. */
714 static reg_errcode_t
715 re_compile_internal (preg, pattern, length, syntax)
716 regex_t *preg;
717 const char * pattern;
718 int length;
719 reg_syntax_t syntax;
721 reg_errcode_t err = REG_NOERROR;
722 re_dfa_t *dfa;
723 re_string_t regexp;
725 /* Initialize the pattern buffer. */
726 preg->fastmap_accurate = 0;
727 preg->syntax = syntax;
728 preg->not_bol = preg->not_eol = 0;
729 preg->used = 0;
730 preg->re_nsub = 0;
731 preg->can_be_null = 0;
732 preg->regs_allocated = REGS_UNALLOCATED;
734 /* Initialize the dfa. */
735 dfa = (re_dfa_t *) preg->buffer;
736 if (preg->allocated < sizeof (re_dfa_t))
738 /* If zero allocated, but buffer is non-null, try to realloc
739 enough space. This loses if buffer's address is bogus, but
740 that is the user's responsibility. If ->buffer is NULL this
741 is a simple allocation. */
742 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
743 if (dfa == NULL)
744 return REG_ESPACE;
745 preg->allocated = sizeof (re_dfa_t);
747 preg->buffer = (unsigned char *) dfa;
748 preg->used = sizeof (re_dfa_t);
750 err = init_dfa (dfa, length);
751 if (BE (err != REG_NOERROR, 0))
753 re_free (dfa);
754 preg->buffer = NULL;
755 return err;
757 #ifdef DEBUG
758 dfa->re_str = re_malloc (char, length + 1);
759 strncpy (dfa->re_str, pattern, length + 1);
760 #endif
762 err = re_string_construct (&regexp, pattern, length, preg->translate,
763 syntax & RE_ICASE);
764 if (BE (err != REG_NOERROR, 0))
766 re_free (dfa);
767 preg->buffer = NULL;
768 return err;
771 /* Parse the regular expression, and build a structure tree. */
772 preg->re_nsub = 0;
773 dfa->str_tree = parse (&regexp, preg, syntax, &err);
774 if (BE (dfa->str_tree == NULL, 0))
775 goto re_compile_internal_free_return;
777 /* Analyze the tree and collect information which is necessary to
778 create the dfa. */
779 err = analyze (dfa);
780 if (BE (err != REG_NOERROR, 0))
781 goto re_compile_internal_free_return;
783 /* Then create the initial state of the dfa. */
784 err = create_initial_state (dfa);
786 /* Release work areas. */
787 free_workarea_compile (preg);
788 re_string_destruct (&regexp);
790 if (BE (err != REG_NOERROR, 0))
792 re_compile_internal_free_return:
793 free_dfa_content (dfa);
794 preg->buffer = NULL;
797 return err;
800 /* Initialize DFA. We use the length of the regular expression PAT_LEN
801 as the initial length of some arrays. */
803 static reg_errcode_t
804 init_dfa (dfa, pat_len)
805 re_dfa_t *dfa;
806 int pat_len;
808 int table_size;
810 memset (dfa, '\0', sizeof (re_dfa_t));
812 dfa->nodes_alloc = pat_len + 1;
813 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
815 dfa->states_alloc = pat_len + 1;
817 /* table_size = 2 ^ ceil(log pat_len) */
818 for (table_size = 1; table_size > 0; table_size <<= 1)
819 if (table_size > pat_len)
820 break;
822 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
823 dfa->state_hash_mask = table_size - 1;
825 dfa->subexps_alloc = 1;
826 dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
827 dfa->word_char = NULL;
829 if (BE (dfa->nodes == NULL || dfa->state_table == NULL
830 || dfa->subexps == NULL, 0))
832 /* We don't bother to free anything which was allocated. Very
833 soon the process will go down anyway. */
834 dfa->subexps = NULL;
835 dfa->state_table = NULL;
836 dfa->nodes = NULL;
837 return REG_ESPACE;
839 return REG_NOERROR;
842 /* Initialize WORD_CHAR table, which indicate which character is
843 "word". In this case "word" means that it is the word construction
844 character used by some operators like "\<", "\>", etc. */
846 static reg_errcode_t
847 init_word_char (dfa)
848 re_dfa_t *dfa;
850 int i, j, ch;
851 dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
852 if (BE (dfa->word_char == NULL, 0))
853 return REG_ESPACE;
854 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
855 for (j = 0; j < UINT_BITS; ++j, ++ch)
856 if (isalnum (ch) || ch == '_')
857 dfa->word_char[i] |= 1 << j;
858 return REG_NOERROR;
861 /* Free the work area which are only used while compiling. */
863 static void
864 free_workarea_compile (preg)
865 regex_t *preg;
867 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
868 free_bin_tree (dfa->str_tree);
869 dfa->str_tree = NULL;
870 re_free (dfa->org_indices);
871 dfa->org_indices = NULL;
874 /* Create initial states for all contexts. */
876 static reg_errcode_t
877 create_initial_state (dfa)
878 re_dfa_t *dfa;
880 int first, i;
881 reg_errcode_t err;
882 re_node_set init_nodes;
884 /* Initial states have the epsilon closure of the node which is
885 the first node of the regular expression. */
886 first = dfa->str_tree->first;
887 dfa->init_node = first;
888 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
889 if (BE (err != REG_NOERROR, 0))
890 return err;
892 /* The back-references which are in initial states can epsilon transit,
893 since in this case all of the subexpressions can be null.
894 Then we add epsilon closures of the nodes which are the next nodes of
895 the back-references. */
896 if (dfa->nbackref > 0)
897 for (i = 0; i < init_nodes.nelem; ++i)
899 int node_idx = init_nodes.elems[i];
900 re_token_type_t type = dfa->nodes[node_idx].type;
902 int clexp_idx;
903 if (type != OP_BACK_REF)
904 continue;
905 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
907 re_token_t *clexp_node;
908 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
909 if (clexp_node->type == OP_CLOSE_SUBEXP
910 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
911 break;
913 if (clexp_idx == init_nodes.nelem)
914 continue;
916 if (type == OP_BACK_REF)
918 int dest_idx = dfa->edests[node_idx].elems[0];
919 if (!re_node_set_contains (&init_nodes, dest_idx))
921 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
922 i = 0;
927 /* It must be the first time to invoke acquire_state. */
928 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
929 /* We don't check ERR here, since the initial state must not be NULL. */
930 if (BE (dfa->init_state == NULL, 0))
931 return err;
932 if (dfa->init_state->has_constraint)
934 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
935 CONTEXT_WORD);
936 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
937 CONTEXT_NEWLINE);
938 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
939 &init_nodes,
940 CONTEXT_NEWLINE
941 | CONTEXT_BEGBUF);
942 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
943 || dfa->init_state_begbuf == NULL, 0))
944 return err;
946 else
947 dfa->init_state_word = dfa->init_state_nl
948 = dfa->init_state_begbuf = dfa->init_state;
950 re_node_set_free (&init_nodes);
951 return REG_NOERROR;
954 /* Analyze the structure tree, and calculate "first", "next", "edest",
955 "eclosure", and "inveclosure". */
957 static reg_errcode_t
958 analyze (dfa)
959 re_dfa_t *dfa;
961 int i;
962 reg_errcode_t ret;
964 /* Allocate arrays. */
965 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
966 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
967 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
968 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
969 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
970 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
971 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
972 return REG_ESPACE;
973 /* Initialize them. */
974 for (i = 0; i < dfa->nodes_len; ++i)
976 dfa->nexts[i] = -1;
977 re_node_set_init_empty (dfa->edests + i);
978 re_node_set_init_empty (dfa->eclosures + i);
979 re_node_set_init_empty (dfa->inveclosures + i);
982 ret = analyze_tree (dfa, dfa->str_tree);
983 if (BE (ret == REG_NOERROR, 1))
985 ret = calc_eclosure (dfa);
986 if (ret == REG_NOERROR)
987 calc_inveclosure (dfa);
989 return ret;
992 /* Helper functions for analyze.
993 This function calculate "first", "next", and "edest" for the subtree
994 whose root is NODE. */
996 static reg_errcode_t
997 analyze_tree (dfa, node)
998 re_dfa_t *dfa;
999 bin_tree_t *node;
1001 reg_errcode_t ret;
1002 if (node->first == -1)
1003 calc_first (dfa, node);
1004 if (node->next == -1)
1005 calc_next (dfa, node);
1006 if (node->eclosure.nelem == 0)
1007 calc_epsdest (dfa, node);
1008 /* Calculate "first" etc. for the left child. */
1009 if (node->left != NULL)
1011 ret = analyze_tree (dfa, node->left);
1012 if (BE (ret != REG_NOERROR, 0))
1013 return ret;
1015 /* Calculate "first" etc. for the right child. */
1016 if (node->right != NULL)
1018 ret = analyze_tree (dfa, node->right);
1019 if (BE (ret != REG_NOERROR, 0))
1020 return ret;
1022 return REG_NOERROR;
1025 /* Calculate "first" for the node NODE. */
1026 static void
1027 calc_first (dfa, node)
1028 re_dfa_t *dfa;
1029 bin_tree_t *node;
1031 int idx, type;
1032 idx = node->node_idx;
1033 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1035 switch (type)
1037 #ifdef DEBUG
1038 case OP_OPEN_BRACKET:
1039 case OP_CLOSE_BRACKET:
1040 case OP_OPEN_DUP_NUM:
1041 case OP_CLOSE_DUP_NUM:
1042 case OP_NON_MATCH_LIST:
1043 case OP_OPEN_COLL_ELEM:
1044 case OP_CLOSE_COLL_ELEM:
1045 case OP_OPEN_EQUIV_CLASS:
1046 case OP_CLOSE_EQUIV_CLASS:
1047 case OP_OPEN_CHAR_CLASS:
1048 case OP_CLOSE_CHAR_CLASS:
1049 /* These must not be appeared here. */
1050 assert (0);
1051 #endif
1052 case END_OF_RE:
1053 case CHARACTER:
1054 case OP_PERIOD:
1055 case OP_DUP_ASTERISK:
1056 case OP_DUP_QUESTION:
1057 #ifdef RE_ENABLE_I18N
1058 case COMPLEX_BRACKET:
1059 #endif /* RE_ENABLE_I18N */
1060 case SIMPLE_BRACKET:
1061 case OP_BACK_REF:
1062 case ANCHOR:
1063 case OP_OPEN_SUBEXP:
1064 case OP_CLOSE_SUBEXP:
1065 node->first = idx;
1066 break;
1067 case OP_DUP_PLUS:
1068 #ifdef DEBUG
1069 assert (node->left != NULL);
1070 #endif
1071 if (node->left->first == -1)
1072 calc_first (dfa, node->left);
1073 node->first = node->left->first;
1074 break;
1075 case OP_ALT:
1076 node->first = idx;
1077 break;
1078 /* else fall through */
1079 default:
1080 #ifdef DEBUG
1081 assert (node->left != NULL);
1082 #endif
1083 if (node->left->first == -1)
1084 calc_first (dfa, node->left);
1085 node->first = node->left->first;
1086 break;
1090 /* Calculate "next" for the node NODE. */
1092 static void
1093 calc_next (dfa, node)
1094 re_dfa_t *dfa;
1095 bin_tree_t *node;
1097 int idx, type;
1098 bin_tree_t *parent = node->parent;
1099 if (parent == NULL)
1101 node->next = -1;
1102 idx = node->node_idx;
1103 if (node->type == 0)
1104 dfa->nexts[idx] = node->next;
1105 return;
1108 idx = parent->node_idx;
1109 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1111 switch (type)
1113 case OP_DUP_ASTERISK:
1114 case OP_DUP_PLUS:
1115 node->next = idx;
1116 break;
1117 case CONCAT:
1118 if (parent->left == node)
1120 if (parent->right->first == -1)
1121 calc_first (dfa, parent->right);
1122 node->next = parent->right->first;
1123 break;
1125 /* else fall through */
1126 default:
1127 if (parent->next == -1)
1128 calc_next (dfa, parent);
1129 node->next = parent->next;
1130 break;
1132 idx = node->node_idx;
1133 if (node->type == 0)
1134 dfa->nexts[idx] = node->next;
1137 /* Calculate "edest" for the node NODE. */
1139 static void
1140 calc_epsdest (dfa, node)
1141 re_dfa_t *dfa;
1142 bin_tree_t *node;
1144 int idx;
1145 idx = node->node_idx;
1146 if (node->type == 0)
1148 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1149 || dfa->nodes[idx].type == OP_DUP_PLUS
1150 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1152 if (node->left->first == -1)
1153 calc_first (dfa, node->left);
1154 if (node->next == -1)
1155 calc_next (dfa, node);
1156 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1157 node->next);
1159 else if (dfa->nodes[idx].type == OP_ALT)
1161 int left, right;
1162 if (node->left != NULL)
1164 if (node->left->first == -1)
1165 calc_first (dfa, node->left);
1166 left = node->left->first;
1168 else
1170 if (node->next == -1)
1171 calc_next (dfa, node);
1172 left = node->next;
1174 if (node->right != NULL)
1176 if (node->right->first == -1)
1177 calc_first (dfa, node->right);
1178 right = node->right->first;
1180 else
1182 if (node->next == -1)
1183 calc_next (dfa, node);
1184 right = node->next;
1186 re_node_set_init_2 (dfa->edests + idx, left, right);
1188 else if (dfa->nodes[idx].type == ANCHOR
1189 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1190 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1191 || dfa->nodes[idx].type == OP_BACK_REF)
1192 re_node_set_init_1 (dfa->edests + idx, node->next);
1196 /* Duplicate the epsilon closure of the node ROOT_NODE.
1197 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1198 to their own constraint. */
1200 static reg_errcode_t
1201 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1202 init_constraint)
1203 re_dfa_t *dfa;
1204 int top_org_node, top_clone_node, root_node;
1205 unsigned int init_constraint;
1207 reg_errcode_t err;
1208 int org_node, clone_node, ret;
1209 unsigned int constraint = init_constraint;
1210 for (org_node = top_org_node, clone_node = top_clone_node;;)
1212 int org_dest, clone_dest;
1213 if (dfa->nodes[org_node].type == OP_BACK_REF)
1215 /* If the back reference epsilon-transit, its destination must
1216 also have the constraint. Then duplicate the epsilon closure
1217 of the destination of the back reference, and store it in
1218 edests of the back reference. */
1219 org_dest = dfa->nexts[org_node];
1220 re_node_set_empty (dfa->edests + clone_node);
1221 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1222 if (BE (err != REG_NOERROR, 0))
1223 return err;
1224 dfa->nexts[clone_node] = dfa->nexts[org_node];
1225 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1226 if (BE (ret < 0, 0))
1227 return REG_ESPACE;
1229 else if (dfa->edests[org_node].nelem == 0)
1231 /* In case of the node can't epsilon-transit, don't duplicate the
1232 destination and store the original destination as the
1233 destination of the node. */
1234 dfa->nexts[clone_node] = dfa->nexts[org_node];
1235 break;
1237 else if (dfa->edests[org_node].nelem == 1)
1239 /* In case of the node can epsilon-transit, and it has only one
1240 destination. */
1241 org_dest = dfa->edests[org_node].elems[0];
1242 re_node_set_empty (dfa->edests + clone_node);
1243 if (dfa->nodes[org_node].type == ANCHOR)
1245 /* In case of the node has another constraint, append it. */
1246 if (org_node == root_node && clone_node != org_node)
1248 /* ...but if the node is root_node itself, it means the
1249 epsilon closure have a loop, then tie it to the
1250 destination of the root_node. */
1251 ret = re_node_set_insert (dfa->edests + clone_node,
1252 org_dest);
1253 if (BE (ret < 0, 0))
1254 return REG_ESPACE;
1255 break;
1257 constraint |= dfa->nodes[org_node].opr.ctx_type;
1259 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1260 if (BE (err != REG_NOERROR, 0))
1261 return err;
1262 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1263 if (BE (ret < 0, 0))
1264 return REG_ESPACE;
1266 else /* dfa->edests[org_node].nelem == 2 */
1268 /* In case of the node can epsilon-transit, and it has two
1269 destinations. E.g. '|', '*', '+', '?'. */
1270 org_dest = dfa->edests[org_node].elems[0];
1271 re_node_set_empty (dfa->edests + clone_node);
1272 /* Search for a duplicated node which satisfies the constraint. */
1273 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1274 if (clone_dest == -1)
1276 /* There are no such a duplicated node, create a new one. */
1277 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1278 if (BE (err != REG_NOERROR, 0))
1279 return err;
1280 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1281 if (BE (ret < 0, 0))
1282 return REG_ESPACE;
1283 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1284 root_node, constraint);
1285 if (BE (err != REG_NOERROR, 0))
1286 return err;
1288 else
1290 /* There are a duplicated node which satisfy the constraint,
1291 use it to avoid infinite loop. */
1292 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1293 if (BE (ret < 0, 0))
1294 return REG_ESPACE;
1297 org_dest = dfa->edests[org_node].elems[1];
1298 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1299 if (BE (err != REG_NOERROR, 0))
1300 return err;
1301 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1302 if (BE (ret < 0, 0))
1303 return REG_ESPACE;
1305 org_node = org_dest;
1306 clone_node = clone_dest;
1308 return REG_NOERROR;
1311 /* Search for a node which is duplicated from the node ORG_NODE, and
1312 satisfies the constraint CONSTRAINT. */
1314 static int
1315 search_duplicated_node (dfa, org_node, constraint)
1316 re_dfa_t *dfa;
1317 int org_node;
1318 unsigned int constraint;
1320 int idx;
1321 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1323 if (org_node == dfa->org_indices[idx]
1324 && constraint == dfa->nodes[idx].constraint)
1325 return idx; /* Found. */
1327 return -1; /* Not found. */
1330 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1331 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1332 otherwise return the error code. */
1334 static reg_errcode_t
1335 duplicate_node (new_idx, dfa, org_idx, constraint)
1336 re_dfa_t *dfa;
1337 int *new_idx, org_idx;
1338 unsigned int constraint;
1340 re_token_t dup;
1341 int dup_idx;
1343 dup = dfa->nodes[org_idx];
1344 dup_idx = re_dfa_add_node (dfa, dup, 1);
1345 if (BE (dup_idx == -1, 0))
1346 return REG_ESPACE;
1347 dfa->nodes[dup_idx].constraint = constraint;
1348 if (dfa->nodes[org_idx].type == ANCHOR)
1349 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1350 dfa->nodes[dup_idx].duplicated = 1;
1351 re_node_set_init_empty (dfa->edests + dup_idx);
1352 re_node_set_init_empty (dfa->eclosures + dup_idx);
1353 re_node_set_init_empty (dfa->inveclosures + dup_idx);
1355 /* Store the index of the original node. */
1356 dfa->org_indices[dup_idx] = org_idx;
1357 *new_idx = dup_idx;
1358 return REG_NOERROR;
1361 static void
1362 calc_inveclosure (dfa)
1363 re_dfa_t *dfa;
1365 int src, idx, dest;
1366 for (src = 0; src < dfa->nodes_len; ++src)
1368 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1370 dest = dfa->eclosures[src].elems[idx];
1371 re_node_set_insert (dfa->inveclosures + dest, src);
1376 /* Calculate "eclosure" for all the node in DFA. */
1378 static reg_errcode_t
1379 calc_eclosure (dfa)
1380 re_dfa_t *dfa;
1382 int node_idx, incomplete;
1383 #ifdef DEBUG
1384 assert (dfa->nodes_len > 0);
1385 #endif
1386 incomplete = 0;
1387 /* For each nodes, calculate epsilon closure. */
1388 for (node_idx = 0; ; ++node_idx)
1390 reg_errcode_t err;
1391 re_node_set eclosure_elem;
1392 if (node_idx == dfa->nodes_len)
1394 if (!incomplete)
1395 break;
1396 incomplete = 0;
1397 node_idx = 0;
1400 #ifdef DEBUG
1401 assert (dfa->eclosures[node_idx].nelem != -1);
1402 #endif
1403 /* If we have already calculated, skip it. */
1404 if (dfa->eclosures[node_idx].nelem != 0)
1405 continue;
1406 /* Calculate epsilon closure of `node_idx'. */
1407 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1408 if (BE (err != REG_NOERROR, 0))
1409 return err;
1411 if (dfa->eclosures[node_idx].nelem == 0)
1413 incomplete = 1;
1414 re_node_set_free (&eclosure_elem);
1417 return REG_NOERROR;
1420 /* Calculate epsilon closure of NODE. */
1422 static reg_errcode_t
1423 calc_eclosure_iter (new_set, dfa, node, root)
1424 re_node_set *new_set;
1425 re_dfa_t *dfa;
1426 int node, root;
1428 reg_errcode_t err;
1429 unsigned int constraint;
1430 int i, incomplete;
1431 re_node_set eclosure;
1432 incomplete = 0;
1433 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1434 if (BE (err != REG_NOERROR, 0))
1435 return err;
1437 /* This indicates that we are calculating this node now.
1438 We reference this value to avoid infinite loop. */
1439 dfa->eclosures[node].nelem = -1;
1441 constraint = ((dfa->nodes[node].type == ANCHOR)
1442 ? dfa->nodes[node].opr.ctx_type : 0);
1443 /* If the current node has constraints, duplicate all nodes.
1444 Since they must inherit the constraints. */
1445 if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1447 int org_node, cur_node;
1448 org_node = cur_node = node;
1449 err = duplicate_node_closure (dfa, node, node, node, constraint);
1450 if (BE (err != REG_NOERROR, 0))
1451 return err;
1454 /* Expand each epsilon destination nodes. */
1455 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1456 for (i = 0; i < dfa->edests[node].nelem; ++i)
1458 re_node_set eclosure_elem;
1459 int edest = dfa->edests[node].elems[i];
1460 /* If calculating the epsilon closure of `edest' is in progress,
1461 return intermediate result. */
1462 if (dfa->eclosures[edest].nelem == -1)
1464 incomplete = 1;
1465 continue;
1467 /* If we haven't calculated the epsilon closure of `edest' yet,
1468 calculate now. Otherwise use calculated epsilon closure. */
1469 if (dfa->eclosures[edest].nelem == 0)
1471 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1472 if (BE (err != REG_NOERROR, 0))
1473 return err;
1475 else
1476 eclosure_elem = dfa->eclosures[edest];
1477 /* Merge the epsilon closure of `edest'. */
1478 re_node_set_merge (&eclosure, &eclosure_elem);
1479 /* If the epsilon closure of `edest' is incomplete,
1480 the epsilon closure of this node is also incomplete. */
1481 if (dfa->eclosures[edest].nelem == 0)
1483 incomplete = 1;
1484 re_node_set_free (&eclosure_elem);
1488 /* Epsilon closures include itself. */
1489 re_node_set_insert (&eclosure, node);
1490 if (incomplete && !root)
1491 dfa->eclosures[node].nelem = 0;
1492 else
1493 dfa->eclosures[node] = eclosure;
1494 *new_set = eclosure;
1495 return REG_NOERROR;
1498 /* Functions for token which are used in the parser. */
1500 /* Fetch a token from INPUT.
1501 We must not use this function inside bracket expressions. */
1503 static re_token_t
1504 fetch_token (input, syntax)
1505 re_string_t *input;
1506 reg_syntax_t syntax;
1508 re_token_t token;
1509 int consumed_byte;
1510 consumed_byte = peek_token (&token, input, syntax);
1511 re_string_skip_bytes (input, consumed_byte);
1512 return token;
1515 /* Peek a token from INPUT, and return the length of the token.
1516 We must not use this function inside bracket expressions. */
1518 static int
1519 peek_token (token, input, syntax)
1520 re_token_t *token;
1521 re_string_t *input;
1522 reg_syntax_t syntax;
1524 unsigned char c;
1526 if (re_string_eoi (input))
1528 token->type = END_OF_RE;
1529 return 0;
1532 c = re_string_peek_byte (input, 0);
1533 token->opr.c = c;
1535 #ifdef RE_ENABLE_I18N
1536 token->mb_partial = 0;
1537 if (MB_CUR_MAX > 1 &&
1538 !re_string_first_byte (input, re_string_cur_idx (input)))
1540 token->type = CHARACTER;
1541 token->mb_partial = 1;
1542 return 1;
1544 #endif
1545 if (c == '\\')
1547 unsigned char c2;
1548 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1550 token->type = BACK_SLASH;
1551 return 1;
1554 c2 = re_string_peek_byte_case (input, 1);
1555 token->opr.c = c2;
1556 token->type = CHARACTER;
1557 switch (c2)
1559 case '|':
1560 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1561 token->type = OP_ALT;
1562 break;
1563 case '1': case '2': case '3': case '4': case '5':
1564 case '6': case '7': case '8': case '9':
1565 if (!(syntax & RE_NO_BK_REFS))
1567 token->type = OP_BACK_REF;
1568 token->opr.idx = c2 - '0';
1570 break;
1571 case '<':
1572 if (!(syntax & RE_NO_GNU_OPS))
1574 token->type = ANCHOR;
1575 token->opr.idx = WORD_FIRST;
1577 break;
1578 case '>':
1579 if (!(syntax & RE_NO_GNU_OPS))
1581 token->type = ANCHOR;
1582 token->opr.idx = WORD_LAST;
1584 break;
1585 case 'b':
1586 if (!(syntax & RE_NO_GNU_OPS))
1588 token->type = ANCHOR;
1589 token->opr.idx = WORD_DELIM;
1591 break;
1592 case 'B':
1593 if (!(syntax & RE_NO_GNU_OPS))
1595 token->type = ANCHOR;
1596 token->opr.idx = INSIDE_WORD;
1598 break;
1599 case 'w':
1600 if (!(syntax & RE_NO_GNU_OPS))
1601 token->type = OP_WORD;
1602 break;
1603 case 'W':
1604 if (!(syntax & RE_NO_GNU_OPS))
1605 token->type = OP_NOTWORD;
1606 break;
1607 case '`':
1608 if (!(syntax & RE_NO_GNU_OPS))
1610 token->type = ANCHOR;
1611 token->opr.idx = BUF_FIRST;
1613 break;
1614 case '\'':
1615 if (!(syntax & RE_NO_GNU_OPS))
1617 token->type = ANCHOR;
1618 token->opr.idx = BUF_LAST;
1620 break;
1621 case '(':
1622 if (!(syntax & RE_NO_BK_PARENS))
1623 token->type = OP_OPEN_SUBEXP;
1624 break;
1625 case ')':
1626 if (!(syntax & RE_NO_BK_PARENS))
1627 token->type = OP_CLOSE_SUBEXP;
1628 break;
1629 case '+':
1630 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1631 token->type = OP_DUP_PLUS;
1632 break;
1633 case '?':
1634 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1635 token->type = OP_DUP_QUESTION;
1636 break;
1637 case '{':
1638 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1639 token->type = OP_OPEN_DUP_NUM;
1640 break;
1641 case '}':
1642 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1643 token->type = OP_CLOSE_DUP_NUM;
1644 break;
1645 default:
1646 break;
1648 return 2;
1651 token->type = CHARACTER;
1652 switch (c)
1654 case '\n':
1655 if (syntax & RE_NEWLINE_ALT)
1656 token->type = OP_ALT;
1657 break;
1658 case '|':
1659 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1660 token->type = OP_ALT;
1661 break;
1662 case '*':
1663 token->type = OP_DUP_ASTERISK;
1664 break;
1665 case '+':
1666 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1667 token->type = OP_DUP_PLUS;
1668 break;
1669 case '?':
1670 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1671 token->type = OP_DUP_QUESTION;
1672 break;
1673 case '{':
1674 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1675 token->type = OP_OPEN_DUP_NUM;
1676 break;
1677 case '}':
1678 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1679 token->type = OP_CLOSE_DUP_NUM;
1680 break;
1681 case '(':
1682 if (syntax & RE_NO_BK_PARENS)
1683 token->type = OP_OPEN_SUBEXP;
1684 break;
1685 case ')':
1686 if (syntax & RE_NO_BK_PARENS)
1687 token->type = OP_CLOSE_SUBEXP;
1688 break;
1689 case '[':
1690 token->type = OP_OPEN_BRACKET;
1691 break;
1692 case '.':
1693 token->type = OP_PERIOD;
1694 break;
1695 case '^':
1696 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1697 re_string_cur_idx (input) != 0)
1699 char prev = re_string_peek_byte (input, -1);
1700 if (prev != '|' && prev != '(' &&
1701 (!(syntax & RE_NEWLINE_ALT) || prev != '\n'))
1702 break;
1704 token->type = ANCHOR;
1705 token->opr.idx = LINE_FIRST;
1706 break;
1707 case '$':
1708 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1709 re_string_cur_idx (input) + 1 != re_string_length (input))
1711 re_token_t next;
1712 re_string_skip_bytes (input, 1);
1713 peek_token (&next, input, syntax);
1714 re_string_skip_bytes (input, -1);
1715 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1716 break;
1718 token->type = ANCHOR;
1719 token->opr.idx = LINE_LAST;
1720 break;
1721 default:
1722 break;
1724 return 1;
1727 /* Peek a token from INPUT, and return the length of the token.
1728 We must not use this function out of bracket expressions. */
1730 static int
1731 peek_token_bracket (token, input, syntax)
1732 re_token_t *token;
1733 re_string_t *input;
1734 reg_syntax_t syntax;
1736 unsigned char c;
1737 if (re_string_eoi (input))
1739 token->type = END_OF_RE;
1740 return 0;
1742 c = re_string_peek_byte (input, 0);
1743 token->opr.c = c;
1745 #ifdef RE_ENABLE_I18N
1746 if (MB_CUR_MAX > 1 &&
1747 !re_string_first_byte (input, re_string_cur_idx (input)))
1749 token->type = CHARACTER;
1750 return 1;
1752 #endif /* RE_ENABLE_I18N */
1754 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1756 /* In this case, '\' escape a character. */
1757 unsigned char c2;
1758 re_string_skip_bytes (input, 1);
1759 c2 = re_string_peek_byte (input, 0);
1760 token->opr.c = c2;
1761 token->type = CHARACTER;
1762 return 1;
1764 if (c == '[') /* '[' is a special char in a bracket exps. */
1766 unsigned char c2;
1767 int token_len;
1768 c2 = re_string_peek_byte (input, 1);
1769 token->opr.c = c2;
1770 token_len = 2;
1771 switch (c2)
1773 case '.':
1774 token->type = OP_OPEN_COLL_ELEM;
1775 break;
1776 case '=':
1777 token->type = OP_OPEN_EQUIV_CLASS;
1778 break;
1779 case ':':
1780 if (syntax & RE_CHAR_CLASSES)
1782 token->type = OP_OPEN_CHAR_CLASS;
1783 break;
1785 /* else fall through. */
1786 default:
1787 token->type = CHARACTER;
1788 token->opr.c = c;
1789 token_len = 1;
1790 break;
1792 return token_len;
1794 switch (c)
1796 case '-':
1797 token->type = OP_CHARSET_RANGE;
1798 break;
1799 case ']':
1800 token->type = OP_CLOSE_BRACKET;
1801 break;
1802 case '^':
1803 token->type = OP_NON_MATCH_LIST;
1804 break;
1805 default:
1806 token->type = CHARACTER;
1808 return 1;
1811 /* Functions for parser. */
1813 /* Entry point of the parser.
1814 Parse the regular expression REGEXP and return the structure tree.
1815 If an error is occured, ERR is set by error code, and return NULL.
1816 This function build the following tree, from regular expression <reg_exp>:
1820 <reg_exp> EOR
1822 CAT means concatenation.
1823 EOR means end of regular expression. */
1825 static bin_tree_t *
1826 parse (regexp, preg, syntax, err)
1827 re_string_t *regexp;
1828 regex_t *preg;
1829 reg_syntax_t syntax;
1830 reg_errcode_t *err;
1832 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1833 bin_tree_t *tree, *eor, *root;
1834 re_token_t current_token;
1835 int new_idx;
1836 current_token = fetch_token (regexp, syntax);
1837 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
1838 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1839 return NULL;
1840 new_idx = re_dfa_add_node (dfa, current_token, 0);
1841 eor = create_tree (NULL, NULL, 0, new_idx);
1842 if (tree != NULL)
1843 root = create_tree (tree, eor, CONCAT, 0);
1844 else
1845 root = eor;
1846 if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1848 *err = REG_ESPACE;
1849 return NULL;
1851 return root;
1854 /* This function build the following tree, from regular expression
1855 <branch1>|<branch2>:
1859 <branch1> <branch2>
1861 ALT means alternative, which represents the operator `|'. */
1863 static bin_tree_t *
1864 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1865 re_string_t *regexp;
1866 regex_t *preg;
1867 re_token_t *token;
1868 reg_syntax_t syntax;
1869 int nest;
1870 reg_errcode_t *err;
1872 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1873 bin_tree_t *tree, *branch = NULL;
1874 int new_idx;
1875 tree = parse_branch (regexp, preg, token, syntax, nest, err);
1876 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1877 return NULL;
1879 while (token->type == OP_ALT)
1881 re_token_t alt_token = *token;
1882 new_idx = re_dfa_add_node (dfa, alt_token, 0);
1883 *token = fetch_token (regexp, syntax);
1884 if (token->type != OP_ALT && token->type != END_OF_RE
1885 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1887 branch = parse_branch (regexp, preg, token, syntax, nest, err);
1888 if (BE (*err != REG_NOERROR && branch == NULL, 0))
1890 free_bin_tree (tree);
1891 return NULL;
1894 else
1895 branch = NULL;
1896 tree = create_tree (tree, branch, 0, new_idx);
1897 if (BE (new_idx == -1 || tree == NULL, 0))
1899 *err = REG_ESPACE;
1900 return NULL;
1902 dfa->has_plural_match = 1;
1904 return tree;
1907 /* This function build the following tree, from regular expression
1908 <exp1><exp2>:
1912 <exp1> <exp2>
1914 CAT means concatenation. */
1916 static bin_tree_t *
1917 parse_branch (regexp, preg, token, syntax, nest, err)
1918 re_string_t *regexp;
1919 regex_t *preg;
1920 re_token_t *token;
1921 reg_syntax_t syntax;
1922 int nest;
1923 reg_errcode_t *err;
1925 bin_tree_t *tree, *exp;
1926 tree = parse_expression (regexp, preg, token, syntax, nest, err);
1927 if (BE (*err != REG_NOERROR && tree == NULL, 0))
1928 return NULL;
1930 while (token->type != OP_ALT && token->type != END_OF_RE
1931 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1933 exp = parse_expression (regexp, preg, token, syntax, nest, err);
1934 if (BE (*err != REG_NOERROR && exp == NULL, 0))
1936 free_bin_tree (tree);
1937 return NULL;
1939 if (tree != NULL && exp != NULL)
1941 tree = create_tree (tree, exp, CONCAT, 0);
1942 if (tree == NULL)
1944 *err = REG_ESPACE;
1945 return NULL;
1948 else if (tree == NULL)
1949 tree = exp;
1950 /* Otherwise exp == NULL, we don't need to create new tree. */
1952 return tree;
1955 /* This function build the following tree, from regular expression a*:
1961 static bin_tree_t *
1962 parse_expression (regexp, preg, token, syntax, nest, err)
1963 re_string_t *regexp;
1964 regex_t *preg;
1965 re_token_t *token;
1966 reg_syntax_t syntax;
1967 int nest;
1968 reg_errcode_t *err;
1970 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1971 bin_tree_t *tree;
1972 int new_idx;
1973 switch (token->type)
1975 case CHARACTER:
1976 new_idx = re_dfa_add_node (dfa, *token, 0);
1977 tree = create_tree (NULL, NULL, 0, new_idx);
1978 if (BE (new_idx == -1 || tree == NULL, 0))
1980 *err = REG_ESPACE;
1981 return NULL;
1983 #ifdef RE_ENABLE_I18N
1984 if (MB_CUR_MAX > 1)
1986 while (!re_string_eoi (regexp)
1987 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
1989 bin_tree_t *mbc_remain;
1990 *token = fetch_token (regexp, syntax);
1991 new_idx = re_dfa_add_node (dfa, *token, 0);
1992 mbc_remain = create_tree (NULL, NULL, 0, new_idx);
1993 tree = create_tree (tree, mbc_remain, CONCAT, 0);
1994 if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
1995 return *err = REG_ESPACE, NULL;
1998 #endif
1999 break;
2000 case OP_OPEN_SUBEXP:
2001 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2002 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2003 return NULL;
2004 break;
2005 case OP_OPEN_BRACKET:
2006 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2007 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2008 return NULL;
2009 break;
2010 case OP_BACK_REF:
2011 if (BE (preg->re_nsub < token->opr.idx
2012 || dfa->subexps[token->opr.idx - 1].end == -1, 0))
2014 *err = REG_ESUBREG;
2015 return NULL;
2017 dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
2018 new_idx = re_dfa_add_node (dfa, *token, 0);
2019 tree = create_tree (NULL, NULL, 0, new_idx);
2020 if (BE (new_idx == -1 || tree == NULL, 0))
2022 *err = REG_ESPACE;
2023 return NULL;
2025 ++dfa->nbackref;
2026 dfa->has_mb_node = 1;
2027 break;
2028 case OP_DUP_ASTERISK:
2029 case OP_DUP_PLUS:
2030 case OP_DUP_QUESTION:
2031 case OP_OPEN_DUP_NUM:
2032 if (syntax & RE_CONTEXT_INVALID_OPS)
2034 *err = REG_BADRPT;
2035 return NULL;
2037 else if (syntax & RE_CONTEXT_INDEP_OPS)
2039 *token = fetch_token (regexp, syntax);
2040 return parse_expression (regexp, preg, token, syntax, nest, err);
2042 /* else fall through */
2043 case OP_CLOSE_SUBEXP:
2044 if ((token->type == OP_CLOSE_SUBEXP) &&
2045 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2047 *err = REG_ERPAREN;
2048 return NULL;
2050 /* else fall through */
2051 case OP_CLOSE_DUP_NUM:
2052 /* We treat it as a normal character. */
2054 /* Then we can these characters as normal characters. */
2055 token->type = CHARACTER;
2056 new_idx = re_dfa_add_node (dfa, *token, 0);
2057 tree = create_tree (NULL, NULL, 0, new_idx);
2058 if (BE (new_idx == -1 || tree == NULL, 0))
2060 *err = REG_ESPACE;
2061 return NULL;
2063 break;
2064 case ANCHOR:
2065 if (dfa->word_char == NULL)
2067 *err = init_word_char (dfa);
2068 if (BE (*err != REG_NOERROR, 0))
2069 return NULL;
2071 if (token->opr.ctx_type == WORD_DELIM)
2073 bin_tree_t *tree_first, *tree_last;
2074 int idx_first, idx_last;
2075 token->opr.ctx_type = WORD_FIRST;
2076 idx_first = re_dfa_add_node (dfa, *token, 0);
2077 tree_first = create_tree (NULL, NULL, 0, idx_first);
2078 token->opr.ctx_type = WORD_LAST;
2079 idx_last = re_dfa_add_node (dfa, *token, 0);
2080 tree_last = create_tree (NULL, NULL, 0, idx_last);
2081 token->type = OP_ALT;
2082 new_idx = re_dfa_add_node (dfa, *token, 0);
2083 tree = create_tree (tree_first, tree_last, 0, new_idx);
2084 if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2085 || tree_first == NULL || tree_last == NULL
2086 || tree == NULL, 0))
2088 *err = REG_ESPACE;
2089 return NULL;
2092 else
2094 new_idx = re_dfa_add_node (dfa, *token, 0);
2095 tree = create_tree (NULL, NULL, 0, new_idx);
2096 if (BE (new_idx == -1 || tree == NULL, 0))
2097 return *err = REG_ESPACE, NULL;
2099 /* We must return here, since ANCHORs can't be followed
2100 by repetition operators.
2101 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2102 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2103 *token = fetch_token (regexp, syntax);
2104 return tree;
2105 case OP_PERIOD:
2106 new_idx = re_dfa_add_node (dfa, *token, 0);
2107 tree = create_tree (NULL, NULL, 0, new_idx);
2108 if (BE (new_idx == -1 || tree == NULL, 0))
2110 *err = REG_ESPACE;
2111 return NULL;
2113 if (MB_CUR_MAX > 1)
2114 dfa->has_mb_node = 1;
2115 break;
2116 case OP_WORD:
2117 tree = build_word_op (dfa, 0, err);
2118 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2119 return NULL;
2120 break;
2121 case OP_NOTWORD:
2122 tree = build_word_op (dfa, 1, err);
2123 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2124 return NULL;
2125 break;
2126 case OP_ALT:
2127 case END_OF_RE:
2128 return NULL;
2129 case BACK_SLASH:
2130 *err = REG_EESCAPE;
2131 return NULL;
2132 default:
2133 /* Must not happen? */
2134 #ifdef DEBUG
2135 assert (0);
2136 #endif
2137 return NULL;
2139 *token = fetch_token (regexp, syntax);
2141 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2142 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2144 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2145 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2146 return NULL;
2147 dfa->has_plural_match = 1;
2150 return tree;
2153 /* This function build the following tree, from regular expression
2154 (<reg_exp>):
2155 SUBEXP
2157 <reg_exp>
2160 static bin_tree_t *
2161 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2162 re_string_t *regexp;
2163 regex_t *preg;
2164 re_token_t *token;
2165 reg_syntax_t syntax;
2166 int nest;
2167 reg_errcode_t *err;
2169 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2170 bin_tree_t *tree, *left_par, *right_par;
2171 size_t cur_nsub;
2172 int new_idx;
2173 cur_nsub = preg->re_nsub++;
2174 if (dfa->subexps_alloc < preg->re_nsub)
2176 re_subexp_t *new_array;
2177 dfa->subexps_alloc *= 2;
2178 new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2179 if (BE (new_array == NULL, 0))
2181 dfa->subexps_alloc /= 2;
2182 *err = REG_ESPACE;
2183 return NULL;
2185 dfa->subexps = new_array;
2187 dfa->subexps[cur_nsub].start = dfa->nodes_len;
2188 dfa->subexps[cur_nsub].end = -1;
2190 new_idx = re_dfa_add_node (dfa, *token, 0);
2191 left_par = create_tree (NULL, NULL, 0, new_idx);
2192 if (BE (new_idx == -1 || left_par == NULL, 0))
2194 *err = REG_ESPACE;
2195 return NULL;
2197 dfa->nodes[new_idx].opr.idx = cur_nsub;
2198 *token = fetch_token (regexp, syntax);
2200 /* The subexpression may be a null string. */
2201 if (token->type == OP_CLOSE_SUBEXP)
2202 tree = NULL;
2203 else
2205 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2206 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2207 return NULL;
2209 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2211 free_bin_tree (tree);
2212 *err = REG_BADPAT;
2213 return NULL;
2215 new_idx = re_dfa_add_node (dfa, *token, 0);
2216 dfa->subexps[cur_nsub].end = dfa->nodes_len;
2217 right_par = create_tree (NULL, NULL, 0, new_idx);
2218 tree = ((tree == NULL) ? right_par
2219 : create_tree (tree, right_par, CONCAT, 0));
2220 tree = create_tree (left_par, tree, CONCAT, 0);
2221 if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2223 *err = REG_ESPACE;
2224 return NULL;
2226 dfa->nodes[new_idx].opr.idx = cur_nsub;
2228 return tree;
2231 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2233 static bin_tree_t *
2234 parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2235 bin_tree_t *dup_elem;
2236 re_string_t *regexp;
2237 re_dfa_t *dfa;
2238 re_token_t *token;
2239 reg_syntax_t syntax;
2240 reg_errcode_t *err;
2242 re_token_t dup_token;
2243 bin_tree_t *tree = dup_elem, *work_tree;
2244 int new_idx, start_idx = re_string_cur_idx (regexp);
2245 re_token_t start_token = *token;
2246 if (token->type == OP_OPEN_DUP_NUM)
2248 int i;
2249 int end = 0;
2250 int start = fetch_number (regexp, token, syntax);
2251 bin_tree_t *elem;
2252 if (start == -1)
2254 if (token->type == CHARACTER && token->opr.c == ',')
2255 start = 0; /* We treat "{,m}" as "{0,m}". */
2256 else
2258 *err = REG_BADBR; /* <re>{} is invalid. */
2259 return NULL;
2262 if (BE (start != -2, 1))
2264 /* We treat "{n}" as "{n,n}". */
2265 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2266 : ((token->type == CHARACTER && token->opr.c == ',')
2267 ? fetch_number (regexp, token, syntax) : -2));
2269 if (BE (start == -2 || end == -2, 0))
2271 /* Invalid sequence. */
2272 if (token->type == OP_CLOSE_DUP_NUM)
2273 goto parse_dup_op_invalid_interval;
2274 else
2275 goto parse_dup_op_ebrace;
2277 if (BE (start == 0 && end == 0, 0))
2279 /* We treat "<re>{0}" and "<re>{0,0}" as null string. */
2280 *token = fetch_token (regexp, syntax);
2281 free_bin_tree (dup_elem);
2282 return NULL;
2285 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2286 elem = tree;
2287 for (i = 0; i < start; ++i)
2288 if (i != 0)
2290 work_tree = duplicate_tree (elem, dfa);
2291 tree = create_tree (tree, work_tree, CONCAT, 0);
2292 if (BE (work_tree == NULL || tree == NULL, 0))
2293 goto parse_dup_op_espace;
2296 if (end == -1)
2298 /* We treat "<re>{0,}" as "<re>*". */
2299 dup_token.type = OP_DUP_ASTERISK;
2300 if (start > 0)
2302 elem = duplicate_tree (elem, dfa);
2303 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2304 work_tree = create_tree (elem, NULL, 0, new_idx);
2305 tree = create_tree (tree, work_tree, CONCAT, 0);
2306 if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2307 || tree == NULL, 0))
2308 goto parse_dup_op_espace;
2310 else
2312 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2313 tree = create_tree (elem, NULL, 0, new_idx);
2314 if (BE (new_idx == -1 || tree == NULL, 0))
2315 goto parse_dup_op_espace;
2318 else if (end - start > 0)
2320 /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?". */
2321 dup_token.type = OP_DUP_QUESTION;
2322 if (start > 0)
2324 elem = duplicate_tree (elem, dfa);
2325 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2326 elem = create_tree (elem, NULL, 0, new_idx);
2327 tree = create_tree (tree, elem, CONCAT, 0);
2328 if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2329 goto parse_dup_op_espace;
2331 else
2333 new_idx = re_dfa_add_node (dfa, dup_token, 0);
2334 tree = elem = create_tree (elem, NULL, 0, new_idx);
2335 if (BE (new_idx == -1 || tree == NULL, 0))
2336 goto parse_dup_op_espace;
2338 for (i = 1; i < end - start; ++i)
2340 work_tree = duplicate_tree (elem, dfa);
2341 tree = create_tree (tree, work_tree, CONCAT, 0);
2342 if (BE (work_tree == NULL || tree == NULL, 0))
2344 *err = REG_ESPACE;
2345 return NULL;
2350 else
2352 new_idx = re_dfa_add_node (dfa, *token, 0);
2353 tree = create_tree (tree, NULL, 0, new_idx);
2354 if (BE (new_idx == -1 || tree == NULL, 0))
2356 *err = REG_ESPACE;
2357 return NULL;
2360 *token = fetch_token (regexp, syntax);
2361 return tree;
2363 parse_dup_op_espace:
2364 free_bin_tree (tree);
2365 *err = REG_ESPACE;
2366 return NULL;
2368 parse_dup_op_ebrace:
2369 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2371 *err = REG_EBRACE;
2372 return NULL;
2374 goto parse_dup_op_rollback;
2375 parse_dup_op_invalid_interval:
2376 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2378 *err = REG_BADBR;
2379 return NULL;
2381 parse_dup_op_rollback:
2382 re_string_set_index (regexp, start_idx);
2383 *token = start_token;
2384 token->type = CHARACTER;
2385 return dup_elem;
2388 /* Size of the names for collating symbol/equivalence_class/character_class.
2389 I'm not sure, but maybe enough. */
2390 #define BRACKET_NAME_BUF_SIZE 32
2392 #ifndef _LIBC
2393 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2394 Build the range expression which starts from START_ELEM, and ends
2395 at END_ELEM. The result are written to MBCSET and SBCSET.
2396 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2397 mbcset->range_ends, is a pointer argument sinse we may
2398 update it. */
2400 static reg_errcode_t
2401 # ifdef RE_ENABLE_I18N
2402 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2403 re_charset_t *mbcset;
2404 int *range_alloc;
2405 # else /* not RE_ENABLE_I18N */
2406 build_range_exp (sbcset, start_elem, end_elem)
2407 # endif /* not RE_ENABLE_I18N */
2408 re_bitset_ptr_t sbcset;
2409 bracket_elem_t *start_elem, *end_elem;
2411 unsigned int start_ch, end_ch;
2412 /* Equivalence Classes and Character Classes can't be a range start/end. */
2413 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2414 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2416 return REG_ERANGE;
2418 /* We can handle no multi character collating elements without libc
2419 support. */
2420 if (BE ((start_elem->type == COLL_SYM
2421 && strlen ((char *) start_elem->opr.name) > 1)
2422 || (end_elem->type == COLL_SYM
2423 && strlen ((char *) end_elem->opr.name) > 1), 0))
2424 return REG_ECOLLATE;
2426 # ifdef RE_ENABLE_I18N
2428 wchar_t wc, start_wc, end_wc;
2429 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2431 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2432 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2433 : 0));
2434 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2435 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2436 : 0));
2437 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2438 ? __btowc (start_ch) : start_elem->opr.wch);
2439 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2440 ? __btowc (end_ch) : end_elem->opr.wch);
2441 cmp_buf[0] = start_wc;
2442 cmp_buf[4] = end_wc;
2443 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2444 return REG_ERANGE;
2446 /* Check the space of the arrays. */
2447 if (*range_alloc == mbcset->nranges)
2449 /* There are not enough space, need realloc. */
2450 wchar_t *new_array_start, *new_array_end;
2451 int new_nranges;
2453 /* +1 in case of mbcset->nranges is 0. */
2454 new_nranges = 2 * mbcset->nranges + 1;
2455 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2456 are NULL if *range_alloc == 0. */
2457 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2458 new_nranges);
2459 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2460 new_nranges);
2462 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2463 return REG_ESPACE;
2465 mbcset->range_starts = new_array_start;
2466 mbcset->range_ends = new_array_end;
2467 *range_alloc = new_nranges;
2470 mbcset->range_starts[mbcset->nranges] = start_wc;
2471 mbcset->range_ends[mbcset->nranges++] = end_wc;
2473 /* Build the table for single byte characters. */
2474 for (wc = 0; wc <= SBC_MAX; ++wc)
2476 cmp_buf[2] = wc;
2477 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2478 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2479 bitset_set (sbcset, wc);
2482 # else /* not RE_ENABLE_I18N */
2484 unsigned int ch;
2485 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2486 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2487 : 0));
2488 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2489 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2490 : 0));
2491 if (start_ch > end_ch)
2492 return REG_ERANGE;
2493 /* Build the table for single byte characters. */
2494 for (ch = 0; ch <= SBC_MAX; ++ch)
2495 if (start_ch <= ch && ch <= end_ch)
2496 bitset_set (sbcset, ch);
2498 # endif /* not RE_ENABLE_I18N */
2499 return REG_NOERROR;
2501 #endif /* not _LIBC */
2503 #ifndef _LIBC
2504 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2505 Build the collating element which is represented by NAME.
2506 The result are written to MBCSET and SBCSET.
2507 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2508 pointer argument since we may update it. */
2510 static reg_errcode_t
2511 # ifdef RE_ENABLE_I18N
2512 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2513 re_charset_t *mbcset;
2514 int *coll_sym_alloc;
2515 # else /* not RE_ENABLE_I18N */
2516 build_collating_symbol (sbcset, name)
2517 # endif /* not RE_ENABLE_I18N */
2518 re_bitset_ptr_t sbcset;
2519 const unsigned char *name;
2521 size_t name_len = strlen ((const char *) name);
2522 if (BE (name_len != 1, 0))
2523 return REG_ECOLLATE;
2524 else
2526 bitset_set (sbcset, name[0]);
2527 return REG_NOERROR;
2530 #endif /* not _LIBC */
2532 /* This function parse bracket expression like "[abc]", "[a-c]",
2533 "[[.a-a.]]" etc. */
2535 static bin_tree_t *
2536 parse_bracket_exp (regexp, dfa, token, syntax, err)
2537 re_string_t *regexp;
2538 re_dfa_t *dfa;
2539 re_token_t *token;
2540 reg_syntax_t syntax;
2541 reg_errcode_t *err;
2543 #ifdef _LIBC
2544 const unsigned char *collseqmb;
2545 const char *collseqwc;
2546 uint32_t nrules;
2547 int32_t table_size;
2548 const int32_t *symb_table;
2549 const unsigned char *extra;
2551 /* Local function for parse_bracket_exp used in _LIBC environement.
2552 Seek the collating symbol entry correspondings to NAME.
2553 Return the index of the symbol in the SYMB_TABLE. */
2555 static inline int32_t
2556 seek_collating_symbol_entry (name, name_len)
2557 const unsigned char *name;
2558 size_t name_len;
2560 int32_t hash = elem_hash ((const char *) name, name_len);
2561 int32_t elem = hash % table_size;
2562 int32_t second = hash % (table_size - 2);
2563 while (symb_table[2 * elem] != 0)
2565 /* First compare the hashing value. */
2566 if (symb_table[2 * elem] == hash
2567 /* Compare the length of the name. */
2568 && name_len == extra[symb_table[2 * elem + 1]]
2569 /* Compare the name. */
2570 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2571 name_len) == 0)
2573 /* Yep, this is the entry. */
2574 break;
2577 /* Next entry. */
2578 elem += second;
2580 return elem;
2583 /* Local function for parse_bracket_exp used in _LIBC environement.
2584 Look up the collation sequence value of BR_ELEM.
2585 Return the value if succeeded, UINT_MAX otherwise. */
2587 static inline unsigned int
2588 lookup_collation_sequence_value (br_elem)
2589 bracket_elem_t *br_elem;
2591 if (br_elem->type == SB_CHAR)
2594 if (MB_CUR_MAX == 1)
2596 if (nrules == 0)
2597 return collseqmb[br_elem->opr.ch];
2598 else
2600 wint_t wc = __btowc (br_elem->opr.ch);
2601 return collseq_table_lookup (collseqwc, wc);
2604 else if (br_elem->type == MB_CHAR)
2606 return collseq_table_lookup (collseqwc, br_elem->opr.wch);
2608 else if (br_elem->type == COLL_SYM)
2610 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2611 if (nrules != 0)
2613 int32_t elem, idx;
2614 elem = seek_collating_symbol_entry (br_elem->opr.name,
2615 sym_name_len);
2616 if (symb_table[2 * elem] != 0)
2618 /* We found the entry. */
2619 idx = symb_table[2 * elem + 1];
2620 /* Skip the name of collating element name. */
2621 idx += 1 + extra[idx];
2622 /* Skip the byte sequence of the collating element. */
2623 idx += 1 + extra[idx];
2624 /* Adjust for the alignment. */
2625 idx = (idx + 3) & ~3;
2626 /* Skip the multibyte collation sequence value. */
2627 idx += sizeof (unsigned int);
2628 /* Skip the wide char sequence of the collating element. */
2629 idx += sizeof (unsigned int) *
2630 (1 + *(unsigned int *) (extra + idx));
2631 /* Return the collation sequence value. */
2632 return *(unsigned int *) (extra + idx);
2634 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2636 /* No valid character. Match it as a single byte
2637 character. */
2638 return collseqmb[br_elem->opr.name[0]];
2641 else if (sym_name_len == 1)
2642 return collseqmb[br_elem->opr.name[0]];
2644 return UINT_MAX;
2647 /* Local function for parse_bracket_exp used in _LIBC environement.
2648 Build the range expression which starts from START_ELEM, and ends
2649 at END_ELEM. The result are written to MBCSET and SBCSET.
2650 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2651 mbcset->range_ends, is a pointer argument sinse we may
2652 update it. */
2654 static inline reg_errcode_t
2655 # ifdef RE_ENABLE_I18N
2656 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2657 re_charset_t *mbcset;
2658 int *range_alloc;
2659 # else /* not RE_ENABLE_I18N */
2660 build_range_exp (sbcset, start_elem, end_elem)
2661 # endif /* not RE_ENABLE_I18N */
2662 re_bitset_ptr_t sbcset;
2663 bracket_elem_t *start_elem, *end_elem;
2665 unsigned int ch;
2666 uint32_t start_collseq;
2667 uint32_t end_collseq;
2669 # ifdef RE_ENABLE_I18N
2670 /* Check the space of the arrays. */
2671 if (*range_alloc == mbcset->nranges)
2673 /* There are not enough space, need realloc. */
2674 uint32_t *new_array_start;
2675 uint32_t *new_array_end;
2676 int new_nranges;
2678 /* +1 in case of mbcset->nranges is 0. */
2679 new_nranges = 2 * mbcset->nranges + 1;
2680 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2681 are NULL if *range_alloc == 0. */
2682 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2683 new_nranges);
2684 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2685 new_nranges);
2687 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2688 return REG_ESPACE;
2690 mbcset->range_starts = new_array_start;
2691 mbcset->range_ends = new_array_end;
2692 *range_alloc = new_nranges;
2694 # endif /* RE_ENABLE_I18N */
2696 /* Equivalence Classes and Character Classes can't be a range
2697 start/end. */
2698 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2699 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2701 return REG_ERANGE;
2703 start_collseq = lookup_collation_sequence_value (start_elem);
2704 end_collseq = lookup_collation_sequence_value (end_elem);
2705 /* Check start/end collation sequence values. */
2706 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2707 return REG_ECOLLATE;
2708 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2709 return REG_ERANGE;
2711 # ifdef RE_ENABLE_I18N
2712 /* Got valid collation sequence values, add them as a new entry. */
2713 mbcset->range_starts[mbcset->nranges] = start_collseq;
2714 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2715 # endif /* RE_ENABLE_I18N */
2717 /* Build the table for single byte characters. */
2718 for (ch = 0; ch <= SBC_MAX; ch++)
2720 uint32_t ch_collseq;
2722 if (MB_CUR_MAX == 1)
2724 if (nrules == 0)
2725 ch_collseq = collseqmb[ch];
2726 else
2727 ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch));
2728 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2729 bitset_set (sbcset, ch);
2731 return REG_NOERROR;
2734 /* Local function for parse_bracket_exp used in _LIBC environement.
2735 Build the collating element which is represented by NAME.
2736 The result are written to MBCSET and SBCSET.
2737 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2738 pointer argument sinse we may update it. */
2740 static inline reg_errcode_t
2741 # ifdef RE_ENABLE_I18N
2742 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2743 re_charset_t *mbcset;
2744 int *coll_sym_alloc;
2745 # else /* not RE_ENABLE_I18N */
2746 build_collating_symbol (sbcset, name)
2747 # endif /* not RE_ENABLE_I18N */
2748 re_bitset_ptr_t sbcset;
2749 const unsigned char *name;
2751 int32_t elem, idx;
2752 size_t name_len = strlen ((const char *) name);
2753 if (nrules != 0)
2755 elem = seek_collating_symbol_entry (name, name_len);
2756 if (symb_table[2 * elem] != 0)
2758 /* We found the entry. */
2759 idx = symb_table[2 * elem + 1];
2760 /* Skip the name of collating element name. */
2761 idx += 1 + extra[idx];
2763 else if (symb_table[2 * elem] == 0 && name_len == 1)
2765 /* No valid character, treat it as a normal
2766 character. */
2767 bitset_set (sbcset, name[0]);
2768 return REG_NOERROR;
2770 else
2771 return REG_ECOLLATE;
2773 # ifdef RE_ENABLE_I18N
2774 /* Got valid collation sequence, add it as a new entry. */
2775 /* Check the space of the arrays. */
2776 if (*coll_sym_alloc == mbcset->ncoll_syms)
2778 /* Not enough, realloc it. */
2779 /* +1 in case of mbcset->ncoll_syms is 0. */
2780 *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2781 /* Use realloc since mbcset->coll_syms is NULL
2782 if *alloc == 0. */
2783 mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2784 *coll_sym_alloc);
2785 if (BE (mbcset->coll_syms == NULL, 0))
2786 return REG_ESPACE;
2788 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2789 # endif /* RE_ENABLE_I18N */
2790 return REG_NOERROR;
2792 else
2794 if (BE (name_len != 1, 0))
2795 return REG_ECOLLATE;
2796 else
2798 bitset_set (sbcset, name[0]);
2799 return REG_NOERROR;
2803 #endif
2805 re_token_t br_token;
2806 re_bitset_ptr_t sbcset;
2807 #ifdef RE_ENABLE_I18N
2808 re_charset_t *mbcset;
2809 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2810 int equiv_class_alloc = 0, char_class_alloc = 0;
2811 #else /* not RE_ENABLE_I18N */
2812 int non_match = 0;
2813 #endif /* not RE_ENABLE_I18N */
2814 bin_tree_t *work_tree;
2815 int token_len, new_idx;
2816 #ifdef _LIBC
2817 collseqmb = (const unsigned char *)
2818 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2819 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2820 if (nrules)
2823 if (MB_CUR_MAX > 1)
2825 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2826 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2827 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2828 _NL_COLLATE_SYMB_TABLEMB);
2829 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2830 _NL_COLLATE_SYMB_EXTRAMB);
2832 #endif
2833 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2834 #ifdef RE_ENABLE_I18N
2835 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2836 #endif /* RE_ENABLE_I18N */
2837 #ifdef RE_ENABLE_I18N
2838 if (BE (sbcset == NULL || mbcset == NULL, 0))
2839 #else
2840 if (BE (sbcset == NULL, 0))
2841 #endif /* RE_ENABLE_I18N */
2843 *err = REG_ESPACE;
2844 return NULL;
2847 token_len = peek_token_bracket (token, regexp, syntax);
2848 if (BE (token->type == END_OF_RE, 0))
2850 *err = REG_BADPAT;
2851 goto parse_bracket_exp_free_return;
2853 if (token->type == OP_NON_MATCH_LIST)
2855 #ifdef RE_ENABLE_I18N
2856 int i;
2857 mbcset->non_match = 1;
2858 #else /* not RE_ENABLE_I18N */
2859 non_match = 1;
2860 #endif /* not RE_ENABLE_I18N */
2861 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2862 bitset_set (sbcset, '\0');
2863 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
2864 token_len = peek_token_bracket (token, regexp, syntax);
2865 if (BE (token->type == END_OF_RE, 0))
2867 *err = REG_BADPAT;
2868 goto parse_bracket_exp_free_return;
2870 #ifdef RE_ENABLE_I18N
2871 if (MB_CUR_MAX > 1)
2872 for (i = 0; i < SBC_MAX; ++i)
2873 if (__btowc (i) == WEOF)
2874 bitset_set (sbcset, i);
2875 #endif /* RE_ENABLE_I18N */
2878 /* We treat the first ']' as a normal character. */
2879 if (token->type == OP_CLOSE_BRACKET)
2880 token->type = CHARACTER;
2882 while (1)
2884 bracket_elem_t start_elem, end_elem;
2885 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2886 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
2887 reg_errcode_t ret;
2888 int token_len2 = 0, is_range_exp = 0;
2889 re_token_t token2;
2891 start_elem.opr.name = start_name_buf;
2892 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
2893 syntax);
2894 if (BE (ret != REG_NOERROR, 0))
2896 *err = ret;
2897 goto parse_bracket_exp_free_return;
2900 token_len = peek_token_bracket (token, regexp, syntax);
2901 if (BE (token->type == END_OF_RE, 0))
2903 *err = REG_BADPAT;
2904 goto parse_bracket_exp_free_return;
2906 if (token->type == OP_CHARSET_RANGE)
2908 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
2909 token_len2 = peek_token_bracket (&token2, regexp, syntax);
2910 if (BE (token->type == END_OF_RE, 0))
2912 *err = REG_BADPAT;
2913 goto parse_bracket_exp_free_return;
2915 if (token2.type == OP_CLOSE_BRACKET)
2917 /* We treat the last '-' as a normal character. */
2918 re_string_skip_bytes (regexp, -token_len);
2919 token->type = CHARACTER;
2921 else
2922 is_range_exp = 1;
2925 if (is_range_exp == 1)
2927 end_elem.opr.name = end_name_buf;
2928 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
2929 dfa, syntax);
2930 if (BE (ret != REG_NOERROR, 0))
2932 *err = ret;
2933 goto parse_bracket_exp_free_return;
2936 token_len = peek_token_bracket (token, regexp, syntax);
2937 if (BE (token->type == END_OF_RE, 0))
2939 *err = REG_BADPAT;
2940 goto parse_bracket_exp_free_return;
2942 *err = build_range_exp (sbcset,
2943 #ifdef RE_ENABLE_I18N
2944 mbcset, &range_alloc,
2945 #endif /* RE_ENABLE_I18N */
2946 &start_elem, &end_elem);
2947 if (BE (*err != REG_NOERROR, 0))
2948 goto parse_bracket_exp_free_return;
2950 else
2952 switch (start_elem.type)
2954 case SB_CHAR:
2955 bitset_set (sbcset, start_elem.opr.ch);
2956 break;
2957 #ifdef RE_ENABLE_I18N
2958 case MB_CHAR:
2959 /* Check whether the array has enough space. */
2960 if (mbchar_alloc == mbcset->nmbchars)
2962 /* Not enough, realloc it. */
2963 /* +1 in case of mbcset->nmbchars is 0. */
2964 mbchar_alloc = 2 * mbcset->nmbchars + 1;
2965 /* Use realloc since array is NULL if *alloc == 0. */
2966 mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
2967 mbchar_alloc);
2968 if (BE (mbcset->mbchars == NULL, 0))
2969 goto parse_bracket_exp_espace;
2971 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
2972 break;
2973 #endif /* RE_ENABLE_I18N */
2974 case EQUIV_CLASS:
2975 *err = build_equiv_class (sbcset,
2976 #ifdef RE_ENABLE_I18N
2977 mbcset, &equiv_class_alloc,
2978 #endif /* RE_ENABLE_I18N */
2979 start_elem.opr.name);
2980 if (BE (*err != REG_NOERROR, 0))
2981 goto parse_bracket_exp_free_return;
2982 break;
2983 case COLL_SYM:
2984 *err = build_collating_symbol (sbcset,
2985 #ifdef RE_ENABLE_I18N
2986 mbcset, &coll_sym_alloc,
2987 #endif /* RE_ENABLE_I18N */
2988 start_elem.opr.name);
2989 if (BE (*err != REG_NOERROR, 0))
2990 goto parse_bracket_exp_free_return;
2991 break;
2992 case CHAR_CLASS:
2993 ret = build_charclass (sbcset,
2994 #ifdef RE_ENABLE_I18N
2995 mbcset, &char_class_alloc,
2996 #endif /* RE_ENABLE_I18N */
2997 start_elem.opr.name, syntax);
2998 if (BE (ret != REG_NOERROR, 0))
2999 goto parse_bracket_exp_espace;
3000 break;
3001 default:
3002 assert (0);
3003 break;
3006 if (token->type == OP_CLOSE_BRACKET)
3007 break;
3010 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3012 /* If it is non-matching list. */
3013 #ifdef RE_ENABLE_I18N
3014 if (mbcset->non_match)
3015 #else /* not RE_ENABLE_I18N */
3016 if (non_match)
3017 #endif /* not RE_ENABLE_I18N */
3018 bitset_not (sbcset);
3020 /* Build a tree for simple bracket. */
3021 br_token.type = SIMPLE_BRACKET;
3022 br_token.opr.sbcset = sbcset;
3023 new_idx = re_dfa_add_node (dfa, br_token, 0);
3024 work_tree = create_tree (NULL, NULL, 0, new_idx);
3025 if (BE (new_idx == -1 || work_tree == NULL, 0))
3026 goto parse_bracket_exp_espace;
3028 #ifdef RE_ENABLE_I18N
3029 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3030 || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
3031 || mbcset->non_match)))
3033 re_token_t alt_token;
3034 bin_tree_t *mbc_tree;
3035 /* Build a tree for complex bracket. */
3036 br_token.type = COMPLEX_BRACKET;
3037 br_token.opr.mbcset = mbcset;
3038 dfa->has_mb_node = 1;
3039 new_idx = re_dfa_add_node (dfa, br_token, 0);
3040 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3041 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3042 goto parse_bracket_exp_espace;
3043 /* Then join them by ALT node. */
3044 dfa->has_plural_match = 1;
3045 alt_token.type = OP_ALT;
3046 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3047 work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
3048 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3049 return work_tree;
3051 else
3053 free_charset (mbcset);
3054 return work_tree;
3056 #else /* not RE_ENABLE_I18N */
3057 return work_tree;
3058 #endif /* not RE_ENABLE_I18N */
3060 parse_bracket_exp_espace:
3061 *err = REG_ESPACE;
3062 parse_bracket_exp_free_return:
3063 re_free (sbcset);
3064 #ifdef RE_ENABLE_I18N
3065 free_charset (mbcset);
3066 #endif /* RE_ENABLE_I18N */
3067 return NULL;
3070 /* Parse an element in the bracket expression. */
3072 static reg_errcode_t
3073 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
3074 bracket_elem_t *elem;
3075 re_string_t *regexp;
3076 re_token_t *token;
3077 int token_len;
3078 re_dfa_t *dfa;
3079 reg_syntax_t syntax;
3081 #ifdef RE_ENABLE_I18N
3082 int cur_char_size;
3083 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3084 if (cur_char_size > 1)
3086 elem->type = MB_CHAR;
3087 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3088 re_string_skip_bytes (regexp, cur_char_size);
3089 return REG_NOERROR;
3091 #endif /* RE_ENABLE_I18N */
3092 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3093 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3094 || token->type == OP_OPEN_EQUIV_CLASS)
3095 return parse_bracket_symbol (elem, regexp, token);
3096 elem->type = SB_CHAR;
3097 elem->opr.ch = token->opr.c;
3098 return REG_NOERROR;
3101 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3102 such as [:<character_class>:], [.<collating_element>.], and
3103 [=<equivalent_class>=]. */
3105 static reg_errcode_t
3106 parse_bracket_symbol (elem, regexp, token)
3107 bracket_elem_t *elem;
3108 re_string_t *regexp;
3109 re_token_t *token;
3111 unsigned char ch, delim = token->opr.c;
3112 int i = 0;
3113 for (;; ++i)
3115 if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
3116 return REG_EBRACK;
3117 if (token->type == OP_OPEN_CHAR_CLASS)
3118 ch = re_string_fetch_byte_case (regexp);
3119 else
3120 ch = re_string_fetch_byte (regexp);
3121 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3122 break;
3123 elem->opr.name[i] = ch;
3125 re_string_skip_bytes (regexp, 1);
3126 elem->opr.name[i] = '\0';
3127 switch (token->type)
3129 case OP_OPEN_COLL_ELEM:
3130 elem->type = COLL_SYM;
3131 break;
3132 case OP_OPEN_EQUIV_CLASS:
3133 elem->type = EQUIV_CLASS;
3134 break;
3135 case OP_OPEN_CHAR_CLASS:
3136 elem->type = CHAR_CLASS;
3137 break;
3138 default:
3139 break;
3141 return REG_NOERROR;
3144 /* Helper function for parse_bracket_exp.
3145 Build the equivalence class which is represented by NAME.
3146 The result are written to MBCSET and SBCSET.
3147 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3148 is a pointer argument sinse we may update it. */
3150 static reg_errcode_t
3151 #ifdef RE_ENABLE_I18N
3152 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3153 re_charset_t *mbcset;
3154 int *equiv_class_alloc;
3155 #else /* not RE_ENABLE_I18N */
3156 build_equiv_class (sbcset, name)
3157 #endif /* not RE_ENABLE_I18N */
3158 re_bitset_ptr_t sbcset;
3159 const unsigned char *name;
3161 #if defined _LIBC && defined RE_ENABLE_I18N
3162 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3163 if (nrules != 0)
3165 const int32_t *table, *indirect;
3166 const unsigned char *weights, *extra, *cp;
3167 unsigned char char_buf[2];
3168 int32_t idx1, idx2;
3169 unsigned int ch;
3170 size_t len;
3171 /* This #include defines a local function! */
3172 # include <locale/weight.h>
3173 /* Calculate the index for equivalence class. */
3174 cp = name;
3175 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3176 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3177 _NL_COLLATE_WEIGHTMB);
3178 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3179 _NL_COLLATE_EXTRAMB);
3180 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3181 _NL_COLLATE_INDIRECTMB);
3182 idx1 = findidx (&cp);
3183 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3184 /* This isn't a valid character. */
3185 return REG_ECOLLATE;
3187 /* Build single byte matcing table for this equivalence class. */
3188 char_buf[1] = (unsigned char) '\0';
3189 len = weights[idx1];
3190 for (ch = 0; ch < SBC_MAX; ++ch)
3192 char_buf[0] = ch;
3193 cp = char_buf;
3194 idx2 = findidx (&cp);
3196 idx2 = table[ch];
3198 if (idx2 == 0)
3199 /* This isn't a valid character. */
3200 continue;
3201 if (len == weights[idx2])
3203 int cnt = 0;
3204 while (cnt <= len &&
3205 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3206 ++cnt;
3208 if (cnt > len)
3209 bitset_set (sbcset, ch);
3212 /* Check whether the array has enough space. */
3213 if (*equiv_class_alloc == mbcset->nequiv_classes)
3215 /* Not enough, realloc it. */
3216 /* +1 in case of mbcset->nequiv_classes is 0. */
3217 *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3218 /* Use realloc since the array is NULL if *alloc == 0. */
3219 mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3220 *equiv_class_alloc);
3221 if (BE (mbcset->equiv_classes == NULL, 0))
3222 return REG_ESPACE;
3224 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3226 else
3227 #endif /* _LIBC && RE_ENABLE_I18N */
3229 if (BE (strlen ((const char *) name) != 1, 0))
3230 return REG_ECOLLATE;
3231 bitset_set (sbcset, *name);
3233 return REG_NOERROR;
3236 /* Helper function for parse_bracket_exp.
3237 Build the character class which is represented by NAME.
3238 The result are written to MBCSET and SBCSET.
3239 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3240 is a pointer argument sinse we may update it. */
3242 static reg_errcode_t
3243 #ifdef RE_ENABLE_I18N
3244 build_charclass (sbcset, mbcset, char_class_alloc, class_name, syntax)
3245 re_charset_t *mbcset;
3246 int *char_class_alloc;
3247 #else /* not RE_ENABLE_I18N */
3248 build_charclass (sbcset, class_name, syntax)
3249 #endif /* not RE_ENABLE_I18N */
3250 re_bitset_ptr_t sbcset;
3251 const unsigned char *class_name;
3252 reg_syntax_t syntax;
3254 int i;
3255 const char *name = (const char *) class_name;
3257 /* In case of REG_ICASE "upper" and "lower" match the both of
3258 upper and lower cases. */
3259 if ((syntax & RE_ICASE)
3260 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3261 name = "alpha";
3263 #ifdef RE_ENABLE_I18N
3264 /* Check the space of the arrays. */
3265 if (*char_class_alloc == mbcset->nchar_classes)
3267 /* Not enough, realloc it. */
3268 /* +1 in case of mbcset->nchar_classes is 0. */
3269 *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3270 /* Use realloc since array is NULL if *alloc == 0. */
3271 mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3272 *char_class_alloc);
3273 if (BE (mbcset->char_classes == NULL, 0))
3274 return REG_ESPACE;
3276 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3277 #endif /* RE_ENABLE_I18N */
3279 #define BUILD_CHARCLASS_LOOP(ctype_func)\
3280 for (i = 0; i < SBC_MAX; ++i) \
3282 if (ctype_func (i)) \
3283 bitset_set (sbcset, i); \
3286 if (strcmp (name, "alnum") == 0)
3287 BUILD_CHARCLASS_LOOP (isalnum)
3288 else if (strcmp (name, "cntrl") == 0)
3289 BUILD_CHARCLASS_LOOP (iscntrl)
3290 else if (strcmp (name, "lower") == 0)
3291 BUILD_CHARCLASS_LOOP (islower)
3292 else if (strcmp (name, "space") == 0)
3293 BUILD_CHARCLASS_LOOP (isspace)
3294 else if (strcmp (name, "alpha") == 0)
3295 BUILD_CHARCLASS_LOOP (isalpha)
3296 else if (strcmp (name, "digit") == 0)
3297 BUILD_CHARCLASS_LOOP (isdigit)
3298 else if (strcmp (name, "print") == 0)
3299 BUILD_CHARCLASS_LOOP (isprint)
3300 else if (strcmp (name, "upper") == 0)
3301 BUILD_CHARCLASS_LOOP (isupper)
3302 else if (strcmp (name, "blank") == 0)
3303 BUILD_CHARCLASS_LOOP (isblank)
3304 else if (strcmp (name, "graph") == 0)
3305 BUILD_CHARCLASS_LOOP (isgraph)
3306 else if (strcmp (name, "punct") == 0)
3307 BUILD_CHARCLASS_LOOP (ispunct)
3308 else if (strcmp (name, "xdigit") == 0)
3309 BUILD_CHARCLASS_LOOP (isxdigit)
3310 else
3311 return REG_ECTYPE;
3313 return REG_NOERROR;
3316 static bin_tree_t *
3317 build_word_op (dfa, not, err)
3318 re_dfa_t *dfa;
3319 int not;
3320 reg_errcode_t *err;
3322 re_bitset_ptr_t sbcset;
3323 #ifdef RE_ENABLE_I18N
3324 re_charset_t *mbcset;
3325 int alloc = 0;
3326 #else /* not RE_ENABLE_I18N */
3327 int non_match = 0;
3328 #endif /* not RE_ENABLE_I18N */
3329 reg_errcode_t ret;
3330 re_token_t br_token;
3331 bin_tree_t *tree;
3332 int new_idx;
3334 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3335 #ifdef RE_ENABLE_I18N
3336 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3337 #endif /* RE_ENABLE_I18N */
3339 #ifdef RE_ENABLE_I18N
3340 if (BE (sbcset == NULL || mbcset == NULL, 0))
3341 #else /* not RE_ENABLE_I18N */
3342 if (BE (sbcset == NULL, 0))
3343 #endif /* not RE_ENABLE_I18N */
3345 *err = REG_ESPACE;
3346 return NULL;
3349 if (not)
3351 #ifdef RE_ENABLE_I18N
3352 int i;
3354 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3355 bitset_set(cset->sbcset, '\0');
3357 mbcset->non_match = 1;
3358 if (MB_CUR_MAX > 1)
3359 for (i = 0; i < SBC_MAX; ++i)
3360 if (__btowc (i) == WEOF)
3361 bitset_set (sbcset, i);
3362 #else /* not RE_ENABLE_I18N */
3363 non_match = 1;
3364 #endif /* not RE_ENABLE_I18N */
3367 /* We don't care the syntax in this case. */
3368 ret = build_charclass (sbcset,
3369 #ifdef RE_ENABLE_I18N
3370 mbcset, &alloc,
3371 #endif /* RE_ENABLE_I18N */
3372 (const unsigned char *) "alpha", 0);
3374 if (BE (ret != REG_NOERROR, 0))
3376 re_free (sbcset);
3377 #ifdef RE_ENABLE_I18N
3378 free_charset (mbcset);
3379 #endif /* RE_ENABLE_I18N */
3380 *err = REG_ESPACE;
3381 return NULL;
3383 /* \w match '_' also. */
3384 bitset_set (sbcset, '_');
3386 /* If it is non-matching list. */
3387 #ifdef RE_ENABLE_I18N
3388 if (mbcset->non_match)
3389 #else /* not RE_ENABLE_I18N */
3390 if (non_match)
3391 #endif /* not RE_ENABLE_I18N */
3392 bitset_not (sbcset);
3394 /* Build a tree for simple bracket. */
3395 br_token.type = SIMPLE_BRACKET;
3396 br_token.opr.sbcset = sbcset;
3397 new_idx = re_dfa_add_node (dfa, br_token, 0);
3398 tree = create_tree (NULL, NULL, 0, new_idx);
3399 if (BE (new_idx == -1 || tree == NULL, 0))
3400 goto build_word_op_espace;
3402 #ifdef RE_ENABLE_I18N
3403 if (MB_CUR_MAX > 1)
3405 re_token_t alt_token;
3406 bin_tree_t *mbc_tree;
3407 /* Build a tree for complex bracket. */
3408 br_token.type = COMPLEX_BRACKET;
3409 br_token.opr.mbcset = mbcset;
3410 dfa->has_mb_node = 1;
3411 new_idx = re_dfa_add_node (dfa, br_token, 0);
3412 mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3413 if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3414 goto build_word_op_espace;
3415 /* Then join them by ALT node. */
3416 alt_token.type = OP_ALT;
3417 new_idx = re_dfa_add_node (dfa, alt_token, 0);
3418 tree = create_tree (tree, mbc_tree, 0, new_idx);
3419 if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3420 return tree;
3422 else
3424 free_charset (mbcset);
3425 return tree;
3427 #else /* not RE_ENABLE_I18N */
3428 return tree;
3429 #endif /* not RE_ENABLE_I18N */
3431 build_word_op_espace:
3432 re_free (sbcset);
3433 #ifdef RE_ENABLE_I18N
3434 free_charset (mbcset);
3435 #endif /* RE_ENABLE_I18N */
3436 *err = REG_ESPACE;
3437 return NULL;
3440 /* This is intended for the expressions like "a{1,3}".
3441 Fetch a number from `input', and return the number.
3442 Return -1, if the number field is empty like "{,1}".
3443 Return -2, If an error is occured. */
3445 static int
3446 fetch_number (input, token, syntax)
3447 re_string_t *input;
3448 re_token_t *token;
3449 reg_syntax_t syntax;
3451 int num = -1;
3452 unsigned char c;
3453 while (1)
3455 *token = fetch_token (input, syntax);
3456 c = token->opr.c;
3457 if (BE (token->type == END_OF_RE, 0))
3458 return -2;
3459 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3460 break;
3461 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3462 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3463 num = (num > RE_DUP_MAX) ? -2 : num;
3465 return num;
3468 #ifdef RE_ENABLE_I18N
3469 static void
3470 free_charset (re_charset_t *cset)
3472 re_free (cset->mbchars);
3473 # ifdef _LIBC
3474 re_free (cset->coll_syms);
3475 re_free (cset->equiv_classes);
3476 re_free (cset->range_starts);
3477 re_free (cset->range_ends);
3478 # endif
3479 re_free (cset->char_classes);
3480 re_free (cset);
3482 #endif /* RE_ENABLE_I18N */
3484 /* Functions for binary tree operation. */
3486 /* Create a node of tree.
3487 Note: This function automatically free left and right if malloc fails. */
3489 static bin_tree_t *
3490 create_tree (left, right, type, index)
3491 bin_tree_t *left;
3492 bin_tree_t *right;
3493 re_token_type_t type;
3494 int index;
3496 bin_tree_t *tree;
3497 tree = re_malloc (bin_tree_t, 1);
3498 if (BE (tree == NULL, 0))
3500 free_bin_tree (left);
3501 free_bin_tree (right);
3502 return NULL;
3504 tree->parent = NULL;
3505 tree->left = left;
3506 tree->right = right;
3507 tree->type = type;
3508 tree->node_idx = index;
3509 tree->first = -1;
3510 tree->next = -1;
3511 re_node_set_init_empty (&tree->eclosure);
3513 if (left != NULL)
3514 left->parent = tree;
3515 if (right != NULL)
3516 right->parent = tree;
3517 return tree;
3520 /* Free the sub tree pointed by TREE. */
3522 static void
3523 free_bin_tree (tree)
3524 bin_tree_t *tree;
3526 if (tree == NULL)
3527 return;
3528 /*re_node_set_free (&tree->eclosure);*/
3529 free_bin_tree (tree->left);
3530 free_bin_tree (tree->right);
3531 re_free (tree);
3534 /* Duplicate the node SRC, and return new node. */
3536 static bin_tree_t *
3537 duplicate_tree (src, dfa)
3538 const bin_tree_t *src;
3539 re_dfa_t *dfa;
3541 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3542 int new_node_idx;
3543 /* Since node indies must be according to Post-order of the tree,
3544 we must duplicate the left at first. */
3545 if (src->left != NULL)
3547 left = duplicate_tree (src->left, dfa);
3548 if (left == NULL)
3549 return NULL;
3552 /* Secondaly, duplicate the right. */
3553 if (src->right != NULL)
3555 right = duplicate_tree (src->right, dfa);
3556 if (right == NULL)
3558 free_bin_tree (left);
3559 return NULL;
3563 /* At last, duplicate itself. */
3564 if (src->type == NON_TYPE)
3566 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3567 dfa->nodes[new_node_idx].duplicated = 1;
3568 if (BE (new_node_idx == -1, 0))
3570 free_bin_tree (left);
3571 free_bin_tree (right);
3572 return NULL;
3575 else
3576 new_node_idx = src->type;
3578 new_tree = create_tree (left, right, src->type, new_node_idx);
3579 if (BE (new_tree == NULL, 0))
3581 free_bin_tree (left);
3582 free_bin_tree (right);
3584 return new_tree;