log 2.3.5-0.fc3.8
[glibc.git] / posix / regcomp.c
blob2100bb542066c5761bef00889907a8a8727f9905
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 int length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27 static void init_word_char (re_dfa_t *dfa);
28 #ifdef RE_ENABLE_I18N
29 static void free_charset (re_charset_t *cset);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 #ifdef RE_ENABLE_I18N
34 static void optimize_utf8 (re_dfa_t *dfa);
35 #endif
36 struct subexp_optimize
38 re_dfa_t *dfa;
39 re_token_t *nodes;
40 int no_sub, re_nsub;
42 static bin_tree_t *optimize_subexps (struct subexp_optimize *so,
43 bin_tree_t *node, int sidx, int depth);
44 static reg_errcode_t analyze (re_dfa_t *dfa);
45 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
46 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
47 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
48 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
49 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
50 int top_clone_node, int root_node,
51 unsigned int constraint);
52 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
53 unsigned int constraint);
54 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
55 unsigned int constraint);
56 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
57 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
58 int node, int root);
59 static void calc_inveclosure (re_dfa_t *dfa);
60 static int fetch_number (re_string_t *input, re_token_t *token,
61 reg_syntax_t syntax);
62 static void fetch_token (re_token_t *result, re_string_t *input,
63 reg_syntax_t syntax);
64 static int peek_token (re_token_t *token, re_string_t *input,
65 reg_syntax_t syntax);
66 static int peek_token_bracket (re_token_t *token, re_string_t *input,
67 reg_syntax_t syntax);
68 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
69 reg_syntax_t syntax, reg_errcode_t *err);
70 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 int nest, reg_errcode_t *err);
73 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
74 re_token_t *token, reg_syntax_t syntax,
75 int nest, reg_errcode_t *err);
76 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
77 re_token_t *token, reg_syntax_t syntax,
78 int nest, reg_errcode_t *err);
79 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
80 re_token_t *token, reg_syntax_t syntax,
81 int nest, reg_errcode_t *err);
82 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
83 re_dfa_t *dfa, re_token_t *token,
84 reg_syntax_t syntax, reg_errcode_t *err);
85 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
86 re_token_t *token, reg_syntax_t syntax,
87 reg_errcode_t *err);
88 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
89 re_string_t *regexp,
90 re_token_t *token, int token_len,
91 re_dfa_t *dfa,
92 reg_syntax_t syntax,
93 int accept_hyphen);
94 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
95 re_string_t *regexp,
96 re_token_t *token);
97 #ifndef _LIBC
98 # ifdef RE_ENABLE_I18N
99 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
100 re_charset_t *mbcset, int *range_alloc,
101 bracket_elem_t *start_elem,
102 bracket_elem_t *end_elem);
103 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
104 re_charset_t *mbcset,
105 int *coll_sym_alloc,
106 const unsigned char *name);
107 # else /* not RE_ENABLE_I18N */
108 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
109 bracket_elem_t *start_elem,
110 bracket_elem_t *end_elem);
111 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
112 const unsigned char *name);
113 # endif /* not RE_ENABLE_I18N */
114 #endif /* not _LIBC */
115 #ifdef RE_ENABLE_I18N
116 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
117 re_charset_t *mbcset,
118 int *equiv_class_alloc,
119 const unsigned char *name);
120 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
121 re_bitset_ptr_t sbcset,
122 re_charset_t *mbcset,
123 int *char_class_alloc,
124 const unsigned char *class_name,
125 reg_syntax_t syntax);
126 #else /* not RE_ENABLE_I18N */
127 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
128 const unsigned char *name);
129 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
130 re_bitset_ptr_t sbcset,
131 const unsigned char *class_name,
132 reg_syntax_t syntax);
133 #endif /* not RE_ENABLE_I18N */
134 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
135 unsigned RE_TRANSLATE_TYPE trans,
136 const unsigned char *class_name,
137 const unsigned char *extra,
138 int non_match, reg_errcode_t *err);
139 static bin_tree_t *create_tree (re_dfa_t *dfa,
140 bin_tree_t *left, bin_tree_t *right,
141 re_token_type_t type, int index);
142 static bin_tree_t *re_dfa_add_tree_node (re_dfa_t *dfa,
143 bin_tree_t *left, bin_tree_t *right,
144 const re_token_t *token)
145 __attribute ((noinline));
146 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
147 static void mark_opt_subexp (const bin_tree_t *src, re_dfa_t *dfa);
148 static void mark_opt_subexp_iter (const bin_tree_t *src, re_dfa_t *dfa, int idx);
150 /* This table gives an error message for each of the error codes listed
151 in regex.h. Obviously the order here has to be same as there.
152 POSIX doesn't require that we do anything for REG_NOERROR,
153 but why not be nice? */
155 const char __re_error_msgid[] attribute_hidden =
157 #define REG_NOERROR_IDX 0
158 gettext_noop ("Success") /* REG_NOERROR */
159 "\0"
160 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
161 gettext_noop ("No match") /* REG_NOMATCH */
162 "\0"
163 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
164 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
165 "\0"
166 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
167 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
168 "\0"
169 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
170 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
171 "\0"
172 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
173 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
174 "\0"
175 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
176 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
177 "\0"
178 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
179 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
180 "\0"
181 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
182 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
183 "\0"
184 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
185 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
186 "\0"
187 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
188 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
189 "\0"
190 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
191 gettext_noop ("Invalid range end") /* REG_ERANGE */
192 "\0"
193 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
194 gettext_noop ("Memory exhausted") /* REG_ESPACE */
195 "\0"
196 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
197 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
198 "\0"
199 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
200 gettext_noop ("Premature end of regular expression") /* REG_EEND */
201 "\0"
202 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
203 gettext_noop ("Regular expression too big") /* REG_ESIZE */
204 "\0"
205 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
206 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
209 const size_t __re_error_msgid_idx[] attribute_hidden =
211 REG_NOERROR_IDX,
212 REG_NOMATCH_IDX,
213 REG_BADPAT_IDX,
214 REG_ECOLLATE_IDX,
215 REG_ECTYPE_IDX,
216 REG_EESCAPE_IDX,
217 REG_ESUBREG_IDX,
218 REG_EBRACK_IDX,
219 REG_EPAREN_IDX,
220 REG_EBRACE_IDX,
221 REG_BADBR_IDX,
222 REG_ERANGE_IDX,
223 REG_ESPACE_IDX,
224 REG_BADRPT_IDX,
225 REG_EEND_IDX,
226 REG_ESIZE_IDX,
227 REG_ERPAREN_IDX
230 /* Entry points for GNU code. */
232 /* re_compile_pattern is the GNU regular expression compiler: it
233 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
234 Returns 0 if the pattern was valid, otherwise an error string.
236 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
237 are set in BUFP on entry. */
239 const char *
240 re_compile_pattern (pattern, length, bufp)
241 const char *pattern;
242 size_t length;
243 struct re_pattern_buffer *bufp;
245 reg_errcode_t ret;
247 /* And GNU code determines whether or not to get register information
248 by passing null for the REGS argument to re_match, etc., not by
249 setting no_sub, unless RE_NO_SUB is set. */
250 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
252 /* Match anchors at newline. */
253 bufp->newline_anchor = 1;
255 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
257 if (!ret)
258 return NULL;
259 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
261 #ifdef _LIBC
262 weak_alias (__re_compile_pattern, re_compile_pattern)
263 #endif
265 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
266 also be assigned to arbitrarily: each pattern buffer stores its own
267 syntax, so it can be changed between regex compilations. */
268 /* This has no initializer because initialized variables in Emacs
269 become read-only after dumping. */
270 reg_syntax_t re_syntax_options;
273 /* Specify the precise syntax of regexps for compilation. This provides
274 for compatibility for various utilities which historically have
275 different, incompatible syntaxes.
277 The argument SYNTAX is a bit mask comprised of the various bits
278 defined in regex.h. We return the old syntax. */
280 reg_syntax_t
281 re_set_syntax (syntax)
282 reg_syntax_t syntax;
284 reg_syntax_t ret = re_syntax_options;
286 re_syntax_options = syntax;
287 return ret;
289 #ifdef _LIBC
290 weak_alias (__re_set_syntax, re_set_syntax)
291 #endif
294 re_compile_fastmap (bufp)
295 struct re_pattern_buffer *bufp;
297 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
298 char *fastmap = bufp->fastmap;
300 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
301 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
302 if (dfa->init_state != dfa->init_state_word)
303 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
304 if (dfa->init_state != dfa->init_state_nl)
305 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
306 if (dfa->init_state != dfa->init_state_begbuf)
307 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
308 bufp->fastmap_accurate = 1;
309 return 0;
311 #ifdef _LIBC
312 weak_alias (__re_compile_fastmap, re_compile_fastmap)
313 #endif
315 static inline void
316 __attribute ((always_inline))
317 re_set_fastmap (char *fastmap, int icase, int ch)
319 fastmap[ch] = 1;
320 if (icase)
321 fastmap[tolower (ch)] = 1;
324 /* Helper function for re_compile_fastmap.
325 Compile fastmap for the initial_state INIT_STATE. */
327 static void
328 re_compile_fastmap_iter (bufp, init_state, fastmap)
329 regex_t *bufp;
330 const re_dfastate_t *init_state;
331 char *fastmap;
333 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
334 int node_cnt;
335 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
336 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
338 int node = init_state->nodes.elems[node_cnt];
339 re_token_type_t type = dfa->nodes[node].type;
341 if (type == CHARACTER)
343 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
344 #ifdef RE_ENABLE_I18N
345 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
347 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
348 wchar_t wc;
349 mbstate_t state;
351 p = buf;
352 *p++ = dfa->nodes[node].opr.c;
353 while (++node < dfa->nodes_len
354 && dfa->nodes[node].type == CHARACTER
355 && dfa->nodes[node].mb_partial)
356 *p++ = dfa->nodes[node].opr.c;
357 memset (&state, 0, sizeof (state));
358 if (mbrtowc (&wc, (const char *) buf, p - buf,
359 &state) == p - buf
360 && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
361 re_set_fastmap (fastmap, 0, buf[0]);
363 #endif
365 else if (type == SIMPLE_BRACKET)
367 int i, j, ch;
368 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
369 for (j = 0; j < UINT_BITS; ++j, ++ch)
370 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
371 re_set_fastmap (fastmap, icase, ch);
373 #ifdef RE_ENABLE_I18N
374 else if (type == COMPLEX_BRACKET)
376 int i;
377 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
378 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
379 || cset->nranges || cset->nchar_classes)
381 # ifdef _LIBC
382 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
384 /* In this case we want to catch the bytes which are
385 the first byte of any collation elements.
386 e.g. In da_DK, we want to catch 'a' since "aa"
387 is a valid collation element, and don't catch
388 'b' since 'b' is the only collation element
389 which starts from 'b'. */
390 int j, ch;
391 const int32_t *table = (const int32_t *)
392 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
393 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
394 for (j = 0; j < UINT_BITS; ++j, ++ch)
395 if (table[ch] < 0)
396 re_set_fastmap (fastmap, icase, ch);
398 # else
399 if (dfa->mb_cur_max > 1)
400 for (i = 0; i < SBC_MAX; ++i)
401 if (__btowc (i) == WEOF)
402 re_set_fastmap (fastmap, icase, i);
403 # endif /* not _LIBC */
405 for (i = 0; i < cset->nmbchars; ++i)
407 char buf[256];
408 mbstate_t state;
409 memset (&state, '\0', sizeof (state));
410 __wcrtomb (buf, cset->mbchars[i], &state);
411 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
412 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
414 __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
415 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
419 #endif /* RE_ENABLE_I18N */
420 else if (type == OP_PERIOD
421 #ifdef RE_ENABLE_I18N
422 || type == OP_UTF8_PERIOD
423 #endif /* RE_ENABLE_I18N */
424 || type == END_OF_RE)
426 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
427 if (type == END_OF_RE)
428 bufp->can_be_null = 1;
429 return;
434 /* Entry point for POSIX code. */
435 /* regcomp takes a regular expression as a string and compiles it.
437 PREG is a regex_t *. We do not expect any fields to be initialized,
438 since POSIX says we shouldn't. Thus, we set
440 `buffer' to the compiled pattern;
441 `used' to the length of the compiled pattern;
442 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
443 REG_EXTENDED bit in CFLAGS is set; otherwise, to
444 RE_SYNTAX_POSIX_BASIC;
445 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
446 `fastmap' to an allocated space for the fastmap;
447 `fastmap_accurate' to zero;
448 `re_nsub' to the number of subexpressions in PATTERN.
450 PATTERN is the address of the pattern string.
452 CFLAGS is a series of bits which affect compilation.
454 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
455 use POSIX basic syntax.
457 If REG_NEWLINE is set, then . and [^...] don't match newline.
458 Also, regexec will try a match beginning after every newline.
460 If REG_ICASE is set, then we considers upper- and lowercase
461 versions of letters to be equivalent when matching.
463 If REG_NOSUB is set, then when PREG is passed to regexec, that
464 routine will report only success or failure, and nothing about the
465 registers.
467 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
468 the return codes and their meanings.) */
471 regcomp (preg, pattern, cflags)
472 regex_t *__restrict preg;
473 const char *__restrict pattern;
474 int cflags;
476 reg_errcode_t ret;
477 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
478 : RE_SYNTAX_POSIX_BASIC);
480 preg->buffer = NULL;
481 preg->allocated = 0;
482 preg->used = 0;
484 /* Try to allocate space for the fastmap. */
485 preg->fastmap = re_malloc (char, SBC_MAX);
486 if (BE (preg->fastmap == NULL, 0))
487 return REG_ESPACE;
489 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
491 /* If REG_NEWLINE is set, newlines are treated differently. */
492 if (cflags & REG_NEWLINE)
493 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
494 syntax &= ~RE_DOT_NEWLINE;
495 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
496 /* It also changes the matching behavior. */
497 preg->newline_anchor = 1;
499 else
500 preg->newline_anchor = 0;
501 preg->no_sub = !!(cflags & REG_NOSUB);
502 preg->translate = NULL;
504 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
506 /* POSIX doesn't distinguish between an unmatched open-group and an
507 unmatched close-group: both are REG_EPAREN. */
508 if (ret == REG_ERPAREN)
509 ret = REG_EPAREN;
511 /* We have already checked preg->fastmap != NULL. */
512 if (BE (ret == REG_NOERROR, 1))
513 /* Compute the fastmap now, since regexec cannot modify the pattern
514 buffer. This function never fails in this implementation. */
515 (void) re_compile_fastmap (preg);
516 else
518 /* Some error occurred while compiling the expression. */
519 re_free (preg->fastmap);
520 preg->fastmap = NULL;
523 return (int) ret;
525 #ifdef _LIBC
526 weak_alias (__regcomp, regcomp)
527 #endif
529 /* Returns a message corresponding to an error code, ERRCODE, returned
530 from either regcomp or regexec. We don't use PREG here. */
532 size_t
533 regerror (errcode, preg, errbuf, errbuf_size)
534 int errcode;
535 const regex_t *preg;
536 char *errbuf;
537 size_t errbuf_size;
539 const char *msg;
540 size_t msg_size;
542 if (BE (errcode < 0
543 || errcode >= (int) (sizeof (__re_error_msgid_idx)
544 / sizeof (__re_error_msgid_idx[0])), 0))
545 /* Only error codes returned by the rest of the code should be passed
546 to this routine. If we are given anything else, or if other regex
547 code generates an invalid error code, then the program has a bug.
548 Dump core so we can fix it. */
549 abort ();
551 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
553 msg_size = strlen (msg) + 1; /* Includes the null. */
555 if (BE (errbuf_size != 0, 1))
557 if (BE (msg_size > errbuf_size, 0))
559 #if defined HAVE_MEMPCPY || defined _LIBC
560 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
561 #else
562 memcpy (errbuf, msg, errbuf_size - 1);
563 errbuf[errbuf_size - 1] = 0;
564 #endif
566 else
567 memcpy (errbuf, msg, msg_size);
570 return msg_size;
572 #ifdef _LIBC
573 weak_alias (__regerror, regerror)
574 #endif
577 #ifdef RE_ENABLE_I18N
578 /* This static array is used for the map to single-byte characters when
579 UTF-8 is used. Otherwise we would allocate memory just to initialize
580 it the same all the time. UTF-8 is the preferred encoding so this is
581 a worthwhile optimization. */
582 static const bitset utf8_sb_map =
584 /* Set the first 128 bits. */
585 # if UINT_MAX == 0xffffffff
586 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
587 # else
588 # error "Add case for new unsigned int size"
589 # endif
591 #endif
594 static void
595 free_dfa_content (re_dfa_t *dfa)
597 int i, j;
599 if (dfa->nodes)
600 for (i = 0; i < dfa->nodes_len; ++i)
602 re_token_t *node = dfa->nodes + i;
603 #ifdef RE_ENABLE_I18N
604 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
605 free_charset (node->opr.mbcset);
606 else
607 #endif /* RE_ENABLE_I18N */
608 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
609 re_free (node->opr.sbcset);
611 re_free (dfa->nexts);
612 for (i = 0; i < dfa->nodes_len; ++i)
614 if (dfa->eclosures != NULL)
615 re_node_set_free (dfa->eclosures + i);
616 if (dfa->inveclosures != NULL)
617 re_node_set_free (dfa->inveclosures + i);
618 if (dfa->edests != NULL)
619 re_node_set_free (dfa->edests + i);
621 re_free (dfa->edests);
622 re_free (dfa->eclosures);
623 re_free (dfa->inveclosures);
624 re_free (dfa->nodes);
626 if (dfa->state_table)
627 for (i = 0; i <= dfa->state_hash_mask; ++i)
629 struct re_state_table_entry *entry = dfa->state_table + i;
630 for (j = 0; j < entry->num; ++j)
632 re_dfastate_t *state = entry->array[j];
633 free_state (state);
635 re_free (entry->array);
637 re_free (dfa->state_table);
638 #ifdef RE_ENABLE_I18N
639 if (dfa->sb_char != utf8_sb_map)
640 re_free (dfa->sb_char);
641 #endif
642 re_free (dfa->subexp_map);
643 #ifdef DEBUG
644 re_free (dfa->re_str);
645 #endif
647 re_free (dfa);
651 /* Free dynamically allocated space used by PREG. */
653 void
654 regfree (preg)
655 regex_t *preg;
657 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
658 if (BE (dfa != NULL, 1))
659 free_dfa_content (dfa);
660 preg->buffer = NULL;
661 preg->allocated = 0;
663 re_free (preg->fastmap);
664 preg->fastmap = NULL;
666 re_free (preg->translate);
667 preg->translate = NULL;
669 #ifdef _LIBC
670 weak_alias (__regfree, regfree)
671 #endif
673 /* Entry points compatible with 4.2 BSD regex library. We don't define
674 them unless specifically requested. */
676 #if defined _REGEX_RE_COMP || defined _LIBC
678 /* BSD has one and only one pattern buffer. */
679 static struct re_pattern_buffer re_comp_buf;
681 char *
682 # ifdef _LIBC
683 /* Make these definitions weak in libc, so POSIX programs can redefine
684 these names if they don't use our functions, and still use
685 regcomp/regexec above without link errors. */
686 weak_function
687 # endif
688 re_comp (s)
689 const char *s;
691 reg_errcode_t ret;
692 char *fastmap;
694 if (!s)
696 if (!re_comp_buf.buffer)
697 return gettext ("No previous regular expression");
698 return 0;
701 if (re_comp_buf.buffer)
703 fastmap = re_comp_buf.fastmap;
704 re_comp_buf.fastmap = NULL;
705 __regfree (&re_comp_buf);
706 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
707 re_comp_buf.fastmap = fastmap;
710 if (re_comp_buf.fastmap == NULL)
712 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
713 if (re_comp_buf.fastmap == NULL)
714 return (char *) gettext (__re_error_msgid
715 + __re_error_msgid_idx[(int) REG_ESPACE]);
718 /* Since `re_exec' always passes NULL for the `regs' argument, we
719 don't need to initialize the pattern buffer fields which affect it. */
721 /* Match anchors at newlines. */
722 re_comp_buf.newline_anchor = 1;
724 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
726 if (!ret)
727 return NULL;
729 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
730 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
733 #ifdef _LIBC
734 libc_freeres_fn (free_mem)
736 __regfree (&re_comp_buf);
738 #endif
740 #endif /* _REGEX_RE_COMP */
742 /* Internal entry point.
743 Compile the regular expression PATTERN, whose length is LENGTH.
744 SYNTAX indicate regular expression's syntax. */
746 static reg_errcode_t
747 re_compile_internal (preg, pattern, length, syntax)
748 regex_t *preg;
749 const char * pattern;
750 int length;
751 reg_syntax_t syntax;
753 reg_errcode_t err = REG_NOERROR;
754 re_dfa_t *dfa;
755 re_string_t regexp;
757 /* Initialize the pattern buffer. */
758 preg->fastmap_accurate = 0;
759 preg->syntax = syntax;
760 preg->not_bol = preg->not_eol = 0;
761 preg->used = 0;
762 preg->re_nsub = 0;
763 preg->can_be_null = 0;
764 preg->regs_allocated = REGS_UNALLOCATED;
766 /* Initialize the dfa. */
767 dfa = (re_dfa_t *) preg->buffer;
768 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
770 /* If zero allocated, but buffer is non-null, try to realloc
771 enough space. This loses if buffer's address is bogus, but
772 that is the user's responsibility. If ->buffer is NULL this
773 is a simple allocation. */
774 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
775 if (dfa == NULL)
776 return REG_ESPACE;
777 preg->allocated = sizeof (re_dfa_t);
778 preg->buffer = (unsigned char *) dfa;
780 preg->used = sizeof (re_dfa_t);
782 __libc_lock_init (dfa->lock);
784 err = init_dfa (dfa, length);
785 if (BE (err != REG_NOERROR, 0))
787 free_dfa_content (dfa);
788 preg->buffer = NULL;
789 preg->allocated = 0;
790 return err;
792 #ifdef DEBUG
793 dfa->re_str = re_malloc (char, length + 1);
794 strncpy (dfa->re_str, pattern, length + 1);
795 #endif
797 err = re_string_construct (&regexp, pattern, length, preg->translate,
798 syntax & RE_ICASE, dfa);
799 if (BE (err != REG_NOERROR, 0))
801 re_compile_internal_free_return:
802 free_workarea_compile (preg);
803 re_string_destruct (&regexp);
804 free_dfa_content (dfa);
805 preg->buffer = NULL;
806 preg->allocated = 0;
807 return err;
810 /* Parse the regular expression, and build a structure tree. */
811 preg->re_nsub = 0;
812 dfa->str_tree = parse (&regexp, preg, syntax, &err);
813 if (BE (dfa->str_tree == NULL, 0))
814 goto re_compile_internal_free_return;
816 #ifdef RE_ENABLE_I18N
817 /* If possible, do searching in single byte encoding to speed things up. */
818 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
819 optimize_utf8 (dfa);
820 #endif
822 if (preg->re_nsub > 0)
824 struct subexp_optimize so;
826 so.dfa = dfa;
827 so.nodes = dfa->nodes;
828 so.no_sub = preg->no_sub;
829 so.re_nsub = preg->re_nsub;
830 dfa->str_tree = optimize_subexps (&so, dfa->str_tree, -1, 0);
833 /* Analyze the tree and collect information which is necessary to
834 create the dfa. */
835 err = analyze (dfa);
836 if (BE (err != REG_NOERROR, 0))
837 goto re_compile_internal_free_return;
839 /* Then create the initial state of the dfa. */
840 err = create_initial_state (dfa);
842 /* Release work areas. */
843 free_workarea_compile (preg);
844 re_string_destruct (&regexp);
846 if (BE (err != REG_NOERROR, 0))
848 free_dfa_content (dfa);
849 preg->buffer = NULL;
850 preg->allocated = 0;
853 return err;
856 /* Initialize DFA. We use the length of the regular expression PAT_LEN
857 as the initial length of some arrays. */
859 static reg_errcode_t
860 init_dfa (dfa, pat_len)
861 re_dfa_t *dfa;
862 int pat_len;
864 int table_size;
865 #ifndef _LIBC
866 char *codeset_name;
867 #endif
869 memset (dfa, '\0', sizeof (re_dfa_t));
871 /* Force allocation of str_tree_storage the first time. */
872 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
874 dfa->nodes_alloc = pat_len + 1;
875 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
877 dfa->states_alloc = pat_len + 1;
879 /* table_size = 2 ^ ceil(log pat_len) */
880 for (table_size = 1; table_size > 0; table_size <<= 1)
881 if (table_size > pat_len)
882 break;
884 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
885 dfa->state_hash_mask = table_size - 1;
887 dfa->mb_cur_max = MB_CUR_MAX;
888 #ifdef _LIBC
889 if (dfa->mb_cur_max == 6
890 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
891 dfa->is_utf8 = 1;
892 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
893 != 0);
894 #else
895 # ifdef HAVE_LANGINFO_CODESET
896 codeset_name = nl_langinfo (CODESET);
897 # else
898 codeset_name = getenv ("LC_ALL");
899 if (codeset_name == NULL || codeset[0] == '\0')
900 codeset_name = getenv ("LC_CTYPE");
901 if (codeset_name == NULL || codeset[0] == '\0')
902 codeset_name = getenv ("LANG");
903 if (codeset_name == NULL)
904 codeset_name = "";
905 else if (strchr (codeset_name, '.') != NULL)
906 codeset_name = strchr (codeset_name, '.') + 1;
907 # endif
909 if (strcasecmp (codeset_name, "UTF-8") == 0
910 || strcasecmp (codeset_name, "UTF8") == 0)
911 dfa->is_utf8 = 1;
913 /* We check exhaustively in the loop below if this charset is a
914 superset of ASCII. */
915 dfa->map_notascii = 0;
916 #endif
918 #ifdef RE_ENABLE_I18N
919 if (dfa->mb_cur_max > 1)
921 if (dfa->is_utf8)
922 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
923 else
925 int i, j, ch;
927 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
928 if (BE (dfa->sb_char == NULL, 0))
929 return REG_ESPACE;
931 /* Clear all bits by, then set those corresponding to single
932 byte chars. */
933 bitset_empty (dfa->sb_char);
935 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
936 for (j = 0; j < UINT_BITS; ++j, ++ch)
938 wchar_t wch = __btowc (ch);
939 if (wch != WEOF)
940 dfa->sb_char[i] |= 1 << j;
941 # ifndef _LIBC
942 if (isascii (ch) && wch != (wchar_t) ch)
943 dfa->map_notascii = 1;
944 # endif
948 #endif
950 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
951 return REG_ESPACE;
952 return REG_NOERROR;
955 /* Initialize WORD_CHAR table, which indicate which character is
956 "word". In this case "word" means that it is the word construction
957 character used by some operators like "\<", "\>", etc. */
959 static void
960 init_word_char (dfa)
961 re_dfa_t *dfa;
963 int i, j, ch;
964 dfa->word_ops_used = 1;
965 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
966 for (j = 0; j < UINT_BITS; ++j, ++ch)
967 if (isalnum (ch) || ch == '_')
968 dfa->word_char[i] |= 1 << j;
971 /* Free the work area which are only used while compiling. */
973 static void
974 free_workarea_compile (preg)
975 regex_t *preg;
977 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
978 bin_tree_storage_t *storage, *next;
979 for (storage = dfa->str_tree_storage; storage; storage = next)
981 next = storage->next;
982 re_free (storage);
984 dfa->str_tree_storage = NULL;
985 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
986 dfa->str_tree = NULL;
987 re_free (dfa->org_indices);
988 dfa->org_indices = NULL;
991 /* Create initial states for all contexts. */
993 static reg_errcode_t
994 create_initial_state (dfa)
995 re_dfa_t *dfa;
997 int first, i;
998 reg_errcode_t err;
999 re_node_set init_nodes;
1001 /* Initial states have the epsilon closure of the node which is
1002 the first node of the regular expression. */
1003 first = dfa->str_tree->first;
1004 dfa->init_node = first;
1005 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1006 if (BE (err != REG_NOERROR, 0))
1007 return err;
1009 /* The back-references which are in initial states can epsilon transit,
1010 since in this case all of the subexpressions can be null.
1011 Then we add epsilon closures of the nodes which are the next nodes of
1012 the back-references. */
1013 if (dfa->nbackref > 0)
1014 for (i = 0; i < init_nodes.nelem; ++i)
1016 int node_idx = init_nodes.elems[i];
1017 re_token_type_t type = dfa->nodes[node_idx].type;
1019 int clexp_idx;
1020 if (type != OP_BACK_REF)
1021 continue;
1022 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1024 re_token_t *clexp_node;
1025 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1026 if (clexp_node->type == OP_CLOSE_SUBEXP
1027 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1028 break;
1030 if (clexp_idx == init_nodes.nelem)
1031 continue;
1033 if (type == OP_BACK_REF)
1035 int dest_idx = dfa->edests[node_idx].elems[0];
1036 if (!re_node_set_contains (&init_nodes, dest_idx))
1038 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1039 i = 0;
1044 /* It must be the first time to invoke acquire_state. */
1045 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1046 /* We don't check ERR here, since the initial state must not be NULL. */
1047 if (BE (dfa->init_state == NULL, 0))
1048 return err;
1049 if (dfa->init_state->has_constraint)
1051 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1052 CONTEXT_WORD);
1053 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1054 CONTEXT_NEWLINE);
1055 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1056 &init_nodes,
1057 CONTEXT_NEWLINE
1058 | CONTEXT_BEGBUF);
1059 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1060 || dfa->init_state_begbuf == NULL, 0))
1061 return err;
1063 else
1064 dfa->init_state_word = dfa->init_state_nl
1065 = dfa->init_state_begbuf = dfa->init_state;
1067 re_node_set_free (&init_nodes);
1068 return REG_NOERROR;
1071 #ifdef RE_ENABLE_I18N
1072 /* If it is possible to do searching in single byte encoding instead of UTF-8
1073 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1074 DFA nodes where needed. */
1076 static void
1077 optimize_utf8 (dfa)
1078 re_dfa_t *dfa;
1080 int node, i, mb_chars = 0, has_period = 0;
1082 for (node = 0; node < dfa->nodes_len; ++node)
1083 switch (dfa->nodes[node].type)
1085 case CHARACTER:
1086 if (dfa->nodes[node].opr.c >= 0x80)
1087 mb_chars = 1;
1088 break;
1089 case ANCHOR:
1090 switch (dfa->nodes[node].opr.idx)
1092 case LINE_FIRST:
1093 case LINE_LAST:
1094 case BUF_FIRST:
1095 case BUF_LAST:
1096 break;
1097 default:
1098 /* Word anchors etc. cannot be handled. */
1099 return;
1101 break;
1102 case OP_PERIOD:
1103 has_period = 1;
1104 break;
1105 case OP_BACK_REF:
1106 case OP_ALT:
1107 case END_OF_RE:
1108 case OP_DUP_ASTERISK:
1109 case OP_DUP_QUESTION:
1110 case OP_OPEN_SUBEXP:
1111 case OP_CLOSE_SUBEXP:
1112 break;
1113 case SIMPLE_BRACKET:
1114 /* Just double check. */
1115 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1116 if (dfa->nodes[node].opr.sbcset[i])
1117 return;
1118 break;
1119 default:
1120 return;
1123 if (mb_chars || has_period)
1124 for (node = 0; node < dfa->nodes_len; ++node)
1126 if (dfa->nodes[node].type == CHARACTER
1127 && dfa->nodes[node].opr.c >= 0x80)
1128 dfa->nodes[node].mb_partial = 0;
1129 else if (dfa->nodes[node].type == OP_PERIOD)
1130 dfa->nodes[node].type = OP_UTF8_PERIOD;
1133 /* The search can be in single byte locale. */
1134 dfa->mb_cur_max = 1;
1135 dfa->is_utf8 = 0;
1136 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1138 #endif
1140 static bin_tree_t *
1141 optimize_subexps (so, node, sidx, depth)
1142 struct subexp_optimize *so;
1143 bin_tree_t *node;
1144 int sidx, depth;
1146 int idx, new_depth, new_sidx;
1147 bin_tree_t *ret;
1148 if (node == NULL)
1149 return NULL;
1151 new_depth = 0;
1152 new_sidx = sidx;
1153 if ((depth & 1) && node->type == CONCAT
1154 && node->right && node->right->type == 0
1155 && so->nodes[idx = node->right->node_idx].type == OP_CLOSE_SUBEXP)
1157 new_depth = depth + 1;
1158 if (new_depth == 2
1159 || (so->nodes[idx].opr.idx < 8 * sizeof (so->dfa->used_bkref_map)
1160 && so->dfa->used_bkref_map & (1 << so->nodes[idx].opr.idx)))
1161 new_sidx = so->nodes[idx].opr.idx;
1163 node->left = optimize_subexps (so, node->left, new_sidx, new_depth);
1164 new_depth = (depth & 1) == 0 && node->type == CONCAT
1165 && node->left && node->left->type == 0
1166 && so->nodes[node->left->node_idx].type == OP_OPEN_SUBEXP
1167 ? depth + 1 : 0;
1168 node->right = optimize_subexps (so, node->right, sidx, new_depth);
1170 if (node->type != CONCAT)
1171 return node;
1172 if ((depth & 1) == 0
1173 && node->left
1174 && node->left->type == 0
1175 && so->nodes[idx = node->left->node_idx].type == OP_OPEN_SUBEXP)
1176 ret = node->right;
1177 else if ((depth & 1)
1178 && node->right
1179 && node->right->type == 0
1180 && so->nodes[idx = node->right->node_idx].type == OP_CLOSE_SUBEXP)
1181 ret = node->left;
1182 else
1183 return node;
1185 if (so->nodes[idx].opr.idx < 8 * sizeof (so->dfa->used_bkref_map)
1186 && so->dfa->used_bkref_map & (1 << so->nodes[idx].opr.idx))
1187 return node;
1189 if (!so->no_sub)
1191 int i;
1193 if (depth < 2)
1194 return node;
1196 if (so->dfa->subexp_map == NULL)
1198 so->dfa->subexp_map = re_malloc (int, so->re_nsub);
1199 if (so->dfa->subexp_map == NULL)
1200 return node;
1202 for (i = 0; i < so->re_nsub; i++)
1203 so->dfa->subexp_map[i] = i;
1206 i = so->nodes[idx].opr.idx;
1207 assert (sidx < i);
1208 so->dfa->subexp_map[i] = sidx;
1211 so->nodes[idx].type = OP_DELETED_SUBEXP;
1212 ret->parent = node->parent;
1213 return ret;
1216 /* Analyze the structure tree, and calculate "first", "next", "edest",
1217 "eclosure", and "inveclosure". */
1219 static reg_errcode_t
1220 analyze (dfa)
1221 re_dfa_t *dfa;
1223 int i;
1224 reg_errcode_t ret;
1226 /* Allocate arrays. */
1227 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1228 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1229 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1230 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1231 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1232 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1233 || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
1234 return REG_ESPACE;
1235 /* Initialize them. */
1236 for (i = 0; i < dfa->nodes_len; ++i)
1238 dfa->nexts[i] = -1;
1239 re_node_set_init_empty (dfa->edests + i);
1240 re_node_set_init_empty (dfa->eclosures + i);
1241 re_node_set_init_empty (dfa->inveclosures + i);
1244 ret = analyze_tree (dfa, dfa->str_tree);
1245 if (BE (ret == REG_NOERROR, 1))
1247 ret = calc_eclosure (dfa);
1248 if (ret == REG_NOERROR)
1249 calc_inveclosure (dfa);
1251 return ret;
1254 /* Helper functions for analyze.
1255 This function calculate "first", "next", and "edest" for the subtree
1256 whose root is NODE. */
1258 static reg_errcode_t
1259 analyze_tree (dfa, node)
1260 re_dfa_t *dfa;
1261 bin_tree_t *node;
1263 reg_errcode_t ret;
1264 if (node->first == -1)
1265 calc_first (dfa, node);
1266 if (node->next == -1)
1267 calc_next (dfa, node);
1268 calc_epsdest (dfa, node);
1270 /* Calculate "first" etc. for the left child. */
1271 if (node->left != NULL)
1273 ret = analyze_tree (dfa, node->left);
1274 if (BE (ret != REG_NOERROR, 0))
1275 return ret;
1277 /* Calculate "first" etc. for the right child. */
1278 if (node->right != NULL)
1280 ret = analyze_tree (dfa, node->right);
1281 if (BE (ret != REG_NOERROR, 0))
1282 return ret;
1284 return REG_NOERROR;
1287 /* Calculate "first" for the node NODE. */
1288 static void
1289 calc_first (dfa, node)
1290 re_dfa_t *dfa;
1291 bin_tree_t *node;
1293 int idx, type;
1294 idx = node->node_idx;
1295 type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1297 switch (type)
1299 #ifdef DEBUG
1300 case OP_OPEN_BRACKET:
1301 case OP_CLOSE_BRACKET:
1302 case OP_OPEN_DUP_NUM:
1303 case OP_CLOSE_DUP_NUM:
1304 case OP_DUP_PLUS:
1305 case OP_NON_MATCH_LIST:
1306 case OP_OPEN_COLL_ELEM:
1307 case OP_CLOSE_COLL_ELEM:
1308 case OP_OPEN_EQUIV_CLASS:
1309 case OP_CLOSE_EQUIV_CLASS:
1310 case OP_OPEN_CHAR_CLASS:
1311 case OP_CLOSE_CHAR_CLASS:
1312 /* These must not appear here. */
1313 assert (0);
1314 #endif
1315 case END_OF_RE:
1316 case CHARACTER:
1317 case OP_PERIOD:
1318 case OP_DUP_ASTERISK:
1319 case OP_DUP_QUESTION:
1320 #ifdef RE_ENABLE_I18N
1321 case OP_UTF8_PERIOD:
1322 case COMPLEX_BRACKET:
1323 #endif /* RE_ENABLE_I18N */
1324 case SIMPLE_BRACKET:
1325 case OP_BACK_REF:
1326 case ANCHOR:
1327 case OP_OPEN_SUBEXP:
1328 case OP_CLOSE_SUBEXP:
1329 node->first = idx;
1330 break;
1331 case OP_ALT:
1332 node->first = idx;
1333 break;
1334 /* else fall through */
1335 default:
1336 #ifdef DEBUG
1337 assert (node->left != NULL);
1338 #endif
1339 if (node->left->first == -1)
1340 calc_first (dfa, node->left);
1341 node->first = node->left->first;
1342 break;
1346 /* Calculate "next" for the node NODE. */
1348 static void
1349 calc_next (dfa, node)
1350 re_dfa_t *dfa;
1351 bin_tree_t *node;
1353 int idx, type;
1354 bin_tree_t *parent = node->parent;
1355 if (parent == NULL)
1357 node->next = -1;
1358 idx = node->node_idx;
1359 if (node->type == 0)
1360 dfa->nexts[idx] = node->next;
1361 return;
1364 idx = parent->node_idx;
1365 type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1367 switch (type)
1369 case OP_DUP_ASTERISK:
1370 node->next = idx;
1371 break;
1372 case CONCAT:
1373 if (parent->left == node)
1375 if (parent->right->first == -1)
1376 calc_first (dfa, parent->right);
1377 node->next = parent->right->first;
1378 break;
1380 /* else fall through */
1381 default:
1382 if (parent->next == -1)
1383 calc_next (dfa, parent);
1384 node->next = parent->next;
1385 break;
1387 idx = node->node_idx;
1388 if (node->type == 0)
1389 dfa->nexts[idx] = node->next;
1392 /* Calculate "edest" for the node NODE. */
1394 static void
1395 calc_epsdest (dfa, node)
1396 re_dfa_t *dfa;
1397 bin_tree_t *node;
1399 int idx;
1400 idx = node->node_idx;
1401 if (node->type == 0)
1403 if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1404 || dfa->nodes[idx].type == OP_DUP_QUESTION)
1406 if (node->left->first == -1)
1407 calc_first (dfa, node->left);
1408 if (node->next == -1)
1409 calc_next (dfa, node);
1410 re_node_set_init_2 (dfa->edests + idx, node->left->first,
1411 node->next);
1413 else if (dfa->nodes[idx].type == OP_ALT)
1415 int left, right;
1416 if (node->left != NULL)
1418 if (node->left->first == -1)
1419 calc_first (dfa, node->left);
1420 left = node->left->first;
1422 else
1424 if (node->next == -1)
1425 calc_next (dfa, node);
1426 left = node->next;
1428 if (node->right != NULL)
1430 if (node->right->first == -1)
1431 calc_first (dfa, node->right);
1432 right = node->right->first;
1434 else
1436 if (node->next == -1)
1437 calc_next (dfa, node);
1438 right = node->next;
1440 re_node_set_init_2 (dfa->edests + idx, left, right);
1442 else if (dfa->nodes[idx].type == ANCHOR
1443 || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1444 || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1445 || dfa->nodes[idx].type == OP_BACK_REF)
1446 re_node_set_init_1 (dfa->edests + idx, node->next);
1447 else
1448 assert (!IS_EPSILON_NODE (dfa->nodes[idx].type));
1452 /* Duplicate the epsilon closure of the node ROOT_NODE.
1453 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1454 to their own constraint. */
1456 static reg_errcode_t
1457 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1458 init_constraint)
1459 re_dfa_t *dfa;
1460 int top_org_node, top_clone_node, root_node;
1461 unsigned int init_constraint;
1463 reg_errcode_t err;
1464 int org_node, clone_node, ret;
1465 unsigned int constraint = init_constraint;
1466 for (org_node = top_org_node, clone_node = top_clone_node;;)
1468 int org_dest, clone_dest;
1469 if (dfa->nodes[org_node].type == OP_BACK_REF)
1471 /* If the back reference epsilon-transit, its destination must
1472 also have the constraint. Then duplicate the epsilon closure
1473 of the destination of the back reference, and store it in
1474 edests of the back reference. */
1475 org_dest = dfa->nexts[org_node];
1476 re_node_set_empty (dfa->edests + clone_node);
1477 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1478 if (BE (err != REG_NOERROR, 0))
1479 return err;
1480 dfa->nexts[clone_node] = dfa->nexts[org_node];
1481 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1482 if (BE (ret < 0, 0))
1483 return REG_ESPACE;
1485 else if (dfa->edests[org_node].nelem == 0)
1487 /* In case of the node can't epsilon-transit, don't duplicate the
1488 destination and store the original destination as the
1489 destination of the node. */
1490 dfa->nexts[clone_node] = dfa->nexts[org_node];
1491 break;
1493 else if (dfa->edests[org_node].nelem == 1)
1495 /* In case of the node can epsilon-transit, and it has only one
1496 destination. */
1497 org_dest = dfa->edests[org_node].elems[0];
1498 re_node_set_empty (dfa->edests + clone_node);
1499 if (dfa->nodes[org_node].type == ANCHOR)
1501 /* In case of the node has another constraint, append it. */
1502 if (org_node == root_node && clone_node != org_node)
1504 /* ...but if the node is root_node itself, it means the
1505 epsilon closure have a loop, then tie it to the
1506 destination of the root_node. */
1507 ret = re_node_set_insert (dfa->edests + clone_node,
1508 org_dest);
1509 if (BE (ret < 0, 0))
1510 return REG_ESPACE;
1511 break;
1513 constraint |= dfa->nodes[org_node].opr.ctx_type;
1515 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1516 if (BE (err != REG_NOERROR, 0))
1517 return err;
1518 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1519 if (BE (ret < 0, 0))
1520 return REG_ESPACE;
1522 else /* dfa->edests[org_node].nelem == 2 */
1524 /* In case of the node can epsilon-transit, and it has two
1525 destinations. E.g. '|', '*', '+', '?'. */
1526 org_dest = dfa->edests[org_node].elems[0];
1527 re_node_set_empty (dfa->edests + clone_node);
1528 /* Search for a duplicated node which satisfies the constraint. */
1529 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1530 if (clone_dest == -1)
1532 /* There are no such a duplicated node, create a new one. */
1533 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1534 if (BE (err != REG_NOERROR, 0))
1535 return err;
1536 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1537 if (BE (ret < 0, 0))
1538 return REG_ESPACE;
1539 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1540 root_node, constraint);
1541 if (BE (err != REG_NOERROR, 0))
1542 return err;
1544 else
1546 /* There are a duplicated node which satisfy the constraint,
1547 use it to avoid infinite loop. */
1548 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1549 if (BE (ret < 0, 0))
1550 return REG_ESPACE;
1553 org_dest = dfa->edests[org_node].elems[1];
1554 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1555 if (BE (err != REG_NOERROR, 0))
1556 return err;
1557 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1558 if (BE (ret < 0, 0))
1559 return REG_ESPACE;
1561 org_node = org_dest;
1562 clone_node = clone_dest;
1564 return REG_NOERROR;
1567 /* Search for a node which is duplicated from the node ORG_NODE, and
1568 satisfies the constraint CONSTRAINT. */
1570 static int
1571 search_duplicated_node (dfa, org_node, constraint)
1572 re_dfa_t *dfa;
1573 int org_node;
1574 unsigned int constraint;
1576 int idx;
1577 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1579 if (org_node == dfa->org_indices[idx]
1580 && constraint == dfa->nodes[idx].constraint)
1581 return idx; /* Found. */
1583 return -1; /* Not found. */
1586 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1587 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1588 otherwise return the error code. */
1590 static reg_errcode_t
1591 duplicate_node (new_idx, dfa, org_idx, constraint)
1592 re_dfa_t *dfa;
1593 int *new_idx, org_idx;
1594 unsigned int constraint;
1596 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx], 1);
1597 if (BE (dup_idx == -1, 0))
1598 return REG_ESPACE;
1599 dfa->nodes[dup_idx].constraint = constraint;
1600 if (dfa->nodes[org_idx].type == ANCHOR)
1601 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1602 dfa->nodes[dup_idx].duplicated = 1;
1603 re_node_set_init_empty (dfa->edests + dup_idx);
1604 re_node_set_init_empty (dfa->eclosures + dup_idx);
1605 re_node_set_init_empty (dfa->inveclosures + dup_idx);
1607 /* Store the index of the original node. */
1608 dfa->org_indices[dup_idx] = org_idx;
1609 *new_idx = dup_idx;
1610 return REG_NOERROR;
1613 static void
1614 calc_inveclosure (dfa)
1615 re_dfa_t *dfa;
1617 int src, idx, dest;
1618 for (src = 0; src < dfa->nodes_len; ++src)
1620 if (dfa->nodes[src].type == OP_DELETED_SUBEXP)
1621 continue;
1622 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1624 dest = dfa->eclosures[src].elems[idx];
1625 re_node_set_insert_last (dfa->inveclosures + dest, src);
1630 /* Calculate "eclosure" for all the node in DFA. */
1632 static reg_errcode_t
1633 calc_eclosure (dfa)
1634 re_dfa_t *dfa;
1636 int node_idx, incomplete;
1637 #ifdef DEBUG
1638 assert (dfa->nodes_len > 0);
1639 #endif
1640 incomplete = 0;
1641 /* For each nodes, calculate epsilon closure. */
1642 for (node_idx = 0; ; ++node_idx)
1644 reg_errcode_t err;
1645 re_node_set eclosure_elem;
1646 if (node_idx == dfa->nodes_len)
1648 if (!incomplete)
1649 break;
1650 incomplete = 0;
1651 node_idx = 0;
1654 #ifdef DEBUG
1655 assert (dfa->eclosures[node_idx].nelem != -1);
1656 #endif
1657 if (dfa->nodes[node_idx].type == OP_DELETED_SUBEXP)
1658 continue;
1660 /* If we have already calculated, skip it. */
1661 if (dfa->eclosures[node_idx].nelem != 0)
1662 continue;
1663 /* Calculate epsilon closure of `node_idx'. */
1664 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1665 if (BE (err != REG_NOERROR, 0))
1666 return err;
1668 if (dfa->eclosures[node_idx].nelem == 0)
1670 incomplete = 1;
1671 re_node_set_free (&eclosure_elem);
1674 return REG_NOERROR;
1677 /* Calculate epsilon closure of NODE. */
1679 static reg_errcode_t
1680 calc_eclosure_iter (new_set, dfa, node, root)
1681 re_node_set *new_set;
1682 re_dfa_t *dfa;
1683 int node, root;
1685 reg_errcode_t err;
1686 unsigned int constraint;
1687 int i, incomplete;
1688 re_node_set eclosure;
1689 incomplete = 0;
1690 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1691 if (BE (err != REG_NOERROR, 0))
1692 return err;
1694 /* This indicates that we are calculating this node now.
1695 We reference this value to avoid infinite loop. */
1696 dfa->eclosures[node].nelem = -1;
1698 constraint = ((dfa->nodes[node].type == ANCHOR)
1699 ? dfa->nodes[node].opr.ctx_type : 0);
1700 /* If the current node has constraints, duplicate all nodes.
1701 Since they must inherit the constraints. */
1702 if (constraint
1703 && dfa->edests[node].nelem
1704 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1706 int org_node, cur_node;
1707 org_node = cur_node = node;
1708 err = duplicate_node_closure (dfa, node, node, node, constraint);
1709 if (BE (err != REG_NOERROR, 0))
1710 return err;
1713 /* Expand each epsilon destination nodes. */
1714 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1715 for (i = 0; i < dfa->edests[node].nelem; ++i)
1717 re_node_set eclosure_elem;
1718 int edest = dfa->edests[node].elems[i];
1719 /* If calculating the epsilon closure of `edest' is in progress,
1720 return intermediate result. */
1721 if (dfa->eclosures[edest].nelem == -1)
1723 incomplete = 1;
1724 continue;
1726 /* If we haven't calculated the epsilon closure of `edest' yet,
1727 calculate now. Otherwise use calculated epsilon closure. */
1728 if (dfa->eclosures[edest].nelem == 0)
1730 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1731 if (BE (err != REG_NOERROR, 0))
1732 return err;
1734 else
1735 eclosure_elem = dfa->eclosures[edest];
1736 /* Merge the epsilon closure of `edest'. */
1737 re_node_set_merge (&eclosure, &eclosure_elem);
1738 /* If the epsilon closure of `edest' is incomplete,
1739 the epsilon closure of this node is also incomplete. */
1740 if (dfa->eclosures[edest].nelem == 0)
1742 incomplete = 1;
1743 re_node_set_free (&eclosure_elem);
1747 /* Epsilon closures include itself. */
1748 re_node_set_insert (&eclosure, node);
1749 if (incomplete && !root)
1750 dfa->eclosures[node].nelem = 0;
1751 else
1752 dfa->eclosures[node] = eclosure;
1753 *new_set = eclosure;
1754 return REG_NOERROR;
1757 /* Functions for token which are used in the parser. */
1759 /* Fetch a token from INPUT.
1760 We must not use this function inside bracket expressions. */
1762 static void
1763 fetch_token (result, input, syntax)
1764 re_token_t *result;
1765 re_string_t *input;
1766 reg_syntax_t syntax;
1768 re_string_skip_bytes (input, peek_token (result, input, syntax));
1771 /* Peek a token from INPUT, and return the length of the token.
1772 We must not use this function inside bracket expressions. */
1774 static int
1775 peek_token (token, input, syntax)
1776 re_token_t *token;
1777 re_string_t *input;
1778 reg_syntax_t syntax;
1780 unsigned char c;
1782 if (re_string_eoi (input))
1784 token->type = END_OF_RE;
1785 return 0;
1788 c = re_string_peek_byte (input, 0);
1789 token->opr.c = c;
1791 token->word_char = 0;
1792 #ifdef RE_ENABLE_I18N
1793 token->mb_partial = 0;
1794 if (input->mb_cur_max > 1 &&
1795 !re_string_first_byte (input, re_string_cur_idx (input)))
1797 token->type = CHARACTER;
1798 token->mb_partial = 1;
1799 return 1;
1801 #endif
1802 if (c == '\\')
1804 unsigned char c2;
1805 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1807 token->type = BACK_SLASH;
1808 return 1;
1811 c2 = re_string_peek_byte_case (input, 1);
1812 token->opr.c = c2;
1813 token->type = CHARACTER;
1814 #ifdef RE_ENABLE_I18N
1815 if (input->mb_cur_max > 1)
1817 wint_t wc = re_string_wchar_at (input,
1818 re_string_cur_idx (input) + 1);
1819 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1821 else
1822 #endif
1823 token->word_char = IS_WORD_CHAR (c2) != 0;
1825 switch (c2)
1827 case '|':
1828 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1829 token->type = OP_ALT;
1830 break;
1831 case '1': case '2': case '3': case '4': case '5':
1832 case '6': case '7': case '8': case '9':
1833 if (!(syntax & RE_NO_BK_REFS))
1835 token->type = OP_BACK_REF;
1836 token->opr.idx = c2 - '1';
1838 break;
1839 case '<':
1840 if (!(syntax & RE_NO_GNU_OPS))
1842 token->type = ANCHOR;
1843 token->opr.ctx_type = WORD_FIRST;
1845 break;
1846 case '>':
1847 if (!(syntax & RE_NO_GNU_OPS))
1849 token->type = ANCHOR;
1850 token->opr.ctx_type = WORD_LAST;
1852 break;
1853 case 'b':
1854 if (!(syntax & RE_NO_GNU_OPS))
1856 token->type = ANCHOR;
1857 token->opr.ctx_type = WORD_DELIM;
1859 break;
1860 case 'B':
1861 if (!(syntax & RE_NO_GNU_OPS))
1863 token->type = ANCHOR;
1864 token->opr.ctx_type = NOT_WORD_DELIM;
1866 break;
1867 case 'w':
1868 if (!(syntax & RE_NO_GNU_OPS))
1869 token->type = OP_WORD;
1870 break;
1871 case 'W':
1872 if (!(syntax & RE_NO_GNU_OPS))
1873 token->type = OP_NOTWORD;
1874 break;
1875 case 's':
1876 if (!(syntax & RE_NO_GNU_OPS))
1877 token->type = OP_SPACE;
1878 break;
1879 case 'S':
1880 if (!(syntax & RE_NO_GNU_OPS))
1881 token->type = OP_NOTSPACE;
1882 break;
1883 case '`':
1884 if (!(syntax & RE_NO_GNU_OPS))
1886 token->type = ANCHOR;
1887 token->opr.ctx_type = BUF_FIRST;
1889 break;
1890 case '\'':
1891 if (!(syntax & RE_NO_GNU_OPS))
1893 token->type = ANCHOR;
1894 token->opr.ctx_type = BUF_LAST;
1896 break;
1897 case '(':
1898 if (!(syntax & RE_NO_BK_PARENS))
1899 token->type = OP_OPEN_SUBEXP;
1900 break;
1901 case ')':
1902 if (!(syntax & RE_NO_BK_PARENS))
1903 token->type = OP_CLOSE_SUBEXP;
1904 break;
1905 case '+':
1906 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1907 token->type = OP_DUP_PLUS;
1908 break;
1909 case '?':
1910 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1911 token->type = OP_DUP_QUESTION;
1912 break;
1913 case '{':
1914 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1915 token->type = OP_OPEN_DUP_NUM;
1916 break;
1917 case '}':
1918 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1919 token->type = OP_CLOSE_DUP_NUM;
1920 break;
1921 default:
1922 break;
1924 return 2;
1927 token->type = CHARACTER;
1928 #ifdef RE_ENABLE_I18N
1929 if (input->mb_cur_max > 1)
1931 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1932 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1934 else
1935 #endif
1936 token->word_char = IS_WORD_CHAR (token->opr.c);
1938 switch (c)
1940 case '\n':
1941 if (syntax & RE_NEWLINE_ALT)
1942 token->type = OP_ALT;
1943 break;
1944 case '|':
1945 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1946 token->type = OP_ALT;
1947 break;
1948 case '*':
1949 token->type = OP_DUP_ASTERISK;
1950 break;
1951 case '+':
1952 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1953 token->type = OP_DUP_PLUS;
1954 break;
1955 case '?':
1956 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1957 token->type = OP_DUP_QUESTION;
1958 break;
1959 case '{':
1960 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1961 token->type = OP_OPEN_DUP_NUM;
1962 break;
1963 case '}':
1964 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1965 token->type = OP_CLOSE_DUP_NUM;
1966 break;
1967 case '(':
1968 if (syntax & RE_NO_BK_PARENS)
1969 token->type = OP_OPEN_SUBEXP;
1970 break;
1971 case ')':
1972 if (syntax & RE_NO_BK_PARENS)
1973 token->type = OP_CLOSE_SUBEXP;
1974 break;
1975 case '[':
1976 token->type = OP_OPEN_BRACKET;
1977 break;
1978 case '.':
1979 token->type = OP_PERIOD;
1980 break;
1981 case '^':
1982 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1983 re_string_cur_idx (input) != 0)
1985 char prev = re_string_peek_byte (input, -1);
1986 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1987 break;
1989 token->type = ANCHOR;
1990 token->opr.ctx_type = LINE_FIRST;
1991 break;
1992 case '$':
1993 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1994 re_string_cur_idx (input) + 1 != re_string_length (input))
1996 re_token_t next;
1997 re_string_skip_bytes (input, 1);
1998 peek_token (&next, input, syntax);
1999 re_string_skip_bytes (input, -1);
2000 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2001 break;
2003 token->type = ANCHOR;
2004 token->opr.ctx_type = LINE_LAST;
2005 break;
2006 default:
2007 break;
2009 return 1;
2012 /* Peek a token from INPUT, and return the length of the token.
2013 We must not use this function out of bracket expressions. */
2015 static int
2016 peek_token_bracket (token, input, syntax)
2017 re_token_t *token;
2018 re_string_t *input;
2019 reg_syntax_t syntax;
2021 unsigned char c;
2022 if (re_string_eoi (input))
2024 token->type = END_OF_RE;
2025 return 0;
2027 c = re_string_peek_byte (input, 0);
2028 token->opr.c = c;
2030 #ifdef RE_ENABLE_I18N
2031 if (input->mb_cur_max > 1 &&
2032 !re_string_first_byte (input, re_string_cur_idx (input)))
2034 token->type = CHARACTER;
2035 return 1;
2037 #endif /* RE_ENABLE_I18N */
2039 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2040 && re_string_cur_idx (input) + 1 < re_string_length (input))
2042 /* In this case, '\' escape a character. */
2043 unsigned char c2;
2044 re_string_skip_bytes (input, 1);
2045 c2 = re_string_peek_byte (input, 0);
2046 token->opr.c = c2;
2047 token->type = CHARACTER;
2048 return 1;
2050 if (c == '[') /* '[' is a special char in a bracket exps. */
2052 unsigned char c2;
2053 int token_len;
2054 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2055 c2 = re_string_peek_byte (input, 1);
2056 else
2057 c2 = 0;
2058 token->opr.c = c2;
2059 token_len = 2;
2060 switch (c2)
2062 case '.':
2063 token->type = OP_OPEN_COLL_ELEM;
2064 break;
2065 case '=':
2066 token->type = OP_OPEN_EQUIV_CLASS;
2067 break;
2068 case ':':
2069 if (syntax & RE_CHAR_CLASSES)
2071 token->type = OP_OPEN_CHAR_CLASS;
2072 break;
2074 /* else fall through. */
2075 default:
2076 token->type = CHARACTER;
2077 token->opr.c = c;
2078 token_len = 1;
2079 break;
2081 return token_len;
2083 switch (c)
2085 case '-':
2086 token->type = OP_CHARSET_RANGE;
2087 break;
2088 case ']':
2089 token->type = OP_CLOSE_BRACKET;
2090 break;
2091 case '^':
2092 token->type = OP_NON_MATCH_LIST;
2093 break;
2094 default:
2095 token->type = CHARACTER;
2097 return 1;
2100 /* Functions for parser. */
2102 /* Entry point of the parser.
2103 Parse the regular expression REGEXP and return the structure tree.
2104 If an error is occured, ERR is set by error code, and return NULL.
2105 This function build the following tree, from regular expression <reg_exp>:
2109 <reg_exp> EOR
2111 CAT means concatenation.
2112 EOR means end of regular expression. */
2114 static bin_tree_t *
2115 parse (regexp, preg, syntax, err)
2116 re_string_t *regexp;
2117 regex_t *preg;
2118 reg_syntax_t syntax;
2119 reg_errcode_t *err;
2121 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2122 bin_tree_t *tree, *eor, *root;
2123 re_token_t current_token;
2124 dfa->syntax = syntax;
2125 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2126 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2127 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2128 return NULL;
2129 eor = re_dfa_add_tree_node (dfa, NULL, NULL, &current_token);
2130 if (tree != NULL)
2131 root = create_tree (dfa, tree, eor, CONCAT, 0);
2132 else
2133 root = eor;
2134 if (BE (eor == NULL || root == NULL, 0))
2136 *err = REG_ESPACE;
2137 return NULL;
2139 return root;
2142 /* This function build the following tree, from regular expression
2143 <branch1>|<branch2>:
2147 <branch1> <branch2>
2149 ALT means alternative, which represents the operator `|'. */
2151 static bin_tree_t *
2152 parse_reg_exp (regexp, preg, token, syntax, nest, err)
2153 re_string_t *regexp;
2154 regex_t *preg;
2155 re_token_t *token;
2156 reg_syntax_t syntax;
2157 int nest;
2158 reg_errcode_t *err;
2160 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2161 bin_tree_t *tree, *branch = NULL;
2162 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2163 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2164 return NULL;
2166 while (token->type == OP_ALT)
2168 re_token_t alt_token = *token;
2169 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2170 if (token->type != OP_ALT && token->type != END_OF_RE
2171 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2173 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2174 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2175 return NULL;
2177 else
2178 branch = NULL;
2179 tree = re_dfa_add_tree_node (dfa, tree, branch, &alt_token);
2180 if (BE (tree == NULL, 0))
2182 *err = REG_ESPACE;
2183 return NULL;
2185 dfa->has_plural_match = 1;
2187 return tree;
2190 /* This function build the following tree, from regular expression
2191 <exp1><exp2>:
2195 <exp1> <exp2>
2197 CAT means concatenation. */
2199 static bin_tree_t *
2200 parse_branch (regexp, preg, token, syntax, nest, err)
2201 re_string_t *regexp;
2202 regex_t *preg;
2203 re_token_t *token;
2204 reg_syntax_t syntax;
2205 int nest;
2206 reg_errcode_t *err;
2208 bin_tree_t *tree, *exp;
2209 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2210 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2211 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2212 return NULL;
2214 while (token->type != OP_ALT && token->type != END_OF_RE
2215 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2217 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2218 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2220 return NULL;
2222 if (tree != NULL && exp != NULL)
2224 tree = create_tree (dfa, tree, exp, CONCAT, 0);
2225 if (tree == NULL)
2227 *err = REG_ESPACE;
2228 return NULL;
2231 else if (tree == NULL)
2232 tree = exp;
2233 /* Otherwise exp == NULL, we don't need to create new tree. */
2235 return tree;
2238 /* This function build the following tree, from regular expression a*:
2244 static bin_tree_t *
2245 parse_expression (regexp, preg, token, syntax, nest, err)
2246 re_string_t *regexp;
2247 regex_t *preg;
2248 re_token_t *token;
2249 reg_syntax_t syntax;
2250 int nest;
2251 reg_errcode_t *err;
2253 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2254 bin_tree_t *tree;
2255 switch (token->type)
2257 case CHARACTER:
2258 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2259 if (BE (tree == NULL, 0))
2261 *err = REG_ESPACE;
2262 return NULL;
2264 #ifdef RE_ENABLE_I18N
2265 if (dfa->mb_cur_max > 1)
2267 while (!re_string_eoi (regexp)
2268 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2270 bin_tree_t *mbc_remain;
2271 fetch_token (token, regexp, syntax);
2272 mbc_remain = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2273 tree = create_tree (dfa, tree, mbc_remain, CONCAT, 0);
2274 if (BE (mbc_remain == NULL || tree == NULL, 0))
2276 *err = REG_ESPACE;
2277 return NULL;
2281 #endif
2282 break;
2283 case OP_OPEN_SUBEXP:
2284 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2285 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2286 return NULL;
2287 break;
2288 case OP_OPEN_BRACKET:
2289 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2290 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2291 return NULL;
2292 break;
2293 case OP_BACK_REF:
2294 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2296 *err = REG_ESUBREG;
2297 return NULL;
2299 dfa->used_bkref_map |= 1 << token->opr.idx;
2300 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2301 if (BE (tree == NULL, 0))
2303 *err = REG_ESPACE;
2304 return NULL;
2306 ++dfa->nbackref;
2307 dfa->has_mb_node = 1;
2308 break;
2309 case OP_OPEN_DUP_NUM:
2310 if (syntax & RE_CONTEXT_INVALID_DUP)
2312 *err = REG_BADRPT;
2313 return NULL;
2315 /* FALLTHROUGH */
2316 case OP_DUP_ASTERISK:
2317 case OP_DUP_PLUS:
2318 case OP_DUP_QUESTION:
2319 if (syntax & RE_CONTEXT_INVALID_OPS)
2321 *err = REG_BADRPT;
2322 return NULL;
2324 else if (syntax & RE_CONTEXT_INDEP_OPS)
2326 fetch_token (token, regexp, syntax);
2327 return parse_expression (regexp, preg, token, syntax, nest, err);
2329 /* else fall through */
2330 case OP_CLOSE_SUBEXP:
2331 if ((token->type == OP_CLOSE_SUBEXP) &&
2332 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2334 *err = REG_ERPAREN;
2335 return NULL;
2337 /* else fall through */
2338 case OP_CLOSE_DUP_NUM:
2339 /* We treat it as a normal character. */
2341 /* Then we can these characters as normal characters. */
2342 token->type = CHARACTER;
2343 /* mb_partial and word_char bits should be initialized already
2344 by peek_token. */
2345 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2346 if (BE (tree == NULL, 0))
2348 *err = REG_ESPACE;
2349 return NULL;
2351 break;
2352 case ANCHOR:
2353 if ((token->opr.ctx_type
2354 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2355 && dfa->word_ops_used == 0)
2356 init_word_char (dfa);
2357 if (token->opr.ctx_type == WORD_DELIM
2358 || token->opr.ctx_type == NOT_WORD_DELIM)
2360 bin_tree_t *tree_first, *tree_last;
2361 if (token->opr.ctx_type == WORD_DELIM)
2363 token->opr.ctx_type = WORD_FIRST;
2364 tree_first = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2365 token->opr.ctx_type = WORD_LAST;
2367 else
2369 token->opr.ctx_type = INSIDE_WORD;
2370 tree_first = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2371 token->opr.ctx_type = INSIDE_NOTWORD;
2373 tree_last = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2374 token->type = OP_ALT;
2375 tree = re_dfa_add_tree_node (dfa, tree_first, tree_last, token);
2376 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2378 *err = REG_ESPACE;
2379 return NULL;
2382 else
2384 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2385 if (BE (tree == NULL, 0))
2387 *err = REG_ESPACE;
2388 return NULL;
2391 /* We must return here, since ANCHORs can't be followed
2392 by repetition operators.
2393 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2394 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2395 fetch_token (token, regexp, syntax);
2396 return tree;
2397 case OP_PERIOD:
2398 tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2399 if (BE (tree == NULL, 0))
2401 *err = REG_ESPACE;
2402 return NULL;
2404 if (dfa->mb_cur_max > 1)
2405 dfa->has_mb_node = 1;
2406 break;
2407 case OP_WORD:
2408 case OP_NOTWORD:
2409 tree = build_charclass_op (dfa, regexp->trans,
2410 (const unsigned char *) "alnum",
2411 (const unsigned char *) "_",
2412 token->type == OP_NOTWORD, err);
2413 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2414 return NULL;
2415 break;
2416 case OP_SPACE:
2417 case OP_NOTSPACE:
2418 tree = build_charclass_op (dfa, regexp->trans,
2419 (const unsigned char *) "space",
2420 (const unsigned char *) "",
2421 token->type == OP_NOTSPACE, err);
2422 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2423 return NULL;
2424 break;
2425 case OP_ALT:
2426 case END_OF_RE:
2427 return NULL;
2428 case BACK_SLASH:
2429 *err = REG_EESCAPE;
2430 return NULL;
2431 default:
2432 /* Must not happen? */
2433 #ifdef DEBUG
2434 assert (0);
2435 #endif
2436 return NULL;
2438 fetch_token (token, regexp, syntax);
2440 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2441 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2443 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2444 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2445 return NULL;
2446 /* In BRE consecutive duplications are not allowed. */
2447 if ((syntax & RE_CONTEXT_INVALID_DUP)
2448 && (token->type == OP_DUP_ASTERISK
2449 || token->type == OP_OPEN_DUP_NUM))
2451 *err = REG_BADRPT;
2452 return NULL;
2454 dfa->has_plural_match = 1;
2457 return tree;
2460 /* This function build the following tree, from regular expression
2461 (<reg_exp>):
2462 SUBEXP
2464 <reg_exp>
2467 static bin_tree_t *
2468 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2469 re_string_t *regexp;
2470 regex_t *preg;
2471 re_token_t *token;
2472 reg_syntax_t syntax;
2473 int nest;
2474 reg_errcode_t *err;
2476 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2477 bin_tree_t *tree, *left_par, *right_par;
2478 size_t cur_nsub;
2479 cur_nsub = preg->re_nsub++;
2481 left_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2482 if (BE (left_par == NULL, 0))
2484 *err = REG_ESPACE;
2485 return NULL;
2487 dfa->nodes[left_par->node_idx].opr.idx = cur_nsub;
2488 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2490 /* The subexpression may be a null string. */
2491 if (token->type == OP_CLOSE_SUBEXP)
2492 tree = NULL;
2493 else
2495 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2496 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2497 return NULL;
2499 if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2501 *err = REG_EPAREN;
2502 return NULL;
2504 right_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2505 dfa->completed_bkref_map |= 1 << cur_nsub;
2506 tree = ((tree == NULL) ? right_par
2507 : create_tree (dfa, tree, right_par, CONCAT, 0));
2508 tree = create_tree (dfa, left_par, tree, CONCAT, 0);
2509 if (BE (right_par == NULL || tree == NULL, 0))
2511 *err = REG_ESPACE;
2512 return NULL;
2514 dfa->nodes[right_par->node_idx].opr.idx = cur_nsub;
2516 return tree;
2519 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2521 static bin_tree_t *
2522 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2523 bin_tree_t *elem;
2524 re_string_t *regexp;
2525 re_dfa_t *dfa;
2526 re_token_t *token;
2527 reg_syntax_t syntax;
2528 reg_errcode_t *err;
2530 re_token_t dup_token;
2531 bin_tree_t *tree = NULL, *old_tree = NULL;
2532 int i, start, end, start_idx = re_string_cur_idx (regexp);
2533 re_token_t start_token = *token;
2535 if (token->type == OP_OPEN_DUP_NUM)
2537 end = 0;
2538 start = fetch_number (regexp, token, syntax);
2539 if (start == -1)
2541 if (token->type == CHARACTER && token->opr.c == ',')
2542 start = 0; /* We treat "{,m}" as "{0,m}". */
2543 else
2545 *err = REG_BADBR; /* <re>{} is invalid. */
2546 return NULL;
2549 if (BE (start != -2, 1))
2551 /* We treat "{n}" as "{n,n}". */
2552 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2553 : ((token->type == CHARACTER && token->opr.c == ',')
2554 ? fetch_number (regexp, token, syntax) : -2));
2556 if (BE (start == -2 || end == -2, 0))
2558 /* Invalid sequence. */
2559 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2561 if (token->type == END_OF_RE)
2562 *err = REG_EBRACE;
2563 else
2564 *err = REG_BADBR;
2566 return NULL;
2569 /* If the syntax bit is set, rollback. */
2570 re_string_set_index (regexp, start_idx);
2571 *token = start_token;
2572 token->type = CHARACTER;
2573 /* mb_partial and word_char bits should be already initialized by
2574 peek_token. */
2575 return elem;
2578 if (BE (end != -1 && start > end, 0))
2580 /* First number greater than second. */
2581 *err = REG_BADBR;
2582 return NULL;
2585 else
2587 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2588 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2591 fetch_token (token, regexp, syntax);
2593 /* Treat "<re>{0}*" etc. as "<re>{0}". */
2594 if (BE (elem == NULL || (start == 0 && end == 0), 0))
2595 return NULL;
2597 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2598 if (BE (start > 0, 0))
2600 tree = elem;
2601 for (i = 2; i <= start; ++i)
2603 elem = duplicate_tree (elem, dfa);
2604 tree = create_tree (dfa, tree, elem, CONCAT, 0);
2605 if (BE (elem == NULL || tree == NULL, 0))
2606 goto parse_dup_op_espace;
2609 if (start == end)
2610 return tree;
2612 /* Duplicate ELEM before it is marked optional. */
2613 elem = duplicate_tree (elem, dfa);
2614 old_tree = tree;
2616 else
2617 old_tree = NULL;
2619 mark_opt_subexp (elem, dfa);
2620 dup_token.type = (end == -1 ? OP_DUP_ASTERISK : OP_DUP_QUESTION);
2621 tree = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
2622 if (BE (tree == NULL, 0))
2623 goto parse_dup_op_espace;
2625 /* This loop is actually executed only when end != -1,
2626 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2627 already created the start+1-th copy. */
2628 for (i = start + 2; i <= end; ++i)
2630 elem = duplicate_tree (elem, dfa);
2631 tree = create_tree (dfa, tree, elem, CONCAT, 0);
2632 if (BE (elem == NULL || tree == NULL, 0))
2633 goto parse_dup_op_espace;
2635 tree = re_dfa_add_tree_node (dfa, tree, NULL, &dup_token);
2636 if (BE (tree == NULL, 0))
2637 goto parse_dup_op_espace;
2640 if (old_tree)
2641 tree = create_tree (dfa, old_tree, tree, CONCAT, 0);
2643 return tree;
2645 parse_dup_op_espace:
2646 *err = REG_ESPACE;
2647 return NULL;
2650 /* Size of the names for collating symbol/equivalence_class/character_class.
2651 I'm not sure, but maybe enough. */
2652 #define BRACKET_NAME_BUF_SIZE 32
2654 #ifndef _LIBC
2655 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2656 Build the range expression which starts from START_ELEM, and ends
2657 at END_ELEM. The result are written to MBCSET and SBCSET.
2658 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2659 mbcset->range_ends, is a pointer argument sinse we may
2660 update it. */
2662 static reg_errcode_t
2663 # ifdef RE_ENABLE_I18N
2664 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2665 re_charset_t *mbcset;
2666 int *range_alloc;
2667 # else /* not RE_ENABLE_I18N */
2668 build_range_exp (sbcset, start_elem, end_elem)
2669 # endif /* not RE_ENABLE_I18N */
2670 re_bitset_ptr_t sbcset;
2671 bracket_elem_t *start_elem, *end_elem;
2673 unsigned int start_ch, end_ch;
2674 /* Equivalence Classes and Character Classes can't be a range start/end. */
2675 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2676 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2678 return REG_ERANGE;
2680 /* We can handle no multi character collating elements without libc
2681 support. */
2682 if (BE ((start_elem->type == COLL_SYM
2683 && strlen ((char *) start_elem->opr.name) > 1)
2684 || (end_elem->type == COLL_SYM
2685 && strlen ((char *) end_elem->opr.name) > 1), 0))
2686 return REG_ECOLLATE;
2688 # ifdef RE_ENABLE_I18N
2690 wchar_t wc, start_wc, end_wc;
2691 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2693 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2694 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2695 : 0));
2696 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2697 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2698 : 0));
2699 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2700 ? __btowc (start_ch) : start_elem->opr.wch);
2701 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2702 ? __btowc (end_ch) : end_elem->opr.wch);
2703 if (start_wc == WEOF || end_wc == WEOF)
2704 return REG_ECOLLATE;
2705 cmp_buf[0] = start_wc;
2706 cmp_buf[4] = end_wc;
2707 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2708 return REG_ERANGE;
2710 /* Got valid collation sequence values, add them as a new entry.
2711 However, for !_LIBC we have no collation elements: if the
2712 character set is single byte, the single byte character set
2713 that we build below suffices. parse_bracket_exp passes
2714 no MBCSET if dfa->mb_cur_max == 1. */
2715 if (mbcset)
2717 /* Check the space of the arrays. */
2718 if (BE (*range_alloc == mbcset->nranges, 0))
2720 /* There is not enough space, need realloc. */
2721 wchar_t *new_array_start, *new_array_end;
2722 int new_nranges;
2724 /* +1 in case of mbcset->nranges is 0. */
2725 new_nranges = 2 * mbcset->nranges + 1;
2726 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2727 are NULL if *range_alloc == 0. */
2728 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2729 new_nranges);
2730 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2731 new_nranges);
2733 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2734 return REG_ESPACE;
2736 mbcset->range_starts = new_array_start;
2737 mbcset->range_ends = new_array_end;
2738 *range_alloc = new_nranges;
2741 mbcset->range_starts[mbcset->nranges] = start_wc;
2742 mbcset->range_ends[mbcset->nranges++] = end_wc;
2745 /* Build the table for single byte characters. */
2746 for (wc = 0; wc < SBC_MAX; ++wc)
2748 cmp_buf[2] = wc;
2749 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2750 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2751 bitset_set (sbcset, wc);
2754 # else /* not RE_ENABLE_I18N */
2756 unsigned int ch;
2757 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2758 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2759 : 0));
2760 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2761 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2762 : 0));
2763 if (start_ch > end_ch)
2764 return REG_ERANGE;
2765 /* Build the table for single byte characters. */
2766 for (ch = 0; ch < SBC_MAX; ++ch)
2767 if (start_ch <= ch && ch <= end_ch)
2768 bitset_set (sbcset, ch);
2770 # endif /* not RE_ENABLE_I18N */
2771 return REG_NOERROR;
2773 #endif /* not _LIBC */
2775 #ifndef _LIBC
2776 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2777 Build the collating element which is represented by NAME.
2778 The result are written to MBCSET and SBCSET.
2779 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2780 pointer argument since we may update it. */
2782 static reg_errcode_t
2783 # ifdef RE_ENABLE_I18N
2784 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2785 re_charset_t *mbcset;
2786 int *coll_sym_alloc;
2787 # else /* not RE_ENABLE_I18N */
2788 build_collating_symbol (sbcset, name)
2789 # endif /* not RE_ENABLE_I18N */
2790 re_bitset_ptr_t sbcset;
2791 const unsigned char *name;
2793 size_t name_len = strlen ((const char *) name);
2794 if (BE (name_len != 1, 0))
2795 return REG_ECOLLATE;
2796 else
2798 bitset_set (sbcset, name[0]);
2799 return REG_NOERROR;
2802 #endif /* not _LIBC */
2804 /* This function parse bracket expression like "[abc]", "[a-c]",
2805 "[[.a-a.]]" etc. */
2807 static bin_tree_t *
2808 parse_bracket_exp (regexp, dfa, token, syntax, err)
2809 re_string_t *regexp;
2810 re_dfa_t *dfa;
2811 re_token_t *token;
2812 reg_syntax_t syntax;
2813 reg_errcode_t *err;
2815 #ifdef _LIBC
2816 const unsigned char *collseqmb;
2817 const char *collseqwc;
2818 uint32_t nrules;
2819 int32_t table_size;
2820 const int32_t *symb_table;
2821 const unsigned char *extra;
2823 /* Local function for parse_bracket_exp used in _LIBC environement.
2824 Seek the collating symbol entry correspondings to NAME.
2825 Return the index of the symbol in the SYMB_TABLE. */
2827 auto inline int32_t
2828 __attribute ((always_inline))
2829 seek_collating_symbol_entry (name, name_len)
2830 const unsigned char *name;
2831 size_t name_len;
2833 int32_t hash = elem_hash ((const char *) name, name_len);
2834 int32_t elem = hash % table_size;
2835 int32_t second = hash % (table_size - 2);
2836 while (symb_table[2 * elem] != 0)
2838 /* First compare the hashing value. */
2839 if (symb_table[2 * elem] == hash
2840 /* Compare the length of the name. */
2841 && name_len == extra[symb_table[2 * elem + 1]]
2842 /* Compare the name. */
2843 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2844 name_len) == 0)
2846 /* Yep, this is the entry. */
2847 break;
2850 /* Next entry. */
2851 elem += second;
2853 return elem;
2856 /* Local function for parse_bracket_exp used in _LIBC environement.
2857 Look up the collation sequence value of BR_ELEM.
2858 Return the value if succeeded, UINT_MAX otherwise. */
2860 auto inline unsigned int
2861 __attribute ((always_inline))
2862 lookup_collation_sequence_value (br_elem)
2863 bracket_elem_t *br_elem;
2865 if (br_elem->type == SB_CHAR)
2868 if (MB_CUR_MAX == 1)
2870 if (nrules == 0)
2871 return collseqmb[br_elem->opr.ch];
2872 else
2874 wint_t wc = __btowc (br_elem->opr.ch);
2875 return __collseq_table_lookup (collseqwc, wc);
2878 else if (br_elem->type == MB_CHAR)
2880 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2882 else if (br_elem->type == COLL_SYM)
2884 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2885 if (nrules != 0)
2887 int32_t elem, idx;
2888 elem = seek_collating_symbol_entry (br_elem->opr.name,
2889 sym_name_len);
2890 if (symb_table[2 * elem] != 0)
2892 /* We found the entry. */
2893 idx = symb_table[2 * elem + 1];
2894 /* Skip the name of collating element name. */
2895 idx += 1 + extra[idx];
2896 /* Skip the byte sequence of the collating element. */
2897 idx += 1 + extra[idx];
2898 /* Adjust for the alignment. */
2899 idx = (idx + 3) & ~3;
2900 /* Skip the multibyte collation sequence value. */
2901 idx += sizeof (unsigned int);
2902 /* Skip the wide char sequence of the collating element. */
2903 idx += sizeof (unsigned int) *
2904 (1 + *(unsigned int *) (extra + idx));
2905 /* Return the collation sequence value. */
2906 return *(unsigned int *) (extra + idx);
2908 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2910 /* No valid character. Match it as a single byte
2911 character. */
2912 return collseqmb[br_elem->opr.name[0]];
2915 else if (sym_name_len == 1)
2916 return collseqmb[br_elem->opr.name[0]];
2918 return UINT_MAX;
2921 /* Local function for parse_bracket_exp used in _LIBC environement.
2922 Build the range expression which starts from START_ELEM, and ends
2923 at END_ELEM. The result are written to MBCSET and SBCSET.
2924 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2925 mbcset->range_ends, is a pointer argument sinse we may
2926 update it. */
2928 auto inline reg_errcode_t
2929 __attribute ((always_inline))
2930 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2931 re_charset_t *mbcset;
2932 int *range_alloc;
2933 re_bitset_ptr_t sbcset;
2934 bracket_elem_t *start_elem, *end_elem;
2936 unsigned int ch;
2937 uint32_t start_collseq;
2938 uint32_t end_collseq;
2940 /* Equivalence Classes and Character Classes can't be a range
2941 start/end. */
2942 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2943 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2945 return REG_ERANGE;
2947 start_collseq = lookup_collation_sequence_value (start_elem);
2948 end_collseq = lookup_collation_sequence_value (end_elem);
2949 /* Check start/end collation sequence values. */
2950 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2951 return REG_ECOLLATE;
2952 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2953 return REG_ERANGE;
2955 /* Got valid collation sequence values, add them as a new entry.
2956 However, if we have no collation elements, and the character set
2957 is single byte, the single byte character set that we
2958 build below suffices. */
2959 if (nrules > 0 || dfa->mb_cur_max > 1)
2961 /* Check the space of the arrays. */
2962 if (BE (*range_alloc == mbcset->nranges, 0))
2964 /* There is not enough space, need realloc. */
2965 uint32_t *new_array_start;
2966 uint32_t *new_array_end;
2967 int new_nranges;
2969 /* +1 in case of mbcset->nranges is 0. */
2970 new_nranges = 2 * mbcset->nranges + 1;
2971 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2972 new_nranges);
2973 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2974 new_nranges);
2976 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2977 return REG_ESPACE;
2979 mbcset->range_starts = new_array_start;
2980 mbcset->range_ends = new_array_end;
2981 *range_alloc = new_nranges;
2984 mbcset->range_starts[mbcset->nranges] = start_collseq;
2985 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2988 /* Build the table for single byte characters. */
2989 for (ch = 0; ch < SBC_MAX; ch++)
2991 uint32_t ch_collseq;
2993 if (MB_CUR_MAX == 1)
2995 if (nrules == 0)
2996 ch_collseq = collseqmb[ch];
2997 else
2998 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2999 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3000 bitset_set (sbcset, ch);
3002 return REG_NOERROR;
3005 /* Local function for parse_bracket_exp used in _LIBC environement.
3006 Build the collating element which is represented by NAME.
3007 The result are written to MBCSET and SBCSET.
3008 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3009 pointer argument sinse we may update it. */
3011 auto inline reg_errcode_t
3012 __attribute ((always_inline))
3013 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3014 re_charset_t *mbcset;
3015 int *coll_sym_alloc;
3016 re_bitset_ptr_t sbcset;
3017 const unsigned char *name;
3019 int32_t elem, idx;
3020 size_t name_len = strlen ((const char *) name);
3021 if (nrules != 0)
3023 elem = seek_collating_symbol_entry (name, name_len);
3024 if (symb_table[2 * elem] != 0)
3026 /* We found the entry. */
3027 idx = symb_table[2 * elem + 1];
3028 /* Skip the name of collating element name. */
3029 idx += 1 + extra[idx];
3031 else if (symb_table[2 * elem] == 0 && name_len == 1)
3033 /* No valid character, treat it as a normal
3034 character. */
3035 bitset_set (sbcset, name[0]);
3036 return REG_NOERROR;
3038 else
3039 return REG_ECOLLATE;
3041 /* Got valid collation sequence, add it as a new entry. */
3042 /* Check the space of the arrays. */
3043 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3045 /* Not enough, realloc it. */
3046 /* +1 in case of mbcset->ncoll_syms is 0. */
3047 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3048 /* Use realloc since mbcset->coll_syms is NULL
3049 if *alloc == 0. */
3050 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3051 new_coll_sym_alloc);
3052 if (BE (new_coll_syms == NULL, 0))
3053 return REG_ESPACE;
3054 mbcset->coll_syms = new_coll_syms;
3055 *coll_sym_alloc = new_coll_sym_alloc;
3057 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3058 return REG_NOERROR;
3060 else
3062 if (BE (name_len != 1, 0))
3063 return REG_ECOLLATE;
3064 else
3066 bitset_set (sbcset, name[0]);
3067 return REG_NOERROR;
3071 #endif
3073 re_token_t br_token;
3074 re_bitset_ptr_t sbcset;
3075 #ifdef RE_ENABLE_I18N
3076 re_charset_t *mbcset;
3077 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3078 int equiv_class_alloc = 0, char_class_alloc = 0;
3079 #endif /* not RE_ENABLE_I18N */
3080 int non_match = 0;
3081 bin_tree_t *work_tree;
3082 int token_len;
3083 int first_round = 1;
3084 #ifdef _LIBC
3085 collseqmb = (const unsigned char *)
3086 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3087 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3088 if (nrules)
3091 if (MB_CUR_MAX > 1)
3093 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3094 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3095 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3096 _NL_COLLATE_SYMB_TABLEMB);
3097 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3098 _NL_COLLATE_SYMB_EXTRAMB);
3100 #endif
3101 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3102 #ifdef RE_ENABLE_I18N
3103 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3104 #endif /* RE_ENABLE_I18N */
3105 #ifdef RE_ENABLE_I18N
3106 if (BE (sbcset == NULL || mbcset == NULL, 0))
3107 #else
3108 if (BE (sbcset == NULL, 0))
3109 #endif /* RE_ENABLE_I18N */
3111 *err = REG_ESPACE;
3112 return NULL;
3115 token_len = peek_token_bracket (token, regexp, syntax);
3116 if (BE (token->type == END_OF_RE, 0))
3118 *err = REG_BADPAT;
3119 goto parse_bracket_exp_free_return;
3121 if (token->type == OP_NON_MATCH_LIST)
3123 #ifdef RE_ENABLE_I18N
3124 mbcset->non_match = 1;
3125 #endif /* not RE_ENABLE_I18N */
3126 non_match = 1;
3127 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3128 bitset_set (sbcset, '\0');
3129 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3130 token_len = peek_token_bracket (token, regexp, syntax);
3131 if (BE (token->type == END_OF_RE, 0))
3133 *err = REG_BADPAT;
3134 goto parse_bracket_exp_free_return;
3138 /* We treat the first ']' as a normal character. */
3139 if (token->type == OP_CLOSE_BRACKET)
3140 token->type = CHARACTER;
3142 while (1)
3144 bracket_elem_t start_elem, end_elem;
3145 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3146 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3147 reg_errcode_t ret;
3148 int token_len2 = 0, is_range_exp = 0;
3149 re_token_t token2;
3151 start_elem.opr.name = start_name_buf;
3152 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3153 syntax, first_round);
3154 if (BE (ret != REG_NOERROR, 0))
3156 *err = ret;
3157 goto parse_bracket_exp_free_return;
3159 first_round = 0;
3161 /* Get information about the next token. We need it in any case. */
3162 token_len = peek_token_bracket (token, regexp, syntax);
3164 /* Do not check for ranges if we know they are not allowed. */
3165 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3167 if (BE (token->type == END_OF_RE, 0))
3169 *err = REG_EBRACK;
3170 goto parse_bracket_exp_free_return;
3172 if (token->type == OP_CHARSET_RANGE)
3174 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3175 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3176 if (BE (token2.type == END_OF_RE, 0))
3178 *err = REG_EBRACK;
3179 goto parse_bracket_exp_free_return;
3181 if (token2.type == OP_CLOSE_BRACKET)
3183 /* We treat the last '-' as a normal character. */
3184 re_string_skip_bytes (regexp, -token_len);
3185 token->type = CHARACTER;
3187 else
3188 is_range_exp = 1;
3192 if (is_range_exp == 1)
3194 end_elem.opr.name = end_name_buf;
3195 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3196 dfa, syntax, 1);
3197 if (BE (ret != REG_NOERROR, 0))
3199 *err = ret;
3200 goto parse_bracket_exp_free_return;
3203 token_len = peek_token_bracket (token, regexp, syntax);
3205 #ifdef _LIBC
3206 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3207 &start_elem, &end_elem);
3208 #else
3209 # ifdef RE_ENABLE_I18N
3210 *err = build_range_exp (sbcset,
3211 dfa->mb_cur_max > 1 ? mbcset : NULL,
3212 &range_alloc, &start_elem, &end_elem);
3213 # else
3214 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3215 # endif
3216 #endif /* RE_ENABLE_I18N */
3217 if (BE (*err != REG_NOERROR, 0))
3218 goto parse_bracket_exp_free_return;
3220 else
3222 switch (start_elem.type)
3224 case SB_CHAR:
3225 bitset_set (sbcset, start_elem.opr.ch);
3226 break;
3227 #ifdef RE_ENABLE_I18N
3228 case MB_CHAR:
3229 /* Check whether the array has enough space. */
3230 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3232 wchar_t *new_mbchars;
3233 /* Not enough, realloc it. */
3234 /* +1 in case of mbcset->nmbchars is 0. */
3235 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3236 /* Use realloc since array is NULL if *alloc == 0. */
3237 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3238 mbchar_alloc);
3239 if (BE (new_mbchars == NULL, 0))
3240 goto parse_bracket_exp_espace;
3241 mbcset->mbchars = new_mbchars;
3243 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3244 break;
3245 #endif /* RE_ENABLE_I18N */
3246 case EQUIV_CLASS:
3247 *err = build_equiv_class (sbcset,
3248 #ifdef RE_ENABLE_I18N
3249 mbcset, &equiv_class_alloc,
3250 #endif /* RE_ENABLE_I18N */
3251 start_elem.opr.name);
3252 if (BE (*err != REG_NOERROR, 0))
3253 goto parse_bracket_exp_free_return;
3254 break;
3255 case COLL_SYM:
3256 *err = build_collating_symbol (sbcset,
3257 #ifdef RE_ENABLE_I18N
3258 mbcset, &coll_sym_alloc,
3259 #endif /* RE_ENABLE_I18N */
3260 start_elem.opr.name);
3261 if (BE (*err != REG_NOERROR, 0))
3262 goto parse_bracket_exp_free_return;
3263 break;
3264 case CHAR_CLASS:
3265 *err = build_charclass (regexp->trans, sbcset,
3266 #ifdef RE_ENABLE_I18N
3267 mbcset, &char_class_alloc,
3268 #endif /* RE_ENABLE_I18N */
3269 start_elem.opr.name, syntax);
3270 if (BE (*err != REG_NOERROR, 0))
3271 goto parse_bracket_exp_free_return;
3272 break;
3273 default:
3274 assert (0);
3275 break;
3278 if (BE (token->type == END_OF_RE, 0))
3280 *err = REG_EBRACK;
3281 goto parse_bracket_exp_free_return;
3283 if (token->type == OP_CLOSE_BRACKET)
3284 break;
3287 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3289 /* If it is non-matching list. */
3290 if (non_match)
3291 bitset_not (sbcset);
3293 #ifdef RE_ENABLE_I18N
3294 /* Ensure only single byte characters are set. */
3295 if (dfa->mb_cur_max > 1)
3296 bitset_mask (sbcset, dfa->sb_char);
3297 #endif /* RE_ENABLE_I18N */
3299 /* Build a tree for simple bracket. */
3300 br_token.type = SIMPLE_BRACKET;
3301 br_token.opr.sbcset = sbcset;
3302 work_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3303 if (BE (work_tree == NULL, 0))
3304 goto parse_bracket_exp_espace;
3306 #ifdef RE_ENABLE_I18N
3307 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3308 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3309 || mbcset->non_match)))
3311 re_token_t alt_token;
3312 bin_tree_t *mbc_tree;
3313 int sbc_idx;
3314 /* Build a tree for complex bracket. */
3315 dfa->has_mb_node = 1;
3316 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3317 if (sbcset[sbc_idx])
3318 break;
3319 /* If there are no bits set in sbcset, there is no point
3320 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3321 if (sbc_idx == BITSET_UINTS)
3323 re_free (sbcset);
3324 dfa->nodes[work_tree->node_idx].type = COMPLEX_BRACKET;
3325 dfa->nodes[work_tree->node_idx].opr.mbcset = mbcset;
3326 return work_tree;
3328 br_token.type = COMPLEX_BRACKET;
3329 br_token.opr.mbcset = mbcset;
3330 mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3331 if (BE (mbc_tree == NULL, 0))
3332 goto parse_bracket_exp_espace;
3333 /* Then join them by ALT node. */
3334 alt_token.type = OP_ALT;
3335 dfa->has_plural_match = 1;
3336 work_tree = re_dfa_add_tree_node (dfa, work_tree, mbc_tree, &alt_token);
3337 if (BE (mbc_tree != NULL, 1))
3338 return work_tree;
3340 else
3342 free_charset (mbcset);
3343 return work_tree;
3345 #else /* not RE_ENABLE_I18N */
3346 return work_tree;
3347 #endif /* not RE_ENABLE_I18N */
3349 parse_bracket_exp_espace:
3350 *err = REG_ESPACE;
3351 parse_bracket_exp_free_return:
3352 re_free (sbcset);
3353 #ifdef RE_ENABLE_I18N
3354 free_charset (mbcset);
3355 #endif /* RE_ENABLE_I18N */
3356 return NULL;
3359 /* Parse an element in the bracket expression. */
3361 static reg_errcode_t
3362 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3363 accept_hyphen)
3364 bracket_elem_t *elem;
3365 re_string_t *regexp;
3366 re_token_t *token;
3367 int token_len;
3368 re_dfa_t *dfa;
3369 reg_syntax_t syntax;
3370 int accept_hyphen;
3372 #ifdef RE_ENABLE_I18N
3373 int cur_char_size;
3374 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3375 if (cur_char_size > 1)
3377 elem->type = MB_CHAR;
3378 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3379 re_string_skip_bytes (regexp, cur_char_size);
3380 return REG_NOERROR;
3382 #endif /* RE_ENABLE_I18N */
3383 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3384 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3385 || token->type == OP_OPEN_EQUIV_CLASS)
3386 return parse_bracket_symbol (elem, regexp, token);
3387 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3389 /* A '-' must only appear as anything but a range indicator before
3390 the closing bracket. Everything else is an error. */
3391 re_token_t token2;
3392 (void) peek_token_bracket (&token2, regexp, syntax);
3393 if (token2.type != OP_CLOSE_BRACKET)
3394 /* The actual error value is not standardized since this whole
3395 case is undefined. But ERANGE makes good sense. */
3396 return REG_ERANGE;
3398 elem->type = SB_CHAR;
3399 elem->opr.ch = token->opr.c;
3400 return REG_NOERROR;
3403 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3404 such as [:<character_class>:], [.<collating_element>.], and
3405 [=<equivalent_class>=]. */
3407 static reg_errcode_t
3408 parse_bracket_symbol (elem, regexp, token)
3409 bracket_elem_t *elem;
3410 re_string_t *regexp;
3411 re_token_t *token;
3413 unsigned char ch, delim = token->opr.c;
3414 int i = 0;
3415 if (re_string_eoi(regexp))
3416 return REG_EBRACK;
3417 for (;; ++i)
3419 if (i >= BRACKET_NAME_BUF_SIZE)
3420 return REG_EBRACK;
3421 if (token->type == OP_OPEN_CHAR_CLASS)
3422 ch = re_string_fetch_byte_case (regexp);
3423 else
3424 ch = re_string_fetch_byte (regexp);
3425 if (re_string_eoi(regexp))
3426 return REG_EBRACK;
3427 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3428 break;
3429 elem->opr.name[i] = ch;
3431 re_string_skip_bytes (regexp, 1);
3432 elem->opr.name[i] = '\0';
3433 switch (token->type)
3435 case OP_OPEN_COLL_ELEM:
3436 elem->type = COLL_SYM;
3437 break;
3438 case OP_OPEN_EQUIV_CLASS:
3439 elem->type = EQUIV_CLASS;
3440 break;
3441 case OP_OPEN_CHAR_CLASS:
3442 elem->type = CHAR_CLASS;
3443 break;
3444 default:
3445 break;
3447 return REG_NOERROR;
3450 /* Helper function for parse_bracket_exp.
3451 Build the equivalence class which is represented by NAME.
3452 The result are written to MBCSET and SBCSET.
3453 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3454 is a pointer argument sinse we may update it. */
3456 static reg_errcode_t
3457 #ifdef RE_ENABLE_I18N
3458 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3459 re_charset_t *mbcset;
3460 int *equiv_class_alloc;
3461 #else /* not RE_ENABLE_I18N */
3462 build_equiv_class (sbcset, name)
3463 #endif /* not RE_ENABLE_I18N */
3464 re_bitset_ptr_t sbcset;
3465 const unsigned char *name;
3467 #if defined _LIBC
3468 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3469 if (nrules != 0)
3471 const int32_t *table, *indirect;
3472 const unsigned char *weights, *extra, *cp;
3473 unsigned char char_buf[2];
3474 int32_t idx1, idx2;
3475 unsigned int ch;
3476 size_t len;
3477 /* This #include defines a local function! */
3478 # include <locale/weight.h>
3479 /* Calculate the index for equivalence class. */
3480 cp = name;
3481 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3482 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3483 _NL_COLLATE_WEIGHTMB);
3484 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3485 _NL_COLLATE_EXTRAMB);
3486 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3487 _NL_COLLATE_INDIRECTMB);
3488 idx1 = findidx (&cp);
3489 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3490 /* This isn't a valid character. */
3491 return REG_ECOLLATE;
3493 /* Build single byte matcing table for this equivalence class. */
3494 char_buf[1] = (unsigned char) '\0';
3495 len = weights[idx1];
3496 for (ch = 0; ch < SBC_MAX; ++ch)
3498 char_buf[0] = ch;
3499 cp = char_buf;
3500 idx2 = findidx (&cp);
3502 idx2 = table[ch];
3504 if (idx2 == 0)
3505 /* This isn't a valid character. */
3506 continue;
3507 if (len == weights[idx2])
3509 int cnt = 0;
3510 while (cnt <= len &&
3511 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3512 ++cnt;
3514 if (cnt > len)
3515 bitset_set (sbcset, ch);
3518 /* Check whether the array has enough space. */
3519 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3521 /* Not enough, realloc it. */
3522 /* +1 in case of mbcset->nequiv_classes is 0. */
3523 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3524 /* Use realloc since the array is NULL if *alloc == 0. */
3525 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3526 int32_t,
3527 new_equiv_class_alloc);
3528 if (BE (new_equiv_classes == NULL, 0))
3529 return REG_ESPACE;
3530 mbcset->equiv_classes = new_equiv_classes;
3531 *equiv_class_alloc = new_equiv_class_alloc;
3533 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3535 else
3536 #endif /* _LIBC */
3538 if (BE (strlen ((const char *) name) != 1, 0))
3539 return REG_ECOLLATE;
3540 bitset_set (sbcset, *name);
3542 return REG_NOERROR;
3545 /* Helper function for parse_bracket_exp.
3546 Build the character class which is represented by NAME.
3547 The result are written to MBCSET and SBCSET.
3548 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3549 is a pointer argument sinse we may update it. */
3551 static reg_errcode_t
3552 #ifdef RE_ENABLE_I18N
3553 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3554 re_charset_t *mbcset;
3555 int *char_class_alloc;
3556 #else /* not RE_ENABLE_I18N */
3557 build_charclass (trans, sbcset, class_name, syntax)
3558 #endif /* not RE_ENABLE_I18N */
3559 unsigned RE_TRANSLATE_TYPE trans;
3560 re_bitset_ptr_t sbcset;
3561 const unsigned char *class_name;
3562 reg_syntax_t syntax;
3564 int i;
3565 const char *name = (const char *) class_name;
3567 /* In case of REG_ICASE "upper" and "lower" match the both of
3568 upper and lower cases. */
3569 if ((syntax & RE_ICASE)
3570 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3571 name = "alpha";
3573 #ifdef RE_ENABLE_I18N
3574 /* Check the space of the arrays. */
3575 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3577 /* Not enough, realloc it. */
3578 /* +1 in case of mbcset->nchar_classes is 0. */
3579 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3580 /* Use realloc since array is NULL if *alloc == 0. */
3581 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3582 new_char_class_alloc);
3583 if (BE (new_char_classes == NULL, 0))
3584 return REG_ESPACE;
3585 mbcset->char_classes = new_char_classes;
3586 *char_class_alloc = new_char_class_alloc;
3588 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3589 #endif /* RE_ENABLE_I18N */
3591 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3592 for (i = 0; i < SBC_MAX; ++i) \
3594 if (ctype_func (i)) \
3596 int ch = trans ? trans[i] : i; \
3597 bitset_set (sbcset, ch); \
3601 if (strcmp (name, "alnum") == 0)
3602 BUILD_CHARCLASS_LOOP (isalnum)
3603 else if (strcmp (name, "cntrl") == 0)
3604 BUILD_CHARCLASS_LOOP (iscntrl)
3605 else if (strcmp (name, "lower") == 0)
3606 BUILD_CHARCLASS_LOOP (islower)
3607 else if (strcmp (name, "space") == 0)
3608 BUILD_CHARCLASS_LOOP (isspace)
3609 else if (strcmp (name, "alpha") == 0)
3610 BUILD_CHARCLASS_LOOP (isalpha)
3611 else if (strcmp (name, "digit") == 0)
3612 BUILD_CHARCLASS_LOOP (isdigit)
3613 else if (strcmp (name, "print") == 0)
3614 BUILD_CHARCLASS_LOOP (isprint)
3615 else if (strcmp (name, "upper") == 0)
3616 BUILD_CHARCLASS_LOOP (isupper)
3617 else if (strcmp (name, "blank") == 0)
3618 BUILD_CHARCLASS_LOOP (isblank)
3619 else if (strcmp (name, "graph") == 0)
3620 BUILD_CHARCLASS_LOOP (isgraph)
3621 else if (strcmp (name, "punct") == 0)
3622 BUILD_CHARCLASS_LOOP (ispunct)
3623 else if (strcmp (name, "xdigit") == 0)
3624 BUILD_CHARCLASS_LOOP (isxdigit)
3625 else
3626 return REG_ECTYPE;
3628 return REG_NOERROR;
3631 static bin_tree_t *
3632 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3633 re_dfa_t *dfa;
3634 unsigned RE_TRANSLATE_TYPE trans;
3635 const unsigned char *class_name;
3636 const unsigned char *extra;
3637 int non_match;
3638 reg_errcode_t *err;
3640 re_bitset_ptr_t sbcset;
3641 #ifdef RE_ENABLE_I18N
3642 re_charset_t *mbcset;
3643 int alloc = 0;
3644 #endif /* not RE_ENABLE_I18N */
3645 reg_errcode_t ret;
3646 re_token_t br_token;
3647 bin_tree_t *tree;
3649 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3650 #ifdef RE_ENABLE_I18N
3651 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3652 #endif /* RE_ENABLE_I18N */
3654 #ifdef RE_ENABLE_I18N
3655 if (BE (sbcset == NULL || mbcset == NULL, 0))
3656 #else /* not RE_ENABLE_I18N */
3657 if (BE (sbcset == NULL, 0))
3658 #endif /* not RE_ENABLE_I18N */
3660 *err = REG_ESPACE;
3661 return NULL;
3664 if (non_match)
3666 #ifdef RE_ENABLE_I18N
3668 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3669 bitset_set(cset->sbcset, '\0');
3671 mbcset->non_match = 1;
3672 #endif /* not RE_ENABLE_I18N */
3675 /* We don't care the syntax in this case. */
3676 ret = build_charclass (trans, sbcset,
3677 #ifdef RE_ENABLE_I18N
3678 mbcset, &alloc,
3679 #endif /* RE_ENABLE_I18N */
3680 class_name, 0);
3682 if (BE (ret != REG_NOERROR, 0))
3684 re_free (sbcset);
3685 #ifdef RE_ENABLE_I18N
3686 free_charset (mbcset);
3687 #endif /* RE_ENABLE_I18N */
3688 *err = ret;
3689 return NULL;
3691 /* \w match '_' also. */
3692 for (; *extra; extra++)
3693 bitset_set (sbcset, *extra);
3695 /* If it is non-matching list. */
3696 if (non_match)
3697 bitset_not (sbcset);
3699 #ifdef RE_ENABLE_I18N
3700 /* Ensure only single byte characters are set. */
3701 if (dfa->mb_cur_max > 1)
3702 bitset_mask (sbcset, dfa->sb_char);
3703 #endif
3705 /* Build a tree for simple bracket. */
3706 br_token.type = SIMPLE_BRACKET;
3707 br_token.opr.sbcset = sbcset;
3708 tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3709 if (BE (tree == NULL, 0))
3710 goto build_word_op_espace;
3712 #ifdef RE_ENABLE_I18N
3713 if (dfa->mb_cur_max > 1)
3715 re_token_t alt_token;
3716 bin_tree_t *mbc_tree;
3717 /* Build a tree for complex bracket. */
3718 br_token.type = COMPLEX_BRACKET;
3719 br_token.opr.mbcset = mbcset;
3720 dfa->has_mb_node = 1;
3721 mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3722 if (BE (mbc_tree == NULL, 0))
3723 goto build_word_op_espace;
3724 /* Then join them by ALT node. */
3725 alt_token.type = OP_ALT;
3726 dfa->has_plural_match = 1;
3727 tree = re_dfa_add_tree_node (dfa, tree, mbc_tree, &alt_token);
3728 if (BE (mbc_tree != NULL, 1))
3729 return tree;
3731 else
3733 free_charset (mbcset);
3734 return tree;
3736 #else /* not RE_ENABLE_I18N */
3737 return tree;
3738 #endif /* not RE_ENABLE_I18N */
3740 build_word_op_espace:
3741 re_free (sbcset);
3742 #ifdef RE_ENABLE_I18N
3743 free_charset (mbcset);
3744 #endif /* RE_ENABLE_I18N */
3745 *err = REG_ESPACE;
3746 return NULL;
3749 /* This is intended for the expressions like "a{1,3}".
3750 Fetch a number from `input', and return the number.
3751 Return -1, if the number field is empty like "{,1}".
3752 Return -2, If an error is occured. */
3754 static int
3755 fetch_number (input, token, syntax)
3756 re_string_t *input;
3757 re_token_t *token;
3758 reg_syntax_t syntax;
3760 int num = -1;
3761 unsigned char c;
3762 while (1)
3764 fetch_token (token, input, syntax);
3765 c = token->opr.c;
3766 if (BE (token->type == END_OF_RE, 0))
3767 return -2;
3768 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3769 break;
3770 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3771 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3772 num = (num > RE_DUP_MAX) ? -2 : num;
3774 return num;
3777 #ifdef RE_ENABLE_I18N
3778 static void
3779 free_charset (re_charset_t *cset)
3781 re_free (cset->mbchars);
3782 # ifdef _LIBC
3783 re_free (cset->coll_syms);
3784 re_free (cset->equiv_classes);
3785 re_free (cset->range_starts);
3786 re_free (cset->range_ends);
3787 # endif
3788 re_free (cset->char_classes);
3789 re_free (cset);
3791 #endif /* RE_ENABLE_I18N */
3793 /* Functions for binary tree operation. */
3795 /* Create a tree node. */
3797 static bin_tree_t *
3798 create_tree (dfa, left, right, type, index)
3799 re_dfa_t *dfa;
3800 bin_tree_t *left;
3801 bin_tree_t *right;
3802 re_token_type_t type;
3803 int index;
3805 bin_tree_t *tree;
3806 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3808 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3810 if (storage == NULL)
3811 return NULL;
3812 storage->next = dfa->str_tree_storage;
3813 dfa->str_tree_storage = storage;
3814 dfa->str_tree_storage_idx = 0;
3816 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3818 tree->parent = NULL;
3819 tree->left = left;
3820 tree->right = right;
3821 tree->type = type;
3822 tree->node_idx = index;
3823 tree->first = -1;
3824 tree->next = -1;
3825 re_node_set_init_empty (&tree->eclosure);
3827 if (left != NULL)
3828 left->parent = tree;
3829 if (right != NULL)
3830 right->parent = tree;
3831 return tree;
3834 /* Create both a DFA node and a tree for it. */
3836 static bin_tree_t *
3837 re_dfa_add_tree_node (dfa, left, right, token)
3838 re_dfa_t *dfa;
3839 bin_tree_t *left;
3840 bin_tree_t *right;
3841 const re_token_t *token;
3843 int new_idx = re_dfa_add_node (dfa, *token, 0);
3845 if (new_idx == -1)
3846 return NULL;
3848 return create_tree (dfa, left, right, 0, new_idx);
3851 /* Mark the tree SRC as an optional subexpression. */
3853 static void
3854 mark_opt_subexp (src, dfa)
3855 const bin_tree_t *src;
3856 re_dfa_t *dfa;
3858 /* Pass an OPT_SUBEXP_IDX which is != 1 if the duplicated tree is
3859 a subexpression. */
3860 if (src->type == CONCAT
3861 && src->left->type == NON_TYPE
3862 && dfa->nodes[src->left->node_idx].type == OP_OPEN_SUBEXP)
3863 mark_opt_subexp_iter (src, dfa, dfa->nodes[src->left->node_idx].opr.idx);
3867 /* Recursive tree walker for mark_opt_subexp. */
3869 static void
3870 mark_opt_subexp_iter (src, dfa, idx)
3871 const bin_tree_t *src;
3872 re_dfa_t *dfa;
3873 int idx;
3875 int node_idx;
3877 if (src->type == NON_TYPE)
3879 node_idx = src->node_idx;
3880 if ((dfa->nodes[node_idx].type == OP_OPEN_SUBEXP
3881 || dfa->nodes[node_idx].type == OP_CLOSE_SUBEXP)
3882 && dfa->nodes[node_idx].opr.idx == idx)
3883 dfa->nodes[node_idx].opt_subexp = 1;
3886 if (src->left != NULL)
3887 mark_opt_subexp_iter (src->left, dfa, idx);
3889 if (src->right != NULL)
3890 mark_opt_subexp_iter (src->right, dfa, idx);
3894 /* Duplicate the node SRC, and return new node. */
3896 static bin_tree_t *
3897 duplicate_tree (src, dfa)
3898 const bin_tree_t *src;
3899 re_dfa_t *dfa;
3901 bin_tree_t *left = NULL, *right = NULL, *new_tree;
3902 int new_node_idx;
3903 /* Since node indies must be according to Post-order of the tree,
3904 we must duplicate the left at first. */
3905 if (src->left != NULL)
3907 left = duplicate_tree (src->left, dfa);
3908 if (left == NULL)
3909 return NULL;
3912 /* Secondaly, duplicate the right. */
3913 if (src->right != NULL)
3915 right = duplicate_tree (src->right, dfa);
3916 if (right == NULL)
3917 return NULL;
3920 /* At last, duplicate itself. */
3921 if (src->type == NON_TYPE)
3923 new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3924 dfa->nodes[new_node_idx].duplicated = 1;
3925 if (BE (new_node_idx == -1, 0))
3926 return NULL;
3928 else
3929 new_node_idx = src->type;
3931 new_tree = create_tree (dfa, left, right, src->type, new_node_idx);
3932 return new_tree;