2004-11-18 Alexandre Oliva <aoliva@redhat.com>
[glibc.git] / posix / regcomp.c
blob1a5f7952c37486923fddee56f3d6bf428a55125c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 int length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
27 static void init_word_char (re_dfa_t *dfa);
28 #ifdef RE_ENABLE_I18N
29 static void free_charset (re_charset_t *cset);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 #ifdef RE_ENABLE_I18N
34 static void optimize_utf8 (re_dfa_t *dfa);
35 #endif
36 static reg_errcode_t analyze (regex_t *preg);
37 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
38 static reg_errcode_t preorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41 static reg_errcode_t postorder (bin_tree_t *root,
42 reg_errcode_t (fn (void *, bin_tree_t *)),
43 void *extra);
44 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
45 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
46 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
47 bin_tree_t *node);
48 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
49 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
50 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
51 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
52 int top_clone_node, int root_node,
53 unsigned int constraint);
54 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
55 unsigned int constraint);
56 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
57 unsigned int constraint);
58 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
59 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
60 int node, int root);
61 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
62 static int fetch_number (re_string_t *input, re_token_t *token,
63 reg_syntax_t syntax);
64 static void fetch_token (re_token_t *result, re_string_t *input,
65 reg_syntax_t syntax);
66 static int peek_token (re_token_t *token, re_string_t *input,
67 reg_syntax_t syntax);
68 static int peek_token_bracket (re_token_t *token, re_string_t *input,
69 reg_syntax_t syntax);
70 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
71 reg_syntax_t syntax, reg_errcode_t *err);
72 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
73 re_token_t *token, reg_syntax_t syntax,
74 int nest, reg_errcode_t *err);
75 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
76 re_token_t *token, reg_syntax_t syntax,
77 int nest, reg_errcode_t *err);
78 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
79 re_token_t *token, reg_syntax_t syntax,
80 int nest, reg_errcode_t *err);
81 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
82 re_token_t *token, reg_syntax_t syntax,
83 int nest, reg_errcode_t *err);
84 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
85 re_dfa_t *dfa, re_token_t *token,
86 reg_syntax_t syntax, reg_errcode_t *err);
87 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
88 re_token_t *token, reg_syntax_t syntax,
89 reg_errcode_t *err);
90 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
91 re_string_t *regexp,
92 re_token_t *token, int token_len,
93 re_dfa_t *dfa,
94 reg_syntax_t syntax,
95 int accept_hyphen);
96 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
97 re_string_t *regexp,
98 re_token_t *token);
99 #ifndef _LIBC
100 # ifdef RE_ENABLE_I18N
101 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
102 re_charset_t *mbcset, int *range_alloc,
103 bracket_elem_t *start_elem,
104 bracket_elem_t *end_elem);
105 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
106 re_charset_t *mbcset,
107 int *coll_sym_alloc,
108 const unsigned char *name);
109 # else /* not RE_ENABLE_I18N */
110 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
111 bracket_elem_t *start_elem,
112 bracket_elem_t *end_elem);
113 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
114 const unsigned char *name);
115 # endif /* not RE_ENABLE_I18N */
116 #endif /* not _LIBC */
117 #ifdef RE_ENABLE_I18N
118 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
119 re_charset_t *mbcset,
120 int *equiv_class_alloc,
121 const unsigned char *name);
122 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
123 re_bitset_ptr_t sbcset,
124 re_charset_t *mbcset,
125 int *char_class_alloc,
126 const unsigned char *class_name,
127 reg_syntax_t syntax);
128 #else /* not RE_ENABLE_I18N */
129 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
130 const unsigned char *name);
131 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
132 re_bitset_ptr_t sbcset,
133 const unsigned char *class_name,
134 reg_syntax_t syntax);
135 #endif /* not RE_ENABLE_I18N */
136 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
137 unsigned RE_TRANSLATE_TYPE trans,
138 const unsigned char *class_name,
139 const unsigned char *extra,
140 int non_match, reg_errcode_t *err);
141 static bin_tree_t *create_tree (re_dfa_t *dfa,
142 bin_tree_t *left, bin_tree_t *right,
143 re_token_type_t type);
144 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
145 bin_tree_t *left, bin_tree_t *right,
146 const re_token_t *token);
147 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
148 static void free_token (re_token_t *node);
149 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
150 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
152 /* This table gives an error message for each of the error codes listed
153 in regex.h. Obviously the order here has to be same as there.
154 POSIX doesn't require that we do anything for REG_NOERROR,
155 but why not be nice? */
157 const char __re_error_msgid[] attribute_hidden =
159 #define REG_NOERROR_IDX 0
160 gettext_noop ("Success") /* REG_NOERROR */
161 "\0"
162 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
163 gettext_noop ("No match") /* REG_NOMATCH */
164 "\0"
165 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
166 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
167 "\0"
168 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
169 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
170 "\0"
171 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
172 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
173 "\0"
174 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
175 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
176 "\0"
177 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
178 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
179 "\0"
180 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
181 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
182 "\0"
183 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
184 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
185 "\0"
186 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
187 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
188 "\0"
189 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
190 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
191 "\0"
192 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
193 gettext_noop ("Invalid range end") /* REG_ERANGE */
194 "\0"
195 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
196 gettext_noop ("Memory exhausted") /* REG_ESPACE */
197 "\0"
198 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
199 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
200 "\0"
201 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
202 gettext_noop ("Premature end of regular expression") /* REG_EEND */
203 "\0"
204 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
205 gettext_noop ("Regular expression too big") /* REG_ESIZE */
206 "\0"
207 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
208 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
211 const size_t __re_error_msgid_idx[] attribute_hidden =
213 REG_NOERROR_IDX,
214 REG_NOMATCH_IDX,
215 REG_BADPAT_IDX,
216 REG_ECOLLATE_IDX,
217 REG_ECTYPE_IDX,
218 REG_EESCAPE_IDX,
219 REG_ESUBREG_IDX,
220 REG_EBRACK_IDX,
221 REG_EPAREN_IDX,
222 REG_EBRACE_IDX,
223 REG_BADBR_IDX,
224 REG_ERANGE_IDX,
225 REG_ESPACE_IDX,
226 REG_BADRPT_IDX,
227 REG_EEND_IDX,
228 REG_ESIZE_IDX,
229 REG_ERPAREN_IDX
232 /* Entry points for GNU code. */
234 /* re_compile_pattern is the GNU regular expression compiler: it
235 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
236 Returns 0 if the pattern was valid, otherwise an error string.
238 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
239 are set in BUFP on entry. */
241 const char *
242 re_compile_pattern (pattern, length, bufp)
243 const char *pattern;
244 size_t length;
245 struct re_pattern_buffer *bufp;
247 reg_errcode_t ret;
249 /* And GNU code determines whether or not to get register information
250 by passing null for the REGS argument to re_match, etc., not by
251 setting no_sub, unless RE_NO_SUB is set. */
252 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
254 /* Match anchors at newline. */
255 bufp->newline_anchor = 1;
257 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
259 if (!ret)
260 return NULL;
261 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
263 #ifdef _LIBC
264 weak_alias (__re_compile_pattern, re_compile_pattern)
265 #endif
267 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
268 also be assigned to arbitrarily: each pattern buffer stores its own
269 syntax, so it can be changed between regex compilations. */
270 /* This has no initializer because initialized variables in Emacs
271 become read-only after dumping. */
272 reg_syntax_t re_syntax_options;
275 /* Specify the precise syntax of regexps for compilation. This provides
276 for compatibility for various utilities which historically have
277 different, incompatible syntaxes.
279 The argument SYNTAX is a bit mask comprised of the various bits
280 defined in regex.h. We return the old syntax. */
282 reg_syntax_t
283 re_set_syntax (syntax)
284 reg_syntax_t syntax;
286 reg_syntax_t ret = re_syntax_options;
288 re_syntax_options = syntax;
289 return ret;
291 #ifdef _LIBC
292 weak_alias (__re_set_syntax, re_set_syntax)
293 #endif
296 re_compile_fastmap (bufp)
297 struct re_pattern_buffer *bufp;
299 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
300 char *fastmap = bufp->fastmap;
302 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
303 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
304 if (dfa->init_state != dfa->init_state_word)
305 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
306 if (dfa->init_state != dfa->init_state_nl)
307 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
308 if (dfa->init_state != dfa->init_state_begbuf)
309 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
310 bufp->fastmap_accurate = 1;
311 return 0;
313 #ifdef _LIBC
314 weak_alias (__re_compile_fastmap, re_compile_fastmap)
315 #endif
317 static inline void
318 __attribute ((always_inline))
319 re_set_fastmap (char *fastmap, int icase, int ch)
321 fastmap[ch] = 1;
322 if (icase)
323 fastmap[tolower (ch)] = 1;
326 /* Helper function for re_compile_fastmap.
327 Compile fastmap for the initial_state INIT_STATE. */
329 static void
330 re_compile_fastmap_iter (bufp, init_state, fastmap)
331 regex_t *bufp;
332 const re_dfastate_t *init_state;
333 char *fastmap;
335 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
336 int node_cnt;
337 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
338 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
340 int node = init_state->nodes.elems[node_cnt];
341 re_token_type_t type = dfa->nodes[node].type;
343 if (type == CHARACTER)
345 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
346 #ifdef RE_ENABLE_I18N
347 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
349 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
350 wchar_t wc;
351 mbstate_t state;
353 p = buf;
354 *p++ = dfa->nodes[node].opr.c;
355 while (++node < dfa->nodes_len
356 && dfa->nodes[node].type == CHARACTER
357 && dfa->nodes[node].mb_partial)
358 *p++ = dfa->nodes[node].opr.c;
359 memset (&state, 0, sizeof (state));
360 if (mbrtowc (&wc, (const char *) buf, p - buf,
361 &state) == p - buf
362 && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
363 re_set_fastmap (fastmap, 0, buf[0]);
365 #endif
367 else if (type == SIMPLE_BRACKET)
369 int i, j, ch;
370 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
371 for (j = 0; j < UINT_BITS; ++j, ++ch)
372 if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
373 re_set_fastmap (fastmap, icase, ch);
375 #ifdef RE_ENABLE_I18N
376 else if (type == COMPLEX_BRACKET)
378 int i;
379 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
380 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
381 || cset->nranges || cset->nchar_classes)
383 # ifdef _LIBC
384 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
386 /* In this case we want to catch the bytes which are
387 the first byte of any collation elements.
388 e.g. In da_DK, we want to catch 'a' since "aa"
389 is a valid collation element, and don't catch
390 'b' since 'b' is the only collation element
391 which starts from 'b'. */
392 int j, ch;
393 const int32_t *table = (const int32_t *)
394 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
395 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
396 for (j = 0; j < UINT_BITS; ++j, ++ch)
397 if (table[ch] < 0)
398 re_set_fastmap (fastmap, icase, ch);
400 # else
401 if (dfa->mb_cur_max > 1)
402 for (i = 0; i < SBC_MAX; ++i)
403 if (__btowc (i) == WEOF)
404 re_set_fastmap (fastmap, icase, i);
405 # endif /* not _LIBC */
407 for (i = 0; i < cset->nmbchars; ++i)
409 char buf[256];
410 mbstate_t state;
411 memset (&state, '\0', sizeof (state));
412 __wcrtomb (buf, cset->mbchars[i], &state);
413 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
414 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
416 __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
417 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
421 #endif /* RE_ENABLE_I18N */
422 else if (type == OP_PERIOD
423 #ifdef RE_ENABLE_I18N
424 || type == OP_UTF8_PERIOD
425 #endif /* RE_ENABLE_I18N */
426 || type == END_OF_RE)
428 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
429 if (type == END_OF_RE)
430 bufp->can_be_null = 1;
431 return;
436 /* Entry point for POSIX code. */
437 /* regcomp takes a regular expression as a string and compiles it.
439 PREG is a regex_t *. We do not expect any fields to be initialized,
440 since POSIX says we shouldn't. Thus, we set
442 `buffer' to the compiled pattern;
443 `used' to the length of the compiled pattern;
444 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
445 REG_EXTENDED bit in CFLAGS is set; otherwise, to
446 RE_SYNTAX_POSIX_BASIC;
447 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
448 `fastmap' to an allocated space for the fastmap;
449 `fastmap_accurate' to zero;
450 `re_nsub' to the number of subexpressions in PATTERN.
452 PATTERN is the address of the pattern string.
454 CFLAGS is a series of bits which affect compilation.
456 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
457 use POSIX basic syntax.
459 If REG_NEWLINE is set, then . and [^...] don't match newline.
460 Also, regexec will try a match beginning after every newline.
462 If REG_ICASE is set, then we considers upper- and lowercase
463 versions of letters to be equivalent when matching.
465 If REG_NOSUB is set, then when PREG is passed to regexec, that
466 routine will report only success or failure, and nothing about the
467 registers.
469 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
470 the return codes and their meanings.) */
473 regcomp (preg, pattern, cflags)
474 regex_t *__restrict preg;
475 const char *__restrict pattern;
476 int cflags;
478 reg_errcode_t ret;
479 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
480 : RE_SYNTAX_POSIX_BASIC);
482 preg->buffer = NULL;
483 preg->allocated = 0;
484 preg->used = 0;
486 /* Try to allocate space for the fastmap. */
487 preg->fastmap = re_malloc (char, SBC_MAX);
488 if (BE (preg->fastmap == NULL, 0))
489 return REG_ESPACE;
491 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
493 /* If REG_NEWLINE is set, newlines are treated differently. */
494 if (cflags & REG_NEWLINE)
495 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
496 syntax &= ~RE_DOT_NEWLINE;
497 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
498 /* It also changes the matching behavior. */
499 preg->newline_anchor = 1;
501 else
502 preg->newline_anchor = 0;
503 preg->no_sub = !!(cflags & REG_NOSUB);
504 preg->translate = NULL;
506 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
508 /* POSIX doesn't distinguish between an unmatched open-group and an
509 unmatched close-group: both are REG_EPAREN. */
510 if (ret == REG_ERPAREN)
511 ret = REG_EPAREN;
513 /* We have already checked preg->fastmap != NULL. */
514 if (BE (ret == REG_NOERROR, 1))
515 /* Compute the fastmap now, since regexec cannot modify the pattern
516 buffer. This function never fails in this implementation. */
517 (void) re_compile_fastmap (preg);
518 else
520 /* Some error occurred while compiling the expression. */
521 re_free (preg->fastmap);
522 preg->fastmap = NULL;
525 return (int) ret;
527 #ifdef _LIBC
528 weak_alias (__regcomp, regcomp)
529 #endif
531 /* Returns a message corresponding to an error code, ERRCODE, returned
532 from either regcomp or regexec. We don't use PREG here. */
534 size_t
535 regerror (errcode, preg, errbuf, errbuf_size)
536 int errcode;
537 const regex_t *preg;
538 char *errbuf;
539 size_t errbuf_size;
541 const char *msg;
542 size_t msg_size;
544 if (BE (errcode < 0
545 || errcode >= (int) (sizeof (__re_error_msgid_idx)
546 / sizeof (__re_error_msgid_idx[0])), 0))
547 /* Only error codes returned by the rest of the code should be passed
548 to this routine. If we are given anything else, or if other regex
549 code generates an invalid error code, then the program has a bug.
550 Dump core so we can fix it. */
551 abort ();
553 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
555 msg_size = strlen (msg) + 1; /* Includes the null. */
557 if (BE (errbuf_size != 0, 1))
559 if (BE (msg_size > errbuf_size, 0))
561 #if defined HAVE_MEMPCPY || defined _LIBC
562 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
563 #else
564 memcpy (errbuf, msg, errbuf_size - 1);
565 errbuf[errbuf_size - 1] = 0;
566 #endif
568 else
569 memcpy (errbuf, msg, msg_size);
572 return msg_size;
574 #ifdef _LIBC
575 weak_alias (__regerror, regerror)
576 #endif
579 #ifdef RE_ENABLE_I18N
580 /* This static array is used for the map to single-byte characters when
581 UTF-8 is used. Otherwise we would allocate memory just to initialize
582 it the same all the time. UTF-8 is the preferred encoding so this is
583 a worthwhile optimization. */
584 static const bitset utf8_sb_map =
586 /* Set the first 128 bits. */
587 # if UINT_MAX == 0xffffffff
588 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
589 # else
590 # error "Add case for new unsigned int size"
591 # endif
593 #endif
596 static void
597 free_dfa_content (re_dfa_t *dfa)
599 int i, j;
601 if (dfa->nodes)
602 for (i = 0; i < dfa->nodes_len; ++i)
603 free_token (dfa->nodes + i);
604 re_free (dfa->nexts);
605 for (i = 0; i < dfa->nodes_len; ++i)
607 if (dfa->eclosures != NULL)
608 re_node_set_free (dfa->eclosures + i);
609 if (dfa->inveclosures != NULL)
610 re_node_set_free (dfa->inveclosures + i);
611 if (dfa->edests != NULL)
612 re_node_set_free (dfa->edests + i);
614 re_free (dfa->edests);
615 re_free (dfa->eclosures);
616 re_free (dfa->inveclosures);
617 re_free (dfa->nodes);
619 if (dfa->state_table)
620 for (i = 0; i <= dfa->state_hash_mask; ++i)
622 struct re_state_table_entry *entry = dfa->state_table + i;
623 for (j = 0; j < entry->num; ++j)
625 re_dfastate_t *state = entry->array[j];
626 free_state (state);
628 re_free (entry->array);
630 re_free (dfa->state_table);
631 #ifdef RE_ENABLE_I18N
632 if (dfa->sb_char != utf8_sb_map)
633 re_free (dfa->sb_char);
634 #endif
635 re_free (dfa->subexp_map);
636 #ifdef DEBUG
637 re_free (dfa->re_str);
638 #endif
640 re_free (dfa);
644 /* Free dynamically allocated space used by PREG. */
646 void
647 regfree (preg)
648 regex_t *preg;
650 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
651 if (BE (dfa != NULL, 1))
652 free_dfa_content (dfa);
653 preg->buffer = NULL;
654 preg->allocated = 0;
656 re_free (preg->fastmap);
657 preg->fastmap = NULL;
659 re_free (preg->translate);
660 preg->translate = NULL;
662 #ifdef _LIBC
663 weak_alias (__regfree, regfree)
664 #endif
666 /* Entry points compatible with 4.2 BSD regex library. We don't define
667 them unless specifically requested. */
669 #if defined _REGEX_RE_COMP || defined _LIBC
671 /* BSD has one and only one pattern buffer. */
672 static struct re_pattern_buffer re_comp_buf;
674 char *
675 # ifdef _LIBC
676 /* Make these definitions weak in libc, so POSIX programs can redefine
677 these names if they don't use our functions, and still use
678 regcomp/regexec above without link errors. */
679 weak_function
680 # endif
681 re_comp (s)
682 const char *s;
684 reg_errcode_t ret;
685 char *fastmap;
687 if (!s)
689 if (!re_comp_buf.buffer)
690 return gettext ("No previous regular expression");
691 return 0;
694 if (re_comp_buf.buffer)
696 fastmap = re_comp_buf.fastmap;
697 re_comp_buf.fastmap = NULL;
698 __regfree (&re_comp_buf);
699 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
700 re_comp_buf.fastmap = fastmap;
703 if (re_comp_buf.fastmap == NULL)
705 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
706 if (re_comp_buf.fastmap == NULL)
707 return (char *) gettext (__re_error_msgid
708 + __re_error_msgid_idx[(int) REG_ESPACE]);
711 /* Since `re_exec' always passes NULL for the `regs' argument, we
712 don't need to initialize the pattern buffer fields which affect it. */
714 /* Match anchors at newlines. */
715 re_comp_buf.newline_anchor = 1;
717 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
719 if (!ret)
720 return NULL;
722 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
723 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
726 #ifdef _LIBC
727 libc_freeres_fn (free_mem)
729 __regfree (&re_comp_buf);
731 #endif
733 #endif /* _REGEX_RE_COMP */
735 /* Internal entry point.
736 Compile the regular expression PATTERN, whose length is LENGTH.
737 SYNTAX indicate regular expression's syntax. */
739 static reg_errcode_t
740 re_compile_internal (preg, pattern, length, syntax)
741 regex_t *preg;
742 const char * pattern;
743 int length;
744 reg_syntax_t syntax;
746 reg_errcode_t err = REG_NOERROR;
747 re_dfa_t *dfa;
748 re_string_t regexp;
750 /* Initialize the pattern buffer. */
751 preg->fastmap_accurate = 0;
752 preg->syntax = syntax;
753 preg->not_bol = preg->not_eol = 0;
754 preg->used = 0;
755 preg->re_nsub = 0;
756 preg->can_be_null = 0;
757 preg->regs_allocated = REGS_UNALLOCATED;
759 /* Initialize the dfa. */
760 dfa = (re_dfa_t *) preg->buffer;
761 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
763 /* If zero allocated, but buffer is non-null, try to realloc
764 enough space. This loses if buffer's address is bogus, but
765 that is the user's responsibility. If ->buffer is NULL this
766 is a simple allocation. */
767 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
768 if (dfa == NULL)
769 return REG_ESPACE;
770 preg->allocated = sizeof (re_dfa_t);
771 preg->buffer = (unsigned char *) dfa;
773 preg->used = sizeof (re_dfa_t);
775 err = init_dfa (dfa, length);
776 if (BE (err != REG_NOERROR, 0))
778 free_dfa_content (dfa);
779 preg->buffer = NULL;
780 preg->allocated = 0;
781 return err;
783 #ifdef DEBUG
784 dfa->re_str = re_malloc (char, length + 1);
785 strncpy (dfa->re_str, pattern, length + 1);
786 #endif
788 err = re_string_construct (&regexp, pattern, length, preg->translate,
789 syntax & RE_ICASE, dfa);
790 if (BE (err != REG_NOERROR, 0))
792 re_compile_internal_free_return:
793 free_workarea_compile (preg);
794 re_string_destruct (&regexp);
795 free_dfa_content (dfa);
796 preg->buffer = NULL;
797 preg->allocated = 0;
798 return err;
801 /* Parse the regular expression, and build a structure tree. */
802 preg->re_nsub = 0;
803 dfa->str_tree = parse (&regexp, preg, syntax, &err);
804 if (BE (dfa->str_tree == NULL, 0))
805 goto re_compile_internal_free_return;
807 /* Analyze the tree and create the nfa. */
808 err = analyze (preg);
809 if (BE (err != REG_NOERROR, 0))
810 goto re_compile_internal_free_return;
812 #ifdef RE_ENABLE_I18N
813 /* If possible, do searching in single byte encoding to speed things up. */
814 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
815 optimize_utf8 (dfa);
816 #endif
818 /* Then create the initial state of the dfa. */
819 err = create_initial_state (dfa);
821 /* Release work areas. */
822 free_workarea_compile (preg);
823 re_string_destruct (&regexp);
825 if (BE (err != REG_NOERROR, 0))
827 free_dfa_content (dfa);
828 preg->buffer = NULL;
829 preg->allocated = 0;
832 return err;
835 /* Initialize DFA. We use the length of the regular expression PAT_LEN
836 as the initial length of some arrays. */
838 static reg_errcode_t
839 init_dfa (dfa, pat_len)
840 re_dfa_t *dfa;
841 int pat_len;
843 int table_size;
844 #ifndef _LIBC
845 char *codeset_name;
846 #endif
848 memset (dfa, '\0', sizeof (re_dfa_t));
850 /* Force allocation of str_tree_storage the first time. */
851 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
853 dfa->nodes_alloc = pat_len + 1;
854 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
856 dfa->states_alloc = pat_len + 1;
858 /* table_size = 2 ^ ceil(log pat_len) */
859 for (table_size = 1; table_size > 0; table_size <<= 1)
860 if (table_size > pat_len)
861 break;
863 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
864 dfa->state_hash_mask = table_size - 1;
866 dfa->mb_cur_max = MB_CUR_MAX;
867 #ifdef _LIBC
868 if (dfa->mb_cur_max == 6
869 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
870 dfa->is_utf8 = 1;
871 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
872 != 0);
873 #else
874 # ifdef HAVE_LANGINFO_CODESET
875 codeset_name = nl_langinfo (CODESET);
876 # else
877 codeset_name = getenv ("LC_ALL");
878 if (codeset_name == NULL || codeset_name[0] == '\0')
879 codeset_name = getenv ("LC_CTYPE");
880 if (codeset_name == NULL || codeset_name[0] == '\0')
881 codeset_name = getenv ("LANG");
882 if (codeset_name == NULL)
883 codeset_name = "";
884 else if (strchr (codeset_name, '.') != NULL)
885 codeset_name = strchr (codeset_name, '.') + 1;
886 # endif
888 if (strcasecmp (codeset_name, "UTF-8") == 0
889 || strcasecmp (codeset_name, "UTF8") == 0)
890 dfa->is_utf8 = 1;
892 /* We check exhaustively in the loop below if this charset is a
893 superset of ASCII. */
894 dfa->map_notascii = 0;
895 #endif
897 #ifdef RE_ENABLE_I18N
898 if (dfa->mb_cur_max > 1)
900 if (dfa->is_utf8)
901 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
902 else
904 int i, j, ch;
906 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
907 if (BE (dfa->sb_char == NULL, 0))
908 return REG_ESPACE;
910 /* Clear all bits by, then set those corresponding to single
911 byte chars. */
912 bitset_empty (dfa->sb_char);
914 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
915 for (j = 0; j < UINT_BITS; ++j, ++ch)
917 wchar_t wch = __btowc (ch);
918 if (wch != WEOF)
919 dfa->sb_char[i] |= 1 << j;
920 # ifndef _LIBC
921 if (isascii (ch) && wch != (wchar_t) ch)
922 dfa->map_notascii = 1;
923 # endif
927 #endif
929 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
930 return REG_ESPACE;
931 return REG_NOERROR;
934 /* Initialize WORD_CHAR table, which indicate which character is
935 "word". In this case "word" means that it is the word construction
936 character used by some operators like "\<", "\>", etc. */
938 static void
939 init_word_char (dfa)
940 re_dfa_t *dfa;
942 int i, j, ch;
943 dfa->word_ops_used = 1;
944 for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
945 for (j = 0; j < UINT_BITS; ++j, ++ch)
946 if (isalnum (ch) || ch == '_')
947 dfa->word_char[i] |= 1 << j;
950 /* Free the work area which are only used while compiling. */
952 static void
953 free_workarea_compile (preg)
954 regex_t *preg;
956 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
957 bin_tree_storage_t *storage, *next;
958 for (storage = dfa->str_tree_storage; storage; storage = next)
960 next = storage->next;
961 re_free (storage);
963 dfa->str_tree_storage = NULL;
964 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
965 dfa->str_tree = NULL;
966 re_free (dfa->org_indices);
967 dfa->org_indices = NULL;
970 /* Create initial states for all contexts. */
972 static reg_errcode_t
973 create_initial_state (dfa)
974 re_dfa_t *dfa;
976 int first, i;
977 reg_errcode_t err;
978 re_node_set init_nodes;
980 /* Initial states have the epsilon closure of the node which is
981 the first node of the regular expression. */
982 first = dfa->str_tree->first->node_idx;
983 dfa->init_node = first;
984 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
985 if (BE (err != REG_NOERROR, 0))
986 return err;
988 /* The back-references which are in initial states can epsilon transit,
989 since in this case all of the subexpressions can be null.
990 Then we add epsilon closures of the nodes which are the next nodes of
991 the back-references. */
992 if (dfa->nbackref > 0)
993 for (i = 0; i < init_nodes.nelem; ++i)
995 int node_idx = init_nodes.elems[i];
996 re_token_type_t type = dfa->nodes[node_idx].type;
998 int clexp_idx;
999 if (type != OP_BACK_REF)
1000 continue;
1001 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1003 re_token_t *clexp_node;
1004 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1005 if (clexp_node->type == OP_CLOSE_SUBEXP
1006 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1007 break;
1009 if (clexp_idx == init_nodes.nelem)
1010 continue;
1012 if (type == OP_BACK_REF)
1014 int dest_idx = dfa->edests[node_idx].elems[0];
1015 if (!re_node_set_contains (&init_nodes, dest_idx))
1017 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1018 i = 0;
1023 /* It must be the first time to invoke acquire_state. */
1024 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1025 /* We don't check ERR here, since the initial state must not be NULL. */
1026 if (BE (dfa->init_state == NULL, 0))
1027 return err;
1028 if (dfa->init_state->has_constraint)
1030 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1031 CONTEXT_WORD);
1032 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1033 CONTEXT_NEWLINE);
1034 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1035 &init_nodes,
1036 CONTEXT_NEWLINE
1037 | CONTEXT_BEGBUF);
1038 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1039 || dfa->init_state_begbuf == NULL, 0))
1040 return err;
1042 else
1043 dfa->init_state_word = dfa->init_state_nl
1044 = dfa->init_state_begbuf = dfa->init_state;
1046 re_node_set_free (&init_nodes);
1047 return REG_NOERROR;
1050 #ifdef RE_ENABLE_I18N
1051 /* If it is possible to do searching in single byte encoding instead of UTF-8
1052 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1053 DFA nodes where needed. */
1055 static void
1056 optimize_utf8 (dfa)
1057 re_dfa_t *dfa;
1059 int node, i, mb_chars = 0, has_period = 0;
1061 for (node = 0; node < dfa->nodes_len; ++node)
1062 switch (dfa->nodes[node].type)
1064 case CHARACTER:
1065 if (dfa->nodes[node].opr.c >= 0x80)
1066 mb_chars = 1;
1067 break;
1068 case ANCHOR:
1069 switch (dfa->nodes[node].opr.idx)
1071 case LINE_FIRST:
1072 case LINE_LAST:
1073 case BUF_FIRST:
1074 case BUF_LAST:
1075 break;
1076 default:
1077 /* Word anchors etc. cannot be handled. */
1078 return;
1080 break;
1081 case OP_PERIOD:
1082 has_period = 1;
1083 break;
1084 case OP_BACK_REF:
1085 case OP_ALT:
1086 case END_OF_RE:
1087 case OP_DUP_ASTERISK:
1088 case OP_OPEN_SUBEXP:
1089 case OP_CLOSE_SUBEXP:
1090 break;
1091 case COMPLEX_BRACKET:
1092 return;
1093 case SIMPLE_BRACKET:
1094 /* Just double check. */
1095 for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1096 if (dfa->nodes[node].opr.sbcset[i])
1097 return;
1098 break;
1099 default:
1100 abort ();
1103 if (mb_chars || has_period)
1104 for (node = 0; node < dfa->nodes_len; ++node)
1106 if (dfa->nodes[node].type == CHARACTER
1107 && dfa->nodes[node].opr.c >= 0x80)
1108 dfa->nodes[node].mb_partial = 0;
1109 else if (dfa->nodes[node].type == OP_PERIOD)
1110 dfa->nodes[node].type = OP_UTF8_PERIOD;
1113 /* The search can be in single byte locale. */
1114 dfa->mb_cur_max = 1;
1115 dfa->is_utf8 = 0;
1116 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1118 #endif
1120 /* Analyze the structure tree, and calculate "first", "next", "edest",
1121 "eclosure", and "inveclosure". */
1123 static reg_errcode_t
1124 analyze (preg)
1125 regex_t *preg;
1127 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1128 reg_errcode_t ret;
1130 /* Allocate arrays. */
1131 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1132 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1133 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1134 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1135 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1136 || dfa->eclosures == NULL, 0))
1137 return REG_ESPACE;
1139 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1140 if (dfa->subexp_map != NULL)
1142 int i;
1143 for (i = 0; i < preg->re_nsub; i++)
1144 dfa->subexp_map[i] = i;
1145 preorder (dfa->str_tree, optimize_subexps, dfa);
1146 for (i = 0; i < preg->re_nsub; i++)
1147 if (dfa->subexp_map[i] != i)
1148 break;
1149 if (i == preg->re_nsub)
1151 free (dfa->subexp_map);
1152 dfa->subexp_map = NULL;
1156 ret = postorder (dfa->str_tree, lower_subexps, preg);
1157 if (BE (ret != REG_NOERROR, 0))
1158 return ret;
1159 ret = postorder (dfa->str_tree, calc_first, dfa);
1160 if (BE (ret != REG_NOERROR, 0))
1161 return ret;
1162 preorder (dfa->str_tree, calc_next, dfa);
1163 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1164 if (BE (ret != REG_NOERROR, 0))
1165 return ret;
1166 ret = calc_eclosure (dfa);
1167 if (BE (ret != REG_NOERROR, 0))
1168 return ret;
1170 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1171 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1172 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1173 || dfa->nbackref)
1175 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1176 if (BE (dfa->inveclosures == NULL, 0))
1177 return REG_ESPACE;
1178 ret = calc_inveclosure (dfa);
1181 return ret;
1184 /* Our parse trees are very unbalanced, so we cannot use a stack to
1185 implement parse tree visits. Instead, we use parent pointers and
1186 some hairy code in these two functions. */
1187 static reg_errcode_t
1188 postorder (root, fn, extra)
1189 bin_tree_t *root;
1190 reg_errcode_t (fn (void *, bin_tree_t *));
1191 void *extra;
1193 bin_tree_t *node, *prev;
1195 for (node = root; ; )
1197 /* Descend down the tree, preferably to the left (or to the right
1198 if that's the only child). */
1199 while (node->left || node->right)
1200 if (node->left)
1201 node = node->left;
1202 else
1203 node = node->right;
1207 reg_errcode_t err = fn (extra, node);
1208 if (BE (err != REG_NOERROR, 0))
1209 return err;
1210 if (node->parent == NULL)
1211 return REG_NOERROR;
1212 prev = node;
1213 node = node->parent;
1215 /* Go up while we have a node that is reached from the right. */
1216 while (node->right == prev || node->right == NULL);
1217 node = node->right;
1221 static reg_errcode_t
1222 preorder (root, fn, extra)
1223 bin_tree_t *root;
1224 reg_errcode_t (fn (void *, bin_tree_t *));
1225 void *extra;
1227 bin_tree_t *node;
1229 for (node = root; ; )
1231 reg_errcode_t err = fn (extra, node);
1232 if (BE (err != REG_NOERROR, 0))
1233 return err;
1235 /* Go to the left node, or up and to the right. */
1236 if (node->left)
1237 node = node->left;
1238 else
1240 bin_tree_t *prev = NULL;
1241 while (node->right == prev || node->right == NULL)
1243 prev = node;
1244 node = node->parent;
1245 if (!node)
1246 return REG_NOERROR;
1248 node = node->right;
1253 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1254 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1255 backreferences as well. Requires a preorder visit. */
1256 static reg_errcode_t
1257 optimize_subexps (extra, node)
1258 void *extra;
1259 bin_tree_t *node;
1261 re_dfa_t *dfa = (re_dfa_t *) extra;
1263 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1265 int idx = node->token.opr.idx;
1266 node->token.opr.idx = dfa->subexp_map[idx];
1267 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1270 else if (node->token.type == SUBEXP
1271 && node->left && node->left->token.type == SUBEXP)
1273 int other_idx = node->left->token.opr.idx;
1275 node->left = node->left->left;
1276 if (node->left)
1277 node->left->parent = node;
1279 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1280 if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1281 dfa->used_bkref_map &= ~(1 << other_idx);
1284 return REG_NOERROR;
1287 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1288 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1289 static reg_errcode_t
1290 lower_subexps (extra, node)
1291 void *extra;
1292 bin_tree_t *node;
1294 regex_t *preg = (regex_t *) extra;
1295 reg_errcode_t err = REG_NOERROR;
1297 if (node->left && node->left->token.type == SUBEXP)
1299 node->left = lower_subexp (&err, preg, node->left);
1300 if (node->left)
1301 node->left->parent = node;
1303 if (node->right && node->right->token.type == SUBEXP)
1305 node->right = lower_subexp (&err, preg, node->right);
1306 if (node->right)
1307 node->right->parent = node;
1310 return err;
1313 static bin_tree_t *
1314 lower_subexp (err, preg, node)
1315 reg_errcode_t *err;
1316 regex_t *preg;
1317 bin_tree_t *node;
1319 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1320 bin_tree_t *body = node->left;
1321 bin_tree_t *op, *cls, *tree1, *tree;
1323 if (preg->no_sub
1324 && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1325 || !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1326 return node->left;
1328 /* Convert the SUBEXP node to the concatenation of an
1329 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1330 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1331 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1332 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1333 tree = create_tree (dfa, op, tree1, CONCAT);
1334 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1336 *err = REG_ESPACE;
1337 return NULL;
1340 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1341 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1342 return tree;
1345 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1346 nodes. Requires a postorder visit. */
1347 static reg_errcode_t
1348 calc_first (extra, node)
1349 void *extra;
1350 bin_tree_t *node;
1352 re_dfa_t *dfa = (re_dfa_t *) extra;
1353 if (node->token.type == CONCAT)
1355 node->first = node->left->first;
1356 node->node_idx = node->left->node_idx;
1358 else
1360 node->first = node;
1361 node->node_idx = re_dfa_add_node (dfa, node->token);
1362 if (BE (node->node_idx == -1, 0))
1363 return REG_ESPACE;
1365 return REG_NOERROR;
1368 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1369 static reg_errcode_t
1370 calc_next (extra, node)
1371 void *extra;
1372 bin_tree_t *node;
1374 switch (node->token.type)
1376 case OP_DUP_ASTERISK:
1377 node->left->next = node;
1378 break;
1379 case CONCAT:
1380 node->left->next = node->right->first;
1381 node->right->next = node->next;
1382 break;
1383 default:
1384 if (node->left)
1385 node->left->next = node->next;
1386 if (node->right)
1387 node->right->next = node->next;
1388 break;
1390 return REG_NOERROR;
1393 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1394 static reg_errcode_t
1395 link_nfa_nodes (extra, node)
1396 void *extra;
1397 bin_tree_t *node;
1399 re_dfa_t *dfa = (re_dfa_t *) extra;
1400 int idx = node->node_idx;
1401 reg_errcode_t err = REG_NOERROR;
1403 switch (node->token.type)
1405 case CONCAT:
1406 break;
1408 case END_OF_RE:
1409 assert (node->next == NULL);
1410 break;
1412 case OP_DUP_ASTERISK:
1413 case OP_ALT:
1415 int left, right;
1416 dfa->has_plural_match = 1;
1417 if (node->left != NULL)
1418 left = node->left->first->node_idx;
1419 else
1420 left = node->next->node_idx;
1421 if (node->right != NULL)
1422 right = node->right->first->node_idx;
1423 else
1424 right = node->next->node_idx;
1425 assert (left > -1);
1426 assert (right > -1);
1427 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1429 break;
1431 case ANCHOR:
1432 case OP_OPEN_SUBEXP:
1433 case OP_CLOSE_SUBEXP:
1434 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1435 break;
1437 case OP_BACK_REF:
1438 dfa->nexts[idx] = node->next->node_idx;
1439 if (node->token.type == OP_BACK_REF)
1440 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1441 break;
1443 default:
1444 assert (!IS_EPSILON_NODE (node->token.type));
1445 dfa->nexts[idx] = node->next->node_idx;
1446 break;
1449 return err;
1452 /* Duplicate the epsilon closure of the node ROOT_NODE.
1453 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1454 to their own constraint. */
1456 static reg_errcode_t
1457 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1458 init_constraint)
1459 re_dfa_t *dfa;
1460 int top_org_node, top_clone_node, root_node;
1461 unsigned int init_constraint;
1463 reg_errcode_t err;
1464 int org_node, clone_node, ret;
1465 unsigned int constraint = init_constraint;
1466 for (org_node = top_org_node, clone_node = top_clone_node;;)
1468 int org_dest, clone_dest;
1469 if (dfa->nodes[org_node].type == OP_BACK_REF)
1471 /* If the back reference epsilon-transit, its destination must
1472 also have the constraint. Then duplicate the epsilon closure
1473 of the destination of the back reference, and store it in
1474 edests of the back reference. */
1475 org_dest = dfa->nexts[org_node];
1476 re_node_set_empty (dfa->edests + clone_node);
1477 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1478 if (BE (err != REG_NOERROR, 0))
1479 return err;
1480 dfa->nexts[clone_node] = dfa->nexts[org_node];
1481 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1482 if (BE (ret < 0, 0))
1483 return REG_ESPACE;
1485 else if (dfa->edests[org_node].nelem == 0)
1487 /* In case of the node can't epsilon-transit, don't duplicate the
1488 destination and store the original destination as the
1489 destination of the node. */
1490 dfa->nexts[clone_node] = dfa->nexts[org_node];
1491 break;
1493 else if (dfa->edests[org_node].nelem == 1)
1495 /* In case of the node can epsilon-transit, and it has only one
1496 destination. */
1497 org_dest = dfa->edests[org_node].elems[0];
1498 re_node_set_empty (dfa->edests + clone_node);
1499 if (dfa->nodes[org_node].type == ANCHOR)
1501 /* In case of the node has another constraint, append it. */
1502 if (org_node == root_node && clone_node != org_node)
1504 /* ...but if the node is root_node itself, it means the
1505 epsilon closure have a loop, then tie it to the
1506 destination of the root_node. */
1507 ret = re_node_set_insert (dfa->edests + clone_node,
1508 org_dest);
1509 if (BE (ret < 0, 0))
1510 return REG_ESPACE;
1511 break;
1513 constraint |= dfa->nodes[org_node].opr.ctx_type;
1515 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1516 if (BE (err != REG_NOERROR, 0))
1517 return err;
1518 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1519 if (BE (ret < 0, 0))
1520 return REG_ESPACE;
1522 else /* dfa->edests[org_node].nelem == 2 */
1524 /* In case of the node can epsilon-transit, and it has two
1525 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1526 org_dest = dfa->edests[org_node].elems[0];
1527 re_node_set_empty (dfa->edests + clone_node);
1528 /* Search for a duplicated node which satisfies the constraint. */
1529 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1530 if (clone_dest == -1)
1532 /* There are no such a duplicated node, create a new one. */
1533 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1534 if (BE (err != REG_NOERROR, 0))
1535 return err;
1536 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1537 if (BE (ret < 0, 0))
1538 return REG_ESPACE;
1539 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1540 root_node, constraint);
1541 if (BE (err != REG_NOERROR, 0))
1542 return err;
1544 else
1546 /* There are a duplicated node which satisfy the constraint,
1547 use it to avoid infinite loop. */
1548 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1549 if (BE (ret < 0, 0))
1550 return REG_ESPACE;
1553 org_dest = dfa->edests[org_node].elems[1];
1554 err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1555 if (BE (err != REG_NOERROR, 0))
1556 return err;
1557 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1558 if (BE (ret < 0, 0))
1559 return REG_ESPACE;
1561 org_node = org_dest;
1562 clone_node = clone_dest;
1564 return REG_NOERROR;
1567 /* Search for a node which is duplicated from the node ORG_NODE, and
1568 satisfies the constraint CONSTRAINT. */
1570 static int
1571 search_duplicated_node (dfa, org_node, constraint)
1572 re_dfa_t *dfa;
1573 int org_node;
1574 unsigned int constraint;
1576 int idx;
1577 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1579 if (org_node == dfa->org_indices[idx]
1580 && constraint == dfa->nodes[idx].constraint)
1581 return idx; /* Found. */
1583 return -1; /* Not found. */
1586 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1587 The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1588 otherwise return the error code. */
1590 static reg_errcode_t
1591 duplicate_node (new_idx, dfa, org_idx, constraint)
1592 re_dfa_t *dfa;
1593 int *new_idx, org_idx;
1594 unsigned int constraint;
1596 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1597 if (BE (dup_idx == -1, 0))
1598 return REG_ESPACE;
1599 dfa->nodes[dup_idx].constraint = constraint;
1600 if (dfa->nodes[org_idx].type == ANCHOR)
1601 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1602 dfa->nodes[dup_idx].duplicated = 1;
1604 /* Store the index of the original node. */
1605 dfa->org_indices[dup_idx] = org_idx;
1606 *new_idx = dup_idx;
1607 return REG_NOERROR;
1610 static reg_errcode_t
1611 calc_inveclosure (dfa)
1612 re_dfa_t *dfa;
1614 int src, idx, ret;
1615 for (idx = 0; idx < dfa->nodes_len; ++idx)
1616 re_node_set_init_empty (dfa->inveclosures + idx);
1618 for (src = 0; src < dfa->nodes_len; ++src)
1620 int *elems = dfa->eclosures[src].elems;
1621 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1623 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1624 if (BE (ret == -1, 0))
1625 return REG_ESPACE;
1629 return REG_NOERROR;
1632 /* Calculate "eclosure" for all the node in DFA. */
1634 static reg_errcode_t
1635 calc_eclosure (dfa)
1636 re_dfa_t *dfa;
1638 int node_idx, incomplete;
1639 #ifdef DEBUG
1640 assert (dfa->nodes_len > 0);
1641 #endif
1642 incomplete = 0;
1643 /* For each nodes, calculate epsilon closure. */
1644 for (node_idx = 0; ; ++node_idx)
1646 reg_errcode_t err;
1647 re_node_set eclosure_elem;
1648 if (node_idx == dfa->nodes_len)
1650 if (!incomplete)
1651 break;
1652 incomplete = 0;
1653 node_idx = 0;
1656 #ifdef DEBUG
1657 assert (dfa->eclosures[node_idx].nelem != -1);
1658 #endif
1660 /* If we have already calculated, skip it. */
1661 if (dfa->eclosures[node_idx].nelem != 0)
1662 continue;
1663 /* Calculate epsilon closure of `node_idx'. */
1664 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1665 if (BE (err != REG_NOERROR, 0))
1666 return err;
1668 if (dfa->eclosures[node_idx].nelem == 0)
1670 incomplete = 1;
1671 re_node_set_free (&eclosure_elem);
1674 return REG_NOERROR;
1677 /* Calculate epsilon closure of NODE. */
1679 static reg_errcode_t
1680 calc_eclosure_iter (new_set, dfa, node, root)
1681 re_node_set *new_set;
1682 re_dfa_t *dfa;
1683 int node, root;
1685 reg_errcode_t err;
1686 unsigned int constraint;
1687 int i, incomplete;
1688 re_node_set eclosure;
1689 incomplete = 0;
1690 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1691 if (BE (err != REG_NOERROR, 0))
1692 return err;
1694 /* This indicates that we are calculating this node now.
1695 We reference this value to avoid infinite loop. */
1696 dfa->eclosures[node].nelem = -1;
1698 constraint = ((dfa->nodes[node].type == ANCHOR)
1699 ? dfa->nodes[node].opr.ctx_type : 0);
1700 /* If the current node has constraints, duplicate all nodes.
1701 Since they must inherit the constraints. */
1702 if (constraint
1703 && dfa->edests[node].nelem
1704 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1706 int org_node, cur_node;
1707 org_node = cur_node = node;
1708 err = duplicate_node_closure (dfa, node, node, node, constraint);
1709 if (BE (err != REG_NOERROR, 0))
1710 return err;
1713 /* Expand each epsilon destination nodes. */
1714 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1715 for (i = 0; i < dfa->edests[node].nelem; ++i)
1717 re_node_set eclosure_elem;
1718 int edest = dfa->edests[node].elems[i];
1719 /* If calculating the epsilon closure of `edest' is in progress,
1720 return intermediate result. */
1721 if (dfa->eclosures[edest].nelem == -1)
1723 incomplete = 1;
1724 continue;
1726 /* If we haven't calculated the epsilon closure of `edest' yet,
1727 calculate now. Otherwise use calculated epsilon closure. */
1728 if (dfa->eclosures[edest].nelem == 0)
1730 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1731 if (BE (err != REG_NOERROR, 0))
1732 return err;
1734 else
1735 eclosure_elem = dfa->eclosures[edest];
1736 /* Merge the epsilon closure of `edest'. */
1737 re_node_set_merge (&eclosure, &eclosure_elem);
1738 /* If the epsilon closure of `edest' is incomplete,
1739 the epsilon closure of this node is also incomplete. */
1740 if (dfa->eclosures[edest].nelem == 0)
1742 incomplete = 1;
1743 re_node_set_free (&eclosure_elem);
1747 /* Epsilon closures include itself. */
1748 re_node_set_insert (&eclosure, node);
1749 if (incomplete && !root)
1750 dfa->eclosures[node].nelem = 0;
1751 else
1752 dfa->eclosures[node] = eclosure;
1753 *new_set = eclosure;
1754 return REG_NOERROR;
1757 /* Functions for token which are used in the parser. */
1759 /* Fetch a token from INPUT.
1760 We must not use this function inside bracket expressions. */
1762 static void
1763 fetch_token (result, input, syntax)
1764 re_token_t *result;
1765 re_string_t *input;
1766 reg_syntax_t syntax;
1768 re_string_skip_bytes (input, peek_token (result, input, syntax));
1771 /* Peek a token from INPUT, and return the length of the token.
1772 We must not use this function inside bracket expressions. */
1774 static int
1775 peek_token (token, input, syntax)
1776 re_token_t *token;
1777 re_string_t *input;
1778 reg_syntax_t syntax;
1780 unsigned char c;
1782 if (re_string_eoi (input))
1784 token->type = END_OF_RE;
1785 return 0;
1788 c = re_string_peek_byte (input, 0);
1789 token->opr.c = c;
1791 token->word_char = 0;
1792 #ifdef RE_ENABLE_I18N
1793 token->mb_partial = 0;
1794 if (input->mb_cur_max > 1 &&
1795 !re_string_first_byte (input, re_string_cur_idx (input)))
1797 token->type = CHARACTER;
1798 token->mb_partial = 1;
1799 return 1;
1801 #endif
1802 if (c == '\\')
1804 unsigned char c2;
1805 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1807 token->type = BACK_SLASH;
1808 return 1;
1811 c2 = re_string_peek_byte_case (input, 1);
1812 token->opr.c = c2;
1813 token->type = CHARACTER;
1814 #ifdef RE_ENABLE_I18N
1815 if (input->mb_cur_max > 1)
1817 wint_t wc = re_string_wchar_at (input,
1818 re_string_cur_idx (input) + 1);
1819 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1821 else
1822 #endif
1823 token->word_char = IS_WORD_CHAR (c2) != 0;
1825 switch (c2)
1827 case '|':
1828 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1829 token->type = OP_ALT;
1830 break;
1831 case '1': case '2': case '3': case '4': case '5':
1832 case '6': case '7': case '8': case '9':
1833 if (!(syntax & RE_NO_BK_REFS))
1835 token->type = OP_BACK_REF;
1836 token->opr.idx = c2 - '1';
1838 break;
1839 case '<':
1840 if (!(syntax & RE_NO_GNU_OPS))
1842 token->type = ANCHOR;
1843 token->opr.ctx_type = WORD_FIRST;
1845 break;
1846 case '>':
1847 if (!(syntax & RE_NO_GNU_OPS))
1849 token->type = ANCHOR;
1850 token->opr.ctx_type = WORD_LAST;
1852 break;
1853 case 'b':
1854 if (!(syntax & RE_NO_GNU_OPS))
1856 token->type = ANCHOR;
1857 token->opr.ctx_type = WORD_DELIM;
1859 break;
1860 case 'B':
1861 if (!(syntax & RE_NO_GNU_OPS))
1863 token->type = ANCHOR;
1864 token->opr.ctx_type = NOT_WORD_DELIM;
1866 break;
1867 case 'w':
1868 if (!(syntax & RE_NO_GNU_OPS))
1869 token->type = OP_WORD;
1870 break;
1871 case 'W':
1872 if (!(syntax & RE_NO_GNU_OPS))
1873 token->type = OP_NOTWORD;
1874 break;
1875 case 's':
1876 if (!(syntax & RE_NO_GNU_OPS))
1877 token->type = OP_SPACE;
1878 break;
1879 case 'S':
1880 if (!(syntax & RE_NO_GNU_OPS))
1881 token->type = OP_NOTSPACE;
1882 break;
1883 case '`':
1884 if (!(syntax & RE_NO_GNU_OPS))
1886 token->type = ANCHOR;
1887 token->opr.ctx_type = BUF_FIRST;
1889 break;
1890 case '\'':
1891 if (!(syntax & RE_NO_GNU_OPS))
1893 token->type = ANCHOR;
1894 token->opr.ctx_type = BUF_LAST;
1896 break;
1897 case '(':
1898 if (!(syntax & RE_NO_BK_PARENS))
1899 token->type = OP_OPEN_SUBEXP;
1900 break;
1901 case ')':
1902 if (!(syntax & RE_NO_BK_PARENS))
1903 token->type = OP_CLOSE_SUBEXP;
1904 break;
1905 case '+':
1906 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1907 token->type = OP_DUP_PLUS;
1908 break;
1909 case '?':
1910 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1911 token->type = OP_DUP_QUESTION;
1912 break;
1913 case '{':
1914 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1915 token->type = OP_OPEN_DUP_NUM;
1916 break;
1917 case '}':
1918 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1919 token->type = OP_CLOSE_DUP_NUM;
1920 break;
1921 default:
1922 break;
1924 return 2;
1927 token->type = CHARACTER;
1928 #ifdef RE_ENABLE_I18N
1929 if (input->mb_cur_max > 1)
1931 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1932 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1934 else
1935 #endif
1936 token->word_char = IS_WORD_CHAR (token->opr.c);
1938 switch (c)
1940 case '\n':
1941 if (syntax & RE_NEWLINE_ALT)
1942 token->type = OP_ALT;
1943 break;
1944 case '|':
1945 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1946 token->type = OP_ALT;
1947 break;
1948 case '*':
1949 token->type = OP_DUP_ASTERISK;
1950 break;
1951 case '+':
1952 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1953 token->type = OP_DUP_PLUS;
1954 break;
1955 case '?':
1956 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1957 token->type = OP_DUP_QUESTION;
1958 break;
1959 case '{':
1960 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1961 token->type = OP_OPEN_DUP_NUM;
1962 break;
1963 case '}':
1964 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1965 token->type = OP_CLOSE_DUP_NUM;
1966 break;
1967 case '(':
1968 if (syntax & RE_NO_BK_PARENS)
1969 token->type = OP_OPEN_SUBEXP;
1970 break;
1971 case ')':
1972 if (syntax & RE_NO_BK_PARENS)
1973 token->type = OP_CLOSE_SUBEXP;
1974 break;
1975 case '[':
1976 token->type = OP_OPEN_BRACKET;
1977 break;
1978 case '.':
1979 token->type = OP_PERIOD;
1980 break;
1981 case '^':
1982 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1983 re_string_cur_idx (input) != 0)
1985 char prev = re_string_peek_byte (input, -1);
1986 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1987 break;
1989 token->type = ANCHOR;
1990 token->opr.ctx_type = LINE_FIRST;
1991 break;
1992 case '$':
1993 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1994 re_string_cur_idx (input) + 1 != re_string_length (input))
1996 re_token_t next;
1997 re_string_skip_bytes (input, 1);
1998 peek_token (&next, input, syntax);
1999 re_string_skip_bytes (input, -1);
2000 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2001 break;
2003 token->type = ANCHOR;
2004 token->opr.ctx_type = LINE_LAST;
2005 break;
2006 default:
2007 break;
2009 return 1;
2012 /* Peek a token from INPUT, and return the length of the token.
2013 We must not use this function out of bracket expressions. */
2015 static int
2016 peek_token_bracket (token, input, syntax)
2017 re_token_t *token;
2018 re_string_t *input;
2019 reg_syntax_t syntax;
2021 unsigned char c;
2022 if (re_string_eoi (input))
2024 token->type = END_OF_RE;
2025 return 0;
2027 c = re_string_peek_byte (input, 0);
2028 token->opr.c = c;
2030 #ifdef RE_ENABLE_I18N
2031 if (input->mb_cur_max > 1 &&
2032 !re_string_first_byte (input, re_string_cur_idx (input)))
2034 token->type = CHARACTER;
2035 return 1;
2037 #endif /* RE_ENABLE_I18N */
2039 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2040 && re_string_cur_idx (input) + 1 < re_string_length (input))
2042 /* In this case, '\' escape a character. */
2043 unsigned char c2;
2044 re_string_skip_bytes (input, 1);
2045 c2 = re_string_peek_byte (input, 0);
2046 token->opr.c = c2;
2047 token->type = CHARACTER;
2048 return 1;
2050 if (c == '[') /* '[' is a special char in a bracket exps. */
2052 unsigned char c2;
2053 int token_len;
2054 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2055 c2 = re_string_peek_byte (input, 1);
2056 else
2057 c2 = 0;
2058 token->opr.c = c2;
2059 token_len = 2;
2060 switch (c2)
2062 case '.':
2063 token->type = OP_OPEN_COLL_ELEM;
2064 break;
2065 case '=':
2066 token->type = OP_OPEN_EQUIV_CLASS;
2067 break;
2068 case ':':
2069 if (syntax & RE_CHAR_CLASSES)
2071 token->type = OP_OPEN_CHAR_CLASS;
2072 break;
2074 /* else fall through. */
2075 default:
2076 token->type = CHARACTER;
2077 token->opr.c = c;
2078 token_len = 1;
2079 break;
2081 return token_len;
2083 switch (c)
2085 case '-':
2086 token->type = OP_CHARSET_RANGE;
2087 break;
2088 case ']':
2089 token->type = OP_CLOSE_BRACKET;
2090 break;
2091 case '^':
2092 token->type = OP_NON_MATCH_LIST;
2093 break;
2094 default:
2095 token->type = CHARACTER;
2097 return 1;
2100 /* Functions for parser. */
2102 /* Entry point of the parser.
2103 Parse the regular expression REGEXP and return the structure tree.
2104 If an error is occured, ERR is set by error code, and return NULL.
2105 This function build the following tree, from regular expression <reg_exp>:
2109 <reg_exp> EOR
2111 CAT means concatenation.
2112 EOR means end of regular expression. */
2114 static bin_tree_t *
2115 parse (regexp, preg, syntax, err)
2116 re_string_t *regexp;
2117 regex_t *preg;
2118 reg_syntax_t syntax;
2119 reg_errcode_t *err;
2121 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2122 bin_tree_t *tree, *eor, *root;
2123 re_token_t current_token;
2124 dfa->syntax = syntax;
2125 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2126 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2127 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2128 return NULL;
2129 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2130 if (tree != NULL)
2131 root = create_tree (dfa, tree, eor, CONCAT);
2132 else
2133 root = eor;
2134 if (BE (eor == NULL || root == NULL, 0))
2136 *err = REG_ESPACE;
2137 return NULL;
2139 return root;
2142 /* This function build the following tree, from regular expression
2143 <branch1>|<branch2>:
2147 <branch1> <branch2>
2149 ALT means alternative, which represents the operator `|'. */
2151 static bin_tree_t *
2152 parse_reg_exp (regexp, preg, token, syntax, nest, err)
2153 re_string_t *regexp;
2154 regex_t *preg;
2155 re_token_t *token;
2156 reg_syntax_t syntax;
2157 int nest;
2158 reg_errcode_t *err;
2160 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2161 bin_tree_t *tree, *branch = NULL;
2162 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2163 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2164 return NULL;
2166 while (token->type == OP_ALT)
2168 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2169 if (token->type != OP_ALT && token->type != END_OF_RE
2170 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2172 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2173 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2174 return NULL;
2176 else
2177 branch = NULL;
2178 tree = create_tree (dfa, tree, branch, OP_ALT);
2179 if (BE (tree == NULL, 0))
2181 *err = REG_ESPACE;
2182 return NULL;
2185 return tree;
2188 /* This function build the following tree, from regular expression
2189 <exp1><exp2>:
2193 <exp1> <exp2>
2195 CAT means concatenation. */
2197 static bin_tree_t *
2198 parse_branch (regexp, preg, token, syntax, nest, err)
2199 re_string_t *regexp;
2200 regex_t *preg;
2201 re_token_t *token;
2202 reg_syntax_t syntax;
2203 int nest;
2204 reg_errcode_t *err;
2206 bin_tree_t *tree, *exp;
2207 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2208 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2209 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2210 return NULL;
2212 while (token->type != OP_ALT && token->type != END_OF_RE
2213 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2215 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2216 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2218 return NULL;
2220 if (tree != NULL && exp != NULL)
2222 tree = create_tree (dfa, tree, exp, CONCAT);
2223 if (tree == NULL)
2225 *err = REG_ESPACE;
2226 return NULL;
2229 else if (tree == NULL)
2230 tree = exp;
2231 /* Otherwise exp == NULL, we don't need to create new tree. */
2233 return tree;
2236 /* This function build the following tree, from regular expression a*:
2242 static bin_tree_t *
2243 parse_expression (regexp, preg, token, syntax, nest, err)
2244 re_string_t *regexp;
2245 regex_t *preg;
2246 re_token_t *token;
2247 reg_syntax_t syntax;
2248 int nest;
2249 reg_errcode_t *err;
2251 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2252 bin_tree_t *tree;
2253 switch (token->type)
2255 case CHARACTER:
2256 tree = create_token_tree (dfa, NULL, NULL, token);
2257 if (BE (tree == NULL, 0))
2259 *err = REG_ESPACE;
2260 return NULL;
2262 #ifdef RE_ENABLE_I18N
2263 if (dfa->mb_cur_max > 1)
2265 while (!re_string_eoi (regexp)
2266 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2268 bin_tree_t *mbc_remain;
2269 fetch_token (token, regexp, syntax);
2270 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2271 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2272 if (BE (mbc_remain == NULL || tree == NULL, 0))
2274 *err = REG_ESPACE;
2275 return NULL;
2279 #endif
2280 break;
2281 case OP_OPEN_SUBEXP:
2282 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2283 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2284 return NULL;
2285 break;
2286 case OP_OPEN_BRACKET:
2287 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2288 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2289 return NULL;
2290 break;
2291 case OP_BACK_REF:
2292 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2294 *err = REG_ESUBREG;
2295 return NULL;
2297 dfa->used_bkref_map |= 1 << token->opr.idx;
2298 tree = create_token_tree (dfa, NULL, NULL, token);
2299 if (BE (tree == NULL, 0))
2301 *err = REG_ESPACE;
2302 return NULL;
2304 ++dfa->nbackref;
2305 dfa->has_mb_node = 1;
2306 break;
2307 case OP_OPEN_DUP_NUM:
2308 if (syntax & RE_CONTEXT_INVALID_DUP)
2310 *err = REG_BADRPT;
2311 return NULL;
2313 /* FALLTHROUGH */
2314 case OP_DUP_ASTERISK:
2315 case OP_DUP_PLUS:
2316 case OP_DUP_QUESTION:
2317 if (syntax & RE_CONTEXT_INVALID_OPS)
2319 *err = REG_BADRPT;
2320 return NULL;
2322 else if (syntax & RE_CONTEXT_INDEP_OPS)
2324 fetch_token (token, regexp, syntax);
2325 return parse_expression (regexp, preg, token, syntax, nest, err);
2327 /* else fall through */
2328 case OP_CLOSE_SUBEXP:
2329 if ((token->type == OP_CLOSE_SUBEXP) &&
2330 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2332 *err = REG_ERPAREN;
2333 return NULL;
2335 /* else fall through */
2336 case OP_CLOSE_DUP_NUM:
2337 /* We treat it as a normal character. */
2339 /* Then we can these characters as normal characters. */
2340 token->type = CHARACTER;
2341 /* mb_partial and word_char bits should be initialized already
2342 by peek_token. */
2343 tree = create_token_tree (dfa, NULL, NULL, token);
2344 if (BE (tree == NULL, 0))
2346 *err = REG_ESPACE;
2347 return NULL;
2349 break;
2350 case ANCHOR:
2351 if ((token->opr.ctx_type
2352 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2353 && dfa->word_ops_used == 0)
2354 init_word_char (dfa);
2355 if (token->opr.ctx_type == WORD_DELIM
2356 || token->opr.ctx_type == NOT_WORD_DELIM)
2358 bin_tree_t *tree_first, *tree_last;
2359 if (token->opr.ctx_type == WORD_DELIM)
2361 token->opr.ctx_type = WORD_FIRST;
2362 tree_first = create_token_tree (dfa, NULL, NULL, token);
2363 token->opr.ctx_type = WORD_LAST;
2365 else
2367 token->opr.ctx_type = INSIDE_WORD;
2368 tree_first = create_token_tree (dfa, NULL, NULL, token);
2369 token->opr.ctx_type = INSIDE_NOTWORD;
2371 tree_last = create_token_tree (dfa, NULL, NULL, token);
2372 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2373 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2375 *err = REG_ESPACE;
2376 return NULL;
2379 else
2381 tree = create_token_tree (dfa, NULL, NULL, token);
2382 if (BE (tree == NULL, 0))
2384 *err = REG_ESPACE;
2385 return NULL;
2388 /* We must return here, since ANCHORs can't be followed
2389 by repetition operators.
2390 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2391 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2392 fetch_token (token, regexp, syntax);
2393 return tree;
2394 case OP_PERIOD:
2395 tree = create_token_tree (dfa, NULL, NULL, token);
2396 if (BE (tree == NULL, 0))
2398 *err = REG_ESPACE;
2399 return NULL;
2401 if (dfa->mb_cur_max > 1)
2402 dfa->has_mb_node = 1;
2403 break;
2404 case OP_WORD:
2405 case OP_NOTWORD:
2406 tree = build_charclass_op (dfa, regexp->trans,
2407 (const unsigned char *) "alnum",
2408 (const unsigned char *) "_",
2409 token->type == OP_NOTWORD, err);
2410 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2411 return NULL;
2412 break;
2413 case OP_SPACE:
2414 case OP_NOTSPACE:
2415 tree = build_charclass_op (dfa, regexp->trans,
2416 (const unsigned char *) "space",
2417 (const unsigned char *) "",
2418 token->type == OP_NOTSPACE, err);
2419 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2420 return NULL;
2421 break;
2422 case OP_ALT:
2423 case END_OF_RE:
2424 return NULL;
2425 case BACK_SLASH:
2426 *err = REG_EESCAPE;
2427 return NULL;
2428 default:
2429 /* Must not happen? */
2430 #ifdef DEBUG
2431 assert (0);
2432 #endif
2433 return NULL;
2435 fetch_token (token, regexp, syntax);
2437 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2438 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2440 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2441 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2442 return NULL;
2443 /* In BRE consecutive duplications are not allowed. */
2444 if ((syntax & RE_CONTEXT_INVALID_DUP)
2445 && (token->type == OP_DUP_ASTERISK
2446 || token->type == OP_OPEN_DUP_NUM))
2448 *err = REG_BADRPT;
2449 return NULL;
2453 return tree;
2456 /* This function build the following tree, from regular expression
2457 (<reg_exp>):
2458 SUBEXP
2460 <reg_exp>
2463 static bin_tree_t *
2464 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2465 re_string_t *regexp;
2466 regex_t *preg;
2467 re_token_t *token;
2468 reg_syntax_t syntax;
2469 int nest;
2470 reg_errcode_t *err;
2472 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2473 bin_tree_t *tree;
2474 size_t cur_nsub;
2475 cur_nsub = preg->re_nsub++;
2477 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2479 /* The subexpression may be a null string. */
2480 if (token->type == OP_CLOSE_SUBEXP)
2481 tree = NULL;
2482 else
2484 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2485 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2486 *err = REG_EPAREN;
2487 if (BE (*err != REG_NOERROR, 0))
2488 return NULL;
2490 dfa->completed_bkref_map |= 1 << cur_nsub;
2492 tree = create_tree (dfa, tree, NULL, SUBEXP);
2493 if (BE (tree == NULL, 0))
2495 *err = REG_ESPACE;
2496 return NULL;
2498 tree->token.opr.idx = cur_nsub;
2499 return tree;
2502 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2504 static bin_tree_t *
2505 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2506 bin_tree_t *elem;
2507 re_string_t *regexp;
2508 re_dfa_t *dfa;
2509 re_token_t *token;
2510 reg_syntax_t syntax;
2511 reg_errcode_t *err;
2513 bin_tree_t *tree = NULL, *old_tree = NULL;
2514 int i, start, end, start_idx = re_string_cur_idx (regexp);
2515 re_token_t start_token = *token;
2517 if (token->type == OP_OPEN_DUP_NUM)
2519 end = 0;
2520 start = fetch_number (regexp, token, syntax);
2521 if (start == -1)
2523 if (token->type == CHARACTER && token->opr.c == ',')
2524 start = 0; /* We treat "{,m}" as "{0,m}". */
2525 else
2527 *err = REG_BADBR; /* <re>{} is invalid. */
2528 return NULL;
2531 if (BE (start != -2, 1))
2533 /* We treat "{n}" as "{n,n}". */
2534 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2535 : ((token->type == CHARACTER && token->opr.c == ',')
2536 ? fetch_number (regexp, token, syntax) : -2));
2538 if (BE (start == -2 || end == -2, 0))
2540 /* Invalid sequence. */
2541 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2543 if (token->type == END_OF_RE)
2544 *err = REG_EBRACE;
2545 else
2546 *err = REG_BADBR;
2548 return NULL;
2551 /* If the syntax bit is set, rollback. */
2552 re_string_set_index (regexp, start_idx);
2553 *token = start_token;
2554 token->type = CHARACTER;
2555 /* mb_partial and word_char bits should be already initialized by
2556 peek_token. */
2557 return elem;
2560 if (BE (end != -1 && start > end, 0))
2562 /* First number greater than second. */
2563 *err = REG_BADBR;
2564 return NULL;
2567 else
2569 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2570 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2573 fetch_token (token, regexp, syntax);
2575 if (BE (elem == NULL, 0))
2576 return NULL;
2577 if (BE (start == 0 && end == 0, 0))
2579 postorder (elem, free_tree, NULL);
2580 return NULL;
2583 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2584 if (BE (start > 0, 0))
2586 tree = elem;
2587 for (i = 2; i <= start; ++i)
2589 elem = duplicate_tree (elem, dfa);
2590 tree = create_tree (dfa, tree, elem, CONCAT);
2591 if (BE (elem == NULL || tree == NULL, 0))
2592 goto parse_dup_op_espace;
2595 if (start == end)
2596 return tree;
2598 /* Duplicate ELEM before it is marked optional. */
2599 elem = duplicate_tree (elem, dfa);
2600 old_tree = tree;
2602 else
2603 old_tree = NULL;
2605 if (elem->token.type == SUBEXP)
2606 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2608 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2609 if (BE (tree == NULL, 0))
2610 goto parse_dup_op_espace;
2612 /* This loop is actually executed only when end != -1,
2613 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2614 already created the start+1-th copy. */
2615 for (i = start + 2; i <= end; ++i)
2617 elem = duplicate_tree (elem, dfa);
2618 tree = create_tree (dfa, tree, elem, CONCAT);
2619 if (BE (elem == NULL || tree == NULL, 0))
2620 goto parse_dup_op_espace;
2622 tree = create_tree (dfa, tree, NULL, OP_ALT);
2623 if (BE (tree == NULL, 0))
2624 goto parse_dup_op_espace;
2627 if (old_tree)
2628 tree = create_tree (dfa, old_tree, tree, CONCAT);
2630 return tree;
2632 parse_dup_op_espace:
2633 *err = REG_ESPACE;
2634 return NULL;
2637 /* Size of the names for collating symbol/equivalence_class/character_class.
2638 I'm not sure, but maybe enough. */
2639 #define BRACKET_NAME_BUF_SIZE 32
2641 #ifndef _LIBC
2642 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2643 Build the range expression which starts from START_ELEM, and ends
2644 at END_ELEM. The result are written to MBCSET and SBCSET.
2645 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2646 mbcset->range_ends, is a pointer argument sinse we may
2647 update it. */
2649 static reg_errcode_t
2650 # ifdef RE_ENABLE_I18N
2651 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2652 re_charset_t *mbcset;
2653 int *range_alloc;
2654 # else /* not RE_ENABLE_I18N */
2655 build_range_exp (sbcset, start_elem, end_elem)
2656 # endif /* not RE_ENABLE_I18N */
2657 re_bitset_ptr_t sbcset;
2658 bracket_elem_t *start_elem, *end_elem;
2660 unsigned int start_ch, end_ch;
2661 /* Equivalence Classes and Character Classes can't be a range start/end. */
2662 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2663 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2665 return REG_ERANGE;
2667 /* We can handle no multi character collating elements without libc
2668 support. */
2669 if (BE ((start_elem->type == COLL_SYM
2670 && strlen ((char *) start_elem->opr.name) > 1)
2671 || (end_elem->type == COLL_SYM
2672 && strlen ((char *) end_elem->opr.name) > 1), 0))
2673 return REG_ECOLLATE;
2675 # ifdef RE_ENABLE_I18N
2677 wchar_t wc, start_wc, end_wc;
2678 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2680 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2681 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2682 : 0));
2683 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2684 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2685 : 0));
2686 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2687 ? __btowc (start_ch) : start_elem->opr.wch);
2688 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2689 ? __btowc (end_ch) : end_elem->opr.wch);
2690 if (start_wc == WEOF || end_wc == WEOF)
2691 return REG_ECOLLATE;
2692 cmp_buf[0] = start_wc;
2693 cmp_buf[4] = end_wc;
2694 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2695 return REG_ERANGE;
2697 /* Got valid collation sequence values, add them as a new entry.
2698 However, for !_LIBC we have no collation elements: if the
2699 character set is single byte, the single byte character set
2700 that we build below suffices. parse_bracket_exp passes
2701 no MBCSET if dfa->mb_cur_max == 1. */
2702 if (mbcset)
2704 /* Check the space of the arrays. */
2705 if (BE (*range_alloc == mbcset->nranges, 0))
2707 /* There is not enough space, need realloc. */
2708 wchar_t *new_array_start, *new_array_end;
2709 int new_nranges;
2711 /* +1 in case of mbcset->nranges is 0. */
2712 new_nranges = 2 * mbcset->nranges + 1;
2713 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2714 are NULL if *range_alloc == 0. */
2715 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2716 new_nranges);
2717 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2718 new_nranges);
2720 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2721 return REG_ESPACE;
2723 mbcset->range_starts = new_array_start;
2724 mbcset->range_ends = new_array_end;
2725 *range_alloc = new_nranges;
2728 mbcset->range_starts[mbcset->nranges] = start_wc;
2729 mbcset->range_ends[mbcset->nranges++] = end_wc;
2732 /* Build the table for single byte characters. */
2733 for (wc = 0; wc < SBC_MAX; ++wc)
2735 cmp_buf[2] = wc;
2736 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2737 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2738 bitset_set (sbcset, wc);
2741 # else /* not RE_ENABLE_I18N */
2743 unsigned int ch;
2744 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2745 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2746 : 0));
2747 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2748 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2749 : 0));
2750 if (start_ch > end_ch)
2751 return REG_ERANGE;
2752 /* Build the table for single byte characters. */
2753 for (ch = 0; ch < SBC_MAX; ++ch)
2754 if (start_ch <= ch && ch <= end_ch)
2755 bitset_set (sbcset, ch);
2757 # endif /* not RE_ENABLE_I18N */
2758 return REG_NOERROR;
2760 #endif /* not _LIBC */
2762 #ifndef _LIBC
2763 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2764 Build the collating element which is represented by NAME.
2765 The result are written to MBCSET and SBCSET.
2766 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2767 pointer argument since we may update it. */
2769 static reg_errcode_t
2770 # ifdef RE_ENABLE_I18N
2771 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2772 re_charset_t *mbcset;
2773 int *coll_sym_alloc;
2774 # else /* not RE_ENABLE_I18N */
2775 build_collating_symbol (sbcset, name)
2776 # endif /* not RE_ENABLE_I18N */
2777 re_bitset_ptr_t sbcset;
2778 const unsigned char *name;
2780 size_t name_len = strlen ((const char *) name);
2781 if (BE (name_len != 1, 0))
2782 return REG_ECOLLATE;
2783 else
2785 bitset_set (sbcset, name[0]);
2786 return REG_NOERROR;
2789 #endif /* not _LIBC */
2791 /* This function parse bracket expression like "[abc]", "[a-c]",
2792 "[[.a-a.]]" etc. */
2794 static bin_tree_t *
2795 parse_bracket_exp (regexp, dfa, token, syntax, err)
2796 re_string_t *regexp;
2797 re_dfa_t *dfa;
2798 re_token_t *token;
2799 reg_syntax_t syntax;
2800 reg_errcode_t *err;
2802 #ifdef _LIBC
2803 const unsigned char *collseqmb;
2804 const char *collseqwc;
2805 uint32_t nrules;
2806 int32_t table_size;
2807 const int32_t *symb_table;
2808 const unsigned char *extra;
2810 /* Local function for parse_bracket_exp used in _LIBC environement.
2811 Seek the collating symbol entry correspondings to NAME.
2812 Return the index of the symbol in the SYMB_TABLE. */
2814 auto inline int32_t
2815 __attribute ((always_inline))
2816 seek_collating_symbol_entry (name, name_len)
2817 const unsigned char *name;
2818 size_t name_len;
2820 int32_t hash = elem_hash ((const char *) name, name_len);
2821 int32_t elem = hash % table_size;
2822 int32_t second = hash % (table_size - 2);
2823 while (symb_table[2 * elem] != 0)
2825 /* First compare the hashing value. */
2826 if (symb_table[2 * elem] == hash
2827 /* Compare the length of the name. */
2828 && name_len == extra[symb_table[2 * elem + 1]]
2829 /* Compare the name. */
2830 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2831 name_len) == 0)
2833 /* Yep, this is the entry. */
2834 break;
2837 /* Next entry. */
2838 elem += second;
2840 return elem;
2843 /* Local function for parse_bracket_exp used in _LIBC environement.
2844 Look up the collation sequence value of BR_ELEM.
2845 Return the value if succeeded, UINT_MAX otherwise. */
2847 auto inline unsigned int
2848 __attribute ((always_inline))
2849 lookup_collation_sequence_value (br_elem)
2850 bracket_elem_t *br_elem;
2852 if (br_elem->type == SB_CHAR)
2855 if (MB_CUR_MAX == 1)
2857 if (nrules == 0)
2858 return collseqmb[br_elem->opr.ch];
2859 else
2861 wint_t wc = __btowc (br_elem->opr.ch);
2862 return __collseq_table_lookup (collseqwc, wc);
2865 else if (br_elem->type == MB_CHAR)
2867 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2869 else if (br_elem->type == COLL_SYM)
2871 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2872 if (nrules != 0)
2874 int32_t elem, idx;
2875 elem = seek_collating_symbol_entry (br_elem->opr.name,
2876 sym_name_len);
2877 if (symb_table[2 * elem] != 0)
2879 /* We found the entry. */
2880 idx = symb_table[2 * elem + 1];
2881 /* Skip the name of collating element name. */
2882 idx += 1 + extra[idx];
2883 /* Skip the byte sequence of the collating element. */
2884 idx += 1 + extra[idx];
2885 /* Adjust for the alignment. */
2886 idx = (idx + 3) & ~3;
2887 /* Skip the multibyte collation sequence value. */
2888 idx += sizeof (unsigned int);
2889 /* Skip the wide char sequence of the collating element. */
2890 idx += sizeof (unsigned int) *
2891 (1 + *(unsigned int *) (extra + idx));
2892 /* Return the collation sequence value. */
2893 return *(unsigned int *) (extra + idx);
2895 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2897 /* No valid character. Match it as a single byte
2898 character. */
2899 return collseqmb[br_elem->opr.name[0]];
2902 else if (sym_name_len == 1)
2903 return collseqmb[br_elem->opr.name[0]];
2905 return UINT_MAX;
2908 /* Local function for parse_bracket_exp used in _LIBC environement.
2909 Build the range expression which starts from START_ELEM, and ends
2910 at END_ELEM. The result are written to MBCSET and SBCSET.
2911 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2912 mbcset->range_ends, is a pointer argument sinse we may
2913 update it. */
2915 auto inline reg_errcode_t
2916 __attribute ((always_inline))
2917 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2918 re_charset_t *mbcset;
2919 int *range_alloc;
2920 re_bitset_ptr_t sbcset;
2921 bracket_elem_t *start_elem, *end_elem;
2923 unsigned int ch;
2924 uint32_t start_collseq;
2925 uint32_t end_collseq;
2927 /* Equivalence Classes and Character Classes can't be a range
2928 start/end. */
2929 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2930 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2932 return REG_ERANGE;
2934 start_collseq = lookup_collation_sequence_value (start_elem);
2935 end_collseq = lookup_collation_sequence_value (end_elem);
2936 /* Check start/end collation sequence values. */
2937 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2938 return REG_ECOLLATE;
2939 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2940 return REG_ERANGE;
2942 /* Got valid collation sequence values, add them as a new entry.
2943 However, if we have no collation elements, and the character set
2944 is single byte, the single byte character set that we
2945 build below suffices. */
2946 if (nrules > 0 || dfa->mb_cur_max > 1)
2948 /* Check the space of the arrays. */
2949 if (BE (*range_alloc == mbcset->nranges, 0))
2951 /* There is not enough space, need realloc. */
2952 uint32_t *new_array_start;
2953 uint32_t *new_array_end;
2954 int new_nranges;
2956 /* +1 in case of mbcset->nranges is 0. */
2957 new_nranges = 2 * mbcset->nranges + 1;
2958 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2959 new_nranges);
2960 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2961 new_nranges);
2963 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2964 return REG_ESPACE;
2966 mbcset->range_starts = new_array_start;
2967 mbcset->range_ends = new_array_end;
2968 *range_alloc = new_nranges;
2971 mbcset->range_starts[mbcset->nranges] = start_collseq;
2972 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2975 /* Build the table for single byte characters. */
2976 for (ch = 0; ch < SBC_MAX; ch++)
2978 uint32_t ch_collseq;
2980 if (MB_CUR_MAX == 1)
2982 if (nrules == 0)
2983 ch_collseq = collseqmb[ch];
2984 else
2985 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2986 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2987 bitset_set (sbcset, ch);
2989 return REG_NOERROR;
2992 /* Local function for parse_bracket_exp used in _LIBC environement.
2993 Build the collating element which is represented by NAME.
2994 The result are written to MBCSET and SBCSET.
2995 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2996 pointer argument sinse we may update it. */
2998 auto inline reg_errcode_t
2999 __attribute ((always_inline))
3000 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3001 re_charset_t *mbcset;
3002 int *coll_sym_alloc;
3003 re_bitset_ptr_t sbcset;
3004 const unsigned char *name;
3006 int32_t elem, idx;
3007 size_t name_len = strlen ((const char *) name);
3008 if (nrules != 0)
3010 elem = seek_collating_symbol_entry (name, name_len);
3011 if (symb_table[2 * elem] != 0)
3013 /* We found the entry. */
3014 idx = symb_table[2 * elem + 1];
3015 /* Skip the name of collating element name. */
3016 idx += 1 + extra[idx];
3018 else if (symb_table[2 * elem] == 0 && name_len == 1)
3020 /* No valid character, treat it as a normal
3021 character. */
3022 bitset_set (sbcset, name[0]);
3023 return REG_NOERROR;
3025 else
3026 return REG_ECOLLATE;
3028 /* Got valid collation sequence, add it as a new entry. */
3029 /* Check the space of the arrays. */
3030 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3032 /* Not enough, realloc it. */
3033 /* +1 in case of mbcset->ncoll_syms is 0. */
3034 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3035 /* Use realloc since mbcset->coll_syms is NULL
3036 if *alloc == 0. */
3037 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3038 new_coll_sym_alloc);
3039 if (BE (new_coll_syms == NULL, 0))
3040 return REG_ESPACE;
3041 mbcset->coll_syms = new_coll_syms;
3042 *coll_sym_alloc = new_coll_sym_alloc;
3044 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3045 return REG_NOERROR;
3047 else
3049 if (BE (name_len != 1, 0))
3050 return REG_ECOLLATE;
3051 else
3053 bitset_set (sbcset, name[0]);
3054 return REG_NOERROR;
3058 #endif
3060 re_token_t br_token;
3061 re_bitset_ptr_t sbcset;
3062 #ifdef RE_ENABLE_I18N
3063 re_charset_t *mbcset;
3064 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3065 int equiv_class_alloc = 0, char_class_alloc = 0;
3066 #endif /* not RE_ENABLE_I18N */
3067 int non_match = 0;
3068 bin_tree_t *work_tree;
3069 int token_len;
3070 int first_round = 1;
3071 #ifdef _LIBC
3072 collseqmb = (const unsigned char *)
3073 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3074 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3075 if (nrules)
3078 if (MB_CUR_MAX > 1)
3080 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3081 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3082 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3083 _NL_COLLATE_SYMB_TABLEMB);
3084 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3085 _NL_COLLATE_SYMB_EXTRAMB);
3087 #endif
3088 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3089 #ifdef RE_ENABLE_I18N
3090 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3091 #endif /* RE_ENABLE_I18N */
3092 #ifdef RE_ENABLE_I18N
3093 if (BE (sbcset == NULL || mbcset == NULL, 0))
3094 #else
3095 if (BE (sbcset == NULL, 0))
3096 #endif /* RE_ENABLE_I18N */
3098 *err = REG_ESPACE;
3099 return NULL;
3102 token_len = peek_token_bracket (token, regexp, syntax);
3103 if (BE (token->type == END_OF_RE, 0))
3105 *err = REG_BADPAT;
3106 goto parse_bracket_exp_free_return;
3108 if (token->type == OP_NON_MATCH_LIST)
3110 #ifdef RE_ENABLE_I18N
3111 mbcset->non_match = 1;
3112 #endif /* not RE_ENABLE_I18N */
3113 non_match = 1;
3114 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3115 bitset_set (sbcset, '\0');
3116 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3117 token_len = peek_token_bracket (token, regexp, syntax);
3118 if (BE (token->type == END_OF_RE, 0))
3120 *err = REG_BADPAT;
3121 goto parse_bracket_exp_free_return;
3125 /* We treat the first ']' as a normal character. */
3126 if (token->type == OP_CLOSE_BRACKET)
3127 token->type = CHARACTER;
3129 while (1)
3131 bracket_elem_t start_elem, end_elem;
3132 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3133 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3134 reg_errcode_t ret;
3135 int token_len2 = 0, is_range_exp = 0;
3136 re_token_t token2;
3138 start_elem.opr.name = start_name_buf;
3139 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3140 syntax, first_round);
3141 if (BE (ret != REG_NOERROR, 0))
3143 *err = ret;
3144 goto parse_bracket_exp_free_return;
3146 first_round = 0;
3148 /* Get information about the next token. We need it in any case. */
3149 token_len = peek_token_bracket (token, regexp, syntax);
3151 /* Do not check for ranges if we know they are not allowed. */
3152 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3154 if (BE (token->type == END_OF_RE, 0))
3156 *err = REG_EBRACK;
3157 goto parse_bracket_exp_free_return;
3159 if (token->type == OP_CHARSET_RANGE)
3161 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3162 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3163 if (BE (token2.type == END_OF_RE, 0))
3165 *err = REG_EBRACK;
3166 goto parse_bracket_exp_free_return;
3168 if (token2.type == OP_CLOSE_BRACKET)
3170 /* We treat the last '-' as a normal character. */
3171 re_string_skip_bytes (regexp, -token_len);
3172 token->type = CHARACTER;
3174 else
3175 is_range_exp = 1;
3179 if (is_range_exp == 1)
3181 end_elem.opr.name = end_name_buf;
3182 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3183 dfa, syntax, 1);
3184 if (BE (ret != REG_NOERROR, 0))
3186 *err = ret;
3187 goto parse_bracket_exp_free_return;
3190 token_len = peek_token_bracket (token, regexp, syntax);
3192 #ifdef _LIBC
3193 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3194 &start_elem, &end_elem);
3195 #else
3196 # ifdef RE_ENABLE_I18N
3197 *err = build_range_exp (sbcset,
3198 dfa->mb_cur_max > 1 ? mbcset : NULL,
3199 &range_alloc, &start_elem, &end_elem);
3200 # else
3201 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3202 # endif
3203 #endif /* RE_ENABLE_I18N */
3204 if (BE (*err != REG_NOERROR, 0))
3205 goto parse_bracket_exp_free_return;
3207 else
3209 switch (start_elem.type)
3211 case SB_CHAR:
3212 bitset_set (sbcset, start_elem.opr.ch);
3213 break;
3214 #ifdef RE_ENABLE_I18N
3215 case MB_CHAR:
3216 /* Check whether the array has enough space. */
3217 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3219 wchar_t *new_mbchars;
3220 /* Not enough, realloc it. */
3221 /* +1 in case of mbcset->nmbchars is 0. */
3222 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3223 /* Use realloc since array is NULL if *alloc == 0. */
3224 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3225 mbchar_alloc);
3226 if (BE (new_mbchars == NULL, 0))
3227 goto parse_bracket_exp_espace;
3228 mbcset->mbchars = new_mbchars;
3230 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3231 break;
3232 #endif /* RE_ENABLE_I18N */
3233 case EQUIV_CLASS:
3234 *err = build_equiv_class (sbcset,
3235 #ifdef RE_ENABLE_I18N
3236 mbcset, &equiv_class_alloc,
3237 #endif /* RE_ENABLE_I18N */
3238 start_elem.opr.name);
3239 if (BE (*err != REG_NOERROR, 0))
3240 goto parse_bracket_exp_free_return;
3241 break;
3242 case COLL_SYM:
3243 *err = build_collating_symbol (sbcset,
3244 #ifdef RE_ENABLE_I18N
3245 mbcset, &coll_sym_alloc,
3246 #endif /* RE_ENABLE_I18N */
3247 start_elem.opr.name);
3248 if (BE (*err != REG_NOERROR, 0))
3249 goto parse_bracket_exp_free_return;
3250 break;
3251 case CHAR_CLASS:
3252 *err = build_charclass (regexp->trans, sbcset,
3253 #ifdef RE_ENABLE_I18N
3254 mbcset, &char_class_alloc,
3255 #endif /* RE_ENABLE_I18N */
3256 start_elem.opr.name, syntax);
3257 if (BE (*err != REG_NOERROR, 0))
3258 goto parse_bracket_exp_free_return;
3259 break;
3260 default:
3261 assert (0);
3262 break;
3265 if (BE (token->type == END_OF_RE, 0))
3267 *err = REG_EBRACK;
3268 goto parse_bracket_exp_free_return;
3270 if (token->type == OP_CLOSE_BRACKET)
3271 break;
3274 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3276 /* If it is non-matching list. */
3277 if (non_match)
3278 bitset_not (sbcset);
3280 #ifdef RE_ENABLE_I18N
3281 /* Ensure only single byte characters are set. */
3282 if (dfa->mb_cur_max > 1)
3283 bitset_mask (sbcset, dfa->sb_char);
3285 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3286 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3287 || mbcset->non_match)))
3289 bin_tree_t *mbc_tree;
3290 int sbc_idx;
3291 /* Build a tree for complex bracket. */
3292 dfa->has_mb_node = 1;
3293 br_token.type = COMPLEX_BRACKET;
3294 br_token.opr.mbcset = mbcset;
3295 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3296 if (BE (mbc_tree == NULL, 0))
3297 goto parse_bracket_exp_espace;
3298 for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3299 if (sbcset[sbc_idx])
3300 break;
3301 /* If there are no bits set in sbcset, there is no point
3302 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3303 if (sbc_idx < BITSET_UINTS)
3305 /* Build a tree for simple bracket. */
3306 br_token.type = SIMPLE_BRACKET;
3307 br_token.opr.sbcset = sbcset;
3308 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3309 if (BE (work_tree == NULL, 0))
3310 goto parse_bracket_exp_espace;
3312 /* Then join them by ALT node. */
3313 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3314 if (BE (work_tree == NULL, 0))
3315 goto parse_bracket_exp_espace;
3317 else
3319 re_free (sbcset);
3320 work_tree = mbc_tree;
3323 else
3324 #endif /* not RE_ENABLE_I18N */
3326 #ifdef RE_ENABLE_I18N
3327 free_charset (mbcset);
3328 #endif
3329 /* Build a tree for simple bracket. */
3330 br_token.type = SIMPLE_BRACKET;
3331 br_token.opr.sbcset = sbcset;
3332 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3333 if (BE (work_tree == NULL, 0))
3334 goto parse_bracket_exp_espace;
3336 return work_tree;
3338 parse_bracket_exp_espace:
3339 *err = REG_ESPACE;
3340 parse_bracket_exp_free_return:
3341 re_free (sbcset);
3342 #ifdef RE_ENABLE_I18N
3343 free_charset (mbcset);
3344 #endif /* RE_ENABLE_I18N */
3345 return NULL;
3348 /* Parse an element in the bracket expression. */
3350 static reg_errcode_t
3351 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3352 accept_hyphen)
3353 bracket_elem_t *elem;
3354 re_string_t *regexp;
3355 re_token_t *token;
3356 int token_len;
3357 re_dfa_t *dfa;
3358 reg_syntax_t syntax;
3359 int accept_hyphen;
3361 #ifdef RE_ENABLE_I18N
3362 int cur_char_size;
3363 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3364 if (cur_char_size > 1)
3366 elem->type = MB_CHAR;
3367 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3368 re_string_skip_bytes (regexp, cur_char_size);
3369 return REG_NOERROR;
3371 #endif /* RE_ENABLE_I18N */
3372 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3373 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3374 || token->type == OP_OPEN_EQUIV_CLASS)
3375 return parse_bracket_symbol (elem, regexp, token);
3376 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3378 /* A '-' must only appear as anything but a range indicator before
3379 the closing bracket. Everything else is an error. */
3380 re_token_t token2;
3381 (void) peek_token_bracket (&token2, regexp, syntax);
3382 if (token2.type != OP_CLOSE_BRACKET)
3383 /* The actual error value is not standardized since this whole
3384 case is undefined. But ERANGE makes good sense. */
3385 return REG_ERANGE;
3387 elem->type = SB_CHAR;
3388 elem->opr.ch = token->opr.c;
3389 return REG_NOERROR;
3392 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3393 such as [:<character_class>:], [.<collating_element>.], and
3394 [=<equivalent_class>=]. */
3396 static reg_errcode_t
3397 parse_bracket_symbol (elem, regexp, token)
3398 bracket_elem_t *elem;
3399 re_string_t *regexp;
3400 re_token_t *token;
3402 unsigned char ch, delim = token->opr.c;
3403 int i = 0;
3404 if (re_string_eoi(regexp))
3405 return REG_EBRACK;
3406 for (;; ++i)
3408 if (i >= BRACKET_NAME_BUF_SIZE)
3409 return REG_EBRACK;
3410 if (token->type == OP_OPEN_CHAR_CLASS)
3411 ch = re_string_fetch_byte_case (regexp);
3412 else
3413 ch = re_string_fetch_byte (regexp);
3414 if (re_string_eoi(regexp))
3415 return REG_EBRACK;
3416 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3417 break;
3418 elem->opr.name[i] = ch;
3420 re_string_skip_bytes (regexp, 1);
3421 elem->opr.name[i] = '\0';
3422 switch (token->type)
3424 case OP_OPEN_COLL_ELEM:
3425 elem->type = COLL_SYM;
3426 break;
3427 case OP_OPEN_EQUIV_CLASS:
3428 elem->type = EQUIV_CLASS;
3429 break;
3430 case OP_OPEN_CHAR_CLASS:
3431 elem->type = CHAR_CLASS;
3432 break;
3433 default:
3434 break;
3436 return REG_NOERROR;
3439 /* Helper function for parse_bracket_exp.
3440 Build the equivalence class which is represented by NAME.
3441 The result are written to MBCSET and SBCSET.
3442 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3443 is a pointer argument sinse we may update it. */
3445 static reg_errcode_t
3446 #ifdef RE_ENABLE_I18N
3447 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3448 re_charset_t *mbcset;
3449 int *equiv_class_alloc;
3450 #else /* not RE_ENABLE_I18N */
3451 build_equiv_class (sbcset, name)
3452 #endif /* not RE_ENABLE_I18N */
3453 re_bitset_ptr_t sbcset;
3454 const unsigned char *name;
3456 #if defined _LIBC
3457 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3458 if (nrules != 0)
3460 const int32_t *table, *indirect;
3461 const unsigned char *weights, *extra, *cp;
3462 unsigned char char_buf[2];
3463 int32_t idx1, idx2;
3464 unsigned int ch;
3465 size_t len;
3466 /* This #include defines a local function! */
3467 # include <locale/weight.h>
3468 /* Calculate the index for equivalence class. */
3469 cp = name;
3470 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3471 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3472 _NL_COLLATE_WEIGHTMB);
3473 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3474 _NL_COLLATE_EXTRAMB);
3475 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3476 _NL_COLLATE_INDIRECTMB);
3477 idx1 = findidx (&cp);
3478 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3479 /* This isn't a valid character. */
3480 return REG_ECOLLATE;
3482 /* Build single byte matcing table for this equivalence class. */
3483 char_buf[1] = (unsigned char) '\0';
3484 len = weights[idx1];
3485 for (ch = 0; ch < SBC_MAX; ++ch)
3487 char_buf[0] = ch;
3488 cp = char_buf;
3489 idx2 = findidx (&cp);
3491 idx2 = table[ch];
3493 if (idx2 == 0)
3494 /* This isn't a valid character. */
3495 continue;
3496 if (len == weights[idx2])
3498 int cnt = 0;
3499 while (cnt <= len &&
3500 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3501 ++cnt;
3503 if (cnt > len)
3504 bitset_set (sbcset, ch);
3507 /* Check whether the array has enough space. */
3508 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3510 /* Not enough, realloc it. */
3511 /* +1 in case of mbcset->nequiv_classes is 0. */
3512 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3513 /* Use realloc since the array is NULL if *alloc == 0. */
3514 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3515 int32_t,
3516 new_equiv_class_alloc);
3517 if (BE (new_equiv_classes == NULL, 0))
3518 return REG_ESPACE;
3519 mbcset->equiv_classes = new_equiv_classes;
3520 *equiv_class_alloc = new_equiv_class_alloc;
3522 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3524 else
3525 #endif /* _LIBC */
3527 if (BE (strlen ((const char *) name) != 1, 0))
3528 return REG_ECOLLATE;
3529 bitset_set (sbcset, *name);
3531 return REG_NOERROR;
3534 /* Helper function for parse_bracket_exp.
3535 Build the character class which is represented by NAME.
3536 The result are written to MBCSET and SBCSET.
3537 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3538 is a pointer argument sinse we may update it. */
3540 static reg_errcode_t
3541 #ifdef RE_ENABLE_I18N
3542 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3543 re_charset_t *mbcset;
3544 int *char_class_alloc;
3545 #else /* not RE_ENABLE_I18N */
3546 build_charclass (trans, sbcset, class_name, syntax)
3547 #endif /* not RE_ENABLE_I18N */
3548 unsigned RE_TRANSLATE_TYPE trans;
3549 re_bitset_ptr_t sbcset;
3550 const unsigned char *class_name;
3551 reg_syntax_t syntax;
3553 int i;
3554 const char *name = (const char *) class_name;
3556 /* In case of REG_ICASE "upper" and "lower" match the both of
3557 upper and lower cases. */
3558 if ((syntax & RE_ICASE)
3559 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3560 name = "alpha";
3562 #ifdef RE_ENABLE_I18N
3563 /* Check the space of the arrays. */
3564 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3566 /* Not enough, realloc it. */
3567 /* +1 in case of mbcset->nchar_classes is 0. */
3568 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3569 /* Use realloc since array is NULL if *alloc == 0. */
3570 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3571 new_char_class_alloc);
3572 if (BE (new_char_classes == NULL, 0))
3573 return REG_ESPACE;
3574 mbcset->char_classes = new_char_classes;
3575 *char_class_alloc = new_char_class_alloc;
3577 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3578 #endif /* RE_ENABLE_I18N */
3580 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3581 for (i = 0; i < SBC_MAX; ++i) \
3583 if (ctype_func (i)) \
3585 int ch = trans ? trans[i] : i; \
3586 bitset_set (sbcset, ch); \
3590 if (strcmp (name, "alnum") == 0)
3591 BUILD_CHARCLASS_LOOP (isalnum)
3592 else if (strcmp (name, "cntrl") == 0)
3593 BUILD_CHARCLASS_LOOP (iscntrl)
3594 else if (strcmp (name, "lower") == 0)
3595 BUILD_CHARCLASS_LOOP (islower)
3596 else if (strcmp (name, "space") == 0)
3597 BUILD_CHARCLASS_LOOP (isspace)
3598 else if (strcmp (name, "alpha") == 0)
3599 BUILD_CHARCLASS_LOOP (isalpha)
3600 else if (strcmp (name, "digit") == 0)
3601 BUILD_CHARCLASS_LOOP (isdigit)
3602 else if (strcmp (name, "print") == 0)
3603 BUILD_CHARCLASS_LOOP (isprint)
3604 else if (strcmp (name, "upper") == 0)
3605 BUILD_CHARCLASS_LOOP (isupper)
3606 else if (strcmp (name, "blank") == 0)
3607 BUILD_CHARCLASS_LOOP (isblank)
3608 else if (strcmp (name, "graph") == 0)
3609 BUILD_CHARCLASS_LOOP (isgraph)
3610 else if (strcmp (name, "punct") == 0)
3611 BUILD_CHARCLASS_LOOP (ispunct)
3612 else if (strcmp (name, "xdigit") == 0)
3613 BUILD_CHARCLASS_LOOP (isxdigit)
3614 else
3615 return REG_ECTYPE;
3617 return REG_NOERROR;
3620 static bin_tree_t *
3621 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3622 re_dfa_t *dfa;
3623 unsigned RE_TRANSLATE_TYPE trans;
3624 const unsigned char *class_name;
3625 const unsigned char *extra;
3626 int non_match;
3627 reg_errcode_t *err;
3629 re_bitset_ptr_t sbcset;
3630 #ifdef RE_ENABLE_I18N
3631 re_charset_t *mbcset;
3632 int alloc = 0;
3633 #endif /* not RE_ENABLE_I18N */
3634 reg_errcode_t ret;
3635 re_token_t br_token;
3636 bin_tree_t *tree;
3638 sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3639 #ifdef RE_ENABLE_I18N
3640 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3641 #endif /* RE_ENABLE_I18N */
3643 #ifdef RE_ENABLE_I18N
3644 if (BE (sbcset == NULL || mbcset == NULL, 0))
3645 #else /* not RE_ENABLE_I18N */
3646 if (BE (sbcset == NULL, 0))
3647 #endif /* not RE_ENABLE_I18N */
3649 *err = REG_ESPACE;
3650 return NULL;
3653 if (non_match)
3655 #ifdef RE_ENABLE_I18N
3657 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3658 bitset_set(cset->sbcset, '\0');
3660 mbcset->non_match = 1;
3661 #endif /* not RE_ENABLE_I18N */
3664 /* We don't care the syntax in this case. */
3665 ret = build_charclass (trans, sbcset,
3666 #ifdef RE_ENABLE_I18N
3667 mbcset, &alloc,
3668 #endif /* RE_ENABLE_I18N */
3669 class_name, 0);
3671 if (BE (ret != REG_NOERROR, 0))
3673 re_free (sbcset);
3674 #ifdef RE_ENABLE_I18N
3675 free_charset (mbcset);
3676 #endif /* RE_ENABLE_I18N */
3677 *err = ret;
3678 return NULL;
3680 /* \w match '_' also. */
3681 for (; *extra; extra++)
3682 bitset_set (sbcset, *extra);
3684 /* If it is non-matching list. */
3685 if (non_match)
3686 bitset_not (sbcset);
3688 #ifdef RE_ENABLE_I18N
3689 /* Ensure only single byte characters are set. */
3690 if (dfa->mb_cur_max > 1)
3691 bitset_mask (sbcset, dfa->sb_char);
3692 #endif
3694 /* Build a tree for simple bracket. */
3695 br_token.type = SIMPLE_BRACKET;
3696 br_token.opr.sbcset = sbcset;
3697 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3698 if (BE (tree == NULL, 0))
3699 goto build_word_op_espace;
3701 #ifdef RE_ENABLE_I18N
3702 if (dfa->mb_cur_max > 1)
3704 bin_tree_t *mbc_tree;
3705 /* Build a tree for complex bracket. */
3706 br_token.type = COMPLEX_BRACKET;
3707 br_token.opr.mbcset = mbcset;
3708 dfa->has_mb_node = 1;
3709 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3710 if (BE (mbc_tree == NULL, 0))
3711 goto build_word_op_espace;
3712 /* Then join them by ALT node. */
3713 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3714 if (BE (mbc_tree != NULL, 1))
3715 return tree;
3717 else
3719 free_charset (mbcset);
3720 return tree;
3722 #else /* not RE_ENABLE_I18N */
3723 return tree;
3724 #endif /* not RE_ENABLE_I18N */
3726 build_word_op_espace:
3727 re_free (sbcset);
3728 #ifdef RE_ENABLE_I18N
3729 free_charset (mbcset);
3730 #endif /* RE_ENABLE_I18N */
3731 *err = REG_ESPACE;
3732 return NULL;
3735 /* This is intended for the expressions like "a{1,3}".
3736 Fetch a number from `input', and return the number.
3737 Return -1, if the number field is empty like "{,1}".
3738 Return -2, If an error is occured. */
3740 static int
3741 fetch_number (input, token, syntax)
3742 re_string_t *input;
3743 re_token_t *token;
3744 reg_syntax_t syntax;
3746 int num = -1;
3747 unsigned char c;
3748 while (1)
3750 fetch_token (token, input, syntax);
3751 c = token->opr.c;
3752 if (BE (token->type == END_OF_RE, 0))
3753 return -2;
3754 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3755 break;
3756 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3757 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3758 num = (num > RE_DUP_MAX) ? -2 : num;
3760 return num;
3763 #ifdef RE_ENABLE_I18N
3764 static void
3765 free_charset (re_charset_t *cset)
3767 re_free (cset->mbchars);
3768 # ifdef _LIBC
3769 re_free (cset->coll_syms);
3770 re_free (cset->equiv_classes);
3771 re_free (cset->range_starts);
3772 re_free (cset->range_ends);
3773 # endif
3774 re_free (cset->char_classes);
3775 re_free (cset);
3777 #endif /* RE_ENABLE_I18N */
3779 /* Functions for binary tree operation. */
3781 /* Create a tree node. */
3783 static bin_tree_t *
3784 create_tree (dfa, left, right, type)
3785 re_dfa_t *dfa;
3786 bin_tree_t *left;
3787 bin_tree_t *right;
3788 re_token_type_t type;
3790 re_token_t t;
3791 t.type = type;
3792 return create_token_tree (dfa, left, right, &t);
3795 static bin_tree_t *
3796 create_token_tree (dfa, left, right, token)
3797 re_dfa_t *dfa;
3798 bin_tree_t *left;
3799 bin_tree_t *right;
3800 const re_token_t *token;
3802 bin_tree_t *tree;
3803 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3805 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3807 if (storage == NULL)
3808 return NULL;
3809 storage->next = dfa->str_tree_storage;
3810 dfa->str_tree_storage = storage;
3811 dfa->str_tree_storage_idx = 0;
3813 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3815 tree->parent = NULL;
3816 tree->left = left;
3817 tree->right = right;
3818 tree->token = *token;
3819 tree->token.duplicated = 0;
3820 tree->token.opt_subexp = 0;
3821 tree->first = NULL;
3822 tree->next = NULL;
3823 tree->node_idx = -1;
3825 if (left != NULL)
3826 left->parent = tree;
3827 if (right != NULL)
3828 right->parent = tree;
3829 return tree;
3832 /* Mark the tree SRC as an optional subexpression.
3833 To be called from preorder or postorder. */
3835 static reg_errcode_t
3836 mark_opt_subexp (extra, node)
3837 void *extra;
3838 bin_tree_t *node;
3840 int idx = (int) (long) extra;
3841 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3842 node->token.opt_subexp = 1;
3844 return REG_NOERROR;
3847 /* Free the allocated memory inside NODE. */
3849 static void
3850 free_token (re_token_t *node)
3852 #ifdef RE_ENABLE_I18N
3853 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3854 free_charset (node->opr.mbcset);
3855 else
3856 #endif /* RE_ENABLE_I18N */
3857 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3858 re_free (node->opr.sbcset);
3861 /* Worker function for tree walking. Free the allocated memory inside NODE
3862 and its children. */
3864 static reg_errcode_t
3865 free_tree (void *extra, bin_tree_t *node)
3867 free_token (&node->token);
3868 return REG_NOERROR;
3872 /* Duplicate the node SRC, and return new node. This is a preorder
3873 visit similar to the one implemented by the generic visitor, but
3874 we need more infrastructure to maintain two parallel trees --- so,
3875 it's easier to duplicate. */
3877 static bin_tree_t *
3878 duplicate_tree (root, dfa)
3879 const bin_tree_t *root;
3880 re_dfa_t *dfa;
3882 const bin_tree_t *node;
3883 bin_tree_t *dup_root;
3884 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3886 for (node = root; ; )
3888 /* Create a new tree and link it back to the current parent. */
3889 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3890 if (*p_new == NULL)
3891 return NULL;
3892 (*p_new)->parent = dup_node;
3893 (*p_new)->token.duplicated = 1;
3894 dup_node = *p_new;
3896 /* Go to the left node, or up and to the right. */
3897 if (node->left)
3899 node = node->left;
3900 p_new = &dup_node->left;
3902 else
3904 const bin_tree_t *prev = NULL;
3905 while (node->right == prev || node->right == NULL)
3907 prev = node;
3908 node = node->parent;
3909 dup_node = dup_node->parent;
3910 if (!node)
3911 return dup_root;
3913 node = node->right;
3914 p_new = &dup_node->right;