2.9
[glibc/nacl-glibc.git] / posix / regcomp.c
blob8ba7668e8be18f6de152344307e0abcec0c297f4
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002,2003,2004,2005,2006,2007 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 #ifdef RE_ENABLE_I18N
28 static void free_charset (re_charset_t *cset);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t *preg);
31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 #ifdef RE_ENABLE_I18N
33 static void optimize_utf8 (re_dfa_t *dfa);
34 #endif
35 static reg_errcode_t analyze (regex_t *preg);
36 static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
38 void *extra);
39 static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 bin_tree_t *node);
46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
50 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
51 unsigned int constraint);
52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 int node, int root);
55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56 static int fetch_number (re_string_t *input, re_token_t *token,
57 reg_syntax_t syntax);
58 static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
84 reg_syntax_t syntax,
85 int accept_hyphen);
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 re_string_t *regexp,
88 re_token_t *token);
89 #ifdef RE_ENABLE_I18N
90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 re_charset_t *mbcset,
92 int *equiv_class_alloc,
93 const unsigned char *name);
94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 bitset_t sbcset,
96 re_charset_t *mbcset,
97 int *char_class_alloc,
98 const unsigned char *class_name,
99 reg_syntax_t syntax);
100 #else /* not RE_ENABLE_I18N */
101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 bitset_t sbcset,
105 const unsigned char *class_name,
106 reg_syntax_t syntax);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const unsigned char *class_name,
111 const unsigned char *extra,
112 int non_match, reg_errcode_t *err);
113 static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120 static void free_token (re_token_t *node);
121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
124 /* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
129 const char __re_error_msgid[] attribute_hidden =
131 #define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
133 "\0"
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
136 "\0"
137 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 "\0"
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 "\0"
143 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 "\0"
146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 "\0"
149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 "\0"
152 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 "\0"
155 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 "\0"
158 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 "\0"
161 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 "\0"
164 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 "\0"
167 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 "\0"
170 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 "\0"
173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 "\0"
176 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 "\0"
179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183 const size_t __re_error_msgid_idx[] attribute_hidden =
185 REG_NOERROR_IDX,
186 REG_NOMATCH_IDX,
187 REG_BADPAT_IDX,
188 REG_ECOLLATE_IDX,
189 REG_ECTYPE_IDX,
190 REG_EESCAPE_IDX,
191 REG_ESUBREG_IDX,
192 REG_EBRACK_IDX,
193 REG_EPAREN_IDX,
194 REG_EBRACE_IDX,
195 REG_BADBR_IDX,
196 REG_ERANGE_IDX,
197 REG_ESPACE_IDX,
198 REG_BADRPT_IDX,
199 REG_EEND_IDX,
200 REG_ESIZE_IDX,
201 REG_ERPAREN_IDX
204 /* Entry points for GNU code. */
206 /* re_compile_pattern is the GNU regular expression compiler: it
207 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
208 Returns 0 if the pattern was valid, otherwise an error string.
210 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
211 are set in BUFP on entry. */
213 const char *
214 re_compile_pattern (pattern, length, bufp)
215 const char *pattern;
216 size_t length;
217 struct re_pattern_buffer *bufp;
219 reg_errcode_t ret;
221 /* And GNU code determines whether or not to get register information
222 by passing null for the REGS argument to re_match, etc., not by
223 setting no_sub, unless RE_NO_SUB is set. */
224 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
226 /* Match anchors at newline. */
227 bufp->newline_anchor = 1;
229 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
231 if (!ret)
232 return NULL;
233 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
235 #ifdef _LIBC
236 weak_alias (__re_compile_pattern, re_compile_pattern)
237 #endif
239 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
240 also be assigned to arbitrarily: each pattern buffer stores its own
241 syntax, so it can be changed between regex compilations. */
242 /* This has no initializer because initialized variables in Emacs
243 become read-only after dumping. */
244 reg_syntax_t re_syntax_options;
247 /* Specify the precise syntax of regexps for compilation. This provides
248 for compatibility for various utilities which historically have
249 different, incompatible syntaxes.
251 The argument SYNTAX is a bit mask comprised of the various bits
252 defined in regex.h. We return the old syntax. */
254 reg_syntax_t
255 re_set_syntax (syntax)
256 reg_syntax_t syntax;
258 reg_syntax_t ret = re_syntax_options;
260 re_syntax_options = syntax;
261 return ret;
263 #ifdef _LIBC
264 weak_alias (__re_set_syntax, re_set_syntax)
265 #endif
268 re_compile_fastmap (bufp)
269 struct re_pattern_buffer *bufp;
271 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
272 char *fastmap = bufp->fastmap;
274 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
275 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
276 if (dfa->init_state != dfa->init_state_word)
277 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
278 if (dfa->init_state != dfa->init_state_nl)
279 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
280 if (dfa->init_state != dfa->init_state_begbuf)
281 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
282 bufp->fastmap_accurate = 1;
283 return 0;
285 #ifdef _LIBC
286 weak_alias (__re_compile_fastmap, re_compile_fastmap)
287 #endif
289 static inline void
290 __attribute ((always_inline))
291 re_set_fastmap (char *fastmap, int icase, int ch)
293 fastmap[ch] = 1;
294 if (icase)
295 fastmap[tolower (ch)] = 1;
298 /* Helper function for re_compile_fastmap.
299 Compile fastmap for the initial_state INIT_STATE. */
301 static void
302 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
303 char *fastmap)
305 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
306 int node_cnt;
307 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
308 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
310 int node = init_state->nodes.elems[node_cnt];
311 re_token_type_t type = dfa->nodes[node].type;
313 if (type == CHARACTER)
315 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
316 #ifdef RE_ENABLE_I18N
317 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
319 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
320 wchar_t wc;
321 mbstate_t state;
323 p = buf;
324 *p++ = dfa->nodes[node].opr.c;
325 while (++node < dfa->nodes_len
326 && dfa->nodes[node].type == CHARACTER
327 && dfa->nodes[node].mb_partial)
328 *p++ = dfa->nodes[node].opr.c;
329 memset (&state, '\0', sizeof (state));
330 if (mbrtowc (&wc, (const char *) buf, p - buf,
331 &state) == p - buf
332 && (__wcrtomb ((char *) buf, towlower (wc), &state)
333 != (size_t) -1))
334 re_set_fastmap (fastmap, 0, buf[0]);
336 #endif
338 else if (type == SIMPLE_BRACKET)
340 int i, ch;
341 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
343 int j;
344 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
345 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
346 if (w & ((bitset_word_t) 1 << j))
347 re_set_fastmap (fastmap, icase, ch);
350 #ifdef RE_ENABLE_I18N
351 else if (type == COMPLEX_BRACKET)
353 int i;
354 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
355 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
356 || cset->nranges || cset->nchar_classes)
358 # ifdef _LIBC
359 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
361 /* In this case we want to catch the bytes which are
362 the first byte of any collation elements.
363 e.g. In da_DK, we want to catch 'a' since "aa"
364 is a valid collation element, and don't catch
365 'b' since 'b' is the only collation element
366 which starts from 'b'. */
367 const int32_t *table = (const int32_t *)
368 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
369 for (i = 0; i < SBC_MAX; ++i)
370 if (table[i] < 0)
371 re_set_fastmap (fastmap, icase, i);
373 # else
374 if (dfa->mb_cur_max > 1)
375 for (i = 0; i < SBC_MAX; ++i)
376 if (__btowc (i) == WEOF)
377 re_set_fastmap (fastmap, icase, i);
378 # endif /* not _LIBC */
380 for (i = 0; i < cset->nmbchars; ++i)
382 char buf[256];
383 mbstate_t state;
384 memset (&state, '\0', sizeof (state));
385 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
386 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
387 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
389 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
390 != (size_t) -1)
391 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
395 #endif /* RE_ENABLE_I18N */
396 else if (type == OP_PERIOD
397 #ifdef RE_ENABLE_I18N
398 || type == OP_UTF8_PERIOD
399 #endif /* RE_ENABLE_I18N */
400 || type == END_OF_RE)
402 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
403 if (type == END_OF_RE)
404 bufp->can_be_null = 1;
405 return;
410 /* Entry point for POSIX code. */
411 /* regcomp takes a regular expression as a string and compiles it.
413 PREG is a regex_t *. We do not expect any fields to be initialized,
414 since POSIX says we shouldn't. Thus, we set
416 `buffer' to the compiled pattern;
417 `used' to the length of the compiled pattern;
418 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
419 REG_EXTENDED bit in CFLAGS is set; otherwise, to
420 RE_SYNTAX_POSIX_BASIC;
421 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
422 `fastmap' to an allocated space for the fastmap;
423 `fastmap_accurate' to zero;
424 `re_nsub' to the number of subexpressions in PATTERN.
426 PATTERN is the address of the pattern string.
428 CFLAGS is a series of bits which affect compilation.
430 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
431 use POSIX basic syntax.
433 If REG_NEWLINE is set, then . and [^...] don't match newline.
434 Also, regexec will try a match beginning after every newline.
436 If REG_ICASE is set, then we considers upper- and lowercase
437 versions of letters to be equivalent when matching.
439 If REG_NOSUB is set, then when PREG is passed to regexec, that
440 routine will report only success or failure, and nothing about the
441 registers.
443 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
444 the return codes and their meanings.) */
447 regcomp (preg, pattern, cflags)
448 regex_t *__restrict preg;
449 const char *__restrict pattern;
450 int cflags;
452 reg_errcode_t ret;
453 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
454 : RE_SYNTAX_POSIX_BASIC);
456 preg->buffer = NULL;
457 preg->allocated = 0;
458 preg->used = 0;
460 /* Try to allocate space for the fastmap. */
461 preg->fastmap = re_malloc (char, SBC_MAX);
462 if (BE (preg->fastmap == NULL, 0))
463 return REG_ESPACE;
465 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
467 /* If REG_NEWLINE is set, newlines are treated differently. */
468 if (cflags & REG_NEWLINE)
469 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
470 syntax &= ~RE_DOT_NEWLINE;
471 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
472 /* It also changes the matching behavior. */
473 preg->newline_anchor = 1;
475 else
476 preg->newline_anchor = 0;
477 preg->no_sub = !!(cflags & REG_NOSUB);
478 preg->translate = NULL;
480 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
482 /* POSIX doesn't distinguish between an unmatched open-group and an
483 unmatched close-group: both are REG_EPAREN. */
484 if (ret == REG_ERPAREN)
485 ret = REG_EPAREN;
487 /* We have already checked preg->fastmap != NULL. */
488 if (BE (ret == REG_NOERROR, 1))
489 /* Compute the fastmap now, since regexec cannot modify the pattern
490 buffer. This function never fails in this implementation. */
491 (void) re_compile_fastmap (preg);
492 else
494 /* Some error occurred while compiling the expression. */
495 re_free (preg->fastmap);
496 preg->fastmap = NULL;
499 return (int) ret;
501 #ifdef _LIBC
502 weak_alias (__regcomp, regcomp)
503 #endif
505 /* Returns a message corresponding to an error code, ERRCODE, returned
506 from either regcomp or regexec. We don't use PREG here. */
508 size_t
509 regerror (errcode, preg, errbuf, errbuf_size)
510 int errcode;
511 const regex_t *__restrict preg;
512 char *__restrict errbuf;
513 size_t errbuf_size;
515 const char *msg;
516 size_t msg_size;
518 if (BE (errcode < 0
519 || errcode >= (int) (sizeof (__re_error_msgid_idx)
520 / sizeof (__re_error_msgid_idx[0])), 0))
521 /* Only error codes returned by the rest of the code should be passed
522 to this routine. If we are given anything else, or if other regex
523 code generates an invalid error code, then the program has a bug.
524 Dump core so we can fix it. */
525 abort ();
527 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
529 msg_size = strlen (msg) + 1; /* Includes the null. */
531 if (BE (errbuf_size != 0, 1))
533 if (BE (msg_size > errbuf_size, 0))
535 #if defined HAVE_MEMPCPY || defined _LIBC
536 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
537 #else
538 memcpy (errbuf, msg, errbuf_size - 1);
539 errbuf[errbuf_size - 1] = 0;
540 #endif
542 else
543 memcpy (errbuf, msg, msg_size);
546 return msg_size;
548 #ifdef _LIBC
549 weak_alias (__regerror, regerror)
550 #endif
553 #ifdef RE_ENABLE_I18N
554 /* This static array is used for the map to single-byte characters when
555 UTF-8 is used. Otherwise we would allocate memory just to initialize
556 it the same all the time. UTF-8 is the preferred encoding so this is
557 a worthwhile optimization. */
558 static const bitset_t utf8_sb_map =
560 /* Set the first 128 bits. */
561 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
563 #endif
566 static void
567 free_dfa_content (re_dfa_t *dfa)
569 int i, j;
571 if (dfa->nodes)
572 for (i = 0; i < dfa->nodes_len; ++i)
573 free_token (dfa->nodes + i);
574 re_free (dfa->nexts);
575 for (i = 0; i < dfa->nodes_len; ++i)
577 if (dfa->eclosures != NULL)
578 re_node_set_free (dfa->eclosures + i);
579 if (dfa->inveclosures != NULL)
580 re_node_set_free (dfa->inveclosures + i);
581 if (dfa->edests != NULL)
582 re_node_set_free (dfa->edests + i);
584 re_free (dfa->edests);
585 re_free (dfa->eclosures);
586 re_free (dfa->inveclosures);
587 re_free (dfa->nodes);
589 if (dfa->state_table)
590 for (i = 0; i <= dfa->state_hash_mask; ++i)
592 struct re_state_table_entry *entry = dfa->state_table + i;
593 for (j = 0; j < entry->num; ++j)
595 re_dfastate_t *state = entry->array[j];
596 free_state (state);
598 re_free (entry->array);
600 re_free (dfa->state_table);
601 #ifdef RE_ENABLE_I18N
602 if (dfa->sb_char != utf8_sb_map)
603 re_free (dfa->sb_char);
604 #endif
605 re_free (dfa->subexp_map);
606 #ifdef DEBUG
607 re_free (dfa->re_str);
608 #endif
610 re_free (dfa);
614 /* Free dynamically allocated space used by PREG. */
616 void
617 regfree (preg)
618 regex_t *preg;
620 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
621 if (BE (dfa != NULL, 1))
622 free_dfa_content (dfa);
623 preg->buffer = NULL;
624 preg->allocated = 0;
626 re_free (preg->fastmap);
627 preg->fastmap = NULL;
629 re_free (preg->translate);
630 preg->translate = NULL;
632 #ifdef _LIBC
633 weak_alias (__regfree, regfree)
634 #endif
636 /* Entry points compatible with 4.2 BSD regex library. We don't define
637 them unless specifically requested. */
639 #if defined _REGEX_RE_COMP || defined _LIBC
641 /* BSD has one and only one pattern buffer. */
642 static struct re_pattern_buffer re_comp_buf;
644 char *
645 # ifdef _LIBC
646 /* Make these definitions weak in libc, so POSIX programs can redefine
647 these names if they don't use our functions, and still use
648 regcomp/regexec above without link errors. */
649 weak_function
650 # endif
651 re_comp (s)
652 const char *s;
654 reg_errcode_t ret;
655 char *fastmap;
657 if (!s)
659 if (!re_comp_buf.buffer)
660 return gettext ("No previous regular expression");
661 return 0;
664 if (re_comp_buf.buffer)
666 fastmap = re_comp_buf.fastmap;
667 re_comp_buf.fastmap = NULL;
668 __regfree (&re_comp_buf);
669 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
670 re_comp_buf.fastmap = fastmap;
673 if (re_comp_buf.fastmap == NULL)
675 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
676 if (re_comp_buf.fastmap == NULL)
677 return (char *) gettext (__re_error_msgid
678 + __re_error_msgid_idx[(int) REG_ESPACE]);
681 /* Since `re_exec' always passes NULL for the `regs' argument, we
682 don't need to initialize the pattern buffer fields which affect it. */
684 /* Match anchors at newlines. */
685 re_comp_buf.newline_anchor = 1;
687 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
689 if (!ret)
690 return NULL;
692 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
693 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
696 #ifdef _LIBC
697 libc_freeres_fn (free_mem)
699 __regfree (&re_comp_buf);
701 #endif
703 #endif /* _REGEX_RE_COMP */
705 /* Internal entry point.
706 Compile the regular expression PATTERN, whose length is LENGTH.
707 SYNTAX indicate regular expression's syntax. */
709 static reg_errcode_t
710 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
711 reg_syntax_t syntax)
713 reg_errcode_t err = REG_NOERROR;
714 re_dfa_t *dfa;
715 re_string_t regexp;
717 /* Initialize the pattern buffer. */
718 preg->fastmap_accurate = 0;
719 preg->syntax = syntax;
720 preg->not_bol = preg->not_eol = 0;
721 preg->used = 0;
722 preg->re_nsub = 0;
723 preg->can_be_null = 0;
724 preg->regs_allocated = REGS_UNALLOCATED;
726 /* Initialize the dfa. */
727 dfa = (re_dfa_t *) preg->buffer;
728 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
730 /* If zero allocated, but buffer is non-null, try to realloc
731 enough space. This loses if buffer's address is bogus, but
732 that is the user's responsibility. If ->buffer is NULL this
733 is a simple allocation. */
734 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
735 if (dfa == NULL)
736 return REG_ESPACE;
737 preg->allocated = sizeof (re_dfa_t);
738 preg->buffer = (unsigned char *) dfa;
740 preg->used = sizeof (re_dfa_t);
742 err = init_dfa (dfa, length);
743 if (BE (err != REG_NOERROR, 0))
745 free_dfa_content (dfa);
746 preg->buffer = NULL;
747 preg->allocated = 0;
748 return err;
750 #ifdef DEBUG
751 /* Note: length+1 will not overflow since it is checked in init_dfa. */
752 dfa->re_str = re_malloc (char, length + 1);
753 strncpy (dfa->re_str, pattern, length + 1);
754 #endif
756 __libc_lock_init (dfa->lock);
758 err = re_string_construct (&regexp, pattern, length, preg->translate,
759 syntax & RE_ICASE, dfa);
760 if (BE (err != REG_NOERROR, 0))
762 re_compile_internal_free_return:
763 free_workarea_compile (preg);
764 re_string_destruct (&regexp);
765 free_dfa_content (dfa);
766 preg->buffer = NULL;
767 preg->allocated = 0;
768 return err;
771 /* Parse the regular expression, and build a structure tree. */
772 preg->re_nsub = 0;
773 dfa->str_tree = parse (&regexp, preg, syntax, &err);
774 if (BE (dfa->str_tree == NULL, 0))
775 goto re_compile_internal_free_return;
777 /* Analyze the tree and create the nfa. */
778 err = analyze (preg);
779 if (BE (err != REG_NOERROR, 0))
780 goto re_compile_internal_free_return;
782 #ifdef RE_ENABLE_I18N
783 /* If possible, do searching in single byte encoding to speed things up. */
784 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
785 optimize_utf8 (dfa);
786 #endif
788 /* Then create the initial state of the dfa. */
789 err = create_initial_state (dfa);
791 /* Release work areas. */
792 free_workarea_compile (preg);
793 re_string_destruct (&regexp);
795 if (BE (err != REG_NOERROR, 0))
797 free_dfa_content (dfa);
798 preg->buffer = NULL;
799 preg->allocated = 0;
802 return err;
805 /* Initialize DFA. We use the length of the regular expression PAT_LEN
806 as the initial length of some arrays. */
808 static reg_errcode_t
809 init_dfa (re_dfa_t *dfa, size_t pat_len)
811 unsigned int table_size;
812 #ifndef _LIBC
813 char *codeset_name;
814 #endif
816 memset (dfa, '\0', sizeof (re_dfa_t));
818 /* Force allocation of str_tree_storage the first time. */
819 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
821 /* Avoid overflows. */
822 if (pat_len == SIZE_MAX)
823 return REG_ESPACE;
825 dfa->nodes_alloc = pat_len + 1;
826 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
828 /* table_size = 2 ^ ceil(log pat_len) */
829 for (table_size = 1; ; table_size <<= 1)
830 if (table_size > pat_len)
831 break;
833 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
834 dfa->state_hash_mask = table_size - 1;
836 dfa->mb_cur_max = MB_CUR_MAX;
837 #ifdef _LIBC
838 if (dfa->mb_cur_max == 6
839 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
840 dfa->is_utf8 = 1;
841 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
842 != 0);
843 #else
844 # ifdef HAVE_LANGINFO_CODESET
845 codeset_name = nl_langinfo (CODESET);
846 # else
847 codeset_name = getenv ("LC_ALL");
848 if (codeset_name == NULL || codeset_name[0] == '\0')
849 codeset_name = getenv ("LC_CTYPE");
850 if (codeset_name == NULL || codeset_name[0] == '\0')
851 codeset_name = getenv ("LANG");
852 if (codeset_name == NULL)
853 codeset_name = "";
854 else if (strchr (codeset_name, '.') != NULL)
855 codeset_name = strchr (codeset_name, '.') + 1;
856 # endif
858 if (strcasecmp (codeset_name, "UTF-8") == 0
859 || strcasecmp (codeset_name, "UTF8") == 0)
860 dfa->is_utf8 = 1;
862 /* We check exhaustively in the loop below if this charset is a
863 superset of ASCII. */
864 dfa->map_notascii = 0;
865 #endif
867 #ifdef RE_ENABLE_I18N
868 if (dfa->mb_cur_max > 1)
870 if (dfa->is_utf8)
871 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
872 else
874 int i, j, ch;
876 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
877 if (BE (dfa->sb_char == NULL, 0))
878 return REG_ESPACE;
880 /* Set the bits corresponding to single byte chars. */
881 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
882 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
884 wint_t wch = __btowc (ch);
885 if (wch != WEOF)
886 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
887 # ifndef _LIBC
888 if (isascii (ch) && wch != ch)
889 dfa->map_notascii = 1;
890 # endif
894 #endif
896 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
897 return REG_ESPACE;
898 return REG_NOERROR;
901 /* Initialize WORD_CHAR table, which indicate which character is
902 "word". In this case "word" means that it is the word construction
903 character used by some operators like "\<", "\>", etc. */
905 static void
906 internal_function
907 init_word_char (re_dfa_t *dfa)
909 int i, j, ch;
910 dfa->word_ops_used = 1;
911 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
912 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
913 if (isalnum (ch) || ch == '_')
914 dfa->word_char[i] |= (bitset_word_t) 1 << j;
917 /* Free the work area which are only used while compiling. */
919 static void
920 free_workarea_compile (regex_t *preg)
922 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
923 bin_tree_storage_t *storage, *next;
924 for (storage = dfa->str_tree_storage; storage; storage = next)
926 next = storage->next;
927 re_free (storage);
929 dfa->str_tree_storage = NULL;
930 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
931 dfa->str_tree = NULL;
932 re_free (dfa->org_indices);
933 dfa->org_indices = NULL;
936 /* Create initial states for all contexts. */
938 static reg_errcode_t
939 create_initial_state (re_dfa_t *dfa)
941 int first, i;
942 reg_errcode_t err;
943 re_node_set init_nodes;
945 /* Initial states have the epsilon closure of the node which is
946 the first node of the regular expression. */
947 first = dfa->str_tree->first->node_idx;
948 dfa->init_node = first;
949 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
950 if (BE (err != REG_NOERROR, 0))
951 return err;
953 /* The back-references which are in initial states can epsilon transit,
954 since in this case all of the subexpressions can be null.
955 Then we add epsilon closures of the nodes which are the next nodes of
956 the back-references. */
957 if (dfa->nbackref > 0)
958 for (i = 0; i < init_nodes.nelem; ++i)
960 int node_idx = init_nodes.elems[i];
961 re_token_type_t type = dfa->nodes[node_idx].type;
963 int clexp_idx;
964 if (type != OP_BACK_REF)
965 continue;
966 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
968 re_token_t *clexp_node;
969 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
970 if (clexp_node->type == OP_CLOSE_SUBEXP
971 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
972 break;
974 if (clexp_idx == init_nodes.nelem)
975 continue;
977 if (type == OP_BACK_REF)
979 int dest_idx = dfa->edests[node_idx].elems[0];
980 if (!re_node_set_contains (&init_nodes, dest_idx))
982 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
983 i = 0;
988 /* It must be the first time to invoke acquire_state. */
989 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
990 /* We don't check ERR here, since the initial state must not be NULL. */
991 if (BE (dfa->init_state == NULL, 0))
992 return err;
993 if (dfa->init_state->has_constraint)
995 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
996 CONTEXT_WORD);
997 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
998 CONTEXT_NEWLINE);
999 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1000 &init_nodes,
1001 CONTEXT_NEWLINE
1002 | CONTEXT_BEGBUF);
1003 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1004 || dfa->init_state_begbuf == NULL, 0))
1005 return err;
1007 else
1008 dfa->init_state_word = dfa->init_state_nl
1009 = dfa->init_state_begbuf = dfa->init_state;
1011 re_node_set_free (&init_nodes);
1012 return REG_NOERROR;
1015 #ifdef RE_ENABLE_I18N
1016 /* If it is possible to do searching in single byte encoding instead of UTF-8
1017 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1018 DFA nodes where needed. */
1020 static void
1021 optimize_utf8 (re_dfa_t *dfa)
1023 int node, i, mb_chars = 0, has_period = 0;
1025 for (node = 0; node < dfa->nodes_len; ++node)
1026 switch (dfa->nodes[node].type)
1028 case CHARACTER:
1029 if (dfa->nodes[node].opr.c >= 0x80)
1030 mb_chars = 1;
1031 break;
1032 case ANCHOR:
1033 switch (dfa->nodes[node].opr.ctx_type)
1035 case LINE_FIRST:
1036 case LINE_LAST:
1037 case BUF_FIRST:
1038 case BUF_LAST:
1039 break;
1040 default:
1041 /* Word anchors etc. cannot be handled. It's okay to test
1042 opr.ctx_type since constraints (for all DFA nodes) are
1043 created by ORing one or more opr.ctx_type values. */
1044 return;
1046 break;
1047 case OP_PERIOD:
1048 has_period = 1;
1049 break;
1050 case OP_BACK_REF:
1051 case OP_ALT:
1052 case END_OF_RE:
1053 case OP_DUP_ASTERISK:
1054 case OP_OPEN_SUBEXP:
1055 case OP_CLOSE_SUBEXP:
1056 break;
1057 case COMPLEX_BRACKET:
1058 return;
1059 case SIMPLE_BRACKET:
1060 /* Just double check. The non-ASCII range starts at 0x80. */
1061 assert (0x80 % BITSET_WORD_BITS == 0);
1062 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1063 if (dfa->nodes[node].opr.sbcset[i])
1064 return;
1065 break;
1066 default:
1067 abort ();
1070 if (mb_chars || has_period)
1071 for (node = 0; node < dfa->nodes_len; ++node)
1073 if (dfa->nodes[node].type == CHARACTER
1074 && dfa->nodes[node].opr.c >= 0x80)
1075 dfa->nodes[node].mb_partial = 0;
1076 else if (dfa->nodes[node].type == OP_PERIOD)
1077 dfa->nodes[node].type = OP_UTF8_PERIOD;
1080 /* The search can be in single byte locale. */
1081 dfa->mb_cur_max = 1;
1082 dfa->is_utf8 = 0;
1083 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1085 #endif
1087 /* Analyze the structure tree, and calculate "first", "next", "edest",
1088 "eclosure", and "inveclosure". */
1090 static reg_errcode_t
1091 analyze (regex_t *preg)
1093 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1094 reg_errcode_t ret;
1096 /* Allocate arrays. */
1097 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1098 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1099 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1100 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1101 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1102 || dfa->eclosures == NULL, 0))
1103 return REG_ESPACE;
1105 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1106 if (dfa->subexp_map != NULL)
1108 int i;
1109 for (i = 0; i < preg->re_nsub; i++)
1110 dfa->subexp_map[i] = i;
1111 preorder (dfa->str_tree, optimize_subexps, dfa);
1112 for (i = 0; i < preg->re_nsub; i++)
1113 if (dfa->subexp_map[i] != i)
1114 break;
1115 if (i == preg->re_nsub)
1117 free (dfa->subexp_map);
1118 dfa->subexp_map = NULL;
1122 ret = postorder (dfa->str_tree, lower_subexps, preg);
1123 if (BE (ret != REG_NOERROR, 0))
1124 return ret;
1125 ret = postorder (dfa->str_tree, calc_first, dfa);
1126 if (BE (ret != REG_NOERROR, 0))
1127 return ret;
1128 preorder (dfa->str_tree, calc_next, dfa);
1129 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1130 if (BE (ret != REG_NOERROR, 0))
1131 return ret;
1132 ret = calc_eclosure (dfa);
1133 if (BE (ret != REG_NOERROR, 0))
1134 return ret;
1136 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1137 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1138 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1139 || dfa->nbackref)
1141 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1142 if (BE (dfa->inveclosures == NULL, 0))
1143 return REG_ESPACE;
1144 ret = calc_inveclosure (dfa);
1147 return ret;
1150 /* Our parse trees are very unbalanced, so we cannot use a stack to
1151 implement parse tree visits. Instead, we use parent pointers and
1152 some hairy code in these two functions. */
1153 static reg_errcode_t
1154 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1155 void *extra)
1157 bin_tree_t *node, *prev;
1159 for (node = root; ; )
1161 /* Descend down the tree, preferably to the left (or to the right
1162 if that's the only child). */
1163 while (node->left || node->right)
1164 if (node->left)
1165 node = node->left;
1166 else
1167 node = node->right;
1171 reg_errcode_t err = fn (extra, node);
1172 if (BE (err != REG_NOERROR, 0))
1173 return err;
1174 if (node->parent == NULL)
1175 return REG_NOERROR;
1176 prev = node;
1177 node = node->parent;
1179 /* Go up while we have a node that is reached from the right. */
1180 while (node->right == prev || node->right == NULL);
1181 node = node->right;
1185 static reg_errcode_t
1186 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1187 void *extra)
1189 bin_tree_t *node;
1191 for (node = root; ; )
1193 reg_errcode_t err = fn (extra, node);
1194 if (BE (err != REG_NOERROR, 0))
1195 return err;
1197 /* Go to the left node, or up and to the right. */
1198 if (node->left)
1199 node = node->left;
1200 else
1202 bin_tree_t *prev = NULL;
1203 while (node->right == prev || node->right == NULL)
1205 prev = node;
1206 node = node->parent;
1207 if (!node)
1208 return REG_NOERROR;
1210 node = node->right;
1215 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1216 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1217 backreferences as well. Requires a preorder visit. */
1218 static reg_errcode_t
1219 optimize_subexps (void *extra, bin_tree_t *node)
1221 re_dfa_t *dfa = (re_dfa_t *) extra;
1223 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1225 int idx = node->token.opr.idx;
1226 node->token.opr.idx = dfa->subexp_map[idx];
1227 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1230 else if (node->token.type == SUBEXP
1231 && node->left && node->left->token.type == SUBEXP)
1233 int other_idx = node->left->token.opr.idx;
1235 node->left = node->left->left;
1236 if (node->left)
1237 node->left->parent = node;
1239 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1240 if (other_idx < BITSET_WORD_BITS)
1241 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1244 return REG_NOERROR;
1247 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1248 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1249 static reg_errcode_t
1250 lower_subexps (void *extra, bin_tree_t *node)
1252 regex_t *preg = (regex_t *) extra;
1253 reg_errcode_t err = REG_NOERROR;
1255 if (node->left && node->left->token.type == SUBEXP)
1257 node->left = lower_subexp (&err, preg, node->left);
1258 if (node->left)
1259 node->left->parent = node;
1261 if (node->right && node->right->token.type == SUBEXP)
1263 node->right = lower_subexp (&err, preg, node->right);
1264 if (node->right)
1265 node->right->parent = node;
1268 return err;
1271 static bin_tree_t *
1272 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1274 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1275 bin_tree_t *body = node->left;
1276 bin_tree_t *op, *cls, *tree1, *tree;
1278 if (preg->no_sub
1279 /* We do not optimize empty subexpressions, because otherwise we may
1280 have bad CONCAT nodes with NULL children. This is obviously not
1281 very common, so we do not lose much. An example that triggers
1282 this case is the sed "script" /\(\)/x. */
1283 && node->left != NULL
1284 && (node->token.opr.idx >= BITSET_WORD_BITS
1285 || !(dfa->used_bkref_map
1286 & ((bitset_word_t) 1 << node->token.opr.idx))))
1287 return node->left;
1289 /* Convert the SUBEXP node to the concatenation of an
1290 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1291 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1292 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1293 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1294 tree = create_tree (dfa, op, tree1, CONCAT);
1295 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1297 *err = REG_ESPACE;
1298 return NULL;
1301 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1302 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1303 return tree;
1306 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1307 nodes. Requires a postorder visit. */
1308 static reg_errcode_t
1309 calc_first (void *extra, bin_tree_t *node)
1311 re_dfa_t *dfa = (re_dfa_t *) extra;
1312 if (node->token.type == CONCAT)
1314 node->first = node->left->first;
1315 node->node_idx = node->left->node_idx;
1317 else
1319 node->first = node;
1320 node->node_idx = re_dfa_add_node (dfa, node->token);
1321 if (BE (node->node_idx == -1, 0))
1322 return REG_ESPACE;
1323 if (node->token.type == ANCHOR)
1324 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1326 return REG_NOERROR;
1329 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1330 static reg_errcode_t
1331 calc_next (void *extra, bin_tree_t *node)
1333 switch (node->token.type)
1335 case OP_DUP_ASTERISK:
1336 node->left->next = node;
1337 break;
1338 case CONCAT:
1339 node->left->next = node->right->first;
1340 node->right->next = node->next;
1341 break;
1342 default:
1343 if (node->left)
1344 node->left->next = node->next;
1345 if (node->right)
1346 node->right->next = node->next;
1347 break;
1349 return REG_NOERROR;
1352 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1353 static reg_errcode_t
1354 link_nfa_nodes (void *extra, bin_tree_t *node)
1356 re_dfa_t *dfa = (re_dfa_t *) extra;
1357 int idx = node->node_idx;
1358 reg_errcode_t err = REG_NOERROR;
1360 switch (node->token.type)
1362 case CONCAT:
1363 break;
1365 case END_OF_RE:
1366 assert (node->next == NULL);
1367 break;
1369 case OP_DUP_ASTERISK:
1370 case OP_ALT:
1372 int left, right;
1373 dfa->has_plural_match = 1;
1374 if (node->left != NULL)
1375 left = node->left->first->node_idx;
1376 else
1377 left = node->next->node_idx;
1378 if (node->right != NULL)
1379 right = node->right->first->node_idx;
1380 else
1381 right = node->next->node_idx;
1382 assert (left > -1);
1383 assert (right > -1);
1384 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1386 break;
1388 case ANCHOR:
1389 case OP_OPEN_SUBEXP:
1390 case OP_CLOSE_SUBEXP:
1391 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1392 break;
1394 case OP_BACK_REF:
1395 dfa->nexts[idx] = node->next->node_idx;
1396 if (node->token.type == OP_BACK_REF)
1397 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1398 break;
1400 default:
1401 assert (!IS_EPSILON_NODE (node->token.type));
1402 dfa->nexts[idx] = node->next->node_idx;
1403 break;
1406 return err;
1409 /* Duplicate the epsilon closure of the node ROOT_NODE.
1410 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1411 to their own constraint. */
1413 static reg_errcode_t
1414 internal_function
1415 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1416 int root_node, unsigned int init_constraint)
1418 int org_node, clone_node, ret;
1419 unsigned int constraint = init_constraint;
1420 for (org_node = top_org_node, clone_node = top_clone_node;;)
1422 int org_dest, clone_dest;
1423 if (dfa->nodes[org_node].type == OP_BACK_REF)
1425 /* If the back reference epsilon-transit, its destination must
1426 also have the constraint. Then duplicate the epsilon closure
1427 of the destination of the back reference, and store it in
1428 edests of the back reference. */
1429 org_dest = dfa->nexts[org_node];
1430 re_node_set_empty (dfa->edests + clone_node);
1431 clone_dest = duplicate_node (dfa, org_dest, constraint);
1432 if (BE (clone_dest == -1, 0))
1433 return REG_ESPACE;
1434 dfa->nexts[clone_node] = dfa->nexts[org_node];
1435 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1436 if (BE (ret < 0, 0))
1437 return REG_ESPACE;
1439 else if (dfa->edests[org_node].nelem == 0)
1441 /* In case of the node can't epsilon-transit, don't duplicate the
1442 destination and store the original destination as the
1443 destination of the node. */
1444 dfa->nexts[clone_node] = dfa->nexts[org_node];
1445 break;
1447 else if (dfa->edests[org_node].nelem == 1)
1449 /* In case of the node can epsilon-transit, and it has only one
1450 destination. */
1451 org_dest = dfa->edests[org_node].elems[0];
1452 re_node_set_empty (dfa->edests + clone_node);
1453 /* If the node is root_node itself, it means the epsilon clsoure
1454 has a loop. Then tie it to the destination of the root_node. */
1455 if (org_node == root_node && clone_node != org_node)
1457 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1458 if (BE (ret < 0, 0))
1459 return REG_ESPACE;
1460 break;
1462 /* In case of the node has another constraint, add it. */
1463 constraint |= dfa->nodes[org_node].constraint;
1464 clone_dest = duplicate_node (dfa, org_dest, constraint);
1465 if (BE (clone_dest == -1, 0))
1466 return REG_ESPACE;
1467 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1468 if (BE (ret < 0, 0))
1469 return REG_ESPACE;
1471 else /* dfa->edests[org_node].nelem == 2 */
1473 /* In case of the node can epsilon-transit, and it has two
1474 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1475 org_dest = dfa->edests[org_node].elems[0];
1476 re_node_set_empty (dfa->edests + clone_node);
1477 /* Search for a duplicated node which satisfies the constraint. */
1478 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1479 if (clone_dest == -1)
1481 /* There is no such duplicated node, create a new one. */
1482 reg_errcode_t err;
1483 clone_dest = duplicate_node (dfa, org_dest, constraint);
1484 if (BE (clone_dest == -1, 0))
1485 return REG_ESPACE;
1486 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1487 if (BE (ret < 0, 0))
1488 return REG_ESPACE;
1489 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1490 root_node, constraint);
1491 if (BE (err != REG_NOERROR, 0))
1492 return err;
1494 else
1496 /* There is a duplicated node which satisfies the constraint,
1497 use it to avoid infinite loop. */
1498 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1499 if (BE (ret < 0, 0))
1500 return REG_ESPACE;
1503 org_dest = dfa->edests[org_node].elems[1];
1504 clone_dest = duplicate_node (dfa, org_dest, constraint);
1505 if (BE (clone_dest == -1, 0))
1506 return REG_ESPACE;
1507 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1508 if (BE (ret < 0, 0))
1509 return REG_ESPACE;
1511 org_node = org_dest;
1512 clone_node = clone_dest;
1514 return REG_NOERROR;
1517 /* Search for a node which is duplicated from the node ORG_NODE, and
1518 satisfies the constraint CONSTRAINT. */
1520 static int
1521 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1522 unsigned int constraint)
1524 int idx;
1525 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1527 if (org_node == dfa->org_indices[idx]
1528 && constraint == dfa->nodes[idx].constraint)
1529 return idx; /* Found. */
1531 return -1; /* Not found. */
1534 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1535 Return the index of the new node, or -1 if insufficient storage is
1536 available. */
1538 static int
1539 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1541 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1542 if (BE (dup_idx != -1, 1))
1544 dfa->nodes[dup_idx].constraint = constraint;
1545 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1546 dfa->nodes[dup_idx].duplicated = 1;
1548 /* Store the index of the original node. */
1549 dfa->org_indices[dup_idx] = org_idx;
1551 return dup_idx;
1554 static reg_errcode_t
1555 calc_inveclosure (re_dfa_t *dfa)
1557 int src, idx, ret;
1558 for (idx = 0; idx < dfa->nodes_len; ++idx)
1559 re_node_set_init_empty (dfa->inveclosures + idx);
1561 for (src = 0; src < dfa->nodes_len; ++src)
1563 int *elems = dfa->eclosures[src].elems;
1564 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1566 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1567 if (BE (ret == -1, 0))
1568 return REG_ESPACE;
1572 return REG_NOERROR;
1575 /* Calculate "eclosure" for all the node in DFA. */
1577 static reg_errcode_t
1578 calc_eclosure (re_dfa_t *dfa)
1580 int node_idx, incomplete;
1581 #ifdef DEBUG
1582 assert (dfa->nodes_len > 0);
1583 #endif
1584 incomplete = 0;
1585 /* For each nodes, calculate epsilon closure. */
1586 for (node_idx = 0; ; ++node_idx)
1588 reg_errcode_t err;
1589 re_node_set eclosure_elem;
1590 if (node_idx == dfa->nodes_len)
1592 if (!incomplete)
1593 break;
1594 incomplete = 0;
1595 node_idx = 0;
1598 #ifdef DEBUG
1599 assert (dfa->eclosures[node_idx].nelem != -1);
1600 #endif
1602 /* If we have already calculated, skip it. */
1603 if (dfa->eclosures[node_idx].nelem != 0)
1604 continue;
1605 /* Calculate epsilon closure of `node_idx'. */
1606 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1607 if (BE (err != REG_NOERROR, 0))
1608 return err;
1610 if (dfa->eclosures[node_idx].nelem == 0)
1612 incomplete = 1;
1613 re_node_set_free (&eclosure_elem);
1616 return REG_NOERROR;
1619 /* Calculate epsilon closure of NODE. */
1621 static reg_errcode_t
1622 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1624 reg_errcode_t err;
1625 int i, incomplete;
1626 re_node_set eclosure;
1627 incomplete = 0;
1628 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1629 if (BE (err != REG_NOERROR, 0))
1630 return err;
1632 /* This indicates that we are calculating this node now.
1633 We reference this value to avoid infinite loop. */
1634 dfa->eclosures[node].nelem = -1;
1636 /* If the current node has constraints, duplicate all nodes
1637 since they must inherit the constraints. */
1638 if (dfa->nodes[node].constraint
1639 && dfa->edests[node].nelem
1640 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1642 err = duplicate_node_closure (dfa, node, node, node,
1643 dfa->nodes[node].constraint);
1644 if (BE (err != REG_NOERROR, 0))
1645 return err;
1648 /* Expand each epsilon destination nodes. */
1649 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1650 for (i = 0; i < dfa->edests[node].nelem; ++i)
1652 re_node_set eclosure_elem;
1653 int edest = dfa->edests[node].elems[i];
1654 /* If calculating the epsilon closure of `edest' is in progress,
1655 return intermediate result. */
1656 if (dfa->eclosures[edest].nelem == -1)
1658 incomplete = 1;
1659 continue;
1661 /* If we haven't calculated the epsilon closure of `edest' yet,
1662 calculate now. Otherwise use calculated epsilon closure. */
1663 if (dfa->eclosures[edest].nelem == 0)
1665 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1666 if (BE (err != REG_NOERROR, 0))
1667 return err;
1669 else
1670 eclosure_elem = dfa->eclosures[edest];
1671 /* Merge the epsilon closure of `edest'. */
1672 re_node_set_merge (&eclosure, &eclosure_elem);
1673 /* If the epsilon closure of `edest' is incomplete,
1674 the epsilon closure of this node is also incomplete. */
1675 if (dfa->eclosures[edest].nelem == 0)
1677 incomplete = 1;
1678 re_node_set_free (&eclosure_elem);
1682 /* Epsilon closures include itself. */
1683 re_node_set_insert (&eclosure, node);
1684 if (incomplete && !root)
1685 dfa->eclosures[node].nelem = 0;
1686 else
1687 dfa->eclosures[node] = eclosure;
1688 *new_set = eclosure;
1689 return REG_NOERROR;
1692 /* Functions for token which are used in the parser. */
1694 /* Fetch a token from INPUT.
1695 We must not use this function inside bracket expressions. */
1697 static void
1698 internal_function
1699 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1701 re_string_skip_bytes (input, peek_token (result, input, syntax));
1704 /* Peek a token from INPUT, and return the length of the token.
1705 We must not use this function inside bracket expressions. */
1707 static int
1708 internal_function
1709 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1711 unsigned char c;
1713 if (re_string_eoi (input))
1715 token->type = END_OF_RE;
1716 return 0;
1719 c = re_string_peek_byte (input, 0);
1720 token->opr.c = c;
1722 token->word_char = 0;
1723 #ifdef RE_ENABLE_I18N
1724 token->mb_partial = 0;
1725 if (input->mb_cur_max > 1 &&
1726 !re_string_first_byte (input, re_string_cur_idx (input)))
1728 token->type = CHARACTER;
1729 token->mb_partial = 1;
1730 return 1;
1732 #endif
1733 if (c == '\\')
1735 unsigned char c2;
1736 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1738 token->type = BACK_SLASH;
1739 return 1;
1742 c2 = re_string_peek_byte_case (input, 1);
1743 token->opr.c = c2;
1744 token->type = CHARACTER;
1745 #ifdef RE_ENABLE_I18N
1746 if (input->mb_cur_max > 1)
1748 wint_t wc = re_string_wchar_at (input,
1749 re_string_cur_idx (input) + 1);
1750 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1752 else
1753 #endif
1754 token->word_char = IS_WORD_CHAR (c2) != 0;
1756 switch (c2)
1758 case '|':
1759 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1760 token->type = OP_ALT;
1761 break;
1762 case '1': case '2': case '3': case '4': case '5':
1763 case '6': case '7': case '8': case '9':
1764 if (!(syntax & RE_NO_BK_REFS))
1766 token->type = OP_BACK_REF;
1767 token->opr.idx = c2 - '1';
1769 break;
1770 case '<':
1771 if (!(syntax & RE_NO_GNU_OPS))
1773 token->type = ANCHOR;
1774 token->opr.ctx_type = WORD_FIRST;
1776 break;
1777 case '>':
1778 if (!(syntax & RE_NO_GNU_OPS))
1780 token->type = ANCHOR;
1781 token->opr.ctx_type = WORD_LAST;
1783 break;
1784 case 'b':
1785 if (!(syntax & RE_NO_GNU_OPS))
1787 token->type = ANCHOR;
1788 token->opr.ctx_type = WORD_DELIM;
1790 break;
1791 case 'B':
1792 if (!(syntax & RE_NO_GNU_OPS))
1794 token->type = ANCHOR;
1795 token->opr.ctx_type = NOT_WORD_DELIM;
1797 break;
1798 case 'w':
1799 if (!(syntax & RE_NO_GNU_OPS))
1800 token->type = OP_WORD;
1801 break;
1802 case 'W':
1803 if (!(syntax & RE_NO_GNU_OPS))
1804 token->type = OP_NOTWORD;
1805 break;
1806 case 's':
1807 if (!(syntax & RE_NO_GNU_OPS))
1808 token->type = OP_SPACE;
1809 break;
1810 case 'S':
1811 if (!(syntax & RE_NO_GNU_OPS))
1812 token->type = OP_NOTSPACE;
1813 break;
1814 case '`':
1815 if (!(syntax & RE_NO_GNU_OPS))
1817 token->type = ANCHOR;
1818 token->opr.ctx_type = BUF_FIRST;
1820 break;
1821 case '\'':
1822 if (!(syntax & RE_NO_GNU_OPS))
1824 token->type = ANCHOR;
1825 token->opr.ctx_type = BUF_LAST;
1827 break;
1828 case '(':
1829 if (!(syntax & RE_NO_BK_PARENS))
1830 token->type = OP_OPEN_SUBEXP;
1831 break;
1832 case ')':
1833 if (!(syntax & RE_NO_BK_PARENS))
1834 token->type = OP_CLOSE_SUBEXP;
1835 break;
1836 case '+':
1837 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1838 token->type = OP_DUP_PLUS;
1839 break;
1840 case '?':
1841 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1842 token->type = OP_DUP_QUESTION;
1843 break;
1844 case '{':
1845 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1846 token->type = OP_OPEN_DUP_NUM;
1847 break;
1848 case '}':
1849 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1850 token->type = OP_CLOSE_DUP_NUM;
1851 break;
1852 default:
1853 break;
1855 return 2;
1858 token->type = CHARACTER;
1859 #ifdef RE_ENABLE_I18N
1860 if (input->mb_cur_max > 1)
1862 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1863 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1865 else
1866 #endif
1867 token->word_char = IS_WORD_CHAR (token->opr.c);
1869 switch (c)
1871 case '\n':
1872 if (syntax & RE_NEWLINE_ALT)
1873 token->type = OP_ALT;
1874 break;
1875 case '|':
1876 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1877 token->type = OP_ALT;
1878 break;
1879 case '*':
1880 token->type = OP_DUP_ASTERISK;
1881 break;
1882 case '+':
1883 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1884 token->type = OP_DUP_PLUS;
1885 break;
1886 case '?':
1887 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1888 token->type = OP_DUP_QUESTION;
1889 break;
1890 case '{':
1891 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1892 token->type = OP_OPEN_DUP_NUM;
1893 break;
1894 case '}':
1895 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1896 token->type = OP_CLOSE_DUP_NUM;
1897 break;
1898 case '(':
1899 if (syntax & RE_NO_BK_PARENS)
1900 token->type = OP_OPEN_SUBEXP;
1901 break;
1902 case ')':
1903 if (syntax & RE_NO_BK_PARENS)
1904 token->type = OP_CLOSE_SUBEXP;
1905 break;
1906 case '[':
1907 token->type = OP_OPEN_BRACKET;
1908 break;
1909 case '.':
1910 token->type = OP_PERIOD;
1911 break;
1912 case '^':
1913 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1914 re_string_cur_idx (input) != 0)
1916 char prev = re_string_peek_byte (input, -1);
1917 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1918 break;
1920 token->type = ANCHOR;
1921 token->opr.ctx_type = LINE_FIRST;
1922 break;
1923 case '$':
1924 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1925 re_string_cur_idx (input) + 1 != re_string_length (input))
1927 re_token_t next;
1928 re_string_skip_bytes (input, 1);
1929 peek_token (&next, input, syntax);
1930 re_string_skip_bytes (input, -1);
1931 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1932 break;
1934 token->type = ANCHOR;
1935 token->opr.ctx_type = LINE_LAST;
1936 break;
1937 default:
1938 break;
1940 return 1;
1943 /* Peek a token from INPUT, and return the length of the token.
1944 We must not use this function out of bracket expressions. */
1946 static int
1947 internal_function
1948 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1950 unsigned char c;
1951 if (re_string_eoi (input))
1953 token->type = END_OF_RE;
1954 return 0;
1956 c = re_string_peek_byte (input, 0);
1957 token->opr.c = c;
1959 #ifdef RE_ENABLE_I18N
1960 if (input->mb_cur_max > 1 &&
1961 !re_string_first_byte (input, re_string_cur_idx (input)))
1963 token->type = CHARACTER;
1964 return 1;
1966 #endif /* RE_ENABLE_I18N */
1968 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1969 && re_string_cur_idx (input) + 1 < re_string_length (input))
1971 /* In this case, '\' escape a character. */
1972 unsigned char c2;
1973 re_string_skip_bytes (input, 1);
1974 c2 = re_string_peek_byte (input, 0);
1975 token->opr.c = c2;
1976 token->type = CHARACTER;
1977 return 1;
1979 if (c == '[') /* '[' is a special char in a bracket exps. */
1981 unsigned char c2;
1982 int token_len;
1983 if (re_string_cur_idx (input) + 1 < re_string_length (input))
1984 c2 = re_string_peek_byte (input, 1);
1985 else
1986 c2 = 0;
1987 token->opr.c = c2;
1988 token_len = 2;
1989 switch (c2)
1991 case '.':
1992 token->type = OP_OPEN_COLL_ELEM;
1993 break;
1994 case '=':
1995 token->type = OP_OPEN_EQUIV_CLASS;
1996 break;
1997 case ':':
1998 if (syntax & RE_CHAR_CLASSES)
2000 token->type = OP_OPEN_CHAR_CLASS;
2001 break;
2003 /* else fall through. */
2004 default:
2005 token->type = CHARACTER;
2006 token->opr.c = c;
2007 token_len = 1;
2008 break;
2010 return token_len;
2012 switch (c)
2014 case '-':
2015 token->type = OP_CHARSET_RANGE;
2016 break;
2017 case ']':
2018 token->type = OP_CLOSE_BRACKET;
2019 break;
2020 case '^':
2021 token->type = OP_NON_MATCH_LIST;
2022 break;
2023 default:
2024 token->type = CHARACTER;
2026 return 1;
2029 /* Functions for parser. */
2031 /* Entry point of the parser.
2032 Parse the regular expression REGEXP and return the structure tree.
2033 If an error is occured, ERR is set by error code, and return NULL.
2034 This function build the following tree, from regular expression <reg_exp>:
2038 <reg_exp> EOR
2040 CAT means concatenation.
2041 EOR means end of regular expression. */
2043 static bin_tree_t *
2044 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2045 reg_errcode_t *err)
2047 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2048 bin_tree_t *tree, *eor, *root;
2049 re_token_t current_token;
2050 dfa->syntax = syntax;
2051 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2052 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2053 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2054 return NULL;
2055 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2056 if (tree != NULL)
2057 root = create_tree (dfa, tree, eor, CONCAT);
2058 else
2059 root = eor;
2060 if (BE (eor == NULL || root == NULL, 0))
2062 *err = REG_ESPACE;
2063 return NULL;
2065 return root;
2068 /* This function build the following tree, from regular expression
2069 <branch1>|<branch2>:
2073 <branch1> <branch2>
2075 ALT means alternative, which represents the operator `|'. */
2077 static bin_tree_t *
2078 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2079 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2081 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2082 bin_tree_t *tree, *branch = NULL;
2083 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2084 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2085 return NULL;
2087 while (token->type == OP_ALT)
2089 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2090 if (token->type != OP_ALT && token->type != END_OF_RE
2091 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2093 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2094 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2095 return NULL;
2097 else
2098 branch = NULL;
2099 tree = create_tree (dfa, tree, branch, OP_ALT);
2100 if (BE (tree == NULL, 0))
2102 *err = REG_ESPACE;
2103 return NULL;
2106 return tree;
2109 /* This function build the following tree, from regular expression
2110 <exp1><exp2>:
2114 <exp1> <exp2>
2116 CAT means concatenation. */
2118 static bin_tree_t *
2119 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2120 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2122 bin_tree_t *tree, *exp;
2123 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2124 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2125 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2126 return NULL;
2128 while (token->type != OP_ALT && token->type != END_OF_RE
2129 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2131 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2132 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2134 return NULL;
2136 if (tree != NULL && exp != NULL)
2138 tree = create_tree (dfa, tree, exp, CONCAT);
2139 if (tree == NULL)
2141 *err = REG_ESPACE;
2142 return NULL;
2145 else if (tree == NULL)
2146 tree = exp;
2147 /* Otherwise exp == NULL, we don't need to create new tree. */
2149 return tree;
2152 /* This function build the following tree, from regular expression a*:
2158 static bin_tree_t *
2159 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2160 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2162 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2163 bin_tree_t *tree;
2164 switch (token->type)
2166 case CHARACTER:
2167 tree = create_token_tree (dfa, NULL, NULL, token);
2168 if (BE (tree == NULL, 0))
2170 *err = REG_ESPACE;
2171 return NULL;
2173 #ifdef RE_ENABLE_I18N
2174 if (dfa->mb_cur_max > 1)
2176 while (!re_string_eoi (regexp)
2177 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2179 bin_tree_t *mbc_remain;
2180 fetch_token (token, regexp, syntax);
2181 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2182 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2183 if (BE (mbc_remain == NULL || tree == NULL, 0))
2185 *err = REG_ESPACE;
2186 return NULL;
2190 #endif
2191 break;
2192 case OP_OPEN_SUBEXP:
2193 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2194 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2195 return NULL;
2196 break;
2197 case OP_OPEN_BRACKET:
2198 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2199 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2200 return NULL;
2201 break;
2202 case OP_BACK_REF:
2203 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2205 *err = REG_ESUBREG;
2206 return NULL;
2208 dfa->used_bkref_map |= 1 << token->opr.idx;
2209 tree = create_token_tree (dfa, NULL, NULL, token);
2210 if (BE (tree == NULL, 0))
2212 *err = REG_ESPACE;
2213 return NULL;
2215 ++dfa->nbackref;
2216 dfa->has_mb_node = 1;
2217 break;
2218 case OP_OPEN_DUP_NUM:
2219 if (syntax & RE_CONTEXT_INVALID_DUP)
2221 *err = REG_BADRPT;
2222 return NULL;
2224 /* FALLTHROUGH */
2225 case OP_DUP_ASTERISK:
2226 case OP_DUP_PLUS:
2227 case OP_DUP_QUESTION:
2228 if (syntax & RE_CONTEXT_INVALID_OPS)
2230 *err = REG_BADRPT;
2231 return NULL;
2233 else if (syntax & RE_CONTEXT_INDEP_OPS)
2235 fetch_token (token, regexp, syntax);
2236 return parse_expression (regexp, preg, token, syntax, nest, err);
2238 /* else fall through */
2239 case OP_CLOSE_SUBEXP:
2240 if ((token->type == OP_CLOSE_SUBEXP) &&
2241 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2243 *err = REG_ERPAREN;
2244 return NULL;
2246 /* else fall through */
2247 case OP_CLOSE_DUP_NUM:
2248 /* We treat it as a normal character. */
2250 /* Then we can these characters as normal characters. */
2251 token->type = CHARACTER;
2252 /* mb_partial and word_char bits should be initialized already
2253 by peek_token. */
2254 tree = create_token_tree (dfa, NULL, NULL, token);
2255 if (BE (tree == NULL, 0))
2257 *err = REG_ESPACE;
2258 return NULL;
2260 break;
2261 case ANCHOR:
2262 if ((token->opr.ctx_type
2263 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2264 && dfa->word_ops_used == 0)
2265 init_word_char (dfa);
2266 if (token->opr.ctx_type == WORD_DELIM
2267 || token->opr.ctx_type == NOT_WORD_DELIM)
2269 bin_tree_t *tree_first, *tree_last;
2270 if (token->opr.ctx_type == WORD_DELIM)
2272 token->opr.ctx_type = WORD_FIRST;
2273 tree_first = create_token_tree (dfa, NULL, NULL, token);
2274 token->opr.ctx_type = WORD_LAST;
2276 else
2278 token->opr.ctx_type = INSIDE_WORD;
2279 tree_first = create_token_tree (dfa, NULL, NULL, token);
2280 token->opr.ctx_type = INSIDE_NOTWORD;
2282 tree_last = create_token_tree (dfa, NULL, NULL, token);
2283 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2284 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2286 *err = REG_ESPACE;
2287 return NULL;
2290 else
2292 tree = create_token_tree (dfa, NULL, NULL, token);
2293 if (BE (tree == NULL, 0))
2295 *err = REG_ESPACE;
2296 return NULL;
2299 /* We must return here, since ANCHORs can't be followed
2300 by repetition operators.
2301 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2302 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2303 fetch_token (token, regexp, syntax);
2304 return tree;
2305 case OP_PERIOD:
2306 tree = create_token_tree (dfa, NULL, NULL, token);
2307 if (BE (tree == NULL, 0))
2309 *err = REG_ESPACE;
2310 return NULL;
2312 if (dfa->mb_cur_max > 1)
2313 dfa->has_mb_node = 1;
2314 break;
2315 case OP_WORD:
2316 case OP_NOTWORD:
2317 tree = build_charclass_op (dfa, regexp->trans,
2318 (const unsigned char *) "alnum",
2319 (const unsigned char *) "_",
2320 token->type == OP_NOTWORD, err);
2321 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2322 return NULL;
2323 break;
2324 case OP_SPACE:
2325 case OP_NOTSPACE:
2326 tree = build_charclass_op (dfa, regexp->trans,
2327 (const unsigned char *) "space",
2328 (const unsigned char *) "",
2329 token->type == OP_NOTSPACE, err);
2330 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2331 return NULL;
2332 break;
2333 case OP_ALT:
2334 case END_OF_RE:
2335 return NULL;
2336 case BACK_SLASH:
2337 *err = REG_EESCAPE;
2338 return NULL;
2339 default:
2340 /* Must not happen? */
2341 #ifdef DEBUG
2342 assert (0);
2343 #endif
2344 return NULL;
2346 fetch_token (token, regexp, syntax);
2348 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2349 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2351 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2352 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2353 return NULL;
2354 /* In BRE consecutive duplications are not allowed. */
2355 if ((syntax & RE_CONTEXT_INVALID_DUP)
2356 && (token->type == OP_DUP_ASTERISK
2357 || token->type == OP_OPEN_DUP_NUM))
2359 *err = REG_BADRPT;
2360 return NULL;
2364 return tree;
2367 /* This function build the following tree, from regular expression
2368 (<reg_exp>):
2369 SUBEXP
2371 <reg_exp>
2374 static bin_tree_t *
2375 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2376 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2378 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2379 bin_tree_t *tree;
2380 size_t cur_nsub;
2381 cur_nsub = preg->re_nsub++;
2383 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2385 /* The subexpression may be a null string. */
2386 if (token->type == OP_CLOSE_SUBEXP)
2387 tree = NULL;
2388 else
2390 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2391 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2392 *err = REG_EPAREN;
2393 if (BE (*err != REG_NOERROR, 0))
2394 return NULL;
2397 if (cur_nsub <= '9' - '1')
2398 dfa->completed_bkref_map |= 1 << cur_nsub;
2400 tree = create_tree (dfa, tree, NULL, SUBEXP);
2401 if (BE (tree == NULL, 0))
2403 *err = REG_ESPACE;
2404 return NULL;
2406 tree->token.opr.idx = cur_nsub;
2407 return tree;
2410 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2412 static bin_tree_t *
2413 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2414 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2416 bin_tree_t *tree = NULL, *old_tree = NULL;
2417 int i, start, end, start_idx = re_string_cur_idx (regexp);
2418 re_token_t start_token = *token;
2420 if (token->type == OP_OPEN_DUP_NUM)
2422 end = 0;
2423 start = fetch_number (regexp, token, syntax);
2424 if (start == -1)
2426 if (token->type == CHARACTER && token->opr.c == ',')
2427 start = 0; /* We treat "{,m}" as "{0,m}". */
2428 else
2430 *err = REG_BADBR; /* <re>{} is invalid. */
2431 return NULL;
2434 if (BE (start != -2, 1))
2436 /* We treat "{n}" as "{n,n}". */
2437 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2438 : ((token->type == CHARACTER && token->opr.c == ',')
2439 ? fetch_number (regexp, token, syntax) : -2));
2441 if (BE (start == -2 || end == -2, 0))
2443 /* Invalid sequence. */
2444 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2446 if (token->type == END_OF_RE)
2447 *err = REG_EBRACE;
2448 else
2449 *err = REG_BADBR;
2451 return NULL;
2454 /* If the syntax bit is set, rollback. */
2455 re_string_set_index (regexp, start_idx);
2456 *token = start_token;
2457 token->type = CHARACTER;
2458 /* mb_partial and word_char bits should be already initialized by
2459 peek_token. */
2460 return elem;
2463 if (BE (end != -1 && start > end, 0))
2465 /* First number greater than second. */
2466 *err = REG_BADBR;
2467 return NULL;
2470 else
2472 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2473 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2476 fetch_token (token, regexp, syntax);
2478 if (BE (elem == NULL, 0))
2479 return NULL;
2480 if (BE (start == 0 && end == 0, 0))
2482 postorder (elem, free_tree, NULL);
2483 return NULL;
2486 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2487 if (BE (start > 0, 0))
2489 tree = elem;
2490 for (i = 2; i <= start; ++i)
2492 elem = duplicate_tree (elem, dfa);
2493 tree = create_tree (dfa, tree, elem, CONCAT);
2494 if (BE (elem == NULL || tree == NULL, 0))
2495 goto parse_dup_op_espace;
2498 if (start == end)
2499 return tree;
2501 /* Duplicate ELEM before it is marked optional. */
2502 elem = duplicate_tree (elem, dfa);
2503 old_tree = tree;
2505 else
2506 old_tree = NULL;
2508 if (elem->token.type == SUBEXP)
2509 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2511 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2512 if (BE (tree == NULL, 0))
2513 goto parse_dup_op_espace;
2515 /* This loop is actually executed only when end != -1,
2516 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2517 already created the start+1-th copy. */
2518 for (i = start + 2; i <= end; ++i)
2520 elem = duplicate_tree (elem, dfa);
2521 tree = create_tree (dfa, tree, elem, CONCAT);
2522 if (BE (elem == NULL || tree == NULL, 0))
2523 goto parse_dup_op_espace;
2525 tree = create_tree (dfa, tree, NULL, OP_ALT);
2526 if (BE (tree == NULL, 0))
2527 goto parse_dup_op_espace;
2530 if (old_tree)
2531 tree = create_tree (dfa, old_tree, tree, CONCAT);
2533 return tree;
2535 parse_dup_op_espace:
2536 *err = REG_ESPACE;
2537 return NULL;
2540 /* Size of the names for collating symbol/equivalence_class/character_class.
2541 I'm not sure, but maybe enough. */
2542 #define BRACKET_NAME_BUF_SIZE 32
2544 #ifndef _LIBC
2545 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2546 Build the range expression which starts from START_ELEM, and ends
2547 at END_ELEM. The result are written to MBCSET and SBCSET.
2548 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2549 mbcset->range_ends, is a pointer argument sinse we may
2550 update it. */
2552 static reg_errcode_t
2553 internal_function
2554 # ifdef RE_ENABLE_I18N
2555 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2556 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2557 # else /* not RE_ENABLE_I18N */
2558 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2559 bracket_elem_t *end_elem)
2560 # endif /* not RE_ENABLE_I18N */
2562 unsigned int start_ch, end_ch;
2563 /* Equivalence Classes and Character Classes can't be a range start/end. */
2564 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2565 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2567 return REG_ERANGE;
2569 /* We can handle no multi character collating elements without libc
2570 support. */
2571 if (BE ((start_elem->type == COLL_SYM
2572 && strlen ((char *) start_elem->opr.name) > 1)
2573 || (end_elem->type == COLL_SYM
2574 && strlen ((char *) end_elem->opr.name) > 1), 0))
2575 return REG_ECOLLATE;
2577 # ifdef RE_ENABLE_I18N
2579 wchar_t wc;
2580 wint_t start_wc;
2581 wint_t end_wc;
2582 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2584 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2585 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2586 : 0));
2587 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2588 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2589 : 0));
2590 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2591 ? __btowc (start_ch) : start_elem->opr.wch);
2592 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2593 ? __btowc (end_ch) : end_elem->opr.wch);
2594 if (start_wc == WEOF || end_wc == WEOF)
2595 return REG_ECOLLATE;
2596 cmp_buf[0] = start_wc;
2597 cmp_buf[4] = end_wc;
2598 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2599 return REG_ERANGE;
2601 /* Got valid collation sequence values, add them as a new entry.
2602 However, for !_LIBC we have no collation elements: if the
2603 character set is single byte, the single byte character set
2604 that we build below suffices. parse_bracket_exp passes
2605 no MBCSET if dfa->mb_cur_max == 1. */
2606 if (mbcset)
2608 /* Check the space of the arrays. */
2609 if (BE (*range_alloc == mbcset->nranges, 0))
2611 /* There is not enough space, need realloc. */
2612 wchar_t *new_array_start, *new_array_end;
2613 int new_nranges;
2615 /* +1 in case of mbcset->nranges is 0. */
2616 new_nranges = 2 * mbcset->nranges + 1;
2617 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2618 are NULL if *range_alloc == 0. */
2619 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2620 new_nranges);
2621 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2622 new_nranges);
2624 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2625 return REG_ESPACE;
2627 mbcset->range_starts = new_array_start;
2628 mbcset->range_ends = new_array_end;
2629 *range_alloc = new_nranges;
2632 mbcset->range_starts[mbcset->nranges] = start_wc;
2633 mbcset->range_ends[mbcset->nranges++] = end_wc;
2636 /* Build the table for single byte characters. */
2637 for (wc = 0; wc < SBC_MAX; ++wc)
2639 cmp_buf[2] = wc;
2640 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2641 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2642 bitset_set (sbcset, wc);
2645 # else /* not RE_ENABLE_I18N */
2647 unsigned int ch;
2648 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2649 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2650 : 0));
2651 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2652 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2653 : 0));
2654 if (start_ch > end_ch)
2655 return REG_ERANGE;
2656 /* Build the table for single byte characters. */
2657 for (ch = 0; ch < SBC_MAX; ++ch)
2658 if (start_ch <= ch && ch <= end_ch)
2659 bitset_set (sbcset, ch);
2661 # endif /* not RE_ENABLE_I18N */
2662 return REG_NOERROR;
2664 #endif /* not _LIBC */
2666 #ifndef _LIBC
2667 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2668 Build the collating element which is represented by NAME.
2669 The result are written to MBCSET and SBCSET.
2670 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2671 pointer argument since we may update it. */
2673 static reg_errcode_t
2674 internal_function
2675 # ifdef RE_ENABLE_I18N
2676 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2677 int *coll_sym_alloc, const unsigned char *name)
2678 # else /* not RE_ENABLE_I18N */
2679 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2680 # endif /* not RE_ENABLE_I18N */
2682 size_t name_len = strlen ((const char *) name);
2683 if (BE (name_len != 1, 0))
2684 return REG_ECOLLATE;
2685 else
2687 bitset_set (sbcset, name[0]);
2688 return REG_NOERROR;
2691 #endif /* not _LIBC */
2693 /* This function parse bracket expression like "[abc]", "[a-c]",
2694 "[[.a-a.]]" etc. */
2696 static bin_tree_t *
2697 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2698 reg_syntax_t syntax, reg_errcode_t *err)
2700 #ifdef _LIBC
2701 const unsigned char *collseqmb;
2702 const char *collseqwc;
2703 uint32_t nrules;
2704 int32_t table_size;
2705 const int32_t *symb_table;
2706 const unsigned char *extra;
2708 /* Local function for parse_bracket_exp used in _LIBC environement.
2709 Seek the collating symbol entry correspondings to NAME.
2710 Return the index of the symbol in the SYMB_TABLE. */
2712 auto inline int32_t
2713 __attribute ((always_inline))
2714 seek_collating_symbol_entry (name, name_len)
2715 const unsigned char *name;
2716 size_t name_len;
2718 int32_t hash = elem_hash ((const char *) name, name_len);
2719 int32_t elem = hash % table_size;
2720 if (symb_table[2 * elem] != 0)
2722 int32_t second = hash % (table_size - 2) + 1;
2726 /* First compare the hashing value. */
2727 if (symb_table[2 * elem] == hash
2728 /* Compare the length of the name. */
2729 && name_len == extra[symb_table[2 * elem + 1]]
2730 /* Compare the name. */
2731 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2732 name_len) == 0)
2734 /* Yep, this is the entry. */
2735 break;
2738 /* Next entry. */
2739 elem += second;
2741 while (symb_table[2 * elem] != 0);
2743 return elem;
2746 /* Local function for parse_bracket_exp used in _LIBC environment.
2747 Look up the collation sequence value of BR_ELEM.
2748 Return the value if succeeded, UINT_MAX otherwise. */
2750 auto inline unsigned int
2751 __attribute ((always_inline))
2752 lookup_collation_sequence_value (br_elem)
2753 bracket_elem_t *br_elem;
2755 if (br_elem->type == SB_CHAR)
2758 if (MB_CUR_MAX == 1)
2760 if (nrules == 0)
2761 return collseqmb[br_elem->opr.ch];
2762 else
2764 wint_t wc = __btowc (br_elem->opr.ch);
2765 return __collseq_table_lookup (collseqwc, wc);
2768 else if (br_elem->type == MB_CHAR)
2770 if (nrules != 0)
2771 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2773 else if (br_elem->type == COLL_SYM)
2775 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2776 if (nrules != 0)
2778 int32_t elem, idx;
2779 elem = seek_collating_symbol_entry (br_elem->opr.name,
2780 sym_name_len);
2781 if (symb_table[2 * elem] != 0)
2783 /* We found the entry. */
2784 idx = symb_table[2 * elem + 1];
2785 /* Skip the name of collating element name. */
2786 idx += 1 + extra[idx];
2787 /* Skip the byte sequence of the collating element. */
2788 idx += 1 + extra[idx];
2789 /* Adjust for the alignment. */
2790 idx = (idx + 3) & ~3;
2791 /* Skip the multibyte collation sequence value. */
2792 idx += sizeof (unsigned int);
2793 /* Skip the wide char sequence of the collating element. */
2794 idx += sizeof (unsigned int) *
2795 (1 + *(unsigned int *) (extra + idx));
2796 /* Return the collation sequence value. */
2797 return *(unsigned int *) (extra + idx);
2799 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2801 /* No valid character. Match it as a single byte
2802 character. */
2803 return collseqmb[br_elem->opr.name[0]];
2806 else if (sym_name_len == 1)
2807 return collseqmb[br_elem->opr.name[0]];
2809 return UINT_MAX;
2812 /* Local function for parse_bracket_exp used in _LIBC environement.
2813 Build the range expression which starts from START_ELEM, and ends
2814 at END_ELEM. The result are written to MBCSET and SBCSET.
2815 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2816 mbcset->range_ends, is a pointer argument sinse we may
2817 update it. */
2819 auto inline reg_errcode_t
2820 __attribute ((always_inline))
2821 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2822 re_charset_t *mbcset;
2823 int *range_alloc;
2824 bitset_t sbcset;
2825 bracket_elem_t *start_elem, *end_elem;
2827 unsigned int ch;
2828 uint32_t start_collseq;
2829 uint32_t end_collseq;
2831 /* Equivalence Classes and Character Classes can't be a range
2832 start/end. */
2833 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2834 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2836 return REG_ERANGE;
2838 start_collseq = lookup_collation_sequence_value (start_elem);
2839 end_collseq = lookup_collation_sequence_value (end_elem);
2840 /* Check start/end collation sequence values. */
2841 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2842 return REG_ECOLLATE;
2843 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2844 return REG_ERANGE;
2846 /* Got valid collation sequence values, add them as a new entry.
2847 However, if we have no collation elements, and the character set
2848 is single byte, the single byte character set that we
2849 build below suffices. */
2850 if (nrules > 0 || dfa->mb_cur_max > 1)
2852 /* Check the space of the arrays. */
2853 if (BE (*range_alloc == mbcset->nranges, 0))
2855 /* There is not enough space, need realloc. */
2856 uint32_t *new_array_start;
2857 uint32_t *new_array_end;
2858 int new_nranges;
2860 /* +1 in case of mbcset->nranges is 0. */
2861 new_nranges = 2 * mbcset->nranges + 1;
2862 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2863 new_nranges);
2864 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2865 new_nranges);
2867 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2868 return REG_ESPACE;
2870 mbcset->range_starts = new_array_start;
2871 mbcset->range_ends = new_array_end;
2872 *range_alloc = new_nranges;
2875 mbcset->range_starts[mbcset->nranges] = start_collseq;
2876 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2879 /* Build the table for single byte characters. */
2880 for (ch = 0; ch < SBC_MAX; ch++)
2882 uint32_t ch_collseq;
2884 if (MB_CUR_MAX == 1)
2886 if (nrules == 0)
2887 ch_collseq = collseqmb[ch];
2888 else
2889 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2890 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2891 bitset_set (sbcset, ch);
2893 return REG_NOERROR;
2896 /* Local function for parse_bracket_exp used in _LIBC environement.
2897 Build the collating element which is represented by NAME.
2898 The result are written to MBCSET and SBCSET.
2899 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2900 pointer argument sinse we may update it. */
2902 auto inline reg_errcode_t
2903 __attribute ((always_inline))
2904 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2905 re_charset_t *mbcset;
2906 int *coll_sym_alloc;
2907 bitset_t sbcset;
2908 const unsigned char *name;
2910 int32_t elem, idx;
2911 size_t name_len = strlen ((const char *) name);
2912 if (nrules != 0)
2914 elem = seek_collating_symbol_entry (name, name_len);
2915 if (symb_table[2 * elem] != 0)
2917 /* We found the entry. */
2918 idx = symb_table[2 * elem + 1];
2919 /* Skip the name of collating element name. */
2920 idx += 1 + extra[idx];
2922 else if (symb_table[2 * elem] == 0 && name_len == 1)
2924 /* No valid character, treat it as a normal
2925 character. */
2926 bitset_set (sbcset, name[0]);
2927 return REG_NOERROR;
2929 else
2930 return REG_ECOLLATE;
2932 /* Got valid collation sequence, add it as a new entry. */
2933 /* Check the space of the arrays. */
2934 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2936 /* Not enough, realloc it. */
2937 /* +1 in case of mbcset->ncoll_syms is 0. */
2938 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2939 /* Use realloc since mbcset->coll_syms is NULL
2940 if *alloc == 0. */
2941 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2942 new_coll_sym_alloc);
2943 if (BE (new_coll_syms == NULL, 0))
2944 return REG_ESPACE;
2945 mbcset->coll_syms = new_coll_syms;
2946 *coll_sym_alloc = new_coll_sym_alloc;
2948 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2949 return REG_NOERROR;
2951 else
2953 if (BE (name_len != 1, 0))
2954 return REG_ECOLLATE;
2955 else
2957 bitset_set (sbcset, name[0]);
2958 return REG_NOERROR;
2962 #endif
2964 re_token_t br_token;
2965 re_bitset_ptr_t sbcset;
2966 #ifdef RE_ENABLE_I18N
2967 re_charset_t *mbcset;
2968 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2969 int equiv_class_alloc = 0, char_class_alloc = 0;
2970 #endif /* not RE_ENABLE_I18N */
2971 int non_match = 0;
2972 bin_tree_t *work_tree;
2973 int token_len;
2974 int first_round = 1;
2975 #ifdef _LIBC
2976 collseqmb = (const unsigned char *)
2977 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2978 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2979 if (nrules)
2982 if (MB_CUR_MAX > 1)
2984 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2985 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2986 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2987 _NL_COLLATE_SYMB_TABLEMB);
2988 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2989 _NL_COLLATE_SYMB_EXTRAMB);
2991 #endif
2992 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
2993 #ifdef RE_ENABLE_I18N
2994 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2995 #endif /* RE_ENABLE_I18N */
2996 #ifdef RE_ENABLE_I18N
2997 if (BE (sbcset == NULL || mbcset == NULL, 0))
2998 #else
2999 if (BE (sbcset == NULL, 0))
3000 #endif /* RE_ENABLE_I18N */
3002 *err = REG_ESPACE;
3003 return NULL;
3006 token_len = peek_token_bracket (token, regexp, syntax);
3007 if (BE (token->type == END_OF_RE, 0))
3009 *err = REG_BADPAT;
3010 goto parse_bracket_exp_free_return;
3012 if (token->type == OP_NON_MATCH_LIST)
3014 #ifdef RE_ENABLE_I18N
3015 mbcset->non_match = 1;
3016 #endif /* not RE_ENABLE_I18N */
3017 non_match = 1;
3018 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3019 bitset_set (sbcset, '\n');
3020 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3021 token_len = peek_token_bracket (token, regexp, syntax);
3022 if (BE (token->type == END_OF_RE, 0))
3024 *err = REG_BADPAT;
3025 goto parse_bracket_exp_free_return;
3029 /* We treat the first ']' as a normal character. */
3030 if (token->type == OP_CLOSE_BRACKET)
3031 token->type = CHARACTER;
3033 while (1)
3035 bracket_elem_t start_elem, end_elem;
3036 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3037 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3038 reg_errcode_t ret;
3039 int token_len2 = 0, is_range_exp = 0;
3040 re_token_t token2;
3042 start_elem.opr.name = start_name_buf;
3043 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3044 syntax, first_round);
3045 if (BE (ret != REG_NOERROR, 0))
3047 *err = ret;
3048 goto parse_bracket_exp_free_return;
3050 first_round = 0;
3052 /* Get information about the next token. We need it in any case. */
3053 token_len = peek_token_bracket (token, regexp, syntax);
3055 /* Do not check for ranges if we know they are not allowed. */
3056 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3058 if (BE (token->type == END_OF_RE, 0))
3060 *err = REG_EBRACK;
3061 goto parse_bracket_exp_free_return;
3063 if (token->type == OP_CHARSET_RANGE)
3065 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3066 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3067 if (BE (token2.type == END_OF_RE, 0))
3069 *err = REG_EBRACK;
3070 goto parse_bracket_exp_free_return;
3072 if (token2.type == OP_CLOSE_BRACKET)
3074 /* We treat the last '-' as a normal character. */
3075 re_string_skip_bytes (regexp, -token_len);
3076 token->type = CHARACTER;
3078 else
3079 is_range_exp = 1;
3083 if (is_range_exp == 1)
3085 end_elem.opr.name = end_name_buf;
3086 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3087 dfa, syntax, 1);
3088 if (BE (ret != REG_NOERROR, 0))
3090 *err = ret;
3091 goto parse_bracket_exp_free_return;
3094 token_len = peek_token_bracket (token, regexp, syntax);
3096 #ifdef _LIBC
3097 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3098 &start_elem, &end_elem);
3099 #else
3100 # ifdef RE_ENABLE_I18N
3101 *err = build_range_exp (sbcset,
3102 dfa->mb_cur_max > 1 ? mbcset : NULL,
3103 &range_alloc, &start_elem, &end_elem);
3104 # else
3105 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3106 # endif
3107 #endif /* RE_ENABLE_I18N */
3108 if (BE (*err != REG_NOERROR, 0))
3109 goto parse_bracket_exp_free_return;
3111 else
3113 switch (start_elem.type)
3115 case SB_CHAR:
3116 bitset_set (sbcset, start_elem.opr.ch);
3117 break;
3118 #ifdef RE_ENABLE_I18N
3119 case MB_CHAR:
3120 /* Check whether the array has enough space. */
3121 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3123 wchar_t *new_mbchars;
3124 /* Not enough, realloc it. */
3125 /* +1 in case of mbcset->nmbchars is 0. */
3126 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3127 /* Use realloc since array is NULL if *alloc == 0. */
3128 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3129 mbchar_alloc);
3130 if (BE (new_mbchars == NULL, 0))
3131 goto parse_bracket_exp_espace;
3132 mbcset->mbchars = new_mbchars;
3134 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3135 break;
3136 #endif /* RE_ENABLE_I18N */
3137 case EQUIV_CLASS:
3138 *err = build_equiv_class (sbcset,
3139 #ifdef RE_ENABLE_I18N
3140 mbcset, &equiv_class_alloc,
3141 #endif /* RE_ENABLE_I18N */
3142 start_elem.opr.name);
3143 if (BE (*err != REG_NOERROR, 0))
3144 goto parse_bracket_exp_free_return;
3145 break;
3146 case COLL_SYM:
3147 *err = build_collating_symbol (sbcset,
3148 #ifdef RE_ENABLE_I18N
3149 mbcset, &coll_sym_alloc,
3150 #endif /* RE_ENABLE_I18N */
3151 start_elem.opr.name);
3152 if (BE (*err != REG_NOERROR, 0))
3153 goto parse_bracket_exp_free_return;
3154 break;
3155 case CHAR_CLASS:
3156 *err = build_charclass (regexp->trans, sbcset,
3157 #ifdef RE_ENABLE_I18N
3158 mbcset, &char_class_alloc,
3159 #endif /* RE_ENABLE_I18N */
3160 start_elem.opr.name, syntax);
3161 if (BE (*err != REG_NOERROR, 0))
3162 goto parse_bracket_exp_free_return;
3163 break;
3164 default:
3165 assert (0);
3166 break;
3169 if (BE (token->type == END_OF_RE, 0))
3171 *err = REG_EBRACK;
3172 goto parse_bracket_exp_free_return;
3174 if (token->type == OP_CLOSE_BRACKET)
3175 break;
3178 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3180 /* If it is non-matching list. */
3181 if (non_match)
3182 bitset_not (sbcset);
3184 #ifdef RE_ENABLE_I18N
3185 /* Ensure only single byte characters are set. */
3186 if (dfa->mb_cur_max > 1)
3187 bitset_mask (sbcset, dfa->sb_char);
3189 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3190 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3191 || mbcset->non_match)))
3193 bin_tree_t *mbc_tree;
3194 int sbc_idx;
3195 /* Build a tree for complex bracket. */
3196 dfa->has_mb_node = 1;
3197 br_token.type = COMPLEX_BRACKET;
3198 br_token.opr.mbcset = mbcset;
3199 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3200 if (BE (mbc_tree == NULL, 0))
3201 goto parse_bracket_exp_espace;
3202 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3203 if (sbcset[sbc_idx])
3204 break;
3205 /* If there are no bits set in sbcset, there is no point
3206 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3207 if (sbc_idx < BITSET_WORDS)
3209 /* Build a tree for simple bracket. */
3210 br_token.type = SIMPLE_BRACKET;
3211 br_token.opr.sbcset = sbcset;
3212 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3213 if (BE (work_tree == NULL, 0))
3214 goto parse_bracket_exp_espace;
3216 /* Then join them by ALT node. */
3217 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3218 if (BE (work_tree == NULL, 0))
3219 goto parse_bracket_exp_espace;
3221 else
3223 re_free (sbcset);
3224 work_tree = mbc_tree;
3227 else
3228 #endif /* not RE_ENABLE_I18N */
3230 #ifdef RE_ENABLE_I18N
3231 free_charset (mbcset);
3232 #endif
3233 /* Build a tree for simple bracket. */
3234 br_token.type = SIMPLE_BRACKET;
3235 br_token.opr.sbcset = sbcset;
3236 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3237 if (BE (work_tree == NULL, 0))
3238 goto parse_bracket_exp_espace;
3240 return work_tree;
3242 parse_bracket_exp_espace:
3243 *err = REG_ESPACE;
3244 parse_bracket_exp_free_return:
3245 re_free (sbcset);
3246 #ifdef RE_ENABLE_I18N
3247 free_charset (mbcset);
3248 #endif /* RE_ENABLE_I18N */
3249 return NULL;
3252 /* Parse an element in the bracket expression. */
3254 static reg_errcode_t
3255 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3256 re_token_t *token, int token_len, re_dfa_t *dfa,
3257 reg_syntax_t syntax, int accept_hyphen)
3259 #ifdef RE_ENABLE_I18N
3260 int cur_char_size;
3261 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3262 if (cur_char_size > 1)
3264 elem->type = MB_CHAR;
3265 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3266 re_string_skip_bytes (regexp, cur_char_size);
3267 return REG_NOERROR;
3269 #endif /* RE_ENABLE_I18N */
3270 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3271 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3272 || token->type == OP_OPEN_EQUIV_CLASS)
3273 return parse_bracket_symbol (elem, regexp, token);
3274 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3276 /* A '-' must only appear as anything but a range indicator before
3277 the closing bracket. Everything else is an error. */
3278 re_token_t token2;
3279 (void) peek_token_bracket (&token2, regexp, syntax);
3280 if (token2.type != OP_CLOSE_BRACKET)
3281 /* The actual error value is not standardized since this whole
3282 case is undefined. But ERANGE makes good sense. */
3283 return REG_ERANGE;
3285 elem->type = SB_CHAR;
3286 elem->opr.ch = token->opr.c;
3287 return REG_NOERROR;
3290 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3291 such as [:<character_class>:], [.<collating_element>.], and
3292 [=<equivalent_class>=]. */
3294 static reg_errcode_t
3295 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3296 re_token_t *token)
3298 unsigned char ch, delim = token->opr.c;
3299 int i = 0;
3300 if (re_string_eoi(regexp))
3301 return REG_EBRACK;
3302 for (;; ++i)
3304 if (i >= BRACKET_NAME_BUF_SIZE)
3305 return REG_EBRACK;
3306 if (token->type == OP_OPEN_CHAR_CLASS)
3307 ch = re_string_fetch_byte_case (regexp);
3308 else
3309 ch = re_string_fetch_byte (regexp);
3310 if (re_string_eoi(regexp))
3311 return REG_EBRACK;
3312 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3313 break;
3314 elem->opr.name[i] = ch;
3316 re_string_skip_bytes (regexp, 1);
3317 elem->opr.name[i] = '\0';
3318 switch (token->type)
3320 case OP_OPEN_COLL_ELEM:
3321 elem->type = COLL_SYM;
3322 break;
3323 case OP_OPEN_EQUIV_CLASS:
3324 elem->type = EQUIV_CLASS;
3325 break;
3326 case OP_OPEN_CHAR_CLASS:
3327 elem->type = CHAR_CLASS;
3328 break;
3329 default:
3330 break;
3332 return REG_NOERROR;
3335 /* Helper function for parse_bracket_exp.
3336 Build the equivalence class which is represented by NAME.
3337 The result are written to MBCSET and SBCSET.
3338 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3339 is a pointer argument sinse we may update it. */
3341 static reg_errcode_t
3342 #ifdef RE_ENABLE_I18N
3343 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3344 int *equiv_class_alloc, const unsigned char *name)
3345 #else /* not RE_ENABLE_I18N */
3346 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3347 #endif /* not RE_ENABLE_I18N */
3349 #ifdef _LIBC
3350 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3351 if (nrules != 0)
3353 const int32_t *table, *indirect;
3354 const unsigned char *weights, *extra, *cp;
3355 unsigned char char_buf[2];
3356 int32_t idx1, idx2;
3357 unsigned int ch;
3358 size_t len;
3359 /* This #include defines a local function! */
3360 # include <locale/weight.h>
3361 /* Calculate the index for equivalence class. */
3362 cp = name;
3363 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3364 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3365 _NL_COLLATE_WEIGHTMB);
3366 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3367 _NL_COLLATE_EXTRAMB);
3368 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3369 _NL_COLLATE_INDIRECTMB);
3370 idx1 = findidx (&cp);
3371 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3372 /* This isn't a valid character. */
3373 return REG_ECOLLATE;
3375 /* Build single byte matcing table for this equivalence class. */
3376 char_buf[1] = (unsigned char) '\0';
3377 len = weights[idx1 & 0xffffff];
3378 for (ch = 0; ch < SBC_MAX; ++ch)
3380 char_buf[0] = ch;
3381 cp = char_buf;
3382 idx2 = findidx (&cp);
3384 idx2 = table[ch];
3386 if (idx2 == 0)
3387 /* This isn't a valid character. */
3388 continue;
3389 /* Compare only if the length matches and the collation rule
3390 index is the same. */
3391 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3393 int cnt = 0;
3395 while (cnt <= len &&
3396 weights[(idx1 & 0xffffff) + 1 + cnt]
3397 == weights[(idx2 & 0xffffff) + 1 + cnt])
3398 ++cnt;
3400 if (cnt > len)
3401 bitset_set (sbcset, ch);
3404 /* Check whether the array has enough space. */
3405 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3407 /* Not enough, realloc it. */
3408 /* +1 in case of mbcset->nequiv_classes is 0. */
3409 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3410 /* Use realloc since the array is NULL if *alloc == 0. */
3411 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3412 int32_t,
3413 new_equiv_class_alloc);
3414 if (BE (new_equiv_classes == NULL, 0))
3415 return REG_ESPACE;
3416 mbcset->equiv_classes = new_equiv_classes;
3417 *equiv_class_alloc = new_equiv_class_alloc;
3419 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3421 else
3422 #endif /* _LIBC */
3424 if (BE (strlen ((const char *) name) != 1, 0))
3425 return REG_ECOLLATE;
3426 bitset_set (sbcset, *name);
3428 return REG_NOERROR;
3431 /* Helper function for parse_bracket_exp.
3432 Build the character class which is represented by NAME.
3433 The result are written to MBCSET and SBCSET.
3434 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3435 is a pointer argument sinse we may update it. */
3437 static reg_errcode_t
3438 #ifdef RE_ENABLE_I18N
3439 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3440 re_charset_t *mbcset, int *char_class_alloc,
3441 const unsigned char *class_name, reg_syntax_t syntax)
3442 #else /* not RE_ENABLE_I18N */
3443 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3444 const unsigned char *class_name, reg_syntax_t syntax)
3445 #endif /* not RE_ENABLE_I18N */
3447 int i;
3448 const char *name = (const char *) class_name;
3450 /* In case of REG_ICASE "upper" and "lower" match the both of
3451 upper and lower cases. */
3452 if ((syntax & RE_ICASE)
3453 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3454 name = "alpha";
3456 #ifdef RE_ENABLE_I18N
3457 /* Check the space of the arrays. */
3458 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3460 /* Not enough, realloc it. */
3461 /* +1 in case of mbcset->nchar_classes is 0. */
3462 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3463 /* Use realloc since array is NULL if *alloc == 0. */
3464 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3465 new_char_class_alloc);
3466 if (BE (new_char_classes == NULL, 0))
3467 return REG_ESPACE;
3468 mbcset->char_classes = new_char_classes;
3469 *char_class_alloc = new_char_class_alloc;
3471 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3472 #endif /* RE_ENABLE_I18N */
3474 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3475 do { \
3476 if (BE (trans != NULL, 0)) \
3478 for (i = 0; i < SBC_MAX; ++i) \
3479 if (ctype_func (i)) \
3480 bitset_set (sbcset, trans[i]); \
3482 else \
3484 for (i = 0; i < SBC_MAX; ++i) \
3485 if (ctype_func (i)) \
3486 bitset_set (sbcset, i); \
3488 } while (0)
3490 if (strcmp (name, "alnum") == 0)
3491 BUILD_CHARCLASS_LOOP (isalnum);
3492 else if (strcmp (name, "cntrl") == 0)
3493 BUILD_CHARCLASS_LOOP (iscntrl);
3494 else if (strcmp (name, "lower") == 0)
3495 BUILD_CHARCLASS_LOOP (islower);
3496 else if (strcmp (name, "space") == 0)
3497 BUILD_CHARCLASS_LOOP (isspace);
3498 else if (strcmp (name, "alpha") == 0)
3499 BUILD_CHARCLASS_LOOP (isalpha);
3500 else if (strcmp (name, "digit") == 0)
3501 BUILD_CHARCLASS_LOOP (isdigit);
3502 else if (strcmp (name, "print") == 0)
3503 BUILD_CHARCLASS_LOOP (isprint);
3504 else if (strcmp (name, "upper") == 0)
3505 BUILD_CHARCLASS_LOOP (isupper);
3506 else if (strcmp (name, "blank") == 0)
3507 BUILD_CHARCLASS_LOOP (isblank);
3508 else if (strcmp (name, "graph") == 0)
3509 BUILD_CHARCLASS_LOOP (isgraph);
3510 else if (strcmp (name, "punct") == 0)
3511 BUILD_CHARCLASS_LOOP (ispunct);
3512 else if (strcmp (name, "xdigit") == 0)
3513 BUILD_CHARCLASS_LOOP (isxdigit);
3514 else
3515 return REG_ECTYPE;
3517 return REG_NOERROR;
3520 static bin_tree_t *
3521 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3522 const unsigned char *class_name,
3523 const unsigned char *extra, int non_match,
3524 reg_errcode_t *err)
3526 re_bitset_ptr_t sbcset;
3527 #ifdef RE_ENABLE_I18N
3528 re_charset_t *mbcset;
3529 int alloc = 0;
3530 #endif /* not RE_ENABLE_I18N */
3531 reg_errcode_t ret;
3532 re_token_t br_token;
3533 bin_tree_t *tree;
3535 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3536 #ifdef RE_ENABLE_I18N
3537 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3538 #endif /* RE_ENABLE_I18N */
3540 #ifdef RE_ENABLE_I18N
3541 if (BE (sbcset == NULL || mbcset == NULL, 0))
3542 #else /* not RE_ENABLE_I18N */
3543 if (BE (sbcset == NULL, 0))
3544 #endif /* not RE_ENABLE_I18N */
3546 *err = REG_ESPACE;
3547 return NULL;
3550 if (non_match)
3552 #ifdef RE_ENABLE_I18N
3553 mbcset->non_match = 1;
3554 #endif /* not RE_ENABLE_I18N */
3557 /* We don't care the syntax in this case. */
3558 ret = build_charclass (trans, sbcset,
3559 #ifdef RE_ENABLE_I18N
3560 mbcset, &alloc,
3561 #endif /* RE_ENABLE_I18N */
3562 class_name, 0);
3564 if (BE (ret != REG_NOERROR, 0))
3566 re_free (sbcset);
3567 #ifdef RE_ENABLE_I18N
3568 free_charset (mbcset);
3569 #endif /* RE_ENABLE_I18N */
3570 *err = ret;
3571 return NULL;
3573 /* \w match '_' also. */
3574 for (; *extra; extra++)
3575 bitset_set (sbcset, *extra);
3577 /* If it is non-matching list. */
3578 if (non_match)
3579 bitset_not (sbcset);
3581 #ifdef RE_ENABLE_I18N
3582 /* Ensure only single byte characters are set. */
3583 if (dfa->mb_cur_max > 1)
3584 bitset_mask (sbcset, dfa->sb_char);
3585 #endif
3587 /* Build a tree for simple bracket. */
3588 br_token.type = SIMPLE_BRACKET;
3589 br_token.opr.sbcset = sbcset;
3590 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3591 if (BE (tree == NULL, 0))
3592 goto build_word_op_espace;
3594 #ifdef RE_ENABLE_I18N
3595 if (dfa->mb_cur_max > 1)
3597 bin_tree_t *mbc_tree;
3598 /* Build a tree for complex bracket. */
3599 br_token.type = COMPLEX_BRACKET;
3600 br_token.opr.mbcset = mbcset;
3601 dfa->has_mb_node = 1;
3602 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3603 if (BE (mbc_tree == NULL, 0))
3604 goto build_word_op_espace;
3605 /* Then join them by ALT node. */
3606 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3607 if (BE (mbc_tree != NULL, 1))
3608 return tree;
3610 else
3612 free_charset (mbcset);
3613 return tree;
3615 #else /* not RE_ENABLE_I18N */
3616 return tree;
3617 #endif /* not RE_ENABLE_I18N */
3619 build_word_op_espace:
3620 re_free (sbcset);
3621 #ifdef RE_ENABLE_I18N
3622 free_charset (mbcset);
3623 #endif /* RE_ENABLE_I18N */
3624 *err = REG_ESPACE;
3625 return NULL;
3628 /* This is intended for the expressions like "a{1,3}".
3629 Fetch a number from `input', and return the number.
3630 Return -1, if the number field is empty like "{,1}".
3631 Return -2, If an error is occured. */
3633 static int
3634 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3636 int num = -1;
3637 unsigned char c;
3638 while (1)
3640 fetch_token (token, input, syntax);
3641 c = token->opr.c;
3642 if (BE (token->type == END_OF_RE, 0))
3643 return -2;
3644 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3645 break;
3646 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3647 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3648 num = (num > RE_DUP_MAX) ? -2 : num;
3650 return num;
3653 #ifdef RE_ENABLE_I18N
3654 static void
3655 free_charset (re_charset_t *cset)
3657 re_free (cset->mbchars);
3658 # ifdef _LIBC
3659 re_free (cset->coll_syms);
3660 re_free (cset->equiv_classes);
3661 re_free (cset->range_starts);
3662 re_free (cset->range_ends);
3663 # endif
3664 re_free (cset->char_classes);
3665 re_free (cset);
3667 #endif /* RE_ENABLE_I18N */
3669 /* Functions for binary tree operation. */
3671 /* Create a tree node. */
3673 static bin_tree_t *
3674 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3675 re_token_type_t type)
3677 re_token_t t;
3678 t.type = type;
3679 return create_token_tree (dfa, left, right, &t);
3682 static bin_tree_t *
3683 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3684 const re_token_t *token)
3686 bin_tree_t *tree;
3687 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3689 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3691 if (storage == NULL)
3692 return NULL;
3693 storage->next = dfa->str_tree_storage;
3694 dfa->str_tree_storage = storage;
3695 dfa->str_tree_storage_idx = 0;
3697 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3699 tree->parent = NULL;
3700 tree->left = left;
3701 tree->right = right;
3702 tree->token = *token;
3703 tree->token.duplicated = 0;
3704 tree->token.opt_subexp = 0;
3705 tree->first = NULL;
3706 tree->next = NULL;
3707 tree->node_idx = -1;
3709 if (left != NULL)
3710 left->parent = tree;
3711 if (right != NULL)
3712 right->parent = tree;
3713 return tree;
3716 /* Mark the tree SRC as an optional subexpression.
3717 To be called from preorder or postorder. */
3719 static reg_errcode_t
3720 mark_opt_subexp (void *extra, bin_tree_t *node)
3722 int idx = (int) (long) extra;
3723 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3724 node->token.opt_subexp = 1;
3726 return REG_NOERROR;
3729 /* Free the allocated memory inside NODE. */
3731 static void
3732 free_token (re_token_t *node)
3734 #ifdef RE_ENABLE_I18N
3735 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3736 free_charset (node->opr.mbcset);
3737 else
3738 #endif /* RE_ENABLE_I18N */
3739 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3740 re_free (node->opr.sbcset);
3743 /* Worker function for tree walking. Free the allocated memory inside NODE
3744 and its children. */
3746 static reg_errcode_t
3747 free_tree (void *extra, bin_tree_t *node)
3749 free_token (&node->token);
3750 return REG_NOERROR;
3754 /* Duplicate the node SRC, and return new node. This is a preorder
3755 visit similar to the one implemented by the generic visitor, but
3756 we need more infrastructure to maintain two parallel trees --- so,
3757 it's easier to duplicate. */
3759 static bin_tree_t *
3760 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3762 const bin_tree_t *node;
3763 bin_tree_t *dup_root;
3764 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3766 for (node = root; ; )
3768 /* Create a new tree and link it back to the current parent. */
3769 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3770 if (*p_new == NULL)
3771 return NULL;
3772 (*p_new)->parent = dup_node;
3773 (*p_new)->token.duplicated = 1;
3774 dup_node = *p_new;
3776 /* Go to the left node, or up and to the right. */
3777 if (node->left)
3779 node = node->left;
3780 p_new = &dup_node->left;
3782 else
3784 const bin_tree_t *prev = NULL;
3785 while (node->right == prev || node->right == NULL)
3787 prev = node;
3788 node = node->parent;
3789 dup_node = dup_node->parent;
3790 if (!node)
3791 return dup_root;
3793 node = node->right;
3794 p_new = &dup_node->right;