..
[glibc.git] / posix / regcomp.c
blob4843cfea339b117b44234e643181f5af8e1ad5a7
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002,2003,2004,2005,2006,2007,2009
3 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
5 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
7 The GNU C Library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 2.1 of the License, or (at your option) any later version.
12 The GNU C Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public
18 License along with the GNU C Library; if not, write to the Free
19 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
20 02111-1307 USA. */
22 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
23 size_t length, reg_syntax_t syntax);
24 static void re_compile_fastmap_iter (regex_t *bufp,
25 const re_dfastate_t *init_state,
26 char *fastmap);
27 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
28 #ifdef RE_ENABLE_I18N
29 static void free_charset (re_charset_t *cset);
30 #endif /* RE_ENABLE_I18N */
31 static void free_workarea_compile (regex_t *preg);
32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
33 #ifdef RE_ENABLE_I18N
34 static void optimize_utf8 (re_dfa_t *dfa);
35 #endif
36 static reg_errcode_t analyze (regex_t *preg);
37 static reg_errcode_t preorder (bin_tree_t *root,
38 reg_errcode_t (fn (void *, bin_tree_t *)),
39 void *extra);
40 static reg_errcode_t postorder (bin_tree_t *root,
41 reg_errcode_t (fn (void *, bin_tree_t *)),
42 void *extra);
43 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
44 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
45 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
46 bin_tree_t *node);
47 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
48 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
49 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
50 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
51 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
52 unsigned int constraint);
53 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
54 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
55 int node, int root);
56 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
57 static int fetch_number (re_string_t *input, re_token_t *token,
58 reg_syntax_t syntax);
59 static int peek_token (re_token_t *token, re_string_t *input,
60 reg_syntax_t syntax) internal_function;
61 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
62 reg_syntax_t syntax, reg_errcode_t *err);
63 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
64 re_token_t *token, reg_syntax_t syntax,
65 int nest, reg_errcode_t *err);
66 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
67 re_token_t *token, reg_syntax_t syntax,
68 int nest, reg_errcode_t *err);
69 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
70 re_token_t *token, reg_syntax_t syntax,
71 int nest, reg_errcode_t *err);
72 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
73 re_token_t *token, reg_syntax_t syntax,
74 int nest, reg_errcode_t *err);
75 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
76 re_dfa_t *dfa, re_token_t *token,
77 reg_syntax_t syntax, reg_errcode_t *err);
78 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
79 re_token_t *token, reg_syntax_t syntax,
80 reg_errcode_t *err);
81 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
82 re_string_t *regexp,
83 re_token_t *token, int token_len,
84 re_dfa_t *dfa,
85 reg_syntax_t syntax,
86 int accept_hyphen);
87 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
88 re_string_t *regexp,
89 re_token_t *token);
90 #ifdef RE_ENABLE_I18N
91 static reg_errcode_t build_equiv_class (bitset_t sbcset,
92 re_charset_t *mbcset,
93 int *equiv_class_alloc,
94 const unsigned char *name);
95 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
96 bitset_t sbcset,
97 re_charset_t *mbcset,
98 int *char_class_alloc,
99 const unsigned char *class_name,
100 reg_syntax_t syntax);
101 #else /* not RE_ENABLE_I18N */
102 static reg_errcode_t build_equiv_class (bitset_t sbcset,
103 const unsigned char *name);
104 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
105 bitset_t sbcset,
106 const unsigned char *class_name,
107 reg_syntax_t syntax);
108 #endif /* not RE_ENABLE_I18N */
109 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
110 RE_TRANSLATE_TYPE trans,
111 const unsigned char *class_name,
112 const unsigned char *extra,
113 int non_match, reg_errcode_t *err);
114 static bin_tree_t *create_tree (re_dfa_t *dfa,
115 bin_tree_t *left, bin_tree_t *right,
116 re_token_type_t type);
117 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
118 bin_tree_t *left, bin_tree_t *right,
119 const re_token_t *token);
120 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
121 static void free_token (re_token_t *node);
122 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
123 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
125 /* This table gives an error message for each of the error codes listed
126 in regex.h. Obviously the order here has to be same as there.
127 POSIX doesn't require that we do anything for REG_NOERROR,
128 but why not be nice? */
130 const char __re_error_msgid[] attribute_hidden =
132 #define REG_NOERROR_IDX 0
133 gettext_noop ("Success") /* REG_NOERROR */
134 "\0"
135 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
136 gettext_noop ("No match") /* REG_NOMATCH */
137 "\0"
138 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
139 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140 "\0"
141 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
142 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143 "\0"
144 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
145 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146 "\0"
147 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
148 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149 "\0"
150 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
151 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152 "\0"
153 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
154 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
155 "\0"
156 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
157 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158 "\0"
159 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
160 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161 "\0"
162 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
163 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164 "\0"
165 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
166 gettext_noop ("Invalid range end") /* REG_ERANGE */
167 "\0"
168 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
169 gettext_noop ("Memory exhausted") /* REG_ESPACE */
170 "\0"
171 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
172 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173 "\0"
174 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
175 gettext_noop ("Premature end of regular expression") /* REG_EEND */
176 "\0"
177 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
178 gettext_noop ("Regular expression too big") /* REG_ESIZE */
179 "\0"
180 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
181 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
184 const size_t __re_error_msgid_idx[] attribute_hidden =
186 REG_NOERROR_IDX,
187 REG_NOMATCH_IDX,
188 REG_BADPAT_IDX,
189 REG_ECOLLATE_IDX,
190 REG_ECTYPE_IDX,
191 REG_EESCAPE_IDX,
192 REG_ESUBREG_IDX,
193 REG_EBRACK_IDX,
194 REG_EPAREN_IDX,
195 REG_EBRACE_IDX,
196 REG_BADBR_IDX,
197 REG_ERANGE_IDX,
198 REG_ESPACE_IDX,
199 REG_BADRPT_IDX,
200 REG_EEND_IDX,
201 REG_ESIZE_IDX,
202 REG_ERPAREN_IDX
205 /* Entry points for GNU code. */
207 /* re_compile_pattern is the GNU regular expression compiler: it
208 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
209 Returns 0 if the pattern was valid, otherwise an error string.
211 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
212 are set in BUFP on entry. */
214 const char *
215 re_compile_pattern (pattern, length, bufp)
216 const char *pattern;
217 size_t length;
218 struct re_pattern_buffer *bufp;
220 reg_errcode_t ret;
222 /* And GNU code determines whether or not to get register information
223 by passing null for the REGS argument to re_match, etc., not by
224 setting no_sub, unless RE_NO_SUB is set. */
225 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
227 /* Match anchors at newline. */
228 bufp->newline_anchor = 1;
230 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
232 if (!ret)
233 return NULL;
234 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
236 #ifdef _LIBC
237 weak_alias (__re_compile_pattern, re_compile_pattern)
238 #endif
240 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
241 also be assigned to arbitrarily: each pattern buffer stores its own
242 syntax, so it can be changed between regex compilations. */
243 /* This has no initializer because initialized variables in Emacs
244 become read-only after dumping. */
245 reg_syntax_t re_syntax_options;
248 /* Specify the precise syntax of regexps for compilation. This provides
249 for compatibility for various utilities which historically have
250 different, incompatible syntaxes.
252 The argument SYNTAX is a bit mask comprised of the various bits
253 defined in regex.h. We return the old syntax. */
255 reg_syntax_t
256 re_set_syntax (syntax)
257 reg_syntax_t syntax;
259 reg_syntax_t ret = re_syntax_options;
261 re_syntax_options = syntax;
262 return ret;
264 #ifdef _LIBC
265 weak_alias (__re_set_syntax, re_set_syntax)
266 #endif
269 re_compile_fastmap (bufp)
270 struct re_pattern_buffer *bufp;
272 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
273 char *fastmap = bufp->fastmap;
275 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
276 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
277 if (dfa->init_state != dfa->init_state_word)
278 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
279 if (dfa->init_state != dfa->init_state_nl)
280 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
281 if (dfa->init_state != dfa->init_state_begbuf)
282 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
283 bufp->fastmap_accurate = 1;
284 return 0;
286 #ifdef _LIBC
287 weak_alias (__re_compile_fastmap, re_compile_fastmap)
288 #endif
290 static inline void
291 __attribute ((always_inline))
292 re_set_fastmap (char *fastmap, int icase, int ch)
294 fastmap[ch] = 1;
295 if (icase)
296 fastmap[tolower (ch)] = 1;
299 /* Helper function for re_compile_fastmap.
300 Compile fastmap for the initial_state INIT_STATE. */
302 static void
303 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
304 char *fastmap)
306 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
307 int node_cnt;
308 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
309 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
311 int node = init_state->nodes.elems[node_cnt];
312 re_token_type_t type = dfa->nodes[node].type;
314 if (type == CHARACTER)
316 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
317 #ifdef RE_ENABLE_I18N
318 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
320 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
321 wchar_t wc;
322 mbstate_t state;
324 p = buf;
325 *p++ = dfa->nodes[node].opr.c;
326 while (++node < dfa->nodes_len
327 && dfa->nodes[node].type == CHARACTER
328 && dfa->nodes[node].mb_partial)
329 *p++ = dfa->nodes[node].opr.c;
330 memset (&state, '\0', sizeof (state));
331 if (__mbrtowc (&wc, (const char *) buf, p - buf,
332 &state) == p - buf
333 && (__wcrtomb ((char *) buf, towlower (wc), &state)
334 != (size_t) -1))
335 re_set_fastmap (fastmap, 0, buf[0]);
337 #endif
339 else if (type == SIMPLE_BRACKET)
341 int i, ch;
342 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
344 int j;
345 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
346 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
347 if (w & ((bitset_word_t) 1 << j))
348 re_set_fastmap (fastmap, icase, ch);
351 #ifdef RE_ENABLE_I18N
352 else if (type == COMPLEX_BRACKET)
354 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
355 int i;
357 # ifdef _LIBC
358 /* See if we have to try all bytes which start multiple collation
359 elements.
360 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
361 collation element, and don't catch 'b' since 'b' is
362 the only collation element which starts from 'b' (and
363 it is caught by SIMPLE_BRACKET). */
364 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
365 && (cset->ncoll_syms || cset->nranges))
367 const int32_t *table = (const int32_t *)
368 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
369 for (i = 0; i < SBC_MAX; ++i)
370 if (table[i] < 0)
371 re_set_fastmap (fastmap, icase, i);
373 # endif /* _LIBC */
375 /* See if we have to start the match at all multibyte characters,
376 i.e. where we would not find an invalid sequence. This only
377 applies to multibyte character sets; for single byte character
378 sets, the SIMPLE_BRACKET again suffices. */
379 if (dfa->mb_cur_max > 1
380 && (cset->nchar_classes || cset->non_match
381 # ifdef _LIBC
382 || cset->nequiv_classes
383 # endif /* _LIBC */
386 unsigned char c = 0;
389 mbstate_t mbs;
390 memset (&mbs, 0, sizeof (mbs));
391 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
392 re_set_fastmap (fastmap, false, (int) c);
394 while (++c != 0);
397 else
399 /* ... Else catch all bytes which can start the mbchars. */
400 for (i = 0; i < cset->nmbchars; ++i)
402 char buf[256];
403 mbstate_t state;
404 memset (&state, '\0', sizeof (state));
405 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
406 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
407 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
409 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
410 != (size_t) -1)
411 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
416 #endif /* RE_ENABLE_I18N */
417 else if (type == OP_PERIOD
418 #ifdef RE_ENABLE_I18N
419 || type == OP_UTF8_PERIOD
420 #endif /* RE_ENABLE_I18N */
421 || type == END_OF_RE)
423 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
424 if (type == END_OF_RE)
425 bufp->can_be_null = 1;
426 return;
431 /* Entry point for POSIX code. */
432 /* regcomp takes a regular expression as a string and compiles it.
434 PREG is a regex_t *. We do not expect any fields to be initialized,
435 since POSIX says we shouldn't. Thus, we set
437 `buffer' to the compiled pattern;
438 `used' to the length of the compiled pattern;
439 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
440 REG_EXTENDED bit in CFLAGS is set; otherwise, to
441 RE_SYNTAX_POSIX_BASIC;
442 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
443 `fastmap' to an allocated space for the fastmap;
444 `fastmap_accurate' to zero;
445 `re_nsub' to the number of subexpressions in PATTERN.
447 PATTERN is the address of the pattern string.
449 CFLAGS is a series of bits which affect compilation.
451 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
452 use POSIX basic syntax.
454 If REG_NEWLINE is set, then . and [^...] don't match newline.
455 Also, regexec will try a match beginning after every newline.
457 If REG_ICASE is set, then we considers upper- and lowercase
458 versions of letters to be equivalent when matching.
460 If REG_NOSUB is set, then when PREG is passed to regexec, that
461 routine will report only success or failure, and nothing about the
462 registers.
464 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
465 the return codes and their meanings.) */
468 regcomp (preg, pattern, cflags)
469 regex_t *__restrict preg;
470 const char *__restrict pattern;
471 int cflags;
473 reg_errcode_t ret;
474 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
475 : RE_SYNTAX_POSIX_BASIC);
477 preg->buffer = NULL;
478 preg->allocated = 0;
479 preg->used = 0;
481 /* Try to allocate space for the fastmap. */
482 preg->fastmap = re_malloc (char, SBC_MAX);
483 if (BE (preg->fastmap == NULL, 0))
484 return REG_ESPACE;
486 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
488 /* If REG_NEWLINE is set, newlines are treated differently. */
489 if (cflags & REG_NEWLINE)
490 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
491 syntax &= ~RE_DOT_NEWLINE;
492 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
493 /* It also changes the matching behavior. */
494 preg->newline_anchor = 1;
496 else
497 preg->newline_anchor = 0;
498 preg->no_sub = !!(cflags & REG_NOSUB);
499 preg->translate = NULL;
501 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
503 /* POSIX doesn't distinguish between an unmatched open-group and an
504 unmatched close-group: both are REG_EPAREN. */
505 if (ret == REG_ERPAREN)
506 ret = REG_EPAREN;
508 /* We have already checked preg->fastmap != NULL. */
509 if (BE (ret == REG_NOERROR, 1))
510 /* Compute the fastmap now, since regexec cannot modify the pattern
511 buffer. This function never fails in this implementation. */
512 (void) re_compile_fastmap (preg);
513 else
515 /* Some error occurred while compiling the expression. */
516 re_free (preg->fastmap);
517 preg->fastmap = NULL;
520 return (int) ret;
522 #ifdef _LIBC
523 weak_alias (__regcomp, regcomp)
524 #endif
526 /* Returns a message corresponding to an error code, ERRCODE, returned
527 from either regcomp or regexec. We don't use PREG here. */
529 size_t
530 regerror (errcode, preg, errbuf, errbuf_size)
531 int errcode;
532 const regex_t *__restrict preg;
533 char *__restrict errbuf;
534 size_t errbuf_size;
536 const char *msg;
537 size_t msg_size;
539 if (BE (errcode < 0
540 || errcode >= (int) (sizeof (__re_error_msgid_idx)
541 / sizeof (__re_error_msgid_idx[0])), 0))
542 /* Only error codes returned by the rest of the code should be passed
543 to this routine. If we are given anything else, or if other regex
544 code generates an invalid error code, then the program has a bug.
545 Dump core so we can fix it. */
546 abort ();
548 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
550 msg_size = strlen (msg) + 1; /* Includes the null. */
552 if (BE (errbuf_size != 0, 1))
554 if (BE (msg_size > errbuf_size, 0))
556 #if defined HAVE_MEMPCPY || defined _LIBC
557 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
558 #else
559 memcpy (errbuf, msg, errbuf_size - 1);
560 errbuf[errbuf_size - 1] = 0;
561 #endif
563 else
564 memcpy (errbuf, msg, msg_size);
567 return msg_size;
569 #ifdef _LIBC
570 weak_alias (__regerror, regerror)
571 #endif
574 #ifdef RE_ENABLE_I18N
575 /* This static array is used for the map to single-byte characters when
576 UTF-8 is used. Otherwise we would allocate memory just to initialize
577 it the same all the time. UTF-8 is the preferred encoding so this is
578 a worthwhile optimization. */
579 static const bitset_t utf8_sb_map =
581 /* Set the first 128 bits. */
582 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
584 #endif
587 static void
588 free_dfa_content (re_dfa_t *dfa)
590 int i, j;
592 if (dfa->nodes)
593 for (i = 0; i < dfa->nodes_len; ++i)
594 free_token (dfa->nodes + i);
595 re_free (dfa->nexts);
596 for (i = 0; i < dfa->nodes_len; ++i)
598 if (dfa->eclosures != NULL)
599 re_node_set_free (dfa->eclosures + i);
600 if (dfa->inveclosures != NULL)
601 re_node_set_free (dfa->inveclosures + i);
602 if (dfa->edests != NULL)
603 re_node_set_free (dfa->edests + i);
605 re_free (dfa->edests);
606 re_free (dfa->eclosures);
607 re_free (dfa->inveclosures);
608 re_free (dfa->nodes);
610 if (dfa->state_table)
611 for (i = 0; i <= dfa->state_hash_mask; ++i)
613 struct re_state_table_entry *entry = dfa->state_table + i;
614 for (j = 0; j < entry->num; ++j)
616 re_dfastate_t *state = entry->array[j];
617 free_state (state);
619 re_free (entry->array);
621 re_free (dfa->state_table);
622 #ifdef RE_ENABLE_I18N
623 if (dfa->sb_char != utf8_sb_map)
624 re_free (dfa->sb_char);
625 #endif
626 re_free (dfa->subexp_map);
627 #ifdef DEBUG
628 re_free (dfa->re_str);
629 #endif
631 re_free (dfa);
635 /* Free dynamically allocated space used by PREG. */
637 void
638 regfree (preg)
639 regex_t *preg;
641 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
642 if (BE (dfa != NULL, 1))
643 free_dfa_content (dfa);
644 preg->buffer = NULL;
645 preg->allocated = 0;
647 re_free (preg->fastmap);
648 preg->fastmap = NULL;
650 re_free (preg->translate);
651 preg->translate = NULL;
653 #ifdef _LIBC
654 weak_alias (__regfree, regfree)
655 #endif
657 /* Entry points compatible with 4.2 BSD regex library. We don't define
658 them unless specifically requested. */
660 #if defined _REGEX_RE_COMP || defined _LIBC
662 /* BSD has one and only one pattern buffer. */
663 static struct re_pattern_buffer re_comp_buf;
665 char *
666 # ifdef _LIBC
667 /* Make these definitions weak in libc, so POSIX programs can redefine
668 these names if they don't use our functions, and still use
669 regcomp/regexec above without link errors. */
670 weak_function
671 # endif
672 re_comp (s)
673 const char *s;
675 reg_errcode_t ret;
676 char *fastmap;
678 if (!s)
680 if (!re_comp_buf.buffer)
681 return gettext ("No previous regular expression");
682 return 0;
685 if (re_comp_buf.buffer)
687 fastmap = re_comp_buf.fastmap;
688 re_comp_buf.fastmap = NULL;
689 __regfree (&re_comp_buf);
690 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
691 re_comp_buf.fastmap = fastmap;
694 if (re_comp_buf.fastmap == NULL)
696 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
697 if (re_comp_buf.fastmap == NULL)
698 return (char *) gettext (__re_error_msgid
699 + __re_error_msgid_idx[(int) REG_ESPACE]);
702 /* Since `re_exec' always passes NULL for the `regs' argument, we
703 don't need to initialize the pattern buffer fields which affect it. */
705 /* Match anchors at newlines. */
706 re_comp_buf.newline_anchor = 1;
708 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
710 if (!ret)
711 return NULL;
713 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
714 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
717 #ifdef _LIBC
718 libc_freeres_fn (free_mem)
720 __regfree (&re_comp_buf);
722 #endif
724 #endif /* _REGEX_RE_COMP */
726 /* Internal entry point.
727 Compile the regular expression PATTERN, whose length is LENGTH.
728 SYNTAX indicate regular expression's syntax. */
730 static reg_errcode_t
731 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
732 reg_syntax_t syntax)
734 reg_errcode_t err = REG_NOERROR;
735 re_dfa_t *dfa;
736 re_string_t regexp;
738 /* Initialize the pattern buffer. */
739 preg->fastmap_accurate = 0;
740 preg->syntax = syntax;
741 preg->not_bol = preg->not_eol = 0;
742 preg->used = 0;
743 preg->re_nsub = 0;
744 preg->can_be_null = 0;
745 preg->regs_allocated = REGS_UNALLOCATED;
747 /* Initialize the dfa. */
748 dfa = (re_dfa_t *) preg->buffer;
749 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
751 /* If zero allocated, but buffer is non-null, try to realloc
752 enough space. This loses if buffer's address is bogus, but
753 that is the user's responsibility. If ->buffer is NULL this
754 is a simple allocation. */
755 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
756 if (dfa == NULL)
757 return REG_ESPACE;
758 preg->allocated = sizeof (re_dfa_t);
759 preg->buffer = (unsigned char *) dfa;
761 preg->used = sizeof (re_dfa_t);
763 err = init_dfa (dfa, length);
764 if (BE (err != REG_NOERROR, 0))
766 free_dfa_content (dfa);
767 preg->buffer = NULL;
768 preg->allocated = 0;
769 return err;
771 #ifdef DEBUG
772 /* Note: length+1 will not overflow since it is checked in init_dfa. */
773 dfa->re_str = re_malloc (char, length + 1);
774 strncpy (dfa->re_str, pattern, length + 1);
775 #endif
777 __libc_lock_init (dfa->lock);
779 err = re_string_construct (&regexp, pattern, length, preg->translate,
780 syntax & RE_ICASE, dfa);
781 if (BE (err != REG_NOERROR, 0))
783 re_compile_internal_free_return:
784 free_workarea_compile (preg);
785 re_string_destruct (&regexp);
786 free_dfa_content (dfa);
787 preg->buffer = NULL;
788 preg->allocated = 0;
789 return err;
792 /* Parse the regular expression, and build a structure tree. */
793 preg->re_nsub = 0;
794 dfa->str_tree = parse (&regexp, preg, syntax, &err);
795 if (BE (dfa->str_tree == NULL, 0))
796 goto re_compile_internal_free_return;
798 /* Analyze the tree and create the nfa. */
799 err = analyze (preg);
800 if (BE (err != REG_NOERROR, 0))
801 goto re_compile_internal_free_return;
803 #ifdef RE_ENABLE_I18N
804 /* If possible, do searching in single byte encoding to speed things up. */
805 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
806 optimize_utf8 (dfa);
807 #endif
809 /* Then create the initial state of the dfa. */
810 err = create_initial_state (dfa);
812 /* Release work areas. */
813 free_workarea_compile (preg);
814 re_string_destruct (&regexp);
816 if (BE (err != REG_NOERROR, 0))
818 free_dfa_content (dfa);
819 preg->buffer = NULL;
820 preg->allocated = 0;
823 return err;
826 /* Initialize DFA. We use the length of the regular expression PAT_LEN
827 as the initial length of some arrays. */
829 static reg_errcode_t
830 init_dfa (re_dfa_t *dfa, size_t pat_len)
832 unsigned int table_size;
833 #ifndef _LIBC
834 char *codeset_name;
835 #endif
837 memset (dfa, '\0', sizeof (re_dfa_t));
839 /* Force allocation of str_tree_storage the first time. */
840 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
842 /* Avoid overflows. */
843 if (pat_len == SIZE_MAX)
844 return REG_ESPACE;
846 dfa->nodes_alloc = pat_len + 1;
847 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
849 /* table_size = 2 ^ ceil(log pat_len) */
850 for (table_size = 1; ; table_size <<= 1)
851 if (table_size > pat_len)
852 break;
854 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
855 dfa->state_hash_mask = table_size - 1;
857 dfa->mb_cur_max = MB_CUR_MAX;
858 #ifdef _LIBC
859 if (dfa->mb_cur_max == 6
860 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
861 dfa->is_utf8 = 1;
862 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
863 != 0);
864 #else
865 # ifdef HAVE_LANGINFO_CODESET
866 codeset_name = nl_langinfo (CODESET);
867 # else
868 codeset_name = getenv ("LC_ALL");
869 if (codeset_name == NULL || codeset_name[0] == '\0')
870 codeset_name = getenv ("LC_CTYPE");
871 if (codeset_name == NULL || codeset_name[0] == '\0')
872 codeset_name = getenv ("LANG");
873 if (codeset_name == NULL)
874 codeset_name = "";
875 else if (strchr (codeset_name, '.') != NULL)
876 codeset_name = strchr (codeset_name, '.') + 1;
877 # endif
879 if (strcasecmp (codeset_name, "UTF-8") == 0
880 || strcasecmp (codeset_name, "UTF8") == 0)
881 dfa->is_utf8 = 1;
883 /* We check exhaustively in the loop below if this charset is a
884 superset of ASCII. */
885 dfa->map_notascii = 0;
886 #endif
888 #ifdef RE_ENABLE_I18N
889 if (dfa->mb_cur_max > 1)
891 if (dfa->is_utf8)
892 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
893 else
895 int i, j, ch;
897 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
898 if (BE (dfa->sb_char == NULL, 0))
899 return REG_ESPACE;
901 /* Set the bits corresponding to single byte chars. */
902 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
903 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
905 wint_t wch = __btowc (ch);
906 if (wch != WEOF)
907 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
908 # ifndef _LIBC
909 if (isascii (ch) && wch != ch)
910 dfa->map_notascii = 1;
911 # endif
915 #endif
917 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
918 return REG_ESPACE;
919 return REG_NOERROR;
922 /* Initialize WORD_CHAR table, which indicate which character is
923 "word". In this case "word" means that it is the word construction
924 character used by some operators like "\<", "\>", etc. */
926 static void
927 internal_function
928 init_word_char (re_dfa_t *dfa)
930 int i, j, ch;
931 dfa->word_ops_used = 1;
932 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
933 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
934 if (isalnum (ch) || ch == '_')
935 dfa->word_char[i] |= (bitset_word_t) 1 << j;
938 /* Free the work area which are only used while compiling. */
940 static void
941 free_workarea_compile (regex_t *preg)
943 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
944 bin_tree_storage_t *storage, *next;
945 for (storage = dfa->str_tree_storage; storage; storage = next)
947 next = storage->next;
948 re_free (storage);
950 dfa->str_tree_storage = NULL;
951 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
952 dfa->str_tree = NULL;
953 re_free (dfa->org_indices);
954 dfa->org_indices = NULL;
957 /* Create initial states for all contexts. */
959 static reg_errcode_t
960 create_initial_state (re_dfa_t *dfa)
962 int first, i;
963 reg_errcode_t err;
964 re_node_set init_nodes;
966 /* Initial states have the epsilon closure of the node which is
967 the first node of the regular expression. */
968 first = dfa->str_tree->first->node_idx;
969 dfa->init_node = first;
970 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
971 if (BE (err != REG_NOERROR, 0))
972 return err;
974 /* The back-references which are in initial states can epsilon transit,
975 since in this case all of the subexpressions can be null.
976 Then we add epsilon closures of the nodes which are the next nodes of
977 the back-references. */
978 if (dfa->nbackref > 0)
979 for (i = 0; i < init_nodes.nelem; ++i)
981 int node_idx = init_nodes.elems[i];
982 re_token_type_t type = dfa->nodes[node_idx].type;
984 int clexp_idx;
985 if (type != OP_BACK_REF)
986 continue;
987 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
989 re_token_t *clexp_node;
990 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
991 if (clexp_node->type == OP_CLOSE_SUBEXP
992 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
993 break;
995 if (clexp_idx == init_nodes.nelem)
996 continue;
998 if (type == OP_BACK_REF)
1000 int dest_idx = dfa->edests[node_idx].elems[0];
1001 if (!re_node_set_contains (&init_nodes, dest_idx))
1003 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1004 i = 0;
1009 /* It must be the first time to invoke acquire_state. */
1010 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1011 /* We don't check ERR here, since the initial state must not be NULL. */
1012 if (BE (dfa->init_state == NULL, 0))
1013 return err;
1014 if (dfa->init_state->has_constraint)
1016 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1017 CONTEXT_WORD);
1018 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1019 CONTEXT_NEWLINE);
1020 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1021 &init_nodes,
1022 CONTEXT_NEWLINE
1023 | CONTEXT_BEGBUF);
1024 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1025 || dfa->init_state_begbuf == NULL, 0))
1026 return err;
1028 else
1029 dfa->init_state_word = dfa->init_state_nl
1030 = dfa->init_state_begbuf = dfa->init_state;
1032 re_node_set_free (&init_nodes);
1033 return REG_NOERROR;
1036 #ifdef RE_ENABLE_I18N
1037 /* If it is possible to do searching in single byte encoding instead of UTF-8
1038 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1039 DFA nodes where needed. */
1041 static void
1042 optimize_utf8 (re_dfa_t *dfa)
1044 int node, i, mb_chars = 0, has_period = 0;
1046 for (node = 0; node < dfa->nodes_len; ++node)
1047 switch (dfa->nodes[node].type)
1049 case CHARACTER:
1050 if (dfa->nodes[node].opr.c >= 0x80)
1051 mb_chars = 1;
1052 break;
1053 case ANCHOR:
1054 switch (dfa->nodes[node].opr.ctx_type)
1056 case LINE_FIRST:
1057 case LINE_LAST:
1058 case BUF_FIRST:
1059 case BUF_LAST:
1060 break;
1061 default:
1062 /* Word anchors etc. cannot be handled. It's okay to test
1063 opr.ctx_type since constraints (for all DFA nodes) are
1064 created by ORing one or more opr.ctx_type values. */
1065 return;
1067 break;
1068 case OP_PERIOD:
1069 has_period = 1;
1070 break;
1071 case OP_BACK_REF:
1072 case OP_ALT:
1073 case END_OF_RE:
1074 case OP_DUP_ASTERISK:
1075 case OP_OPEN_SUBEXP:
1076 case OP_CLOSE_SUBEXP:
1077 break;
1078 case COMPLEX_BRACKET:
1079 return;
1080 case SIMPLE_BRACKET:
1081 /* Just double check. The non-ASCII range starts at 0x80. */
1082 assert (0x80 % BITSET_WORD_BITS == 0);
1083 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1084 if (dfa->nodes[node].opr.sbcset[i])
1085 return;
1086 break;
1087 default:
1088 abort ();
1091 if (mb_chars || has_period)
1092 for (node = 0; node < dfa->nodes_len; ++node)
1094 if (dfa->nodes[node].type == CHARACTER
1095 && dfa->nodes[node].opr.c >= 0x80)
1096 dfa->nodes[node].mb_partial = 0;
1097 else if (dfa->nodes[node].type == OP_PERIOD)
1098 dfa->nodes[node].type = OP_UTF8_PERIOD;
1101 /* The search can be in single byte locale. */
1102 dfa->mb_cur_max = 1;
1103 dfa->is_utf8 = 0;
1104 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1106 #endif
1108 /* Analyze the structure tree, and calculate "first", "next", "edest",
1109 "eclosure", and "inveclosure". */
1111 static reg_errcode_t
1112 analyze (regex_t *preg)
1114 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1115 reg_errcode_t ret;
1117 /* Allocate arrays. */
1118 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1119 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1120 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1121 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1122 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1123 || dfa->eclosures == NULL, 0))
1124 return REG_ESPACE;
1126 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1127 if (dfa->subexp_map != NULL)
1129 int i;
1130 for (i = 0; i < preg->re_nsub; i++)
1131 dfa->subexp_map[i] = i;
1132 preorder (dfa->str_tree, optimize_subexps, dfa);
1133 for (i = 0; i < preg->re_nsub; i++)
1134 if (dfa->subexp_map[i] != i)
1135 break;
1136 if (i == preg->re_nsub)
1138 free (dfa->subexp_map);
1139 dfa->subexp_map = NULL;
1143 ret = postorder (dfa->str_tree, lower_subexps, preg);
1144 if (BE (ret != REG_NOERROR, 0))
1145 return ret;
1146 ret = postorder (dfa->str_tree, calc_first, dfa);
1147 if (BE (ret != REG_NOERROR, 0))
1148 return ret;
1149 preorder (dfa->str_tree, calc_next, dfa);
1150 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1151 if (BE (ret != REG_NOERROR, 0))
1152 return ret;
1153 ret = calc_eclosure (dfa);
1154 if (BE (ret != REG_NOERROR, 0))
1155 return ret;
1157 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1158 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1159 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1160 || dfa->nbackref)
1162 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1163 if (BE (dfa->inveclosures == NULL, 0))
1164 return REG_ESPACE;
1165 ret = calc_inveclosure (dfa);
1168 return ret;
1171 /* Our parse trees are very unbalanced, so we cannot use a stack to
1172 implement parse tree visits. Instead, we use parent pointers and
1173 some hairy code in these two functions. */
1174 static reg_errcode_t
1175 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1176 void *extra)
1178 bin_tree_t *node, *prev;
1180 for (node = root; ; )
1182 /* Descend down the tree, preferably to the left (or to the right
1183 if that's the only child). */
1184 while (node->left || node->right)
1185 if (node->left)
1186 node = node->left;
1187 else
1188 node = node->right;
1192 reg_errcode_t err = fn (extra, node);
1193 if (BE (err != REG_NOERROR, 0))
1194 return err;
1195 if (node->parent == NULL)
1196 return REG_NOERROR;
1197 prev = node;
1198 node = node->parent;
1200 /* Go up while we have a node that is reached from the right. */
1201 while (node->right == prev || node->right == NULL);
1202 node = node->right;
1206 static reg_errcode_t
1207 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1208 void *extra)
1210 bin_tree_t *node;
1212 for (node = root; ; )
1214 reg_errcode_t err = fn (extra, node);
1215 if (BE (err != REG_NOERROR, 0))
1216 return err;
1218 /* Go to the left node, or up and to the right. */
1219 if (node->left)
1220 node = node->left;
1221 else
1223 bin_tree_t *prev = NULL;
1224 while (node->right == prev || node->right == NULL)
1226 prev = node;
1227 node = node->parent;
1228 if (!node)
1229 return REG_NOERROR;
1231 node = node->right;
1236 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1237 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1238 backreferences as well. Requires a preorder visit. */
1239 static reg_errcode_t
1240 optimize_subexps (void *extra, bin_tree_t *node)
1242 re_dfa_t *dfa = (re_dfa_t *) extra;
1244 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1246 int idx = node->token.opr.idx;
1247 node->token.opr.idx = dfa->subexp_map[idx];
1248 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1251 else if (node->token.type == SUBEXP
1252 && node->left && node->left->token.type == SUBEXP)
1254 int other_idx = node->left->token.opr.idx;
1256 node->left = node->left->left;
1257 if (node->left)
1258 node->left->parent = node;
1260 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1261 if (other_idx < BITSET_WORD_BITS)
1262 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1265 return REG_NOERROR;
1268 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1269 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1270 static reg_errcode_t
1271 lower_subexps (void *extra, bin_tree_t *node)
1273 regex_t *preg = (regex_t *) extra;
1274 reg_errcode_t err = REG_NOERROR;
1276 if (node->left && node->left->token.type == SUBEXP)
1278 node->left = lower_subexp (&err, preg, node->left);
1279 if (node->left)
1280 node->left->parent = node;
1282 if (node->right && node->right->token.type == SUBEXP)
1284 node->right = lower_subexp (&err, preg, node->right);
1285 if (node->right)
1286 node->right->parent = node;
1289 return err;
1292 static bin_tree_t *
1293 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1295 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1296 bin_tree_t *body = node->left;
1297 bin_tree_t *op, *cls, *tree1, *tree;
1299 if (preg->no_sub
1300 /* We do not optimize empty subexpressions, because otherwise we may
1301 have bad CONCAT nodes with NULL children. This is obviously not
1302 very common, so we do not lose much. An example that triggers
1303 this case is the sed "script" /\(\)/x. */
1304 && node->left != NULL
1305 && (node->token.opr.idx >= BITSET_WORD_BITS
1306 || !(dfa->used_bkref_map
1307 & ((bitset_word_t) 1 << node->token.opr.idx))))
1308 return node->left;
1310 /* Convert the SUBEXP node to the concatenation of an
1311 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1312 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1313 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1314 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1315 tree = create_tree (dfa, op, tree1, CONCAT);
1316 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1318 *err = REG_ESPACE;
1319 return NULL;
1322 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1323 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1324 return tree;
1327 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1328 nodes. Requires a postorder visit. */
1329 static reg_errcode_t
1330 calc_first (void *extra, bin_tree_t *node)
1332 re_dfa_t *dfa = (re_dfa_t *) extra;
1333 if (node->token.type == CONCAT)
1335 node->first = node->left->first;
1336 node->node_idx = node->left->node_idx;
1338 else
1340 node->first = node;
1341 node->node_idx = re_dfa_add_node (dfa, node->token);
1342 if (BE (node->node_idx == -1, 0))
1343 return REG_ESPACE;
1344 if (node->token.type == ANCHOR)
1345 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1347 return REG_NOERROR;
1350 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1351 static reg_errcode_t
1352 calc_next (void *extra, bin_tree_t *node)
1354 switch (node->token.type)
1356 case OP_DUP_ASTERISK:
1357 node->left->next = node;
1358 break;
1359 case CONCAT:
1360 node->left->next = node->right->first;
1361 node->right->next = node->next;
1362 break;
1363 default:
1364 if (node->left)
1365 node->left->next = node->next;
1366 if (node->right)
1367 node->right->next = node->next;
1368 break;
1370 return REG_NOERROR;
1373 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1374 static reg_errcode_t
1375 link_nfa_nodes (void *extra, bin_tree_t *node)
1377 re_dfa_t *dfa = (re_dfa_t *) extra;
1378 int idx = node->node_idx;
1379 reg_errcode_t err = REG_NOERROR;
1381 switch (node->token.type)
1383 case CONCAT:
1384 break;
1386 case END_OF_RE:
1387 assert (node->next == NULL);
1388 break;
1390 case OP_DUP_ASTERISK:
1391 case OP_ALT:
1393 int left, right;
1394 dfa->has_plural_match = 1;
1395 if (node->left != NULL)
1396 left = node->left->first->node_idx;
1397 else
1398 left = node->next->node_idx;
1399 if (node->right != NULL)
1400 right = node->right->first->node_idx;
1401 else
1402 right = node->next->node_idx;
1403 assert (left > -1);
1404 assert (right > -1);
1405 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1407 break;
1409 case ANCHOR:
1410 case OP_OPEN_SUBEXP:
1411 case OP_CLOSE_SUBEXP:
1412 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1413 break;
1415 case OP_BACK_REF:
1416 dfa->nexts[idx] = node->next->node_idx;
1417 if (node->token.type == OP_BACK_REF)
1418 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1419 break;
1421 default:
1422 assert (!IS_EPSILON_NODE (node->token.type));
1423 dfa->nexts[idx] = node->next->node_idx;
1424 break;
1427 return err;
1430 /* Duplicate the epsilon closure of the node ROOT_NODE.
1431 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1432 to their own constraint. */
1434 static reg_errcode_t
1435 internal_function
1436 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1437 int root_node, unsigned int init_constraint)
1439 int org_node, clone_node, ret;
1440 unsigned int constraint = init_constraint;
1441 for (org_node = top_org_node, clone_node = top_clone_node;;)
1443 int org_dest, clone_dest;
1444 if (dfa->nodes[org_node].type == OP_BACK_REF)
1446 /* If the back reference epsilon-transit, its destination must
1447 also have the constraint. Then duplicate the epsilon closure
1448 of the destination of the back reference, and store it in
1449 edests of the back reference. */
1450 org_dest = dfa->nexts[org_node];
1451 re_node_set_empty (dfa->edests + clone_node);
1452 clone_dest = duplicate_node (dfa, org_dest, constraint);
1453 if (BE (clone_dest == -1, 0))
1454 return REG_ESPACE;
1455 dfa->nexts[clone_node] = dfa->nexts[org_node];
1456 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1457 if (BE (ret < 0, 0))
1458 return REG_ESPACE;
1460 else if (dfa->edests[org_node].nelem == 0)
1462 /* In case of the node can't epsilon-transit, don't duplicate the
1463 destination and store the original destination as the
1464 destination of the node. */
1465 dfa->nexts[clone_node] = dfa->nexts[org_node];
1466 break;
1468 else if (dfa->edests[org_node].nelem == 1)
1470 /* In case of the node can epsilon-transit, and it has only one
1471 destination. */
1472 org_dest = dfa->edests[org_node].elems[0];
1473 re_node_set_empty (dfa->edests + clone_node);
1474 /* If the node is root_node itself, it means the epsilon clsoure
1475 has a loop. Then tie it to the destination of the root_node. */
1476 if (org_node == root_node && clone_node != org_node)
1478 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1479 if (BE (ret < 0, 0))
1480 return REG_ESPACE;
1481 break;
1483 /* In case of the node has another constraint, add it. */
1484 constraint |= dfa->nodes[org_node].constraint;
1485 clone_dest = duplicate_node (dfa, org_dest, constraint);
1486 if (BE (clone_dest == -1, 0))
1487 return REG_ESPACE;
1488 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1489 if (BE (ret < 0, 0))
1490 return REG_ESPACE;
1492 else /* dfa->edests[org_node].nelem == 2 */
1494 /* In case of the node can epsilon-transit, and it has two
1495 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1496 org_dest = dfa->edests[org_node].elems[0];
1497 re_node_set_empty (dfa->edests + clone_node);
1498 /* Search for a duplicated node which satisfies the constraint. */
1499 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1500 if (clone_dest == -1)
1502 /* There is no such duplicated node, create a new one. */
1503 reg_errcode_t err;
1504 clone_dest = duplicate_node (dfa, org_dest, constraint);
1505 if (BE (clone_dest == -1, 0))
1506 return REG_ESPACE;
1507 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1508 if (BE (ret < 0, 0))
1509 return REG_ESPACE;
1510 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1511 root_node, constraint);
1512 if (BE (err != REG_NOERROR, 0))
1513 return err;
1515 else
1517 /* There is a duplicated node which satisfies the constraint,
1518 use it to avoid infinite loop. */
1519 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1520 if (BE (ret < 0, 0))
1521 return REG_ESPACE;
1524 org_dest = dfa->edests[org_node].elems[1];
1525 clone_dest = duplicate_node (dfa, org_dest, constraint);
1526 if (BE (clone_dest == -1, 0))
1527 return REG_ESPACE;
1528 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1529 if (BE (ret < 0, 0))
1530 return REG_ESPACE;
1532 org_node = org_dest;
1533 clone_node = clone_dest;
1535 return REG_NOERROR;
1538 /* Search for a node which is duplicated from the node ORG_NODE, and
1539 satisfies the constraint CONSTRAINT. */
1541 static int
1542 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1543 unsigned int constraint)
1545 int idx;
1546 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1548 if (org_node == dfa->org_indices[idx]
1549 && constraint == dfa->nodes[idx].constraint)
1550 return idx; /* Found. */
1552 return -1; /* Not found. */
1555 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1556 Return the index of the new node, or -1 if insufficient storage is
1557 available. */
1559 static int
1560 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1562 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1563 if (BE (dup_idx != -1, 1))
1565 dfa->nodes[dup_idx].constraint = constraint;
1566 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1567 dfa->nodes[dup_idx].duplicated = 1;
1569 /* Store the index of the original node. */
1570 dfa->org_indices[dup_idx] = org_idx;
1572 return dup_idx;
1575 static reg_errcode_t
1576 calc_inveclosure (re_dfa_t *dfa)
1578 int src, idx, ret;
1579 for (idx = 0; idx < dfa->nodes_len; ++idx)
1580 re_node_set_init_empty (dfa->inveclosures + idx);
1582 for (src = 0; src < dfa->nodes_len; ++src)
1584 int *elems = dfa->eclosures[src].elems;
1585 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1587 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1588 if (BE (ret == -1, 0))
1589 return REG_ESPACE;
1593 return REG_NOERROR;
1596 /* Calculate "eclosure" for all the node in DFA. */
1598 static reg_errcode_t
1599 calc_eclosure (re_dfa_t *dfa)
1601 int node_idx, incomplete;
1602 #ifdef DEBUG
1603 assert (dfa->nodes_len > 0);
1604 #endif
1605 incomplete = 0;
1606 /* For each nodes, calculate epsilon closure. */
1607 for (node_idx = 0; ; ++node_idx)
1609 reg_errcode_t err;
1610 re_node_set eclosure_elem;
1611 if (node_idx == dfa->nodes_len)
1613 if (!incomplete)
1614 break;
1615 incomplete = 0;
1616 node_idx = 0;
1619 #ifdef DEBUG
1620 assert (dfa->eclosures[node_idx].nelem != -1);
1621 #endif
1623 /* If we have already calculated, skip it. */
1624 if (dfa->eclosures[node_idx].nelem != 0)
1625 continue;
1626 /* Calculate epsilon closure of `node_idx'. */
1627 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1628 if (BE (err != REG_NOERROR, 0))
1629 return err;
1631 if (dfa->eclosures[node_idx].nelem == 0)
1633 incomplete = 1;
1634 re_node_set_free (&eclosure_elem);
1637 return REG_NOERROR;
1640 /* Calculate epsilon closure of NODE. */
1642 static reg_errcode_t
1643 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1645 reg_errcode_t err;
1646 int i, incomplete;
1647 re_node_set eclosure;
1648 incomplete = 0;
1649 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1650 if (BE (err != REG_NOERROR, 0))
1651 return err;
1653 /* This indicates that we are calculating this node now.
1654 We reference this value to avoid infinite loop. */
1655 dfa->eclosures[node].nelem = -1;
1657 /* If the current node has constraints, duplicate all nodes
1658 since they must inherit the constraints. */
1659 if (dfa->nodes[node].constraint
1660 && dfa->edests[node].nelem
1661 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1663 err = duplicate_node_closure (dfa, node, node, node,
1664 dfa->nodes[node].constraint);
1665 if (BE (err != REG_NOERROR, 0))
1666 return err;
1669 /* Expand each epsilon destination nodes. */
1670 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1671 for (i = 0; i < dfa->edests[node].nelem; ++i)
1673 re_node_set eclosure_elem;
1674 int edest = dfa->edests[node].elems[i];
1675 /* If calculating the epsilon closure of `edest' is in progress,
1676 return intermediate result. */
1677 if (dfa->eclosures[edest].nelem == -1)
1679 incomplete = 1;
1680 continue;
1682 /* If we haven't calculated the epsilon closure of `edest' yet,
1683 calculate now. Otherwise use calculated epsilon closure. */
1684 if (dfa->eclosures[edest].nelem == 0)
1686 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1687 if (BE (err != REG_NOERROR, 0))
1688 return err;
1690 else
1691 eclosure_elem = dfa->eclosures[edest];
1692 /* Merge the epsilon closure of `edest'. */
1693 re_node_set_merge (&eclosure, &eclosure_elem);
1694 /* If the epsilon closure of `edest' is incomplete,
1695 the epsilon closure of this node is also incomplete. */
1696 if (dfa->eclosures[edest].nelem == 0)
1698 incomplete = 1;
1699 re_node_set_free (&eclosure_elem);
1703 /* Epsilon closures include itself. */
1704 re_node_set_insert (&eclosure, node);
1705 if (incomplete && !root)
1706 dfa->eclosures[node].nelem = 0;
1707 else
1708 dfa->eclosures[node] = eclosure;
1709 *new_set = eclosure;
1710 return REG_NOERROR;
1713 /* Functions for token which are used in the parser. */
1715 /* Fetch a token from INPUT.
1716 We must not use this function inside bracket expressions. */
1718 static void
1719 internal_function
1720 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1722 re_string_skip_bytes (input, peek_token (result, input, syntax));
1725 /* Peek a token from INPUT, and return the length of the token.
1726 We must not use this function inside bracket expressions. */
1728 static int
1729 internal_function
1730 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1732 unsigned char c;
1734 if (re_string_eoi (input))
1736 token->type = END_OF_RE;
1737 return 0;
1740 c = re_string_peek_byte (input, 0);
1741 token->opr.c = c;
1743 token->word_char = 0;
1744 #ifdef RE_ENABLE_I18N
1745 token->mb_partial = 0;
1746 if (input->mb_cur_max > 1 &&
1747 !re_string_first_byte (input, re_string_cur_idx (input)))
1749 token->type = CHARACTER;
1750 token->mb_partial = 1;
1751 return 1;
1753 #endif
1754 if (c == '\\')
1756 unsigned char c2;
1757 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1759 token->type = BACK_SLASH;
1760 return 1;
1763 c2 = re_string_peek_byte_case (input, 1);
1764 token->opr.c = c2;
1765 token->type = CHARACTER;
1766 #ifdef RE_ENABLE_I18N
1767 if (input->mb_cur_max > 1)
1769 wint_t wc = re_string_wchar_at (input,
1770 re_string_cur_idx (input) + 1);
1771 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1773 else
1774 #endif
1775 token->word_char = IS_WORD_CHAR (c2) != 0;
1777 switch (c2)
1779 case '|':
1780 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1781 token->type = OP_ALT;
1782 break;
1783 case '1': case '2': case '3': case '4': case '5':
1784 case '6': case '7': case '8': case '9':
1785 if (!(syntax & RE_NO_BK_REFS))
1787 token->type = OP_BACK_REF;
1788 token->opr.idx = c2 - '1';
1790 break;
1791 case '<':
1792 if (!(syntax & RE_NO_GNU_OPS))
1794 token->type = ANCHOR;
1795 token->opr.ctx_type = WORD_FIRST;
1797 break;
1798 case '>':
1799 if (!(syntax & RE_NO_GNU_OPS))
1801 token->type = ANCHOR;
1802 token->opr.ctx_type = WORD_LAST;
1804 break;
1805 case 'b':
1806 if (!(syntax & RE_NO_GNU_OPS))
1808 token->type = ANCHOR;
1809 token->opr.ctx_type = WORD_DELIM;
1811 break;
1812 case 'B':
1813 if (!(syntax & RE_NO_GNU_OPS))
1815 token->type = ANCHOR;
1816 token->opr.ctx_type = NOT_WORD_DELIM;
1818 break;
1819 case 'w':
1820 if (!(syntax & RE_NO_GNU_OPS))
1821 token->type = OP_WORD;
1822 break;
1823 case 'W':
1824 if (!(syntax & RE_NO_GNU_OPS))
1825 token->type = OP_NOTWORD;
1826 break;
1827 case 's':
1828 if (!(syntax & RE_NO_GNU_OPS))
1829 token->type = OP_SPACE;
1830 break;
1831 case 'S':
1832 if (!(syntax & RE_NO_GNU_OPS))
1833 token->type = OP_NOTSPACE;
1834 break;
1835 case '`':
1836 if (!(syntax & RE_NO_GNU_OPS))
1838 token->type = ANCHOR;
1839 token->opr.ctx_type = BUF_FIRST;
1841 break;
1842 case '\'':
1843 if (!(syntax & RE_NO_GNU_OPS))
1845 token->type = ANCHOR;
1846 token->opr.ctx_type = BUF_LAST;
1848 break;
1849 case '(':
1850 if (!(syntax & RE_NO_BK_PARENS))
1851 token->type = OP_OPEN_SUBEXP;
1852 break;
1853 case ')':
1854 if (!(syntax & RE_NO_BK_PARENS))
1855 token->type = OP_CLOSE_SUBEXP;
1856 break;
1857 case '+':
1858 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1859 token->type = OP_DUP_PLUS;
1860 break;
1861 case '?':
1862 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1863 token->type = OP_DUP_QUESTION;
1864 break;
1865 case '{':
1866 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1867 token->type = OP_OPEN_DUP_NUM;
1868 break;
1869 case '}':
1870 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1871 token->type = OP_CLOSE_DUP_NUM;
1872 break;
1873 default:
1874 break;
1876 return 2;
1879 token->type = CHARACTER;
1880 #ifdef RE_ENABLE_I18N
1881 if (input->mb_cur_max > 1)
1883 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1884 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1886 else
1887 #endif
1888 token->word_char = IS_WORD_CHAR (token->opr.c);
1890 switch (c)
1892 case '\n':
1893 if (syntax & RE_NEWLINE_ALT)
1894 token->type = OP_ALT;
1895 break;
1896 case '|':
1897 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1898 token->type = OP_ALT;
1899 break;
1900 case '*':
1901 token->type = OP_DUP_ASTERISK;
1902 break;
1903 case '+':
1904 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1905 token->type = OP_DUP_PLUS;
1906 break;
1907 case '?':
1908 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1909 token->type = OP_DUP_QUESTION;
1910 break;
1911 case '{':
1912 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1913 token->type = OP_OPEN_DUP_NUM;
1914 break;
1915 case '}':
1916 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1917 token->type = OP_CLOSE_DUP_NUM;
1918 break;
1919 case '(':
1920 if (syntax & RE_NO_BK_PARENS)
1921 token->type = OP_OPEN_SUBEXP;
1922 break;
1923 case ')':
1924 if (syntax & RE_NO_BK_PARENS)
1925 token->type = OP_CLOSE_SUBEXP;
1926 break;
1927 case '[':
1928 token->type = OP_OPEN_BRACKET;
1929 break;
1930 case '.':
1931 token->type = OP_PERIOD;
1932 break;
1933 case '^':
1934 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1935 re_string_cur_idx (input) != 0)
1937 char prev = re_string_peek_byte (input, -1);
1938 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1939 break;
1941 token->type = ANCHOR;
1942 token->opr.ctx_type = LINE_FIRST;
1943 break;
1944 case '$':
1945 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1946 re_string_cur_idx (input) + 1 != re_string_length (input))
1948 re_token_t next;
1949 re_string_skip_bytes (input, 1);
1950 peek_token (&next, input, syntax);
1951 re_string_skip_bytes (input, -1);
1952 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1953 break;
1955 token->type = ANCHOR;
1956 token->opr.ctx_type = LINE_LAST;
1957 break;
1958 default:
1959 break;
1961 return 1;
1964 /* Peek a token from INPUT, and return the length of the token.
1965 We must not use this function out of bracket expressions. */
1967 static int
1968 internal_function
1969 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1971 unsigned char c;
1972 if (re_string_eoi (input))
1974 token->type = END_OF_RE;
1975 return 0;
1977 c = re_string_peek_byte (input, 0);
1978 token->opr.c = c;
1980 #ifdef RE_ENABLE_I18N
1981 if (input->mb_cur_max > 1 &&
1982 !re_string_first_byte (input, re_string_cur_idx (input)))
1984 token->type = CHARACTER;
1985 return 1;
1987 #endif /* RE_ENABLE_I18N */
1989 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1990 && re_string_cur_idx (input) + 1 < re_string_length (input))
1992 /* In this case, '\' escape a character. */
1993 unsigned char c2;
1994 re_string_skip_bytes (input, 1);
1995 c2 = re_string_peek_byte (input, 0);
1996 token->opr.c = c2;
1997 token->type = CHARACTER;
1998 return 1;
2000 if (c == '[') /* '[' is a special char in a bracket exps. */
2002 unsigned char c2;
2003 int token_len;
2004 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2005 c2 = re_string_peek_byte (input, 1);
2006 else
2007 c2 = 0;
2008 token->opr.c = c2;
2009 token_len = 2;
2010 switch (c2)
2012 case '.':
2013 token->type = OP_OPEN_COLL_ELEM;
2014 break;
2015 case '=':
2016 token->type = OP_OPEN_EQUIV_CLASS;
2017 break;
2018 case ':':
2019 if (syntax & RE_CHAR_CLASSES)
2021 token->type = OP_OPEN_CHAR_CLASS;
2022 break;
2024 /* else fall through. */
2025 default:
2026 token->type = CHARACTER;
2027 token->opr.c = c;
2028 token_len = 1;
2029 break;
2031 return token_len;
2033 switch (c)
2035 case '-':
2036 token->type = OP_CHARSET_RANGE;
2037 break;
2038 case ']':
2039 token->type = OP_CLOSE_BRACKET;
2040 break;
2041 case '^':
2042 token->type = OP_NON_MATCH_LIST;
2043 break;
2044 default:
2045 token->type = CHARACTER;
2047 return 1;
2050 /* Functions for parser. */
2052 /* Entry point of the parser.
2053 Parse the regular expression REGEXP and return the structure tree.
2054 If an error is occured, ERR is set by error code, and return NULL.
2055 This function build the following tree, from regular expression <reg_exp>:
2059 <reg_exp> EOR
2061 CAT means concatenation.
2062 EOR means end of regular expression. */
2064 static bin_tree_t *
2065 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2066 reg_errcode_t *err)
2068 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2069 bin_tree_t *tree, *eor, *root;
2070 re_token_t current_token;
2071 dfa->syntax = syntax;
2072 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2073 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2074 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2075 return NULL;
2076 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2077 if (tree != NULL)
2078 root = create_tree (dfa, tree, eor, CONCAT);
2079 else
2080 root = eor;
2081 if (BE (eor == NULL || root == NULL, 0))
2083 *err = REG_ESPACE;
2084 return NULL;
2086 return root;
2089 /* This function build the following tree, from regular expression
2090 <branch1>|<branch2>:
2094 <branch1> <branch2>
2096 ALT means alternative, which represents the operator `|'. */
2098 static bin_tree_t *
2099 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2100 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2102 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2103 bin_tree_t *tree, *branch = NULL;
2104 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2105 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2106 return NULL;
2108 while (token->type == OP_ALT)
2110 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2111 if (token->type != OP_ALT && token->type != END_OF_RE
2112 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2114 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2115 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2116 return NULL;
2118 else
2119 branch = NULL;
2120 tree = create_tree (dfa, tree, branch, OP_ALT);
2121 if (BE (tree == NULL, 0))
2123 *err = REG_ESPACE;
2124 return NULL;
2127 return tree;
2130 /* This function build the following tree, from regular expression
2131 <exp1><exp2>:
2135 <exp1> <exp2>
2137 CAT means concatenation. */
2139 static bin_tree_t *
2140 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2141 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2143 bin_tree_t *tree, *exp;
2144 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2145 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2146 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2147 return NULL;
2149 while (token->type != OP_ALT && token->type != END_OF_RE
2150 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2152 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2153 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2155 return NULL;
2157 if (tree != NULL && exp != NULL)
2159 tree = create_tree (dfa, tree, exp, CONCAT);
2160 if (tree == NULL)
2162 *err = REG_ESPACE;
2163 return NULL;
2166 else if (tree == NULL)
2167 tree = exp;
2168 /* Otherwise exp == NULL, we don't need to create new tree. */
2170 return tree;
2173 /* This function build the following tree, from regular expression a*:
2179 static bin_tree_t *
2180 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2181 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2183 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2184 bin_tree_t *tree;
2185 switch (token->type)
2187 case CHARACTER:
2188 tree = create_token_tree (dfa, NULL, NULL, token);
2189 if (BE (tree == NULL, 0))
2191 *err = REG_ESPACE;
2192 return NULL;
2194 #ifdef RE_ENABLE_I18N
2195 if (dfa->mb_cur_max > 1)
2197 while (!re_string_eoi (regexp)
2198 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2200 bin_tree_t *mbc_remain;
2201 fetch_token (token, regexp, syntax);
2202 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2203 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2204 if (BE (mbc_remain == NULL || tree == NULL, 0))
2206 *err = REG_ESPACE;
2207 return NULL;
2211 #endif
2212 break;
2213 case OP_OPEN_SUBEXP:
2214 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2215 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2216 return NULL;
2217 break;
2218 case OP_OPEN_BRACKET:
2219 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2220 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2221 return NULL;
2222 break;
2223 case OP_BACK_REF:
2224 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2226 *err = REG_ESUBREG;
2227 return NULL;
2229 dfa->used_bkref_map |= 1 << token->opr.idx;
2230 tree = create_token_tree (dfa, NULL, NULL, token);
2231 if (BE (tree == NULL, 0))
2233 *err = REG_ESPACE;
2234 return NULL;
2236 ++dfa->nbackref;
2237 dfa->has_mb_node = 1;
2238 break;
2239 case OP_OPEN_DUP_NUM:
2240 if (syntax & RE_CONTEXT_INVALID_DUP)
2242 *err = REG_BADRPT;
2243 return NULL;
2245 /* FALLTHROUGH */
2246 case OP_DUP_ASTERISK:
2247 case OP_DUP_PLUS:
2248 case OP_DUP_QUESTION:
2249 if (syntax & RE_CONTEXT_INVALID_OPS)
2251 *err = REG_BADRPT;
2252 return NULL;
2254 else if (syntax & RE_CONTEXT_INDEP_OPS)
2256 fetch_token (token, regexp, syntax);
2257 return parse_expression (regexp, preg, token, syntax, nest, err);
2259 /* else fall through */
2260 case OP_CLOSE_SUBEXP:
2261 if ((token->type == OP_CLOSE_SUBEXP) &&
2262 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2264 *err = REG_ERPAREN;
2265 return NULL;
2267 /* else fall through */
2268 case OP_CLOSE_DUP_NUM:
2269 /* We treat it as a normal character. */
2271 /* Then we can these characters as normal characters. */
2272 token->type = CHARACTER;
2273 /* mb_partial and word_char bits should be initialized already
2274 by peek_token. */
2275 tree = create_token_tree (dfa, NULL, NULL, token);
2276 if (BE (tree == NULL, 0))
2278 *err = REG_ESPACE;
2279 return NULL;
2281 break;
2282 case ANCHOR:
2283 if ((token->opr.ctx_type
2284 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2285 && dfa->word_ops_used == 0)
2286 init_word_char (dfa);
2287 if (token->opr.ctx_type == WORD_DELIM
2288 || token->opr.ctx_type == NOT_WORD_DELIM)
2290 bin_tree_t *tree_first, *tree_last;
2291 if (token->opr.ctx_type == WORD_DELIM)
2293 token->opr.ctx_type = WORD_FIRST;
2294 tree_first = create_token_tree (dfa, NULL, NULL, token);
2295 token->opr.ctx_type = WORD_LAST;
2297 else
2299 token->opr.ctx_type = INSIDE_WORD;
2300 tree_first = create_token_tree (dfa, NULL, NULL, token);
2301 token->opr.ctx_type = INSIDE_NOTWORD;
2303 tree_last = create_token_tree (dfa, NULL, NULL, token);
2304 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2305 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2307 *err = REG_ESPACE;
2308 return NULL;
2311 else
2313 tree = create_token_tree (dfa, NULL, NULL, token);
2314 if (BE (tree == NULL, 0))
2316 *err = REG_ESPACE;
2317 return NULL;
2320 /* We must return here, since ANCHORs can't be followed
2321 by repetition operators.
2322 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2323 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2324 fetch_token (token, regexp, syntax);
2325 return tree;
2326 case OP_PERIOD:
2327 tree = create_token_tree (dfa, NULL, NULL, token);
2328 if (BE (tree == NULL, 0))
2330 *err = REG_ESPACE;
2331 return NULL;
2333 if (dfa->mb_cur_max > 1)
2334 dfa->has_mb_node = 1;
2335 break;
2336 case OP_WORD:
2337 case OP_NOTWORD:
2338 tree = build_charclass_op (dfa, regexp->trans,
2339 (const unsigned char *) "alnum",
2340 (const unsigned char *) "_",
2341 token->type == OP_NOTWORD, err);
2342 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2343 return NULL;
2344 break;
2345 case OP_SPACE:
2346 case OP_NOTSPACE:
2347 tree = build_charclass_op (dfa, regexp->trans,
2348 (const unsigned char *) "space",
2349 (const unsigned char *) "",
2350 token->type == OP_NOTSPACE, err);
2351 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2352 return NULL;
2353 break;
2354 case OP_ALT:
2355 case END_OF_RE:
2356 return NULL;
2357 case BACK_SLASH:
2358 *err = REG_EESCAPE;
2359 return NULL;
2360 default:
2361 /* Must not happen? */
2362 #ifdef DEBUG
2363 assert (0);
2364 #endif
2365 return NULL;
2367 fetch_token (token, regexp, syntax);
2369 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2370 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2372 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2373 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2374 return NULL;
2375 /* In BRE consecutive duplications are not allowed. */
2376 if ((syntax & RE_CONTEXT_INVALID_DUP)
2377 && (token->type == OP_DUP_ASTERISK
2378 || token->type == OP_OPEN_DUP_NUM))
2380 *err = REG_BADRPT;
2381 return NULL;
2385 return tree;
2388 /* This function build the following tree, from regular expression
2389 (<reg_exp>):
2390 SUBEXP
2392 <reg_exp>
2395 static bin_tree_t *
2396 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2397 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2399 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2400 bin_tree_t *tree;
2401 size_t cur_nsub;
2402 cur_nsub = preg->re_nsub++;
2404 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2406 /* The subexpression may be a null string. */
2407 if (token->type == OP_CLOSE_SUBEXP)
2408 tree = NULL;
2409 else
2411 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2412 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2413 *err = REG_EPAREN;
2414 if (BE (*err != REG_NOERROR, 0))
2415 return NULL;
2418 if (cur_nsub <= '9' - '1')
2419 dfa->completed_bkref_map |= 1 << cur_nsub;
2421 tree = create_tree (dfa, tree, NULL, SUBEXP);
2422 if (BE (tree == NULL, 0))
2424 *err = REG_ESPACE;
2425 return NULL;
2427 tree->token.opr.idx = cur_nsub;
2428 return tree;
2431 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2433 static bin_tree_t *
2434 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2435 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2437 bin_tree_t *tree = NULL, *old_tree = NULL;
2438 int i, start, end, start_idx = re_string_cur_idx (regexp);
2439 re_token_t start_token = *token;
2441 if (token->type == OP_OPEN_DUP_NUM)
2443 end = 0;
2444 start = fetch_number (regexp, token, syntax);
2445 if (start == -1)
2447 if (token->type == CHARACTER && token->opr.c == ',')
2448 start = 0; /* We treat "{,m}" as "{0,m}". */
2449 else
2451 *err = REG_BADBR; /* <re>{} is invalid. */
2452 return NULL;
2455 if (BE (start != -2, 1))
2457 /* We treat "{n}" as "{n,n}". */
2458 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2459 : ((token->type == CHARACTER && token->opr.c == ',')
2460 ? fetch_number (regexp, token, syntax) : -2));
2462 if (BE (start == -2 || end == -2, 0))
2464 /* Invalid sequence. */
2465 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2467 if (token->type == END_OF_RE)
2468 *err = REG_EBRACE;
2469 else
2470 *err = REG_BADBR;
2472 return NULL;
2475 /* If the syntax bit is set, rollback. */
2476 re_string_set_index (regexp, start_idx);
2477 *token = start_token;
2478 token->type = CHARACTER;
2479 /* mb_partial and word_char bits should be already initialized by
2480 peek_token. */
2481 return elem;
2484 if (BE (end != -1 && start > end, 0))
2486 /* First number greater than second. */
2487 *err = REG_BADBR;
2488 return NULL;
2491 else
2493 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2494 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2497 fetch_token (token, regexp, syntax);
2499 if (BE (elem == NULL, 0))
2500 return NULL;
2501 if (BE (start == 0 && end == 0, 0))
2503 postorder (elem, free_tree, NULL);
2504 return NULL;
2507 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2508 if (BE (start > 0, 0))
2510 tree = elem;
2511 for (i = 2; i <= start; ++i)
2513 elem = duplicate_tree (elem, dfa);
2514 tree = create_tree (dfa, tree, elem, CONCAT);
2515 if (BE (elem == NULL || tree == NULL, 0))
2516 goto parse_dup_op_espace;
2519 if (start == end)
2520 return tree;
2522 /* Duplicate ELEM before it is marked optional. */
2523 elem = duplicate_tree (elem, dfa);
2524 old_tree = tree;
2526 else
2527 old_tree = NULL;
2529 if (elem->token.type == SUBEXP)
2530 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2532 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2533 if (BE (tree == NULL, 0))
2534 goto parse_dup_op_espace;
2536 /* This loop is actually executed only when end != -1,
2537 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2538 already created the start+1-th copy. */
2539 for (i = start + 2; i <= end; ++i)
2541 elem = duplicate_tree (elem, dfa);
2542 tree = create_tree (dfa, tree, elem, CONCAT);
2543 if (BE (elem == NULL || tree == NULL, 0))
2544 goto parse_dup_op_espace;
2546 tree = create_tree (dfa, tree, NULL, OP_ALT);
2547 if (BE (tree == NULL, 0))
2548 goto parse_dup_op_espace;
2551 if (old_tree)
2552 tree = create_tree (dfa, old_tree, tree, CONCAT);
2554 return tree;
2556 parse_dup_op_espace:
2557 *err = REG_ESPACE;
2558 return NULL;
2561 /* Size of the names for collating symbol/equivalence_class/character_class.
2562 I'm not sure, but maybe enough. */
2563 #define BRACKET_NAME_BUF_SIZE 32
2565 #ifndef _LIBC
2566 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2567 Build the range expression which starts from START_ELEM, and ends
2568 at END_ELEM. The result are written to MBCSET and SBCSET.
2569 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2570 mbcset->range_ends, is a pointer argument sinse we may
2571 update it. */
2573 static reg_errcode_t
2574 internal_function
2575 # ifdef RE_ENABLE_I18N
2576 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2577 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2578 # else /* not RE_ENABLE_I18N */
2579 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2580 bracket_elem_t *end_elem)
2581 # endif /* not RE_ENABLE_I18N */
2583 unsigned int start_ch, end_ch;
2584 /* Equivalence Classes and Character Classes can't be a range start/end. */
2585 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2586 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2588 return REG_ERANGE;
2590 /* We can handle no multi character collating elements without libc
2591 support. */
2592 if (BE ((start_elem->type == COLL_SYM
2593 && strlen ((char *) start_elem->opr.name) > 1)
2594 || (end_elem->type == COLL_SYM
2595 && strlen ((char *) end_elem->opr.name) > 1), 0))
2596 return REG_ECOLLATE;
2598 # ifdef RE_ENABLE_I18N
2600 wchar_t wc;
2601 wint_t start_wc;
2602 wint_t end_wc;
2603 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2605 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2606 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2607 : 0));
2608 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2609 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2610 : 0));
2611 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2612 ? __btowc (start_ch) : start_elem->opr.wch);
2613 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2614 ? __btowc (end_ch) : end_elem->opr.wch);
2615 if (start_wc == WEOF || end_wc == WEOF)
2616 return REG_ECOLLATE;
2617 cmp_buf[0] = start_wc;
2618 cmp_buf[4] = end_wc;
2619 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2620 return REG_ERANGE;
2622 /* Got valid collation sequence values, add them as a new entry.
2623 However, for !_LIBC we have no collation elements: if the
2624 character set is single byte, the single byte character set
2625 that we build below suffices. parse_bracket_exp passes
2626 no MBCSET if dfa->mb_cur_max == 1. */
2627 if (mbcset)
2629 /* Check the space of the arrays. */
2630 if (BE (*range_alloc == mbcset->nranges, 0))
2632 /* There is not enough space, need realloc. */
2633 wchar_t *new_array_start, *new_array_end;
2634 int new_nranges;
2636 /* +1 in case of mbcset->nranges is 0. */
2637 new_nranges = 2 * mbcset->nranges + 1;
2638 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2639 are NULL if *range_alloc == 0. */
2640 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2641 new_nranges);
2642 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2643 new_nranges);
2645 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2646 return REG_ESPACE;
2648 mbcset->range_starts = new_array_start;
2649 mbcset->range_ends = new_array_end;
2650 *range_alloc = new_nranges;
2653 mbcset->range_starts[mbcset->nranges] = start_wc;
2654 mbcset->range_ends[mbcset->nranges++] = end_wc;
2657 /* Build the table for single byte characters. */
2658 for (wc = 0; wc < SBC_MAX; ++wc)
2660 cmp_buf[2] = wc;
2661 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2662 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2663 bitset_set (sbcset, wc);
2666 # else /* not RE_ENABLE_I18N */
2668 unsigned int ch;
2669 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2670 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2671 : 0));
2672 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2673 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2674 : 0));
2675 if (start_ch > end_ch)
2676 return REG_ERANGE;
2677 /* Build the table for single byte characters. */
2678 for (ch = 0; ch < SBC_MAX; ++ch)
2679 if (start_ch <= ch && ch <= end_ch)
2680 bitset_set (sbcset, ch);
2682 # endif /* not RE_ENABLE_I18N */
2683 return REG_NOERROR;
2685 #endif /* not _LIBC */
2687 #ifndef _LIBC
2688 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2689 Build the collating element which is represented by NAME.
2690 The result are written to MBCSET and SBCSET.
2691 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2692 pointer argument since we may update it. */
2694 static reg_errcode_t
2695 internal_function
2696 # ifdef RE_ENABLE_I18N
2697 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2698 int *coll_sym_alloc, const unsigned char *name)
2699 # else /* not RE_ENABLE_I18N */
2700 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2701 # endif /* not RE_ENABLE_I18N */
2703 size_t name_len = strlen ((const char *) name);
2704 if (BE (name_len != 1, 0))
2705 return REG_ECOLLATE;
2706 else
2708 bitset_set (sbcset, name[0]);
2709 return REG_NOERROR;
2712 #endif /* not _LIBC */
2714 /* This function parse bracket expression like "[abc]", "[a-c]",
2715 "[[.a-a.]]" etc. */
2717 static bin_tree_t *
2718 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2719 reg_syntax_t syntax, reg_errcode_t *err)
2721 #ifdef _LIBC
2722 const unsigned char *collseqmb;
2723 const char *collseqwc;
2724 uint32_t nrules;
2725 int32_t table_size;
2726 const int32_t *symb_table;
2727 const unsigned char *extra;
2729 /* Local function for parse_bracket_exp used in _LIBC environement.
2730 Seek the collating symbol entry correspondings to NAME.
2731 Return the index of the symbol in the SYMB_TABLE. */
2733 auto inline int32_t
2734 __attribute ((always_inline))
2735 seek_collating_symbol_entry (name, name_len)
2736 const unsigned char *name;
2737 size_t name_len;
2739 int32_t hash = elem_hash ((const char *) name, name_len);
2740 int32_t elem = hash % table_size;
2741 if (symb_table[2 * elem] != 0)
2743 int32_t second = hash % (table_size - 2) + 1;
2747 /* First compare the hashing value. */
2748 if (symb_table[2 * elem] == hash
2749 /* Compare the length of the name. */
2750 && name_len == extra[symb_table[2 * elem + 1]]
2751 /* Compare the name. */
2752 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2753 name_len) == 0)
2755 /* Yep, this is the entry. */
2756 break;
2759 /* Next entry. */
2760 elem += second;
2762 while (symb_table[2 * elem] != 0);
2764 return elem;
2767 /* Local function for parse_bracket_exp used in _LIBC environment.
2768 Look up the collation sequence value of BR_ELEM.
2769 Return the value if succeeded, UINT_MAX otherwise. */
2771 auto inline unsigned int
2772 __attribute ((always_inline))
2773 lookup_collation_sequence_value (br_elem)
2774 bracket_elem_t *br_elem;
2776 if (br_elem->type == SB_CHAR)
2779 if (MB_CUR_MAX == 1)
2781 if (nrules == 0)
2782 return collseqmb[br_elem->opr.ch];
2783 else
2785 wint_t wc = __btowc (br_elem->opr.ch);
2786 return __collseq_table_lookup (collseqwc, wc);
2789 else if (br_elem->type == MB_CHAR)
2791 if (nrules != 0)
2792 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2794 else if (br_elem->type == COLL_SYM)
2796 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2797 if (nrules != 0)
2799 int32_t elem, idx;
2800 elem = seek_collating_symbol_entry (br_elem->opr.name,
2801 sym_name_len);
2802 if (symb_table[2 * elem] != 0)
2804 /* We found the entry. */
2805 idx = symb_table[2 * elem + 1];
2806 /* Skip the name of collating element name. */
2807 idx += 1 + extra[idx];
2808 /* Skip the byte sequence of the collating element. */
2809 idx += 1 + extra[idx];
2810 /* Adjust for the alignment. */
2811 idx = (idx + 3) & ~3;
2812 /* Skip the multibyte collation sequence value. */
2813 idx += sizeof (unsigned int);
2814 /* Skip the wide char sequence of the collating element. */
2815 idx += sizeof (unsigned int) *
2816 (1 + *(unsigned int *) (extra + idx));
2817 /* Return the collation sequence value. */
2818 return *(unsigned int *) (extra + idx);
2820 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2822 /* No valid character. Match it as a single byte
2823 character. */
2824 return collseqmb[br_elem->opr.name[0]];
2827 else if (sym_name_len == 1)
2828 return collseqmb[br_elem->opr.name[0]];
2830 return UINT_MAX;
2833 /* Local function for parse_bracket_exp used in _LIBC environement.
2834 Build the range expression which starts from START_ELEM, and ends
2835 at END_ELEM. The result are written to MBCSET and SBCSET.
2836 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2837 mbcset->range_ends, is a pointer argument sinse we may
2838 update it. */
2840 auto inline reg_errcode_t
2841 __attribute ((always_inline))
2842 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2843 re_charset_t *mbcset;
2844 int *range_alloc;
2845 bitset_t sbcset;
2846 bracket_elem_t *start_elem, *end_elem;
2848 unsigned int ch;
2849 uint32_t start_collseq;
2850 uint32_t end_collseq;
2852 /* Equivalence Classes and Character Classes can't be a range
2853 start/end. */
2854 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2855 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2857 return REG_ERANGE;
2859 start_collseq = lookup_collation_sequence_value (start_elem);
2860 end_collseq = lookup_collation_sequence_value (end_elem);
2861 /* Check start/end collation sequence values. */
2862 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2863 return REG_ECOLLATE;
2864 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2865 return REG_ERANGE;
2867 /* Got valid collation sequence values, add them as a new entry.
2868 However, if we have no collation elements, and the character set
2869 is single byte, the single byte character set that we
2870 build below suffices. */
2871 if (nrules > 0 || dfa->mb_cur_max > 1)
2873 /* Check the space of the arrays. */
2874 if (BE (*range_alloc == mbcset->nranges, 0))
2876 /* There is not enough space, need realloc. */
2877 uint32_t *new_array_start;
2878 uint32_t *new_array_end;
2879 int new_nranges;
2881 /* +1 in case of mbcset->nranges is 0. */
2882 new_nranges = 2 * mbcset->nranges + 1;
2883 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2884 new_nranges);
2885 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2886 new_nranges);
2888 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2889 return REG_ESPACE;
2891 mbcset->range_starts = new_array_start;
2892 mbcset->range_ends = new_array_end;
2893 *range_alloc = new_nranges;
2896 mbcset->range_starts[mbcset->nranges] = start_collseq;
2897 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2900 /* Build the table for single byte characters. */
2901 for (ch = 0; ch < SBC_MAX; ch++)
2903 uint32_t ch_collseq;
2905 if (MB_CUR_MAX == 1)
2907 if (nrules == 0)
2908 ch_collseq = collseqmb[ch];
2909 else
2910 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2911 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2912 bitset_set (sbcset, ch);
2914 return REG_NOERROR;
2917 /* Local function for parse_bracket_exp used in _LIBC environement.
2918 Build the collating element which is represented by NAME.
2919 The result are written to MBCSET and SBCSET.
2920 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2921 pointer argument sinse we may update it. */
2923 auto inline reg_errcode_t
2924 __attribute ((always_inline))
2925 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2926 re_charset_t *mbcset;
2927 int *coll_sym_alloc;
2928 bitset_t sbcset;
2929 const unsigned char *name;
2931 int32_t elem, idx;
2932 size_t name_len = strlen ((const char *) name);
2933 if (nrules != 0)
2935 elem = seek_collating_symbol_entry (name, name_len);
2936 if (symb_table[2 * elem] != 0)
2938 /* We found the entry. */
2939 idx = symb_table[2 * elem + 1];
2940 /* Skip the name of collating element name. */
2941 idx += 1 + extra[idx];
2943 else if (symb_table[2 * elem] == 0 && name_len == 1)
2945 /* No valid character, treat it as a normal
2946 character. */
2947 bitset_set (sbcset, name[0]);
2948 return REG_NOERROR;
2950 else
2951 return REG_ECOLLATE;
2953 /* Got valid collation sequence, add it as a new entry. */
2954 /* Check the space of the arrays. */
2955 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2957 /* Not enough, realloc it. */
2958 /* +1 in case of mbcset->ncoll_syms is 0. */
2959 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2960 /* Use realloc since mbcset->coll_syms is NULL
2961 if *alloc == 0. */
2962 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2963 new_coll_sym_alloc);
2964 if (BE (new_coll_syms == NULL, 0))
2965 return REG_ESPACE;
2966 mbcset->coll_syms = new_coll_syms;
2967 *coll_sym_alloc = new_coll_sym_alloc;
2969 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2970 return REG_NOERROR;
2972 else
2974 if (BE (name_len != 1, 0))
2975 return REG_ECOLLATE;
2976 else
2978 bitset_set (sbcset, name[0]);
2979 return REG_NOERROR;
2983 #endif
2985 re_token_t br_token;
2986 re_bitset_ptr_t sbcset;
2987 #ifdef RE_ENABLE_I18N
2988 re_charset_t *mbcset;
2989 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2990 int equiv_class_alloc = 0, char_class_alloc = 0;
2991 #endif /* not RE_ENABLE_I18N */
2992 int non_match = 0;
2993 bin_tree_t *work_tree;
2994 int token_len;
2995 int first_round = 1;
2996 #ifdef _LIBC
2997 collseqmb = (const unsigned char *)
2998 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2999 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3000 if (nrules)
3003 if (MB_CUR_MAX > 1)
3005 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3006 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3007 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3008 _NL_COLLATE_SYMB_TABLEMB);
3009 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3010 _NL_COLLATE_SYMB_EXTRAMB);
3012 #endif
3013 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3014 #ifdef RE_ENABLE_I18N
3015 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3016 #endif /* RE_ENABLE_I18N */
3017 #ifdef RE_ENABLE_I18N
3018 if (BE (sbcset == NULL || mbcset == NULL, 0))
3019 #else
3020 if (BE (sbcset == NULL, 0))
3021 #endif /* RE_ENABLE_I18N */
3023 *err = REG_ESPACE;
3024 return NULL;
3027 token_len = peek_token_bracket (token, regexp, syntax);
3028 if (BE (token->type == END_OF_RE, 0))
3030 *err = REG_BADPAT;
3031 goto parse_bracket_exp_free_return;
3033 if (token->type == OP_NON_MATCH_LIST)
3035 #ifdef RE_ENABLE_I18N
3036 mbcset->non_match = 1;
3037 #endif /* not RE_ENABLE_I18N */
3038 non_match = 1;
3039 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3040 bitset_set (sbcset, '\n');
3041 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3042 token_len = peek_token_bracket (token, regexp, syntax);
3043 if (BE (token->type == END_OF_RE, 0))
3045 *err = REG_BADPAT;
3046 goto parse_bracket_exp_free_return;
3050 /* We treat the first ']' as a normal character. */
3051 if (token->type == OP_CLOSE_BRACKET)
3052 token->type = CHARACTER;
3054 while (1)
3056 bracket_elem_t start_elem, end_elem;
3057 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3058 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3059 reg_errcode_t ret;
3060 int token_len2 = 0, is_range_exp = 0;
3061 re_token_t token2;
3063 start_elem.opr.name = start_name_buf;
3064 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3065 syntax, first_round);
3066 if (BE (ret != REG_NOERROR, 0))
3068 *err = ret;
3069 goto parse_bracket_exp_free_return;
3071 first_round = 0;
3073 /* Get information about the next token. We need it in any case. */
3074 token_len = peek_token_bracket (token, regexp, syntax);
3076 /* Do not check for ranges if we know they are not allowed. */
3077 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3079 if (BE (token->type == END_OF_RE, 0))
3081 *err = REG_EBRACK;
3082 goto parse_bracket_exp_free_return;
3084 if (token->type == OP_CHARSET_RANGE)
3086 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3087 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3088 if (BE (token2.type == END_OF_RE, 0))
3090 *err = REG_EBRACK;
3091 goto parse_bracket_exp_free_return;
3093 if (token2.type == OP_CLOSE_BRACKET)
3095 /* We treat the last '-' as a normal character. */
3096 re_string_skip_bytes (regexp, -token_len);
3097 token->type = CHARACTER;
3099 else
3100 is_range_exp = 1;
3104 if (is_range_exp == 1)
3106 end_elem.opr.name = end_name_buf;
3107 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3108 dfa, syntax, 1);
3109 if (BE (ret != REG_NOERROR, 0))
3111 *err = ret;
3112 goto parse_bracket_exp_free_return;
3115 token_len = peek_token_bracket (token, regexp, syntax);
3117 #ifdef _LIBC
3118 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3119 &start_elem, &end_elem);
3120 #else
3121 # ifdef RE_ENABLE_I18N
3122 *err = build_range_exp (sbcset,
3123 dfa->mb_cur_max > 1 ? mbcset : NULL,
3124 &range_alloc, &start_elem, &end_elem);
3125 # else
3126 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3127 # endif
3128 #endif /* RE_ENABLE_I18N */
3129 if (BE (*err != REG_NOERROR, 0))
3130 goto parse_bracket_exp_free_return;
3132 else
3134 switch (start_elem.type)
3136 case SB_CHAR:
3137 bitset_set (sbcset, start_elem.opr.ch);
3138 break;
3139 #ifdef RE_ENABLE_I18N
3140 case MB_CHAR:
3141 /* Check whether the array has enough space. */
3142 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3144 wchar_t *new_mbchars;
3145 /* Not enough, realloc it. */
3146 /* +1 in case of mbcset->nmbchars is 0. */
3147 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3148 /* Use realloc since array is NULL if *alloc == 0. */
3149 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3150 mbchar_alloc);
3151 if (BE (new_mbchars == NULL, 0))
3152 goto parse_bracket_exp_espace;
3153 mbcset->mbchars = new_mbchars;
3155 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3156 break;
3157 #endif /* RE_ENABLE_I18N */
3158 case EQUIV_CLASS:
3159 *err = build_equiv_class (sbcset,
3160 #ifdef RE_ENABLE_I18N
3161 mbcset, &equiv_class_alloc,
3162 #endif /* RE_ENABLE_I18N */
3163 start_elem.opr.name);
3164 if (BE (*err != REG_NOERROR, 0))
3165 goto parse_bracket_exp_free_return;
3166 break;
3167 case COLL_SYM:
3168 *err = build_collating_symbol (sbcset,
3169 #ifdef RE_ENABLE_I18N
3170 mbcset, &coll_sym_alloc,
3171 #endif /* RE_ENABLE_I18N */
3172 start_elem.opr.name);
3173 if (BE (*err != REG_NOERROR, 0))
3174 goto parse_bracket_exp_free_return;
3175 break;
3176 case CHAR_CLASS:
3177 *err = build_charclass (regexp->trans, sbcset,
3178 #ifdef RE_ENABLE_I18N
3179 mbcset, &char_class_alloc,
3180 #endif /* RE_ENABLE_I18N */
3181 start_elem.opr.name, syntax);
3182 if (BE (*err != REG_NOERROR, 0))
3183 goto parse_bracket_exp_free_return;
3184 break;
3185 default:
3186 assert (0);
3187 break;
3190 if (BE (token->type == END_OF_RE, 0))
3192 *err = REG_EBRACK;
3193 goto parse_bracket_exp_free_return;
3195 if (token->type == OP_CLOSE_BRACKET)
3196 break;
3199 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3201 /* If it is non-matching list. */
3202 if (non_match)
3203 bitset_not (sbcset);
3205 #ifdef RE_ENABLE_I18N
3206 /* Ensure only single byte characters are set. */
3207 if (dfa->mb_cur_max > 1)
3208 bitset_mask (sbcset, dfa->sb_char);
3210 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3211 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3212 || mbcset->non_match)))
3214 bin_tree_t *mbc_tree;
3215 int sbc_idx;
3216 /* Build a tree for complex bracket. */
3217 dfa->has_mb_node = 1;
3218 br_token.type = COMPLEX_BRACKET;
3219 br_token.opr.mbcset = mbcset;
3220 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3221 if (BE (mbc_tree == NULL, 0))
3222 goto parse_bracket_exp_espace;
3223 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3224 if (sbcset[sbc_idx])
3225 break;
3226 /* If there are no bits set in sbcset, there is no point
3227 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3228 if (sbc_idx < BITSET_WORDS)
3230 /* Build a tree for simple bracket. */
3231 br_token.type = SIMPLE_BRACKET;
3232 br_token.opr.sbcset = sbcset;
3233 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3234 if (BE (work_tree == NULL, 0))
3235 goto parse_bracket_exp_espace;
3237 /* Then join them by ALT node. */
3238 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3239 if (BE (work_tree == NULL, 0))
3240 goto parse_bracket_exp_espace;
3242 else
3244 re_free (sbcset);
3245 work_tree = mbc_tree;
3248 else
3249 #endif /* not RE_ENABLE_I18N */
3251 #ifdef RE_ENABLE_I18N
3252 free_charset (mbcset);
3253 #endif
3254 /* Build a tree for simple bracket. */
3255 br_token.type = SIMPLE_BRACKET;
3256 br_token.opr.sbcset = sbcset;
3257 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3258 if (BE (work_tree == NULL, 0))
3259 goto parse_bracket_exp_espace;
3261 return work_tree;
3263 parse_bracket_exp_espace:
3264 *err = REG_ESPACE;
3265 parse_bracket_exp_free_return:
3266 re_free (sbcset);
3267 #ifdef RE_ENABLE_I18N
3268 free_charset (mbcset);
3269 #endif /* RE_ENABLE_I18N */
3270 return NULL;
3273 /* Parse an element in the bracket expression. */
3275 static reg_errcode_t
3276 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3277 re_token_t *token, int token_len, re_dfa_t *dfa,
3278 reg_syntax_t syntax, int accept_hyphen)
3280 #ifdef RE_ENABLE_I18N
3281 int cur_char_size;
3282 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3283 if (cur_char_size > 1)
3285 elem->type = MB_CHAR;
3286 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3287 re_string_skip_bytes (regexp, cur_char_size);
3288 return REG_NOERROR;
3290 #endif /* RE_ENABLE_I18N */
3291 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3292 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3293 || token->type == OP_OPEN_EQUIV_CLASS)
3294 return parse_bracket_symbol (elem, regexp, token);
3295 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3297 /* A '-' must only appear as anything but a range indicator before
3298 the closing bracket. Everything else is an error. */
3299 re_token_t token2;
3300 (void) peek_token_bracket (&token2, regexp, syntax);
3301 if (token2.type != OP_CLOSE_BRACKET)
3302 /* The actual error value is not standardized since this whole
3303 case is undefined. But ERANGE makes good sense. */
3304 return REG_ERANGE;
3306 elem->type = SB_CHAR;
3307 elem->opr.ch = token->opr.c;
3308 return REG_NOERROR;
3311 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3312 such as [:<character_class>:], [.<collating_element>.], and
3313 [=<equivalent_class>=]. */
3315 static reg_errcode_t
3316 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3317 re_token_t *token)
3319 unsigned char ch, delim = token->opr.c;
3320 int i = 0;
3321 if (re_string_eoi(regexp))
3322 return REG_EBRACK;
3323 for (;; ++i)
3325 if (i >= BRACKET_NAME_BUF_SIZE)
3326 return REG_EBRACK;
3327 if (token->type == OP_OPEN_CHAR_CLASS)
3328 ch = re_string_fetch_byte_case (regexp);
3329 else
3330 ch = re_string_fetch_byte (regexp);
3331 if (re_string_eoi(regexp))
3332 return REG_EBRACK;
3333 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3334 break;
3335 elem->opr.name[i] = ch;
3337 re_string_skip_bytes (regexp, 1);
3338 elem->opr.name[i] = '\0';
3339 switch (token->type)
3341 case OP_OPEN_COLL_ELEM:
3342 elem->type = COLL_SYM;
3343 break;
3344 case OP_OPEN_EQUIV_CLASS:
3345 elem->type = EQUIV_CLASS;
3346 break;
3347 case OP_OPEN_CHAR_CLASS:
3348 elem->type = CHAR_CLASS;
3349 break;
3350 default:
3351 break;
3353 return REG_NOERROR;
3356 /* Helper function for parse_bracket_exp.
3357 Build the equivalence class which is represented by NAME.
3358 The result are written to MBCSET and SBCSET.
3359 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3360 is a pointer argument sinse we may update it. */
3362 static reg_errcode_t
3363 #ifdef RE_ENABLE_I18N
3364 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3365 int *equiv_class_alloc, const unsigned char *name)
3366 #else /* not RE_ENABLE_I18N */
3367 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3368 #endif /* not RE_ENABLE_I18N */
3370 #ifdef _LIBC
3371 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3372 if (nrules != 0)
3374 const int32_t *table, *indirect;
3375 const unsigned char *weights, *extra, *cp;
3376 unsigned char char_buf[2];
3377 int32_t idx1, idx2;
3378 unsigned int ch;
3379 size_t len;
3380 /* This #include defines a local function! */
3381 # include <locale/weight.h>
3382 /* Calculate the index for equivalence class. */
3383 cp = name;
3384 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3385 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3386 _NL_COLLATE_WEIGHTMB);
3387 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3388 _NL_COLLATE_EXTRAMB);
3389 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3390 _NL_COLLATE_INDIRECTMB);
3391 idx1 = findidx (&cp);
3392 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3393 /* This isn't a valid character. */
3394 return REG_ECOLLATE;
3396 /* Build single byte matcing table for this equivalence class. */
3397 char_buf[1] = (unsigned char) '\0';
3398 len = weights[idx1 & 0xffffff];
3399 for (ch = 0; ch < SBC_MAX; ++ch)
3401 char_buf[0] = ch;
3402 cp = char_buf;
3403 idx2 = findidx (&cp);
3405 idx2 = table[ch];
3407 if (idx2 == 0)
3408 /* This isn't a valid character. */
3409 continue;
3410 /* Compare only if the length matches and the collation rule
3411 index is the same. */
3412 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3414 int cnt = 0;
3416 while (cnt <= len &&
3417 weights[(idx1 & 0xffffff) + 1 + cnt]
3418 == weights[(idx2 & 0xffffff) + 1 + cnt])
3419 ++cnt;
3421 if (cnt > len)
3422 bitset_set (sbcset, ch);
3425 /* Check whether the array has enough space. */
3426 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3428 /* Not enough, realloc it. */
3429 /* +1 in case of mbcset->nequiv_classes is 0. */
3430 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3431 /* Use realloc since the array is NULL if *alloc == 0. */
3432 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3433 int32_t,
3434 new_equiv_class_alloc);
3435 if (BE (new_equiv_classes == NULL, 0))
3436 return REG_ESPACE;
3437 mbcset->equiv_classes = new_equiv_classes;
3438 *equiv_class_alloc = new_equiv_class_alloc;
3440 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3442 else
3443 #endif /* _LIBC */
3445 if (BE (strlen ((const char *) name) != 1, 0))
3446 return REG_ECOLLATE;
3447 bitset_set (sbcset, *name);
3449 return REG_NOERROR;
3452 /* Helper function for parse_bracket_exp.
3453 Build the character class which is represented by NAME.
3454 The result are written to MBCSET and SBCSET.
3455 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3456 is a pointer argument sinse we may update it. */
3458 static reg_errcode_t
3459 #ifdef RE_ENABLE_I18N
3460 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3461 re_charset_t *mbcset, int *char_class_alloc,
3462 const unsigned char *class_name, reg_syntax_t syntax)
3463 #else /* not RE_ENABLE_I18N */
3464 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3465 const unsigned char *class_name, reg_syntax_t syntax)
3466 #endif /* not RE_ENABLE_I18N */
3468 int i;
3469 const char *name = (const char *) class_name;
3471 /* In case of REG_ICASE "upper" and "lower" match the both of
3472 upper and lower cases. */
3473 if ((syntax & RE_ICASE)
3474 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3475 name = "alpha";
3477 #ifdef RE_ENABLE_I18N
3478 /* Check the space of the arrays. */
3479 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3481 /* Not enough, realloc it. */
3482 /* +1 in case of mbcset->nchar_classes is 0. */
3483 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3484 /* Use realloc since array is NULL if *alloc == 0. */
3485 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3486 new_char_class_alloc);
3487 if (BE (new_char_classes == NULL, 0))
3488 return REG_ESPACE;
3489 mbcset->char_classes = new_char_classes;
3490 *char_class_alloc = new_char_class_alloc;
3492 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3493 #endif /* RE_ENABLE_I18N */
3495 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3496 do { \
3497 if (BE (trans != NULL, 0)) \
3499 for (i = 0; i < SBC_MAX; ++i) \
3500 if (ctype_func (i)) \
3501 bitset_set (sbcset, trans[i]); \
3503 else \
3505 for (i = 0; i < SBC_MAX; ++i) \
3506 if (ctype_func (i)) \
3507 bitset_set (sbcset, i); \
3509 } while (0)
3511 if (strcmp (name, "alnum") == 0)
3512 BUILD_CHARCLASS_LOOP (isalnum);
3513 else if (strcmp (name, "cntrl") == 0)
3514 BUILD_CHARCLASS_LOOP (iscntrl);
3515 else if (strcmp (name, "lower") == 0)
3516 BUILD_CHARCLASS_LOOP (islower);
3517 else if (strcmp (name, "space") == 0)
3518 BUILD_CHARCLASS_LOOP (isspace);
3519 else if (strcmp (name, "alpha") == 0)
3520 BUILD_CHARCLASS_LOOP (isalpha);
3521 else if (strcmp (name, "digit") == 0)
3522 BUILD_CHARCLASS_LOOP (isdigit);
3523 else if (strcmp (name, "print") == 0)
3524 BUILD_CHARCLASS_LOOP (isprint);
3525 else if (strcmp (name, "upper") == 0)
3526 BUILD_CHARCLASS_LOOP (isupper);
3527 else if (strcmp (name, "blank") == 0)
3528 BUILD_CHARCLASS_LOOP (isblank);
3529 else if (strcmp (name, "graph") == 0)
3530 BUILD_CHARCLASS_LOOP (isgraph);
3531 else if (strcmp (name, "punct") == 0)
3532 BUILD_CHARCLASS_LOOP (ispunct);
3533 else if (strcmp (name, "xdigit") == 0)
3534 BUILD_CHARCLASS_LOOP (isxdigit);
3535 else
3536 return REG_ECTYPE;
3538 return REG_NOERROR;
3541 static bin_tree_t *
3542 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3543 const unsigned char *class_name,
3544 const unsigned char *extra, int non_match,
3545 reg_errcode_t *err)
3547 re_bitset_ptr_t sbcset;
3548 #ifdef RE_ENABLE_I18N
3549 re_charset_t *mbcset;
3550 int alloc = 0;
3551 #endif /* not RE_ENABLE_I18N */
3552 reg_errcode_t ret;
3553 re_token_t br_token;
3554 bin_tree_t *tree;
3556 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3557 #ifdef RE_ENABLE_I18N
3558 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3559 #endif /* RE_ENABLE_I18N */
3561 #ifdef RE_ENABLE_I18N
3562 if (BE (sbcset == NULL || mbcset == NULL, 0))
3563 #else /* not RE_ENABLE_I18N */
3564 if (BE (sbcset == NULL, 0))
3565 #endif /* not RE_ENABLE_I18N */
3567 *err = REG_ESPACE;
3568 return NULL;
3571 if (non_match)
3573 #ifdef RE_ENABLE_I18N
3574 mbcset->non_match = 1;
3575 #endif /* not RE_ENABLE_I18N */
3578 /* We don't care the syntax in this case. */
3579 ret = build_charclass (trans, sbcset,
3580 #ifdef RE_ENABLE_I18N
3581 mbcset, &alloc,
3582 #endif /* RE_ENABLE_I18N */
3583 class_name, 0);
3585 if (BE (ret != REG_NOERROR, 0))
3587 re_free (sbcset);
3588 #ifdef RE_ENABLE_I18N
3589 free_charset (mbcset);
3590 #endif /* RE_ENABLE_I18N */
3591 *err = ret;
3592 return NULL;
3594 /* \w match '_' also. */
3595 for (; *extra; extra++)
3596 bitset_set (sbcset, *extra);
3598 /* If it is non-matching list. */
3599 if (non_match)
3600 bitset_not (sbcset);
3602 #ifdef RE_ENABLE_I18N
3603 /* Ensure only single byte characters are set. */
3604 if (dfa->mb_cur_max > 1)
3605 bitset_mask (sbcset, dfa->sb_char);
3606 #endif
3608 /* Build a tree for simple bracket. */
3609 br_token.type = SIMPLE_BRACKET;
3610 br_token.opr.sbcset = sbcset;
3611 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3612 if (BE (tree == NULL, 0))
3613 goto build_word_op_espace;
3615 #ifdef RE_ENABLE_I18N
3616 if (dfa->mb_cur_max > 1)
3618 bin_tree_t *mbc_tree;
3619 /* Build a tree for complex bracket. */
3620 br_token.type = COMPLEX_BRACKET;
3621 br_token.opr.mbcset = mbcset;
3622 dfa->has_mb_node = 1;
3623 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3624 if (BE (mbc_tree == NULL, 0))
3625 goto build_word_op_espace;
3626 /* Then join them by ALT node. */
3627 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3628 if (BE (mbc_tree != NULL, 1))
3629 return tree;
3631 else
3633 free_charset (mbcset);
3634 return tree;
3636 #else /* not RE_ENABLE_I18N */
3637 return tree;
3638 #endif /* not RE_ENABLE_I18N */
3640 build_word_op_espace:
3641 re_free (sbcset);
3642 #ifdef RE_ENABLE_I18N
3643 free_charset (mbcset);
3644 #endif /* RE_ENABLE_I18N */
3645 *err = REG_ESPACE;
3646 return NULL;
3649 /* This is intended for the expressions like "a{1,3}".
3650 Fetch a number from `input', and return the number.
3651 Return -1, if the number field is empty like "{,1}".
3652 Return -2, If an error is occured. */
3654 static int
3655 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3657 int num = -1;
3658 unsigned char c;
3659 while (1)
3661 fetch_token (token, input, syntax);
3662 c = token->opr.c;
3663 if (BE (token->type == END_OF_RE, 0))
3664 return -2;
3665 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3666 break;
3667 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3668 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3669 num = (num > RE_DUP_MAX) ? -2 : num;
3671 return num;
3674 #ifdef RE_ENABLE_I18N
3675 static void
3676 free_charset (re_charset_t *cset)
3678 re_free (cset->mbchars);
3679 # ifdef _LIBC
3680 re_free (cset->coll_syms);
3681 re_free (cset->equiv_classes);
3682 re_free (cset->range_starts);
3683 re_free (cset->range_ends);
3684 # endif
3685 re_free (cset->char_classes);
3686 re_free (cset);
3688 #endif /* RE_ENABLE_I18N */
3690 /* Functions for binary tree operation. */
3692 /* Create a tree node. */
3694 static bin_tree_t *
3695 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3696 re_token_type_t type)
3698 re_token_t t;
3699 t.type = type;
3700 return create_token_tree (dfa, left, right, &t);
3703 static bin_tree_t *
3704 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3705 const re_token_t *token)
3707 bin_tree_t *tree;
3708 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3710 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3712 if (storage == NULL)
3713 return NULL;
3714 storage->next = dfa->str_tree_storage;
3715 dfa->str_tree_storage = storage;
3716 dfa->str_tree_storage_idx = 0;
3718 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3720 tree->parent = NULL;
3721 tree->left = left;
3722 tree->right = right;
3723 tree->token = *token;
3724 tree->token.duplicated = 0;
3725 tree->token.opt_subexp = 0;
3726 tree->first = NULL;
3727 tree->next = NULL;
3728 tree->node_idx = -1;
3730 if (left != NULL)
3731 left->parent = tree;
3732 if (right != NULL)
3733 right->parent = tree;
3734 return tree;
3737 /* Mark the tree SRC as an optional subexpression.
3738 To be called from preorder or postorder. */
3740 static reg_errcode_t
3741 mark_opt_subexp (void *extra, bin_tree_t *node)
3743 int idx = (int) (long) extra;
3744 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3745 node->token.opt_subexp = 1;
3747 return REG_NOERROR;
3750 /* Free the allocated memory inside NODE. */
3752 static void
3753 free_token (re_token_t *node)
3755 #ifdef RE_ENABLE_I18N
3756 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3757 free_charset (node->opr.mbcset);
3758 else
3759 #endif /* RE_ENABLE_I18N */
3760 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3761 re_free (node->opr.sbcset);
3764 /* Worker function for tree walking. Free the allocated memory inside NODE
3765 and its children. */
3767 static reg_errcode_t
3768 free_tree (void *extra, bin_tree_t *node)
3770 free_token (&node->token);
3771 return REG_NOERROR;
3775 /* Duplicate the node SRC, and return new node. This is a preorder
3776 visit similar to the one implemented by the generic visitor, but
3777 we need more infrastructure to maintain two parallel trees --- so,
3778 it's easier to duplicate. */
3780 static bin_tree_t *
3781 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3783 const bin_tree_t *node;
3784 bin_tree_t *dup_root;
3785 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3787 for (node = root; ; )
3789 /* Create a new tree and link it back to the current parent. */
3790 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3791 if (*p_new == NULL)
3792 return NULL;
3793 (*p_new)->parent = dup_node;
3794 (*p_new)->token.duplicated = 1;
3795 dup_node = *p_new;
3797 /* Go to the left node, or up and to the right. */
3798 if (node->left)
3800 node = node->left;
3801 p_new = &dup_node->left;
3803 else
3805 const bin_tree_t *prev = NULL;
3806 while (node->right == prev || node->right == NULL)
3808 prev = node;
3809 node = node->parent;
3810 dup_node = dup_node->parent;
3811 if (!node)
3812 return dup_root;
3814 node = node->right;
3815 p_new = &dup_node->right;