Merge branch 'yb/utf-16le-bom-spellfix'
[git/raj.git] / compat / regex / regcomp.c
blobc0d838834ad8714cc29148bf23895ab30498947e
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 #if defined __TANDEM
21 /* This is currently duplicated from git-compat-utils.h */
22 # ifdef NO_INTPTR_T
23 typedef long intptr_t;
24 typedef unsigned long uintptr_t;
25 # endif
26 #endif
28 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
29 size_t length, reg_syntax_t syntax);
30 static void re_compile_fastmap_iter (regex_t *bufp,
31 const re_dfastate_t *init_state,
32 char *fastmap);
33 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
34 #ifdef RE_ENABLE_I18N
35 static void free_charset (re_charset_t *cset);
36 #endif /* RE_ENABLE_I18N */
37 static void free_workarea_compile (regex_t *preg);
38 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
39 #ifdef RE_ENABLE_I18N
40 static void optimize_utf8 (re_dfa_t *dfa);
41 #endif
42 static reg_errcode_t analyze (regex_t *preg);
43 static reg_errcode_t preorder (bin_tree_t *root,
44 reg_errcode_t (fn (void *, bin_tree_t *)),
45 void *extra);
46 static reg_errcode_t postorder (bin_tree_t *root,
47 reg_errcode_t (fn (void *, bin_tree_t *)),
48 void *extra);
49 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
50 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
51 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
52 bin_tree_t *node);
53 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
54 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
55 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
56 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
57 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
58 unsigned int constraint);
59 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
60 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
61 int node, int root);
62 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
63 static int fetch_number (re_string_t *input, re_token_t *token,
64 reg_syntax_t syntax);
65 static int peek_token (re_token_t *token, re_string_t *input,
66 reg_syntax_t syntax) internal_function;
67 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
68 reg_syntax_t syntax, reg_errcode_t *err);
69 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
70 re_token_t *token, reg_syntax_t syntax,
71 int nest, reg_errcode_t *err);
72 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
73 re_token_t *token, reg_syntax_t syntax,
74 int nest, reg_errcode_t *err);
75 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
76 re_token_t *token, reg_syntax_t syntax,
77 int nest, reg_errcode_t *err);
78 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
79 re_token_t *token, reg_syntax_t syntax,
80 int nest, reg_errcode_t *err);
81 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
82 re_dfa_t *dfa, re_token_t *token,
83 reg_syntax_t syntax, reg_errcode_t *err);
84 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
85 re_token_t *token, reg_syntax_t syntax,
86 reg_errcode_t *err);
87 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
88 re_string_t *regexp,
89 re_token_t *token, int token_len,
90 re_dfa_t *dfa,
91 reg_syntax_t syntax,
92 int accept_hyphen);
93 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
94 re_string_t *regexp,
95 re_token_t *token);
96 #ifdef RE_ENABLE_I18N
97 static reg_errcode_t build_equiv_class (bitset_t sbcset,
98 re_charset_t *mbcset,
99 int *equiv_class_alloc,
100 const unsigned char *name);
101 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
102 bitset_t sbcset,
103 re_charset_t *mbcset,
104 int *char_class_alloc,
105 const char *class_name,
106 reg_syntax_t syntax);
107 #else /* not RE_ENABLE_I18N */
108 static reg_errcode_t build_equiv_class (bitset_t sbcset,
109 const unsigned char *name);
110 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
111 bitset_t sbcset,
112 const char *class_name,
113 reg_syntax_t syntax);
114 #endif /* not RE_ENABLE_I18N */
115 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
116 RE_TRANSLATE_TYPE trans,
117 const char *class_name,
118 const char *extra,
119 int non_match, reg_errcode_t *err);
120 static bin_tree_t *create_tree (re_dfa_t *dfa,
121 bin_tree_t *left, bin_tree_t *right,
122 re_token_type_t type);
123 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
124 bin_tree_t *left, bin_tree_t *right,
125 const re_token_t *token);
126 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
127 static void free_token (re_token_t *node);
128 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
129 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
131 /* This table gives an error message for each of the error codes listed
132 in regex.h. Obviously the order here has to be same as there.
133 POSIX doesn't require that we do anything for REG_NOERROR,
134 but why not be nice? */
136 const char __re_error_msgid[] attribute_hidden =
138 #define REG_NOERROR_IDX 0
139 gettext_noop ("Success") /* REG_NOERROR */
140 "\0"
141 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
142 gettext_noop ("No match") /* REG_NOMATCH */
143 "\0"
144 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
145 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
146 "\0"
147 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
148 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
149 "\0"
150 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
151 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
152 "\0"
153 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
154 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
155 "\0"
156 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
157 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
158 "\0"
159 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
160 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
161 "\0"
162 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
163 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
164 "\0"
165 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
166 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
167 "\0"
168 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
169 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
170 "\0"
171 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
172 gettext_noop ("Invalid range end") /* REG_ERANGE */
173 "\0"
174 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
175 gettext_noop ("Memory exhausted") /* REG_ESPACE */
176 "\0"
177 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
178 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
179 "\0"
180 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
181 gettext_noop ("Premature end of regular expression") /* REG_EEND */
182 "\0"
183 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
184 gettext_noop ("Regular expression too big") /* REG_ESIZE */
185 "\0"
186 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
187 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
190 const size_t __re_error_msgid_idx[] attribute_hidden =
192 REG_NOERROR_IDX,
193 REG_NOMATCH_IDX,
194 REG_BADPAT_IDX,
195 REG_ECOLLATE_IDX,
196 REG_ECTYPE_IDX,
197 REG_EESCAPE_IDX,
198 REG_ESUBREG_IDX,
199 REG_EBRACK_IDX,
200 REG_EPAREN_IDX,
201 REG_EBRACE_IDX,
202 REG_BADBR_IDX,
203 REG_ERANGE_IDX,
204 REG_ESPACE_IDX,
205 REG_BADRPT_IDX,
206 REG_EEND_IDX,
207 REG_ESIZE_IDX,
208 REG_ERPAREN_IDX
211 /* Entry points for GNU code. */
214 #ifdef ZOS_USS
216 /* For ZOS USS we must define btowc */
218 wchar_t
219 btowc (int c)
221 wchar_t wtmp[2];
222 char tmp[2];
224 tmp[0] = c;
225 tmp[1] = 0;
227 mbtowc (wtmp, tmp, 1);
228 return wtmp[0];
230 #endif
232 /* re_compile_pattern is the GNU regular expression compiler: it
233 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
234 Returns 0 if the pattern was valid, otherwise an error string.
236 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
237 are set in BUFP on entry. */
239 const char *
240 re_compile_pattern (const char *pattern,
241 size_t length,
242 struct re_pattern_buffer *bufp)
244 reg_errcode_t ret;
246 /* And GNU code determines whether or not to get register information
247 by passing null for the REGS argument to re_match, etc., not by
248 setting no_sub, unless RE_NO_SUB is set. */
249 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
251 /* Match anchors at newline. */
252 bufp->newline_anchor = 1;
254 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
256 if (!ret)
257 return NULL;
258 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
260 #ifdef _LIBC
261 weak_alias (__re_compile_pattern, re_compile_pattern)
262 #endif
264 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
265 also be assigned to arbitrarily: each pattern buffer stores its own
266 syntax, so it can be changed between regex compilations. */
267 /* This has no initializer because initialized variables in Emacs
268 become read-only after dumping. */
269 reg_syntax_t re_syntax_options;
272 /* Specify the precise syntax of regexps for compilation. This provides
273 for compatibility for various utilities which historically have
274 different, incompatible syntaxes.
276 The argument SYNTAX is a bit mask comprised of the various bits
277 defined in regex.h. We return the old syntax. */
279 reg_syntax_t
280 re_set_syntax (reg_syntax_t syntax)
282 reg_syntax_t ret = re_syntax_options;
284 re_syntax_options = syntax;
285 return ret;
287 #ifdef _LIBC
288 weak_alias (__re_set_syntax, re_set_syntax)
289 #endif
292 re_compile_fastmap (struct re_pattern_buffer *bufp)
294 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
295 char *fastmap = bufp->fastmap;
297 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
298 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
299 if (dfa->init_state != dfa->init_state_word)
300 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
301 if (dfa->init_state != dfa->init_state_nl)
302 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
303 if (dfa->init_state != dfa->init_state_begbuf)
304 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
305 bufp->fastmap_accurate = 1;
306 return 0;
308 #ifdef _LIBC
309 weak_alias (__re_compile_fastmap, re_compile_fastmap)
310 #endif
312 static inline void
313 __attribute ((always_inline))
314 re_set_fastmap (char *fastmap, int icase, int ch)
316 fastmap[ch] = 1;
317 if (icase)
318 fastmap[tolower (ch)] = 1;
321 /* Helper function for re_compile_fastmap.
322 Compile fastmap for the initial_state INIT_STATE. */
324 static void
325 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
326 char *fastmap)
328 volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
329 int node_cnt;
330 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
331 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
333 int node = init_state->nodes.elems[node_cnt];
334 re_token_type_t type = dfa->nodes[node].type;
336 if (type == CHARACTER)
338 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
339 #ifdef RE_ENABLE_I18N
340 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
342 unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
343 wchar_t wc;
344 mbstate_t state;
346 p = buf;
347 *p++ = dfa->nodes[node].opr.c;
348 while (++node < dfa->nodes_len
349 && dfa->nodes[node].type == CHARACTER
350 && dfa->nodes[node].mb_partial)
351 *p++ = dfa->nodes[node].opr.c;
352 memset (&state, '\0', sizeof (state));
353 if (__mbrtowc (&wc, (const char *) buf, p - buf,
354 &state) == p - buf
355 && (__wcrtomb ((char *) buf, towlower (wc), &state)
356 != (size_t) -1))
357 re_set_fastmap (fastmap, 0, buf[0]);
358 re_free (buf);
360 #endif
362 else if (type == SIMPLE_BRACKET)
364 int i, ch;
365 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
367 int j;
368 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
369 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
370 if (w & ((bitset_word_t) 1 << j))
371 re_set_fastmap (fastmap, icase, ch);
374 #ifdef RE_ENABLE_I18N
375 else if (type == COMPLEX_BRACKET)
377 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
378 int i;
380 # ifdef _LIBC
381 /* See if we have to try all bytes which start multiple collation
382 elements.
383 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
384 collation element, and don't catch 'b' since 'b' is
385 the only collation element which starts from 'b' (and
386 it is caught by SIMPLE_BRACKET). */
387 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
388 && (cset->ncoll_syms || cset->nranges))
390 const int32_t *table = (const int32_t *)
391 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
392 for (i = 0; i < SBC_MAX; ++i)
393 if (table[i] < 0)
394 re_set_fastmap (fastmap, icase, i);
396 # endif /* _LIBC */
398 /* See if we have to start the match at all multibyte characters,
399 i.e. where we would not find an invalid sequence. This only
400 applies to multibyte character sets; for single byte character
401 sets, the SIMPLE_BRACKET again suffices. */
402 if (dfa->mb_cur_max > 1
403 && (cset->nchar_classes || cset->non_match || cset->nranges
404 # ifdef _LIBC
405 || cset->nequiv_classes
406 # endif /* _LIBC */
409 unsigned char c = 0;
412 mbstate_t mbs;
413 memset (&mbs, 0, sizeof (mbs));
414 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
415 re_set_fastmap (fastmap, false, (int) c);
417 while (++c != 0);
420 else
422 /* ... Else catch all bytes which can start the mbchars. */
423 for (i = 0; i < cset->nmbchars; ++i)
425 char buf[256];
426 mbstate_t state;
427 memset (&state, '\0', sizeof (state));
428 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
429 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
430 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
432 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
433 != (size_t) -1)
434 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
439 #endif /* RE_ENABLE_I18N */
440 else if (type == OP_PERIOD
441 #ifdef RE_ENABLE_I18N
442 || type == OP_UTF8_PERIOD
443 #endif /* RE_ENABLE_I18N */
444 || type == END_OF_RE)
446 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
447 if (type == END_OF_RE)
448 bufp->can_be_null = 1;
449 return;
454 /* Entry point for POSIX code. */
455 /* regcomp takes a regular expression as a string and compiles it.
457 PREG is a regex_t *. We do not expect any fields to be initialized,
458 since POSIX says we shouldn't. Thus, we set
460 `buffer' to the compiled pattern;
461 `used' to the length of the compiled pattern;
462 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
463 REG_EXTENDED bit in CFLAGS is set; otherwise, to
464 RE_SYNTAX_POSIX_BASIC;
465 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
466 `fastmap' to an allocated space for the fastmap;
467 `fastmap_accurate' to zero;
468 `re_nsub' to the number of subexpressions in PATTERN.
470 PATTERN is the address of the pattern string.
472 CFLAGS is a series of bits which affect compilation.
474 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
475 use POSIX basic syntax.
477 If REG_NEWLINE is set, then . and [^...] don't match newline.
478 Also, regexec will try a match beginning after every newline.
480 If REG_ICASE is set, then we considers upper- and lowercase
481 versions of letters to be equivalent when matching.
483 If REG_NOSUB is set, then when PREG is passed to regexec, that
484 routine will report only success or failure, and nothing about the
485 registers.
487 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
488 the return codes and their meanings.) */
491 regcomp (regex_t *__restrict preg,
492 const char *__restrict pattern,
493 int cflags)
495 reg_errcode_t ret;
496 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
497 : RE_SYNTAX_POSIX_BASIC);
499 preg->buffer = NULL;
500 preg->allocated = 0;
501 preg->used = 0;
503 /* Try to allocate space for the fastmap. */
504 preg->fastmap = re_malloc (char, SBC_MAX);
505 if (BE (preg->fastmap == NULL, 0))
506 return REG_ESPACE;
508 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
510 /* If REG_NEWLINE is set, newlines are treated differently. */
511 if (cflags & REG_NEWLINE)
512 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
513 syntax &= ~RE_DOT_NEWLINE;
514 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
515 /* It also changes the matching behavior. */
516 preg->newline_anchor = 1;
518 else
519 preg->newline_anchor = 0;
520 preg->no_sub = !!(cflags & REG_NOSUB);
521 preg->translate = NULL;
523 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
525 /* POSIX doesn't distinguish between an unmatched open-group and an
526 unmatched close-group: both are REG_EPAREN. */
527 if (ret == REG_ERPAREN)
528 ret = REG_EPAREN;
530 /* We have already checked preg->fastmap != NULL. */
531 if (BE (ret == REG_NOERROR, 1))
532 /* Compute the fastmap now, since regexec cannot modify the pattern
533 buffer. This function never fails in this implementation. */
534 (void) re_compile_fastmap (preg);
535 else
537 /* Some error occurred while compiling the expression. */
538 re_free (preg->fastmap);
539 preg->fastmap = NULL;
542 return (int) ret;
544 #ifdef _LIBC
545 weak_alias (__regcomp, regcomp)
546 #endif
548 /* Returns a message corresponding to an error code, ERRCODE, returned
549 from either regcomp or regexec. We don't use PREG here. */
551 size_t
552 regerror(int errcode, const regex_t *__restrict preg,
553 char *__restrict errbuf, size_t errbuf_size)
555 const char *msg;
556 size_t msg_size;
558 if (BE (errcode < 0
559 || errcode >= (int) (sizeof (__re_error_msgid_idx)
560 / sizeof (__re_error_msgid_idx[0])), 0))
561 /* Only error codes returned by the rest of the code should be passed
562 to this routine. If we are given anything else, or if other regex
563 code generates an invalid error code, then the program has a bug.
564 Dump core so we can fix it. */
565 abort ();
567 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
569 msg_size = strlen (msg) + 1; /* Includes the null. */
571 if (BE (errbuf_size != 0, 1))
573 if (BE (msg_size > errbuf_size, 0))
575 memcpy (errbuf, msg, errbuf_size - 1);
576 errbuf[errbuf_size - 1] = 0;
578 else
579 memcpy (errbuf, msg, msg_size);
582 return msg_size;
584 #ifdef _LIBC
585 weak_alias (__regerror, regerror)
586 #endif
589 #ifdef RE_ENABLE_I18N
590 /* This static array is used for the map to single-byte characters when
591 UTF-8 is used. Otherwise we would allocate memory just to initialize
592 it the same all the time. UTF-8 is the preferred encoding so this is
593 a worthwhile optimization. */
594 #if __GNUC__ >= 3
595 static const bitset_t utf8_sb_map = {
596 /* Set the first 128 bits. */
597 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
599 #else /* ! (__GNUC__ >= 3) */
600 static bitset_t utf8_sb_map;
601 #endif /* __GNUC__ >= 3 */
602 #endif /* RE_ENABLE_I18N */
605 static void
606 free_dfa_content (re_dfa_t *dfa)
608 int i, j;
610 if (dfa->nodes)
611 for (i = 0; i < dfa->nodes_len; ++i)
612 free_token (dfa->nodes + i);
613 re_free (dfa->nexts);
614 for (i = 0; i < dfa->nodes_len; ++i)
616 if (dfa->eclosures != NULL)
617 re_node_set_free (dfa->eclosures + i);
618 if (dfa->inveclosures != NULL)
619 re_node_set_free (dfa->inveclosures + i);
620 if (dfa->edests != NULL)
621 re_node_set_free (dfa->edests + i);
623 re_free (dfa->edests);
624 re_free (dfa->eclosures);
625 re_free (dfa->inveclosures);
626 re_free (dfa->nodes);
628 if (dfa->state_table)
629 for (i = 0; i <= dfa->state_hash_mask; ++i)
631 struct re_state_table_entry *entry = dfa->state_table + i;
632 for (j = 0; j < entry->num; ++j)
634 re_dfastate_t *state = entry->array[j];
635 free_state (state);
637 re_free (entry->array);
639 re_free (dfa->state_table);
640 #ifdef RE_ENABLE_I18N
641 if (dfa->sb_char != utf8_sb_map)
642 re_free (dfa->sb_char);
643 #endif
644 re_free (dfa->subexp_map);
645 #ifdef DEBUG
646 re_free (dfa->re_str);
647 #endif
649 re_free (dfa);
653 /* Free dynamically allocated space used by PREG. */
655 void
656 regfree (regex_t *preg)
658 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
659 if (BE (dfa != NULL, 1))
660 free_dfa_content (dfa);
661 preg->buffer = NULL;
662 preg->allocated = 0;
664 re_free (preg->fastmap);
665 preg->fastmap = NULL;
667 re_free (preg->translate);
668 preg->translate = NULL;
670 #ifdef _LIBC
671 weak_alias (__regfree, regfree)
672 #endif
674 /* Entry points compatible with 4.2 BSD regex library. We don't define
675 them unless specifically requested. */
677 #if defined _REGEX_RE_COMP || defined _LIBC
679 /* BSD has one and only one pattern buffer. */
680 static struct re_pattern_buffer re_comp_buf;
682 char *
683 # ifdef _LIBC
684 /* Make these definitions weak in libc, so POSIX programs can redefine
685 these names if they don't use our functions, and still use
686 regcomp/regexec above without link errors. */
687 weak_function
688 # endif
689 re_comp (s)
690 const char *s;
692 reg_errcode_t ret;
693 char *fastmap;
695 if (!s)
697 if (!re_comp_buf.buffer)
698 return gettext ("No previous regular expression");
699 return 0;
702 if (re_comp_buf.buffer)
704 fastmap = re_comp_buf.fastmap;
705 re_comp_buf.fastmap = NULL;
706 __regfree (&re_comp_buf);
707 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
708 re_comp_buf.fastmap = fastmap;
711 if (re_comp_buf.fastmap == NULL)
713 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
714 if (re_comp_buf.fastmap == NULL)
715 return (char *) gettext (__re_error_msgid
716 + __re_error_msgid_idx[(int) REG_ESPACE]);
719 /* Since `re_exec' always passes NULL for the `regs' argument, we
720 don't need to initialize the pattern buffer fields which affect it. */
722 /* Match anchors at newlines. */
723 re_comp_buf.newline_anchor = 1;
725 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
727 if (!ret)
728 return NULL;
730 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
731 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
734 #ifdef _LIBC
735 libc_freeres_fn (free_mem)
737 __regfree (&re_comp_buf);
739 #endif
741 #endif /* _REGEX_RE_COMP */
743 /* Internal entry point.
744 Compile the regular expression PATTERN, whose length is LENGTH.
745 SYNTAX indicate regular expression's syntax. */
747 static reg_errcode_t
748 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
749 reg_syntax_t syntax)
751 reg_errcode_t err = REG_NOERROR;
752 re_dfa_t *dfa;
753 re_string_t regexp;
755 /* Initialize the pattern buffer. */
756 preg->fastmap_accurate = 0;
757 preg->syntax = syntax;
758 preg->not_bol = preg->not_eol = 0;
759 preg->used = 0;
760 preg->re_nsub = 0;
761 preg->can_be_null = 0;
762 preg->regs_allocated = REGS_UNALLOCATED;
764 /* Initialize the dfa. */
765 dfa = (re_dfa_t *) preg->buffer;
766 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
768 /* If zero allocated, but buffer is non-null, try to realloc
769 enough space. This loses if buffer's address is bogus, but
770 that is the user's responsibility. If ->buffer is NULL this
771 is a simple allocation. */
772 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
773 if (dfa == NULL)
774 return REG_ESPACE;
775 preg->allocated = sizeof (re_dfa_t);
776 preg->buffer = (unsigned char *) dfa;
778 preg->used = sizeof (re_dfa_t);
780 err = init_dfa (dfa, length);
781 if (BE (err != REG_NOERROR, 0))
783 free_dfa_content (dfa);
784 preg->buffer = NULL;
785 preg->allocated = 0;
786 return err;
788 #ifdef DEBUG
789 /* Note: length+1 will not overflow since it is checked in init_dfa. */
790 dfa->re_str = re_malloc (char, length + 1);
791 strncpy (dfa->re_str, pattern, length + 1);
792 #endif
794 __libc_lock_init (dfa->lock);
796 err = re_string_construct (&regexp, pattern, length, preg->translate,
797 syntax & RE_ICASE, dfa);
798 if (BE (err != REG_NOERROR, 0))
800 re_compile_internal_free_return:
801 free_workarea_compile (preg);
802 re_string_destruct (&regexp);
803 free_dfa_content (dfa);
804 preg->buffer = NULL;
805 preg->allocated = 0;
806 return err;
809 /* Parse the regular expression, and build a structure tree. */
810 preg->re_nsub = 0;
811 dfa->str_tree = parse (&regexp, preg, syntax, &err);
812 if (BE (dfa->str_tree == NULL, 0))
813 goto re_compile_internal_free_return;
815 /* Analyze the tree and create the nfa. */
816 err = analyze (preg);
817 if (BE (err != REG_NOERROR, 0))
818 goto re_compile_internal_free_return;
820 #ifdef RE_ENABLE_I18N
821 /* If possible, do searching in single byte encoding to speed things up. */
822 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
823 optimize_utf8 (dfa);
824 #endif
826 /* Then create the initial state of the dfa. */
827 err = create_initial_state (dfa);
829 /* Release work areas. */
830 free_workarea_compile (preg);
831 re_string_destruct (&regexp);
833 if (BE (err != REG_NOERROR, 0))
835 free_dfa_content (dfa);
836 preg->buffer = NULL;
837 preg->allocated = 0;
840 return err;
843 /* Initialize DFA. We use the length of the regular expression PAT_LEN
844 as the initial length of some arrays. */
846 static reg_errcode_t
847 init_dfa (re_dfa_t *dfa, size_t pat_len)
849 unsigned int table_size;
850 #ifndef _LIBC
851 char *codeset_name;
852 #endif
854 memset (dfa, '\0', sizeof (re_dfa_t));
856 /* Force allocation of str_tree_storage the first time. */
857 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
859 /* Avoid overflows. */
860 if (pat_len == SIZE_MAX)
861 return REG_ESPACE;
863 dfa->nodes_alloc = pat_len + 1;
864 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
866 /* table_size = 2 ^ ceil(log pat_len) */
867 for (table_size = 1; ; table_size <<= 1)
868 if (table_size > pat_len)
869 break;
871 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
872 dfa->state_hash_mask = table_size - 1;
874 dfa->mb_cur_max = MB_CUR_MAX;
875 #ifdef _LIBC
876 if (dfa->mb_cur_max == 6
877 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
878 dfa->is_utf8 = 1;
879 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
880 != 0);
881 #else
882 # ifdef HAVE_LANGINFO_CODESET
883 codeset_name = nl_langinfo (CODESET);
884 # else
885 codeset_name = getenv ("LC_ALL");
886 if (codeset_name == NULL || codeset_name[0] == '\0')
887 codeset_name = getenv ("LC_CTYPE");
888 if (codeset_name == NULL || codeset_name[0] == '\0')
889 codeset_name = getenv ("LANG");
890 if (codeset_name == NULL)
891 codeset_name = "";
892 else if (strchr (codeset_name, '.') != NULL)
893 codeset_name = strchr (codeset_name, '.') + 1;
894 # endif
896 /* strcasecmp isn't a standard interface. brute force check */
897 #if 0
898 if (strcasecmp (codeset_name, "UTF-8") == 0
899 || strcasecmp (codeset_name, "UTF8") == 0)
900 dfa->is_utf8 = 1;
901 #else
902 if ( (codeset_name[0] == 'U' || codeset_name[0] == 'u')
903 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
904 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
905 && (codeset_name[3] == '-'
906 ? codeset_name[4] == '8' && codeset_name[5] == '\0'
907 : codeset_name[3] == '8' && codeset_name[4] == '\0'))
908 dfa->is_utf8 = 1;
909 #endif
911 /* We check exhaustively in the loop below if this charset is a
912 superset of ASCII. */
913 dfa->map_notascii = 0;
914 #endif
916 #ifdef RE_ENABLE_I18N
917 if (dfa->mb_cur_max > 1)
919 if (dfa->is_utf8)
921 #if !defined(__GNUC__) || __GNUC__ < 3
922 static short utf8_sb_map_inited = 0;
924 if (! utf8_sb_map_inited)
926 int i;
928 utf8_sb_map_inited = 0;
929 for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
930 utf8_sb_map[i] = BITSET_WORD_MAX;
932 #endif
933 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
935 else
937 int i, j, ch;
939 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
940 if (BE (dfa->sb_char == NULL, 0))
941 return REG_ESPACE;
943 /* Set the bits corresponding to single byte chars. */
944 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
945 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
947 wint_t wch = __btowc (ch);
948 if (wch != WEOF)
949 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
950 # ifndef _LIBC
951 if (isascii (ch) && wch != ch)
952 dfa->map_notascii = 1;
953 # endif
957 #endif
959 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
960 return REG_ESPACE;
961 return REG_NOERROR;
964 /* Initialize WORD_CHAR table, which indicate which character is
965 "word". In this case "word" means that it is the word construction
966 character used by some operators like "\<", "\>", etc. */
968 static void
969 internal_function
970 init_word_char (re_dfa_t *dfa)
972 int i, j, ch;
973 dfa->word_ops_used = 1;
974 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
975 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
976 if (isalnum (ch) || ch == '_')
977 dfa->word_char[i] |= (bitset_word_t) 1 << j;
980 /* Free the work area which are only used while compiling. */
982 static void
983 free_workarea_compile (regex_t *preg)
985 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
986 bin_tree_storage_t *storage, *next;
987 for (storage = dfa->str_tree_storage; storage; storage = next)
989 next = storage->next;
990 re_free (storage);
992 dfa->str_tree_storage = NULL;
993 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
994 dfa->str_tree = NULL;
995 re_free (dfa->org_indices);
996 dfa->org_indices = NULL;
999 /* Create initial states for all contexts. */
1001 static reg_errcode_t
1002 create_initial_state (re_dfa_t *dfa)
1004 int first, i;
1005 reg_errcode_t err;
1006 re_node_set init_nodes;
1008 /* Initial states have the epsilon closure of the node which is
1009 the first node of the regular expression. */
1010 first = dfa->str_tree->first->node_idx;
1011 dfa->init_node = first;
1012 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1013 if (BE (err != REG_NOERROR, 0))
1014 return err;
1016 /* The back-references which are in initial states can epsilon transit,
1017 since in this case all of the subexpressions can be null.
1018 Then we add epsilon closures of the nodes which are the next nodes of
1019 the back-references. */
1020 if (dfa->nbackref > 0)
1021 for (i = 0; i < init_nodes.nelem; ++i)
1023 int node_idx = init_nodes.elems[i];
1024 re_token_type_t type = dfa->nodes[node_idx].type;
1026 int clexp_idx;
1027 if (type != OP_BACK_REF)
1028 continue;
1029 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1031 re_token_t *clexp_node;
1032 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1033 if (clexp_node->type == OP_CLOSE_SUBEXP
1034 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1035 break;
1037 if (clexp_idx == init_nodes.nelem)
1038 continue;
1040 if (type == OP_BACK_REF)
1042 int dest_idx = dfa->edests[node_idx].elems[0];
1043 if (!re_node_set_contains (&init_nodes, dest_idx))
1045 reg_errcode_t err = re_node_set_merge (&init_nodes,
1046 dfa->eclosures
1047 + dest_idx);
1048 if (err != REG_NOERROR)
1049 return err;
1050 i = 0;
1055 /* It must be the first time to invoke acquire_state. */
1056 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1057 /* We don't check ERR here, since the initial state must not be NULL. */
1058 if (BE (dfa->init_state == NULL, 0))
1059 return err;
1060 if (dfa->init_state->has_constraint)
1062 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1063 CONTEXT_WORD);
1064 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1065 CONTEXT_NEWLINE);
1066 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1067 &init_nodes,
1068 CONTEXT_NEWLINE
1069 | CONTEXT_BEGBUF);
1070 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1071 || dfa->init_state_begbuf == NULL, 0))
1072 return err;
1074 else
1075 dfa->init_state_word = dfa->init_state_nl
1076 = dfa->init_state_begbuf = dfa->init_state;
1078 re_node_set_free (&init_nodes);
1079 return REG_NOERROR;
1082 #ifdef RE_ENABLE_I18N
1083 /* If it is possible to do searching in single byte encoding instead of UTF-8
1084 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1085 DFA nodes where needed. */
1087 static void
1088 optimize_utf8 (re_dfa_t *dfa)
1090 int node, i, mb_chars = 0, has_period = 0;
1092 for (node = 0; node < dfa->nodes_len; ++node)
1093 switch (dfa->nodes[node].type)
1095 case CHARACTER:
1096 if (dfa->nodes[node].opr.c >= 0x80)
1097 mb_chars = 1;
1098 break;
1099 case ANCHOR:
1100 switch (dfa->nodes[node].opr.ctx_type)
1102 case LINE_FIRST:
1103 case LINE_LAST:
1104 case BUF_FIRST:
1105 case BUF_LAST:
1106 break;
1107 default:
1108 /* Word anchors etc. cannot be handled. It's okay to test
1109 opr.ctx_type since constraints (for all DFA nodes) are
1110 created by ORing one or more opr.ctx_type values. */
1111 return;
1113 break;
1114 case OP_PERIOD:
1115 has_period = 1;
1116 break;
1117 case OP_BACK_REF:
1118 case OP_ALT:
1119 case END_OF_RE:
1120 case OP_DUP_ASTERISK:
1121 case OP_OPEN_SUBEXP:
1122 case OP_CLOSE_SUBEXP:
1123 break;
1124 case COMPLEX_BRACKET:
1125 return;
1126 case SIMPLE_BRACKET:
1127 /* Just double check. The non-ASCII range starts at 0x80. */
1128 assert (0x80 % BITSET_WORD_BITS == 0);
1129 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1130 if (dfa->nodes[node].opr.sbcset[i])
1131 return;
1132 break;
1133 default:
1134 abort ();
1137 if (mb_chars || has_period)
1138 for (node = 0; node < dfa->nodes_len; ++node)
1140 if (dfa->nodes[node].type == CHARACTER
1141 && dfa->nodes[node].opr.c >= 0x80)
1142 dfa->nodes[node].mb_partial = 0;
1143 else if (dfa->nodes[node].type == OP_PERIOD)
1144 dfa->nodes[node].type = OP_UTF8_PERIOD;
1147 /* The search can be in single byte locale. */
1148 dfa->mb_cur_max = 1;
1149 dfa->is_utf8 = 0;
1150 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1152 #endif
1154 /* Analyze the structure tree, and calculate "first", "next", "edest",
1155 "eclosure", and "inveclosure". */
1157 static reg_errcode_t
1158 analyze (regex_t *preg)
1160 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1161 reg_errcode_t ret;
1163 /* Allocate arrays. */
1164 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1165 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1166 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1167 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1168 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1169 || dfa->eclosures == NULL, 0))
1170 return REG_ESPACE;
1172 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1173 if (dfa->subexp_map != NULL)
1175 int i;
1176 for (i = 0; i < preg->re_nsub; i++)
1177 dfa->subexp_map[i] = i;
1178 preorder (dfa->str_tree, optimize_subexps, dfa);
1179 for (i = 0; i < preg->re_nsub; i++)
1180 if (dfa->subexp_map[i] != i)
1181 break;
1182 if (i == preg->re_nsub)
1184 free (dfa->subexp_map);
1185 dfa->subexp_map = NULL;
1189 ret = postorder (dfa->str_tree, lower_subexps, preg);
1190 if (BE (ret != REG_NOERROR, 0))
1191 return ret;
1192 ret = postorder (dfa->str_tree, calc_first, dfa);
1193 if (BE (ret != REG_NOERROR, 0))
1194 return ret;
1195 preorder (dfa->str_tree, calc_next, dfa);
1196 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1197 if (BE (ret != REG_NOERROR, 0))
1198 return ret;
1199 ret = calc_eclosure (dfa);
1200 if (BE (ret != REG_NOERROR, 0))
1201 return ret;
1203 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1204 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1205 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1206 || dfa->nbackref)
1208 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1209 if (BE (dfa->inveclosures == NULL, 0))
1210 return REG_ESPACE;
1211 ret = calc_inveclosure (dfa);
1214 return ret;
1217 /* Our parse trees are very unbalanced, so we cannot use a stack to
1218 implement parse tree visits. Instead, we use parent pointers and
1219 some hairy code in these two functions. */
1220 static reg_errcode_t
1221 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1222 void *extra)
1224 bin_tree_t *node, *prev;
1226 for (node = root; ; )
1228 /* Descend down the tree, preferably to the left (or to the right
1229 if that's the only child). */
1230 while (node->left || node->right)
1231 if (node->left)
1232 node = node->left;
1233 else
1234 node = node->right;
1238 reg_errcode_t err = fn (extra, node);
1239 if (BE (err != REG_NOERROR, 0))
1240 return err;
1241 if (node->parent == NULL)
1242 return REG_NOERROR;
1243 prev = node;
1244 node = node->parent;
1246 /* Go up while we have a node that is reached from the right. */
1247 while (node->right == prev || node->right == NULL);
1248 node = node->right;
1252 static reg_errcode_t
1253 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1254 void *extra)
1256 bin_tree_t *node;
1258 for (node = root; ; )
1260 reg_errcode_t err = fn (extra, node);
1261 if (BE (err != REG_NOERROR, 0))
1262 return err;
1264 /* Go to the left node, or up and to the right. */
1265 if (node->left)
1266 node = node->left;
1267 else
1269 bin_tree_t *prev = NULL;
1270 while (node->right == prev || node->right == NULL)
1272 prev = node;
1273 node = node->parent;
1274 if (!node)
1275 return REG_NOERROR;
1277 node = node->right;
1282 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1283 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1284 backreferences as well. Requires a preorder visit. */
1285 static reg_errcode_t
1286 optimize_subexps (void *extra, bin_tree_t *node)
1288 re_dfa_t *dfa = (re_dfa_t *) extra;
1290 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1292 int idx = node->token.opr.idx;
1293 node->token.opr.idx = dfa->subexp_map[idx];
1294 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1297 else if (node->token.type == SUBEXP
1298 && node->left && node->left->token.type == SUBEXP)
1300 int other_idx = node->left->token.opr.idx;
1302 node->left = node->left->left;
1303 if (node->left)
1304 node->left->parent = node;
1306 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1307 if (other_idx < BITSET_WORD_BITS)
1308 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1311 return REG_NOERROR;
1314 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1315 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1316 static reg_errcode_t
1317 lower_subexps (void *extra, bin_tree_t *node)
1319 regex_t *preg = (regex_t *) extra;
1320 reg_errcode_t err = REG_NOERROR;
1322 if (node->left && node->left->token.type == SUBEXP)
1324 node->left = lower_subexp (&err, preg, node->left);
1325 if (node->left)
1326 node->left->parent = node;
1328 if (node->right && node->right->token.type == SUBEXP)
1330 node->right = lower_subexp (&err, preg, node->right);
1331 if (node->right)
1332 node->right->parent = node;
1335 return err;
1338 static bin_tree_t *
1339 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1341 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1342 bin_tree_t *body = node->left;
1343 bin_tree_t *op, *cls, *tree1, *tree;
1345 if (preg->no_sub
1346 /* We do not optimize empty subexpressions, because otherwise we may
1347 have bad CONCAT nodes with NULL children. This is obviously not
1348 very common, so we do not lose much. An example that triggers
1349 this case is the sed "script" /\(\)/x. */
1350 && node->left != NULL
1351 && (node->token.opr.idx >= BITSET_WORD_BITS
1352 || !(dfa->used_bkref_map
1353 & ((bitset_word_t) 1 << node->token.opr.idx))))
1354 return node->left;
1356 /* Convert the SUBEXP node to the concatenation of an
1357 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1358 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1359 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1360 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1361 tree = create_tree (dfa, op, tree1, CONCAT);
1362 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1364 *err = REG_ESPACE;
1365 return NULL;
1368 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1369 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1370 return tree;
1373 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1374 nodes. Requires a postorder visit. */
1375 static reg_errcode_t
1376 calc_first (void *extra, bin_tree_t *node)
1378 re_dfa_t *dfa = (re_dfa_t *) extra;
1379 if (node->token.type == CONCAT)
1381 node->first = node->left->first;
1382 node->node_idx = node->left->node_idx;
1384 else
1386 node->first = node;
1387 node->node_idx = re_dfa_add_node (dfa, node->token);
1388 if (BE (node->node_idx == -1, 0))
1389 return REG_ESPACE;
1390 if (node->token.type == ANCHOR)
1391 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1393 return REG_NOERROR;
1396 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1397 static reg_errcode_t
1398 calc_next (void *extra, bin_tree_t *node)
1400 switch (node->token.type)
1402 case OP_DUP_ASTERISK:
1403 node->left->next = node;
1404 break;
1405 case CONCAT:
1406 node->left->next = node->right->first;
1407 node->right->next = node->next;
1408 break;
1409 default:
1410 if (node->left)
1411 node->left->next = node->next;
1412 if (node->right)
1413 node->right->next = node->next;
1414 break;
1416 return REG_NOERROR;
1419 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1420 static reg_errcode_t
1421 link_nfa_nodes (void *extra, bin_tree_t *node)
1423 re_dfa_t *dfa = (re_dfa_t *) extra;
1424 int idx = node->node_idx;
1425 reg_errcode_t err = REG_NOERROR;
1427 switch (node->token.type)
1429 case CONCAT:
1430 break;
1432 case END_OF_RE:
1433 assert (node->next == NULL);
1434 break;
1436 case OP_DUP_ASTERISK:
1437 case OP_ALT:
1439 int left, right;
1440 dfa->has_plural_match = 1;
1441 if (node->left != NULL)
1442 left = node->left->first->node_idx;
1443 else
1444 left = node->next->node_idx;
1445 if (node->right != NULL)
1446 right = node->right->first->node_idx;
1447 else
1448 right = node->next->node_idx;
1449 assert (left > -1);
1450 assert (right > -1);
1451 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1453 break;
1455 case ANCHOR:
1456 case OP_OPEN_SUBEXP:
1457 case OP_CLOSE_SUBEXP:
1458 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1459 break;
1461 case OP_BACK_REF:
1462 dfa->nexts[idx] = node->next->node_idx;
1463 if (node->token.type == OP_BACK_REF)
1464 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1465 break;
1467 default:
1468 assert (!IS_EPSILON_NODE (node->token.type));
1469 dfa->nexts[idx] = node->next->node_idx;
1470 break;
1473 return err;
1476 /* Duplicate the epsilon closure of the node ROOT_NODE.
1477 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1478 to their own constraint. */
1480 static reg_errcode_t
1481 internal_function
1482 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1483 int root_node, unsigned int init_constraint)
1485 int org_node, clone_node, ret;
1486 unsigned int constraint = init_constraint;
1487 for (org_node = top_org_node, clone_node = top_clone_node;;)
1489 int org_dest, clone_dest;
1490 if (dfa->nodes[org_node].type == OP_BACK_REF)
1492 /* If the back reference epsilon-transit, its destination must
1493 also have the constraint. Then duplicate the epsilon closure
1494 of the destination of the back reference, and store it in
1495 edests of the back reference. */
1496 org_dest = dfa->nexts[org_node];
1497 re_node_set_empty (dfa->edests + clone_node);
1498 clone_dest = duplicate_node (dfa, org_dest, constraint);
1499 if (BE (clone_dest == -1, 0))
1500 return REG_ESPACE;
1501 dfa->nexts[clone_node] = dfa->nexts[org_node];
1502 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1503 if (BE (ret < 0, 0))
1504 return REG_ESPACE;
1506 else if (dfa->edests[org_node].nelem == 0)
1508 /* In case of the node can't epsilon-transit, don't duplicate the
1509 destination and store the original destination as the
1510 destination of the node. */
1511 dfa->nexts[clone_node] = dfa->nexts[org_node];
1512 break;
1514 else if (dfa->edests[org_node].nelem == 1)
1516 /* In case of the node can epsilon-transit, and it has only one
1517 destination. */
1518 org_dest = dfa->edests[org_node].elems[0];
1519 re_node_set_empty (dfa->edests + clone_node);
1520 /* If the node is root_node itself, it means the epsilon clsoure
1521 has a loop. Then tie it to the destination of the root_node. */
1522 if (org_node == root_node && clone_node != org_node)
1524 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1525 if (BE (ret < 0, 0))
1526 return REG_ESPACE;
1527 break;
1529 /* In case of the node has another constraint, add it. */
1530 constraint |= dfa->nodes[org_node].constraint;
1531 clone_dest = duplicate_node (dfa, org_dest, constraint);
1532 if (BE (clone_dest == -1, 0))
1533 return REG_ESPACE;
1534 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1535 if (BE (ret < 0, 0))
1536 return REG_ESPACE;
1538 else /* dfa->edests[org_node].nelem == 2 */
1540 /* In case of the node can epsilon-transit, and it has two
1541 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1542 org_dest = dfa->edests[org_node].elems[0];
1543 re_node_set_empty (dfa->edests + clone_node);
1544 /* Search for a duplicated node which satisfies the constraint. */
1545 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1546 if (clone_dest == -1)
1548 /* There is no such duplicated node, create a new one. */
1549 reg_errcode_t err;
1550 clone_dest = duplicate_node (dfa, org_dest, constraint);
1551 if (BE (clone_dest == -1, 0))
1552 return REG_ESPACE;
1553 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1554 if (BE (ret < 0, 0))
1555 return REG_ESPACE;
1556 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1557 root_node, constraint);
1558 if (BE (err != REG_NOERROR, 0))
1559 return err;
1561 else
1563 /* There is a duplicated node which satisfies the constraint,
1564 use it to avoid infinite loop. */
1565 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1566 if (BE (ret < 0, 0))
1567 return REG_ESPACE;
1570 org_dest = dfa->edests[org_node].elems[1];
1571 clone_dest = duplicate_node (dfa, org_dest, constraint);
1572 if (BE (clone_dest == -1, 0))
1573 return REG_ESPACE;
1574 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1575 if (BE (ret < 0, 0))
1576 return REG_ESPACE;
1578 org_node = org_dest;
1579 clone_node = clone_dest;
1581 return REG_NOERROR;
1584 /* Search for a node which is duplicated from the node ORG_NODE, and
1585 satisfies the constraint CONSTRAINT. */
1587 static int
1588 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1589 unsigned int constraint)
1591 int idx;
1592 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1594 if (org_node == dfa->org_indices[idx]
1595 && constraint == dfa->nodes[idx].constraint)
1596 return idx; /* Found. */
1598 return -1; /* Not found. */
1601 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1602 Return the index of the new node, or -1 if insufficient storage is
1603 available. */
1605 static int
1606 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1608 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1609 if (BE (dup_idx != -1, 1))
1611 dfa->nodes[dup_idx].constraint = constraint;
1612 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1613 dfa->nodes[dup_idx].duplicated = 1;
1615 /* Store the index of the original node. */
1616 dfa->org_indices[dup_idx] = org_idx;
1618 return dup_idx;
1621 static reg_errcode_t
1622 calc_inveclosure (re_dfa_t *dfa)
1624 int src, idx, ret;
1625 for (idx = 0; idx < dfa->nodes_len; ++idx)
1626 re_node_set_init_empty (dfa->inveclosures + idx);
1628 for (src = 0; src < dfa->nodes_len; ++src)
1630 int *elems = dfa->eclosures[src].elems;
1631 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1633 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1634 if (BE (ret == -1, 0))
1635 return REG_ESPACE;
1639 return REG_NOERROR;
1642 /* Calculate "eclosure" for all the node in DFA. */
1644 static reg_errcode_t
1645 calc_eclosure (re_dfa_t *dfa)
1647 int node_idx, incomplete;
1648 #ifdef DEBUG
1649 assert (dfa->nodes_len > 0);
1650 #endif
1651 incomplete = 0;
1652 /* For each nodes, calculate epsilon closure. */
1653 for (node_idx = 0; ; ++node_idx)
1655 reg_errcode_t err;
1656 re_node_set eclosure_elem;
1657 if (node_idx == dfa->nodes_len)
1659 if (!incomplete)
1660 break;
1661 incomplete = 0;
1662 node_idx = 0;
1665 #ifdef DEBUG
1666 assert (dfa->eclosures[node_idx].nelem != -1);
1667 #endif
1669 /* If we have already calculated, skip it. */
1670 if (dfa->eclosures[node_idx].nelem != 0)
1671 continue;
1672 /* Calculate epsilon closure of `node_idx'. */
1673 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1674 if (BE (err != REG_NOERROR, 0))
1675 return err;
1677 if (dfa->eclosures[node_idx].nelem == 0)
1679 incomplete = 1;
1680 re_node_set_free (&eclosure_elem);
1683 return REG_NOERROR;
1686 /* Calculate epsilon closure of NODE. */
1688 static reg_errcode_t
1689 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1691 reg_errcode_t err;
1692 int i;
1693 re_node_set eclosure;
1694 int ret;
1695 int incomplete = 0;
1696 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1697 if (BE (err != REG_NOERROR, 0))
1698 return err;
1700 /* This indicates that we are calculating this node now.
1701 We reference this value to avoid infinite loop. */
1702 dfa->eclosures[node].nelem = -1;
1704 /* If the current node has constraints, duplicate all nodes
1705 since they must inherit the constraints. */
1706 if (dfa->nodes[node].constraint
1707 && dfa->edests[node].nelem
1708 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1710 err = duplicate_node_closure (dfa, node, node, node,
1711 dfa->nodes[node].constraint);
1712 if (BE (err != REG_NOERROR, 0))
1713 return err;
1716 /* Expand each epsilon destination nodes. */
1717 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1718 for (i = 0; i < dfa->edests[node].nelem; ++i)
1720 re_node_set eclosure_elem;
1721 int edest = dfa->edests[node].elems[i];
1722 /* If calculating the epsilon closure of `edest' is in progress,
1723 return intermediate result. */
1724 if (dfa->eclosures[edest].nelem == -1)
1726 incomplete = 1;
1727 continue;
1729 /* If we haven't calculated the epsilon closure of `edest' yet,
1730 calculate now. Otherwise use calculated epsilon closure. */
1731 if (dfa->eclosures[edest].nelem == 0)
1733 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1734 if (BE (err != REG_NOERROR, 0))
1735 return err;
1737 else
1738 eclosure_elem = dfa->eclosures[edest];
1739 /* Merge the epsilon closure of `edest'. */
1740 err = re_node_set_merge (&eclosure, &eclosure_elem);
1741 if (BE (err != REG_NOERROR, 0))
1742 return err;
1743 /* If the epsilon closure of `edest' is incomplete,
1744 the epsilon closure of this node is also incomplete. */
1745 if (dfa->eclosures[edest].nelem == 0)
1747 incomplete = 1;
1748 re_node_set_free (&eclosure_elem);
1752 /* An epsilon closure includes itself. */
1753 ret = re_node_set_insert (&eclosure, node);
1754 if (BE (ret < 0, 0))
1755 return REG_ESPACE;
1756 if (incomplete && !root)
1757 dfa->eclosures[node].nelem = 0;
1758 else
1759 dfa->eclosures[node] = eclosure;
1760 *new_set = eclosure;
1761 return REG_NOERROR;
1764 /* Functions for token which are used in the parser. */
1766 /* Fetch a token from INPUT.
1767 We must not use this function inside bracket expressions. */
1769 static void
1770 internal_function
1771 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1773 re_string_skip_bytes (input, peek_token (result, input, syntax));
1776 /* Peek a token from INPUT, and return the length of the token.
1777 We must not use this function inside bracket expressions. */
1779 static int
1780 internal_function
1781 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1783 unsigned char c;
1785 if (re_string_eoi (input))
1787 token->type = END_OF_RE;
1788 return 0;
1791 c = re_string_peek_byte (input, 0);
1792 token->opr.c = c;
1794 token->word_char = 0;
1795 #ifdef RE_ENABLE_I18N
1796 token->mb_partial = 0;
1797 if (input->mb_cur_max > 1 &&
1798 !re_string_first_byte (input, re_string_cur_idx (input)))
1800 token->type = CHARACTER;
1801 token->mb_partial = 1;
1802 return 1;
1804 #endif
1805 if (c == '\\')
1807 unsigned char c2;
1808 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1810 token->type = BACK_SLASH;
1811 return 1;
1814 c2 = re_string_peek_byte_case (input, 1);
1815 token->opr.c = c2;
1816 token->type = CHARACTER;
1817 #ifdef RE_ENABLE_I18N
1818 if (input->mb_cur_max > 1)
1820 wint_t wc = re_string_wchar_at (input,
1821 re_string_cur_idx (input) + 1);
1822 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1824 else
1825 #endif
1826 token->word_char = IS_WORD_CHAR (c2) != 0;
1828 switch (c2)
1830 case '|':
1831 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1832 token->type = OP_ALT;
1833 break;
1834 case '1': case '2': case '3': case '4': case '5':
1835 case '6': case '7': case '8': case '9':
1836 if (!(syntax & RE_NO_BK_REFS))
1838 token->type = OP_BACK_REF;
1839 token->opr.idx = c2 - '1';
1841 break;
1842 case '<':
1843 if (!(syntax & RE_NO_GNU_OPS))
1845 token->type = ANCHOR;
1846 token->opr.ctx_type = WORD_FIRST;
1848 break;
1849 case '>':
1850 if (!(syntax & RE_NO_GNU_OPS))
1852 token->type = ANCHOR;
1853 token->opr.ctx_type = WORD_LAST;
1855 break;
1856 case 'b':
1857 if (!(syntax & RE_NO_GNU_OPS))
1859 token->type = ANCHOR;
1860 token->opr.ctx_type = WORD_DELIM;
1862 break;
1863 case 'B':
1864 if (!(syntax & RE_NO_GNU_OPS))
1866 token->type = ANCHOR;
1867 token->opr.ctx_type = NOT_WORD_DELIM;
1869 break;
1870 case 'w':
1871 if (!(syntax & RE_NO_GNU_OPS))
1872 token->type = OP_WORD;
1873 break;
1874 case 'W':
1875 if (!(syntax & RE_NO_GNU_OPS))
1876 token->type = OP_NOTWORD;
1877 break;
1878 case 's':
1879 if (!(syntax & RE_NO_GNU_OPS))
1880 token->type = OP_SPACE;
1881 break;
1882 case 'S':
1883 if (!(syntax & RE_NO_GNU_OPS))
1884 token->type = OP_NOTSPACE;
1885 break;
1886 case '`':
1887 if (!(syntax & RE_NO_GNU_OPS))
1889 token->type = ANCHOR;
1890 token->opr.ctx_type = BUF_FIRST;
1892 break;
1893 case '\'':
1894 if (!(syntax & RE_NO_GNU_OPS))
1896 token->type = ANCHOR;
1897 token->opr.ctx_type = BUF_LAST;
1899 break;
1900 case '(':
1901 if (!(syntax & RE_NO_BK_PARENS))
1902 token->type = OP_OPEN_SUBEXP;
1903 break;
1904 case ')':
1905 if (!(syntax & RE_NO_BK_PARENS))
1906 token->type = OP_CLOSE_SUBEXP;
1907 break;
1908 case '+':
1909 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1910 token->type = OP_DUP_PLUS;
1911 break;
1912 case '?':
1913 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1914 token->type = OP_DUP_QUESTION;
1915 break;
1916 case '{':
1917 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1918 token->type = OP_OPEN_DUP_NUM;
1919 break;
1920 case '}':
1921 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1922 token->type = OP_CLOSE_DUP_NUM;
1923 break;
1924 default:
1925 break;
1927 return 2;
1930 token->type = CHARACTER;
1931 #ifdef RE_ENABLE_I18N
1932 if (input->mb_cur_max > 1)
1934 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1935 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1937 else
1938 #endif
1939 token->word_char = IS_WORD_CHAR (token->opr.c);
1941 switch (c)
1943 case '\n':
1944 if (syntax & RE_NEWLINE_ALT)
1945 token->type = OP_ALT;
1946 break;
1947 case '|':
1948 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1949 token->type = OP_ALT;
1950 break;
1951 case '*':
1952 token->type = OP_DUP_ASTERISK;
1953 break;
1954 case '+':
1955 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1956 token->type = OP_DUP_PLUS;
1957 break;
1958 case '?':
1959 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1960 token->type = OP_DUP_QUESTION;
1961 break;
1962 case '{':
1963 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1964 token->type = OP_OPEN_DUP_NUM;
1965 break;
1966 case '}':
1967 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1968 token->type = OP_CLOSE_DUP_NUM;
1969 break;
1970 case '(':
1971 if (syntax & RE_NO_BK_PARENS)
1972 token->type = OP_OPEN_SUBEXP;
1973 break;
1974 case ')':
1975 if (syntax & RE_NO_BK_PARENS)
1976 token->type = OP_CLOSE_SUBEXP;
1977 break;
1978 case '[':
1979 token->type = OP_OPEN_BRACKET;
1980 break;
1981 case '.':
1982 token->type = OP_PERIOD;
1983 break;
1984 case '^':
1985 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1986 re_string_cur_idx (input) != 0)
1988 char prev = re_string_peek_byte (input, -1);
1989 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1990 break;
1992 token->type = ANCHOR;
1993 token->opr.ctx_type = LINE_FIRST;
1994 break;
1995 case '$':
1996 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1997 re_string_cur_idx (input) + 1 != re_string_length (input))
1999 re_token_t next;
2000 re_string_skip_bytes (input, 1);
2001 peek_token (&next, input, syntax);
2002 re_string_skip_bytes (input, -1);
2003 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2004 break;
2006 token->type = ANCHOR;
2007 token->opr.ctx_type = LINE_LAST;
2008 break;
2009 default:
2010 break;
2012 return 1;
2015 /* Peek a token from INPUT, and return the length of the token.
2016 We must not use this function out of bracket expressions. */
2018 static int
2019 internal_function
2020 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2022 unsigned char c;
2023 if (re_string_eoi (input))
2025 token->type = END_OF_RE;
2026 return 0;
2028 c = re_string_peek_byte (input, 0);
2029 token->opr.c = c;
2031 #ifdef RE_ENABLE_I18N
2032 if (input->mb_cur_max > 1 &&
2033 !re_string_first_byte (input, re_string_cur_idx (input)))
2035 token->type = CHARACTER;
2036 return 1;
2038 #endif /* RE_ENABLE_I18N */
2040 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2041 && re_string_cur_idx (input) + 1 < re_string_length (input))
2043 /* In this case, '\' escape a character. */
2044 unsigned char c2;
2045 re_string_skip_bytes (input, 1);
2046 c2 = re_string_peek_byte (input, 0);
2047 token->opr.c = c2;
2048 token->type = CHARACTER;
2049 return 1;
2051 if (c == '[') /* '[' is a special char in a bracket exps. */
2053 unsigned char c2;
2054 int token_len;
2055 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2056 c2 = re_string_peek_byte (input, 1);
2057 else
2058 c2 = 0;
2059 token->opr.c = c2;
2060 token_len = 2;
2061 switch (c2)
2063 case '.':
2064 token->type = OP_OPEN_COLL_ELEM;
2065 break;
2066 case '=':
2067 token->type = OP_OPEN_EQUIV_CLASS;
2068 break;
2069 case ':':
2070 if (syntax & RE_CHAR_CLASSES)
2072 token->type = OP_OPEN_CHAR_CLASS;
2073 break;
2075 /* else fall through. */
2076 default:
2077 token->type = CHARACTER;
2078 token->opr.c = c;
2079 token_len = 1;
2080 break;
2082 return token_len;
2084 switch (c)
2086 case '-':
2087 token->type = OP_CHARSET_RANGE;
2088 break;
2089 case ']':
2090 token->type = OP_CLOSE_BRACKET;
2091 break;
2092 case '^':
2093 token->type = OP_NON_MATCH_LIST;
2094 break;
2095 default:
2096 token->type = CHARACTER;
2098 return 1;
2101 /* Functions for parser. */
2103 /* Entry point of the parser.
2104 Parse the regular expression REGEXP and return the structure tree.
2105 If an error has occurred, ERR is set by error code, and return NULL.
2106 This function build the following tree, from regular expression <reg_exp>:
2110 <reg_exp> EOR
2112 CAT means concatenation.
2113 EOR means end of regular expression. */
2115 static bin_tree_t *
2116 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2117 reg_errcode_t *err)
2119 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2120 bin_tree_t *tree, *eor, *root;
2121 re_token_t current_token;
2122 dfa->syntax = syntax;
2123 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2124 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2125 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2126 return NULL;
2127 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2128 if (tree != NULL)
2129 root = create_tree (dfa, tree, eor, CONCAT);
2130 else
2131 root = eor;
2132 if (BE (eor == NULL || root == NULL, 0))
2134 *err = REG_ESPACE;
2135 return NULL;
2137 return root;
2140 /* This function build the following tree, from regular expression
2141 <branch1>|<branch2>:
2145 <branch1> <branch2>
2147 ALT means alternative, which represents the operator `|'. */
2149 static bin_tree_t *
2150 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2151 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2153 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2154 bin_tree_t *tree, *branch = NULL;
2155 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2156 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2157 return NULL;
2159 while (token->type == OP_ALT)
2161 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2162 if (token->type != OP_ALT && token->type != END_OF_RE
2163 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2165 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2166 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2167 return NULL;
2169 else
2170 branch = NULL;
2171 tree = create_tree (dfa, tree, branch, OP_ALT);
2172 if (BE (tree == NULL, 0))
2174 *err = REG_ESPACE;
2175 return NULL;
2178 return tree;
2181 /* This function build the following tree, from regular expression
2182 <exp1><exp2>:
2186 <exp1> <exp2>
2188 CAT means concatenation. */
2190 static bin_tree_t *
2191 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2192 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2194 bin_tree_t *tree, *exp;
2195 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2196 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2197 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2198 return NULL;
2200 while (token->type != OP_ALT && token->type != END_OF_RE
2201 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2203 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2204 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2206 return NULL;
2208 if (tree != NULL && exp != NULL)
2210 tree = create_tree (dfa, tree, exp, CONCAT);
2211 if (tree == NULL)
2213 *err = REG_ESPACE;
2214 return NULL;
2217 else if (tree == NULL)
2218 tree = exp;
2219 /* Otherwise exp == NULL, we don't need to create new tree. */
2221 return tree;
2224 /* This function build the following tree, from regular expression a*:
2230 static bin_tree_t *
2231 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2232 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2234 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2235 bin_tree_t *tree;
2236 switch (token->type)
2238 case CHARACTER:
2239 tree = create_token_tree (dfa, NULL, NULL, token);
2240 if (BE (tree == NULL, 0))
2242 *err = REG_ESPACE;
2243 return NULL;
2245 #ifdef RE_ENABLE_I18N
2246 if (dfa->mb_cur_max > 1)
2248 while (!re_string_eoi (regexp)
2249 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2251 bin_tree_t *mbc_remain;
2252 fetch_token (token, regexp, syntax);
2253 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2254 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2255 if (BE (mbc_remain == NULL || tree == NULL, 0))
2257 *err = REG_ESPACE;
2258 return NULL;
2262 #endif
2263 break;
2264 case OP_OPEN_SUBEXP:
2265 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2266 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2267 return NULL;
2268 break;
2269 case OP_OPEN_BRACKET:
2270 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2271 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2272 return NULL;
2273 break;
2274 case OP_BACK_REF:
2275 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2277 *err = REG_ESUBREG;
2278 return NULL;
2280 dfa->used_bkref_map |= 1 << token->opr.idx;
2281 tree = create_token_tree (dfa, NULL, NULL, token);
2282 if (BE (tree == NULL, 0))
2284 *err = REG_ESPACE;
2285 return NULL;
2287 ++dfa->nbackref;
2288 dfa->has_mb_node = 1;
2289 break;
2290 case OP_OPEN_DUP_NUM:
2291 if (syntax & RE_CONTEXT_INVALID_DUP)
2293 *err = REG_BADRPT;
2294 return NULL;
2296 /* FALLTHROUGH */
2297 case OP_DUP_ASTERISK:
2298 case OP_DUP_PLUS:
2299 case OP_DUP_QUESTION:
2300 if (syntax & RE_CONTEXT_INVALID_OPS)
2302 *err = REG_BADRPT;
2303 return NULL;
2305 else if (syntax & RE_CONTEXT_INDEP_OPS)
2307 fetch_token (token, regexp, syntax);
2308 return parse_expression (regexp, preg, token, syntax, nest, err);
2310 /* else fall through */
2311 case OP_CLOSE_SUBEXP:
2312 if ((token->type == OP_CLOSE_SUBEXP) &&
2313 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2315 *err = REG_ERPAREN;
2316 return NULL;
2318 /* else fall through */
2319 case OP_CLOSE_DUP_NUM:
2320 /* We treat it as a normal character. */
2322 /* Then we can these characters as normal characters. */
2323 token->type = CHARACTER;
2324 /* mb_partial and word_char bits should be initialized already
2325 by peek_token. */
2326 tree = create_token_tree (dfa, NULL, NULL, token);
2327 if (BE (tree == NULL, 0))
2329 *err = REG_ESPACE;
2330 return NULL;
2332 break;
2333 case ANCHOR:
2334 if ((token->opr.ctx_type
2335 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2336 && dfa->word_ops_used == 0)
2337 init_word_char (dfa);
2338 if (token->opr.ctx_type == WORD_DELIM
2339 || token->opr.ctx_type == NOT_WORD_DELIM)
2341 bin_tree_t *tree_first, *tree_last;
2342 if (token->opr.ctx_type == WORD_DELIM)
2344 token->opr.ctx_type = WORD_FIRST;
2345 tree_first = create_token_tree (dfa, NULL, NULL, token);
2346 token->opr.ctx_type = WORD_LAST;
2348 else
2350 token->opr.ctx_type = INSIDE_WORD;
2351 tree_first = create_token_tree (dfa, NULL, NULL, token);
2352 token->opr.ctx_type = INSIDE_NOTWORD;
2354 tree_last = create_token_tree (dfa, NULL, NULL, token);
2355 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2356 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2358 *err = REG_ESPACE;
2359 return NULL;
2362 else
2364 tree = create_token_tree (dfa, NULL, NULL, token);
2365 if (BE (tree == NULL, 0))
2367 *err = REG_ESPACE;
2368 return NULL;
2371 /* We must return here, since ANCHORs can't be followed
2372 by repetition operators.
2373 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2374 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2375 fetch_token (token, regexp, syntax);
2376 return tree;
2377 case OP_PERIOD:
2378 tree = create_token_tree (dfa, NULL, NULL, token);
2379 if (BE (tree == NULL, 0))
2381 *err = REG_ESPACE;
2382 return NULL;
2384 if (dfa->mb_cur_max > 1)
2385 dfa->has_mb_node = 1;
2386 break;
2387 case OP_WORD:
2388 case OP_NOTWORD:
2389 tree = build_charclass_op (dfa, regexp->trans,
2390 "alnum",
2391 "_",
2392 token->type == OP_NOTWORD, err);
2393 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2394 return NULL;
2395 break;
2396 case OP_SPACE:
2397 case OP_NOTSPACE:
2398 tree = build_charclass_op (dfa, regexp->trans,
2399 "space",
2401 token->type == OP_NOTSPACE, err);
2402 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2403 return NULL;
2404 break;
2405 case OP_ALT:
2406 case END_OF_RE:
2407 return NULL;
2408 case BACK_SLASH:
2409 *err = REG_EESCAPE;
2410 return NULL;
2411 default:
2412 /* Must not happen? */
2413 #ifdef DEBUG
2414 assert (0);
2415 #endif
2416 return NULL;
2418 fetch_token (token, regexp, syntax);
2420 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2421 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2423 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2424 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2425 return NULL;
2426 /* In BRE consecutive duplications are not allowed. */
2427 if ((syntax & RE_CONTEXT_INVALID_DUP)
2428 && (token->type == OP_DUP_ASTERISK
2429 || token->type == OP_OPEN_DUP_NUM))
2431 *err = REG_BADRPT;
2432 return NULL;
2436 return tree;
2439 /* This function build the following tree, from regular expression
2440 (<reg_exp>):
2441 SUBEXP
2443 <reg_exp>
2446 static bin_tree_t *
2447 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2448 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2450 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2451 bin_tree_t *tree;
2452 size_t cur_nsub;
2453 cur_nsub = preg->re_nsub++;
2455 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2457 /* The subexpression may be a null string. */
2458 if (token->type == OP_CLOSE_SUBEXP)
2459 tree = NULL;
2460 else
2462 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2463 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2464 *err = REG_EPAREN;
2465 if (BE (*err != REG_NOERROR, 0))
2466 return NULL;
2469 if (cur_nsub <= '9' - '1')
2470 dfa->completed_bkref_map |= 1 << cur_nsub;
2472 tree = create_tree (dfa, tree, NULL, SUBEXP);
2473 if (BE (tree == NULL, 0))
2475 *err = REG_ESPACE;
2476 return NULL;
2478 tree->token.opr.idx = cur_nsub;
2479 return tree;
2482 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2484 static bin_tree_t *
2485 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2486 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2488 bin_tree_t *tree = NULL, *old_tree = NULL;
2489 int i, start, end, start_idx = re_string_cur_idx (regexp);
2490 #ifndef RE_TOKEN_INIT_BUG
2491 re_token_t start_token = *token;
2492 #else
2493 re_token_t start_token;
2495 memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2496 #endif
2498 if (token->type == OP_OPEN_DUP_NUM)
2500 end = 0;
2501 start = fetch_number (regexp, token, syntax);
2502 if (start == -1)
2504 if (token->type == CHARACTER && token->opr.c == ',')
2505 start = 0; /* We treat "{,m}" as "{0,m}". */
2506 else
2508 *err = REG_BADBR; /* <re>{} is invalid. */
2509 return NULL;
2512 if (BE (start != -2, 1))
2514 /* We treat "{n}" as "{n,n}". */
2515 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2516 : ((token->type == CHARACTER && token->opr.c == ',')
2517 ? fetch_number (regexp, token, syntax) : -2));
2519 if (BE (start == -2 || end == -2, 0))
2521 /* Invalid sequence. */
2522 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2524 if (token->type == END_OF_RE)
2525 *err = REG_EBRACE;
2526 else
2527 *err = REG_BADBR;
2529 return NULL;
2532 /* If the syntax bit is set, rollback. */
2533 re_string_set_index (regexp, start_idx);
2534 *token = start_token;
2535 token->type = CHARACTER;
2536 /* mb_partial and word_char bits should be already initialized by
2537 peek_token. */
2538 return elem;
2541 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2543 /* First number greater than second. */
2544 *err = REG_BADBR;
2545 return NULL;
2548 else
2550 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2551 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2554 fetch_token (token, regexp, syntax);
2556 if (BE (elem == NULL, 0))
2557 return NULL;
2558 if (BE (start == 0 && end == 0, 0))
2560 postorder (elem, free_tree, NULL);
2561 return NULL;
2564 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2565 if (BE (start > 0, 0))
2567 tree = elem;
2568 for (i = 2; i <= start; ++i)
2570 elem = duplicate_tree (elem, dfa);
2571 tree = create_tree (dfa, tree, elem, CONCAT);
2572 if (BE (elem == NULL || tree == NULL, 0))
2573 goto parse_dup_op_espace;
2576 if (start == end)
2577 return tree;
2579 /* Duplicate ELEM before it is marked optional. */
2580 elem = duplicate_tree (elem, dfa);
2581 old_tree = tree;
2583 else
2584 old_tree = NULL;
2586 if (elem->token.type == SUBEXP)
2587 postorder (elem, mark_opt_subexp, (void *) (intptr_t) elem->token.opr.idx);
2589 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2590 if (BE (tree == NULL, 0))
2591 goto parse_dup_op_espace;
2593 /* This loop is actually executed only when end != -1,
2594 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2595 already created the start+1-th copy. */
2596 for (i = start + 2; i <= end; ++i)
2598 elem = duplicate_tree (elem, dfa);
2599 tree = create_tree (dfa, tree, elem, CONCAT);
2600 if (BE (elem == NULL || tree == NULL, 0))
2601 goto parse_dup_op_espace;
2603 tree = create_tree (dfa, tree, NULL, OP_ALT);
2604 if (BE (tree == NULL, 0))
2605 goto parse_dup_op_espace;
2608 if (old_tree)
2609 tree = create_tree (dfa, old_tree, tree, CONCAT);
2611 return tree;
2613 parse_dup_op_espace:
2614 *err = REG_ESPACE;
2615 return NULL;
2618 /* Size of the names for collating symbol/equivalence_class/character_class.
2619 I'm not sure, but maybe enough. */
2620 #define BRACKET_NAME_BUF_SIZE 32
2622 #ifndef _LIBC
2623 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2624 Build the range expression which starts from START_ELEM, and ends
2625 at END_ELEM. The result are written to MBCSET and SBCSET.
2626 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2627 mbcset->range_ends, is a pointer argument since we may
2628 update it. */
2630 static reg_errcode_t
2631 internal_function
2632 # ifdef RE_ENABLE_I18N
2633 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2634 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2635 # else /* not RE_ENABLE_I18N */
2636 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2637 bracket_elem_t *end_elem)
2638 # endif /* not RE_ENABLE_I18N */
2640 unsigned int start_ch, end_ch;
2641 /* Equivalence Classes and Character Classes can't be a range start/end. */
2642 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2643 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2645 return REG_ERANGE;
2647 /* We can handle no multi character collating elements without libc
2648 support. */
2649 if (BE ((start_elem->type == COLL_SYM
2650 && strlen ((char *) start_elem->opr.name) > 1)
2651 || (end_elem->type == COLL_SYM
2652 && strlen ((char *) end_elem->opr.name) > 1), 0))
2653 return REG_ECOLLATE;
2655 # ifdef RE_ENABLE_I18N
2657 wchar_t wc;
2658 wint_t start_wc;
2659 wint_t end_wc;
2660 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2662 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2663 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2664 : 0));
2665 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2666 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2667 : 0));
2668 #ifdef GAWK
2670 * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2671 * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2672 * unsigned, so we don't have sign extension problems.
2674 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2675 ? start_ch : start_elem->opr.wch);
2676 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2677 ? end_ch : end_elem->opr.wch);
2678 #else
2679 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2680 ? __btowc (start_ch) : start_elem->opr.wch);
2681 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2682 ? __btowc (end_ch) : end_elem->opr.wch);
2683 #endif
2684 if (start_wc == WEOF || end_wc == WEOF)
2685 return REG_ECOLLATE;
2686 cmp_buf[0] = start_wc;
2687 cmp_buf[4] = end_wc;
2688 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2689 return REG_ERANGE;
2691 /* Got valid collation sequence values, add them as a new entry.
2692 However, for !_LIBC we have no collation elements: if the
2693 character set is single byte, the single byte character set
2694 that we build below suffices. parse_bracket_exp passes
2695 no MBCSET if dfa->mb_cur_max == 1. */
2696 if (mbcset)
2698 /* Check the space of the arrays. */
2699 if (BE (*range_alloc == mbcset->nranges, 0))
2701 /* There is not enough space, need realloc. */
2702 wchar_t *new_array_start, *new_array_end;
2703 int new_nranges;
2705 /* +1 in case of mbcset->nranges is 0. */
2706 new_nranges = 2 * mbcset->nranges + 1;
2707 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2708 are NULL if *range_alloc == 0. */
2709 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2710 new_nranges);
2711 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2712 new_nranges);
2714 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2715 return REG_ESPACE;
2717 mbcset->range_starts = new_array_start;
2718 mbcset->range_ends = new_array_end;
2719 *range_alloc = new_nranges;
2722 mbcset->range_starts[mbcset->nranges] = start_wc;
2723 mbcset->range_ends[mbcset->nranges++] = end_wc;
2726 /* Build the table for single byte characters. */
2727 for (wc = 0; wc < SBC_MAX; ++wc)
2729 cmp_buf[2] = wc;
2730 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2731 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2732 bitset_set (sbcset, wc);
2735 # else /* not RE_ENABLE_I18N */
2737 unsigned int ch;
2738 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2739 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2740 : 0));
2741 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2742 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2743 : 0));
2744 if (start_ch > end_ch)
2745 return REG_ERANGE;
2746 /* Build the table for single byte characters. */
2747 for (ch = 0; ch < SBC_MAX; ++ch)
2748 if (start_ch <= ch && ch <= end_ch)
2749 bitset_set (sbcset, ch);
2751 # endif /* not RE_ENABLE_I18N */
2752 return REG_NOERROR;
2754 #endif /* not _LIBC */
2756 #ifndef _LIBC
2757 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2758 Build the collating element which is represented by NAME.
2759 The result are written to MBCSET and SBCSET.
2760 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2761 pointer argument since we may update it. */
2763 static reg_errcode_t
2764 internal_function
2765 # ifdef RE_ENABLE_I18N
2766 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2767 int *coll_sym_alloc, const unsigned char *name)
2768 # else /* not RE_ENABLE_I18N */
2769 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2770 # endif /* not RE_ENABLE_I18N */
2772 size_t name_len = strlen ((const char *) name);
2773 if (BE (name_len != 1, 0))
2774 return REG_ECOLLATE;
2775 else
2777 bitset_set (sbcset, name[0]);
2778 return REG_NOERROR;
2781 #endif /* not _LIBC */
2783 /* This function parse bracket expression like "[abc]", "[a-c]",
2784 "[[.a-a.]]" etc. */
2786 static bin_tree_t *
2787 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2788 reg_syntax_t syntax, reg_errcode_t *err)
2790 #ifdef _LIBC
2791 const unsigned char *collseqmb;
2792 const char *collseqwc;
2793 uint32_t nrules;
2794 int32_t table_size;
2795 const int32_t *symb_table;
2796 const unsigned char *extra;
2798 /* Local function for parse_bracket_exp used in _LIBC environment.
2799 Seek the collating symbol entry correspondings to NAME.
2800 Return the index of the symbol in the SYMB_TABLE. */
2802 auto inline int32_t
2803 __attribute ((always_inline))
2804 seek_collating_symbol_entry (name, name_len)
2805 const unsigned char *name;
2806 size_t name_len;
2808 int32_t hash = elem_hash ((const char *) name, name_len);
2809 int32_t elem = hash % table_size;
2810 if (symb_table[2 * elem] != 0)
2812 int32_t second = hash % (table_size - 2) + 1;
2816 /* First compare the hashing value. */
2817 if (symb_table[2 * elem] == hash
2818 /* Compare the length of the name. */
2819 && name_len == extra[symb_table[2 * elem + 1]]
2820 /* Compare the name. */
2821 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2822 name_len) == 0)
2824 /* Yep, this is the entry. */
2825 break;
2828 /* Next entry. */
2829 elem += second;
2831 while (symb_table[2 * elem] != 0);
2833 return elem;
2836 /* Local function for parse_bracket_exp used in _LIBC environment.
2837 Look up the collation sequence value of BR_ELEM.
2838 Return the value if succeeded, UINT_MAX otherwise. */
2840 auto inline unsigned int
2841 __attribute ((always_inline))
2842 lookup_collation_sequence_value (br_elem)
2843 bracket_elem_t *br_elem;
2845 if (br_elem->type == SB_CHAR)
2848 if (MB_CUR_MAX == 1)
2850 if (nrules == 0)
2851 return collseqmb[br_elem->opr.ch];
2852 else
2854 wint_t wc = __btowc (br_elem->opr.ch);
2855 return __collseq_table_lookup (collseqwc, wc);
2858 else if (br_elem->type == MB_CHAR)
2860 if (nrules != 0)
2861 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2863 else if (br_elem->type == COLL_SYM)
2865 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2866 if (nrules != 0)
2868 int32_t elem, idx;
2869 elem = seek_collating_symbol_entry (br_elem->opr.name,
2870 sym_name_len);
2871 if (symb_table[2 * elem] != 0)
2873 /* We found the entry. */
2874 idx = symb_table[2 * elem + 1];
2875 /* Skip the name of collating element name. */
2876 idx += 1 + extra[idx];
2877 /* Skip the byte sequence of the collating element. */
2878 idx += 1 + extra[idx];
2879 /* Adjust for the alignment. */
2880 idx = (idx + 3) & ~3;
2881 /* Skip the multibyte collation sequence value. */
2882 idx += sizeof (unsigned int);
2883 /* Skip the wide char sequence of the collating element. */
2884 idx += sizeof (unsigned int) *
2885 (1 + *(unsigned int *) (extra + idx));
2886 /* Return the collation sequence value. */
2887 return *(unsigned int *) (extra + idx);
2889 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2891 /* No valid character. Match it as a single byte
2892 character. */
2893 return collseqmb[br_elem->opr.name[0]];
2896 else if (sym_name_len == 1)
2897 return collseqmb[br_elem->opr.name[0]];
2899 return UINT_MAX;
2902 /* Local function for parse_bracket_exp used in _LIBC environment.
2903 Build the range expression which starts from START_ELEM, and ends
2904 at END_ELEM. The result are written to MBCSET and SBCSET.
2905 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2906 mbcset->range_ends, is a pointer argument since we may
2907 update it. */
2909 auto inline reg_errcode_t
2910 __attribute ((always_inline))
2911 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2912 re_charset_t *mbcset;
2913 int *range_alloc;
2914 bitset_t sbcset;
2915 bracket_elem_t *start_elem, *end_elem;
2917 unsigned int ch;
2918 uint32_t start_collseq;
2919 uint32_t end_collseq;
2921 /* Equivalence Classes and Character Classes can't be a range
2922 start/end. */
2923 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2924 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2926 return REG_ERANGE;
2928 start_collseq = lookup_collation_sequence_value (start_elem);
2929 end_collseq = lookup_collation_sequence_value (end_elem);
2930 /* Check start/end collation sequence values. */
2931 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2932 return REG_ECOLLATE;
2933 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2934 return REG_ERANGE;
2936 /* Got valid collation sequence values, add them as a new entry.
2937 However, if we have no collation elements, and the character set
2938 is single byte, the single byte character set that we
2939 build below suffices. */
2940 if (nrules > 0 || dfa->mb_cur_max > 1)
2942 /* Check the space of the arrays. */
2943 if (BE (*range_alloc == mbcset->nranges, 0))
2945 /* There is not enough space, need realloc. */
2946 uint32_t *new_array_start;
2947 uint32_t *new_array_end;
2948 int new_nranges;
2950 /* +1 in case of mbcset->nranges is 0. */
2951 new_nranges = 2 * mbcset->nranges + 1;
2952 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2953 new_nranges);
2954 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2955 new_nranges);
2957 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2958 return REG_ESPACE;
2960 mbcset->range_starts = new_array_start;
2961 mbcset->range_ends = new_array_end;
2962 *range_alloc = new_nranges;
2965 mbcset->range_starts[mbcset->nranges] = start_collseq;
2966 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2969 /* Build the table for single byte characters. */
2970 for (ch = 0; ch < SBC_MAX; ch++)
2972 uint32_t ch_collseq;
2974 if (MB_CUR_MAX == 1)
2976 if (nrules == 0)
2977 ch_collseq = collseqmb[ch];
2978 else
2979 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2980 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2981 bitset_set (sbcset, ch);
2983 return REG_NOERROR;
2986 /* Local function for parse_bracket_exp used in _LIBC environment.
2987 Build the collating element which is represented by NAME.
2988 The result are written to MBCSET and SBCSET.
2989 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2990 pointer argument since we may update it. */
2992 auto inline reg_errcode_t
2993 __attribute ((always_inline))
2994 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2995 re_charset_t *mbcset;
2996 int *coll_sym_alloc;
2997 bitset_t sbcset;
2998 const unsigned char *name;
3000 int32_t elem, idx;
3001 size_t name_len = strlen ((const char *) name);
3002 if (nrules != 0)
3004 elem = seek_collating_symbol_entry (name, name_len);
3005 if (symb_table[2 * elem] != 0)
3007 /* We found the entry. */
3008 idx = symb_table[2 * elem + 1];
3009 /* Skip the name of collating element name. */
3010 idx += 1 + extra[idx];
3012 else if (symb_table[2 * elem] == 0 && name_len == 1)
3014 /* No valid character, treat it as a normal
3015 character. */
3016 bitset_set (sbcset, name[0]);
3017 return REG_NOERROR;
3019 else
3020 return REG_ECOLLATE;
3022 /* Got valid collation sequence, add it as a new entry. */
3023 /* Check the space of the arrays. */
3024 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3026 /* Not enough, realloc it. */
3027 /* +1 in case of mbcset->ncoll_syms is 0. */
3028 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3029 /* Use realloc since mbcset->coll_syms is NULL
3030 if *alloc == 0. */
3031 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3032 new_coll_sym_alloc);
3033 if (BE (new_coll_syms == NULL, 0))
3034 return REG_ESPACE;
3035 mbcset->coll_syms = new_coll_syms;
3036 *coll_sym_alloc = new_coll_sym_alloc;
3038 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3039 return REG_NOERROR;
3041 else
3043 if (BE (name_len != 1, 0))
3044 return REG_ECOLLATE;
3045 else
3047 bitset_set (sbcset, name[0]);
3048 return REG_NOERROR;
3052 #endif
3054 re_token_t br_token;
3055 re_bitset_ptr_t sbcset;
3056 #ifdef RE_ENABLE_I18N
3057 re_charset_t *mbcset;
3058 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3059 int equiv_class_alloc = 0, char_class_alloc = 0;
3060 #endif /* not RE_ENABLE_I18N */
3061 int non_match = 0;
3062 bin_tree_t *work_tree;
3063 int token_len;
3064 int first_round = 1;
3065 #ifdef _LIBC
3066 collseqmb = (const unsigned char *)
3067 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3068 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3069 if (nrules)
3072 if (MB_CUR_MAX > 1)
3074 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3075 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3076 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3077 _NL_COLLATE_SYMB_TABLEMB);
3078 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3079 _NL_COLLATE_SYMB_EXTRAMB);
3081 #endif
3082 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3083 #ifdef RE_ENABLE_I18N
3084 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3085 #endif /* RE_ENABLE_I18N */
3086 #ifdef RE_ENABLE_I18N
3087 if (BE (sbcset == NULL || mbcset == NULL, 0))
3088 #else
3089 if (BE (sbcset == NULL, 0))
3090 #endif /* RE_ENABLE_I18N */
3092 *err = REG_ESPACE;
3093 return NULL;
3096 token_len = peek_token_bracket (token, regexp, syntax);
3097 if (BE (token->type == END_OF_RE, 0))
3099 *err = REG_BADPAT;
3100 goto parse_bracket_exp_free_return;
3102 if (token->type == OP_NON_MATCH_LIST)
3104 #ifdef RE_ENABLE_I18N
3105 mbcset->non_match = 1;
3106 #endif /* not RE_ENABLE_I18N */
3107 non_match = 1;
3108 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3109 bitset_set (sbcset, '\n');
3110 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3111 token_len = peek_token_bracket (token, regexp, syntax);
3112 if (BE (token->type == END_OF_RE, 0))
3114 *err = REG_BADPAT;
3115 goto parse_bracket_exp_free_return;
3119 /* We treat the first ']' as a normal character. */
3120 if (token->type == OP_CLOSE_BRACKET)
3121 token->type = CHARACTER;
3123 while (1)
3125 bracket_elem_t start_elem, end_elem;
3126 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3127 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3128 reg_errcode_t ret;
3129 int token_len2 = 0, is_range_exp = 0;
3130 re_token_t token2;
3132 start_elem.opr.name = start_name_buf;
3133 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3134 syntax, first_round);
3135 if (BE (ret != REG_NOERROR, 0))
3137 *err = ret;
3138 goto parse_bracket_exp_free_return;
3140 first_round = 0;
3142 /* Get information about the next token. We need it in any case. */
3143 token_len = peek_token_bracket (token, regexp, syntax);
3145 /* Do not check for ranges if we know they are not allowed. */
3146 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3148 if (BE (token->type == END_OF_RE, 0))
3150 *err = REG_EBRACK;
3151 goto parse_bracket_exp_free_return;
3153 if (token->type == OP_CHARSET_RANGE)
3155 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3156 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3157 if (BE (token2.type == END_OF_RE, 0))
3159 *err = REG_EBRACK;
3160 goto parse_bracket_exp_free_return;
3162 if (token2.type == OP_CLOSE_BRACKET)
3164 /* We treat the last '-' as a normal character. */
3165 re_string_skip_bytes (regexp, -token_len);
3166 token->type = CHARACTER;
3168 else
3169 is_range_exp = 1;
3173 if (is_range_exp == 1)
3175 end_elem.opr.name = end_name_buf;
3176 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3177 dfa, syntax, 1);
3178 if (BE (ret != REG_NOERROR, 0))
3180 *err = ret;
3181 goto parse_bracket_exp_free_return;
3184 token_len = peek_token_bracket (token, regexp, syntax);
3186 #ifdef _LIBC
3187 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3188 &start_elem, &end_elem);
3189 #else
3190 # ifdef RE_ENABLE_I18N
3191 *err = build_range_exp (sbcset,
3192 dfa->mb_cur_max > 1 ? mbcset : NULL,
3193 &range_alloc, &start_elem, &end_elem);
3194 # else
3195 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3196 # endif
3197 #endif /* RE_ENABLE_I18N */
3198 if (BE (*err != REG_NOERROR, 0))
3199 goto parse_bracket_exp_free_return;
3201 else
3203 switch (start_elem.type)
3205 case SB_CHAR:
3206 bitset_set (sbcset, start_elem.opr.ch);
3207 break;
3208 #ifdef RE_ENABLE_I18N
3209 case MB_CHAR:
3210 /* Check whether the array has enough space. */
3211 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3213 wchar_t *new_mbchars;
3214 /* Not enough, realloc it. */
3215 /* +1 in case of mbcset->nmbchars is 0. */
3216 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3217 /* Use realloc since array is NULL if *alloc == 0. */
3218 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3219 mbchar_alloc);
3220 if (BE (new_mbchars == NULL, 0))
3221 goto parse_bracket_exp_espace;
3222 mbcset->mbchars = new_mbchars;
3224 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3225 break;
3226 #endif /* RE_ENABLE_I18N */
3227 case EQUIV_CLASS:
3228 *err = build_equiv_class (sbcset,
3229 #ifdef RE_ENABLE_I18N
3230 mbcset, &equiv_class_alloc,
3231 #endif /* RE_ENABLE_I18N */
3232 start_elem.opr.name);
3233 if (BE (*err != REG_NOERROR, 0))
3234 goto parse_bracket_exp_free_return;
3235 break;
3236 case COLL_SYM:
3237 *err = build_collating_symbol (sbcset,
3238 #ifdef RE_ENABLE_I18N
3239 mbcset, &coll_sym_alloc,
3240 #endif /* RE_ENABLE_I18N */
3241 start_elem.opr.name);
3242 if (BE (*err != REG_NOERROR, 0))
3243 goto parse_bracket_exp_free_return;
3244 break;
3245 case CHAR_CLASS:
3246 *err = build_charclass (regexp->trans, sbcset,
3247 #ifdef RE_ENABLE_I18N
3248 mbcset, &char_class_alloc,
3249 #endif /* RE_ENABLE_I18N */
3250 (const char *) start_elem.opr.name, syntax);
3251 if (BE (*err != REG_NOERROR, 0))
3252 goto parse_bracket_exp_free_return;
3253 break;
3254 default:
3255 assert (0);
3256 break;
3259 if (BE (token->type == END_OF_RE, 0))
3261 *err = REG_EBRACK;
3262 goto parse_bracket_exp_free_return;
3264 if (token->type == OP_CLOSE_BRACKET)
3265 break;
3268 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3270 /* If it is non-matching list. */
3271 if (non_match)
3272 bitset_not (sbcset);
3274 #ifdef RE_ENABLE_I18N
3275 /* Ensure only single byte characters are set. */
3276 if (dfa->mb_cur_max > 1)
3277 bitset_mask (sbcset, dfa->sb_char);
3279 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3280 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3281 || mbcset->non_match)))
3283 bin_tree_t *mbc_tree;
3284 int sbc_idx;
3285 /* Build a tree for complex bracket. */
3286 dfa->has_mb_node = 1;
3287 br_token.type = COMPLEX_BRACKET;
3288 br_token.opr.mbcset = mbcset;
3289 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3290 if (BE (mbc_tree == NULL, 0))
3291 goto parse_bracket_exp_espace;
3292 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3293 if (sbcset[sbc_idx])
3294 break;
3295 /* If there are no bits set in sbcset, there is no point
3296 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3297 if (sbc_idx < BITSET_WORDS)
3299 /* Build a tree for simple bracket. */
3300 br_token.type = SIMPLE_BRACKET;
3301 br_token.opr.sbcset = sbcset;
3302 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3303 if (BE (work_tree == NULL, 0))
3304 goto parse_bracket_exp_espace;
3306 /* Then join them by ALT node. */
3307 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3308 if (BE (work_tree == NULL, 0))
3309 goto parse_bracket_exp_espace;
3311 else
3313 re_free (sbcset);
3314 work_tree = mbc_tree;
3317 else
3318 #endif /* not RE_ENABLE_I18N */
3320 #ifdef RE_ENABLE_I18N
3321 free_charset (mbcset);
3322 #endif
3323 /* Build a tree for simple bracket. */
3324 br_token.type = SIMPLE_BRACKET;
3325 br_token.opr.sbcset = sbcset;
3326 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3327 if (BE (work_tree == NULL, 0))
3328 goto parse_bracket_exp_espace;
3330 return work_tree;
3332 parse_bracket_exp_espace:
3333 *err = REG_ESPACE;
3334 parse_bracket_exp_free_return:
3335 re_free (sbcset);
3336 #ifdef RE_ENABLE_I18N
3337 free_charset (mbcset);
3338 #endif /* RE_ENABLE_I18N */
3339 return NULL;
3342 /* Parse an element in the bracket expression. */
3344 static reg_errcode_t
3345 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3346 re_token_t *token, int token_len, re_dfa_t *dfa,
3347 reg_syntax_t syntax, int accept_hyphen)
3349 #ifdef RE_ENABLE_I18N
3350 int cur_char_size;
3351 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3352 if (cur_char_size > 1)
3354 elem->type = MB_CHAR;
3355 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3356 re_string_skip_bytes (regexp, cur_char_size);
3357 return REG_NOERROR;
3359 #endif /* RE_ENABLE_I18N */
3360 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3361 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3362 || token->type == OP_OPEN_EQUIV_CLASS)
3363 return parse_bracket_symbol (elem, regexp, token);
3364 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3366 /* A '-' must only appear as anything but a range indicator before
3367 the closing bracket. Everything else is an error. */
3368 re_token_t token2;
3369 (void) peek_token_bracket (&token2, regexp, syntax);
3370 if (token2.type != OP_CLOSE_BRACKET)
3371 /* The actual error value is not standardized since this whole
3372 case is undefined. But ERANGE makes good sense. */
3373 return REG_ERANGE;
3375 elem->type = SB_CHAR;
3376 elem->opr.ch = token->opr.c;
3377 return REG_NOERROR;
3380 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3381 such as [:<character_class>:], [.<collating_element>.], and
3382 [=<equivalent_class>=]. */
3384 static reg_errcode_t
3385 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3386 re_token_t *token)
3388 unsigned char ch, delim = token->opr.c;
3389 int i = 0;
3390 if (re_string_eoi(regexp))
3391 return REG_EBRACK;
3392 for (;; ++i)
3394 if (i >= BRACKET_NAME_BUF_SIZE)
3395 return REG_EBRACK;
3396 if (token->type == OP_OPEN_CHAR_CLASS)
3397 ch = re_string_fetch_byte_case (regexp);
3398 else
3399 ch = re_string_fetch_byte (regexp);
3400 if (re_string_eoi(regexp))
3401 return REG_EBRACK;
3402 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3403 break;
3404 elem->opr.name[i] = ch;
3406 re_string_skip_bytes (regexp, 1);
3407 elem->opr.name[i] = '\0';
3408 switch (token->type)
3410 case OP_OPEN_COLL_ELEM:
3411 elem->type = COLL_SYM;
3412 break;
3413 case OP_OPEN_EQUIV_CLASS:
3414 elem->type = EQUIV_CLASS;
3415 break;
3416 case OP_OPEN_CHAR_CLASS:
3417 elem->type = CHAR_CLASS;
3418 break;
3419 default:
3420 break;
3422 return REG_NOERROR;
3425 /* Helper function for parse_bracket_exp.
3426 Build the equivalence class which is represented by NAME.
3427 The result are written to MBCSET and SBCSET.
3428 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3429 is a pointer argument since we may update it. */
3431 static reg_errcode_t
3432 #ifdef RE_ENABLE_I18N
3433 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3434 int *equiv_class_alloc, const unsigned char *name)
3435 #else /* not RE_ENABLE_I18N */
3436 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3437 #endif /* not RE_ENABLE_I18N */
3439 #ifdef _LIBC
3440 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3441 if (nrules != 0)
3443 const int32_t *table, *indirect;
3444 const unsigned char *weights, *extra, *cp;
3445 unsigned char char_buf[2];
3446 int32_t idx1, idx2;
3447 unsigned int ch;
3448 size_t len;
3449 /* This #include defines a local function! */
3450 # include <locale/weight.h>
3451 /* Calculate the index for equivalence class. */
3452 cp = name;
3453 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3454 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3455 _NL_COLLATE_WEIGHTMB);
3456 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3457 _NL_COLLATE_EXTRAMB);
3458 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3459 _NL_COLLATE_INDIRECTMB);
3460 idx1 = findidx (&cp);
3461 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3462 /* This isn't a valid character. */
3463 return REG_ECOLLATE;
3465 /* Build single byte matcing table for this equivalence class. */
3466 char_buf[1] = (unsigned char) '\0';
3467 len = weights[idx1 & 0xffffff];
3468 for (ch = 0; ch < SBC_MAX; ++ch)
3470 char_buf[0] = ch;
3471 cp = char_buf;
3472 idx2 = findidx (&cp);
3474 idx2 = table[ch];
3476 if (idx2 == 0)
3477 /* This isn't a valid character. */
3478 continue;
3479 /* Compare only if the length matches and the collation rule
3480 index is the same. */
3481 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3483 int cnt = 0;
3485 while (cnt <= len &&
3486 weights[(idx1 & 0xffffff) + 1 + cnt]
3487 == weights[(idx2 & 0xffffff) + 1 + cnt])
3488 ++cnt;
3490 if (cnt > len)
3491 bitset_set (sbcset, ch);
3494 /* Check whether the array has enough space. */
3495 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3497 /* Not enough, realloc it. */
3498 /* +1 in case of mbcset->nequiv_classes is 0. */
3499 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3500 /* Use realloc since the array is NULL if *alloc == 0. */
3501 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3502 int32_t,
3503 new_equiv_class_alloc);
3504 if (BE (new_equiv_classes == NULL, 0))
3505 return REG_ESPACE;
3506 mbcset->equiv_classes = new_equiv_classes;
3507 *equiv_class_alloc = new_equiv_class_alloc;
3509 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3511 else
3512 #endif /* _LIBC */
3514 if (BE (strlen ((const char *) name) != 1, 0))
3515 return REG_ECOLLATE;
3516 bitset_set (sbcset, *name);
3518 return REG_NOERROR;
3521 /* Helper function for parse_bracket_exp.
3522 Build the character class which is represented by NAME.
3523 The result are written to MBCSET and SBCSET.
3524 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3525 is a pointer argument since we may update it. */
3527 static reg_errcode_t
3528 #ifdef RE_ENABLE_I18N
3529 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3530 re_charset_t *mbcset, int *char_class_alloc,
3531 const char *class_name, reg_syntax_t syntax)
3532 #else /* not RE_ENABLE_I18N */
3533 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3534 const char *class_name, reg_syntax_t syntax)
3535 #endif /* not RE_ENABLE_I18N */
3537 int i;
3539 /* In case of REG_ICASE "upper" and "lower" match the both of
3540 upper and lower cases. */
3541 if ((syntax & RE_ICASE)
3542 && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3543 class_name = "alpha";
3545 #ifdef RE_ENABLE_I18N
3546 /* Check the space of the arrays. */
3547 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3549 /* Not enough, realloc it. */
3550 /* +1 in case of mbcset->nchar_classes is 0. */
3551 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3552 /* Use realloc since array is NULL if *alloc == 0. */
3553 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3554 new_char_class_alloc);
3555 if (BE (new_char_classes == NULL, 0))
3556 return REG_ESPACE;
3557 mbcset->char_classes = new_char_classes;
3558 *char_class_alloc = new_char_class_alloc;
3560 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3561 #endif /* RE_ENABLE_I18N */
3563 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3564 do { \
3565 if (BE (trans != NULL, 0)) \
3567 for (i = 0; i < SBC_MAX; ++i) \
3568 if (ctype_func (i)) \
3569 bitset_set (sbcset, trans[i]); \
3571 else \
3573 for (i = 0; i < SBC_MAX; ++i) \
3574 if (ctype_func (i)) \
3575 bitset_set (sbcset, i); \
3577 } while (0)
3579 if (strcmp (class_name, "alnum") == 0)
3580 BUILD_CHARCLASS_LOOP (isalnum);
3581 else if (strcmp (class_name, "cntrl") == 0)
3582 BUILD_CHARCLASS_LOOP (iscntrl);
3583 else if (strcmp (class_name, "lower") == 0)
3584 BUILD_CHARCLASS_LOOP (islower);
3585 else if (strcmp (class_name, "space") == 0)
3586 BUILD_CHARCLASS_LOOP (isspace);
3587 else if (strcmp (class_name, "alpha") == 0)
3588 BUILD_CHARCLASS_LOOP (isalpha);
3589 else if (strcmp (class_name, "digit") == 0)
3590 BUILD_CHARCLASS_LOOP (isdigit);
3591 else if (strcmp (class_name, "print") == 0)
3592 BUILD_CHARCLASS_LOOP (isprint);
3593 else if (strcmp (class_name, "upper") == 0)
3594 BUILD_CHARCLASS_LOOP (isupper);
3595 else if (strcmp (class_name, "blank") == 0)
3596 #ifndef GAWK
3597 BUILD_CHARCLASS_LOOP (isblank);
3598 #else
3599 /* see comments above */
3600 BUILD_CHARCLASS_LOOP (is_blank);
3601 #endif
3602 else if (strcmp (class_name, "graph") == 0)
3603 BUILD_CHARCLASS_LOOP (isgraph);
3604 else if (strcmp (class_name, "punct") == 0)
3605 BUILD_CHARCLASS_LOOP (ispunct);
3606 else if (strcmp (class_name, "xdigit") == 0)
3607 BUILD_CHARCLASS_LOOP (isxdigit);
3608 else
3609 return REG_ECTYPE;
3611 return REG_NOERROR;
3614 static bin_tree_t *
3615 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3616 const char *class_name,
3617 const char *extra, int non_match,
3618 reg_errcode_t *err)
3620 re_bitset_ptr_t sbcset;
3621 #ifdef RE_ENABLE_I18N
3622 re_charset_t *mbcset;
3623 int alloc = 0;
3624 #endif /* not RE_ENABLE_I18N */
3625 reg_errcode_t ret;
3626 re_token_t br_token;
3627 bin_tree_t *tree;
3629 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3630 #ifdef RE_ENABLE_I18N
3631 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3632 #endif /* RE_ENABLE_I18N */
3634 #ifdef RE_ENABLE_I18N
3635 if (BE (sbcset == NULL || mbcset == NULL, 0))
3636 #else /* not RE_ENABLE_I18N */
3637 if (BE (sbcset == NULL, 0))
3638 #endif /* not RE_ENABLE_I18N */
3640 *err = REG_ESPACE;
3641 return NULL;
3644 if (non_match)
3646 #ifdef RE_ENABLE_I18N
3647 mbcset->non_match = 1;
3648 #endif /* not RE_ENABLE_I18N */
3651 /* We don't care the syntax in this case. */
3652 ret = build_charclass (trans, sbcset,
3653 #ifdef RE_ENABLE_I18N
3654 mbcset, &alloc,
3655 #endif /* RE_ENABLE_I18N */
3656 class_name, 0);
3658 if (BE (ret != REG_NOERROR, 0))
3660 re_free (sbcset);
3661 #ifdef RE_ENABLE_I18N
3662 free_charset (mbcset);
3663 #endif /* RE_ENABLE_I18N */
3664 *err = ret;
3665 return NULL;
3667 /* \w match '_' also. */
3668 for (; *extra; extra++)
3669 bitset_set (sbcset, *extra);
3671 /* If it is non-matching list. */
3672 if (non_match)
3673 bitset_not (sbcset);
3675 #ifdef RE_ENABLE_I18N
3676 /* Ensure only single byte characters are set. */
3677 if (dfa->mb_cur_max > 1)
3678 bitset_mask (sbcset, dfa->sb_char);
3679 #endif
3681 /* Build a tree for simple bracket. */
3682 br_token.type = SIMPLE_BRACKET;
3683 br_token.opr.sbcset = sbcset;
3684 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3685 if (BE (tree == NULL, 0))
3686 goto build_word_op_espace;
3688 #ifdef RE_ENABLE_I18N
3689 if (dfa->mb_cur_max > 1)
3691 bin_tree_t *mbc_tree;
3692 /* Build a tree for complex bracket. */
3693 br_token.type = COMPLEX_BRACKET;
3694 br_token.opr.mbcset = mbcset;
3695 dfa->has_mb_node = 1;
3696 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3697 if (BE (mbc_tree == NULL, 0))
3698 goto build_word_op_espace;
3699 /* Then join them by ALT node. */
3700 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3701 if (BE (mbc_tree != NULL, 1))
3702 return tree;
3704 else
3706 free_charset (mbcset);
3707 return tree;
3709 #else /* not RE_ENABLE_I18N */
3710 return tree;
3711 #endif /* not RE_ENABLE_I18N */
3713 build_word_op_espace:
3714 re_free (sbcset);
3715 #ifdef RE_ENABLE_I18N
3716 free_charset (mbcset);
3717 #endif /* RE_ENABLE_I18N */
3718 *err = REG_ESPACE;
3719 return NULL;
3722 /* This is intended for the expressions like "a{1,3}".
3723 Fetch a number from `input', and return the number.
3724 Return -1, if the number field is empty like "{,1}".
3725 Return -2, if an error has occurred. */
3727 static int
3728 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3730 int num = -1;
3731 unsigned char c;
3732 while (1)
3734 fetch_token (token, input, syntax);
3735 c = token->opr.c;
3736 if (BE (token->type == END_OF_RE, 0))
3737 return -2;
3738 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3739 break;
3740 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3741 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3742 num = (num > RE_DUP_MAX) ? -2 : num;
3744 return num;
3747 #ifdef RE_ENABLE_I18N
3748 static void
3749 free_charset (re_charset_t *cset)
3751 re_free (cset->mbchars);
3752 # ifdef _LIBC
3753 re_free (cset->coll_syms);
3754 re_free (cset->equiv_classes);
3755 re_free (cset->range_starts);
3756 re_free (cset->range_ends);
3757 # endif
3758 re_free (cset->char_classes);
3759 re_free (cset);
3761 #endif /* RE_ENABLE_I18N */
3763 /* Functions for binary tree operation. */
3765 /* Create a tree node. */
3767 static bin_tree_t *
3768 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3769 re_token_type_t type)
3771 re_token_t t;
3772 t.type = type;
3773 return create_token_tree (dfa, left, right, &t);
3776 static bin_tree_t *
3777 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3778 const re_token_t *token)
3780 bin_tree_t *tree;
3781 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3783 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3785 if (storage == NULL)
3786 return NULL;
3787 storage->next = dfa->str_tree_storage;
3788 dfa->str_tree_storage = storage;
3789 dfa->str_tree_storage_idx = 0;
3791 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3793 tree->parent = NULL;
3794 tree->left = left;
3795 tree->right = right;
3796 tree->token = *token;
3797 tree->token.duplicated = 0;
3798 tree->token.opt_subexp = 0;
3799 tree->first = NULL;
3800 tree->next = NULL;
3801 tree->node_idx = -1;
3803 if (left != NULL)
3804 left->parent = tree;
3805 if (right != NULL)
3806 right->parent = tree;
3807 return tree;
3810 /* Mark the tree SRC as an optional subexpression.
3811 To be called from preorder or postorder. */
3813 static reg_errcode_t
3814 mark_opt_subexp (void *extra, bin_tree_t *node)
3816 int idx = (int) (intptr_t) extra;
3817 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3818 node->token.opt_subexp = 1;
3820 return REG_NOERROR;
3823 /* Free the allocated memory inside NODE. */
3825 static void
3826 free_token (re_token_t *node)
3828 #ifdef RE_ENABLE_I18N
3829 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3830 free_charset (node->opr.mbcset);
3831 else
3832 #endif /* RE_ENABLE_I18N */
3833 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3834 re_free (node->opr.sbcset);
3837 /* Worker function for tree walking. Free the allocated memory inside NODE
3838 and its children. */
3840 static reg_errcode_t
3841 free_tree (void *extra, bin_tree_t *node)
3843 free_token (&node->token);
3844 return REG_NOERROR;
3848 /* Duplicate the node SRC, and return new node. This is a preorder
3849 visit similar to the one implemented by the generic visitor, but
3850 we need more infrastructure to maintain two parallel trees --- so,
3851 it's easier to duplicate. */
3853 static bin_tree_t *
3854 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3856 const bin_tree_t *node;
3857 bin_tree_t *dup_root;
3858 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3860 for (node = root; ; )
3862 /* Create a new tree and link it back to the current parent. */
3863 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3864 if (*p_new == NULL)
3865 return NULL;
3866 (*p_new)->parent = dup_node;
3867 (*p_new)->token.duplicated = 1;
3868 dup_node = *p_new;
3870 /* Go to the left node, or up and to the right. */
3871 if (node->left)
3873 node = node->left;
3874 p_new = &dup_node->left;
3876 else
3878 const bin_tree_t *prev = NULL;
3879 while (node->right == prev || node->right == NULL)
3881 prev = node;
3882 node = node->parent;
3883 dup_node = dup_node->parent;
3884 if (!node)
3885 return dup_root;
3887 node = node->right;
3888 p_new = &dup_node->right;