compat/regex: use the regex engine from gawk for compat
[alt-git.git] / compat / regex / regcomp.c
blob5115d7a0eed4862311fe802bf02d424436dd00b2
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2007,2009,2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA. */
21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23 static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27 #ifdef RE_ENABLE_I18N
28 static void free_charset (re_charset_t *cset);
29 #endif /* RE_ENABLE_I18N */
30 static void free_workarea_compile (regex_t *preg);
31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32 #ifdef RE_ENABLE_I18N
33 static void optimize_utf8 (re_dfa_t *dfa);
34 #endif
35 static reg_errcode_t analyze (regex_t *preg);
36 static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
38 void *extra);
39 static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 bin_tree_t *node);
46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
50 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
51 unsigned int constraint);
52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 int node, int root);
55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56 static int fetch_number (re_string_t *input, re_token_t *token,
57 reg_syntax_t syntax);
58 static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
84 reg_syntax_t syntax,
85 int accept_hyphen);
86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 re_string_t *regexp,
88 re_token_t *token);
89 #ifdef RE_ENABLE_I18N
90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 re_charset_t *mbcset,
92 int *equiv_class_alloc,
93 const unsigned char *name);
94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 bitset_t sbcset,
96 re_charset_t *mbcset,
97 int *char_class_alloc,
98 const char *class_name,
99 reg_syntax_t syntax);
100 #else /* not RE_ENABLE_I18N */
101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 bitset_t sbcset,
105 const char *class_name,
106 reg_syntax_t syntax);
107 #endif /* not RE_ENABLE_I18N */
108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const char *class_name,
111 const char *extra,
112 int non_match, reg_errcode_t *err);
113 static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120 static void free_token (re_token_t *node);
121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
124 /* This table gives an error message for each of the error codes listed
125 in regex.h. Obviously the order here has to be same as there.
126 POSIX doesn't require that we do anything for REG_NOERROR,
127 but why not be nice? */
129 const char __re_error_msgid[] attribute_hidden =
131 #define REG_NOERROR_IDX 0
132 gettext_noop ("Success") /* REG_NOERROR */
133 "\0"
134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
135 gettext_noop ("No match") /* REG_NOMATCH */
136 "\0"
137 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
138 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
139 "\0"
140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
141 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
142 "\0"
143 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
144 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
145 "\0"
146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
147 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
148 "\0"
149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
150 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
151 "\0"
152 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
153 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
154 "\0"
155 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
156 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
157 "\0"
158 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
159 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
160 "\0"
161 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
162 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
163 "\0"
164 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
165 gettext_noop ("Invalid range end") /* REG_ERANGE */
166 "\0"
167 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
168 gettext_noop ("Memory exhausted") /* REG_ESPACE */
169 "\0"
170 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
171 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
172 "\0"
173 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
174 gettext_noop ("Premature end of regular expression") /* REG_EEND */
175 "\0"
176 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
177 gettext_noop ("Regular expression too big") /* REG_ESIZE */
178 "\0"
179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
180 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
183 const size_t __re_error_msgid_idx[] attribute_hidden =
185 REG_NOERROR_IDX,
186 REG_NOMATCH_IDX,
187 REG_BADPAT_IDX,
188 REG_ECOLLATE_IDX,
189 REG_ECTYPE_IDX,
190 REG_EESCAPE_IDX,
191 REG_ESUBREG_IDX,
192 REG_EBRACK_IDX,
193 REG_EPAREN_IDX,
194 REG_EBRACE_IDX,
195 REG_BADBR_IDX,
196 REG_ERANGE_IDX,
197 REG_ESPACE_IDX,
198 REG_BADRPT_IDX,
199 REG_EEND_IDX,
200 REG_ESIZE_IDX,
201 REG_ERPAREN_IDX
204 /* Entry points for GNU code. */
207 #ifdef ZOS_USS
209 /* For ZOS USS we must define btowc */
211 wchar_t
212 btowc (int c)
214 wchar_t wtmp[2];
215 char tmp[2];
217 tmp[0] = c;
218 tmp[1] = 0;
220 mbtowc (wtmp, tmp, 1);
221 return wtmp[0];
223 #endif
225 /* re_compile_pattern is the GNU regular expression compiler: it
226 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
227 Returns 0 if the pattern was valid, otherwise an error string.
229 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
230 are set in BUFP on entry. */
232 const char *
233 re_compile_pattern (pattern, length, bufp)
234 const char *pattern;
235 size_t length;
236 struct re_pattern_buffer *bufp;
238 reg_errcode_t ret;
240 /* And GNU code determines whether or not to get register information
241 by passing null for the REGS argument to re_match, etc., not by
242 setting no_sub, unless RE_NO_SUB is set. */
243 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
245 /* Match anchors at newline. */
246 bufp->newline_anchor = 1;
248 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
250 if (!ret)
251 return NULL;
252 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
254 #ifdef _LIBC
255 weak_alias (__re_compile_pattern, re_compile_pattern)
256 #endif
258 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
259 also be assigned to arbitrarily: each pattern buffer stores its own
260 syntax, so it can be changed between regex compilations. */
261 /* This has no initializer because initialized variables in Emacs
262 become read-only after dumping. */
263 reg_syntax_t re_syntax_options;
266 /* Specify the precise syntax of regexps for compilation. This provides
267 for compatibility for various utilities which historically have
268 different, incompatible syntaxes.
270 The argument SYNTAX is a bit mask comprised of the various bits
271 defined in regex.h. We return the old syntax. */
273 reg_syntax_t
274 re_set_syntax (syntax)
275 reg_syntax_t syntax;
277 reg_syntax_t ret = re_syntax_options;
279 re_syntax_options = syntax;
280 return ret;
282 #ifdef _LIBC
283 weak_alias (__re_set_syntax, re_set_syntax)
284 #endif
287 re_compile_fastmap (bufp)
288 struct re_pattern_buffer *bufp;
290 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
291 char *fastmap = bufp->fastmap;
293 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
294 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
295 if (dfa->init_state != dfa->init_state_word)
296 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
297 if (dfa->init_state != dfa->init_state_nl)
298 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
299 if (dfa->init_state != dfa->init_state_begbuf)
300 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
301 bufp->fastmap_accurate = 1;
302 return 0;
304 #ifdef _LIBC
305 weak_alias (__re_compile_fastmap, re_compile_fastmap)
306 #endif
308 static inline void
309 __attribute ((always_inline))
310 re_set_fastmap (char *fastmap, int icase, int ch)
312 fastmap[ch] = 1;
313 if (icase)
314 fastmap[tolower (ch)] = 1;
317 /* Helper function for re_compile_fastmap.
318 Compile fastmap for the initial_state INIT_STATE. */
320 static void
321 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
322 char *fastmap)
324 volatile re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
325 int node_cnt;
326 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
327 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
329 int node = init_state->nodes.elems[node_cnt];
330 re_token_type_t type = dfa->nodes[node].type;
332 if (type == CHARACTER)
334 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
335 #ifdef RE_ENABLE_I18N
336 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
338 unsigned char *buf = re_malloc (unsigned char, dfa->mb_cur_max), *p;
339 wchar_t wc;
340 mbstate_t state;
342 p = buf;
343 *p++ = dfa->nodes[node].opr.c;
344 while (++node < dfa->nodes_len
345 && dfa->nodes[node].type == CHARACTER
346 && dfa->nodes[node].mb_partial)
347 *p++ = dfa->nodes[node].opr.c;
348 memset (&state, '\0', sizeof (state));
349 if (__mbrtowc (&wc, (const char *) buf, p - buf,
350 &state) == p - buf
351 && (__wcrtomb ((char *) buf, towlower (wc), &state)
352 != (size_t) -1))
353 re_set_fastmap (fastmap, 0, buf[0]);
354 re_free (buf);
356 #endif
358 else if (type == SIMPLE_BRACKET)
360 int i, ch;
361 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
363 int j;
364 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
365 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
366 if (w & ((bitset_word_t) 1 << j))
367 re_set_fastmap (fastmap, icase, ch);
370 #ifdef RE_ENABLE_I18N
371 else if (type == COMPLEX_BRACKET)
373 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
374 int i;
376 # ifdef _LIBC
377 /* See if we have to try all bytes which start multiple collation
378 elements.
379 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
380 collation element, and don't catch 'b' since 'b' is
381 the only collation element which starts from 'b' (and
382 it is caught by SIMPLE_BRACKET). */
383 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
384 && (cset->ncoll_syms || cset->nranges))
386 const int32_t *table = (const int32_t *)
387 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
388 for (i = 0; i < SBC_MAX; ++i)
389 if (table[i] < 0)
390 re_set_fastmap (fastmap, icase, i);
392 # endif /* _LIBC */
394 /* See if we have to start the match at all multibyte characters,
395 i.e. where we would not find an invalid sequence. This only
396 applies to multibyte character sets; for single byte character
397 sets, the SIMPLE_BRACKET again suffices. */
398 if (dfa->mb_cur_max > 1
399 && (cset->nchar_classes || cset->non_match || cset->nranges
400 # ifdef _LIBC
401 || cset->nequiv_classes
402 # endif /* _LIBC */
405 unsigned char c = 0;
408 mbstate_t mbs;
409 memset (&mbs, 0, sizeof (mbs));
410 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
411 re_set_fastmap (fastmap, false, (int) c);
413 while (++c != 0);
416 else
418 /* ... Else catch all bytes which can start the mbchars. */
419 for (i = 0; i < cset->nmbchars; ++i)
421 char buf[256];
422 mbstate_t state;
423 memset (&state, '\0', sizeof (state));
424 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
425 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
426 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
428 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
429 != (size_t) -1)
430 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
435 #endif /* RE_ENABLE_I18N */
436 else if (type == OP_PERIOD
437 #ifdef RE_ENABLE_I18N
438 || type == OP_UTF8_PERIOD
439 #endif /* RE_ENABLE_I18N */
440 || type == END_OF_RE)
442 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
443 if (type == END_OF_RE)
444 bufp->can_be_null = 1;
445 return;
450 /* Entry point for POSIX code. */
451 /* regcomp takes a regular expression as a string and compiles it.
453 PREG is a regex_t *. We do not expect any fields to be initialized,
454 since POSIX says we shouldn't. Thus, we set
456 `buffer' to the compiled pattern;
457 `used' to the length of the compiled pattern;
458 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
459 REG_EXTENDED bit in CFLAGS is set; otherwise, to
460 RE_SYNTAX_POSIX_BASIC;
461 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
462 `fastmap' to an allocated space for the fastmap;
463 `fastmap_accurate' to zero;
464 `re_nsub' to the number of subexpressions in PATTERN.
466 PATTERN is the address of the pattern string.
468 CFLAGS is a series of bits which affect compilation.
470 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
471 use POSIX basic syntax.
473 If REG_NEWLINE is set, then . and [^...] don't match newline.
474 Also, regexec will try a match beginning after every newline.
476 If REG_ICASE is set, then we considers upper- and lowercase
477 versions of letters to be equivalent when matching.
479 If REG_NOSUB is set, then when PREG is passed to regexec, that
480 routine will report only success or failure, and nothing about the
481 registers.
483 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
484 the return codes and their meanings.) */
487 regcomp (preg, pattern, cflags)
488 regex_t *__restrict preg;
489 const char *__restrict pattern;
490 int cflags;
492 reg_errcode_t ret;
493 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
494 : RE_SYNTAX_POSIX_BASIC);
496 preg->buffer = NULL;
497 preg->allocated = 0;
498 preg->used = 0;
500 /* Try to allocate space for the fastmap. */
501 preg->fastmap = re_malloc (char, SBC_MAX);
502 if (BE (preg->fastmap == NULL, 0))
503 return REG_ESPACE;
505 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
507 /* If REG_NEWLINE is set, newlines are treated differently. */
508 if (cflags & REG_NEWLINE)
509 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
510 syntax &= ~RE_DOT_NEWLINE;
511 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
512 /* It also changes the matching behavior. */
513 preg->newline_anchor = 1;
515 else
516 preg->newline_anchor = 0;
517 preg->no_sub = !!(cflags & REG_NOSUB);
518 preg->translate = NULL;
520 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
522 /* POSIX doesn't distinguish between an unmatched open-group and an
523 unmatched close-group: both are REG_EPAREN. */
524 if (ret == REG_ERPAREN)
525 ret = REG_EPAREN;
527 /* We have already checked preg->fastmap != NULL. */
528 if (BE (ret == REG_NOERROR, 1))
529 /* Compute the fastmap now, since regexec cannot modify the pattern
530 buffer. This function never fails in this implementation. */
531 (void) re_compile_fastmap (preg);
532 else
534 /* Some error occurred while compiling the expression. */
535 re_free (preg->fastmap);
536 preg->fastmap = NULL;
539 return (int) ret;
541 #ifdef _LIBC
542 weak_alias (__regcomp, regcomp)
543 #endif
545 /* Returns a message corresponding to an error code, ERRCODE, returned
546 from either regcomp or regexec. We don't use PREG here. */
548 size_t
549 regerror (errcode, preg, errbuf, errbuf_size)
550 int errcode;
551 const regex_t *__restrict preg;
552 char *__restrict errbuf;
553 size_t errbuf_size;
555 const char *msg;
556 size_t msg_size;
558 if (BE (errcode < 0
559 || errcode >= (int) (sizeof (__re_error_msgid_idx)
560 / sizeof (__re_error_msgid_idx[0])), 0))
561 /* Only error codes returned by the rest of the code should be passed
562 to this routine. If we are given anything else, or if other regex
563 code generates an invalid error code, then the program has a bug.
564 Dump core so we can fix it. */
565 abort ();
567 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
569 msg_size = strlen (msg) + 1; /* Includes the null. */
571 if (BE (errbuf_size != 0, 1))
573 if (BE (msg_size > errbuf_size, 0))
575 memcpy (errbuf, msg, errbuf_size - 1);
576 errbuf[errbuf_size - 1] = 0;
578 else
579 memcpy (errbuf, msg, msg_size);
582 return msg_size;
584 #ifdef _LIBC
585 weak_alias (__regerror, regerror)
586 #endif
589 #ifdef RE_ENABLE_I18N
590 /* This static array is used for the map to single-byte characters when
591 UTF-8 is used. Otherwise we would allocate memory just to initialize
592 it the same all the time. UTF-8 is the preferred encoding so this is
593 a worthwhile optimization. */
594 #if __GNUC__ >= 3
595 static const bitset_t utf8_sb_map = {
596 /* Set the first 128 bits. */
597 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
599 #else /* ! (__GNUC__ >= 3) */
600 static bitset_t utf8_sb_map;
601 #endif /* __GNUC__ >= 3 */
602 #endif /* RE_ENABLE_I18N */
605 static void
606 free_dfa_content (re_dfa_t *dfa)
608 int i, j;
610 if (dfa->nodes)
611 for (i = 0; i < dfa->nodes_len; ++i)
612 free_token (dfa->nodes + i);
613 re_free (dfa->nexts);
614 for (i = 0; i < dfa->nodes_len; ++i)
616 if (dfa->eclosures != NULL)
617 re_node_set_free (dfa->eclosures + i);
618 if (dfa->inveclosures != NULL)
619 re_node_set_free (dfa->inveclosures + i);
620 if (dfa->edests != NULL)
621 re_node_set_free (dfa->edests + i);
623 re_free (dfa->edests);
624 re_free (dfa->eclosures);
625 re_free (dfa->inveclosures);
626 re_free (dfa->nodes);
628 if (dfa->state_table)
629 for (i = 0; i <= dfa->state_hash_mask; ++i)
631 struct re_state_table_entry *entry = dfa->state_table + i;
632 for (j = 0; j < entry->num; ++j)
634 re_dfastate_t *state = entry->array[j];
635 free_state (state);
637 re_free (entry->array);
639 re_free (dfa->state_table);
640 #ifdef RE_ENABLE_I18N
641 if (dfa->sb_char != utf8_sb_map)
642 re_free (dfa->sb_char);
643 #endif
644 re_free (dfa->subexp_map);
645 #ifdef DEBUG
646 re_free (dfa->re_str);
647 #endif
649 re_free (dfa);
653 /* Free dynamically allocated space used by PREG. */
655 void
656 regfree (preg)
657 regex_t *preg;
659 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
660 if (BE (dfa != NULL, 1))
661 free_dfa_content (dfa);
662 preg->buffer = NULL;
663 preg->allocated = 0;
665 re_free (preg->fastmap);
666 preg->fastmap = NULL;
668 re_free (preg->translate);
669 preg->translate = NULL;
671 #ifdef _LIBC
672 weak_alias (__regfree, regfree)
673 #endif
675 /* Entry points compatible with 4.2 BSD regex library. We don't define
676 them unless specifically requested. */
678 #if defined _REGEX_RE_COMP || defined _LIBC
680 /* BSD has one and only one pattern buffer. */
681 static struct re_pattern_buffer re_comp_buf;
683 char *
684 # ifdef _LIBC
685 /* Make these definitions weak in libc, so POSIX programs can redefine
686 these names if they don't use our functions, and still use
687 regcomp/regexec above without link errors. */
688 weak_function
689 # endif
690 re_comp (s)
691 const char *s;
693 reg_errcode_t ret;
694 char *fastmap;
696 if (!s)
698 if (!re_comp_buf.buffer)
699 return gettext ("No previous regular expression");
700 return 0;
703 if (re_comp_buf.buffer)
705 fastmap = re_comp_buf.fastmap;
706 re_comp_buf.fastmap = NULL;
707 __regfree (&re_comp_buf);
708 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
709 re_comp_buf.fastmap = fastmap;
712 if (re_comp_buf.fastmap == NULL)
714 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
715 if (re_comp_buf.fastmap == NULL)
716 return (char *) gettext (__re_error_msgid
717 + __re_error_msgid_idx[(int) REG_ESPACE]);
720 /* Since `re_exec' always passes NULL for the `regs' argument, we
721 don't need to initialize the pattern buffer fields which affect it. */
723 /* Match anchors at newlines. */
724 re_comp_buf.newline_anchor = 1;
726 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
728 if (!ret)
729 return NULL;
731 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
732 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
735 #ifdef _LIBC
736 libc_freeres_fn (free_mem)
738 __regfree (&re_comp_buf);
740 #endif
742 #endif /* _REGEX_RE_COMP */
744 /* Internal entry point.
745 Compile the regular expression PATTERN, whose length is LENGTH.
746 SYNTAX indicate regular expression's syntax. */
748 static reg_errcode_t
749 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
750 reg_syntax_t syntax)
752 reg_errcode_t err = REG_NOERROR;
753 re_dfa_t *dfa;
754 re_string_t regexp;
756 /* Initialize the pattern buffer. */
757 preg->fastmap_accurate = 0;
758 preg->syntax = syntax;
759 preg->not_bol = preg->not_eol = 0;
760 preg->used = 0;
761 preg->re_nsub = 0;
762 preg->can_be_null = 0;
763 preg->regs_allocated = REGS_UNALLOCATED;
765 /* Initialize the dfa. */
766 dfa = (re_dfa_t *) preg->buffer;
767 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
769 /* If zero allocated, but buffer is non-null, try to realloc
770 enough space. This loses if buffer's address is bogus, but
771 that is the user's responsibility. If ->buffer is NULL this
772 is a simple allocation. */
773 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
774 if (dfa == NULL)
775 return REG_ESPACE;
776 preg->allocated = sizeof (re_dfa_t);
777 preg->buffer = (unsigned char *) dfa;
779 preg->used = sizeof (re_dfa_t);
781 err = init_dfa (dfa, length);
782 if (BE (err != REG_NOERROR, 0))
784 free_dfa_content (dfa);
785 preg->buffer = NULL;
786 preg->allocated = 0;
787 return err;
789 #ifdef DEBUG
790 /* Note: length+1 will not overflow since it is checked in init_dfa. */
791 dfa->re_str = re_malloc (char, length + 1);
792 strncpy (dfa->re_str, pattern, length + 1);
793 #endif
795 __libc_lock_init (dfa->lock);
797 err = re_string_construct (&regexp, pattern, length, preg->translate,
798 syntax & RE_ICASE, dfa);
799 if (BE (err != REG_NOERROR, 0))
801 re_compile_internal_free_return:
802 free_workarea_compile (preg);
803 re_string_destruct (&regexp);
804 free_dfa_content (dfa);
805 preg->buffer = NULL;
806 preg->allocated = 0;
807 return err;
810 /* Parse the regular expression, and build a structure tree. */
811 preg->re_nsub = 0;
812 dfa->str_tree = parse (&regexp, preg, syntax, &err);
813 if (BE (dfa->str_tree == NULL, 0))
814 goto re_compile_internal_free_return;
816 /* Analyze the tree and create the nfa. */
817 err = analyze (preg);
818 if (BE (err != REG_NOERROR, 0))
819 goto re_compile_internal_free_return;
821 #ifdef RE_ENABLE_I18N
822 /* If possible, do searching in single byte encoding to speed things up. */
823 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
824 optimize_utf8 (dfa);
825 #endif
827 /* Then create the initial state of the dfa. */
828 err = create_initial_state (dfa);
830 /* Release work areas. */
831 free_workarea_compile (preg);
832 re_string_destruct (&regexp);
834 if (BE (err != REG_NOERROR, 0))
836 free_dfa_content (dfa);
837 preg->buffer = NULL;
838 preg->allocated = 0;
841 return err;
844 /* Initialize DFA. We use the length of the regular expression PAT_LEN
845 as the initial length of some arrays. */
847 static reg_errcode_t
848 init_dfa (re_dfa_t *dfa, size_t pat_len)
850 unsigned int table_size;
851 #ifndef _LIBC
852 char *codeset_name;
853 #endif
855 memset (dfa, '\0', sizeof (re_dfa_t));
857 /* Force allocation of str_tree_storage the first time. */
858 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
860 /* Avoid overflows. */
861 if (pat_len == SIZE_MAX)
862 return REG_ESPACE;
864 dfa->nodes_alloc = pat_len + 1;
865 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
867 /* table_size = 2 ^ ceil(log pat_len) */
868 for (table_size = 1; ; table_size <<= 1)
869 if (table_size > pat_len)
870 break;
872 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
873 dfa->state_hash_mask = table_size - 1;
875 dfa->mb_cur_max = MB_CUR_MAX;
876 #ifdef _LIBC
877 if (dfa->mb_cur_max == 6
878 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
879 dfa->is_utf8 = 1;
880 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
881 != 0);
882 #else
883 # ifdef HAVE_LANGINFO_CODESET
884 codeset_name = nl_langinfo (CODESET);
885 # else
886 codeset_name = getenv ("LC_ALL");
887 if (codeset_name == NULL || codeset_name[0] == '\0')
888 codeset_name = getenv ("LC_CTYPE");
889 if (codeset_name == NULL || codeset_name[0] == '\0')
890 codeset_name = getenv ("LANG");
891 if (codeset_name == NULL)
892 codeset_name = "";
893 else if (strchr (codeset_name, '.') != NULL)
894 codeset_name = strchr (codeset_name, '.') + 1;
895 # endif
897 /* strcasecmp isn't a standard interface. brute force check */
898 #if 0
899 if (strcasecmp (codeset_name, "UTF-8") == 0
900 || strcasecmp (codeset_name, "UTF8") == 0)
901 dfa->is_utf8 = 1;
902 #else
903 if ( (codeset_name[0] == 'U' || codeset_name[0] == 'u')
904 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
905 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
906 && (codeset_name[3] == '-'
907 ? codeset_name[4] == '8' && codeset_name[5] == '\0'
908 : codeset_name[3] == '8' && codeset_name[4] == '\0'))
909 dfa->is_utf8 = 1;
910 #endif
912 /* We check exhaustively in the loop below if this charset is a
913 superset of ASCII. */
914 dfa->map_notascii = 0;
915 #endif
917 #ifdef RE_ENABLE_I18N
918 if (dfa->mb_cur_max > 1)
920 if (dfa->is_utf8)
922 #if !defined(__GNUC__) || __GNUC__ < 3
923 static short utf8_sb_map_inited = 0;
925 if (! utf8_sb_map_inited)
927 int i;
929 utf8_sb_map_inited = 0;
930 for (i = 0; i <= 0x80 / BITSET_WORD_BITS - 1; i++)
931 utf8_sb_map[i] = BITSET_WORD_MAX;
933 #endif
934 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
936 else
938 int i, j, ch;
940 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
941 if (BE (dfa->sb_char == NULL, 0))
942 return REG_ESPACE;
944 /* Set the bits corresponding to single byte chars. */
945 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
946 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
948 wint_t wch = __btowc (ch);
949 if (wch != WEOF)
950 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
951 # ifndef _LIBC
952 if (isascii (ch) && wch != ch)
953 dfa->map_notascii = 1;
954 # endif
958 #endif
960 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
961 return REG_ESPACE;
962 return REG_NOERROR;
965 /* Initialize WORD_CHAR table, which indicate which character is
966 "word". In this case "word" means that it is the word construction
967 character used by some operators like "\<", "\>", etc. */
969 static void
970 internal_function
971 init_word_char (re_dfa_t *dfa)
973 int i, j, ch;
974 dfa->word_ops_used = 1;
975 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
976 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
977 if (isalnum (ch) || ch == '_')
978 dfa->word_char[i] |= (bitset_word_t) 1 << j;
981 /* Free the work area which are only used while compiling. */
983 static void
984 free_workarea_compile (regex_t *preg)
986 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
987 bin_tree_storage_t *storage, *next;
988 for (storage = dfa->str_tree_storage; storage; storage = next)
990 next = storage->next;
991 re_free (storage);
993 dfa->str_tree_storage = NULL;
994 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
995 dfa->str_tree = NULL;
996 re_free (dfa->org_indices);
997 dfa->org_indices = NULL;
1000 /* Create initial states for all contexts. */
1002 static reg_errcode_t
1003 create_initial_state (re_dfa_t *dfa)
1005 int first, i;
1006 reg_errcode_t err;
1007 re_node_set init_nodes;
1009 /* Initial states have the epsilon closure of the node which is
1010 the first node of the regular expression. */
1011 first = dfa->str_tree->first->node_idx;
1012 dfa->init_node = first;
1013 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1014 if (BE (err != REG_NOERROR, 0))
1015 return err;
1017 /* The back-references which are in initial states can epsilon transit,
1018 since in this case all of the subexpressions can be null.
1019 Then we add epsilon closures of the nodes which are the next nodes of
1020 the back-references. */
1021 if (dfa->nbackref > 0)
1022 for (i = 0; i < init_nodes.nelem; ++i)
1024 int node_idx = init_nodes.elems[i];
1025 re_token_type_t type = dfa->nodes[node_idx].type;
1027 int clexp_idx;
1028 if (type != OP_BACK_REF)
1029 continue;
1030 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1032 re_token_t *clexp_node;
1033 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1034 if (clexp_node->type == OP_CLOSE_SUBEXP
1035 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1036 break;
1038 if (clexp_idx == init_nodes.nelem)
1039 continue;
1041 if (type == OP_BACK_REF)
1043 int dest_idx = dfa->edests[node_idx].elems[0];
1044 if (!re_node_set_contains (&init_nodes, dest_idx))
1046 reg_errcode_t err = re_node_set_merge (&init_nodes,
1047 dfa->eclosures
1048 + dest_idx);
1049 if (err != REG_NOERROR)
1050 return err;
1051 i = 0;
1056 /* It must be the first time to invoke acquire_state. */
1057 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1058 /* We don't check ERR here, since the initial state must not be NULL. */
1059 if (BE (dfa->init_state == NULL, 0))
1060 return err;
1061 if (dfa->init_state->has_constraint)
1063 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1064 CONTEXT_WORD);
1065 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1066 CONTEXT_NEWLINE);
1067 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1068 &init_nodes,
1069 CONTEXT_NEWLINE
1070 | CONTEXT_BEGBUF);
1071 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1072 || dfa->init_state_begbuf == NULL, 0))
1073 return err;
1075 else
1076 dfa->init_state_word = dfa->init_state_nl
1077 = dfa->init_state_begbuf = dfa->init_state;
1079 re_node_set_free (&init_nodes);
1080 return REG_NOERROR;
1083 #ifdef RE_ENABLE_I18N
1084 /* If it is possible to do searching in single byte encoding instead of UTF-8
1085 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1086 DFA nodes where needed. */
1088 static void
1089 optimize_utf8 (re_dfa_t *dfa)
1091 int node, i, mb_chars = 0, has_period = 0;
1093 for (node = 0; node < dfa->nodes_len; ++node)
1094 switch (dfa->nodes[node].type)
1096 case CHARACTER:
1097 if (dfa->nodes[node].opr.c >= 0x80)
1098 mb_chars = 1;
1099 break;
1100 case ANCHOR:
1101 switch (dfa->nodes[node].opr.ctx_type)
1103 case LINE_FIRST:
1104 case LINE_LAST:
1105 case BUF_FIRST:
1106 case BUF_LAST:
1107 break;
1108 default:
1109 /* Word anchors etc. cannot be handled. It's okay to test
1110 opr.ctx_type since constraints (for all DFA nodes) are
1111 created by ORing one or more opr.ctx_type values. */
1112 return;
1114 break;
1115 case OP_PERIOD:
1116 has_period = 1;
1117 break;
1118 case OP_BACK_REF:
1119 case OP_ALT:
1120 case END_OF_RE:
1121 case OP_DUP_ASTERISK:
1122 case OP_OPEN_SUBEXP:
1123 case OP_CLOSE_SUBEXP:
1124 break;
1125 case COMPLEX_BRACKET:
1126 return;
1127 case SIMPLE_BRACKET:
1128 /* Just double check. The non-ASCII range starts at 0x80. */
1129 assert (0x80 % BITSET_WORD_BITS == 0);
1130 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1131 if (dfa->nodes[node].opr.sbcset[i])
1132 return;
1133 break;
1134 default:
1135 abort ();
1138 if (mb_chars || has_period)
1139 for (node = 0; node < dfa->nodes_len; ++node)
1141 if (dfa->nodes[node].type == CHARACTER
1142 && dfa->nodes[node].opr.c >= 0x80)
1143 dfa->nodes[node].mb_partial = 0;
1144 else if (dfa->nodes[node].type == OP_PERIOD)
1145 dfa->nodes[node].type = OP_UTF8_PERIOD;
1148 /* The search can be in single byte locale. */
1149 dfa->mb_cur_max = 1;
1150 dfa->is_utf8 = 0;
1151 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1153 #endif
1155 /* Analyze the structure tree, and calculate "first", "next", "edest",
1156 "eclosure", and "inveclosure". */
1158 static reg_errcode_t
1159 analyze (regex_t *preg)
1161 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1162 reg_errcode_t ret;
1164 /* Allocate arrays. */
1165 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1166 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1167 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1168 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1169 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1170 || dfa->eclosures == NULL, 0))
1171 return REG_ESPACE;
1173 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1174 if (dfa->subexp_map != NULL)
1176 int i;
1177 for (i = 0; i < preg->re_nsub; i++)
1178 dfa->subexp_map[i] = i;
1179 preorder (dfa->str_tree, optimize_subexps, dfa);
1180 for (i = 0; i < preg->re_nsub; i++)
1181 if (dfa->subexp_map[i] != i)
1182 break;
1183 if (i == preg->re_nsub)
1185 free (dfa->subexp_map);
1186 dfa->subexp_map = NULL;
1190 ret = postorder (dfa->str_tree, lower_subexps, preg);
1191 if (BE (ret != REG_NOERROR, 0))
1192 return ret;
1193 ret = postorder (dfa->str_tree, calc_first, dfa);
1194 if (BE (ret != REG_NOERROR, 0))
1195 return ret;
1196 preorder (dfa->str_tree, calc_next, dfa);
1197 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1198 if (BE (ret != REG_NOERROR, 0))
1199 return ret;
1200 ret = calc_eclosure (dfa);
1201 if (BE (ret != REG_NOERROR, 0))
1202 return ret;
1204 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1205 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1206 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1207 || dfa->nbackref)
1209 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1210 if (BE (dfa->inveclosures == NULL, 0))
1211 return REG_ESPACE;
1212 ret = calc_inveclosure (dfa);
1215 return ret;
1218 /* Our parse trees are very unbalanced, so we cannot use a stack to
1219 implement parse tree visits. Instead, we use parent pointers and
1220 some hairy code in these two functions. */
1221 static reg_errcode_t
1222 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1223 void *extra)
1225 bin_tree_t *node, *prev;
1227 for (node = root; ; )
1229 /* Descend down the tree, preferably to the left (or to the right
1230 if that's the only child). */
1231 while (node->left || node->right)
1232 if (node->left)
1233 node = node->left;
1234 else
1235 node = node->right;
1239 reg_errcode_t err = fn (extra, node);
1240 if (BE (err != REG_NOERROR, 0))
1241 return err;
1242 if (node->parent == NULL)
1243 return REG_NOERROR;
1244 prev = node;
1245 node = node->parent;
1247 /* Go up while we have a node that is reached from the right. */
1248 while (node->right == prev || node->right == NULL);
1249 node = node->right;
1253 static reg_errcode_t
1254 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1255 void *extra)
1257 bin_tree_t *node;
1259 for (node = root; ; )
1261 reg_errcode_t err = fn (extra, node);
1262 if (BE (err != REG_NOERROR, 0))
1263 return err;
1265 /* Go to the left node, or up and to the right. */
1266 if (node->left)
1267 node = node->left;
1268 else
1270 bin_tree_t *prev = NULL;
1271 while (node->right == prev || node->right == NULL)
1273 prev = node;
1274 node = node->parent;
1275 if (!node)
1276 return REG_NOERROR;
1278 node = node->right;
1283 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1284 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1285 backreferences as well. Requires a preorder visit. */
1286 static reg_errcode_t
1287 optimize_subexps (void *extra, bin_tree_t *node)
1289 re_dfa_t *dfa = (re_dfa_t *) extra;
1291 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1293 int idx = node->token.opr.idx;
1294 node->token.opr.idx = dfa->subexp_map[idx];
1295 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1298 else if (node->token.type == SUBEXP
1299 && node->left && node->left->token.type == SUBEXP)
1301 int other_idx = node->left->token.opr.idx;
1303 node->left = node->left->left;
1304 if (node->left)
1305 node->left->parent = node;
1307 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1308 if (other_idx < BITSET_WORD_BITS)
1309 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1312 return REG_NOERROR;
1315 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1316 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1317 static reg_errcode_t
1318 lower_subexps (void *extra, bin_tree_t *node)
1320 regex_t *preg = (regex_t *) extra;
1321 reg_errcode_t err = REG_NOERROR;
1323 if (node->left && node->left->token.type == SUBEXP)
1325 node->left = lower_subexp (&err, preg, node->left);
1326 if (node->left)
1327 node->left->parent = node;
1329 if (node->right && node->right->token.type == SUBEXP)
1331 node->right = lower_subexp (&err, preg, node->right);
1332 if (node->right)
1333 node->right->parent = node;
1336 return err;
1339 static bin_tree_t *
1340 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1342 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1343 bin_tree_t *body = node->left;
1344 bin_tree_t *op, *cls, *tree1, *tree;
1346 if (preg->no_sub
1347 /* We do not optimize empty subexpressions, because otherwise we may
1348 have bad CONCAT nodes with NULL children. This is obviously not
1349 very common, so we do not lose much. An example that triggers
1350 this case is the sed "script" /\(\)/x. */
1351 && node->left != NULL
1352 && (node->token.opr.idx >= BITSET_WORD_BITS
1353 || !(dfa->used_bkref_map
1354 & ((bitset_word_t) 1 << node->token.opr.idx))))
1355 return node->left;
1357 /* Convert the SUBEXP node to the concatenation of an
1358 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1359 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1360 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1361 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1362 tree = create_tree (dfa, op, tree1, CONCAT);
1363 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1365 *err = REG_ESPACE;
1366 return NULL;
1369 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1370 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1371 return tree;
1374 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1375 nodes. Requires a postorder visit. */
1376 static reg_errcode_t
1377 calc_first (void *extra, bin_tree_t *node)
1379 re_dfa_t *dfa = (re_dfa_t *) extra;
1380 if (node->token.type == CONCAT)
1382 node->first = node->left->first;
1383 node->node_idx = node->left->node_idx;
1385 else
1387 node->first = node;
1388 node->node_idx = re_dfa_add_node (dfa, node->token);
1389 if (BE (node->node_idx == -1, 0))
1390 return REG_ESPACE;
1391 if (node->token.type == ANCHOR)
1392 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1394 return REG_NOERROR;
1397 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1398 static reg_errcode_t
1399 calc_next (void *extra, bin_tree_t *node)
1401 switch (node->token.type)
1403 case OP_DUP_ASTERISK:
1404 node->left->next = node;
1405 break;
1406 case CONCAT:
1407 node->left->next = node->right->first;
1408 node->right->next = node->next;
1409 break;
1410 default:
1411 if (node->left)
1412 node->left->next = node->next;
1413 if (node->right)
1414 node->right->next = node->next;
1415 break;
1417 return REG_NOERROR;
1420 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1421 static reg_errcode_t
1422 link_nfa_nodes (void *extra, bin_tree_t *node)
1424 re_dfa_t *dfa = (re_dfa_t *) extra;
1425 int idx = node->node_idx;
1426 reg_errcode_t err = REG_NOERROR;
1428 switch (node->token.type)
1430 case CONCAT:
1431 break;
1433 case END_OF_RE:
1434 assert (node->next == NULL);
1435 break;
1437 case OP_DUP_ASTERISK:
1438 case OP_ALT:
1440 int left, right;
1441 dfa->has_plural_match = 1;
1442 if (node->left != NULL)
1443 left = node->left->first->node_idx;
1444 else
1445 left = node->next->node_idx;
1446 if (node->right != NULL)
1447 right = node->right->first->node_idx;
1448 else
1449 right = node->next->node_idx;
1450 assert (left > -1);
1451 assert (right > -1);
1452 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1454 break;
1456 case ANCHOR:
1457 case OP_OPEN_SUBEXP:
1458 case OP_CLOSE_SUBEXP:
1459 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1460 break;
1462 case OP_BACK_REF:
1463 dfa->nexts[idx] = node->next->node_idx;
1464 if (node->token.type == OP_BACK_REF)
1465 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1466 break;
1468 default:
1469 assert (!IS_EPSILON_NODE (node->token.type));
1470 dfa->nexts[idx] = node->next->node_idx;
1471 break;
1474 return err;
1477 /* Duplicate the epsilon closure of the node ROOT_NODE.
1478 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1479 to their own constraint. */
1481 static reg_errcode_t
1482 internal_function
1483 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1484 int root_node, unsigned int init_constraint)
1486 int org_node, clone_node, ret;
1487 unsigned int constraint = init_constraint;
1488 for (org_node = top_org_node, clone_node = top_clone_node;;)
1490 int org_dest, clone_dest;
1491 if (dfa->nodes[org_node].type == OP_BACK_REF)
1493 /* If the back reference epsilon-transit, its destination must
1494 also have the constraint. Then duplicate the epsilon closure
1495 of the destination of the back reference, and store it in
1496 edests of the back reference. */
1497 org_dest = dfa->nexts[org_node];
1498 re_node_set_empty (dfa->edests + clone_node);
1499 clone_dest = duplicate_node (dfa, org_dest, constraint);
1500 if (BE (clone_dest == -1, 0))
1501 return REG_ESPACE;
1502 dfa->nexts[clone_node] = dfa->nexts[org_node];
1503 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1504 if (BE (ret < 0, 0))
1505 return REG_ESPACE;
1507 else if (dfa->edests[org_node].nelem == 0)
1509 /* In case of the node can't epsilon-transit, don't duplicate the
1510 destination and store the original destination as the
1511 destination of the node. */
1512 dfa->nexts[clone_node] = dfa->nexts[org_node];
1513 break;
1515 else if (dfa->edests[org_node].nelem == 1)
1517 /* In case of the node can epsilon-transit, and it has only one
1518 destination. */
1519 org_dest = dfa->edests[org_node].elems[0];
1520 re_node_set_empty (dfa->edests + clone_node);
1521 /* If the node is root_node itself, it means the epsilon clsoure
1522 has a loop. Then tie it to the destination of the root_node. */
1523 if (org_node == root_node && clone_node != org_node)
1525 ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1526 if (BE (ret < 0, 0))
1527 return REG_ESPACE;
1528 break;
1530 /* In case of the node has another constraint, add it. */
1531 constraint |= dfa->nodes[org_node].constraint;
1532 clone_dest = duplicate_node (dfa, org_dest, constraint);
1533 if (BE (clone_dest == -1, 0))
1534 return REG_ESPACE;
1535 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1536 if (BE (ret < 0, 0))
1537 return REG_ESPACE;
1539 else /* dfa->edests[org_node].nelem == 2 */
1541 /* In case of the node can epsilon-transit, and it has two
1542 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1543 org_dest = dfa->edests[org_node].elems[0];
1544 re_node_set_empty (dfa->edests + clone_node);
1545 /* Search for a duplicated node which satisfies the constraint. */
1546 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1547 if (clone_dest == -1)
1549 /* There is no such duplicated node, create a new one. */
1550 reg_errcode_t err;
1551 clone_dest = duplicate_node (dfa, org_dest, constraint);
1552 if (BE (clone_dest == -1, 0))
1553 return REG_ESPACE;
1554 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1555 if (BE (ret < 0, 0))
1556 return REG_ESPACE;
1557 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1558 root_node, constraint);
1559 if (BE (err != REG_NOERROR, 0))
1560 return err;
1562 else
1564 /* There is a duplicated node which satisfies the constraint,
1565 use it to avoid infinite loop. */
1566 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1567 if (BE (ret < 0, 0))
1568 return REG_ESPACE;
1571 org_dest = dfa->edests[org_node].elems[1];
1572 clone_dest = duplicate_node (dfa, org_dest, constraint);
1573 if (BE (clone_dest == -1, 0))
1574 return REG_ESPACE;
1575 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1576 if (BE (ret < 0, 0))
1577 return REG_ESPACE;
1579 org_node = org_dest;
1580 clone_node = clone_dest;
1582 return REG_NOERROR;
1585 /* Search for a node which is duplicated from the node ORG_NODE, and
1586 satisfies the constraint CONSTRAINT. */
1588 static int
1589 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1590 unsigned int constraint)
1592 int idx;
1593 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1595 if (org_node == dfa->org_indices[idx]
1596 && constraint == dfa->nodes[idx].constraint)
1597 return idx; /* Found. */
1599 return -1; /* Not found. */
1602 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1603 Return the index of the new node, or -1 if insufficient storage is
1604 available. */
1606 static int
1607 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1609 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1610 if (BE (dup_idx != -1, 1))
1612 dfa->nodes[dup_idx].constraint = constraint;
1613 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1614 dfa->nodes[dup_idx].duplicated = 1;
1616 /* Store the index of the original node. */
1617 dfa->org_indices[dup_idx] = org_idx;
1619 return dup_idx;
1622 static reg_errcode_t
1623 calc_inveclosure (re_dfa_t *dfa)
1625 int src, idx, ret;
1626 for (idx = 0; idx < dfa->nodes_len; ++idx)
1627 re_node_set_init_empty (dfa->inveclosures + idx);
1629 for (src = 0; src < dfa->nodes_len; ++src)
1631 int *elems = dfa->eclosures[src].elems;
1632 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1634 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1635 if (BE (ret == -1, 0))
1636 return REG_ESPACE;
1640 return REG_NOERROR;
1643 /* Calculate "eclosure" for all the node in DFA. */
1645 static reg_errcode_t
1646 calc_eclosure (re_dfa_t *dfa)
1648 int node_idx, incomplete;
1649 #ifdef DEBUG
1650 assert (dfa->nodes_len > 0);
1651 #endif
1652 incomplete = 0;
1653 /* For each nodes, calculate epsilon closure. */
1654 for (node_idx = 0; ; ++node_idx)
1656 reg_errcode_t err;
1657 re_node_set eclosure_elem;
1658 if (node_idx == dfa->nodes_len)
1660 if (!incomplete)
1661 break;
1662 incomplete = 0;
1663 node_idx = 0;
1666 #ifdef DEBUG
1667 assert (dfa->eclosures[node_idx].nelem != -1);
1668 #endif
1670 /* If we have already calculated, skip it. */
1671 if (dfa->eclosures[node_idx].nelem != 0)
1672 continue;
1673 /* Calculate epsilon closure of `node_idx'. */
1674 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1675 if (BE (err != REG_NOERROR, 0))
1676 return err;
1678 if (dfa->eclosures[node_idx].nelem == 0)
1680 incomplete = 1;
1681 re_node_set_free (&eclosure_elem);
1684 return REG_NOERROR;
1687 /* Calculate epsilon closure of NODE. */
1689 static reg_errcode_t
1690 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1692 reg_errcode_t err;
1693 int i;
1694 re_node_set eclosure;
1695 int ret;
1696 int incomplete = 0;
1697 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1698 if (BE (err != REG_NOERROR, 0))
1699 return err;
1701 /* This indicates that we are calculating this node now.
1702 We reference this value to avoid infinite loop. */
1703 dfa->eclosures[node].nelem = -1;
1705 /* If the current node has constraints, duplicate all nodes
1706 since they must inherit the constraints. */
1707 if (dfa->nodes[node].constraint
1708 && dfa->edests[node].nelem
1709 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1711 err = duplicate_node_closure (dfa, node, node, node,
1712 dfa->nodes[node].constraint);
1713 if (BE (err != REG_NOERROR, 0))
1714 return err;
1717 /* Expand each epsilon destination nodes. */
1718 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1719 for (i = 0; i < dfa->edests[node].nelem; ++i)
1721 re_node_set eclosure_elem;
1722 int edest = dfa->edests[node].elems[i];
1723 /* If calculating the epsilon closure of `edest' is in progress,
1724 return intermediate result. */
1725 if (dfa->eclosures[edest].nelem == -1)
1727 incomplete = 1;
1728 continue;
1730 /* If we haven't calculated the epsilon closure of `edest' yet,
1731 calculate now. Otherwise use calculated epsilon closure. */
1732 if (dfa->eclosures[edest].nelem == 0)
1734 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1735 if (BE (err != REG_NOERROR, 0))
1736 return err;
1738 else
1739 eclosure_elem = dfa->eclosures[edest];
1740 /* Merge the epsilon closure of `edest'. */
1741 err = re_node_set_merge (&eclosure, &eclosure_elem);
1742 if (BE (err != REG_NOERROR, 0))
1743 return err;
1744 /* If the epsilon closure of `edest' is incomplete,
1745 the epsilon closure of this node is also incomplete. */
1746 if (dfa->eclosures[edest].nelem == 0)
1748 incomplete = 1;
1749 re_node_set_free (&eclosure_elem);
1753 /* An epsilon closure includes itself. */
1754 ret = re_node_set_insert (&eclosure, node);
1755 if (BE (ret < 0, 0))
1756 return REG_ESPACE;
1757 if (incomplete && !root)
1758 dfa->eclosures[node].nelem = 0;
1759 else
1760 dfa->eclosures[node] = eclosure;
1761 *new_set = eclosure;
1762 return REG_NOERROR;
1765 /* Functions for token which are used in the parser. */
1767 /* Fetch a token from INPUT.
1768 We must not use this function inside bracket expressions. */
1770 static void
1771 internal_function
1772 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1774 re_string_skip_bytes (input, peek_token (result, input, syntax));
1777 /* Peek a token from INPUT, and return the length of the token.
1778 We must not use this function inside bracket expressions. */
1780 static int
1781 internal_function
1782 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1784 unsigned char c;
1786 if (re_string_eoi (input))
1788 token->type = END_OF_RE;
1789 return 0;
1792 c = re_string_peek_byte (input, 0);
1793 token->opr.c = c;
1795 token->word_char = 0;
1796 #ifdef RE_ENABLE_I18N
1797 token->mb_partial = 0;
1798 if (input->mb_cur_max > 1 &&
1799 !re_string_first_byte (input, re_string_cur_idx (input)))
1801 token->type = CHARACTER;
1802 token->mb_partial = 1;
1803 return 1;
1805 #endif
1806 if (c == '\\')
1808 unsigned char c2;
1809 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1811 token->type = BACK_SLASH;
1812 return 1;
1815 c2 = re_string_peek_byte_case (input, 1);
1816 token->opr.c = c2;
1817 token->type = CHARACTER;
1818 #ifdef RE_ENABLE_I18N
1819 if (input->mb_cur_max > 1)
1821 wint_t wc = re_string_wchar_at (input,
1822 re_string_cur_idx (input) + 1);
1823 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1825 else
1826 #endif
1827 token->word_char = IS_WORD_CHAR (c2) != 0;
1829 switch (c2)
1831 case '|':
1832 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1833 token->type = OP_ALT;
1834 break;
1835 case '1': case '2': case '3': case '4': case '5':
1836 case '6': case '7': case '8': case '9':
1837 if (!(syntax & RE_NO_BK_REFS))
1839 token->type = OP_BACK_REF;
1840 token->opr.idx = c2 - '1';
1842 break;
1843 case '<':
1844 if (!(syntax & RE_NO_GNU_OPS))
1846 token->type = ANCHOR;
1847 token->opr.ctx_type = WORD_FIRST;
1849 break;
1850 case '>':
1851 if (!(syntax & RE_NO_GNU_OPS))
1853 token->type = ANCHOR;
1854 token->opr.ctx_type = WORD_LAST;
1856 break;
1857 case 'b':
1858 if (!(syntax & RE_NO_GNU_OPS))
1860 token->type = ANCHOR;
1861 token->opr.ctx_type = WORD_DELIM;
1863 break;
1864 case 'B':
1865 if (!(syntax & RE_NO_GNU_OPS))
1867 token->type = ANCHOR;
1868 token->opr.ctx_type = NOT_WORD_DELIM;
1870 break;
1871 case 'w':
1872 if (!(syntax & RE_NO_GNU_OPS))
1873 token->type = OP_WORD;
1874 break;
1875 case 'W':
1876 if (!(syntax & RE_NO_GNU_OPS))
1877 token->type = OP_NOTWORD;
1878 break;
1879 case 's':
1880 if (!(syntax & RE_NO_GNU_OPS))
1881 token->type = OP_SPACE;
1882 break;
1883 case 'S':
1884 if (!(syntax & RE_NO_GNU_OPS))
1885 token->type = OP_NOTSPACE;
1886 break;
1887 case '`':
1888 if (!(syntax & RE_NO_GNU_OPS))
1890 token->type = ANCHOR;
1891 token->opr.ctx_type = BUF_FIRST;
1893 break;
1894 case '\'':
1895 if (!(syntax & RE_NO_GNU_OPS))
1897 token->type = ANCHOR;
1898 token->opr.ctx_type = BUF_LAST;
1900 break;
1901 case '(':
1902 if (!(syntax & RE_NO_BK_PARENS))
1903 token->type = OP_OPEN_SUBEXP;
1904 break;
1905 case ')':
1906 if (!(syntax & RE_NO_BK_PARENS))
1907 token->type = OP_CLOSE_SUBEXP;
1908 break;
1909 case '+':
1910 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1911 token->type = OP_DUP_PLUS;
1912 break;
1913 case '?':
1914 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1915 token->type = OP_DUP_QUESTION;
1916 break;
1917 case '{':
1918 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1919 token->type = OP_OPEN_DUP_NUM;
1920 break;
1921 case '}':
1922 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1923 token->type = OP_CLOSE_DUP_NUM;
1924 break;
1925 default:
1926 break;
1928 return 2;
1931 token->type = CHARACTER;
1932 #ifdef RE_ENABLE_I18N
1933 if (input->mb_cur_max > 1)
1935 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1936 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1938 else
1939 #endif
1940 token->word_char = IS_WORD_CHAR (token->opr.c);
1942 switch (c)
1944 case '\n':
1945 if (syntax & RE_NEWLINE_ALT)
1946 token->type = OP_ALT;
1947 break;
1948 case '|':
1949 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1950 token->type = OP_ALT;
1951 break;
1952 case '*':
1953 token->type = OP_DUP_ASTERISK;
1954 break;
1955 case '+':
1956 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1957 token->type = OP_DUP_PLUS;
1958 break;
1959 case '?':
1960 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1961 token->type = OP_DUP_QUESTION;
1962 break;
1963 case '{':
1964 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1965 token->type = OP_OPEN_DUP_NUM;
1966 break;
1967 case '}':
1968 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1969 token->type = OP_CLOSE_DUP_NUM;
1970 break;
1971 case '(':
1972 if (syntax & RE_NO_BK_PARENS)
1973 token->type = OP_OPEN_SUBEXP;
1974 break;
1975 case ')':
1976 if (syntax & RE_NO_BK_PARENS)
1977 token->type = OP_CLOSE_SUBEXP;
1978 break;
1979 case '[':
1980 token->type = OP_OPEN_BRACKET;
1981 break;
1982 case '.':
1983 token->type = OP_PERIOD;
1984 break;
1985 case '^':
1986 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1987 re_string_cur_idx (input) != 0)
1989 char prev = re_string_peek_byte (input, -1);
1990 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1991 break;
1993 token->type = ANCHOR;
1994 token->opr.ctx_type = LINE_FIRST;
1995 break;
1996 case '$':
1997 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1998 re_string_cur_idx (input) + 1 != re_string_length (input))
2000 re_token_t next;
2001 re_string_skip_bytes (input, 1);
2002 peek_token (&next, input, syntax);
2003 re_string_skip_bytes (input, -1);
2004 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2005 break;
2007 token->type = ANCHOR;
2008 token->opr.ctx_type = LINE_LAST;
2009 break;
2010 default:
2011 break;
2013 return 1;
2016 /* Peek a token from INPUT, and return the length of the token.
2017 We must not use this function out of bracket expressions. */
2019 static int
2020 internal_function
2021 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2023 unsigned char c;
2024 if (re_string_eoi (input))
2026 token->type = END_OF_RE;
2027 return 0;
2029 c = re_string_peek_byte (input, 0);
2030 token->opr.c = c;
2032 #ifdef RE_ENABLE_I18N
2033 if (input->mb_cur_max > 1 &&
2034 !re_string_first_byte (input, re_string_cur_idx (input)))
2036 token->type = CHARACTER;
2037 return 1;
2039 #endif /* RE_ENABLE_I18N */
2041 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2042 && re_string_cur_idx (input) + 1 < re_string_length (input))
2044 /* In this case, '\' escape a character. */
2045 unsigned char c2;
2046 re_string_skip_bytes (input, 1);
2047 c2 = re_string_peek_byte (input, 0);
2048 token->opr.c = c2;
2049 token->type = CHARACTER;
2050 return 1;
2052 if (c == '[') /* '[' is a special char in a bracket exps. */
2054 unsigned char c2;
2055 int token_len;
2056 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2057 c2 = re_string_peek_byte (input, 1);
2058 else
2059 c2 = 0;
2060 token->opr.c = c2;
2061 token_len = 2;
2062 switch (c2)
2064 case '.':
2065 token->type = OP_OPEN_COLL_ELEM;
2066 break;
2067 case '=':
2068 token->type = OP_OPEN_EQUIV_CLASS;
2069 break;
2070 case ':':
2071 if (syntax & RE_CHAR_CLASSES)
2073 token->type = OP_OPEN_CHAR_CLASS;
2074 break;
2076 /* else fall through. */
2077 default:
2078 token->type = CHARACTER;
2079 token->opr.c = c;
2080 token_len = 1;
2081 break;
2083 return token_len;
2085 switch (c)
2087 case '-':
2088 token->type = OP_CHARSET_RANGE;
2089 break;
2090 case ']':
2091 token->type = OP_CLOSE_BRACKET;
2092 break;
2093 case '^':
2094 token->type = OP_NON_MATCH_LIST;
2095 break;
2096 default:
2097 token->type = CHARACTER;
2099 return 1;
2102 /* Functions for parser. */
2104 /* Entry point of the parser.
2105 Parse the regular expression REGEXP and return the structure tree.
2106 If an error is occured, ERR is set by error code, and return NULL.
2107 This function build the following tree, from regular expression <reg_exp>:
2111 <reg_exp> EOR
2113 CAT means concatenation.
2114 EOR means end of regular expression. */
2116 static bin_tree_t *
2117 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2118 reg_errcode_t *err)
2120 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2121 bin_tree_t *tree, *eor, *root;
2122 re_token_t current_token;
2123 dfa->syntax = syntax;
2124 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2125 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2126 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2127 return NULL;
2128 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2129 if (tree != NULL)
2130 root = create_tree (dfa, tree, eor, CONCAT);
2131 else
2132 root = eor;
2133 if (BE (eor == NULL || root == NULL, 0))
2135 *err = REG_ESPACE;
2136 return NULL;
2138 return root;
2141 /* This function build the following tree, from regular expression
2142 <branch1>|<branch2>:
2146 <branch1> <branch2>
2148 ALT means alternative, which represents the operator `|'. */
2150 static bin_tree_t *
2151 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2152 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2154 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2155 bin_tree_t *tree, *branch = NULL;
2156 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2157 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2158 return NULL;
2160 while (token->type == OP_ALT)
2162 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2163 if (token->type != OP_ALT && token->type != END_OF_RE
2164 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2166 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2167 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2168 return NULL;
2170 else
2171 branch = NULL;
2172 tree = create_tree (dfa, tree, branch, OP_ALT);
2173 if (BE (tree == NULL, 0))
2175 *err = REG_ESPACE;
2176 return NULL;
2179 return tree;
2182 /* This function build the following tree, from regular expression
2183 <exp1><exp2>:
2187 <exp1> <exp2>
2189 CAT means concatenation. */
2191 static bin_tree_t *
2192 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2193 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2195 bin_tree_t *tree, *exp;
2196 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2197 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2198 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2199 return NULL;
2201 while (token->type != OP_ALT && token->type != END_OF_RE
2202 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2204 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2205 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2207 return NULL;
2209 if (tree != NULL && exp != NULL)
2211 tree = create_tree (dfa, tree, exp, CONCAT);
2212 if (tree == NULL)
2214 *err = REG_ESPACE;
2215 return NULL;
2218 else if (tree == NULL)
2219 tree = exp;
2220 /* Otherwise exp == NULL, we don't need to create new tree. */
2222 return tree;
2225 /* This function build the following tree, from regular expression a*:
2231 static bin_tree_t *
2232 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2233 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2235 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2236 bin_tree_t *tree;
2237 switch (token->type)
2239 case CHARACTER:
2240 tree = create_token_tree (dfa, NULL, NULL, token);
2241 if (BE (tree == NULL, 0))
2243 *err = REG_ESPACE;
2244 return NULL;
2246 #ifdef RE_ENABLE_I18N
2247 if (dfa->mb_cur_max > 1)
2249 while (!re_string_eoi (regexp)
2250 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2252 bin_tree_t *mbc_remain;
2253 fetch_token (token, regexp, syntax);
2254 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2255 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2256 if (BE (mbc_remain == NULL || tree == NULL, 0))
2258 *err = REG_ESPACE;
2259 return NULL;
2263 #endif
2264 break;
2265 case OP_OPEN_SUBEXP:
2266 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2267 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2268 return NULL;
2269 break;
2270 case OP_OPEN_BRACKET:
2271 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2272 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2273 return NULL;
2274 break;
2275 case OP_BACK_REF:
2276 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2278 *err = REG_ESUBREG;
2279 return NULL;
2281 dfa->used_bkref_map |= 1 << token->opr.idx;
2282 tree = create_token_tree (dfa, NULL, NULL, token);
2283 if (BE (tree == NULL, 0))
2285 *err = REG_ESPACE;
2286 return NULL;
2288 ++dfa->nbackref;
2289 dfa->has_mb_node = 1;
2290 break;
2291 case OP_OPEN_DUP_NUM:
2292 if (syntax & RE_CONTEXT_INVALID_DUP)
2294 *err = REG_BADRPT;
2295 return NULL;
2297 /* FALLTHROUGH */
2298 case OP_DUP_ASTERISK:
2299 case OP_DUP_PLUS:
2300 case OP_DUP_QUESTION:
2301 if (syntax & RE_CONTEXT_INVALID_OPS)
2303 *err = REG_BADRPT;
2304 return NULL;
2306 else if (syntax & RE_CONTEXT_INDEP_OPS)
2308 fetch_token (token, regexp, syntax);
2309 return parse_expression (regexp, preg, token, syntax, nest, err);
2311 /* else fall through */
2312 case OP_CLOSE_SUBEXP:
2313 if ((token->type == OP_CLOSE_SUBEXP) &&
2314 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2316 *err = REG_ERPAREN;
2317 return NULL;
2319 /* else fall through */
2320 case OP_CLOSE_DUP_NUM:
2321 /* We treat it as a normal character. */
2323 /* Then we can these characters as normal characters. */
2324 token->type = CHARACTER;
2325 /* mb_partial and word_char bits should be initialized already
2326 by peek_token. */
2327 tree = create_token_tree (dfa, NULL, NULL, token);
2328 if (BE (tree == NULL, 0))
2330 *err = REG_ESPACE;
2331 return NULL;
2333 break;
2334 case ANCHOR:
2335 if ((token->opr.ctx_type
2336 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2337 && dfa->word_ops_used == 0)
2338 init_word_char (dfa);
2339 if (token->opr.ctx_type == WORD_DELIM
2340 || token->opr.ctx_type == NOT_WORD_DELIM)
2342 bin_tree_t *tree_first, *tree_last;
2343 if (token->opr.ctx_type == WORD_DELIM)
2345 token->opr.ctx_type = WORD_FIRST;
2346 tree_first = create_token_tree (dfa, NULL, NULL, token);
2347 token->opr.ctx_type = WORD_LAST;
2349 else
2351 token->opr.ctx_type = INSIDE_WORD;
2352 tree_first = create_token_tree (dfa, NULL, NULL, token);
2353 token->opr.ctx_type = INSIDE_NOTWORD;
2355 tree_last = create_token_tree (dfa, NULL, NULL, token);
2356 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2357 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2359 *err = REG_ESPACE;
2360 return NULL;
2363 else
2365 tree = create_token_tree (dfa, NULL, NULL, token);
2366 if (BE (tree == NULL, 0))
2368 *err = REG_ESPACE;
2369 return NULL;
2372 /* We must return here, since ANCHORs can't be followed
2373 by repetition operators.
2374 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2375 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2376 fetch_token (token, regexp, syntax);
2377 return tree;
2378 case OP_PERIOD:
2379 tree = create_token_tree (dfa, NULL, NULL, token);
2380 if (BE (tree == NULL, 0))
2382 *err = REG_ESPACE;
2383 return NULL;
2385 if (dfa->mb_cur_max > 1)
2386 dfa->has_mb_node = 1;
2387 break;
2388 case OP_WORD:
2389 case OP_NOTWORD:
2390 tree = build_charclass_op (dfa, regexp->trans,
2391 "alnum",
2392 "_",
2393 token->type == OP_NOTWORD, err);
2394 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2395 return NULL;
2396 break;
2397 case OP_SPACE:
2398 case OP_NOTSPACE:
2399 tree = build_charclass_op (dfa, regexp->trans,
2400 "space",
2402 token->type == OP_NOTSPACE, err);
2403 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2404 return NULL;
2405 break;
2406 case OP_ALT:
2407 case END_OF_RE:
2408 return NULL;
2409 case BACK_SLASH:
2410 *err = REG_EESCAPE;
2411 return NULL;
2412 default:
2413 /* Must not happen? */
2414 #ifdef DEBUG
2415 assert (0);
2416 #endif
2417 return NULL;
2419 fetch_token (token, regexp, syntax);
2421 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2422 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2424 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2425 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2426 return NULL;
2427 /* In BRE consecutive duplications are not allowed. */
2428 if ((syntax & RE_CONTEXT_INVALID_DUP)
2429 && (token->type == OP_DUP_ASTERISK
2430 || token->type == OP_OPEN_DUP_NUM))
2432 *err = REG_BADRPT;
2433 return NULL;
2437 return tree;
2440 /* This function build the following tree, from regular expression
2441 (<reg_exp>):
2442 SUBEXP
2444 <reg_exp>
2447 static bin_tree_t *
2448 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2449 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2451 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2452 bin_tree_t *tree;
2453 size_t cur_nsub;
2454 cur_nsub = preg->re_nsub++;
2456 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2458 /* The subexpression may be a null string. */
2459 if (token->type == OP_CLOSE_SUBEXP)
2460 tree = NULL;
2461 else
2463 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2464 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2465 *err = REG_EPAREN;
2466 if (BE (*err != REG_NOERROR, 0))
2467 return NULL;
2470 if (cur_nsub <= '9' - '1')
2471 dfa->completed_bkref_map |= 1 << cur_nsub;
2473 tree = create_tree (dfa, tree, NULL, SUBEXP);
2474 if (BE (tree == NULL, 0))
2476 *err = REG_ESPACE;
2477 return NULL;
2479 tree->token.opr.idx = cur_nsub;
2480 return tree;
2483 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2485 static bin_tree_t *
2486 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2487 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2489 bin_tree_t *tree = NULL, *old_tree = NULL;
2490 int i, start, end, start_idx = re_string_cur_idx (regexp);
2491 #ifndef RE_TOKEN_INIT_BUG
2492 re_token_t start_token = *token;
2493 #else
2494 re_token_t start_token;
2496 memcpy ((void *) &start_token, (void *) token, sizeof start_token);
2497 #endif
2499 if (token->type == OP_OPEN_DUP_NUM)
2501 end = 0;
2502 start = fetch_number (regexp, token, syntax);
2503 if (start == -1)
2505 if (token->type == CHARACTER && token->opr.c == ',')
2506 start = 0; /* We treat "{,m}" as "{0,m}". */
2507 else
2509 *err = REG_BADBR; /* <re>{} is invalid. */
2510 return NULL;
2513 if (BE (start != -2, 1))
2515 /* We treat "{n}" as "{n,n}". */
2516 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2517 : ((token->type == CHARACTER && token->opr.c == ',')
2518 ? fetch_number (regexp, token, syntax) : -2));
2520 if (BE (start == -2 || end == -2, 0))
2522 /* Invalid sequence. */
2523 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2525 if (token->type == END_OF_RE)
2526 *err = REG_EBRACE;
2527 else
2528 *err = REG_BADBR;
2530 return NULL;
2533 /* If the syntax bit is set, rollback. */
2534 re_string_set_index (regexp, start_idx);
2535 *token = start_token;
2536 token->type = CHARACTER;
2537 /* mb_partial and word_char bits should be already initialized by
2538 peek_token. */
2539 return elem;
2542 if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2544 /* First number greater than second. */
2545 *err = REG_BADBR;
2546 return NULL;
2549 else
2551 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2552 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2555 fetch_token (token, regexp, syntax);
2557 if (BE (elem == NULL, 0))
2558 return NULL;
2559 if (BE (start == 0 && end == 0, 0))
2561 postorder (elem, free_tree, NULL);
2562 return NULL;
2565 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2566 if (BE (start > 0, 0))
2568 tree = elem;
2569 for (i = 2; i <= start; ++i)
2571 elem = duplicate_tree (elem, dfa);
2572 tree = create_tree (dfa, tree, elem, CONCAT);
2573 if (BE (elem == NULL || tree == NULL, 0))
2574 goto parse_dup_op_espace;
2577 if (start == end)
2578 return tree;
2580 /* Duplicate ELEM before it is marked optional. */
2581 elem = duplicate_tree (elem, dfa);
2582 old_tree = tree;
2584 else
2585 old_tree = NULL;
2587 if (elem->token.type == SUBEXP)
2588 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2590 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2591 if (BE (tree == NULL, 0))
2592 goto parse_dup_op_espace;
2594 /* This loop is actually executed only when end != -1,
2595 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2596 already created the start+1-th copy. */
2597 for (i = start + 2; i <= end; ++i)
2599 elem = duplicate_tree (elem, dfa);
2600 tree = create_tree (dfa, tree, elem, CONCAT);
2601 if (BE (elem == NULL || tree == NULL, 0))
2602 goto parse_dup_op_espace;
2604 tree = create_tree (dfa, tree, NULL, OP_ALT);
2605 if (BE (tree == NULL, 0))
2606 goto parse_dup_op_espace;
2609 if (old_tree)
2610 tree = create_tree (dfa, old_tree, tree, CONCAT);
2612 return tree;
2614 parse_dup_op_espace:
2615 *err = REG_ESPACE;
2616 return NULL;
2619 /* Size of the names for collating symbol/equivalence_class/character_class.
2620 I'm not sure, but maybe enough. */
2621 #define BRACKET_NAME_BUF_SIZE 32
2623 #ifndef _LIBC
2624 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2625 Build the range expression which starts from START_ELEM, and ends
2626 at END_ELEM. The result are written to MBCSET and SBCSET.
2627 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2628 mbcset->range_ends, is a pointer argument sinse we may
2629 update it. */
2631 static reg_errcode_t
2632 internal_function
2633 # ifdef RE_ENABLE_I18N
2634 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2635 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2636 # else /* not RE_ENABLE_I18N */
2637 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2638 bracket_elem_t *end_elem)
2639 # endif /* not RE_ENABLE_I18N */
2641 unsigned int start_ch, end_ch;
2642 /* Equivalence Classes and Character Classes can't be a range start/end. */
2643 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2644 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2646 return REG_ERANGE;
2648 /* We can handle no multi character collating elements without libc
2649 support. */
2650 if (BE ((start_elem->type == COLL_SYM
2651 && strlen ((char *) start_elem->opr.name) > 1)
2652 || (end_elem->type == COLL_SYM
2653 && strlen ((char *) end_elem->opr.name) > 1), 0))
2654 return REG_ECOLLATE;
2656 # ifdef RE_ENABLE_I18N
2658 wchar_t wc;
2659 wint_t start_wc;
2660 wint_t end_wc;
2661 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2663 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2664 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2665 : 0));
2666 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2667 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2668 : 0));
2669 #ifdef GAWK
2671 * Fedora Core 2, maybe others, have broken `btowc' that returns -1
2672 * for any value > 127. Sigh. Note that `start_ch' and `end_ch' are
2673 * unsigned, so we don't have sign extension problems.
2675 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2676 ? start_ch : start_elem->opr.wch);
2677 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2678 ? end_ch : end_elem->opr.wch);
2679 #else
2680 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2681 ? __btowc (start_ch) : start_elem->opr.wch);
2682 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2683 ? __btowc (end_ch) : end_elem->opr.wch);
2684 #endif
2685 if (start_wc == WEOF || end_wc == WEOF)
2686 return REG_ECOLLATE;
2687 cmp_buf[0] = start_wc;
2688 cmp_buf[4] = end_wc;
2689 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2690 return REG_ERANGE;
2692 /* Got valid collation sequence values, add them as a new entry.
2693 However, for !_LIBC we have no collation elements: if the
2694 character set is single byte, the single byte character set
2695 that we build below suffices. parse_bracket_exp passes
2696 no MBCSET if dfa->mb_cur_max == 1. */
2697 if (mbcset)
2699 /* Check the space of the arrays. */
2700 if (BE (*range_alloc == mbcset->nranges, 0))
2702 /* There is not enough space, need realloc. */
2703 wchar_t *new_array_start, *new_array_end;
2704 int new_nranges;
2706 /* +1 in case of mbcset->nranges is 0. */
2707 new_nranges = 2 * mbcset->nranges + 1;
2708 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2709 are NULL if *range_alloc == 0. */
2710 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2711 new_nranges);
2712 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2713 new_nranges);
2715 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2716 return REG_ESPACE;
2718 mbcset->range_starts = new_array_start;
2719 mbcset->range_ends = new_array_end;
2720 *range_alloc = new_nranges;
2723 mbcset->range_starts[mbcset->nranges] = start_wc;
2724 mbcset->range_ends[mbcset->nranges++] = end_wc;
2727 /* Build the table for single byte characters. */
2728 for (wc = 0; wc < SBC_MAX; ++wc)
2730 cmp_buf[2] = wc;
2731 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2732 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2733 bitset_set (sbcset, wc);
2736 # else /* not RE_ENABLE_I18N */
2738 unsigned int ch;
2739 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2740 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2741 : 0));
2742 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2743 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2744 : 0));
2745 if (start_ch > end_ch)
2746 return REG_ERANGE;
2747 /* Build the table for single byte characters. */
2748 for (ch = 0; ch < SBC_MAX; ++ch)
2749 if (start_ch <= ch && ch <= end_ch)
2750 bitset_set (sbcset, ch);
2752 # endif /* not RE_ENABLE_I18N */
2753 return REG_NOERROR;
2755 #endif /* not _LIBC */
2757 #ifndef _LIBC
2758 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2759 Build the collating element which is represented by NAME.
2760 The result are written to MBCSET and SBCSET.
2761 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2762 pointer argument since we may update it. */
2764 static reg_errcode_t
2765 internal_function
2766 # ifdef RE_ENABLE_I18N
2767 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2768 int *coll_sym_alloc, const unsigned char *name)
2769 # else /* not RE_ENABLE_I18N */
2770 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2771 # endif /* not RE_ENABLE_I18N */
2773 size_t name_len = strlen ((const char *) name);
2774 if (BE (name_len != 1, 0))
2775 return REG_ECOLLATE;
2776 else
2778 bitset_set (sbcset, name[0]);
2779 return REG_NOERROR;
2782 #endif /* not _LIBC */
2784 /* This function parse bracket expression like "[abc]", "[a-c]",
2785 "[[.a-a.]]" etc. */
2787 static bin_tree_t *
2788 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2789 reg_syntax_t syntax, reg_errcode_t *err)
2791 #ifdef _LIBC
2792 const unsigned char *collseqmb;
2793 const char *collseqwc;
2794 uint32_t nrules;
2795 int32_t table_size;
2796 const int32_t *symb_table;
2797 const unsigned char *extra;
2799 /* Local function for parse_bracket_exp used in _LIBC environement.
2800 Seek the collating symbol entry correspondings to NAME.
2801 Return the index of the symbol in the SYMB_TABLE. */
2803 auto inline int32_t
2804 __attribute ((always_inline))
2805 seek_collating_symbol_entry (name, name_len)
2806 const unsigned char *name;
2807 size_t name_len;
2809 int32_t hash = elem_hash ((const char *) name, name_len);
2810 int32_t elem = hash % table_size;
2811 if (symb_table[2 * elem] != 0)
2813 int32_t second = hash % (table_size - 2) + 1;
2817 /* First compare the hashing value. */
2818 if (symb_table[2 * elem] == hash
2819 /* Compare the length of the name. */
2820 && name_len == extra[symb_table[2 * elem + 1]]
2821 /* Compare the name. */
2822 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2823 name_len) == 0)
2825 /* Yep, this is the entry. */
2826 break;
2829 /* Next entry. */
2830 elem += second;
2832 while (symb_table[2 * elem] != 0);
2834 return elem;
2837 /* Local function for parse_bracket_exp used in _LIBC environment.
2838 Look up the collation sequence value of BR_ELEM.
2839 Return the value if succeeded, UINT_MAX otherwise. */
2841 auto inline unsigned int
2842 __attribute ((always_inline))
2843 lookup_collation_sequence_value (br_elem)
2844 bracket_elem_t *br_elem;
2846 if (br_elem->type == SB_CHAR)
2849 if (MB_CUR_MAX == 1)
2851 if (nrules == 0)
2852 return collseqmb[br_elem->opr.ch];
2853 else
2855 wint_t wc = __btowc (br_elem->opr.ch);
2856 return __collseq_table_lookup (collseqwc, wc);
2859 else if (br_elem->type == MB_CHAR)
2861 if (nrules != 0)
2862 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2864 else if (br_elem->type == COLL_SYM)
2866 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2867 if (nrules != 0)
2869 int32_t elem, idx;
2870 elem = seek_collating_symbol_entry (br_elem->opr.name,
2871 sym_name_len);
2872 if (symb_table[2 * elem] != 0)
2874 /* We found the entry. */
2875 idx = symb_table[2 * elem + 1];
2876 /* Skip the name of collating element name. */
2877 idx += 1 + extra[idx];
2878 /* Skip the byte sequence of the collating element. */
2879 idx += 1 + extra[idx];
2880 /* Adjust for the alignment. */
2881 idx = (idx + 3) & ~3;
2882 /* Skip the multibyte collation sequence value. */
2883 idx += sizeof (unsigned int);
2884 /* Skip the wide char sequence of the collating element. */
2885 idx += sizeof (unsigned int) *
2886 (1 + *(unsigned int *) (extra + idx));
2887 /* Return the collation sequence value. */
2888 return *(unsigned int *) (extra + idx);
2890 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2892 /* No valid character. Match it as a single byte
2893 character. */
2894 return collseqmb[br_elem->opr.name[0]];
2897 else if (sym_name_len == 1)
2898 return collseqmb[br_elem->opr.name[0]];
2900 return UINT_MAX;
2903 /* Local function for parse_bracket_exp used in _LIBC environement.
2904 Build the range expression which starts from START_ELEM, and ends
2905 at END_ELEM. The result are written to MBCSET and SBCSET.
2906 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2907 mbcset->range_ends, is a pointer argument sinse we may
2908 update it. */
2910 auto inline reg_errcode_t
2911 __attribute ((always_inline))
2912 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2913 re_charset_t *mbcset;
2914 int *range_alloc;
2915 bitset_t sbcset;
2916 bracket_elem_t *start_elem, *end_elem;
2918 unsigned int ch;
2919 uint32_t start_collseq;
2920 uint32_t end_collseq;
2922 /* Equivalence Classes and Character Classes can't be a range
2923 start/end. */
2924 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2925 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2927 return REG_ERANGE;
2929 start_collseq = lookup_collation_sequence_value (start_elem);
2930 end_collseq = lookup_collation_sequence_value (end_elem);
2931 /* Check start/end collation sequence values. */
2932 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2933 return REG_ECOLLATE;
2934 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2935 return REG_ERANGE;
2937 /* Got valid collation sequence values, add them as a new entry.
2938 However, if we have no collation elements, and the character set
2939 is single byte, the single byte character set that we
2940 build below suffices. */
2941 if (nrules > 0 || dfa->mb_cur_max > 1)
2943 /* Check the space of the arrays. */
2944 if (BE (*range_alloc == mbcset->nranges, 0))
2946 /* There is not enough space, need realloc. */
2947 uint32_t *new_array_start;
2948 uint32_t *new_array_end;
2949 int new_nranges;
2951 /* +1 in case of mbcset->nranges is 0. */
2952 new_nranges = 2 * mbcset->nranges + 1;
2953 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2954 new_nranges);
2955 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2956 new_nranges);
2958 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2959 return REG_ESPACE;
2961 mbcset->range_starts = new_array_start;
2962 mbcset->range_ends = new_array_end;
2963 *range_alloc = new_nranges;
2966 mbcset->range_starts[mbcset->nranges] = start_collseq;
2967 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2970 /* Build the table for single byte characters. */
2971 for (ch = 0; ch < SBC_MAX; ch++)
2973 uint32_t ch_collseq;
2975 if (MB_CUR_MAX == 1)
2977 if (nrules == 0)
2978 ch_collseq = collseqmb[ch];
2979 else
2980 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2981 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2982 bitset_set (sbcset, ch);
2984 return REG_NOERROR;
2987 /* Local function for parse_bracket_exp used in _LIBC environement.
2988 Build the collating element which is represented by NAME.
2989 The result are written to MBCSET and SBCSET.
2990 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2991 pointer argument sinse we may update it. */
2993 auto inline reg_errcode_t
2994 __attribute ((always_inline))
2995 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2996 re_charset_t *mbcset;
2997 int *coll_sym_alloc;
2998 bitset_t sbcset;
2999 const unsigned char *name;
3001 int32_t elem, idx;
3002 size_t name_len = strlen ((const char *) name);
3003 if (nrules != 0)
3005 elem = seek_collating_symbol_entry (name, name_len);
3006 if (symb_table[2 * elem] != 0)
3008 /* We found the entry. */
3009 idx = symb_table[2 * elem + 1];
3010 /* Skip the name of collating element name. */
3011 idx += 1 + extra[idx];
3013 else if (symb_table[2 * elem] == 0 && name_len == 1)
3015 /* No valid character, treat it as a normal
3016 character. */
3017 bitset_set (sbcset, name[0]);
3018 return REG_NOERROR;
3020 else
3021 return REG_ECOLLATE;
3023 /* Got valid collation sequence, add it as a new entry. */
3024 /* Check the space of the arrays. */
3025 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3027 /* Not enough, realloc it. */
3028 /* +1 in case of mbcset->ncoll_syms is 0. */
3029 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3030 /* Use realloc since mbcset->coll_syms is NULL
3031 if *alloc == 0. */
3032 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3033 new_coll_sym_alloc);
3034 if (BE (new_coll_syms == NULL, 0))
3035 return REG_ESPACE;
3036 mbcset->coll_syms = new_coll_syms;
3037 *coll_sym_alloc = new_coll_sym_alloc;
3039 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3040 return REG_NOERROR;
3042 else
3044 if (BE (name_len != 1, 0))
3045 return REG_ECOLLATE;
3046 else
3048 bitset_set (sbcset, name[0]);
3049 return REG_NOERROR;
3053 #endif
3055 re_token_t br_token;
3056 re_bitset_ptr_t sbcset;
3057 #ifdef RE_ENABLE_I18N
3058 re_charset_t *mbcset;
3059 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3060 int equiv_class_alloc = 0, char_class_alloc = 0;
3061 #endif /* not RE_ENABLE_I18N */
3062 int non_match = 0;
3063 bin_tree_t *work_tree;
3064 int token_len;
3065 int first_round = 1;
3066 #ifdef _LIBC
3067 collseqmb = (const unsigned char *)
3068 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3069 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3070 if (nrules)
3073 if (MB_CUR_MAX > 1)
3075 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3076 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3077 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3078 _NL_COLLATE_SYMB_TABLEMB);
3079 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3080 _NL_COLLATE_SYMB_EXTRAMB);
3082 #endif
3083 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3084 #ifdef RE_ENABLE_I18N
3085 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3086 #endif /* RE_ENABLE_I18N */
3087 #ifdef RE_ENABLE_I18N
3088 if (BE (sbcset == NULL || mbcset == NULL, 0))
3089 #else
3090 if (BE (sbcset == NULL, 0))
3091 #endif /* RE_ENABLE_I18N */
3093 *err = REG_ESPACE;
3094 return NULL;
3097 token_len = peek_token_bracket (token, regexp, syntax);
3098 if (BE (token->type == END_OF_RE, 0))
3100 *err = REG_BADPAT;
3101 goto parse_bracket_exp_free_return;
3103 if (token->type == OP_NON_MATCH_LIST)
3105 #ifdef RE_ENABLE_I18N
3106 mbcset->non_match = 1;
3107 #endif /* not RE_ENABLE_I18N */
3108 non_match = 1;
3109 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3110 bitset_set (sbcset, '\n');
3111 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3112 token_len = peek_token_bracket (token, regexp, syntax);
3113 if (BE (token->type == END_OF_RE, 0))
3115 *err = REG_BADPAT;
3116 goto parse_bracket_exp_free_return;
3120 /* We treat the first ']' as a normal character. */
3121 if (token->type == OP_CLOSE_BRACKET)
3122 token->type = CHARACTER;
3124 while (1)
3126 bracket_elem_t start_elem, end_elem;
3127 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3128 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3129 reg_errcode_t ret;
3130 int token_len2 = 0, is_range_exp = 0;
3131 re_token_t token2;
3133 start_elem.opr.name = start_name_buf;
3134 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3135 syntax, first_round);
3136 if (BE (ret != REG_NOERROR, 0))
3138 *err = ret;
3139 goto parse_bracket_exp_free_return;
3141 first_round = 0;
3143 /* Get information about the next token. We need it in any case. */
3144 token_len = peek_token_bracket (token, regexp, syntax);
3146 /* Do not check for ranges if we know they are not allowed. */
3147 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3149 if (BE (token->type == END_OF_RE, 0))
3151 *err = REG_EBRACK;
3152 goto parse_bracket_exp_free_return;
3154 if (token->type == OP_CHARSET_RANGE)
3156 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3157 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3158 if (BE (token2.type == END_OF_RE, 0))
3160 *err = REG_EBRACK;
3161 goto parse_bracket_exp_free_return;
3163 if (token2.type == OP_CLOSE_BRACKET)
3165 /* We treat the last '-' as a normal character. */
3166 re_string_skip_bytes (regexp, -token_len);
3167 token->type = CHARACTER;
3169 else
3170 is_range_exp = 1;
3174 if (is_range_exp == 1)
3176 end_elem.opr.name = end_name_buf;
3177 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3178 dfa, syntax, 1);
3179 if (BE (ret != REG_NOERROR, 0))
3181 *err = ret;
3182 goto parse_bracket_exp_free_return;
3185 token_len = peek_token_bracket (token, regexp, syntax);
3187 #ifdef _LIBC
3188 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3189 &start_elem, &end_elem);
3190 #else
3191 # ifdef RE_ENABLE_I18N
3192 *err = build_range_exp (sbcset,
3193 dfa->mb_cur_max > 1 ? mbcset : NULL,
3194 &range_alloc, &start_elem, &end_elem);
3195 # else
3196 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3197 # endif
3198 #endif /* RE_ENABLE_I18N */
3199 if (BE (*err != REG_NOERROR, 0))
3200 goto parse_bracket_exp_free_return;
3202 else
3204 switch (start_elem.type)
3206 case SB_CHAR:
3207 bitset_set (sbcset, start_elem.opr.ch);
3208 break;
3209 #ifdef RE_ENABLE_I18N
3210 case MB_CHAR:
3211 /* Check whether the array has enough space. */
3212 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3214 wchar_t *new_mbchars;
3215 /* Not enough, realloc it. */
3216 /* +1 in case of mbcset->nmbchars is 0. */
3217 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3218 /* Use realloc since array is NULL if *alloc == 0. */
3219 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3220 mbchar_alloc);
3221 if (BE (new_mbchars == NULL, 0))
3222 goto parse_bracket_exp_espace;
3223 mbcset->mbchars = new_mbchars;
3225 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3226 break;
3227 #endif /* RE_ENABLE_I18N */
3228 case EQUIV_CLASS:
3229 *err = build_equiv_class (sbcset,
3230 #ifdef RE_ENABLE_I18N
3231 mbcset, &equiv_class_alloc,
3232 #endif /* RE_ENABLE_I18N */
3233 start_elem.opr.name);
3234 if (BE (*err != REG_NOERROR, 0))
3235 goto parse_bracket_exp_free_return;
3236 break;
3237 case COLL_SYM:
3238 *err = build_collating_symbol (sbcset,
3239 #ifdef RE_ENABLE_I18N
3240 mbcset, &coll_sym_alloc,
3241 #endif /* RE_ENABLE_I18N */
3242 start_elem.opr.name);
3243 if (BE (*err != REG_NOERROR, 0))
3244 goto parse_bracket_exp_free_return;
3245 break;
3246 case CHAR_CLASS:
3247 *err = build_charclass (regexp->trans, sbcset,
3248 #ifdef RE_ENABLE_I18N
3249 mbcset, &char_class_alloc,
3250 #endif /* RE_ENABLE_I18N */
3251 (const char *) start_elem.opr.name, syntax);
3252 if (BE (*err != REG_NOERROR, 0))
3253 goto parse_bracket_exp_free_return;
3254 break;
3255 default:
3256 assert (0);
3257 break;
3260 if (BE (token->type == END_OF_RE, 0))
3262 *err = REG_EBRACK;
3263 goto parse_bracket_exp_free_return;
3265 if (token->type == OP_CLOSE_BRACKET)
3266 break;
3269 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3271 /* If it is non-matching list. */
3272 if (non_match)
3273 bitset_not (sbcset);
3275 #ifdef RE_ENABLE_I18N
3276 /* Ensure only single byte characters are set. */
3277 if (dfa->mb_cur_max > 1)
3278 bitset_mask (sbcset, dfa->sb_char);
3280 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3281 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3282 || mbcset->non_match)))
3284 bin_tree_t *mbc_tree;
3285 int sbc_idx;
3286 /* Build a tree for complex bracket. */
3287 dfa->has_mb_node = 1;
3288 br_token.type = COMPLEX_BRACKET;
3289 br_token.opr.mbcset = mbcset;
3290 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3291 if (BE (mbc_tree == NULL, 0))
3292 goto parse_bracket_exp_espace;
3293 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3294 if (sbcset[sbc_idx])
3295 break;
3296 /* If there are no bits set in sbcset, there is no point
3297 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3298 if (sbc_idx < BITSET_WORDS)
3300 /* Build a tree for simple bracket. */
3301 br_token.type = SIMPLE_BRACKET;
3302 br_token.opr.sbcset = sbcset;
3303 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3304 if (BE (work_tree == NULL, 0))
3305 goto parse_bracket_exp_espace;
3307 /* Then join them by ALT node. */
3308 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3309 if (BE (work_tree == NULL, 0))
3310 goto parse_bracket_exp_espace;
3312 else
3314 re_free (sbcset);
3315 work_tree = mbc_tree;
3318 else
3319 #endif /* not RE_ENABLE_I18N */
3321 #ifdef RE_ENABLE_I18N
3322 free_charset (mbcset);
3323 #endif
3324 /* Build a tree for simple bracket. */
3325 br_token.type = SIMPLE_BRACKET;
3326 br_token.opr.sbcset = sbcset;
3327 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3328 if (BE (work_tree == NULL, 0))
3329 goto parse_bracket_exp_espace;
3331 return work_tree;
3333 parse_bracket_exp_espace:
3334 *err = REG_ESPACE;
3335 parse_bracket_exp_free_return:
3336 re_free (sbcset);
3337 #ifdef RE_ENABLE_I18N
3338 free_charset (mbcset);
3339 #endif /* RE_ENABLE_I18N */
3340 return NULL;
3343 /* Parse an element in the bracket expression. */
3345 static reg_errcode_t
3346 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3347 re_token_t *token, int token_len, re_dfa_t *dfa,
3348 reg_syntax_t syntax, int accept_hyphen)
3350 #ifdef RE_ENABLE_I18N
3351 int cur_char_size;
3352 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3353 if (cur_char_size > 1)
3355 elem->type = MB_CHAR;
3356 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3357 re_string_skip_bytes (regexp, cur_char_size);
3358 return REG_NOERROR;
3360 #endif /* RE_ENABLE_I18N */
3361 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3362 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3363 || token->type == OP_OPEN_EQUIV_CLASS)
3364 return parse_bracket_symbol (elem, regexp, token);
3365 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3367 /* A '-' must only appear as anything but a range indicator before
3368 the closing bracket. Everything else is an error. */
3369 re_token_t token2;
3370 (void) peek_token_bracket (&token2, regexp, syntax);
3371 if (token2.type != OP_CLOSE_BRACKET)
3372 /* The actual error value is not standardized since this whole
3373 case is undefined. But ERANGE makes good sense. */
3374 return REG_ERANGE;
3376 elem->type = SB_CHAR;
3377 elem->opr.ch = token->opr.c;
3378 return REG_NOERROR;
3381 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3382 such as [:<character_class>:], [.<collating_element>.], and
3383 [=<equivalent_class>=]. */
3385 static reg_errcode_t
3386 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3387 re_token_t *token)
3389 unsigned char ch, delim = token->opr.c;
3390 int i = 0;
3391 if (re_string_eoi(regexp))
3392 return REG_EBRACK;
3393 for (;; ++i)
3395 if (i >= BRACKET_NAME_BUF_SIZE)
3396 return REG_EBRACK;
3397 if (token->type == OP_OPEN_CHAR_CLASS)
3398 ch = re_string_fetch_byte_case (regexp);
3399 else
3400 ch = re_string_fetch_byte (regexp);
3401 if (re_string_eoi(regexp))
3402 return REG_EBRACK;
3403 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3404 break;
3405 elem->opr.name[i] = ch;
3407 re_string_skip_bytes (regexp, 1);
3408 elem->opr.name[i] = '\0';
3409 switch (token->type)
3411 case OP_OPEN_COLL_ELEM:
3412 elem->type = COLL_SYM;
3413 break;
3414 case OP_OPEN_EQUIV_CLASS:
3415 elem->type = EQUIV_CLASS;
3416 break;
3417 case OP_OPEN_CHAR_CLASS:
3418 elem->type = CHAR_CLASS;
3419 break;
3420 default:
3421 break;
3423 return REG_NOERROR;
3426 /* Helper function for parse_bracket_exp.
3427 Build the equivalence class which is represented by NAME.
3428 The result are written to MBCSET and SBCSET.
3429 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3430 is a pointer argument sinse we may update it. */
3432 static reg_errcode_t
3433 #ifdef RE_ENABLE_I18N
3434 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3435 int *equiv_class_alloc, const unsigned char *name)
3436 #else /* not RE_ENABLE_I18N */
3437 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3438 #endif /* not RE_ENABLE_I18N */
3440 #ifdef _LIBC
3441 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3442 if (nrules != 0)
3444 const int32_t *table, *indirect;
3445 const unsigned char *weights, *extra, *cp;
3446 unsigned char char_buf[2];
3447 int32_t idx1, idx2;
3448 unsigned int ch;
3449 size_t len;
3450 /* This #include defines a local function! */
3451 # include <locale/weight.h>
3452 /* Calculate the index for equivalence class. */
3453 cp = name;
3454 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3455 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3456 _NL_COLLATE_WEIGHTMB);
3457 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3458 _NL_COLLATE_EXTRAMB);
3459 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3460 _NL_COLLATE_INDIRECTMB);
3461 idx1 = findidx (&cp);
3462 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3463 /* This isn't a valid character. */
3464 return REG_ECOLLATE;
3466 /* Build single byte matcing table for this equivalence class. */
3467 char_buf[1] = (unsigned char) '\0';
3468 len = weights[idx1 & 0xffffff];
3469 for (ch = 0; ch < SBC_MAX; ++ch)
3471 char_buf[0] = ch;
3472 cp = char_buf;
3473 idx2 = findidx (&cp);
3475 idx2 = table[ch];
3477 if (idx2 == 0)
3478 /* This isn't a valid character. */
3479 continue;
3480 /* Compare only if the length matches and the collation rule
3481 index is the same. */
3482 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3484 int cnt = 0;
3486 while (cnt <= len &&
3487 weights[(idx1 & 0xffffff) + 1 + cnt]
3488 == weights[(idx2 & 0xffffff) + 1 + cnt])
3489 ++cnt;
3491 if (cnt > len)
3492 bitset_set (sbcset, ch);
3495 /* Check whether the array has enough space. */
3496 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3498 /* Not enough, realloc it. */
3499 /* +1 in case of mbcset->nequiv_classes is 0. */
3500 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3501 /* Use realloc since the array is NULL if *alloc == 0. */
3502 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3503 int32_t,
3504 new_equiv_class_alloc);
3505 if (BE (new_equiv_classes == NULL, 0))
3506 return REG_ESPACE;
3507 mbcset->equiv_classes = new_equiv_classes;
3508 *equiv_class_alloc = new_equiv_class_alloc;
3510 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3512 else
3513 #endif /* _LIBC */
3515 if (BE (strlen ((const char *) name) != 1, 0))
3516 return REG_ECOLLATE;
3517 bitset_set (sbcset, *name);
3519 return REG_NOERROR;
3522 /* Helper function for parse_bracket_exp.
3523 Build the character class which is represented by NAME.
3524 The result are written to MBCSET and SBCSET.
3525 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3526 is a pointer argument sinse we may update it. */
3528 static reg_errcode_t
3529 #ifdef RE_ENABLE_I18N
3530 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3531 re_charset_t *mbcset, int *char_class_alloc,
3532 const char *class_name, reg_syntax_t syntax)
3533 #else /* not RE_ENABLE_I18N */
3534 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3535 const char *class_name, reg_syntax_t syntax)
3536 #endif /* not RE_ENABLE_I18N */
3538 int i;
3540 /* In case of REG_ICASE "upper" and "lower" match the both of
3541 upper and lower cases. */
3542 if ((syntax & RE_ICASE)
3543 && (strcmp (class_name, "upper") == 0 || strcmp (class_name, "lower") == 0))
3544 class_name = "alpha";
3546 #ifdef RE_ENABLE_I18N
3547 /* Check the space of the arrays. */
3548 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3550 /* Not enough, realloc it. */
3551 /* +1 in case of mbcset->nchar_classes is 0. */
3552 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3553 /* Use realloc since array is NULL if *alloc == 0. */
3554 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3555 new_char_class_alloc);
3556 if (BE (new_char_classes == NULL, 0))
3557 return REG_ESPACE;
3558 mbcset->char_classes = new_char_classes;
3559 *char_class_alloc = new_char_class_alloc;
3561 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (class_name);
3562 #endif /* RE_ENABLE_I18N */
3564 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3565 do { \
3566 if (BE (trans != NULL, 0)) \
3568 for (i = 0; i < SBC_MAX; ++i) \
3569 if (ctype_func (i)) \
3570 bitset_set (sbcset, trans[i]); \
3572 else \
3574 for (i = 0; i < SBC_MAX; ++i) \
3575 if (ctype_func (i)) \
3576 bitset_set (sbcset, i); \
3578 } while (0)
3580 if (strcmp (class_name, "alnum") == 0)
3581 BUILD_CHARCLASS_LOOP (isalnum);
3582 else if (strcmp (class_name, "cntrl") == 0)
3583 BUILD_CHARCLASS_LOOP (iscntrl);
3584 else if (strcmp (class_name, "lower") == 0)
3585 BUILD_CHARCLASS_LOOP (islower);
3586 else if (strcmp (class_name, "space") == 0)
3587 BUILD_CHARCLASS_LOOP (isspace);
3588 else if (strcmp (class_name, "alpha") == 0)
3589 BUILD_CHARCLASS_LOOP (isalpha);
3590 else if (strcmp (class_name, "digit") == 0)
3591 BUILD_CHARCLASS_LOOP (isdigit);
3592 else if (strcmp (class_name, "print") == 0)
3593 BUILD_CHARCLASS_LOOP (isprint);
3594 else if (strcmp (class_name, "upper") == 0)
3595 BUILD_CHARCLASS_LOOP (isupper);
3596 else if (strcmp (class_name, "blank") == 0)
3597 #ifndef GAWK
3598 BUILD_CHARCLASS_LOOP (isblank);
3599 #else
3600 /* see comments above */
3601 BUILD_CHARCLASS_LOOP (is_blank);
3602 #endif
3603 else if (strcmp (class_name, "graph") == 0)
3604 BUILD_CHARCLASS_LOOP (isgraph);
3605 else if (strcmp (class_name, "punct") == 0)
3606 BUILD_CHARCLASS_LOOP (ispunct);
3607 else if (strcmp (class_name, "xdigit") == 0)
3608 BUILD_CHARCLASS_LOOP (isxdigit);
3609 else
3610 return REG_ECTYPE;
3612 return REG_NOERROR;
3615 static bin_tree_t *
3616 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3617 const char *class_name,
3618 const char *extra, int non_match,
3619 reg_errcode_t *err)
3621 re_bitset_ptr_t sbcset;
3622 #ifdef RE_ENABLE_I18N
3623 re_charset_t *mbcset;
3624 int alloc = 0;
3625 #endif /* not RE_ENABLE_I18N */
3626 reg_errcode_t ret;
3627 re_token_t br_token;
3628 bin_tree_t *tree;
3630 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3631 #ifdef RE_ENABLE_I18N
3632 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3633 #endif /* RE_ENABLE_I18N */
3635 #ifdef RE_ENABLE_I18N
3636 if (BE (sbcset == NULL || mbcset == NULL, 0))
3637 #else /* not RE_ENABLE_I18N */
3638 if (BE (sbcset == NULL, 0))
3639 #endif /* not RE_ENABLE_I18N */
3641 *err = REG_ESPACE;
3642 return NULL;
3645 if (non_match)
3647 #ifdef RE_ENABLE_I18N
3648 mbcset->non_match = 1;
3649 #endif /* not RE_ENABLE_I18N */
3652 /* We don't care the syntax in this case. */
3653 ret = build_charclass (trans, sbcset,
3654 #ifdef RE_ENABLE_I18N
3655 mbcset, &alloc,
3656 #endif /* RE_ENABLE_I18N */
3657 class_name, 0);
3659 if (BE (ret != REG_NOERROR, 0))
3661 re_free (sbcset);
3662 #ifdef RE_ENABLE_I18N
3663 free_charset (mbcset);
3664 #endif /* RE_ENABLE_I18N */
3665 *err = ret;
3666 return NULL;
3668 /* \w match '_' also. */
3669 for (; *extra; extra++)
3670 bitset_set (sbcset, *extra);
3672 /* If it is non-matching list. */
3673 if (non_match)
3674 bitset_not (sbcset);
3676 #ifdef RE_ENABLE_I18N
3677 /* Ensure only single byte characters are set. */
3678 if (dfa->mb_cur_max > 1)
3679 bitset_mask (sbcset, dfa->sb_char);
3680 #endif
3682 /* Build a tree for simple bracket. */
3683 br_token.type = SIMPLE_BRACKET;
3684 br_token.opr.sbcset = sbcset;
3685 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3686 if (BE (tree == NULL, 0))
3687 goto build_word_op_espace;
3689 #ifdef RE_ENABLE_I18N
3690 if (dfa->mb_cur_max > 1)
3692 bin_tree_t *mbc_tree;
3693 /* Build a tree for complex bracket. */
3694 br_token.type = COMPLEX_BRACKET;
3695 br_token.opr.mbcset = mbcset;
3696 dfa->has_mb_node = 1;
3697 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3698 if (BE (mbc_tree == NULL, 0))
3699 goto build_word_op_espace;
3700 /* Then join them by ALT node. */
3701 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3702 if (BE (mbc_tree != NULL, 1))
3703 return tree;
3705 else
3707 free_charset (mbcset);
3708 return tree;
3710 #else /* not RE_ENABLE_I18N */
3711 return tree;
3712 #endif /* not RE_ENABLE_I18N */
3714 build_word_op_espace:
3715 re_free (sbcset);
3716 #ifdef RE_ENABLE_I18N
3717 free_charset (mbcset);
3718 #endif /* RE_ENABLE_I18N */
3719 *err = REG_ESPACE;
3720 return NULL;
3723 /* This is intended for the expressions like "a{1,3}".
3724 Fetch a number from `input', and return the number.
3725 Return -1, if the number field is empty like "{,1}".
3726 Return -2, If an error is occured. */
3728 static int
3729 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3731 int num = -1;
3732 unsigned char c;
3733 while (1)
3735 fetch_token (token, input, syntax);
3736 c = token->opr.c;
3737 if (BE (token->type == END_OF_RE, 0))
3738 return -2;
3739 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3740 break;
3741 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3742 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3743 num = (num > RE_DUP_MAX) ? -2 : num;
3745 return num;
3748 #ifdef RE_ENABLE_I18N
3749 static void
3750 free_charset (re_charset_t *cset)
3752 re_free (cset->mbchars);
3753 # ifdef _LIBC
3754 re_free (cset->coll_syms);
3755 re_free (cset->equiv_classes);
3756 re_free (cset->range_starts);
3757 re_free (cset->range_ends);
3758 # endif
3759 re_free (cset->char_classes);
3760 re_free (cset);
3762 #endif /* RE_ENABLE_I18N */
3764 /* Functions for binary tree operation. */
3766 /* Create a tree node. */
3768 static bin_tree_t *
3769 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3770 re_token_type_t type)
3772 re_token_t t;
3773 t.type = type;
3774 return create_token_tree (dfa, left, right, &t);
3777 static bin_tree_t *
3778 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3779 const re_token_t *token)
3781 bin_tree_t *tree;
3782 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3784 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3786 if (storage == NULL)
3787 return NULL;
3788 storage->next = dfa->str_tree_storage;
3789 dfa->str_tree_storage = storage;
3790 dfa->str_tree_storage_idx = 0;
3792 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3794 tree->parent = NULL;
3795 tree->left = left;
3796 tree->right = right;
3797 tree->token = *token;
3798 tree->token.duplicated = 0;
3799 tree->token.opt_subexp = 0;
3800 tree->first = NULL;
3801 tree->next = NULL;
3802 tree->node_idx = -1;
3804 if (left != NULL)
3805 left->parent = tree;
3806 if (right != NULL)
3807 right->parent = tree;
3808 return tree;
3811 /* Mark the tree SRC as an optional subexpression.
3812 To be called from preorder or postorder. */
3814 static reg_errcode_t
3815 mark_opt_subexp (void *extra, bin_tree_t *node)
3817 int idx = (int) (long) extra;
3818 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3819 node->token.opt_subexp = 1;
3821 return REG_NOERROR;
3824 /* Free the allocated memory inside NODE. */
3826 static void
3827 free_token (re_token_t *node)
3829 #ifdef RE_ENABLE_I18N
3830 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3831 free_charset (node->opr.mbcset);
3832 else
3833 #endif /* RE_ENABLE_I18N */
3834 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3835 re_free (node->opr.sbcset);
3838 /* Worker function for tree walking. Free the allocated memory inside NODE
3839 and its children. */
3841 static reg_errcode_t
3842 free_tree (void *extra, bin_tree_t *node)
3844 free_token (&node->token);
3845 return REG_NOERROR;
3849 /* Duplicate the node SRC, and return new node. This is a preorder
3850 visit similar to the one implemented by the generic visitor, but
3851 we need more infrastructure to maintain two parallel trees --- so,
3852 it's easier to duplicate. */
3854 static bin_tree_t *
3855 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3857 const bin_tree_t *node;
3858 bin_tree_t *dup_root;
3859 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3861 for (node = root; ; )
3863 /* Create a new tree and link it back to the current parent. */
3864 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3865 if (*p_new == NULL)
3866 return NULL;
3867 (*p_new)->parent = dup_node;
3868 (*p_new)->token.duplicated = 1;
3869 dup_node = *p_new;
3871 /* Go to the left node, or up and to the right. */
3872 if (node->left)
3874 node = node->left;
3875 p_new = &dup_node->left;
3877 else
3879 const bin_tree_t *prev = NULL;
3880 while (node->right == prev || node->right == NULL)
3882 prev = node;
3883 node = node->parent;
3884 dup_node = dup_node->parent;
3885 if (!node)
3886 return dup_root;
3888 node = node->right;
3889 p_new = &dup_node->right;